From b735960c02a91d7e4abedde21adf9afb96f35b3f Mon Sep 17 00:00:00 2001 From: Akiyoshi Kurita Date: Tue, 23 Dec 2025 20:11:12 +0900 Subject: [PATCH 01/35] dm-raid: fix typo in documentation Signed-off-by: Akiyoshi Kurita Signed-off-by: Mikulas Patocka --- Documentation/admin-guide/device-mapper/dm-raid.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/device-mapper/dm-raid.rst b/Documentation/admin-guide/device-mapper/dm-raid.rst index e11f10764770..3780f6e6b6bb 100644 --- a/Documentation/admin-guide/device-mapper/dm-raid.rst +++ b/Documentation/admin-guide/device-mapper/dm-raid.rst @@ -433,7 +433,7 @@ Table line examples: 8192 1960886272 linear 8:0 0 2048 # previous data segment # Mapping table for e.g. raid5_rs reshape causing the size of the raid device to double-fold once the reshape finishes. -# Check the status output (e.g. "dmsetup status $RaidDev") for progess. +# Check the status output (e.g. "dmsetup status $RaidDev") for progress. 0 $((2 * 1960886272)) raid raid5 7 0 region_size 2048 data_offset 8192 delta_disk 1 2 /dev/dm-0 /dev/dm-1 /dev/dm-2 /dev/dm-3 From a8c3ec7d07a8d458a6b556eecc92010ff906b68a Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Tue, 30 Dec 2025 16:22:27 +0100 Subject: [PATCH 02/35] dm-vdo: adjust function name reference There is no function advance_compression_stage(). But advance_data_vio_compression_stage() does iterate through the values of the data_vio_compression_stage enum, so it seems to be what was intended. 
Signed-off-by: Julia Lawall Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/data-vio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-vdo/data-vio.h b/drivers/md/dm-vdo/data-vio.h index 067b983bb291..58a1ac806885 100644 --- a/drivers/md/dm-vdo/data-vio.h +++ b/drivers/md/dm-vdo/data-vio.h @@ -88,8 +88,8 @@ struct zoned_pbn { }; /* - * Where a data_vio is on the compression path; advance_compression_stage() depends on the order of - * this enum. + * Where a data_vio is on the compression path; advance_data_vio_compression_stage() + * depends on the order of this enum. */ enum data_vio_compression_stage { /* A data_vio which has not yet entered the compression path */ From f3a9c95a15d2f4466acad5c68faeff79ca5e9f47 Mon Sep 17 00:00:00 2001 From: Ding Hui Date: Sat, 20 Dec 2025 20:03:50 +0800 Subject: [PATCH 03/35] dm: remove fake timeout to avoid leak request Since commit 15f73f5b3e59 ("blk-mq: move failure injection out of blk_mq_complete_request"), drivers are responsible for calling blk_should_fake_timeout() at appropriate code paths and opportunities. However, the dm driver does not implement its own timeout handler and relies on the timeout handling of its slave devices. If an io-timeout-fail error is injected to a dm device, the request will be leaked and never completed, causing tasks to hang indefinitely. Reproduce: 1. prepare dm which has iscsi slave device 2. inject io-timeout-fail to dm echo 1 >/sys/class/block/dm-0/io-timeout-fail echo 100 >/sys/kernel/debug/fail_io_timeout/probability echo 10 >/sys/kernel/debug/fail_io_timeout/times 3. read/write dm 4. iscsiadm -m node -u Result: hang task like below [ 862.243768] INFO: task kworker/u514:2:151 blocked for more than 122 seconds. [ 862.244133] Tainted: G E 6.19.0-rc1+ #51 [ 862.244337] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 
[ 862.244718] task:kworker/u514:2 state:D stack:0 pid:151 tgid:151 ppid:2 task_flags:0x4288060 flags:0x00080000 [ 862.245024] Workqueue: iscsi_ctrl_3:1 __iscsi_unbind_session [scsi_transport_iscsi] [ 862.245264] Call Trace: [ 862.245587] [ 862.245814] __schedule+0x810/0x15c0 [ 862.246557] schedule+0x69/0x180 [ 862.246760] blk_mq_freeze_queue_wait+0xde/0x120 [ 862.247688] elevator_change+0x16d/0x460 [ 862.247893] elevator_set_none+0x87/0xf0 [ 862.248798] blk_unregister_queue+0x12e/0x2a0 [ 862.248995] __del_gendisk+0x231/0x7e0 [ 862.250143] del_gendisk+0x12f/0x1d0 [ 862.250339] sd_remove+0x85/0x130 [sd_mod] [ 862.250650] device_release_driver_internal+0x36d/0x530 [ 862.250849] bus_remove_device+0x1dd/0x3f0 [ 862.251042] device_del+0x38a/0x930 [ 862.252095] __scsi_remove_device+0x293/0x360 [ 862.252291] scsi_remove_target+0x486/0x760 [ 862.252654] __iscsi_unbind_session+0x18a/0x3e0 [scsi_transport_iscsi] [ 862.252886] process_one_work+0x633/0xe50 [ 862.253101] worker_thread+0x6df/0xf10 [ 862.253647] kthread+0x36d/0x720 [ 862.254533] ret_from_fork+0x2a6/0x470 [ 862.255852] ret_from_fork_asm+0x1a/0x30 [ 862.256037] Remove the blk_should_fake_timeout() check from dm, as dm has no native timeout handling and should not attempt to fake timeouts. 
Signed-off-by: Ding Hui Reviewed-by: Christoph Hellwig Signed-off-by: Mikulas Patocka --- drivers/md/dm-rq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index a6ca92049c10..5e0854669614 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -278,8 +278,7 @@ static void dm_complete_request(struct request *rq, blk_status_t error) struct dm_rq_target_io *tio = tio_from_request(rq); tio->error = error; - if (likely(!blk_should_fake_timeout(rq->q))) - blk_mq_complete_request(rq); + blk_mq_complete_request(rq); } /* From b13ef361d47f09b7aecd18e0383ecc83ff61057e Mon Sep 17 00:00:00 2001 From: Daniel Gomez Date: Sat, 20 Dec 2025 04:49:37 +0100 Subject: [PATCH 04/35] dm: replace -EEXIST with -EBUSY The -EEXIST error code is reserved by the module loading infrastructure to indicate that a module is already loaded. When a module's init function returns -EEXIST, userspace tools like kmod interpret this as "module already loaded" and treat the operation as successful, returning 0 to the user even though the module initialization actually failed. This follows the precedent set by commit 54416fd76770 ("netfilter: conntrack: helper: Replace -EEXIST by -EBUSY") which fixed the same issue in nf_conntrack_helper_register(). 
Affected modules: * dm_cache dm_clone dm_integrity dm_mirror dm_multipath dm_pcache * dm_vdo dm-ps-round-robin dm_historical_service_time dm_io_affinity * dm_queue_length dm_service_time dm_snapshot Signed-off-by: Daniel Gomez Signed-off-by: Mikulas Patocka --- drivers/md/dm-exception-store.c | 2 +- drivers/md/dm-log.c | 2 +- drivers/md/dm-path-selector.c | 2 +- drivers/md/dm-target.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index c3799757bf4a..88f119a0a2ae 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -116,7 +116,7 @@ int dm_exception_store_type_register(struct dm_exception_store_type *type) if (!__find_exception_store_type(type->name)) list_add(&type->list, &_exception_store_types); else - r = -EEXIST; + r = -EBUSY; spin_unlock(&_lock); return r; diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 9d85d045f9d9..bced5a783ee3 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -121,7 +121,7 @@ int dm_dirty_log_type_register(struct dm_dirty_log_type *type) if (!__find_dirty_log_type(type->name)) list_add(&type->list, &_log_types); else - r = -EEXIST; + r = -EBUSY; spin_unlock(&_lock); return r; diff --git a/drivers/md/dm-path-selector.c b/drivers/md/dm-path-selector.c index d0b883fabfeb..2b0ac200f1c0 100644 --- a/drivers/md/dm-path-selector.c +++ b/drivers/md/dm-path-selector.c @@ -107,7 +107,7 @@ int dm_register_path_selector(struct path_selector_type *pst) if (__find_path_selector_type(pst->name)) { kfree(psi); - r = -EEXIST; + r = -EBUSY; } else list_add(&psi->list, &_path_selectors); diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 8fede41adec0..1fd41289de36 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -88,7 +88,7 @@ int dm_register_target(struct target_type *tt) if (__find_target_type(tt->name)) { DMERR("%s: '%s' target already registered", __func__, tt->name); - rv = 
-EEXIST; + rv = -EBUSY; } else { list_add(&tt->list, &_targets); } From c1881c74f4dfdadc1bf827d971a605b21ba5a587 Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Fri, 12 Dec 2025 21:09:55 +0800 Subject: [PATCH 05/35] dm-stripe: adjust max_hw_discard_sectors to avoid unnecessary discard bio splitting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the max_hw_discard_sectors of a stripe target is set to the minimum max_hw_discard_sectors among all sub devices. When the discard bio is larger than max_hw_discard_sectors, this may cause the stripe device to split discard bios unnecessarily, because the value of max_hw_discard_sectors affects max_discard_sectors, which is equal to min(max_hw_discard_sectors, max_user_discard_sectors). For example: root@vm:~# echo '0 33554432 striped 2 256 /dev/vdd 0 /dev/vde 0' | dmsetup create stripe_dev root@vm:~# cat /sys/block/dm-1/queue/discard_max_bytes 536870912 root@vm:~# cat /sys/block/dm-1/slaves/vdd/queue/discard_max_bytes 536870912 root@vm:~# blkdiscard -o 0 -l 1073741824 -p 1073741824 /dev/mapper/stripe_dev dm-1 is the stripe device, and its discard_max_bytes is equal to each sub device’s discard_max_bytes. Since the requested discard length exceeds discard_max_bytes, the block layer splits the discard bio: block_bio_queue: 252,1 DS 0 + 2097152 [blkdiscard] block_split: 252,1 DS 0 / 1048576 [blkdiscard] block_rq_issue: 253,48 DS 268435456 () 0 + 524288 be,0,4 [blkdiscard] block_bio_queue: 253,64 DS 524288 + 524288 [blkdiscard] However, both vdd and vde can actually handle a discard bio of 536870912 bytes, so this split is not necessary. This patch updates the stripe target’s q->limits.max_hw_discard_sectors to be the minimum max_hw_discard_sectors of the sub devices multiplied by the number of stripe devices. The result is rounded down to a multiple of the chunk size times the number of stripe devices, so that no discard bio larger than max_hw_discard_sectors is issued to the sub devices.
This patch enables the stripe device to handle larger discard bios without incurring unnecessary splitting. Signed-off-by: Yongpeng Yang Reviewed-by: Benjamin Marzinski Signed-off-by: Mikulas Patocka --- drivers/md/dm-stripe.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 1461dc740dae..8a872f5d633e 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -456,7 +456,7 @@ static void stripe_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct stripe_c *sc = ti->private; - unsigned int io_min, io_opt; + unsigned int io_min, io_opt, max_hw_discard_sectors = limits->max_hw_discard_sectors; limits->chunk_sectors = sc->chunk_size; @@ -465,6 +465,14 @@ static void stripe_io_hints(struct dm_target *ti, limits->io_min = io_min; limits->io_opt = io_opt; } + if (max_hw_discard_sectors >= sc->chunk_size) { + if (!check_mul_overflow(max_hw_discard_sectors, sc->stripes, &max_hw_discard_sectors)) { + max_hw_discard_sectors = rounddown(max_hw_discard_sectors, + sc->chunk_size * sc->stripes); + limits->max_hw_discard_sectors = max_hw_discard_sectors; + } else + limits->max_hw_discard_sectors = UINT_MAX >> SECTOR_SHIFT; + } } static struct target_type stripe_target = { From a2f0a98b13db005403d026274782ac668b59df32 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Wed, 10 Dec 2025 11:17:56 +0800 Subject: [PATCH 06/35] dm cache: drop redundant origin size check The cache target already exposes the origin device through cache_iterate_devices(), which allows dm-table to call device_area_is_invalid() and verify that the mapping fits inside the underlying block device. The explicit ti->len > origin_sectors test in parse_origin_dev() is therefore redundant. Drop this check and rely on the core device validation instead. This changes the user-visible error string when the origin is too small, but preserves the failure behaviour. 
Signed-off-by: Li Chen Signed-off-by: Mikulas Patocka --- drivers/md/dm-cache-target.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index a10d75a562db..350a0aa53365 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2099,7 +2099,6 @@ static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, char **error) { - sector_t origin_sectors; int r; if (!at_least_one_arg(as, error)) @@ -2112,12 +2111,6 @@ static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, return r; } - origin_sectors = get_dev_size(ca->origin_dev); - if (ca->ti->len > origin_sectors) { - *error = "Device size larger than cached device"; - return -EINVAL; - } - return 0; } From a23cc8257ecdfdeb25fd26d25fec4539ef377944 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Wed, 10 Dec 2025 11:17:57 +0800 Subject: [PATCH 07/35] dm clone: drop redundant size checks The clone target already exposes both source and destination devices via clone_iterate_devices(), so dm-table's device_area_is_invalid() helper ensures that the mapping does not extend past either underlying block device. The manual comparisons between ti->len and the source/destination device sizes in parse_source_dev() and parse_dest_dev() are therefore redundant. Remove these checks and rely on the core validation instead. This changes the error strings reported when the devices are too small, but preserves the failure behaviour. 
Signed-off-by: Li Chen Signed-off-by: Mikulas Patocka --- drivers/md/dm-clone-target.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index e956d980672c..ac94e3466560 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -1697,7 +1697,6 @@ static int parse_metadata_dev(struct clone *clone, struct dm_arg_set *as, char * static int parse_dest_dev(struct clone *clone, struct dm_arg_set *as, char **error) { int r; - sector_t dest_dev_size; r = dm_get_device(clone->ti, dm_shift_arg(as), BLK_OPEN_READ | BLK_OPEN_WRITE, &clone->dest_dev); @@ -1706,20 +1705,12 @@ static int parse_dest_dev(struct clone *clone, struct dm_arg_set *as, char **err return r; } - dest_dev_size = get_dev_size(clone->dest_dev); - if (dest_dev_size < clone->ti->len) { - dm_put_device(clone->ti, clone->dest_dev); - *error = "Device size larger than destination device"; - return -EINVAL; - } - return 0; } static int parse_source_dev(struct clone *clone, struct dm_arg_set *as, char **error) { int r; - sector_t source_dev_size; r = dm_get_device(clone->ti, dm_shift_arg(as), BLK_OPEN_READ, &clone->source_dev); @@ -1728,13 +1719,6 @@ static int parse_source_dev(struct clone *clone, struct dm_arg_set *as, char **e return r; } - source_dev_size = get_dev_size(clone->source_dev); - if (source_dev_size < clone->ti->len) { - dm_put_device(clone->ti, clone->source_dev); - *error = "Device size larger than source device"; - return -EINVAL; - } - return 0; } From b140a921eadfeaf48238a3a6d2da2a5e6946a31b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 19 Dec 2025 11:29:03 -0800 Subject: [PATCH 08/35] dm-verity: move dm_verity_fec_io to mempool Currently, struct dm_verity_fec_io is allocated in the front padding of struct bio using dm_target::per_io_data_size. 
Unfortunately, struct dm_verity_fec_io is very large: 3096 bytes when CONFIG_64BIT=y && PAGE_SIZE == 4096, or 9240 bytes when CONFIG_64BIT=y && PAGE_SIZE == 16384. This makes the bio size very large. Moreover, most of dm_verity_fec_io gets iterated over up to three times, even on I/O requests that don't require any error correction: 1. To zero the memory on allocation, if init_on_alloc=1. (This happens when the bio is allocated, not in dm-verity itself.) 2. To zero the buffers array in verity_fec_init_io(). 3. To free the buffers in verity_fec_finish_io(). Fix all of these inefficiencies by moving dm_verity_fec_io to a mempool. Replace the embedded dm_verity_fec_io with a pointer dm_verity_io::fec_io. verity_fec_init_io() initializes it to NULL, verity_fec_decode() allocates it on the first call, and verity_fec_finish_io() cleans it up. The normal case is that the pointer simply stays NULL, so the overhead becomes negligible. Reviewed-by: Sami Tolvanen Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 96 +++++++++++++++----------------------- drivers/md/dm-verity-fec.h | 14 +++++- drivers/md/dm-verity.h | 4 ++ 3 files changed, 54 insertions(+), 60 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index c79de517afee..2c1544556a1c 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -18,16 +18,6 @@ bool verity_fec_is_enabled(struct dm_verity *v) return v->fec && v->fec->dev; } -/* - * Return a pointer to dm_verity_fec_io after dm_verity_io and its variable - * length fields. - */ -static inline struct dm_verity_fec_io *fec_io(struct dm_verity_io *io) -{ - return (struct dm_verity_fec_io *) - ((char *)io + io->v->ti->per_io_data_size - sizeof(struct dm_verity_fec_io)); -} - /* * Return an interleaved offset for a byte in RS block. 
*/ @@ -211,7 +201,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, int i, j, target_index = -1; struct dm_buffer *buf; struct dm_bufio_client *bufio; - struct dm_verity_fec_io *fio = fec_io(io); + struct dm_verity_fec_io *fio = io->fec_io; u64 block, ileaved; u8 *bbuf, *rs_block; u8 want_digest[HASH_MAX_DIGESTSIZE]; @@ -307,39 +297,40 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, } /* - * Allocate RS control structure and FEC buffers from preallocated mempools, - * and attempt to allocate as many extra buffers as available. + * Allocate and initialize a struct dm_verity_fec_io to use for FEC for a bio. + * This runs the first time a block needs to be corrected for a bio. In the + * common case where no block needs to be corrected, this code never runs. + * + * This always succeeds, as all required allocations are done from mempools. + * Additional buffers are also allocated opportunistically to improve error + * correction performance, but these aren't required to succeed. 
*/ -static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) +static struct dm_verity_fec_io *fec_alloc_and_init_io(struct dm_verity *v) { + struct dm_verity_fec *f = v->fec; + struct dm_verity_fec_io *fio; unsigned int n; - if (!fio->rs) - fio->rs = mempool_alloc(&v->fec->rs_pool, GFP_NOIO); + fio = mempool_alloc(&f->fio_pool, GFP_NOIO); + fio->rs = mempool_alloc(&f->rs_pool, GFP_NOIO); - fec_for_each_prealloc_buffer(n) { - if (fio->bufs[n]) - continue; + memset(fio->bufs, 0, sizeof(fio->bufs)); - fio->bufs[n] = mempool_alloc(&v->fec->prealloc_pool, GFP_NOIO); - } + fec_for_each_prealloc_buffer(n) + fio->bufs[n] = mempool_alloc(&f->prealloc_pool, GFP_NOIO); /* try to allocate the maximum number of buffers */ fec_for_each_extra_buffer(fio, n) { - if (fio->bufs[n]) - continue; - - fio->bufs[n] = kmem_cache_alloc(v->fec->cache, GFP_NOWAIT); + fio->bufs[n] = kmem_cache_alloc(f->cache, GFP_NOWAIT); /* we can manage with even one buffer if necessary */ if (unlikely(!fio->bufs[n])) break; } fio->nbufs = n; - if (!fio->output) - fio->output = mempool_alloc(&v->fec->output_pool, GFP_NOIO); - - return 0; + fio->output = mempool_alloc(&f->output_pool, GFP_NOIO); + fio->level = 0; + return fio; } /* @@ -368,10 +359,6 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, int r, neras = 0; unsigned int pos; - r = fec_alloc_bufs(v, fio); - if (unlikely(r < 0)) - return r; - for (pos = 0; pos < 1 << v->data_dev_block_bits; ) { fec_init_bufs(v, fio); @@ -408,12 +395,16 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, sector_t block, u8 *dest) { int r; - struct dm_verity_fec_io *fio = fec_io(io); + struct dm_verity_fec_io *fio; u64 offset, res, rsb; if (!verity_fec_is_enabled(v)) return -EOPNOTSUPP; + fio = io->fec_io; + if (!fio) + fio = io->fec_io = fec_alloc_and_init_io(v); + if (fio->level) return -EIO; @@ -463,14 +454,11 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, /* * Clean up per-bio 
data. */ -void verity_fec_finish_io(struct dm_verity_io *io) +void __verity_fec_finish_io(struct dm_verity_io *io) { unsigned int n; struct dm_verity_fec *f = io->v->fec; - struct dm_verity_fec_io *fio = fec_io(io); - - if (!verity_fec_is_enabled(io->v)) - return; + struct dm_verity_fec_io *fio = io->fec_io; mempool_free(fio->rs, &f->rs_pool); @@ -482,23 +470,9 @@ void verity_fec_finish_io(struct dm_verity_io *io) kmem_cache_free(f->cache, fio->bufs[n]); mempool_free(fio->output, &f->output_pool); -} -/* - * Initialize per-bio data. - */ -void verity_fec_init_io(struct dm_verity_io *io) -{ - struct dm_verity_fec_io *fio = fec_io(io); - - if (!verity_fec_is_enabled(io->v)) - return; - - fio->rs = NULL; - memset(fio->bufs, 0, sizeof(fio->bufs)); - fio->nbufs = 0; - fio->output = NULL; - fio->level = 0; + mempool_free(fio, &f->fio_pool); + io->fec_io = NULL; } /* @@ -529,6 +503,7 @@ void verity_fec_dtr(struct dm_verity *v) if (!verity_fec_is_enabled(v)) goto out; + mempool_exit(&f->fio_pool); mempool_exit(&f->rs_pool); mempool_exit(&f->prealloc_pool); mempool_exit(&f->output_pool); @@ -758,6 +733,14 @@ int verity_fec_ctr(struct dm_verity *v) return -E2BIG; } + /* Preallocate some dm_verity_fec_io structures */ + ret = mempool_init_kmalloc_pool(&f->fio_pool, num_online_cpus(), + sizeof(struct dm_verity_fec_io)); + if (ret) { + ti->error = "Cannot allocate FEC IO pool"; + return ret; + } + /* Preallocate an rs_control structure for each worker thread */ ret = mempool_init(&f->rs_pool, num_online_cpus(), fec_rs_alloc, fec_rs_free, (void *) v); @@ -791,8 +774,5 @@ int verity_fec_ctr(struct dm_verity *v) return ret; } - /* Reserve space for our per-bio data */ - ti->per_io_data_size += sizeof(struct dm_verity_fec_io); - return 0; } diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 5fd267873812..b9488d1ddf14 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -40,6 +40,7 @@ struct dm_verity_fec { sector_t hash_blocks; /* 
blocks covered after v->hash_start */ unsigned char roots; /* number of parity bytes, M-N of RS(M, N) */ unsigned char rsn; /* N of RS(M, N) */ + mempool_t fio_pool; /* mempool for dm_verity_fec_io */ mempool_t rs_pool; /* mempool for fio->rs */ mempool_t prealloc_pool; /* mempool for preallocated buffers */ mempool_t output_pool; /* mempool for output */ @@ -71,8 +72,17 @@ extern int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, extern unsigned int verity_fec_status_table(struct dm_verity *v, unsigned int sz, char *result, unsigned int maxlen); -extern void verity_fec_finish_io(struct dm_verity_io *io); -extern void verity_fec_init_io(struct dm_verity_io *io); +extern void __verity_fec_finish_io(struct dm_verity_io *io); +static inline void verity_fec_finish_io(struct dm_verity_io *io) +{ + if (unlikely(io->fec_io)) + __verity_fec_finish_io(io); +} + +static inline void verity_fec_init_io(struct dm_verity_io *io) +{ + io->fec_io = NULL; +} extern bool verity_is_fec_opt_arg(const char *arg_name); extern int verity_fec_parse_opt_args(struct dm_arg_set *as, diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index f975a9e5c5d6..4ad7ce3dae0a 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -104,6 +104,10 @@ struct dm_verity_io { bool in_bh; bool had_mismatch; +#ifdef CONFIG_DM_VERITY_FEC + struct dm_verity_fec_io *fec_io; +#endif + struct work_struct work; struct work_struct bh_work; From 533e641b4587cfe144f413e50eb771433ea82845 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 19 Dec 2025 11:29:04 -0800 Subject: [PATCH 09/35] dm-verity: make dm_verity_fec_io::bufs variable-length When correcting a data block, the FEC code performs optimally when it has enough buffers to hold all the needed RS blocks. That number of buffers is '1 << (v->data_dev_block_bits - DM_VERITY_FEC_BUF_RS_BITS)'. However, since v->data_dev_block_bits isn't a compile-time constant, the code actually used PAGE_SHIFT instead. 
With the traditional PAGE_SIZE == data_block_size == 4096, this was fine. However, when PAGE_SIZE > data_block_size, this wastes space. E.g., with data_block_size == 4096 && PAGE_SIZE == 16384, struct dm_verity_fec_io is 9240 bytes, when in fact only 3096 bytes are needed. Fix this by making dm_verity_fec_io::bufs a variable-length array. This makes the macros DM_VERITY_FEC_BUF_MAX and fec_for_each_extra_buffer() no longer apply, so remove them. For consistency, and because DM_VERITY_FEC_BUF_PREALLOC is fixed at 1 and was already assumed to be 1 (considering that mempool_alloc() shouldn't be called in a loop), also remove the related macros DM_VERITY_FEC_BUF_PREALLOC and fec_for_each_prealloc_buffer(). Signed-off-by: Eric Biggers Reviewed-by: Sami Tolvanen Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 44 +++++++++++++++++++------------------- drivers/md/dm-verity-fec.h | 15 +++++++------ 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 2c1544556a1c..6d0b5b4b2699 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -10,6 +10,18 @@ #define DM_MSG_PREFIX "verity-fec" +/* + * When correcting a data block, the FEC code performs optimally when it can + * collect all the associated RS blocks at the same time. As each byte is part + * of a different RS block, there are '1 << data_dev_block_bits' RS blocks. + * There are '1 << DM_VERITY_FEC_BUF_RS_BITS' RS blocks per buffer, so that + * gives '1 << (data_dev_block_bits - DM_VERITY_FEC_BUF_RS_BITS)' buffers. + */ +static inline unsigned int fec_max_nbufs(struct dm_verity *v) +{ + return 1 << (v->data_dev_block_bits - DM_VERITY_FEC_BUF_RS_BITS); +} + /* * If error correction has been configured, returns true. */ @@ -59,14 +71,6 @@ static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index, return res; } -/* Loop over each preallocated buffer slot. 
*/ -#define fec_for_each_prealloc_buffer(__i) \ - for (__i = 0; __i < DM_VERITY_FEC_BUF_PREALLOC; __i++) - -/* Loop over each extra buffer slot. */ -#define fec_for_each_extra_buffer(io, __i) \ - for (__i = DM_VERITY_FEC_BUF_PREALLOC; __i < DM_VERITY_FEC_BUF_MAX; __i++) - /* Loop over each allocated buffer. */ #define fec_for_each_buffer(io, __i) \ for (__i = 0; __i < (io)->nbufs; __i++) @@ -307,6 +311,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, */ static struct dm_verity_fec_io *fec_alloc_and_init_io(struct dm_verity *v) { + const unsigned int max_nbufs = fec_max_nbufs(v); struct dm_verity_fec *f = v->fec; struct dm_verity_fec_io *fio; unsigned int n; @@ -314,13 +319,10 @@ static struct dm_verity_fec_io *fec_alloc_and_init_io(struct dm_verity *v) fio = mempool_alloc(&f->fio_pool, GFP_NOIO); fio->rs = mempool_alloc(&f->rs_pool, GFP_NOIO); - memset(fio->bufs, 0, sizeof(fio->bufs)); - - fec_for_each_prealloc_buffer(n) - fio->bufs[n] = mempool_alloc(&f->prealloc_pool, GFP_NOIO); + fio->bufs[0] = mempool_alloc(&f->prealloc_pool, GFP_NOIO); /* try to allocate the maximum number of buffers */ - fec_for_each_extra_buffer(fio, n) { + for (n = 1; n < max_nbufs; n++) { fio->bufs[n] = kmem_cache_alloc(f->cache, GFP_NOWAIT); /* we can manage with even one buffer if necessary */ if (unlikely(!fio->bufs[n])) @@ -462,12 +464,10 @@ void __verity_fec_finish_io(struct dm_verity_io *io) mempool_free(fio->rs, &f->rs_pool); - fec_for_each_prealloc_buffer(n) - mempool_free(fio->bufs[n], &f->prealloc_pool); + mempool_free(fio->bufs[0], &f->prealloc_pool); - fec_for_each_extra_buffer(fio, n) - if (fio->bufs[n]) - kmem_cache_free(f->cache, fio->bufs[n]); + for (n = 1; n < fio->nbufs; n++) + kmem_cache_free(f->cache, fio->bufs[n]); mempool_free(fio->output, &f->output_pool); @@ -735,7 +735,8 @@ int verity_fec_ctr(struct dm_verity *v) /* Preallocate some dm_verity_fec_io structures */ ret = mempool_init_kmalloc_pool(&f->fio_pool, num_online_cpus(), - 
sizeof(struct dm_verity_fec_io)); + struct_size((struct dm_verity_fec_io *)0, + bufs, fec_max_nbufs(v))); if (ret) { ti->error = "Cannot allocate FEC IO pool"; return ret; @@ -757,9 +758,8 @@ int verity_fec_ctr(struct dm_verity *v) return -ENOMEM; } - /* Preallocate DM_VERITY_FEC_BUF_PREALLOC buffers for each thread */ - ret = mempool_init_slab_pool(&f->prealloc_pool, num_online_cpus() * - DM_VERITY_FEC_BUF_PREALLOC, + /* Preallocate one buffer for each thread */ + ret = mempool_init_slab_pool(&f->prealloc_pool, num_online_cpus(), f->cache); if (ret) { ti->error = "Cannot allocate FEC buffer prealloc pool"; diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index b9488d1ddf14..571097438311 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -17,11 +17,7 @@ #define DM_VERITY_FEC_MIN_RSN 231 /* ~10% space overhead */ /* buffers for deinterleaving and decoding */ -#define DM_VERITY_FEC_BUF_PREALLOC 1 /* buffers to preallocate */ #define DM_VERITY_FEC_BUF_RS_BITS 4 /* 1 << RS blocks per buffer */ -/* we need buffers for at most 1 << block size RS blocks */ -#define DM_VERITY_FEC_BUF_MAX \ - (1 << (PAGE_SHIFT - DM_VERITY_FEC_BUF_RS_BITS)) #define DM_VERITY_OPT_FEC_DEV "use_fec_from_device" #define DM_VERITY_OPT_FEC_BLOCKS "fec_blocks" @@ -52,10 +48,17 @@ struct dm_verity_fec { struct dm_verity_fec_io { struct rs_control *rs; /* Reed-Solomon state */ int erasures[DM_VERITY_FEC_MAX_RSN]; /* erasures for decode_rs8 */ - u8 *bufs[DM_VERITY_FEC_BUF_MAX]; /* bufs for deinterleaving */ - unsigned int nbufs; /* number of buffers allocated */ u8 *output; /* buffer for corrected output */ unsigned int level; /* recursion level */ + unsigned int nbufs; /* number of buffers allocated */ + /* + * Buffers for deinterleaving RS blocks. Each buffer has space for + * the data bytes of (1 << DM_VERITY_FEC_BUF_RS_BITS) RS blocks. The + * array length is fec_max_nbufs(v), and we try to allocate that many + * buffers. 
However, in low-memory situations we may be unable to + * allocate all buffers. 'nbufs' holds the number actually allocated. + */ + u8 *bufs[]; }; #ifdef CONFIG_DM_VERITY_FEC From 12f74a157750a05d0285086bef97149c9ea1c257 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 19 Dec 2025 11:29:05 -0800 Subject: [PATCH 10/35] dm-verity: remove unnecessary condition for verity_fec_finish_io() Make verity_finish_io() call verity_fec_finish_io() unconditionally, instead of skipping it when 'in_bh' is true. Although FEC can't have been done when 'in_bh' is true, verity_fec_finish_io() is a no-op when FEC wasn't done. An earlier change also made verity_fec_finish_io() very lightweight when FEC wasn't done. So it should just be called unconditionally. Reviewed-by: Sami Tolvanen Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-target.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 5c17472d7896..c9f5602a42c6 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -619,8 +619,7 @@ static void verity_finish_io(struct dm_verity_io *io, blk_status_t status) bio->bi_end_io = io->orig_bi_end_io; bio->bi_status = status; - if (!static_branch_unlikely(&use_bh_wq_enabled) || !io->in_bh) - verity_fec_finish_io(io); + verity_fec_finish_io(io); if (unlikely(status != BLK_STS_OK) && unlikely(!(bio->bi_opf & REQ_RAHEAD)) && From fa3d53140d430f27b54c2bd91f4faccb99c8fbdd Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 19 Dec 2025 11:29:06 -0800 Subject: [PATCH 11/35] dm-verity: remove unnecessary ifdef around verity_fec_decode() Since verity_fec_decode() has a !CONFIG_DM_VERITY_FEC stub, it can just be called unconditionally, similar to the other calls in the same file. 
Reviewed-by: Sami Tolvanen Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-target.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index c9f5602a42c6..777a0ebe8536 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -435,11 +435,9 @@ static int verity_handle_data_hash_mismatch(struct dm_verity *v, set_bit(blkno, v->validated_blocks); return 0; } -#if defined(CONFIG_DM_VERITY_FEC) if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, want_digest, blkno, data) == 0) return 0; -#endif if (bio->bi_status) return -EIO; /* Error correction failed; Just return error */ From 1a257c5fd33a5b641478a7dd851861f64529c7bb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 19 Dec 2025 11:29:07 -0800 Subject: [PATCH 12/35] dm-verity: make verity_fec_is_enabled() an inline function verity_fec_is_enabled() is very short and is called in quite a few places, so make it an inline function. Reviewed-by: Sami Tolvanen Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 8 -------- drivers/md/dm-verity-fec.h | 6 +++++- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 6d0b5b4b2699..ef9970b889aa 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -22,14 +22,6 @@ static inline unsigned int fec_max_nbufs(struct dm_verity *v) return 1 << (v->data_dev_block_bits - DM_VERITY_FEC_BUF_RS_BITS); } -/* - * If error correction has been configured, returns true. - */ -bool verity_fec_is_enabled(struct dm_verity *v) -{ - return v->fec && v->fec->dev; -} - /* * Return an interleaved offset for a byte in RS block. 
*/ diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 571097438311..35d28d9f8a9b 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -66,7 +66,11 @@ struct dm_verity_fec_io { /* each feature parameter requires a value */ #define DM_VERITY_OPTS_FEC 8 -extern bool verity_fec_is_enabled(struct dm_verity *v); +/* Returns true if forward error correction is enabled. */ +static inline bool verity_fec_is_enabled(struct dm_verity *v) +{ + return v->fec && v->fec->dev; +} extern int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, enum verity_block_type type, const u8 *want_digest, From 119f4f04186fa4f33ee6bd39af145cdaff1ff17f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 19 Dec 2025 11:29:08 -0800 Subject: [PATCH 13/35] dm-verity: correctly handle dm_bufio_client_create() failure If either of the calls to dm_bufio_client_create() in verity_fec_ctr() fails, then dm_bufio_client_destroy() is later called with an ERR_PTR() argument. That causes a crash. Fix this. 
Fixes: a739ff3f543a ("dm verity: add support for forward error correction") Cc: stable@vger.kernel.org Reviewed-by: Sami Tolvanen Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index ef9970b889aa..7583607a8aa6 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -501,9 +501,9 @@ void verity_fec_dtr(struct dm_verity *v) mempool_exit(&f->output_pool); kmem_cache_destroy(f->cache); - if (f->data_bufio) + if (!IS_ERR_OR_NULL(f->data_bufio)) dm_bufio_client_destroy(f->data_bufio); - if (f->bufio) + if (!IS_ERR_OR_NULL(f->bufio)) dm_bufio_client_destroy(f->bufio); if (f->dev) From 8fbb8fe75d4cf92eaa7b21828ec39c1bf79a262f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 19 Dec 2025 11:29:09 -0800 Subject: [PATCH 14/35] dm-verity: allow REED_SOLOMON to be 'm' if DM_VERITY is 'm' The dm-verity kconfig options make the common mistake of selecting a dependency from a bool "sub-option" rather than the main tristate option. This unnecessarily forces the dependency to built-in ('y'). Fix this by moving the selections of REED_SOLOMON and REED_SOLOMON_DEC8 into DM_VERITY, conditional on DM_VERITY_FEC. This allows REED_SOLOMON to be 'm' if DM_VERITY is 'm'. 
Reviewed-by: Sami Tolvanen Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 239c1744a926..c58a9a8ea54e 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -549,6 +549,8 @@ config DM_VERITY select CRYPTO_HASH select CRYPTO_LIB_SHA256 select DM_BUFIO + select REED_SOLOMON if DM_VERITY_FEC + select REED_SOLOMON_DEC8 if DM_VERITY_FEC help This device-mapper target creates a read-only device that transparently validates the data on one underlying device against @@ -598,8 +600,6 @@ config DM_VERITY_VERIFY_ROOTHASH_SIG_PLATFORM_KEYRING config DM_VERITY_FEC bool "Verity forward error correction support" depends on DM_VERITY - select REED_SOLOMON - select REED_SOLOMON_DEC8 help Add forward error correction support to dm-verity. This option makes it possible to use pre-generated error correction data to From 24c405fdbe215c45e57bba672cc42859038491ee Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 8 Jan 2026 20:55:08 +0100 Subject: [PATCH 15/35] dm: fix unlocked test for dm_suspended_md The function dm_blk_report_zones tests if the device is suspended with the "dm_suspended_md" call. However, this function is called without holding any locks, so the device may be suspended just after it. Move the call to dm_suspended_md after dm_get_live_table, so that the device can't be suspended after the suspended state was tested. 
Signed-off-by: Mikulas Patocka Fixes: 37f53a2c60d0 ("dm: fix dm_blk_report_zones") Reviewed-by: Benjamin Marzinski --- drivers/md/dm-zone.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index c95e417194b3..bc4e45862a22 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -60,11 +60,13 @@ int dm_blk_report_zones(struct gendisk *disk, sector_t sector, * Zone revalidation during __bind() is in progress, but this * call is from a different process */ - if (dm_suspended_md(md)) - return -EAGAIN; - map = dm_get_live_table(md, &srcu_idx); put_table = true; + + if (dm_suspended_md(md)) { + ret = -EAGAIN; + goto do_put_table; + } } else { /* Zone revalidation during __bind() */ map = zone_revalidate_map; @@ -79,6 +81,7 @@ int dm_blk_report_zones(struct gendisk *disk, sector_t sector, ret = dm_blk_do_report_zones(md, map, nr_zones, &dm_args); } +do_put_table: if (put_table) dm_put_live_table(md, srcu_idx); From e9f5a55b70ae6187ab64ef2d1232ae2738e31d1f Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 8 Jan 2026 20:56:20 +0100 Subject: [PATCH 16/35] dm: use READ_ONCE in dm_blk_report_zones The function dm_blk_report_zones reads md->zone_revalidate_map, however it may change while the function is running. Use READ_ONCE.
Signed-off-by: Mikulas Patocka Fixes: 37f53a2c60d0 ("dm: fix dm_blk_report_zones") Reviewed-by: Benjamin Marzinski --- drivers/md/dm-zone.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index bc4e45862a22..f29acf64429a 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -50,7 +50,7 @@ int dm_blk_report_zones(struct gendisk *disk, sector_t sector, { struct mapped_device *md = disk->private_data; struct dm_table *map; - struct dm_table *zone_revalidate_map = md->zone_revalidate_map; + struct dm_table *zone_revalidate_map = READ_ONCE(md->zone_revalidate_map); int srcu_idx, ret = -EIO; bool put_table = false; From c698b7f417801fcd79f0dc844250b3361d38e6b8 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 12 Jan 2026 21:15:27 +0100 Subject: [PATCH 17/35] dm-integrity: fix a typo in the code for write/discard race If we send a write followed by a discard, it may be possible that the discarded data end up being overwritten by the previous write from the journal. The code tries to prevent that, but there was a typo in this logic that made it not being activated as it should be. Note that if we end up here the second time (when discard_retried is true), it means that the write bio is actually racing with the discard bio, and in this situation it is not specified which of them should win. 
Cc: stable@vger.kernel.org Fixes: 31843edab7cb ("dm integrity: improve discard in journal mode") Signed-off-by: Mikulas Patocka --- drivers/md/dm-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 170bf67a2edd..79d60495454a 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -2411,7 +2411,7 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map new_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector); if (unlikely(new_pos != NOT_FOUND) || - unlikely(next_sector < dio->range.logical_sector - dio->range.n_sectors)) { + unlikely(next_sector < dio->range.logical_sector + dio->range.n_sectors)) { remove_range_unlocked(ic, &dio->range); spin_unlock_irq(&ic->endio_wait.lock); queue_work(ic->commit_wq, &ic->commit_work); From d4880868670198df321627a949e7b7f2d76cf54e Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Tue, 13 Jan 2026 12:03:02 +0100 Subject: [PATCH 18/35] dm: add WQ_PERCPU to alloc_workqueue users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This continues the effort to refactor workqueue APIs, which began with the introduction of new workqueues and a new alloc_workqueue flag in: commit 128ea9f6ccfb ("workqueue: Add system_percpu_wq and system_dfl_wq") commit 930c2ea566af ("workqueue: Add new WQ_PERCPU flag") The refactoring is going to alter the default behavior of alloc_workqueue() to be unbound by default. With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND), any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND must now use WQ_PERCPU. For more details see the Link tag below. In order to keep alloc_workqueue() behavior identical, explicitly request WQ_PERCPU. 
Link: https://lore.kernel.org/all/20250221112003.1dSuoGyc@linutronix.de/ Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Signed-off-by: Mikulas Patocka --- drivers/md/dm-bufio.c | 3 ++- drivers/md/dm-cache-target.c | 3 ++- drivers/md/dm-clone-target.c | 3 ++- drivers/md/dm-crypt.c | 6 ++++-- drivers/md/dm-delay.c | 4 +++- drivers/md/dm-integrity.c | 15 ++++++++++----- drivers/md/dm-kcopyd.c | 3 ++- drivers/md/dm-log-userspace-base.c | 3 ++- drivers/md/dm-mpath.c | 5 +++-- drivers/md/dm-raid1.c | 5 +++-- drivers/md/dm-snap-persistent.c | 3 ++- drivers/md/dm-stripe.c | 2 +- drivers/md/dm-verity-target.c | 4 +++- drivers/md/dm-writecache.c | 3 ++- drivers/md/dm.c | 3 ++- drivers/md/md.c | 4 ++-- 16 files changed, 45 insertions(+), 24 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 5235f3e4924b..f41f649c01d4 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -2833,7 +2833,8 @@ static int __init dm_bufio_init(void) __cache_size_refresh(); mutex_unlock(&dm_bufio_clients_lock); - dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0); + dm_bufio_wq = alloc_workqueue("dm_bufio_cache", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!dm_bufio_wq) return -ENOMEM; diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 350a0aa53365..62d1060619dd 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2526,7 +2526,8 @@ static int cache_create(struct cache_args *ca, struct cache **result) goto bad; } - cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); + cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!cache->wq) { *error = "could not create workqueue for metadata object"; goto bad; diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index ac94e3466560..a7f73861a8cd 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -1861,7 +1861,8 @@ static int clone_ctr(struct dm_target 
*ti, unsigned int argc, char **argv) clone->hydration_offset = 0; atomic_set(&clone->hydrations_in_flight, 0); - clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); + clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!clone->wq) { ti->error = "Failed to allocate workqueue"; r = -ENOMEM; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 79704fbc523b..0e479de75ad0 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -3400,7 +3400,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (test_bit(DM_CRYPT_HIGH_PRIORITY, &cc->flags)) common_wq_flags |= WQ_HIGHPRI; - cc->io_queue = alloc_workqueue("kcryptd_io-%s-%d", common_wq_flags, 1, devname, wq_id); + cc->io_queue = alloc_workqueue("kcryptd_io-%s-%d", + common_wq_flags | WQ_PERCPU, 1, + devname, wq_id); if (!cc->io_queue) { ti->error = "Couldn't create kcryptd io queue"; goto bad; @@ -3408,7 +3410,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) { cc->crypt_queue = alloc_workqueue("kcryptd-%s-%d", - common_wq_flags | WQ_CPU_INTENSIVE, + common_wq_flags | WQ_CPU_INTENSIVE | WQ_PERCPU, 1, devname, wq_id); } else { /* diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 4bb6553278c7..029f04776490 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -290,7 +290,9 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) } else { timer_setup(&dc->delay_timer, handle_delayed_timer, 0); INIT_WORK(&dc->flush_expired_bios, flush_expired_bios); - dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); + dc->kdelayd_wq = alloc_workqueue("kdelayd", + WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!dc->kdelayd_wq) { ret = -EINVAL; DMERR("Couldn't start kdelayd"); diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 79d60495454a..380527f43b2a 100644 --- a/drivers/md/dm-integrity.c +++ 
b/drivers/md/dm-integrity.c @@ -4990,7 +4990,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv } ic->metadata_wq = alloc_workqueue("dm-integrity-metadata", - WQ_MEM_RECLAIM, METADATA_WORKQUEUE_MAX_ACTIVE); + WQ_MEM_RECLAIM | WQ_PERCPU, + METADATA_WORKQUEUE_MAX_ACTIVE); if (!ic->metadata_wq) { ti->error = "Cannot allocate workqueue"; r = -ENOMEM; @@ -5008,7 +5009,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv goto bad; } - ic->offload_wq = alloc_workqueue("dm-integrity-offload", WQ_MEM_RECLAIM, + ic->offload_wq = alloc_workqueue("dm-integrity-offload", + WQ_MEM_RECLAIM | WQ_PERCPU, METADATA_WORKQUEUE_MAX_ACTIVE); if (!ic->offload_wq) { ti->error = "Cannot allocate workqueue"; @@ -5016,7 +5018,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv goto bad; } - ic->commit_wq = alloc_workqueue("dm-integrity-commit", WQ_MEM_RECLAIM, 1); + ic->commit_wq = alloc_workqueue("dm-integrity-commit", + WQ_MEM_RECLAIM | WQ_PERCPU, 1); if (!ic->commit_wq) { ti->error = "Cannot allocate workqueue"; r = -ENOMEM; @@ -5025,7 +5028,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv INIT_WORK(&ic->commit_work, integrity_commit); if (ic->mode == 'J' || ic->mode == 'B') { - ic->writer_wq = alloc_workqueue("dm-integrity-writer", WQ_MEM_RECLAIM, 1); + ic->writer_wq = alloc_workqueue("dm-integrity-writer", + WQ_MEM_RECLAIM | WQ_PERCPU, 1); if (!ic->writer_wq) { ti->error = "Cannot allocate workqueue"; r = -ENOMEM; @@ -5197,7 +5201,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv } if (ic->internal_hash) { - ic->recalc_wq = alloc_workqueue("dm-integrity-recalc", WQ_MEM_RECLAIM, 1); + ic->recalc_wq = alloc_workqueue("dm-integrity-recalc", + WQ_MEM_RECLAIM | WQ_PERCPU, 1); if (!ic->recalc_wq) { ti->error = "Cannot allocate workqueue"; r = -ENOMEM; diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 
6ea75436a433..cec9a60227b6 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -934,7 +934,8 @@ struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *thro goto bad_slab; INIT_WORK(&kc->kcopyd_work, do_work); - kc->kcopyd_wq = alloc_workqueue("kcopyd", WQ_MEM_RECLAIM, 0); + kc->kcopyd_wq = alloc_workqueue("kcopyd", WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!kc->kcopyd_wq) { r = -ENOMEM; goto bad_workqueue; diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 9fbb4b48fb2b..607436804a8b 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -299,7 +299,8 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, } if (lc->integrated_flush) { - lc->dmlog_wq = alloc_workqueue("dmlogd", WQ_MEM_RECLAIM, 0); + lc->dmlog_wq = alloc_workqueue("dmlogd", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!lc->dmlog_wq) { DMERR("couldn't start dmlogd"); r = -ENOMEM; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index d5d6ef7ba838..c748e7f952c4 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -2328,7 +2328,8 @@ static int __init dm_multipath_init(void) { int r = -ENOMEM; - kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0); + kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!kmultipathd) { DMERR("failed to create workqueue kmpathd"); goto bad_alloc_kmultipathd; @@ -2347,7 +2348,7 @@ static int __init dm_multipath_init(void) goto bad_alloc_kmpath_handlerd; } - dm_mpath_wq = alloc_workqueue("dm_mpath_wq", 0, 0); + dm_mpath_wq = alloc_workqueue("dm_mpath_wq", WQ_PERCPU, 0); if (!dm_mpath_wq) { DMERR("failed to create workqueue dm_mpath_wq"); goto bad_alloc_dm_mpath_wq; diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 268f734ca9c3..943c0c6b2087 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1128,7 +1128,8 @@ static int mirror_ctr(struct dm_target *ti, unsigned int 
argc, char **argv) ti->num_discard_bios = 1; ti->per_io_data_size = sizeof(struct dm_raid1_bio_record); - ms->kmirrord_wq = alloc_workqueue("kmirrord", WQ_MEM_RECLAIM, 0); + ms->kmirrord_wq = alloc_workqueue("kmirrord", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!ms->kmirrord_wq) { DMERR("couldn't start kmirrord"); r = -ENOMEM; @@ -1500,7 +1501,7 @@ static int __init dm_mirror_init(void) { int r; - dm_raid1_wq = alloc_workqueue("dm_raid1_wq", 0, 0); + dm_raid1_wq = alloc_workqueue("dm_raid1_wq", WQ_PERCPU, 0); if (!dm_raid1_wq) { DMERR("Failed to alloc workqueue"); return -ENOMEM; diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 568d10842b1f..0e13d60bfdd1 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -871,7 +871,8 @@ static int persistent_ctr(struct dm_exception_store *store, char *options) atomic_set(&ps->pending_count, 0); ps->callbacks = NULL; - ps->metadata_wq = alloc_workqueue("ksnaphd", WQ_MEM_RECLAIM, 0); + ps->metadata_wq = alloc_workqueue("ksnaphd", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!ps->metadata_wq) { DMERR("couldn't start header metadata update thread"); r = -ENOMEM; diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 8a872f5d633e..20cce876d80c 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -497,7 +497,7 @@ int __init dm_stripe_init(void) { int r; - dm_stripe_wq = alloc_workqueue("dm_stripe_wq", 0, 0); + dm_stripe_wq = alloc_workqueue("dm_stripe_wq", WQ_PERCPU, 0); if (!dm_stripe_wq) return -ENOMEM; r = dm_register_target(&stripe_target); diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 777a0ebe8536..91fb465274ba 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -1648,7 +1648,9 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) * will fall-back to using it for error handling (or if the bufio cache * doesn't have required hashes). 
*/ - v->verify_wq = alloc_workqueue("kverityd", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); + v->verify_wq = alloc_workqueue("kverityd", + WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_PERCPU, + 0); if (!v->verify_wq) { ti->error = "Cannot allocate workqueue"; r = -ENOMEM; diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index d8de4a3076a1..af54e289bceb 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -2275,7 +2275,8 @@ static int writecache_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1); + wc->writeback_wq = alloc_workqueue("writecache-writeback", + WQ_MEM_RECLAIM | WQ_PERCPU, 1); if (!wc->writeback_wq) { r = -ENOMEM; ti->error = "Could not allocate writeback workqueue"; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b63279202260..ea2c43cddde1 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2366,7 +2366,8 @@ static struct mapped_device *alloc_dev(int minor) format_dev_t(md->name, MKDEV(_major, minor)); - md->wq = alloc_workqueue("kdmflush/%s", WQ_MEM_RECLAIM, 0, md->name); + md->wq = alloc_workqueue("kdmflush/%s", WQ_MEM_RECLAIM | WQ_PERCPU, 0, + md->name); if (!md->wq) goto bad; diff --git a/drivers/md/md.c b/drivers/md/md.c index e5922a682953..a8c75cb92952 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -10454,11 +10454,11 @@ static int __init md_init(void) goto err_bitmap; ret = -ENOMEM; - md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); + md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!md_wq) goto err_wq; - md_misc_wq = alloc_workqueue("md_misc", 0, 0); + md_misc_wq = alloc_workqueue("md_misc", WQ_PERCPU, 0); if (!md_misc_wq) goto err_misc_wq; From 569e785957d7d27224e39d92bff4d20ab6aab324 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 11 Jan 2026 12:25:30 -0800 Subject: [PATCH 19/35] dm-verity: consolidate the BH and normal work structs Since each dm_verity_io is never on both the BH and 
normal workqueues at the same time, there's no need for two different work_structs. Replace the 'bh_work' and 'work' fields with just 'work'. Note: this is correct even though it means 'work' may be reused while verity_bh_work() is running. The workqueue API allows work functions to reuse or free their work_struct, and many workqueue users rely on that. Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-target.c | 8 ++++---- drivers/md/dm-verity.h | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 91fb465274ba..e28e84562afb 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -651,7 +651,7 @@ static void verity_work(struct work_struct *w) static void verity_bh_work(struct work_struct *w) { - struct dm_verity_io *io = container_of(w, struct dm_verity_io, bh_work); + struct dm_verity_io *io = container_of(w, struct dm_verity_io, work); int err; io->in_bh = true; @@ -690,10 +690,10 @@ static void verity_end_io(struct bio *bio) if (static_branch_unlikely(&use_bh_wq_enabled) && io->v->use_bh_wq && verity_use_bh(bytes, ioprio)) { if (in_hardirq() || irqs_disabled()) { - INIT_WORK(&io->bh_work, verity_bh_work); - queue_work(system_bh_wq, &io->bh_work); + INIT_WORK(&io->work, verity_bh_work); + queue_work(system_bh_wq, &io->work); } else { - verity_bh_work(&io->bh_work); + verity_bh_work(&io->work); } } else { INIT_WORK(&io->work, verity_work); diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 4ad7ce3dae0a..d6bfabb27113 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -109,7 +109,6 @@ struct dm_verity_io { #endif struct work_struct work; - struct work_struct bh_work; u8 tmp_digest[HASH_MAX_DIGESTSIZE]; From 17c0e16069765b319debba264c978c9a5c106e08 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 11 Jan 2026 12:27:49 -0800 Subject: [PATCH 20/35] dm-verity: switch to 
bio_advance_iter_single() dm-verity doesn't support data blocks that span pages, and it sets dma_alignment accordingly. As such, instead of using bio_advance_iter(), it can use the more lightweight function bio_advance_iter_single() to get the same result. Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-target.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index e28e84562afb..bb86145bed12 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -528,7 +528,7 @@ static int verity_verify_io(struct dm_verity_io *io) iter = &io->iter; for (b = 0; b < io->n_blocks; - b++, bio_advance_iter(bio, iter, block_size)) { + b++, bio_advance_iter_single(bio, iter, block_size)) { sector_t blkno = io->block + b; struct pending_block *block; bool is_zero; From c84e21a89b77731d69d27c74e92f99b39a5a54ef Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 11 Jan 2026 12:26:42 -0800 Subject: [PATCH 21/35] dm-verity: fix up various workqueue-related comments Replace obsolete mentions of "tasklets" with "softirq context", and "workqueue" with "kworker". This reflects the fact that the implementation of the "try_verify_in_tasklet" dm-verity option now accesses softirq context using either the BH workqueue API or inline execution, not the tasklet API. The old names conflated the API with the intended execution context, so they became outdated when the APIs changed. 
Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-target.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index bb86145bed12..a78b290c2e41 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -254,9 +254,9 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, data = dm_bufio_get(v->bufio, hash_block, &buf); if (IS_ERR_OR_NULL(data)) { /* - * In tasklet and the hash was not in the bufio cache. - * Return early and resume execution from a work-queue - * to read the hash from disk. + * In softirq and the hash was not in the bufio cache. + * Return early and resume execution from a kworker to + * read the hash from disk. */ return -EAGAIN; } @@ -303,7 +303,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, else if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { /* * Error handling code (FEC included) cannot be run in a - * tasklet since it may sleep, so fallback to work-queue. + * softirq since it may sleep, so fallback to a kworker. */ r = -EAGAIN; goto release_ret_r; @@ -425,8 +425,8 @@ static int verity_handle_data_hash_mismatch(struct dm_verity *v, if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { /* - * Error handling code (FEC included) cannot be run in the - * BH workqueue, so fallback to a standard workqueue. + * Error handling code (FEC included) cannot be run in a + * softirq since it may sleep, so fallback to a kworker. */ return -EAGAIN; } @@ -519,8 +519,8 @@ static int verity_verify_io(struct dm_verity_io *io) if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { /* - * Copy the iterator in case we need to restart - * verification in a work-queue. + * Copy the iterator in case we need to restart verification in + * a kworker. 
*/ iter_copy = io->iter; iter = &iter_copy; @@ -657,7 +657,7 @@ static void verity_bh_work(struct work_struct *w) io->in_bh = true; err = verity_verify_io(io); if (err == -EAGAIN || err == -ENOMEM) { - /* fallback to retrying with work-queue */ + /* fallback to retrying in a kworker */ INIT_WORK(&io->work, verity_work); queue_work(io->v->verify_wq, &io->work); return; @@ -1644,7 +1644,7 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) * reducing wait times when reading from a dm-verity device. * * Also as required for the "try_verify_in_tasklet" feature: WQ_HIGHPRI - * allows verify_wq to preempt softirq since verification in BH workqueue + * allows verify_wq to preempt softirq since verification in softirq * will fall-back to using it for error handling (or if the bufio cache * doesn't have required hashes). */ From fb8a6c18fb9a6561f7a15b58b272442b77a242dd Mon Sep 17 00:00:00 2001 From: Michael Liang Date: Fri, 9 Jan 2026 15:52:54 -0700 Subject: [PATCH 22/35] dm: clear cloned request bio pointer when last clone bio completes Stale rq->bio values have been observed to cause double-initialization of cloned bios in request-based device-mapper targets, leading to use-after-free and double-free scenarios. One such case occurs when using dm-multipath on top of a PCIe NVMe namespace, where cloned request bios are freed during blk_complete_request(), but rq->bio is left intact. Subsequent clone teardown then attempts to free the same bios again via blk_rq_unprep_clone(). The resulting double-free path looks like: nvme_pci_complete_batch() nvme_complete_batch() blk_mq_end_request_batch() blk_complete_request() // called on a DM clone request bio_endio() // first free of all clone bios ... 
rq->end_io() // end_clone_request() dm_complete_request(tio->orig) dm_softirq_done() dm_done() dm_end_request() blk_rq_unprep_clone() // second free of clone bios Fix this by clearing the clone request's bio pointer when the last cloned bio completes, ensuring that later teardown paths do not attempt to free already-released bios. Signed-off-by: Michael Liang Reviewed-by: Mohamed Khalfella Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org --- drivers/md/dm-rq.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 5e0854669614..923252fb57ae 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -109,14 +109,21 @@ static void end_clone_bio(struct bio *clone) */ tio->completed += nr_bytes; + if (!is_last) + return; + /* + * At this moment we know this is the last bio of the cloned request, + * and all cloned bios have been released, so reset the clone request's + * bio pointer to avoid double free. + */ + tio->clone->bio = NULL; + exit: /* * Update the original request. * Do not use blk_mq_end_request() here, because it may complete * the original request before the clone, and break the ordering. */ - if (is_last) - exit: - blk_update_request(tio->orig, BLK_STS_OK, tio->completed); + blk_update_request(tio->orig, BLK_STS_OK, tio->completed); } static struct dm_rq_target_io *tio_from_request(struct request *rq) From 033724b1c627885aed049f775e4b10583d895af6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 16 Jan 2026 15:30:34 +0100 Subject: [PATCH 23/35] dm-verity: add dm-verity keyring Add a dedicated ".dm-verity" keyring for root hash signature verification, similar to the ".fs-verity" keyring used by fs-verity. By default the keyring is unused retaining the exact same old behavior. For systems that provision additional keys only intended for dm-verity images during boot, the dm_verity.keyring_unsealed=1 kernel parameter leaves the keyring open. 
We want to use this in systemd as a way to add keys during boot that are only used for creating dm-verity devices for later mounting and nothing else. The discoverable disk image (DDI) spec at [1] heavily relies on dm-verity and we would like to expand this even more. This will allow us to do that in a fully backward compatible way. Once provisioning is complete, userspace restricts and activates it for dm-verity verification. If userspace fully seals the keyring then it gains the guarantee that no new keys can be added. Link: https://uapi-group.org/specifications/specs/discoverable_partitions_specification [1] Co-developed-by: Aleksa Sarai Signed-off-by: Aleksa Sarai Signed-off-by: Christian Brauner Signed-off-by: Mikulas Patocka --- .../admin-guide/kernel-parameters.txt | 7 +++ drivers/md/dm-verity-target.c | 26 ++++++++++- drivers/md/dm-verity-verify-sig.c | 45 +++++++++++++++++++ drivers/md/dm-verity-verify-sig.h | 12 +++++ 4 files changed, 89 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a8d0afde7f85..374571c7921a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1370,6 +1370,13 @@ Kernel parameters For details see: Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst + dm_verity.keyring_unsealed= + [KNL] When set to 1, leave the dm-verity keyring + unsealed after initialization so userspace can + provision keys. Once the keyring is restricted + it becomes active and is searched during signature + verification. + driver_async_probe= [KNL] List of driver names to be probed asynchronously. * matches with all driver names. If * is specified, the
If * is specified, the diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index a78b290c2e41..631ccc6a2bb7 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -1802,7 +1802,31 @@ static struct target_type verity_target = { .preresume = verity_preresume, #endif /* CONFIG_SECURITY */ }; -module_dm(verity); + +static int __init dm_verity_init(void) +{ + int r; + + r = dm_verity_verify_sig_init(); + if (r) + return r; + + r = dm_register_target(&verity_target); + if (r) { + dm_verity_verify_sig_exit(); + return r; + } + + return 0; +} +module_init(dm_verity_init); + +static void __exit dm_verity_exit(void) +{ + dm_unregister_target(&verity_target); + dm_verity_verify_sig_exit(); +} +module_exit(dm_verity_exit); /* * Check whether a DM target is a verity target. diff --git a/drivers/md/dm-verity-verify-sig.c b/drivers/md/dm-verity-verify-sig.c index d5261a0e4232..2a2abd9864c9 100644 --- a/drivers/md/dm-verity-verify-sig.c +++ b/drivers/md/dm-verity-verify-sig.c @@ -7,6 +7,7 @@ */ #include #include +#include #include #include #include "dm-verity.h" @@ -14,6 +15,12 @@ #define DM_VERITY_VERIFY_ERR(s) DM_VERITY_ROOT_HASH_VERIFICATION " " s +static struct key *dm_verity_keyring; + +static bool dm_verity_keyring_unsealed __ro_after_init; +module_param_named(keyring_unsealed, dm_verity_keyring_unsealed, bool, 0444); +MODULE_PARM_DESC(keyring_unsealed, "Leave the dm-verity keyring unsealed"); + static bool require_signatures; module_param(require_signatures, bool, 0444); MODULE_PARM_DESC(require_signatures, @@ -143,6 +150,17 @@ int verity_verify_root_hash(const void *root_hash, size_t root_hash_len, VERIFYING_UNSPECIFIED_SIGNATURE, NULL, NULL); #endif + if (ret != -ENOKEY && ret != -EKEYREJECTED) + return ret; + + if (dm_verity_keyring->keys.nr_leaves_on_tree && + dm_verity_keyring->restrict_link) + ret = verify_pkcs7_signature(root_hash, root_hash_len, + sig_data, sig_len, + dm_verity_keyring, + 
VERIFYING_UNSPECIFIED_SIGNATURE, + NULL, NULL); + return ret; } @@ -152,3 +170,30 @@ void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts) sig_opts->sig = NULL; sig_opts->sig_size = 0; } + +int __init dm_verity_verify_sig_init(void) +{ + dm_verity_keyring = keyring_alloc(".dm-verity", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + current_cred(), + KEY_POS_SEARCH | + KEY_USR_VIEW | KEY_USR_READ | + KEY_USR_WRITE | KEY_USR_SEARCH | + KEY_USR_SETATTR, + KEY_ALLOC_NOT_IN_QUOTA, + NULL, NULL); + if (IS_ERR(dm_verity_keyring)) + panic("dm-verity can't allocate keyring\n"); + + if (!dm_verity_keyring_unsealed && + keyring_restrict(make_key_ref(dm_verity_keyring, true), NULL, NULL)) + panic("dm-verity can't seal keyring\n"); + + return 0; +} + +void __exit dm_verity_verify_sig_exit(void) +{ + key_revoke(dm_verity_keyring); + key_put(dm_verity_keyring); +} diff --git a/drivers/md/dm-verity-verify-sig.h b/drivers/md/dm-verity-verify-sig.h index f36ea92127bf..b0bb0d427244 100644 --- a/drivers/md/dm-verity-verify-sig.h +++ b/drivers/md/dm-verity-verify-sig.h @@ -30,6 +30,9 @@ int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts); +int __init dm_verity_verify_sig_init(void); +void __exit dm_verity_verify_sig_exit(void); + #else #define DM_VERITY_ROOT_HASH_VERIFICATION_OPTS 0 @@ -56,5 +59,14 @@ static inline void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig { } +static inline int dm_verity_verify_sig_init(void) +{ + return 0; +} + +static inline void dm_verity_verify_sig_exit(void) +{ +} + #endif /* CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG */ #endif /* DM_VERITY_SIG_VERIFICATION_H */ From f93bc869825fdba3632ff6ddece4906a6673e679 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 16 Jan 2026 15:30:35 +0100 Subject: [PATCH 24/35] selftests: add dm-verity keyring selftests Add selftests that verify the keyring behaves correctly. 
For simplicity this works with dm-verity as a module. Signed-off-by: Christian Brauner Signed-off-by: Mikulas Patocka --- tools/testing/selftests/dm-verity/Makefile | 5 + tools/testing/selftests/dm-verity/config | 10 + .../dm-verity/test-dm-verity-keyring.sh | 873 ++++++++++++++++++ 3 files changed, 888 insertions(+) create mode 100644 tools/testing/selftests/dm-verity/Makefile create mode 100644 tools/testing/selftests/dm-verity/config create mode 100755 tools/testing/selftests/dm-verity/test-dm-verity-keyring.sh diff --git a/tools/testing/selftests/dm-verity/Makefile b/tools/testing/selftests/dm-verity/Makefile new file mode 100644 index 000000000000..b75ee08a54af --- /dev/null +++ b/tools/testing/selftests/dm-verity/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_PROGS := test-dm-verity-keyring.sh + +include ../lib.mk diff --git a/tools/testing/selftests/dm-verity/config b/tools/testing/selftests/dm-verity/config new file mode 100644 index 000000000000..1cd3712fa0a4 --- /dev/null +++ b/tools/testing/selftests/dm-verity/config @@ -0,0 +1,10 @@ +CONFIG_BLK_DEV_DM=y +CONFIG_DM_VERITY=m +CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y +CONFIG_MODULE_UNLOAD=y +CONFIG_KEYS=y +CONFIG_ASYMMETRIC_KEY_TYPE=y +CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y +CONFIG_X509_CERTIFICATE_PARSER=y +CONFIG_PKCS7_MESSAGE_PARSER=y +CONFIG_SYSTEM_DATA_VERIFICATION=y diff --git a/tools/testing/selftests/dm-verity/test-dm-verity-keyring.sh b/tools/testing/selftests/dm-verity/test-dm-verity-keyring.sh new file mode 100755 index 000000000000..1f9601ef22f8 --- /dev/null +++ b/tools/testing/selftests/dm-verity/test-dm-verity-keyring.sh @@ -0,0 +1,873 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test script for dm-verity keyring functionality +# +# This script has two modes depending on kernel configuration: +# +# 1. 
keyring_unsealed=1 AND require_signatures=1: +# - Upload a test key to the .dm-verity keyring +# - Seal the keyring +# - Create a dm-verity device with a signed root hash +# - Verify signature verification works +# +# 2. keyring_unsealed=0 (default) OR require_signatures=0: +# - Verify the keyring is already sealed (if unsealed=0) +# - Verify keys cannot be added to a sealed keyring +# - Verify the keyring is inactive (not used for verification) +# +# Requirements: +# - Root privileges +# - openssl +# - veritysetup (cryptsetup) +# - keyctl (keyutils) + +set -e + +WORK_DIR="" +DATA_DEV="" +HASH_DEV="" +DM_NAME="verity-test-$$" +CLEANUP_DONE=0 + +# Module parameters (detected at runtime) +KEYRING_UNSEALED="" +REQUIRE_SIGNATURES="" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +log_info() { + echo -e "${GREEN}[INFO]${NC} $*" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $*" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $*" >&2 +} + +log_pass() { + echo -e "${GREEN}[PASS]${NC} $*" +} + +log_fail() { + echo -e "${RED}[FAIL]${NC} $*" >&2 +} + +log_skip() { + echo -e "${YELLOW}[SKIP]${NC} $*" +} + +cleanup() { + if [ "$CLEANUP_DONE" -eq 1 ]; then + return + fi + CLEANUP_DONE=1 + + log_info "Cleaning up..." 
+ + # Remove dm-verity device if it exists + if dmsetup info "$DM_NAME" &>/dev/null; then + dmsetup remove "$DM_NAME" 2>/dev/null || true + fi + + # Detach loop devices + if [ -n "$DATA_DEV" ] && [[ "$DATA_DEV" == /dev/loop* ]]; then + losetup -d "$DATA_DEV" 2>/dev/null || true + fi + if [ -n "$HASH_DEV" ] && [[ "$HASH_DEV" == /dev/loop* ]]; then + losetup -d "$HASH_DEV" 2>/dev/null || true + fi + + # Remove work directory + if [ -n "$WORK_DIR" ] && [ -d "$WORK_DIR" ]; then + rm -rf "$WORK_DIR" + fi +} + +trap cleanup EXIT + +die() { + log_error "$*" + exit 1 +} + +find_dm_verity_keyring() { + # The .dm-verity keyring is not linked to user-accessible keyrings, + # so we need to find it via /proc/keys + local serial_hex + serial_hex=$(awk '/\.dm-verity/ {print $1}' /proc/keys 2>/dev/null) + + if [ -z "$serial_hex" ]; then + return 1 + fi + + # Convert hex to decimal for keyctl + echo $((16#$serial_hex)) +} + +get_module_param() { + local param="$1" + local path="/sys/module/dm_verity/parameters/$param" + + if [ -f "$path" ]; then + cat "$path" + else + echo "" + fi +} + +check_requirements() { + log_info "Checking requirements..." + + # Check for root + if [ "$(id -u)" -ne 0 ]; then + die "This script must be run as root" + fi + + # Check for required tools + for cmd in openssl veritysetup keyctl losetup dmsetup dd awk; do + if ! command -v "$cmd" &>/dev/null; then + die "Required command not found: $cmd" + fi + done + + # Check for dm-verity module + if ! 
modprobe -n dm-verity &>/dev/null; then + die "dm-verity module not available" + fi + + # Verify OpenSSL can create signatures + # OpenSSL cms -sign with -binary -outform DER creates detached signatures by default + log_info "Using OpenSSL for PKCS#7 signatures" +} + +load_dm_verity_module() { + local keyring_unsealed="${1:-0}" + local require_signatures="${2:-0}" + + log_info "Loading dm-verity module with keyring_unsealed=$keyring_unsealed require_signatures=$require_signatures" + + # Unload if already loaded + if lsmod | grep -q '^dm_verity'; then + log_info "Unloading existing dm-verity module..." + modprobe -r dm-verity 2>/dev/null || \ + die "Failed to unload dm-verity module (may be in use)" + sleep 1 + fi + + # Load with specified parameters + modprobe dm-verity keyring_unsealed="$keyring_unsealed" require_signatures="$require_signatures" || \ + die "Failed to load dm-verity module" + + # Wait for keyring to be created (poll with timeout) + local keyring_id="" + local timeout=50 # 5 seconds (50 * 0.1s) + while [ $timeout -gt 0 ]; do + keyring_id=$(find_dm_verity_keyring) && break + sleep 0.1 + timeout=$((timeout - 1)) + done + + if [ -z "$keyring_id" ]; then + die "dm-verity keyring not found after module load (timeout)" + fi + + log_info "Found .dm-verity keyring: $keyring_id" + echo "$keyring_id" > "$WORK_DIR/keyring_id" + + # Read and display module parameters + KEYRING_UNSEALED=$(get_module_param "keyring_unsealed") + REQUIRE_SIGNATURES=$(get_module_param "require_signatures") + + log_info "Module parameters:" + log_info " keyring_unsealed=$KEYRING_UNSEALED" + log_info " require_signatures=$REQUIRE_SIGNATURES" +} + +unload_dm_verity_module() { + log_info "Unloading dm-verity module..." 
+ + # Clean up any dm-verity devices first + local dm_dev + while read -r dm_dev _; do + [ -n "$dm_dev" ] || continue + log_info "Removing dm-verity device: $dm_dev" + dmsetup remove "$dm_dev" 2>/dev/null || true + done < <(dmsetup ls --target verity 2>/dev/null) + + if lsmod | grep -q '^dm_verity'; then + modprobe -r dm-verity 2>/dev/null || \ + log_warn "Failed to unload dm-verity module" + sleep 1 + fi +} + +generate_keys() { + log_info "Generating signing key pair..." + + # Generate private key (2048-bit for faster test execution) + openssl genrsa -out "$WORK_DIR/private.pem" 2048 2>/dev/null + + # Create OpenSSL config for certificate extensions + # The kernel requires digitalSignature key usage for signature verification + # Both subjectKeyIdentifier and authorityKeyIdentifier are needed for + # the kernel to match keys in the keyring (especially for self-signed certs) + cat > "$WORK_DIR/openssl.cnf" << 'EOF' +[req] +distinguished_name = req_distinguished_name +x509_extensions = v3_ca +prompt = no + +[req_distinguished_name] +CN = dm-verity-test-key + +[v3_ca] +basicConstraints = critical,CA:FALSE +keyUsage = digitalSignature +subjectKeyIdentifier = hash +authorityKeyIdentifier = keyid +EOF + + # Generate self-signed certificate with proper extensions + openssl req -new -x509 -key "$WORK_DIR/private.pem" \ + -out "$WORK_DIR/cert.pem" -days 365 \ + -config "$WORK_DIR/openssl.cnf" 2>/dev/null + + # Convert certificate to DER format for kernel + openssl x509 -in "$WORK_DIR/cert.pem" -outform DER \ + -out "$WORK_DIR/cert.der" + + # Show certificate info for debugging + log_info "Certificate details:" + openssl x509 -in "$WORK_DIR/cert.pem" -noout -text 2>/dev/null | \ + grep -E "Subject:|Issuer:|Key Usage|Extended" | head -10 + + log_info "Keys generated successfully" +} + +seal_keyring() { + log_info "Sealing the .dm-verity keyring..." 
+ + local keyring_id + keyring_id=$(cat "$WORK_DIR/keyring_id") + + keyctl restrict_keyring "$keyring_id" || \ + die "Failed to seal keyring" + + log_info "Keyring sealed successfully" +} + +create_test_device() { + log_info "Creating test device images..." + + # Create data image with random content (8MB is sufficient for testing) + dd if=/dev/urandom of="$WORK_DIR/data.img" bs=1M count=8 status=none + + # Create hash image (will be populated by veritysetup) + dd if=/dev/zero of="$WORK_DIR/hash.img" bs=1M count=1 status=none + + # Setup loop devices + DATA_DEV=$(losetup --find --show "$WORK_DIR/data.img") + HASH_DEV=$(losetup --find --show "$WORK_DIR/hash.img") + + log_info "Data device: $DATA_DEV" + log_info "Hash device: $HASH_DEV" +} + +create_verity_hash() { + log_info "Creating dm-verity hash tree..." + + local root_hash output + output=$(veritysetup format "$DATA_DEV" "$HASH_DEV" 2>&1) + root_hash=$(echo "$output" | grep "Root hash:" | awk '{print $3}') + + if [ -z "$root_hash" ]; then + log_error "veritysetup format output:" + echo "$output" | sed 's/^/ /' + die "Failed to get root hash from veritysetup format" + fi + + echo "$root_hash" > "$WORK_DIR/root_hash" + log_info "Root hash: $root_hash" +} + +create_detached_signature() { + local infile="$1" + local outfile="$2" + local cert="$3" + local key="$4" + + # Use openssl smime (not cms) for PKCS#7 signatures compatible with kernel + # Flags from working veritysetup example: + # -nocerts: don't include certificate in signature + # -noattr: no signed attributes + # -binary: binary input mode + if openssl smime -sign -nocerts -noattr -binary \ + -in "$infile" \ + -inkey "$key" \ + -signer "$cert" \ + -outform der \ + -out "$outfile" 2>/dev/null; then + return 0 + fi + + log_error "Failed to create signature" + return 1 +} + +activate_verity_device() { + local with_sig="$1" + local root_hash + root_hash=$(cat "$WORK_DIR/root_hash") + + # Clear dmesg and capture any kernel messages during activation + dmesg -C 
2>/dev/null || true + + if [ "$with_sig" = "yes" ]; then + log_info "Activating dm-verity device with signature..." + veritysetup open "$DATA_DEV" "$DM_NAME" "$HASH_DEV" "$root_hash" \ + --root-hash-signature="$WORK_DIR/root_hash.p7s" 2>&1 + local ret=$? + else + log_info "Activating dm-verity device without signature..." + veritysetup open "$DATA_DEV" "$DM_NAME" "$HASH_DEV" "$root_hash" 2>&1 + local ret=$? + fi + + # Show relevant kernel messages + local kmsg + kmsg=$(dmesg 2>/dev/null | grep -i -E 'verity|pkcs|signature|asymmetric|key' | tail -10) + if [ -n "$kmsg" ]; then + log_info "Kernel messages:" + echo "$kmsg" | while read -r line; do echo " $line"; done + fi + + return $ret +} + +deactivate_verity_device() { + if dmsetup info "$DM_NAME" &>/dev/null; then + dmsetup remove "$DM_NAME" 2>/dev/null || true + fi +} + +show_keyring_status() { + log_info "Keyring status:" + + local keyring_id + keyring_id=$(find_dm_verity_keyring) || true + + if [ -n "$keyring_id" ]; then + echo " Keyring ID: $keyring_id" + keyctl show "$keyring_id" 2>/dev/null || true + grep '\.dm-verity' /proc/keys 2>/dev/null || true + fi +} + +list_keyring_keys() { + log_info "Keys in .dm-verity keyring:" + + local keyring_id + keyring_id=$(cat "$WORK_DIR/keyring_id" 2>/dev/null) || \ + keyring_id=$(find_dm_verity_keyring) || true + + if [ -z "$keyring_id" ]; then + log_warn "Could not find keyring" + return + fi + + # List all keys in the keyring + local keys + keys=$(keyctl list "$keyring_id" 2>/dev/null) + if [ -z "$keys" ] || [ "$keys" = "keyring is empty" ]; then + echo " (empty)" + else + echo "$keys" | while read -r line; do + echo " $line" + done + + # Show detailed info for each key + log_info "Key details:" + keyctl list "$keyring_id" 2>/dev/null | awk '{print $1}' | grep -E '^[0-9]+$' | while read -r key_id; do + echo " Key $key_id:" + keyctl describe "$key_id" 2>/dev/null | sed 's/^/ /' + done + fi +} + +generate_named_key() { + local name="$1" + local 
key_dir="$WORK_DIR/keys/$name" + + mkdir -p "$key_dir" + + # Log to stderr so it doesn't interfere with return value + echo "[INFO] Generating key pair: $name" >&2 + + # Generate private key + openssl genrsa -out "$key_dir/private.pem" 2048 2>/dev/null + + # Create OpenSSL config for certificate extensions + # Both subjectKeyIdentifier and authorityKeyIdentifier are needed for + # the kernel to match keys in the keyring (especially for self-signed certs) + cat > "$key_dir/openssl.cnf" << EOF +[req] +distinguished_name = req_distinguished_name +x509_extensions = v3_ca +prompt = no + +[req_distinguished_name] +CN = dm-verity-test-$name + +[v3_ca] +basicConstraints = critical,CA:FALSE +keyUsage = digitalSignature +subjectKeyIdentifier = hash +authorityKeyIdentifier = keyid +EOF + + # Generate self-signed certificate with proper extensions + openssl req -new -x509 -key "$key_dir/private.pem" \ + -out "$key_dir/cert.pem" -days 365 \ + -config "$key_dir/openssl.cnf" 2>/dev/null + + # Convert certificate to DER format for kernel + openssl x509 -in "$key_dir/cert.pem" -outform DER \ + -out "$key_dir/cert.der" + + # Return the key directory path (only this goes to stdout) + echo "$key_dir" +} + +upload_named_key() { + local name="$1" + local key_dir="$2" + + local keyring_id + keyring_id=$(cat "$WORK_DIR/keyring_id") + + log_info "Uploading key '$name' to keyring..." 
+ + local key_id + if key_id=$(keyctl padd asymmetric "$name" "$keyring_id" \ + < "$key_dir/cert.der" 2>&1); then + log_info "Key '$name' uploaded with ID: $key_id" + echo "$key_id" > "$key_dir/key_id" + return 0 + else + log_error "Failed to upload key '$name': $key_id" + return 1 + fi +} + +# +# Test: Verify sealed keyring rejects key additions +# +test_sealed_keyring_rejects_keys() { + log_info "TEST: Verify sealed keyring rejects key additions" + + local keyring_id + keyring_id=$(cat "$WORK_DIR/keyring_id") + + generate_keys + + # Try to add a key - should fail + if keyctl padd asymmetric "dm-verity-test" "$keyring_id" \ + < "$WORK_DIR/cert.der" 2>/dev/null; then + log_fail "Key addition should have been rejected on sealed keyring" + return 1 + else + log_pass "Sealed keyring correctly rejected key addition" + return 0 + fi +} + +# +# Test: Multiple keys in keyring +# +test_multiple_keys() { + log_info "TEST: Multiple keys in keyring" + + local key1_dir key2_dir key3_dir + + # Generate three different keys + key1_dir=$(generate_named_key "vendor-a") + key2_dir=$(generate_named_key "vendor-b") + key3_dir=$(generate_named_key "vendor-c") + + # Upload all three keys + upload_named_key "vendor-a" "$key1_dir" || return 1 + upload_named_key "vendor-b" "$key2_dir" || return 1 + upload_named_key "vendor-c" "$key3_dir" || return 1 + + log_info "" + log_info "Keys in keyring before sealing:" + list_keyring_keys + show_keyring_status + + # Seal the keyring + log_info "" + seal_keyring + + # List keys after sealing + log_info "" + log_info "Keys in keyring after sealing:" + list_keyring_keys + show_keyring_status + + log_pass "Key upload and keyring sealing succeeded" + + # Create test device + log_info "" + create_test_device + create_verity_hash + + # Test 1: Sign with key1, should verify successfully + log_info "" + log_info "Sub-test: Verify with vendor-a key" + if ! 
sign_root_hash_with_key "$key1_dir"; then + log_fail "Failed to sign with vendor-a key" + return 1 + fi + if activate_verity_device "yes"; then + log_pass "Verification with vendor-a key succeeded" + deactivate_verity_device + else + log_fail "Verification with vendor-a key should succeed" + return 1 + fi + + # Test 2: Sign with key2, should also verify successfully + log_info "" + log_info "Sub-test: Verify with vendor-b key" + if ! sign_root_hash_with_key "$key2_dir"; then + log_fail "Failed to sign with vendor-b key" + return 1 + fi + if activate_verity_device "yes"; then + log_pass "Verification with vendor-b key succeeded" + deactivate_verity_device + else + log_fail "Verification with vendor-b key should succeed" + return 1 + fi + + # Test 3: Sign with key3, should also verify successfully + log_info "" + log_info "Sub-test: Verify with vendor-c key" + if ! sign_root_hash_with_key "$key3_dir"; then + log_fail "Failed to sign with vendor-c key" + return 1 + fi + if activate_verity_device "yes"; then + log_pass "Verification with vendor-c key succeeded" + deactivate_verity_device + else + log_fail "Verification with vendor-c key should succeed" + return 1 + fi + + # Test 4: Generate a key NOT in the keyring, should fail + log_info "" + log_info "Sub-test: Verify with unknown key (should fail)" + local unknown_key_dir + unknown_key_dir=$(generate_named_key "unknown-vendor") + if ! 
sign_root_hash_with_key "$unknown_key_dir"; then + log_fail "Failed to sign with unknown-vendor key" + return 1 + fi + if activate_verity_device "yes"; then + log_fail "Verification with unknown key should fail" + deactivate_verity_device + return 1 + else + log_pass "Verification with unknown key correctly rejected" + fi + + log_info "" + log_pass "Multiple keys test completed successfully" + return 0 +} + +sign_root_hash_with_key() { + local key_dir="$1" + + local root_hash + root_hash=$(cat "$WORK_DIR/root_hash") + + # Create the data to sign (hex string, not binary) + echo -n "$root_hash" > "$WORK_DIR/root_hash.txt" + + # Debug: show exactly what we're signing + log_info "Root hash (hex): $root_hash" + log_info "Root hash hex string size: $(wc -c < "$WORK_DIR/root_hash.txt") bytes" + + # Create detached PKCS#7 signature + if ! create_detached_signature "$WORK_DIR/root_hash.txt" "$WORK_DIR/root_hash.p7s" \ + "$key_dir/cert.pem" "$key_dir/private.pem"; then + log_error "Failed to sign root hash with key from $key_dir" + return 1 + fi + + # Debug: show signing certificate info + log_info "Signed with certificate:" + openssl x509 -in "$key_dir/cert.pem" -noout -subject 2>/dev/null | sed 's/^/ /' + + # Debug: verify signature locally + # -nointern: cert not in signature, use -certfile + # -noverify: skip certificate chain validation (self-signed) + if openssl smime -verify -binary -inform der -nointern -noverify \ + -in "$WORK_DIR/root_hash.p7s" \ + -content "$WORK_DIR/root_hash.txt" \ + -certfile "$key_dir/cert.pem" \ + -out /dev/null 2>/dev/null; then + log_info "Local signature verification: PASSED" + else + log_warn "Local signature verification: FAILED" + fi + return 0 +} + +# +# Test: Verify corrupted signatures are rejected +# +test_corrupted_signature() { + log_info "TEST: Verify corrupted signatures are rejected" + + # This test requires a valid setup from test_multiple_keys or similar + # It modifies the signature file and verifies rejection + + if [ ! 
-f "$WORK_DIR/root_hash.p7s" ]; then + log_warn "No signature file found, skipping corrupted signature test" + return 0 + fi + + # Save original signature + cp "$WORK_DIR/root_hash.p7s" "$WORK_DIR/root_hash.p7s.orig" + + # Test 1: Truncated signature + log_info "Sub-test: Truncated signature (should fail)" + head -c 100 "$WORK_DIR/root_hash.p7s.orig" > "$WORK_DIR/root_hash.p7s" + if activate_verity_device "yes"; then + log_fail "Truncated signature should be rejected" + deactivate_verity_device + cp "$WORK_DIR/root_hash.p7s.orig" "$WORK_DIR/root_hash.p7s" + return 1 + else + log_pass "Truncated signature correctly rejected" + fi + + # Test 2: Corrupted signature (flip some bytes) + log_info "Sub-test: Corrupted signature bytes (should fail)" + cp "$WORK_DIR/root_hash.p7s.orig" "$WORK_DIR/root_hash.p7s" + # Corrupt bytes in the middle of the signature + local sig_size + sig_size=$(wc -c < "$WORK_DIR/root_hash.p7s") + local corrupt_offset=$((sig_size / 2)) + printf '\xff\xff\xff\xff' | dd of="$WORK_DIR/root_hash.p7s" bs=1 seek=$corrupt_offset conv=notrunc 2>/dev/null + if activate_verity_device "yes"; then + log_fail "Corrupted signature should be rejected" + deactivate_verity_device + cp "$WORK_DIR/root_hash.p7s.orig" "$WORK_DIR/root_hash.p7s" + return 1 + else + log_pass "Corrupted signature correctly rejected" + fi + + # Test 3: Signature over wrong data (sign different content) + log_info "Sub-test: Signature over wrong data (should fail)" + # Create a different root hash (all zeros as hex string) + printf '%064d' 0 > "$WORK_DIR/wrong_hash.txt" + # Get the first key directory that was used + local key_dir="$WORK_DIR/keys/vendor-a" + if [ -d "$key_dir" ]; then + create_detached_signature "$WORK_DIR/wrong_hash.txt" "$WORK_DIR/root_hash.p7s" \ + "$key_dir/cert.pem" "$key_dir/private.pem" + if activate_verity_device "yes"; then + log_fail "Signature over wrong data should be rejected" + deactivate_verity_device + cp "$WORK_DIR/root_hash.p7s.orig" 
"$WORK_DIR/root_hash.p7s" + return 1 + else + log_pass "Signature over wrong data correctly rejected" + fi + else + log_warn "Key directory not found, skipping wrong data test" + fi + + # Restore original signature + cp "$WORK_DIR/root_hash.p7s.orig" "$WORK_DIR/root_hash.p7s" + + log_pass "Corrupted signature test completed successfully" + return 0 +} + +# +# Test: Verify keyring is sealed when keyring_unsealed=0 +# +test_keyring_sealed_by_default() { + log_info "TEST: Verify keyring is sealed by default (keyring_unsealed=0)" + + local keyring_id + keyring_id=$(cat "$WORK_DIR/keyring_id") + + log_info "Current keyring state (should be empty and sealed):" + list_keyring_keys + show_keyring_status + + generate_keys + + # Try to add a key - should fail if keyring is sealed + log_info "Attempting to add key to sealed keyring..." + if keyctl padd asymmetric "dm-verity-test" "$keyring_id" \ + < "$WORK_DIR/cert.der" 2>/dev/null; then + log_fail "Keyring should be sealed when keyring_unsealed=0" + list_keyring_keys + return 1 + else + log_pass "Keyring is correctly sealed when keyring_unsealed=0" + log_info "Keyring state after failed add attempt:" + list_keyring_keys + return 0 + fi +} + +# +# Test: Verify dm-verity keyring is inactive when sealed empty +# +test_keyring_inactive_when_empty() { + log_info "TEST: Verify dm-verity keyring is inactive when sealed empty" + + # When keyring_unsealed=0, the keyring is sealed immediately while empty + # This means it should NOT be used for verification (nr_leaves_on_tree=0) + + log_info "Keyring state (should be empty and sealed):" + list_keyring_keys + show_keyring_status + + create_test_device + create_verity_hash + + # Without any keys in the dm-verity keyring, and with it sealed, + # verification should fall through to the secondary/platform keyrings + # and likely succeed (if require_signatures=0) or fail (if =1) + + log_info "Sub-test: Device activation with sealed empty keyring" + if [ "$REQUIRE_SIGNATURES" = "Y" ] || [ 
"$REQUIRE_SIGNATURES" = "1" ]; then + if activate_verity_device "no"; then + log_fail "Device should NOT activate without signature when require_signatures=1" + deactivate_verity_device + return 1 + else + log_pass "Device correctly rejected (require_signatures=1, no valid signature)" + fi + else + if activate_verity_device "no"; then + log_pass "Device activated (require_signatures=0, empty dm-verity keyring is inactive)" + deactivate_verity_device + else + log_fail "Device should activate when require_signatures=0" + return 1 + fi + fi + + return 0 +} + +main() { + local rc=0 + + log_info "=== dm-verity keyring test ===" + log_info "" + + # Create work directory + WORK_DIR=$(mktemp -d -t dm-verity-test.XXXXXX) + log_info "Work directory: $WORK_DIR" + + check_requirements + + # + # Test 1: UNSEALED keyring mode (keyring_unsealed=1) + # + log_info "" + log_info "========================================" + log_info "=== TEST MODE: UNSEALED KEYRING ===" + log_info "========================================" + log_info "" + + load_dm_verity_module 1 1 # keyring_unsealed=1, require_signatures=1 + show_keyring_status + + log_info "" + if ! test_multiple_keys; then + rc=1 + fi + + # After sealing, verify it rejects new keys + log_info "" + if ! test_sealed_keyring_rejects_keys; then + rc=1 + fi + + # Test corrupted signatures are rejected + log_info "" + if ! 
test_corrupted_signature; then + rc=1 + fi + + # Clean up devices before reloading module + deactivate_verity_device + if [ -n "$DATA_DEV" ] && [[ "$DATA_DEV" == /dev/loop* ]]; then + losetup -d "$DATA_DEV" 2>/dev/null || true + DATA_DEV="" + fi + if [ -n "$HASH_DEV" ] && [[ "$HASH_DEV" == /dev/loop* ]]; then + losetup -d "$HASH_DEV" 2>/dev/null || true + HASH_DEV="" + fi + + # + # Test 2: SEALED keyring mode (keyring_unsealed=0, default) + # + log_info "" + log_info "========================================" + log_info "=== TEST MODE: SEALED KEYRING (default) ===" + log_info "========================================" + log_info "" + + load_dm_verity_module 0 0 # keyring_unsealed=0, require_signatures=0 + show_keyring_status + + log_info "" + if ! test_keyring_sealed_by_default; then + rc=1 + fi + + log_info "" + if ! test_keyring_inactive_when_empty; then + rc=1 + fi + + # + # Summary + # + log_info "" + log_info "========================================" + if [ $rc -eq 0 ]; then + log_info "=== All tests PASSED ===" + else + log_error "=== Some tests FAILED ===" + fi + log_info "========================================" + + return $rc +} + +main "$@" From 1bf7ba4ca342ada012e7ef88274fb306e88917ad Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 11 Jan 2026 13:38:20 -0800 Subject: [PATCH 25/35] dm-bufio: merge cache_put() into cache_put_and_wake() Merge cache_put() into its only caller, cache_put_and_wake(). Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-bufio.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index f41f649c01d4..8a3f42bcbdcc 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -369,8 +369,8 @@ struct dm_buffer { * - IO * - Eviction or cache sizing. * - * cache_get() and cache_put() are threadsafe, you do not need to - * protect these calls with a surrounding mutex. 
All the other + * cache_get() and cache_put_and_wake() are threadsafe, you do not need + * to protect these calls with a surrounding mutex. All the other * methods are not threadsafe; they do use locking primitives, but * only enough to ensure get/put are threadsafe. */ @@ -619,24 +619,6 @@ static struct dm_buffer *cache_get(struct dm_buffer_cache *bc, sector_t block) /*--------------*/ -/* - * Returns true if the hold count hits zero. - * threadsafe - */ -static bool cache_put(struct dm_buffer_cache *bc, struct dm_buffer *b) -{ - bool r; - - cache_read_lock(bc, b->block); - BUG_ON(!atomic_read(&b->hold_count)); - r = atomic_dec_and_test(&b->hold_count); - cache_read_unlock(bc, b->block); - - return r; -} - -/*--------------*/ - typedef enum evict_result (*b_predicate)(struct dm_buffer *, void *); /* @@ -1745,12 +1727,18 @@ static void __check_watermark(struct dm_bufio_client *c, static void cache_put_and_wake(struct dm_bufio_client *c, struct dm_buffer *b) { + bool wake; + + cache_read_lock(&c->cache, b->block); + BUG_ON(!atomic_read(&b->hold_count)); + wake = atomic_dec_and_test(&b->hold_count); + cache_read_unlock(&c->cache, b->block); + /* * Relying on waitqueue_active() is racey, but we sleep * with schedule_timeout anyway. */ - if (cache_put(&c->cache, b) && - unlikely(waitqueue_active(&c->free_buffer_wait))) + if (wake && unlikely(waitqueue_active(&c->free_buffer_wait))) wake_up(&c->free_buffer_wait); } From be9badced98f89cf5c6f7690f7d9739a213c4502 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 11 Jan 2026 13:38:21 -0800 Subject: [PATCH 26/35] dm-bufio: avoid redundant buffer_tree lookups dm-bufio's map from block number to buffer is organized as a hash table of red-black trees. It does far more lookups in this hash table than necessary: typically one lookup to lock the tree, one lookup to search the tree, and one lookup to unlock the tree. Only one of those lookups is needed. Optimize it to do only the minimum number of lookups. 
This improves performance. It also reduces the object code size, considering that the redundant hash table lookups were being inlined. For example, the size of the text section of dm-bufio.o decreases from 15599 to 15070 bytes with gcc 15 and x86_64, or from 20652 to 20244 bytes with clang 21 and arm64. Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-bufio.c | 148 ++++++++++++++++++++++++++---------------- 1 file changed, 92 insertions(+), 56 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 8a3f42bcbdcc..60f7badec91f 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -401,36 +401,51 @@ static inline unsigned int cache_index(sector_t block, unsigned int num_locks) return dm_hash_locks_index(block, num_locks); } -static inline void cache_read_lock(struct dm_buffer_cache *bc, sector_t block) +/* Get the buffer tree in the cache for the given block. Doesn't lock it. */ +static inline struct buffer_tree *cache_get_tree(struct dm_buffer_cache *bc, + sector_t block) { - if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep) - read_lock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock); - else - down_read(&bc->trees[cache_index(block, bc->num_locks)].u.lock); + return &bc->trees[cache_index(block, bc->num_locks)]; } -static inline void cache_read_unlock(struct dm_buffer_cache *bc, sector_t block) +/* Lock the given buffer tree in the cache for reading. */ +static inline void cache_read_lock(struct dm_buffer_cache *bc, + struct buffer_tree *tree) { if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep) - read_unlock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock); + read_lock_bh(&tree->u.spinlock); else - up_read(&bc->trees[cache_index(block, bc->num_locks)].u.lock); + down_read(&tree->u.lock); } -static inline void cache_write_lock(struct dm_buffer_cache *bc, sector_t block) +/* Unlock the given buffer tree in the cache for reading. 
*/ +static inline void cache_read_unlock(struct dm_buffer_cache *bc, + struct buffer_tree *tree) { if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep) - write_lock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock); + read_unlock_bh(&tree->u.spinlock); else - down_write(&bc->trees[cache_index(block, bc->num_locks)].u.lock); + up_read(&tree->u.lock); } -static inline void cache_write_unlock(struct dm_buffer_cache *bc, sector_t block) +/* Lock the given buffer tree in the cache for writing. */ +static inline void cache_write_lock(struct dm_buffer_cache *bc, + struct buffer_tree *tree) { if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep) - write_unlock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock); + write_lock_bh(&tree->u.spinlock); else - up_write(&bc->trees[cache_index(block, bc->num_locks)].u.lock); + down_write(&tree->u.lock); +} + +/* Unlock the given buffer tree in the cache for writing. */ +static inline void cache_write_unlock(struct dm_buffer_cache *bc, + struct buffer_tree *tree) +{ + if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep) + write_unlock_bh(&tree->u.spinlock); + else + up_write(&tree->u.lock); } /* @@ -602,17 +617,19 @@ static void __cache_inc_buffer(struct dm_buffer *b) WRITE_ONCE(b->last_accessed, jiffies); } -static struct dm_buffer *cache_get(struct dm_buffer_cache *bc, sector_t block) +static struct dm_buffer *cache_get(struct dm_buffer_cache *bc, + struct buffer_tree *tree, sector_t block) { struct dm_buffer *b; - cache_read_lock(bc, block); - b = __cache_get(&bc->trees[cache_index(block, bc->num_locks)].root, block); + /* Assuming tree == cache_get_tree(bc, block) */ + cache_read_lock(bc, tree); + b = __cache_get(&tree->root, block); if (b) { lru_reference(&b->lru); __cache_inc_buffer(b); } - cache_read_unlock(bc, block); + cache_read_unlock(bc, tree); return b; } @@ -663,7 +680,7 @@ static struct dm_buffer *__cache_evict(struct dm_buffer_cache *bc, int list_mode b = 
le_to_buffer(le); /* __evict_pred will have locked the appropriate tree. */ - rb_erase(&b->node, &bc->trees[cache_index(b->block, bc->num_locks)].root); + rb_erase(&b->node, &cache_get_tree(bc, b->block)->root); return b; } @@ -686,15 +703,17 @@ static struct dm_buffer *cache_evict(struct dm_buffer_cache *bc, int list_mode, /* * Mark a buffer as clean or dirty. Not threadsafe. */ -static void cache_mark(struct dm_buffer_cache *bc, struct dm_buffer *b, int list_mode) +static void cache_mark(struct dm_buffer_cache *bc, struct buffer_tree *tree, + struct dm_buffer *b, int list_mode) { - cache_write_lock(bc, b->block); + /* Assuming tree == cache_get_tree(bc, b->block) */ + cache_write_lock(bc, tree); if (list_mode != b->list_mode) { lru_remove(&bc->lru[b->list_mode], &b->lru); b->list_mode = list_mode; lru_insert(&bc->lru[b->list_mode], &b->lru); } - cache_write_unlock(bc, b->block); + cache_write_unlock(bc, tree); } /*--------------*/ @@ -820,19 +839,21 @@ static bool __cache_insert(struct rb_root *root, struct dm_buffer *b) return true; } -static bool cache_insert(struct dm_buffer_cache *bc, struct dm_buffer *b) +static bool cache_insert(struct dm_buffer_cache *bc, struct buffer_tree *tree, + struct dm_buffer *b) { bool r; if (WARN_ON_ONCE(b->list_mode >= LIST_SIZE)) return false; - cache_write_lock(bc, b->block); + /* Assuming tree == cache_get_tree(bc, b->block) */ + cache_write_lock(bc, tree); BUG_ON(atomic_read(&b->hold_count) != 1); - r = __cache_insert(&bc->trees[cache_index(b->block, bc->num_locks)].root, b); + r = __cache_insert(&tree->root, b); if (r) lru_insert(&bc->lru[b->list_mode], &b->lru); - cache_write_unlock(bc, b->block); + cache_write_unlock(bc, tree); return r; } @@ -845,21 +866,23 @@ static bool cache_insert(struct dm_buffer_cache *bc, struct dm_buffer *b) * * Not threadsafe. 
*/ -static bool cache_remove(struct dm_buffer_cache *bc, struct dm_buffer *b) +static bool cache_remove(struct dm_buffer_cache *bc, struct buffer_tree *tree, + struct dm_buffer *b) { bool r; - cache_write_lock(bc, b->block); + /* Assuming tree == cache_get_tree(bc, b->block) */ + cache_write_lock(bc, tree); if (atomic_read(&b->hold_count) != 1) { r = false; } else { r = true; - rb_erase(&b->node, &bc->trees[cache_index(b->block, bc->num_locks)].root); + rb_erase(&b->node, &tree->root); lru_remove(&bc->lru[b->list_mode], &b->lru); } - cache_write_unlock(bc, b->block); + cache_write_unlock(bc, tree); return r; } @@ -1725,14 +1748,16 @@ static void __check_watermark(struct dm_bufio_client *c, *-------------------------------------------------------------- */ -static void cache_put_and_wake(struct dm_bufio_client *c, struct dm_buffer *b) +static void cache_put_and_wake(struct dm_bufio_client *c, + struct buffer_tree *tree, struct dm_buffer *b) { bool wake; - cache_read_lock(&c->cache, b->block); + /* Assuming tree == cache_get_tree(&c->cache, b->block) */ + cache_read_lock(&c->cache, tree); BUG_ON(!atomic_read(&b->hold_count)); wake = atomic_dec_and_test(&b->hold_count); - cache_read_unlock(&c->cache, b->block); + cache_read_unlock(&c->cache, tree); /* * Relying on waitqueue_active() is racey, but we sleep @@ -1746,7 +1771,8 @@ static void cache_put_and_wake(struct dm_bufio_client *c, struct dm_buffer *b) * This assumes you have already checked the cache to see if the buffer * is already present (it will recheck after dropping the lock for allocation). 
*/ -static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, +static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, + struct buffer_tree *tree, sector_t block, enum new_flag nf, int *need_submit, struct list_head *write_list) { @@ -1766,7 +1792,7 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, * We've had a period where the mutex was unlocked, so need to * recheck the buffer tree. */ - b = cache_get(&c->cache, block); + b = cache_get(&c->cache, tree, block); if (b) { __free_buffer_wake(new_b); goto found_buffer; @@ -1794,13 +1820,13 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, * is set. Otherwise another thread could get it and use * it before it had been read. */ - cache_insert(&c->cache, b); + cache_insert(&c->cache, tree, b); return b; found_buffer: if (nf == NF_PREFETCH) { - cache_put_and_wake(c, b); + cache_put_and_wake(c, tree, b); return NULL; } @@ -1812,7 +1838,7 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, * the same buffer, it would deadlock if we waited. */ if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) { - cache_put_and_wake(c, b); + cache_put_and_wake(c, tree, b); return NULL; } @@ -1846,6 +1872,7 @@ static void *new_read(struct dm_bufio_client *c, sector_t block, enum new_flag nf, struct dm_buffer **bp, unsigned short ioprio) { + struct buffer_tree *tree; int need_submit = 0; struct dm_buffer *b; @@ -1857,10 +1884,11 @@ static void *new_read(struct dm_bufio_client *c, sector_t block, * Fast path, hopefully the block is already in the cache. No need * to get the client lock for this. 
*/ - b = cache_get(&c->cache, block); + tree = cache_get_tree(&c->cache, block); + b = cache_get(&c->cache, tree, block); if (b) { if (nf == NF_PREFETCH) { - cache_put_and_wake(c, b); + cache_put_and_wake(c, tree, b); return NULL; } @@ -1872,7 +1900,7 @@ static void *new_read(struct dm_bufio_client *c, sector_t block, * the same buffer, it would deadlock if we waited. */ if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) { - cache_put_and_wake(c, b); + cache_put_and_wake(c, tree, b); return NULL; } } @@ -1882,7 +1910,7 @@ static void *new_read(struct dm_bufio_client *c, sector_t block, return NULL; dm_bufio_lock(c); - b = __bufio_new(c, block, nf, &need_submit, &write_list); + b = __bufio_new(c, tree, block, nf, &need_submit, &write_list); dm_bufio_unlock(c); } @@ -1969,18 +1997,20 @@ static void __dm_bufio_prefetch(struct dm_bufio_client *c, blk_start_plug(&plug); for (; n_blocks--; block++) { - int need_submit; + struct buffer_tree *tree; struct dm_buffer *b; + int need_submit; - b = cache_get(&c->cache, block); + tree = cache_get_tree(&c->cache, block); + b = cache_get(&c->cache, tree, block); if (b) { /* already in cache */ - cache_put_and_wake(c, b); + cache_put_and_wake(c, tree, b); continue; } dm_bufio_lock(c); - b = __bufio_new(c, block, NF_PREFETCH, &need_submit, + b = __bufio_new(c, tree, block, NF_PREFETCH, &need_submit, &write_list); if (unlikely(!list_empty(&write_list))) { dm_bufio_unlock(c); @@ -2025,6 +2055,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_prefetch_with_ioprio); void dm_bufio_release(struct dm_buffer *b) { struct dm_bufio_client *c = b->c; + struct buffer_tree *tree = cache_get_tree(&c->cache, b->block); /* * If there were errors on the buffer, and the buffer is not @@ -2038,7 +2069,7 @@ void dm_bufio_release(struct dm_buffer *b) dm_bufio_lock(c); /* cache remove can fail if there are other holders */ - if (cache_remove(&c->cache, b)) { + if (cache_remove(&c->cache, tree, b)) { __free_buffer_wake(b); dm_bufio_unlock(c); return; @@ 
-2047,7 +2078,7 @@ void dm_bufio_release(struct dm_buffer *b) dm_bufio_unlock(c); } - cache_put_and_wake(c, b); + cache_put_and_wake(c, tree, b); } EXPORT_SYMBOL_GPL(dm_bufio_release); @@ -2066,7 +2097,8 @@ void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b, if (!test_and_set_bit(B_DIRTY, &b->state)) { b->dirty_start = start; b->dirty_end = end; - cache_mark(&c->cache, b, LIST_DIRTY); + cache_mark(&c->cache, cache_get_tree(&c->cache, b->block), b, + LIST_DIRTY); } else { if (start < b->dirty_start) b->dirty_start = start; @@ -2131,6 +2163,7 @@ int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c) lru_iter_begin(&c->cache.lru[LIST_DIRTY], &it); while ((e = lru_iter_next(&it, is_writing, c))) { struct dm_buffer *b = le_to_buffer(e); + struct buffer_tree *tree; __cache_inc_buffer(b); BUG_ON(test_bit(B_READING, &b->state)); @@ -2144,10 +2177,12 @@ int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c) wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); } - if (!test_bit(B_DIRTY, &b->state) && !test_bit(B_WRITING, &b->state)) - cache_mark(&c->cache, b, LIST_CLEAN); + tree = cache_get_tree(&c->cache, b->block); - cache_put_and_wake(c, b); + if (!test_bit(B_DIRTY, &b->state) && !test_bit(B_WRITING, &b->state)) + cache_mark(&c->cache, tree, b, LIST_CLEAN); + + cache_put_and_wake(c, tree, b); cond_resched(); } @@ -2215,17 +2250,18 @@ EXPORT_SYMBOL_GPL(dm_bufio_issue_discard); static void forget_buffer(struct dm_bufio_client *c, sector_t block) { + struct buffer_tree *tree = cache_get_tree(&c->cache, block); struct dm_buffer *b; - b = cache_get(&c->cache, block); + b = cache_get(&c->cache, tree, block); if (b) { if (likely(!smp_load_acquire(&b->state))) { - if (cache_remove(&c->cache, b)) + if (cache_remove(&c->cache, tree, b)) __free_buffer_wake(b); else - cache_put_and_wake(c, b); + cache_put_and_wake(c, tree, b); } else { - cache_put_and_wake(c, b); + cache_put_and_wake(c, tree, b); } } } From 118ba36e446c01e3cd34b3eedabf1d9436525e1d Mon Sep 17 
00:00:00 2001 From: Mikulas Patocka Date: Mon, 19 Jan 2026 15:06:02 +0100 Subject: [PATCH 27/35] dm-integrity: fix recalculation in bitmap mode There's a logic quirk in the handling of suspend in the bitmap mode: This is the sequence of calls if we are reloading a dm-integrity table: * dm_integrity_ctr reads a superblock with the flag SB_FLAG_DIRTY_BITMAP set. * dm_integrity_postsuspend initializes a journal and clears the flag SB_FLAG_DIRTY_BITMAP. * dm_integrity_resume sees the superblock with SB_FLAG_DIRTY_BITMAP set - thus it interprets the journal as if it were a bitmap. This quirk causes a recalculation problem if the user increases the size of the device in the bitmap mode. Fix this by reading a fresh copy of the superblock in dm_integrity_resume. This commit also fixes another logic quirk - the branch that sets bitmap bits if the device was extended should only be executed if the flag SB_FLAG_DIRTY_BITMAP is set. Signed-off-by: Mikulas Patocka Tested-by: Ondrej Kozina Fixes: 468dfca38b1a ("dm integrity: add a bitmap mode") Cc: stable@vger.kernel.org --- drivers/md/dm-integrity.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 380527f43b2a..a9c0157bf42f 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -3788,14 +3788,27 @@ static void dm_integrity_resume(struct dm_target *ti) struct dm_integrity_c *ic = ti->private; __u64 old_provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors); int r; + __le32 flags; DEBUG_print("resume\n"); ic->wrote_to_journal = false; + flags = ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING); + r = sync_rw_sb(ic, REQ_OP_READ); + if (r) + dm_integrity_io_error(ic, "reading superblock", r); + if ((ic->sb->flags & flags) != flags) { + ic->sb->flags |= flags; + r = sync_rw_sb(ic, REQ_OP_WRITE | REQ_FUA); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing superblock", r); + } + if (ic->provided_data_sectors !=
old_provided_data_sectors) { if (ic->provided_data_sectors > old_provided_data_sectors && ic->mode == 'B' && + ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP) && ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit) { rw_journal_sectors(ic, REQ_OP_READ, 0, ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); From 83c10e8dd43628d0bf86486616556cd749a3c310 Mon Sep 17 00:00:00 2001 From: Matt Whitlock Date: Sun, 18 Jan 2026 13:36:15 -0500 Subject: [PATCH 28/35] dm-unstripe: fix mapping bug when there are multiple targets in a table The "unstriped" device-mapper target incorrectly calculates the sector offset on the mapped device when the target's origin is not zero. Take for example this hypothetical concatenation of the members of a two-disk RAID0: linearized: 0 2097152 unstriped 2 128 0 /dev/md/raid0 0 linearized: 2097152 2097152 unstriped 2 128 1 /dev/md/raid0 0 The intent in this example is to create a single device named /dev/mapper/linearized that comprises all of the chunks of the first disk of the RAID0 set, followed by all of the chunks of the second disk of the RAID0 set. This fails because dm-unstripe.c's map_to_core function does its computations based on the sector number within the mapper device rather than the sector number within the target. The bug turns invisible when the target's origin is at sector zero of the mapper device, as is the common case. In the example above, however, what happens is that the first half of the mapper device gets mapped correctly to the first disk of the RAID0, but the second half of the mapper device gets mapped past the end of the RAID0 device, and accesses to any of those sectors return errors. 
Signed-off-by: Matt Whitlock Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Fixes: 18a5bf270532 ("dm: add unstriped target") --- drivers/md/dm-unstripe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-unstripe.c b/drivers/md/dm-unstripe.c index e8a9432057dc..17be48359564 100644 --- a/drivers/md/dm-unstripe.c +++ b/drivers/md/dm-unstripe.c @@ -117,7 +117,7 @@ static void unstripe_dtr(struct dm_target *ti) static sector_t map_to_core(struct dm_target *ti, struct bio *bio) { struct unstripe_c *uc = ti->private; - sector_t sector = bio->bi_iter.bi_sector; + sector_t sector = dm_target_offset(ti, bio->bi_iter.bi_sector); sector_t tmp_sector = sector; /* Shift us up to the right "row" on the stripe */ From 0f1e16b3a8634d540fb0c7a11ca13412d2902974 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 21 Jan 2026 14:02:20 +0100 Subject: [PATCH 29/35] dm-verity: fix section mismatch error The function "__init dm_verity_init" was calling "__exit dm_verity_verify_sig_exit" and this triggered section mismatch error. Fix this by dropping the "__exit" tag on dm_verity_verify_sig_exit. 
Signed-off-by: Mikulas Patocka Fixes: 033724b1c627A ("dm-verity: add dm-verity keyring") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202601210645.11u5Myme-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202601211041.pcTzwcdp-lkp@intel.com/ --- drivers/md/dm-verity-verify-sig.c | 2 +- drivers/md/dm-verity-verify-sig.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-verity-verify-sig.c b/drivers/md/dm-verity-verify-sig.c index 2a2abd9864c9..b2b55c41e2cb 100644 --- a/drivers/md/dm-verity-verify-sig.c +++ b/drivers/md/dm-verity-verify-sig.c @@ -192,7 +192,7 @@ int __init dm_verity_verify_sig_init(void) return 0; } -void __exit dm_verity_verify_sig_exit(void) +void dm_verity_verify_sig_exit(void) { key_revoke(dm_verity_keyring); key_put(dm_verity_keyring); diff --git a/drivers/md/dm-verity-verify-sig.h b/drivers/md/dm-verity-verify-sig.h index b0bb0d427244..90d6b9933f05 100644 --- a/drivers/md/dm-verity-verify-sig.h +++ b/drivers/md/dm-verity-verify-sig.h @@ -31,7 +31,7 @@ int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts); int __init dm_verity_verify_sig_init(void); -void __exit dm_verity_verify_sig_exit(void); +void dm_verity_verify_sig_exit(void); #else From d6d0e6b9d54532264761405a1ba8ea5bd293acb1 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Jan 2026 19:02:36 -0800 Subject: [PATCH 30/35] dm: fix excessive blk-crypto operations for invalid keys dm_exec_wrappedkey_op() passes through the derive_sw_secret, import_key, generate_key, and prepare_key blk-crypto operations to an underlying device. Currently, it calls the operation on every underlying device until one returns success. This logic is flawed when the operation is expected to fail, such as an invalid key being passed to derive_sw_secret. 
That can happen if userspace passes an invalid key to the FS_IOC_ADD_ENCRYPTION_KEY ioctl. When that happens on a device-mapper device that consists of many dm-linear targets, a lot of unnecessary key unwrapping requests get sent to the underlying key wrapping hardware. Fix this by considering the first device only. As already documented in the comment, it was already checked that all underlying devices support wrapped keys, so this should be fine. Fixes: e93912786e50 ("dm: pass through operations on wrapped inline crypto keys") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-table.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 0522cd700e0e..4b70872725d0 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1237,9 +1237,6 @@ static int dm_wrappedkey_op_callback(struct dm_target *ti, struct dm_dev *dev, bdev_get_queue(bdev)->crypto_profile; int err = -EOPNOTSUPP; - if (!args->err) - return 0; - switch (args->op) { case DERIVE_SW_SECRET: err = blk_crypto_derive_sw_secret( @@ -1266,9 +1263,7 @@ static int dm_wrappedkey_op_callback(struct dm_target *ti, struct dm_dev *dev, break; } args->err = err; - - /* Try another device in case this fails. */ - return 0; + return 1; /* No need to continue the iteration. */ } static int dm_exec_wrappedkey_op(struct blk_crypto_profile *profile, @@ -1294,14 +1289,13 @@ static int dm_exec_wrappedkey_op(struct blk_crypto_profile *profile, * declared on all underlying devices. Thus, all the underlying devices * should support all wrapped key operations and they should behave * identically, i.e. work with the same keys. So, just executing the - * operation on the first device on which it works suffices for now. + * operation on the first device suffices for now. 
*/ for (i = 0; i < t->num_targets; i++) { ti = dm_table_get_target(t, i); if (!ti->type->iterate_devices) continue; - ti->type->iterate_devices(ti, dm_wrappedkey_op_callback, args); - if (!args->err) + if (ti->type->iterate_devices(ti, dm_wrappedkey_op_callback, args) != 0) break; } out: From 2df8b310bcfe76827fd71092f58a2493ee6590b0 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 26 Jan 2026 15:36:22 +0100 Subject: [PATCH 31/35] dm: use bio_clone_blkg_association The origin bio carries blk-cgroup information which could be set from foreground(task_css(css) - wbc->wb->blkcg_css), so the blkcg won't control buffer io since commit ca522482e3eaf ("dm: pass NULL bdev to bio_alloc_clone"). The synchronous io is still under control by blkcg, because 'bio->bi_blkg' is set by io submitting task which has been added into 'cgroup.procs'. Fix it by using bio_clone_blkg_association when submitting a cloned bio. Link: https://bugzilla.kernel.org/show_bug.cgi?id=220985 Fixes: ca522482e3eaf ("dm: pass NULL bdev to bio_alloc_clone") Reported-by: Zhihao Cheng Signed-off-by: Mikulas Patocka Tested-by: Zhihao Cheng --- drivers/md/dm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ea2c43cddde1..e178fe19973e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1364,6 +1364,8 @@ void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone) if (!tgt_clone) tgt_clone = clone; + bio_clone_blkg_association(tgt_clone, io->orig_bio); + /* * Account io->origin_bio to DM dev on behalf of target * that took ownership of IO with DM_MAPIO_SUBMITTED. From ec8534021a71ebdea2ba565a2a147f2464e36356 Mon Sep 17 00:00:00 2001 From: Matthew Sakai Date: Tue, 27 Jan 2026 10:50:43 -0500 Subject: [PATCH 32/35] dm vdo encodings: clean up header and version functions Make several header functions static. Also remove vdo_is_upgradable_version, which is unused. 
Signed-off-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/encodings.c | 11 ++++++----- drivers/md/dm-vdo/encodings.h | 25 ------------------------- 2 files changed, 6 insertions(+), 30 deletions(-) diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c index dd59691be840..bd60f4b3a0d0 100644 --- a/drivers/md/dm-vdo/encodings.c +++ b/drivers/md/dm-vdo/encodings.c @@ -172,9 +172,9 @@ static int __must_check validate_version(struct version_number expected_version, * VDO_INCORRECT_COMPONENT if the component ids don't match, * VDO_UNSUPPORTED_VERSION if the versions or sizes don't match. */ -int vdo_validate_header(const struct header *expected_header, - const struct header *actual_header, bool exact_size, - const char *name) +static int vdo_validate_header(const struct header *expected_header, + const struct header *actual_header, + bool exact_size, const char *name) { int result; @@ -210,7 +210,8 @@ static void encode_version_number(u8 *buffer, size_t *offset, *offset += sizeof(packed); } -void vdo_encode_header(u8 *buffer, size_t *offset, const struct header *header) +static void vdo_encode_header(u8 *buffer, size_t *offset, + const struct header *header) { struct packed_header packed = vdo_pack_header(header); @@ -228,7 +229,7 @@ static void decode_version_number(u8 *buffer, size_t *offset, *version = vdo_unpack_version_number(packed); } -void vdo_decode_header(u8 *buffer, size_t *offset, struct header *header) +static void vdo_decode_header(u8 *buffer, size_t *offset, struct header *header) { struct packed_header packed; diff --git a/drivers/md/dm-vdo/encodings.h b/drivers/md/dm-vdo/encodings.h index e5ff2b0aaa79..87b7d2f3b545 100644 --- a/drivers/md/dm-vdo/encodings.h +++ b/drivers/md/dm-vdo/encodings.h @@ -707,31 +707,6 @@ static inline bool vdo_are_same_version(struct version_number version_a, (version_a.minor_version == version_b.minor_version)); } -/** - * vdo_is_upgradable_version() - Check whether an actual version 
is upgradable to an expected - * version. - * @expected_version: The expected version. - * @actual_version: The version being validated. - * - * An actual version is upgradable if its major number is expected but its minor number differs, - * and the expected version's minor number is greater than the actual version's minor number. - * - * Return: true if the actual version is upgradable. - */ -static inline bool vdo_is_upgradable_version(struct version_number expected_version, - struct version_number actual_version) -{ - return ((expected_version.major_version == actual_version.major_version) && - (expected_version.minor_version > actual_version.minor_version)); -} - -int __must_check vdo_validate_header(const struct header *expected_header, - const struct header *actual_header, bool exact_size, - const char *component_name); - -void vdo_encode_header(u8 *buffer, size_t *offset, const struct header *header); -void vdo_decode_header(u8 *buffer, size_t *offset, struct header *header); - /** * vdo_pack_version_number() - Convert a version_number to its packed on-disk representation. * @version: The version number to convert. From 787bd63ee661b0148ce8e1fde92b7afddd85c446 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Tue, 27 Jan 2026 19:12:22 -0500 Subject: [PATCH 33/35] dm mpath: Add missing dm_put_device when failing to get scsi dh name When commit fd81bc5cca8f ("scsi: device_handler: Return error pointer in scsi_dh_attached_handler_name()") added code to fail parsing the path if scsi_dh_attached_handler_name() failed with -ENOMEM, it didn't clean up the reference to the path device that had just been taken. Fix this, and streamline the error paths of parse_path() a little.
Fixes: fd81bc5cca8f ("scsi: device_handler: Return error pointer in scsi_dh_attached_handler_name()") Cc: stable@vger.kernel.org Signed-off-by: Benjamin Marzinski Signed-off-by: Mikulas Patocka --- drivers/md/dm-mpath.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index c748e7f952c4..6f9d86f4b912 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -960,27 +960,27 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps attached_handler_name = NULL; } else { r = PTR_ERR(attached_handler_name); - goto bad; + ti->error = "error allocating handler name"; + goto bad_put_device; } } if (attached_handler_name || m->hw_handler_name) { INIT_DELAYED_WORK(&p->activate_path, activate_path_work); r = setup_scsi_dh(p->path.dev->bdev, m, &attached_handler_name, &ti->error); kfree(attached_handler_name); - if (r) { - dm_put_device(ti, p->path.dev); - goto bad; - } + if (r) + goto bad_put_device; } r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error); - if (r) { - dm_put_device(ti, p->path.dev); - goto bad; - } + if (r) + goto bad_put_device; return p; - bad: + +bad_put_device: + dm_put_device(ti, p->path.dev); +bad: free_pgpath(p); return ERR_PTR(r); } From 4550a71b179be9e2a17015c018b231a2daca2dd1 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Tue, 27 Jan 2026 19:12:23 -0500 Subject: [PATCH 34/35] Revert "dm: fix a race condition in retrieve_deps" This reverts commit f6007dce0cd35d634d9be91ef3515a6385dcee16. Commit f6007dce0cd3 ("dm: fix a race condition in retrieve_deps") was added to fix a race between retrieving the list of dm table devices and multipath_message() modifying the list of table devices. But Commit a48f6b82c5c4 ("dm mpath: don't call dm_get_device in multipath_message") removed the call to dm_get_device() from multipath_message(). 
After that commit, the only calls to dm_get_device() and dm_put_device() are in target constructors and destructors, so the race with retrieve_deps() is no longer possible. Suggested-by: Martin Wilck Signed-off-by: Benjamin Marzinski Signed-off-by: Mikulas Patocka --- drivers/md/dm-core.h | 1 - drivers/md/dm-ioctl.c | 7 +------ drivers/md/dm-table.c | 32 ++++++++------------------------ 3 files changed, 9 insertions(+), 31 deletions(-) diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 1cda8618d74d..a3b8ad6e1c42 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -215,7 +215,6 @@ struct dm_table { /* a list of devices used by this table */ struct list_head devices; - struct rw_semaphore devices_lock; /* events get handed up using this callback */ void (*event_fn)(void *data); diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 4165fef4c170..fd4bf8e1d73e 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1648,8 +1648,6 @@ static void retrieve_deps(struct dm_table *table, struct dm_dev_internal *dd; struct dm_target_deps *deps; - down_read(&table->devices_lock); - deps = get_result_buffer(param, param_size, &len); /* @@ -1664,7 +1662,7 @@ static void retrieve_deps(struct dm_table *table, needed = struct_size(deps, dev, count); if (len < needed) { param->flags |= DM_BUFFER_FULL_FLAG; - goto out; + return; } /* @@ -1676,9 +1674,6 @@ static void retrieve_deps(struct dm_table *table, deps->dev[count++] = huge_encode_dev(dd->dm_dev->bdev->bd_dev); param->data_size = param->data_start + needed; - -out: - up_read(&table->devices_lock); } static int table_deps(struct file *filp, struct dm_ioctl *param, size_t param_size) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 4b70872725d0..7be1d8dc8bdd 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -139,7 +139,6 @@ int dm_table_create(struct dm_table **result, blk_mode_t mode, return -ENOMEM; INIT_LIST_HEAD(&t->devices); - 
init_rwsem(&t->devices_lock); if (!num_targets) num_targets = KEYS_PER_NODE; @@ -380,20 +379,16 @@ int dm_get_device(struct dm_target *ti, const char *path, blk_mode_t mode, if (dev == disk_devt(t->md->disk)) return -EINVAL; - down_write(&t->devices_lock); - dd = find_device(&t->devices, dev); if (!dd) { dd = kmalloc(sizeof(*dd), GFP_KERNEL); - if (!dd) { - r = -ENOMEM; - goto unlock_ret_r; - } + if (!dd) + return -ENOMEM; r = dm_get_table_device(t->md, dev, mode, &dd->dm_dev); if (r) { kfree(dd); - goto unlock_ret_r; + return r; } refcount_set(&dd->count, 1); @@ -403,17 +398,12 @@ int dm_get_device(struct dm_target *ti, const char *path, blk_mode_t mode, } else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) { r = upgrade_mode(dd, mode, t->md); if (r) - goto unlock_ret_r; + return r; } refcount_inc(&dd->count); out: - up_write(&t->devices_lock); *result = dd->dm_dev; return 0; - -unlock_ret_r: - up_write(&t->devices_lock); - return r; } EXPORT_SYMBOL(dm_get_device); @@ -464,12 +454,9 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, void dm_put_device(struct dm_target *ti, struct dm_dev *d) { int found = 0; - struct dm_table *t = ti->table; - struct list_head *devices = &t->devices; + struct list_head *devices = &ti->table->devices; struct dm_dev_internal *dd; - down_write(&t->devices_lock); - list_for_each_entry(dd, devices, list) { if (dd->dm_dev == d) { found = 1; @@ -478,17 +465,14 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d) } if (!found) { DMERR("%s: device %s not in table devices list", - dm_device_name(t->md), d->name); - goto unlock_ret; + dm_device_name(ti->table->md), d->name); + return; } if (refcount_dec_and_test(&dd->count)) { - dm_put_table_device(t->md, d); + dm_put_table_device(ti->table->md, d); list_del(&dd->list); kfree(dd); } - -unlock_ret: - up_write(&t->devices_lock); } EXPORT_SYMBOL(dm_put_device); From 218b16992a37ea97b9e09b7659a25a864fb9976f Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski 
Date: Tue, 27 Jan 2026 19:12:24 -0500 Subject: [PATCH 35/35] dm mpath: make pg_init_delay_msecs settable "pg_init_delay_msecs X" can be passed as a feature in the multipath table and is used to set m->pg_init_delay_msecs in parse_features(). However, alloc_multipath_stage2(), which is called after parse_features(), resets m->pg_init_delay_msecs to its default value. Instead, set m->pg_init_delay_msecs in alloc_multipath(), which is called before parse_features(), to avoid overwriting a value passed in by the table. Signed-off-by: Benjamin Marzinski Cc: stable@vger.kernel.org --- drivers/md/dm-mpath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 6f9d86f4b912..de03f9b06584 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -225,6 +225,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti) mutex_init(&m->work_mutex); m->queue_mode = DM_TYPE_NONE; + m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; m->ti = ti; ti->private = m; @@ -251,7 +252,6 @@ static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m) set_bit(MPATHF_QUEUE_IO, &m->flags); atomic_set(&m->pg_init_in_progress, 0); atomic_set(&m->pg_init_count, 0); - m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; init_waitqueue_head(&m->pg_init_wait); init_waitqueue_head(&m->probe_wait);