From 2760904d895279f87196f0fa9ec570c79fe6a2e4 Mon Sep 17 00:00:00 2001 From: Li Lingfeng Date: Thu, 1 Jun 2023 14:14:23 +0800 Subject: [PATCH 1/4] dm: don't lock fs when the map is NULL during suspend or resume As described in commit 38d11da522aa ("dm: don't lock fs when the map is NULL in process of resume"), a deadlock may be triggered between do_resume() and do_mount(). This commit preserves the fix from commit 38d11da522aa but moves it to where it also serves to fix a similar deadlock between do_suspend() and do_mount(). It does so, if the active map is NULL, by clearing DM_SUSPEND_LOCKFS_FLAG in dm_suspend() which is called by both do_suspend() and do_resume(). Fixes: 38d11da522aa ("dm: don't lock fs when the map is NULL in process of resume") Signed-off-by: Li Lingfeng Signed-off-by: Mike Snitzer --- drivers/md/dm-ioctl.c | 5 +---- drivers/md/dm.c | 4 ++++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index cc77cf3d4109..7d5c9c582ed2 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1168,13 +1168,10 @@ static int do_resume(struct dm_ioctl *param) /* Do we need to load a new map ? */ if (new_map) { sector_t old_size, new_size; - int srcu_idx; /* Suspend if it isn't already suspended */ - old_map = dm_get_live_table(md, &srcu_idx); - if ((param->flags & DM_SKIP_LOCKFS_FLAG) || !old_map) + if (param->flags & DM_SKIP_LOCKFS_FLAG) suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; - dm_put_live_table(md, srcu_idx); if (param->flags & DM_NOFLUSH_FLAG) suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; if (!dm_suspended_md(md)) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 3b694ba3a106..8488547fc00d 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2808,6 +2808,10 @@ int dm_suspend(struct mapped_device *md, unsigned int suspend_flags) } map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); + if (!map) { + /* avoid deadlock with fs/namespace.c:do_mount() */ + suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; + } r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED); if (r) From cb65b282c9640c27d3129e2e04b711ce1b352838 Mon Sep 17 00:00:00 2001 From: Li Lingfeng Date: Tue, 6 Jun 2023 20:20:24 +0800 Subject: [PATCH 2/4] dm thin metadata: check fail_io before using data_sm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Must check pmd->fail_io before using pmd->data_sm since pmd->data_sm may be destroyed by other processes. P1(kworker) P2(message) do_worker process_prepared process_prepared_discard_passdown_pt2 dm_pool_dec_data_range pool_message commit dm_pool_commit_metadata ↓ // commit failed metadata_operation_failed abort_transaction dm_pool_abort_metadata __open_or_format_metadata ↓ dm_sm_disk_open ↓ // open failed // pmd->data_sm is NULL dm_sm_dec_blocks ↓ // try to access pmd->data_sm --> UAF As shown above, if dm_pool_commit_metadata() and dm_pool_abort_metadata() fail in pool_message process, kworker may trigger UAF. Fixes: be500ed721a6 ("dm space maps: improve performance with inc/dec on ranges of blocks") Cc: stable@vger.kernel.org Signed-off-by: Li Lingfeng Signed-off-by: Mike Snitzer --- drivers/md/dm-thin-metadata.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 9f5cb52c5763..b9461faa9f0d 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1756,13 +1756,15 @@ int dm_thin_remove_range(struct dm_thin_device *td, int dm_pool_block_is_shared(struct dm_pool_metadata *pmd, dm_block_t b, bool *result) { - int r; + int r = -EINVAL; uint32_t ref_count; down_read(&pmd->root_lock); - r = dm_sm_get_count(pmd->data_sm, b, &ref_count); - if (!r) - *result = (ref_count > 1); + if (!pmd->fail_io) { + r = dm_sm_get_count(pmd->data_sm, b, &ref_count); + if (!r) + *result = (ref_count > 1); + } up_read(&pmd->root_lock); return r; @@ -1770,10 +1772,11 @@ int dm_pool_block_is_shared(struct dm_pool_metadata *pmd, dm_block_t b, bool *re int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e) { - int r = 0; + int r = -EINVAL; pmd_write_lock(pmd); - r = dm_sm_inc_blocks(pmd->data_sm, b, e); + if (!pmd->fail_io) + r = dm_sm_inc_blocks(pmd->data_sm, b, e); pmd_write_unlock(pmd); return r; @@ -1781,10 +1784,11 @@ int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_ int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e) { - int r = 0; + int r = -EINVAL; pmd_write_lock(pmd); - r = dm_sm_dec_blocks(pmd->data_sm, b, e); + if (!pmd->fail_io) + r = dm_sm_dec_blocks(pmd->data_sm, b, e); pmd_write_unlock(pmd); return r; From 722d90822321497e2837cfc9000202e256e6b32f Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 13 Jun 2023 20:05:34 -0400 Subject: [PATCH 3/4] dm thin: fix issue_discard to pass GFP_NOIO to __blkdev_issue_discard issue_discard() passes GFP_NOWAIT to __blkdev_issue_discard() despite its code assuming bio_alloc() always succeeds. Commit 3dba53a958a75 ("dm thin: use __blkdev_issue_discard for async discard support") clearly shows where things went bad: Before commit 3dba53a958a75, dm-thin.c's open-coded __blkdev_issue_discard_async() properly handled using GFP_NOWAIT. Unfortunately __blkdev_issue_discard() doesn't and it was missed during review. Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 2b13c949bd72..39410bf186cf 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -401,8 +401,7 @@ static int issue_discard(struct discard_op *op, dm_block_t data_b, dm_block_t da sector_t s = block_to_sectors(tc->pool, data_b); sector_t len = block_to_sectors(tc->pool, data_e - data_b); - return __blkdev_issue_discard(tc->pool_dev->bdev, s, len, GFP_NOWAIT, - &op->bio); + return __blkdev_issue_discard(tc->pool_dev->bdev, s, len, GFP_NOIO, &op->bio); } static void end_discard(struct discard_op *op, int r) From be04c14a1bd262a49e5764e5cf864259b7e740fd Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 14 Jun 2023 21:47:46 -0400 Subject: [PATCH 4/4] dm: use op specific max_sectors when splitting abnormal io Split abnormal IO in terms of the corresponding operation specific max_sectors (max_discard_sectors, max_secure_erase_sectors or max_write_zeroes_sectors). This fixes a significant dm-thinp discard performance regression that was introduced with commit e2dd8aca2d76 ("dm bio prison v1: improve concurrent IO performance"). Relative to discard: max_discard_sectors is used instead of max_sectors; which fixes excessive discard splitting (e.g. max_sectors=128K vs max_discard_sectors=64M). Tested by discarding an 1 Petabyte dm-thin device: lvcreate -V 1125899906842624B -T test/pool -n thin time blkdiscard /dev/test/thin Before this fix (splitting discards every 128K): ~116m After this fix (splitting discards every 64M) : 0m33.460s Reported-by: Zorro Lang Fixes: 06961c487a33 ("dm: split discards further if target sets max_discard_granularity") Requires: 13f6facf3fae ("dm: allow targets to require splitting WRITE_ZEROES and SECURE_ERASE") Fixes: e2dd8aca2d76 ("dm bio prison v1: improve concurrent IO performance") Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8488547fc00d..fffb0cbe2ac8 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1172,7 +1172,8 @@ static inline sector_t max_io_len_target_boundary(struct dm_target *ti, } static sector_t __max_io_len(struct dm_target *ti, sector_t sector, - unsigned int max_granularity) + unsigned int max_granularity, + unsigned int max_sectors) { sector_t target_offset = dm_target_offset(ti, sector); sector_t len = max_io_len_target_boundary(ti, target_offset); @@ -1186,13 +1187,13 @@ static sector_t __max_io_len(struct dm_target *ti, sector_t sector, if (!max_granularity) return len; return min_t(sector_t, len, - min(queue_max_sectors(ti->table->md->queue), + min(max_sectors ? : queue_max_sectors(ti->table->md->queue), blk_chunk_sectors_left(target_offset, max_granularity))); } static inline sector_t max_io_len(struct dm_target *ti, sector_t sector) { - return __max_io_len(ti, sector, ti->max_io_len); + return __max_io_len(ti, sector, ti->max_io_len, 0); } int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) @@ -1581,12 +1582,13 @@ static void __send_empty_flush(struct clone_info *ci) static void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti, unsigned int num_bios, - unsigned int max_granularity) + unsigned int max_granularity, + unsigned int max_sectors) { unsigned int len, bios; len = min_t(sector_t, ci->sector_count, - __max_io_len(ti, ci->sector, max_granularity)); + __max_io_len(ti, ci->sector, max_granularity, max_sectors)); atomic_add(num_bios, &ci->io->io_count); bios = __send_duplicate_bios(ci, ti, num_bios, &len); @@ -1623,23 +1625,27 @@ static blk_status_t __process_abnormal_io(struct clone_info *ci, { unsigned int num_bios = 0; unsigned int max_granularity = 0; + unsigned int max_sectors = 0; struct queue_limits *limits = dm_get_queue_limits(ti->table->md); switch (bio_op(ci->bio)) { case REQ_OP_DISCARD: num_bios = ti->num_discard_bios; + max_sectors = limits->max_discard_sectors; if (ti->max_discard_granularity) - max_granularity = limits->max_discard_sectors; + max_granularity = max_sectors; break; case REQ_OP_SECURE_ERASE: num_bios = ti->num_secure_erase_bios; + max_sectors = limits->max_secure_erase_sectors; if (ti->max_secure_erase_granularity) - max_granularity = limits->max_secure_erase_sectors; + max_granularity = max_sectors; break; case REQ_OP_WRITE_ZEROES: num_bios = ti->num_write_zeroes_bios; + max_sectors = limits->max_write_zeroes_sectors; if (ti->max_write_zeroes_granularity) - max_granularity = limits->max_write_zeroes_sectors; + max_granularity = max_sectors; break; default: break; @@ -1654,7 +1660,8 @@ static blk_status_t __process_abnormal_io(struct clone_info *ci, if (unlikely(!num_bios)) return BLK_STS_NOTSUPP; - __send_changing_extent_only(ci, ti, num_bios, max_granularity); + __send_changing_extent_only(ci, ti, num_bios, + max_granularity, max_sectors); return BLK_STS_OK; }