From 2d3447261031503b181dacc549fe65ffe2d93d65 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 21 Aug 2024 15:53:18 -0400 Subject: [PATCH 1/4] btrfs: run delayed iputs when flushing delalloc We have transient failures with btrfs/301, specifically in the part where we do for i in $(seq 0 10); do write 50m to file rm -f file done Sometimes this will result in a transient quota error, and it's because sometimes we start writeback on the file which results in a delayed iput, and thus the rm doesn't actually clean the file up. When we're flushing the quota space we need to run the delayed iputs to make sure all the unlinks that we think have completed have actually completed. This removes the small window where we could fail to find enough space in our quota. CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5d57a285d59b..7d6f5d9420ec 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -4185,6 +4185,8 @@ static int try_flush_qgroup(struct btrfs_root *root) return 0; } + btrfs_run_delayed_iputs(root->fs_info); + btrfs_wait_on_delayed_iputs(root->fs_info); ret = btrfs_start_delalloc_snapshot(root, true); if (ret < 0) goto out; From 33f58a0480bb9e2479ccdf556f61363723a50d47 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 21 Aug 2024 01:19:57 +0200 Subject: [PATCH 2/4] btrfs: initialize last_extent_end to fix -Wmaybe-uninitialized warning in extent_fiemap() There's a warning (probably on some older compiler version): fs/btrfs/fiemap.c: warning: 'last_extent_end' may be used uninitialized in this function [-Wmaybe-uninitialized]: => 822:19 Initialize the variable to 0 although it's not necessary as it's either properly set or not used after an error. The called function is in the same file so this is a false alert but we want to fix all -Wmaybe-uninitialized reports. Link: https://lore.kernel.org/all/20240819070639.2558629-1-geert@linux-m68k.org/ Reported-by: Geert Uytterhoeven Signed-off-by: David Sterba --- fs/btrfs/fiemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c index 8f95f3e44e99..df7f09f3b02e 100644 --- a/fs/btrfs/fiemap.c +++ b/fs/btrfs/fiemap.c @@ -637,7 +637,7 @@ static int extent_fiemap(struct btrfs_inode *inode, struct btrfs_path *path; struct fiemap_cache cache = { 0 }; struct btrfs_backref_share_check_ctx *backref_ctx; - u64 last_extent_end; + u64 last_extent_end = 0; u64 prev_extent_end; u64 range_start; u64 range_end; From 10d9d8c3512f16cad47b2ff81ec6fc4b27d8ee10 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 17 Aug 2024 18:34:30 +0930 Subject: [PATCH 3/4] btrfs: fix a use-after-free when hitting errors inside btrfs_submit_chunk() [BUG] There is an internal report that KASAN is reporting use-after-free, with the following backtrace: BUG: KASAN: slab-use-after-free in btrfs_check_read_bio+0xa68/0xb70 [btrfs] Read of size 4 at addr ffff8881117cec28 by task kworker/u16:2/45 CPU: 1 UID: 0 PID: 45 Comm: kworker/u16:2 Not tainted 6.11.0-rc2-next-20240805-default+ #76 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-3-gd478f380-rebuilt.opensuse.org 04/01/2014 Workqueue: btrfs-endio btrfs_end_bio_work [btrfs] Call Trace: dump_stack_lvl+0x61/0x80 print_address_description.constprop.0+0x5e/0x2f0 print_report+0x118/0x216 kasan_report+0x11d/0x1f0 btrfs_check_read_bio+0xa68/0xb70 [btrfs] process_one_work+0xce0/0x12a0 worker_thread+0x717/0x1250 kthread+0x2e3/0x3c0 ret_from_fork+0x2d/0x70 ret_from_fork_asm+0x11/0x20 Allocated by task 20917: kasan_save_stack+0x37/0x60 kasan_save_track+0x10/0x30 __kasan_slab_alloc+0x7d/0x80 kmem_cache_alloc_noprof+0x16e/0x3e0 mempool_alloc_noprof+0x12e/0x310 bio_alloc_bioset+0x3f0/0x7a0 btrfs_bio_alloc+0x2e/0x50 [btrfs] submit_extent_page+0x4d1/0xdb0 [btrfs] btrfs_do_readpage+0x8b4/0x12a0 [btrfs] btrfs_readahead+0x29a/0x430 [btrfs] read_pages+0x1a7/0xc60 page_cache_ra_unbounded+0x2ad/0x560 filemap_get_pages+0x629/0xa20 filemap_read+0x335/0xbf0 vfs_read+0x790/0xcb0 ksys_read+0xfd/0x1d0 do_syscall_64+0x6d/0x140 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Freed by task 20917: kasan_save_stack+0x37/0x60 kasan_save_track+0x10/0x30 kasan_save_free_info+0x37/0x50 __kasan_slab_free+0x4b/0x60 kmem_cache_free+0x214/0x5d0 bio_free+0xed/0x180 end_bbio_data_read+0x1cc/0x580 [btrfs] btrfs_submit_chunk+0x98d/0x1880 [btrfs] btrfs_submit_bio+0x33/0x70 [btrfs] submit_one_bio+0xd4/0x130 [btrfs] submit_extent_page+0x3ea/0xdb0 [btrfs] btrfs_do_readpage+0x8b4/0x12a0 [btrfs] btrfs_readahead+0x29a/0x430 [btrfs] read_pages+0x1a7/0xc60 page_cache_ra_unbounded+0x2ad/0x560 filemap_get_pages+0x629/0xa20 filemap_read+0x335/0xbf0 vfs_read+0x790/0xcb0 ksys_read+0xfd/0x1d0 do_syscall_64+0x6d/0x140 entry_SYSCALL_64_after_hwframe+0x4b/0x53 [CAUSE] Although I cannot reproduce the error, the report itself is good enough to pin down the cause. The call trace is the regular endio workqueue context, but the free-by-task trace is showing that during btrfs_submit_chunk() we already hit a critical error, and is calling btrfs_bio_end_io() to error out. And the original endio function called bio_put() to free the whole bio. This means a double freeing thus causing use-after-free, e.g.: 1. Enter btrfs_submit_bio() with a read bio The read bio length is 128K, crossing two 64K stripes. 2. The first run of btrfs_submit_chunk() 2.1 Call btrfs_map_block(), which returns 64K 2.2 Call btrfs_split_bio() Now there are two bios, one referring to the first 64K, the other referring to the second 64K. 2.3 The first half is submitted. 3. The second run of btrfs_submit_chunk() 3.1 Call btrfs_map_block(), which by somehow failed Now we call btrfs_bio_end_io() to handle the error 3.2 btrfs_bio_end_io() calls the original endio function Which is end_bbio_data_read(), and it calls bio_put() for the original bio. Now the original bio is freed. 4. The submitted first 64K bio finished Now we call into btrfs_check_read_bio() and tries to advance the bio iter. But since the original bio (thus its iter) is already freed, we trigger the above use-after free. And even if the memory is not poisoned/corrupted, we will later call the original endio function, causing a double freeing. [FIX] Instead of calling btrfs_bio_end_io(), call btrfs_orig_bbio_end_io(), which has the extra check on split bios and do the proper refcounting for cloned bios. Furthermore there is already one extra btrfs_cleanup_bio() call, but that is duplicated to btrfs_orig_bbio_end_io() call, so remove that label completely. Reported-by: David Sterba Fixes: 852eee62d31a ("btrfs: allow btrfs_submit_bio to split bios") CC: stable@vger.kernel.org # 6.6+ Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index f04d93109960..b4e31ae17cd9 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -668,7 +668,6 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = bbio->fs_info; - struct btrfs_bio *orig_bbio = bbio; struct bio *bio = &bbio->bio; u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 length = bio->bi_iter.bi_size; @@ -706,7 +705,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) bbio->saved_iter = bio->bi_iter; ret = btrfs_lookup_bio_sums(bbio); if (ret) - goto fail_put_bio; + goto fail; } if (btrfs_op(bio) == BTRFS_MAP_WRITE) { @@ -740,13 +739,13 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) ret = btrfs_bio_csum(bbio); if (ret) - goto fail_put_bio; + goto fail; } else if (use_append || (btrfs_is_zoned(fs_info) && inode && inode->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_alloc_dummy_sum(bbio); if (ret) - goto fail_put_bio; + goto fail; } } @@ -754,12 +753,23 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) done: return map_length == length; -fail_put_bio: - if (map_length < length) - btrfs_cleanup_bio(bbio); fail: btrfs_bio_counter_dec(fs_info); - btrfs_bio_end_io(orig_bbio, ret); + /* + * We have split the original bbio, now we have to end both the current + * @bbio and remaining one, as the remaining one will never be submitted. + */ + if (map_length < length) { + struct btrfs_bio *remaining = bbio->private; + + ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset); + ASSERT(remaining); + + remaining->bio.bi_status = ret; + btrfs_orig_bbio_end_io(remaining); + } + bbio->bio.bi_status = ret; + btrfs_orig_bbio_end_io(bbio); /* Do not submit another chunk */ return true; } From ecb54277cb63c273e8d74272e5b9bfd80c2185d9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 27 Aug 2024 11:30:10 +0100 Subject: [PATCH 4/4] btrfs: fix uninitialized return value from btrfs_reclaim_sweep() The return variable 'ret' at btrfs_reclaim_sweep() is never assigned if none of the space infos is reclaimable (for example if periodic reclaim is disabled, which is the default), so we return an undefined value. This can be fixed my making btrfs_reclaim_sweep() not return any value as well as do_reclaim_sweep() because: 1) do_reclaim_sweep() always returns 0, so we can make it return void; 2) The only caller of btrfs_reclaim_sweep() (btrfs_reclaim_bgs()) doesn't care about its return value, and in its context there's nothing to do about any errors anyway. Therefore remove the return value from btrfs_reclaim_sweep() and do_reclaim_sweep(). Fixes: e4ca3932ae90 ("btrfs: periodic block_group reclaim") Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 17 +++++------------ fs/btrfs/space-info.h | 2 +- 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 68e14fd48638..c691784b4660 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1985,8 +1985,8 @@ static bool is_reclaim_urgent(struct btrfs_space_info *space_info) return unalloc < data_chunk_size; } -static int do_reclaim_sweep(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, int raid) +static void do_reclaim_sweep(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, int raid) { struct btrfs_block_group *bg; int thresh_pct; @@ -2031,7 +2031,6 @@ static int do_reclaim_sweep(struct btrfs_fs_info *fs_info, } up_read(&space_info->groups_sem); - return 0; } void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes) @@ -2074,21 +2073,15 @@ bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info) return ret; } -int btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info) +void btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info) { - int ret; int raid; struct btrfs_space_info *space_info; list_for_each_entry(space_info, &fs_info->space_info, list) { if (!btrfs_should_periodic_reclaim(space_info)) continue; - for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) { - ret = do_reclaim_sweep(fs_info, space_info, raid); - if (ret) - return ret; - } + for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) + do_reclaim_sweep(fs_info, space_info, raid); } - - return ret; } diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 88b44221ce97..5602026c5e14 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -294,6 +294,6 @@ void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s6 void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready); bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info); int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info); -int btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info); +void btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info); #endif /* BTRFS_SPACE_INFO_H */