From e262ecf61b01462c8f2cb1c65fdb70dbfa03db52 Mon Sep 17 00:00:00 2001 From: jiahao Date: Fri, 19 Feb 2021 20:46:32 +0800 Subject: [PATCH 01/58] f2fs: fix a spacing coding style Add a space before the plus. Signed-off-by: jiahao Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c7fc8a33616f..762c035d744a 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1456,7 +1456,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) orphan_blocks); if (__remain_node_summaries(cpc->reason)) - ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+ + ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS + cp_payload_blks + data_sum_blocks + orphan_blocks + NR_CURSEG_NODE_TYPE); else From c6acf57aeb599ecb12febd45b92a3323eb5c171e Mon Sep 17 00:00:00 2001 From: xuyehan Date: Tue, 23 Feb 2021 09:31:43 +0800 Subject: [PATCH 02/58] f2fs: fix a spelling error Delete the letter 'e' before 'number' Signed-off-by: xuyehan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index cbeac1bebe2f..9fa5a528cc23 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -276,7 +276,7 @@ Date April 2019 Contact: "Daniel Rosenberg" Description: If checkpoint=disable, it displays the number of blocks that are unusable. - If checkpoint=enable it displays the enumber of blocks that + If checkpoint=enable it displays the number of blocks that would be unusable if checkpoint=disable were to be set. 
What: /sys/fs/f2fs//encoding From ba25abde923f2e04fd9a9243182b89573ee82b82 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 20 Feb 2021 17:35:40 +0800 Subject: [PATCH 03/58] f2fs: fix to allow migrating fully valid segment F2FS_IOC_FLUSH_DEVICE/F2FS_IOC_RESIZE_FS needs to migrate all blocks of target segment to other place, no matter the segment has partially or fully valid blocks. However, after commit 803e74be04b3 ("f2fs: stop GC when the victim becomes fully valid"), we may skip migration due to target segment is fully valid, result in failing the ioctl interface, fix this. Fixes: 803e74be04b3 ("f2fs: stop GC when the victim becomes fully valid") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 9 +++++---- fs/f2fs/gc.c | 21 ++++++++++++--------- fs/f2fs/segment.c | 2 +- fs/f2fs/super.c | 2 +- 5 files changed, 20 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 506c801880f3..24565f38afd6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3546,7 +3546,7 @@ void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); int f2fs_start_gc_thread(struct f2fs_sb_info *sbi); void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi); block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode); -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, bool force, unsigned int segno); void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 471a6ff0c937..9aa258baac55 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1656,7 +1656,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (has_not_enough_free_secs(sbi, 0, GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { down_write(&sbi->gc_lock); - err = f2fs_gc(sbi, true, false, NULL_SEGNO); + err = f2fs_gc(sbi, true, false, false, 
NULL_SEGNO); if (err && err != -ENODATA && err != -EAGAIN) goto out_err; } @@ -2487,7 +2487,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, sync, true, NULL_SEGNO); + ret = f2fs_gc(sbi, sync, true, false, NULL_SEGNO); out: mnt_drop_write_file(filp); return ret; @@ -2523,7 +2523,8 @@ static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range) down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, range->sync, true, GET_SEGNO(sbi, range->start)); + ret = f2fs_gc(sbi, range->sync, true, false, + GET_SEGNO(sbi, range->start)); if (ret) { if (ret == -EBUSY) ret = -EAGAIN; @@ -2976,7 +2977,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) sm->last_victim[GC_CB] = end_segno + 1; sm->last_victim[GC_GREEDY] = end_segno + 1; sm->last_victim[ALLOC_NEXT] = end_segno + 1; - ret = f2fs_gc(sbi, true, true, start_segno); + ret = f2fs_gc(sbi, true, true, true, start_segno); if (ret == -EAGAIN) ret = 0; else if (ret < 0) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 39330ad3c44e..b3af76340026 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -112,7 +112,7 @@ static int gc_thread_func(void *data) sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC; /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, sync_mode, true, NULL_SEGNO)) + if (f2fs_gc(sbi, sync_mode, true, false, NULL_SEGNO)) wait_ms = gc_th->no_gc_sleep_time; trace_f2fs_background_gc(sbi->sb, wait_ms, @@ -1354,7 +1354,8 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, * the victim data block is ignored. 
*/ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, - struct gc_inode_list *gc_list, unsigned int segno, int gc_type) + struct gc_inode_list *gc_list, unsigned int segno, int gc_type, + bool force_migrate) { struct super_block *sb = sbi->sb; struct f2fs_summary *entry; @@ -1383,8 +1384,8 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, * race condition along with SSR block allocation. */ if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) || - get_valid_blocks(sbi, segno, true) == - BLKS_PER_SEC(sbi)) + (!force_migrate && get_valid_blocks(sbi, segno, true) == + BLKS_PER_SEC(sbi))) return submitted; if (check_valid_map(sbi, segno, off) == 0) @@ -1519,7 +1520,8 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int start_segno, - struct gc_inode_list *gc_list, int gc_type) + struct gc_inode_list *gc_list, int gc_type, + bool force_migrate) { struct page *sum_page; struct f2fs_summary_block *sum; @@ -1606,7 +1608,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, gc_type); else submitted += gc_data_segment(sbi, sum->entries, gc_list, - segno, gc_type); + segno, gc_type, + force_migrate); stat_inc_seg_count(sbi, type, gc_type); migrated++; @@ -1634,7 +1637,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, } int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, - bool background, unsigned int segno) + bool background, bool force, unsigned int segno) { int gc_type = sync ? 
FG_GC : BG_GC; int sec_freed = 0, seg_freed = 0, total_freed = 0; @@ -1696,7 +1699,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, if (ret) goto stop; - seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type); + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, force); if (gc_type == FG_GC && seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) sec_freed++; @@ -1835,7 +1838,7 @@ static int free_segment_range(struct f2fs_sb_info *sbi, .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; - do_garbage_collect(sbi, segno, &gc_list, FG_GC); + do_garbage_collect(sbi, segno, &gc_list, FG_GC, true); put_gc_inode(&gc_list); if (!gc_only && get_valid_blocks(sbi, segno, true)) { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4c35fefaf4d3..d35f084d40fb 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -504,7 +504,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) */ if (has_not_enough_free_secs(sbi, 0, 0)) { down_write(&sbi->gc_lock); - f2fs_gc(sbi, false, false, NULL_SEGNO); + f2fs_gc(sbi, false, false, false, NULL_SEGNO); } } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c2ba4a087983..bea46e49e35c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1868,7 +1868,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) while (!f2fs_time_over(sbi, DISABLE_TIME)) { down_write(&sbi->gc_lock); - err = f2fs_gc(sbi, true, false, NULL_SEGNO); + err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); if (err == -ENODATA) { err = 0; break; From 841a52ae0711bc3eb9a903624f6cc0b01ec35eea Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 20 Feb 2021 17:35:41 +0800 Subject: [PATCH 04/58] f2fs: fix panic during f2fs_resize_fs() f2fs_resize_fs() hangs in below callstack with testcase: - mkfs 16GB image & mount image - dd 8GB fileA - dd 8GB fileB - sync - rm fileA - sync - resize filesystem to 8GB kernel BUG at segment.c:2484! 
Call Trace: allocate_segment_by_default+0x92/0xf0 [f2fs] f2fs_allocate_data_block+0x44b/0x7e0 [f2fs] do_write_page+0x5a/0x110 [f2fs] f2fs_outplace_write_data+0x55/0x100 [f2fs] f2fs_do_write_data_page+0x392/0x850 [f2fs] move_data_page+0x233/0x320 [f2fs] do_garbage_collect+0x14d9/0x1660 [f2fs] free_segment_range+0x1f7/0x310 [f2fs] f2fs_resize_fs+0x118/0x330 [f2fs] __f2fs_ioctl+0x487/0x3680 [f2fs] __x64_sys_ioctl+0x8e/0xd0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The root cause is we forgot to check that whether we have enough space in resized filesystem to store all valid blocks in before-resizing filesystem, then allocator will run out-of-space during block migration in free_segment_range(). Fixes: b4b10061ef98 ("f2fs: refactor resize_fs to avoid meta updates in progress") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b3af76340026..86ba8ed0b8a7 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1977,7 +1977,20 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) /* stop CP to protect MAIN_SEC in free_segment_range */ f2fs_lock_op(sbi); + + spin_lock(&sbi->stat_lock); + if (shrunk_blocks + valid_user_blocks(sbi) + + sbi->current_reserved_blocks + sbi->unusable_block_count + + F2FS_OPTION(sbi).root_reserved_blocks > sbi->user_block_count) + err = -ENOSPC; + spin_unlock(&sbi->stat_lock); + + if (err) + goto out_unlock; + err = free_segment_range(sbi, secs, true); + +out_unlock: f2fs_unlock_op(sbi); up_write(&sbi->gc_lock); if (err) From b8719b37cd609f443daa8c4f47817fa6ab37e520 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 20 Feb 2021 17:38:41 +0800 Subject: [PATCH 05/58] f2fs: avoid unused f2fs_show_compress_options() LKP reports: fs/f2fs/super.c:1516:20: warning: unused function 'f2fs_show_compress_options' [-Wunused-function] static inline void f2fs_show_compress_options(struct seq_file *seq, Fix this issue by 
covering f2fs_show_compress_options() with CONFIG_F2FS_FS_COMPRESSION macro. Fixes: 4c8ff7095bef ("f2fs: support data compression") Reported-by: kernel test robot Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index bea46e49e35c..8fdcbabe43e8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1619,6 +1619,7 @@ static inline void f2fs_show_quota_options(struct seq_file *seq, #endif } +#ifdef CONFIG_F2FS_FS_COMPRESSION static inline void f2fs_show_compress_options(struct seq_file *seq, struct super_block *sb) { @@ -1664,6 +1665,7 @@ static inline void f2fs_show_compress_options(struct seq_file *seq, else if (F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER) seq_printf(seq, ",compress_mode=%s", "user"); } +#endif static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { From 14d0fdac3c6c0666bd3923400a1dc67e20ef577d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 20 Feb 2021 17:38:42 +0800 Subject: [PATCH 06/58] f2fs: remove unused FORCE_FG_GC macro FORCE_FG_GC was introduced by commit 6aefd93b0137 ("f2fs: introduce background_gc=sync mount option"), but has never been used; remove it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 229814b4f4a6..144980b62f9e 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -172,12 +172,10 @@ enum { /* * BG_GC means the background cleaning job. * FG_GC means the on-demand cleaning job. - * FORCE_FG_GC means on-demand cleaning job in background. 
*/ enum { BG_GC = 0, FG_GC, - FORCE_FG_GC, }; /* for a function parameter to select a victim segment */ From 6cd5fef3901bf42f106935a39e2f1ebbbe62309d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 20 Feb 2021 17:38:43 +0800 Subject: [PATCH 07/58] f2fs: update comments for explicit memory barrier Add more detailed comments for explicit memory barrier used by f2fs, in order to enhance code readability. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 6 +++++- fs/f2fs/segment.c | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 762c035d744a..31f951e31dca 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1818,7 +1818,11 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) llist_add(&req.llnode, &cprc->issue_list); atomic_inc(&cprc->queued_ckpt); - /* update issue_list before we wake up issue_checkpoint thread */ + /* + * update issue_list before we wake up issue_checkpoint thread, + * this smp_mb() pairs with another barrier in ___wait_event(), + * see more details in comments of waitqueue_active(). + */ smp_mb(); if (waitqueue_active(&cprc->ckpt_wait_queue)) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d35f084d40fb..b1f1f92f7336 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -653,7 +653,11 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) llist_add(&cmd.llnode, &fcc->issue_list); - /* update issue_list before we wake up issue_flush thread */ + /* + * update issue_list before we wake up issue_flush thread, this + * smp_mb() pairs with another barrier in ___wait_event(), see + * more details in comments of waitqueue_active(). 
+ */ smp_mb(); if (waitqueue_active(&fcc->flush_wait_queue)) From 7688fb401da409a85b53605d5ce6430f864fad97 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 22 Feb 2021 18:07:33 +0800 Subject: [PATCH 08/58] f2fs: check discard command number before traversing discard pending list In trim thread, let's add a condition to check discard command number before traversing discard pending list; this avoids unneeded traversing if there is no discard command. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b1f1f92f7336..9ba8ef511d7c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1759,6 +1759,8 @@ static int issue_discard_thread(void *data) wait_ms = dpolicy.max_interval; continue; } + if (!atomic_read(&dcc->discard_cmd_cnt)) + continue; if (sbi->gc_mode == GC_URGENT_HIGH) __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); From 1e2bd9c6c28c2f797adcb667c5ca65307a405347 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 26 Feb 2021 16:51:42 +0100 Subject: [PATCH 09/58] f2fs: compress: Allow modular (de)compression algorithms If F2FS_FS is modular, enabling the compression options F2FS_FS_{LZ4,LZ4HC,LZO,LZORLE,ZSTD} will make the (de)compression algorithms {LZ4,LZ4HC,LZO,ZSTD}_{,DE}COMPRESS builtin instead of modular, as the former depend on an intermediate boolean F2FS_FS_COMPRESSION, which in turn depends on tristate F2FS_FS. Indeed, if a boolean symbol A depends directly on a tristate symbol B and selects another tristate symbol C: tristate B tristate C bool A depends on B select C and B is modular, then C will also be modular. However, if there is an intermediate boolean D in the dependency chain between A and B: tristate B tristate C bool D depends on B bool A depends on D select C then the modular state won't propagate from B to C, and C will be builtin instead of modular. 
As modular dependency propagation through intermediate symbols is obscure, fix this in a robust way by moving the selection of tristate (de)compression algorithms from the boolean compression options to the tristate main F2FS_FS option. Signed-off-by: Geert Uytterhoeven Reviewed-by: Chao Yu Reviewed-by: Masahiro Yamada Signed-off-by: Jaegeuk Kim --- fs/f2fs/Kconfig | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 62e638a49bbf..7669de7b49ce 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -7,6 +7,13 @@ config F2FS_FS select CRYPTO_CRC32 select F2FS_FS_XATTR if FS_ENCRYPTION select FS_ENCRYPTION_ALGS if FS_ENCRYPTION + select LZ4_COMPRESS if F2FS_FS_LZ4 + select LZ4_DECOMPRESS if F2FS_FS_LZ4 + select LZ4HC_COMPRESS if F2FS_FS_LZ4HC + select LZO_COMPRESS if F2FS_FS_LZO + select LZO_DECOMPRESS if F2FS_FS_LZO + select ZSTD_COMPRESS if F2FS_FS_ZSTD + select ZSTD_DECOMPRESS if F2FS_FS_ZSTD help F2FS is based on Log-structured File System (LFS), which supports versatile "flash-friendly" features. The design has been focused on @@ -94,8 +101,6 @@ config F2FS_FS_COMPRESSION config F2FS_FS_LZO bool "LZO compression support" depends on F2FS_FS_COMPRESSION - select LZO_COMPRESS - select LZO_DECOMPRESS default y help Support LZO compress algorithm, if unsure, say Y. @@ -103,8 +108,6 @@ config F2FS_FS_LZO config F2FS_FS_LZ4 bool "LZ4 compression support" depends on F2FS_FS_COMPRESSION - select LZ4_COMPRESS - select LZ4_DECOMPRESS default y help Support LZ4 compress algorithm, if unsure, say Y. 
@@ -113,7 +116,6 @@ config F2FS_FS_LZ4HC bool "LZ4HC compression support" depends on F2FS_FS_COMPRESSION depends on F2FS_FS_LZ4 - select LZ4HC_COMPRESS default y help Support LZ4HC compress algorithm, LZ4HC has compatible on-disk @@ -122,8 +124,6 @@ config F2FS_FS_LZ4HC config F2FS_FS_ZSTD bool "ZSTD compression support" depends on F2FS_FS_COMPRESSION - select ZSTD_COMPRESS - select ZSTD_DECOMPRESS default y help Support ZSTD compress algorithm, if unsure, say Y. @@ -132,8 +132,6 @@ config F2FS_FS_LZORLE bool "LZO-RLE compression support" depends on F2FS_FS_COMPRESSION depends on F2FS_FS_LZO - select LZO_COMPRESS - select LZO_DECOMPRESS default y help Support LZO-RLE compress algorithm, if unsure, say Y. From 54af5b35f42b5b0aabecd3a169d2875bb3003c46 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 24 Feb 2021 13:03:13 -0600 Subject: [PATCH 10/58] f2fs: Replace one-element array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. Refactor the code according to the use of a flexible-array member in struct f2fs_checkpoint, instead of a one-element arrays. 
Notice that a temporary pointer to void '*tmp_ptr' was used in order to fix the following errors when using a flexible array instead of a one element array in struct f2fs_checkpoint: CC [M] fs/f2fs/dir.o In file included from fs/f2fs/dir.c:13: fs/f2fs/f2fs.h: In function ‘__bitmap_ptr’: fs/f2fs/f2fs.h:2227:40: error: invalid use of flexible array member 2227 | return &ckpt->sit_nat_version_bitmap + offset + sizeof(__le32); | ^ fs/f2fs/f2fs.h:2227:49: error: invalid use of flexible array member 2227 | return &ckpt->sit_nat_version_bitmap + offset + sizeof(__le32); | ^ fs/f2fs/f2fs.h:2238:40: error: invalid use of flexible array member 2238 | return &ckpt->sit_nat_version_bitmap + offset; | ^ make[2]: *** [scripts/Makefile.build:287: fs/f2fs/dir.o] Error 1 make[1]: *** [scripts/Makefile.build:530: fs/f2fs] Error 2 make: *** [Makefile:1819: fs] Error 2 [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.9/process/deprecated.html#zero-length-and-one-element-arrays Link: https://github.com/KSPP/linux/issues/79 Build-tested-by: kernel test robot Link: https://lore.kernel.org/lkml/603647e4.DeEFbl4eqljuwAUe%25lkp@intel.com/ Signed-off-by: Gustavo A. R. Silva Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 +++-- include/linux/f2fs_fs.h | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 24565f38afd6..2e015d4c9e44 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2215,6 +2215,7 @@ static inline block_t __cp_payload(struct f2fs_sb_info *sbi) static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + void *tmp_ptr = &ckpt->sit_nat_version_bitmap; int offset; if (is_set_ckpt_flags(sbi, CP_LARGE_NAT_BITMAP_FLAG)) { @@ -2224,7 +2225,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) * if large_nat_bitmap feature is enabled, leave checksum * protection for all nat/sit bitmaps. 
*/ - return &ckpt->sit_nat_version_bitmap + offset + sizeof(__le32); + return tmp_ptr + offset + sizeof(__le32); } if (__cp_payload(sbi) > 0) { @@ -2235,7 +2236,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) } else { offset = (flag == NAT_BITMAP) ? le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0; - return &ckpt->sit_nat_version_bitmap + offset; + return tmp_ptr + offset; } } diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index c6cc0a566ef5..5487a80617a3 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -168,7 +168,7 @@ struct f2fs_checkpoint { unsigned char alloc_type[MAX_ACTIVE_LOGS]; /* SIT and NAT version bitmap */ - unsigned char sit_nat_version_bitmap[1]; + unsigned char sit_nat_version_bitmap[]; } __packed; #define CP_CHKSUM_OFFSET 4092 /* default chksum offset in checkpoint */ From 8e8184cc49d01edbd319eab785a55d5749de5d5a Mon Sep 17 00:00:00 2001 From: "huangjianan@oppo.com" Date: Sat, 27 Feb 2021 20:02:29 +0800 Subject: [PATCH 11/58] f2fs: remove unnecessary IS_SWAPFILE check Now swapfile in f2fs directly submit IO to blockdev according to swapfile extents reported by f2fs when swapon, therefore there is no need to check IS_SWAPFILE when exec filesystem operation. Signed-off-by: Huang Jianan Signed-off-by: Guo Weichao Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 525a21133165..e0dc3b9d5b0e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1733,7 +1733,7 @@ static int get_data_block_dio_write(struct inode *inode, sector_t iblock, return __get_data_block(inode, iblock, bh_result, create, F2FS_GET_BLOCK_DIO, NULL, f2fs_rw_hint_to_seg_type(inode->i_write_hint), - IS_SWAPFILE(inode) ? 
false : true); + true); } static int get_data_block_dio(struct inode *inode, sector_t iblock, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2e015d4c9e44..ab9d7d1fa61b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4176,8 +4176,7 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, if (F2FS_IO_ALIGNED(sbi)) return true; } - if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED) && - !IS_SWAPFILE(inode)) + if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED)) return true; return false; From e0a507f5c5456f92b43201f537bf7b02938f484c Mon Sep 17 00:00:00 2001 From: "huangjianan@oppo.com" Date: Sat, 27 Feb 2021 20:02:30 +0800 Subject: [PATCH 12/58] f2fs: fix last_lblock check in check_swap_activate_fast Because page_no < sis->max guarantees that the while loop breaks out normally, the wrong check condition here doesn't cause a problem. Signed-off-by: Huang Jianan Signed-off-by: Guo Weichao Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e0dc3b9d5b0e..6396bf9b4736 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3814,7 +3814,7 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, last_lblock = bytes_to_blks(inode, i_size_read(inode)); len = i_size_read(inode); - while (cur_lblock <= last_lblock && cur_lblock < sis->max) { + while (cur_lblock < last_lblock && cur_lblock < sis->max) { struct f2fs_map_blocks map; pgoff_t next_pgofs; From 533d45fefdf652b93accd008fae8197c536b1d28 Mon Sep 17 00:00:00 2001 From: "huangjianan@oppo.com" Date: Mon, 1 Mar 2021 12:58:44 +0800 Subject: [PATCH 13/58] f2fs: check if swapfile is section-aligned If the swapfile isn't created by pin and fallocate, it can't be guaranteed section-aligned, so it may be selected by f2fs gc. 
When gc_pin_file_threshold is reached, the address of swapfile may change, but won't be synchronized to swap_extent, so swap will write to wrong address, which will cause data corruption. Signed-off-by: Huang Jianan Signed-off-by: Guo Weichao Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 109 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 88 insertions(+), 21 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 6396bf9b4736..299ed02ceeee 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3791,11 +3791,64 @@ int f2fs_migrate_page(struct address_space *mapping, #endif #ifdef CONFIG_SWAP +static int f2fs_is_file_aligned(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + block_t main_blkaddr = SM_I(sbi)->main_blkaddr; + block_t cur_lblock; + block_t last_lblock; + block_t pblock; + unsigned long nr_pblocks; + unsigned int blocks_per_sec = BLKS_PER_SEC(sbi); + int ret = 0; + + cur_lblock = 0; + last_lblock = bytes_to_blks(inode, i_size_read(inode)); + + while (cur_lblock < last_lblock) { + struct f2fs_map_blocks map; + + memset(&map, 0, sizeof(map)); + map.m_lblk = cur_lblock; + map.m_len = last_lblock - cur_lblock; + map.m_next_pgofs = NULL; + map.m_next_extent = NULL; + map.m_seg_type = NO_CHECK_TYPE; + map.m_may_create = false; + + ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); + if (ret) + goto out; + + /* hole */ + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + f2fs_err(sbi, "Swapfile has holes\n"); + ret = -ENOENT; + goto out; + } + + pblock = map.m_pblk; + nr_pblocks = map.m_len; + + if ((pblock - main_blkaddr) & (blocks_per_sec - 1) || + nr_pblocks & (blocks_per_sec - 1)) { + f2fs_err(sbi, "Swapfile does not align to section"); + ret = -EINVAL; + goto out; + } + + cur_lblock += nr_pblocks; + } +out: + return ret; +} + static int check_swap_activate_fast(struct swap_info_struct *sis, struct file *swap_file, sector_t *span) { struct address_space *mapping = swap_file->f_mapping; 
struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); sector_t cur_lblock; sector_t last_lblock; sector_t pblock; @@ -3803,8 +3856,8 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, sector_t highest_pblock = 0; int nr_extents = 0; unsigned long nr_pblocks; - u64 len; - int ret; + unsigned int blocks_per_sec = BLKS_PER_SEC(sbi); + int ret = 0; /* * Map all the blocks into the extent list. This code doesn't try @@ -3812,31 +3865,41 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, */ cur_lblock = 0; last_lblock = bytes_to_blks(inode, i_size_read(inode)); - len = i_size_read(inode); while (cur_lblock < last_lblock && cur_lblock < sis->max) { struct f2fs_map_blocks map; - pgoff_t next_pgofs; cond_resched(); memset(&map, 0, sizeof(map)); map.m_lblk = cur_lblock; - map.m_len = bytes_to_blks(inode, len) - cur_lblock; - map.m_next_pgofs = &next_pgofs; + map.m_len = last_lblock - cur_lblock; + map.m_next_pgofs = NULL; + map.m_next_extent = NULL; map.m_seg_type = NO_CHECK_TYPE; + map.m_may_create = false; ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); if (ret) - goto err_out; + goto out; /* hole */ - if (!(map.m_flags & F2FS_MAP_FLAGS)) - goto err_out; + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + f2fs_err(sbi, "Swapfile has holes\n"); + ret = -ENOENT; + goto out; + } pblock = map.m_pblk; nr_pblocks = map.m_len; + if ((pblock - SM_I(sbi)->main_blkaddr) & (blocks_per_sec - 1) || + nr_pblocks & (blocks_per_sec - 1)) { + f2fs_err(sbi, "Swapfile does not align to section"); + ret = -EINVAL; + goto out; + } + if (cur_lblock + nr_pblocks >= sis->max) nr_pblocks = sis->max - cur_lblock; @@ -3865,9 +3928,6 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, sis->highest_bit = cur_lblock - 1; out: return ret; -err_out: - pr_err("swapon: swapfile has holes\n"); - return -EINVAL; } /* Copied from generic_swapfile_activate() to check any holes */ @@ -3876,6 +3936,7 @@ static int 
check_swap_activate(struct swap_info_struct *sis, { struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned blocks_per_page; unsigned long page_no; sector_t probe_block; @@ -3883,11 +3944,15 @@ static int check_swap_activate(struct swap_info_struct *sis, sector_t lowest_block = -1; sector_t highest_block = 0; int nr_extents = 0; - int ret; + int ret = 0; if (PAGE_SIZE == F2FS_BLKSIZE) return check_swap_activate_fast(sis, swap_file, span); + ret = f2fs_is_file_aligned(inode); + if (ret) + goto out; + blocks_per_page = bytes_to_blks(inode, PAGE_SIZE); /* @@ -3902,13 +3967,14 @@ static int check_swap_activate(struct swap_info_struct *sis, unsigned block_in_page; sector_t first_block; sector_t block = 0; - int err = 0; cond_resched(); block = probe_block; - err = bmap(inode, &block); - if (err || !block) + ret = bmap(inode, &block); + if (ret) + goto out; + if (!block) goto bad_bmap; first_block = block; @@ -3924,9 +3990,10 @@ static int check_swap_activate(struct swap_info_struct *sis, block_in_page++) { block = probe_block + block_in_page; - err = bmap(inode, &block); - - if (err || !block) + ret = bmap(inode, &block); + if (ret) + goto out; + if (!block) goto bad_bmap; if (block != first_block + block_in_page) { @@ -3966,8 +4033,8 @@ static int check_swap_activate(struct swap_info_struct *sis, out: return ret; bad_bmap: - pr_err("swapon: swapfile has holes\n"); - return -EINVAL; + f2fs_err(sbi, "Swapfile has holes\n"); + return -ENOENT; } static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, From 60eac71600e4465aae80de4cdd6db0cc08cfb50f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 2 Mar 2021 16:35:32 +0800 Subject: [PATCH 14/58] f2fs: remove unused file_clear_encrypt() - file_clear_encrypt() was never be used, remove it. - In addition, relocating macros for cleanup. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ab9d7d1fa61b..6af9e0cbc8b3 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -637,21 +637,26 @@ enum { #define FADVISE_MODIFIABLE_BITS (FADVISE_COLD_BIT | FADVISE_HOT_BIT) #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) -#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) #define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT) -#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT) #define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT) + +#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) +#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT) #define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT) + #define file_is_encrypt(inode) is_file(inode, FADVISE_ENCRYPT_BIT) #define file_set_encrypt(inode) set_file(inode, FADVISE_ENCRYPT_BIT) -#define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT) + #define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) + #define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT) #define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT) + #define file_is_hot(inode) is_file(inode, FADVISE_HOT_BIT) #define file_set_hot(inode) set_file(inode, FADVISE_HOT_BIT) #define file_clear_hot(inode) clear_file(inode, FADVISE_HOT_BIT) + #define file_is_verity(inode) is_file(inode, FADVISE_VERITY_BIT) #define file_set_verity(inode) set_file(inode, FADVISE_VERITY_BIT) From 5459e25504c58b7519ec6d733274c99ae72ff0c3 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 4 Mar 2021 09:21:18 +0000 Subject: [PATCH 15/58] f2fs: fix a redundant call to f2fs_balance_fs if an error occurs The uninitialized variable dn.node_changed does not get set 
when a call to f2fs_get_node_page fails. This uninitialized value gets used in the call to f2fs_balance_fs() that may or may not balance dirty node and dentry pages depending on the uninitialized state of the variable. Fix this by only calling f2fs_balance_fs if err is not set. Thanks to Jaegeuk Kim for suggesting an appropriate fix. Addresses-Coverity: ("Uninitialized scalar variable") Fixes: 2a3407607028 ("f2fs: call f2fs_balance_fs only when node was changed") Signed-off-by: Colin Ian King Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inline.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 993caefcd2bb..92652ca7a7c8 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -219,7 +219,8 @@ int f2fs_convert_inline_inode(struct inode *inode) f2fs_put_page(page, 1); - f2fs_balance_fs(sbi, dn.node_changed); + if (!err) + f2fs_balance_fs(sbi, dn.node_changed); return err; } From 672d8b3cab7a07e5787f3e76ba703dcdcf4c8fe6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 4 Mar 2021 21:43:10 -0800 Subject: [PATCH 16/58] f2fs: fix error handling in f2fs_end_enable_verity() f2fs didn't properly clean up if verity failed to be enabled on a file: - It left verity metadata (pages past EOF) in the page cache, which would be exposed to userspace if the file was later extended. - It didn't truncate the verity metadata at all (either from cache or from disk) if an error occurred while setting the verity bit. Fix these bugs by adding a call to truncate_inode_pages() and ensuring that we truncate the verity metadata (both from cache and from disk) in all error paths. Also rework the code to cleanly separate the success path from the error paths, which makes it much easier to understand. Finally, log a message if f2fs_truncate() fails, since it might otherwise fail silently.
Reported-by: Yunlei He Fixes: 95ae251fe828 ("f2fs: add fs-verity support") Cc: # v5.4+ Signed-off-by: Eric Biggers Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/verity.c | 75 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 21 deletions(-) diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 054ec852b5ea..15ba36926fad 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -152,40 +152,73 @@ static int f2fs_end_enable_verity(struct file *filp, const void *desc, size_t desc_size, u64 merkle_tree_size) { struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); u64 desc_pos = f2fs_verity_metadata_pos(inode) + merkle_tree_size; struct fsverity_descriptor_location dloc = { .version = cpu_to_le32(F2FS_VERIFY_VER), .size = cpu_to_le32(desc_size), .pos = cpu_to_le64(desc_pos), }; - int err = 0; + int err = 0, err2 = 0; - if (desc != NULL) { - /* Succeeded; write the verity descriptor. */ - err = pagecache_write(inode, desc, desc_size, desc_pos); + /* + * If an error already occurred (which fs/verity/ signals by passing + * desc == NULL), then only clean-up is needed. + */ + if (desc == NULL) + goto cleanup; - /* Write all pages before clearing FI_VERITY_IN_PROGRESS. */ - if (!err) - err = filemap_write_and_wait(inode->i_mapping); - } + /* Append the verity descriptor. */ + err = pagecache_write(inode, desc, desc_size, desc_pos); + if (err) + goto cleanup; - /* If we failed, truncate anything we wrote past i_size. */ - if (desc == NULL || err) - f2fs_truncate(inode); + /* + * Write all pages (both data and verity metadata). Note that this must + * happen before clearing FI_VERITY_IN_PROGRESS; otherwise pages beyond + * i_size won't be written properly. For crash consistency, this also + * must happen before the verity inode flag gets persisted. + */ + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto cleanup; + + /* Set the verity xattr. 
*/ + err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY, + F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), + NULL, XATTR_CREATE); + if (err) + goto cleanup; + + /* Finally, set the verity inode flag. */ + file_set_verity(inode); + f2fs_set_inode_flags(inode); + f2fs_mark_inode_dirty_sync(inode, true); clear_inode_flag(inode, FI_VERITY_IN_PROGRESS); + return 0; - if (desc != NULL && !err) { - err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY, - F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), - NULL, XATTR_CREATE); - if (!err) { - file_set_verity(inode); - f2fs_set_inode_flags(inode); - f2fs_mark_inode_dirty_sync(inode, true); - } +cleanup: + /* + * Verity failed to be enabled, so clean up by truncating any verity + * metadata that was written beyond i_size (both from cache and from + * disk) and clearing FI_VERITY_IN_PROGRESS. + * + * Taking i_gc_rwsem[WRITE] is needed to stop f2fs garbage collection + * from re-instantiating cached pages we are truncating (since unlike + * normal file accesses, garbage collection isn't limited by i_size). + */ + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + truncate_inode_pages(inode->i_mapping, inode->i_size); + err2 = f2fs_truncate(inode); + if (err2) { + f2fs_err(sbi, "Truncating verity metadata failed (errno=%d)", + err2); + set_sbi_flag(sbi, SBI_NEED_FSCK); } - return err; + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + clear_inode_flag(inode, FI_VERITY_IN_PROGRESS); + return err ?: err2; } static int f2fs_get_verity_descriptor(struct inode *inode, void *buf, From a5407f50d3837e52826aa9525c58665636995543 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 1 Mar 2021 17:28:16 -0800 Subject: [PATCH 17/58] f2fs: expose # of overprivision segments This is useful when checking conditions during checkpoint=disable in Android. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 5 +++++ fs/f2fs/sysfs.c | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 9fa5a528cc23..4aa8f38b52d7 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -409,3 +409,8 @@ Description: Give a way to change checkpoint merge daemon's io priority. I/O priority "3". We can select the class between "rt" and "be", and set the I/O priority within valid range of it. "," delimiter is necessary in between I/O class and priority number. + +What: /sys/fs/f2fs//ovp_segments +Date: March 2021 +Contact: "Jaegeuk Kim" +Description: Shows the number of overprovision segments. diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index e38a7f6921dd..0c391ab2d8b7 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -91,6 +91,13 @@ static ssize_t free_segments_show(struct f2fs_attr *a, (unsigned long long)(free_segments(sbi))); } +static ssize_t ovp_segments_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%llu\n", + (unsigned long long)(overprovision_segments(sbi))); +} + static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -629,6 +636,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio); F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); +F2FS_GENERAL_RO_ATTR(ovp_segments); F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); F2FS_GENERAL_RO_ATTR(features); F2FS_GENERAL_RO_ATTR(current_reserved_blocks); @@ -715,6 +723,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(ckpt_thread_ioprio), ATTR_LIST(dirty_segments), ATTR_LIST(free_segments), + ATTR_LIST(ovp_segments), ATTR_LIST(unusable), ATTR_LIST(lifetime_write_kbytes), 
ATTR_LIST(features), From 26423921acc8a69eff9a1a4e3a10e53d7591177f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 5 Mar 2021 17:56:01 +0800 Subject: [PATCH 18/58] f2fs: fix to align to section for fallocate() on pinned file Now, fallocate() on a pinned file only allocates blocks which aligns to segment rather than section, so GC may try to migrate pinned file's block, and after several times of failure, pinned file's block could be migrated to other place, however user won't be aware of such condition, and then old obsolete block address may be readed/written incorrectly. To avoid such condition, let's try to allocate pinned file's blocks with section alignment. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 19 +++++++++---------- fs/f2fs/segment.c | 34 ++++++++++++++++++++++++++-------- 3 files changed, 36 insertions(+), 19 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6af9e0cbc8b3..5575f090161b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3388,7 +3388,7 @@ void f2fs_get_new_segment(struct f2fs_sb_info *sbi, unsigned int *newseg, bool new_sec, int dir); void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end); -void f2fs_allocate_new_segment(struct f2fs_sb_info *sbi, int type); +void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type); void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi); int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9aa258baac55..8c46ff4de1b7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1644,27 +1644,26 @@ static int expand_inode_data(struct inode *inode, loff_t offset, return 0; if (f2fs_is_pinned_file(inode)) { - block_t len = (map.m_len >> sbi->log_blocks_per_seg) << - sbi->log_blocks_per_seg; + block_t sec_blks = BLKS_PER_SEC(sbi); + block_t sec_len = 
roundup(map.m_len, sec_blks); block_t done = 0; - if (map.m_len % sbi->blocks_per_seg) - len += sbi->blocks_per_seg; - - map.m_len = sbi->blocks_per_seg; + map.m_len = sec_blks; next_alloc: if (has_not_enough_free_secs(sbi, 0, GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { down_write(&sbi->gc_lock); err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); - if (err && err != -ENODATA && err != -EAGAIN) + if (err && err != -ENODATA && err != -EAGAIN) { + map.m_len = done; goto out_err; + } } down_write(&sbi->pin_sem); f2fs_lock_op(sbi); - f2fs_allocate_new_segment(sbi, CURSEG_COLD_DATA_PINNED); + f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED); f2fs_unlock_op(sbi); map.m_seg_type = CURSEG_COLD_DATA_PINNED; @@ -1673,9 +1672,9 @@ static int expand_inode_data(struct inode *inode, loff_t offset, up_write(&sbi->pin_sem); done += map.m_len; - len -= map.m_len; + sec_len -= map.m_len; map.m_lblk += map.m_len; - if (!err && len) + if (!err && sec_len) goto next_alloc; map.m_len = done; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9ba8ef511d7c..5092917b8fcf 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2899,7 +2899,8 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, up_read(&SM_I(sbi)->curseg_lock); } -static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type) +static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, + bool new_sec) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int old_segno; @@ -2907,10 +2908,22 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type) if (!curseg->inited) goto alloc; - if (!curseg->next_blkoff && - !get_valid_blocks(sbi, curseg->segno, false) && - !get_ckpt_valid_blocks(sbi, curseg->segno)) - return; + if (curseg->next_blkoff || + get_valid_blocks(sbi, curseg->segno, new_sec)) + goto alloc; + + if (new_sec) { + unsigned int segno = START_SEGNO(curseg->segno); + int i; + + for (i = 0; i < sbi->segs_per_sec; i++, segno++) { 
+ if (get_ckpt_valid_blocks(sbi, segno)) + goto alloc; + } + } else { + if (!get_ckpt_valid_blocks(sbi, curseg->segno)) + return; + } alloc: old_segno = curseg->segno; @@ -2918,10 +2931,15 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type) locate_dirty_segment(sbi, old_segno); } -void f2fs_allocate_new_segment(struct f2fs_sb_info *sbi, int type) +static void __allocate_new_section(struct f2fs_sb_info *sbi, int type) +{ + __allocate_new_segment(sbi, type, true); +} + +void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type) { down_write(&SIT_I(sbi)->sentry_lock); - __allocate_new_segment(sbi, type); + __allocate_new_section(sbi, type); up_write(&SIT_I(sbi)->sentry_lock); } @@ -2931,7 +2949,7 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) down_write(&SIT_I(sbi)->sentry_lock); for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) - __allocate_new_segment(sbi, i); + __allocate_new_segment(sbi, i, false); up_write(&SIT_I(sbi)->sentry_lock); } From a5717d2ca674644caa5ab9d5c5334ec94991a560 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Tue, 9 Mar 2021 13:21:18 +0800 Subject: [PATCH 19/58] f2fs: fix to use per-inode maxbytes in f2fs_fiemap F2FS inode may have different max size, so change to use per-inode maxbytes. 
Signed-off-by: Chengguang Xu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 299ed02ceeee..ef208b36bf5c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1848,6 +1848,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int ret = 0; bool compr_cluster = false; unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; + loff_t maxbytes; if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { ret = f2fs_precache_extents(inode); @@ -1861,6 +1862,15 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, inode_lock(inode); + maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS; + if (start > maxbytes) { + ret = -EFBIG; + goto out; + } + + if (len > maxbytes || (maxbytes - len) < start) + len = maxbytes - start; + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { ret = f2fs_xattr_fiemap(inode, fieinfo); goto out; From 48b532703f8be2afa98bd9020ce5c6875e3ab021 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 15 Mar 2021 17:12:33 +0900 Subject: [PATCH 20/58] f2fs: add sysfs nodes to get runtime compression stat I've added new sysfs nodes to show runtime compression stat since mount. 
compr_written_block - show the block count written after compression compr_saved_block - show the saved block count with compression compr_new_inode - show the count of inode newly enabled for compression Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 24 ++++++++++++++++ fs/f2fs/compress.c | 1 + fs/f2fs/f2fs.h | 19 +++++++++++++ fs/f2fs/sysfs.c | 38 +++++++++++++++++++++++++ 4 files changed, 82 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 4aa8f38b52d7..4849b8e84e42 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -414,3 +414,27 @@ What: /sys/fs/f2fs//ovp_segments Date: March 2021 Contact: "Jaegeuk Kim" Description: Shows the number of overprovision segments. + +What: /sys/fs/f2fs//compr_written_block +Date: March 2021 +Contact: "Daeho Jeong" +Description: Show the block count written after compression since mount. Note + that when the compressed blocks are deleted, this count doesn't + decrease. If you write "0" here, you can initialize + compr_written_block and compr_saved_block to "0". + +What: /sys/fs/f2fs//compr_saved_block +Date: March 2021 +Contact: "Daeho Jeong" +Description: Show the saved block count with compression since mount. Note + that when the compressed blocks are deleted, this count doesn't + decrease. If you write "0" here, you can initialize + compr_written_block and compr_saved_block to "0". + +What: /sys/fs/f2fs//compr_new_inode +Date: March 2021 +Contact: "Daeho Jeong" +Description: Show the count of inode newly enabled for compression since mount. + Note that when the compression is disabled for the files, this count + doesn't decrease. If you write "0" here, you can initialize + compr_new_inode to "0". 
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 77fa342de38f..3c9d797dbdd6 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1353,6 +1353,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, if (fio.compr_blocks) f2fs_i_compr_blocks_update(inode, fio.compr_blocks - 1, false); f2fs_i_compr_blocks_update(inode, cc->nr_cpages, true); + add_compr_block_stat(inode, cc->nr_cpages); set_inode_flag(cc->inode, FI_APPEND_WRITE); if (cc->cluster_idx == 0) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5575f090161b..85e53ee770d0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1628,6 +1628,11 @@ struct f2fs_sb_info { #ifdef CONFIG_F2FS_FS_COMPRESSION struct kmem_cache *page_array_slab; /* page array entry */ unsigned int page_array_slab_size; /* default page array slab size */ + + /* For runtime compression statistics */ + u64 compr_written_block; + u64 compr_saved_block; + u32 compr_new_inode; #endif }; @@ -3960,6 +3965,18 @@ int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi); void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi); int __init f2fs_init_compress_cache(void); void f2fs_destroy_compress_cache(void); +#define inc_compr_inode_stat(inode) \ + do { \ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); \ + sbi->compr_new_inode++; \ + } while (0) +#define add_compr_block_stat(inode, blocks) \ + do { \ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); \ + int diff = F2FS_I(inode)->i_cluster_size - blocks; \ + sbi->compr_written_block += blocks; \ + sbi->compr_saved_block += diff; \ + } while (0) #else static inline bool f2fs_is_compressed_page(struct page *page) { return false; } static inline bool f2fs_is_compress_backend_ready(struct inode *inode) @@ -3988,6 +4005,7 @@ static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { } static inline int __init f2fs_init_compress_cache(void) { return 0; } static inline void 
f2fs_destroy_compress_cache(void) { } +#define inc_compr_inode_stat(inode) do { } while (0) #endif static inline void set_compress_context(struct inode *inode) @@ -4011,6 +4029,7 @@ static inline void set_compress_context(struct inode *inode) F2FS_I(inode)->i_flags |= F2FS_COMPR_FL; set_inode_flag(inode, FI_COMPRESSED_FILE); stat_inc_compr_inode(inode); + inc_compr_inode_stat(inode); f2fs_mark_inode_dirty_sync(inode, true); } diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 0c391ab2d8b7..39b522ec73e7 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -289,6 +290,17 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, return len; } +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (!strcmp(a->attr.name, "compr_written_block")) + return sysfs_emit(buf, "%llu\n", sbi->compr_written_block); + + if (!strcmp(a->attr.name, "compr_saved_block")) + return sysfs_emit(buf, "%llu\n", sbi->compr_saved_block); + + if (!strcmp(a->attr.name, "compr_new_inode")) + return sysfs_emit(buf, "%u\n", sbi->compr_new_inode); +#endif + ui = (unsigned int *)(ptr + a->offset); return sprintf(buf, "%u\n", *ui); @@ -465,6 +477,24 @@ static ssize_t __sbi_store(struct f2fs_attr *a, return count; } +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (!strcmp(a->attr.name, "compr_written_block") || + !strcmp(a->attr.name, "compr_saved_block")) { + if (t != 0) + return -EINVAL; + sbi->compr_written_block = 0; + sbi->compr_saved_block = 0; + return count; + } + + if (!strcmp(a->attr.name, "compr_new_inode")) { + if (t != 0) + return -EINVAL; + sbi->compr_new_inode = 0; + return count; + } +#endif + *ui = (unsigned int)t; return count; @@ -676,6 +706,9 @@ F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM); F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD); #ifdef CONFIG_F2FS_FS_COMPRESSION F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_written_block, compr_written_block); 
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_saved_block, compr_saved_block); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_new_inode, compr_new_inode); #endif #define ATTR_LIST(name) (&f2fs_attr_##name.attr) @@ -739,6 +772,11 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(moved_blocks_foreground), ATTR_LIST(moved_blocks_background), ATTR_LIST(avg_vblocks), +#endif +#ifdef CONFIG_F2FS_FS_COMPRESSION + ATTR_LIST(compr_written_block), + ATTR_LIST(compr_saved_block), + ATTR_LIST(compr_new_inode), #endif NULL, }; From f7dca8471da56afcd5d89d21e59e7a1789ffcf76 Mon Sep 17 00:00:00 2001 From: Weichao Guo Date: Wed, 17 Mar 2021 17:27:23 +0800 Subject: [PATCH 21/58] f2fs: do not use AT_SSR mode in FG_GC & high urgent BG_GC AT_SSR mode is introduced by age threshold based GC for better hot/cold data separation and avoiding free segment cost. However, LFS write mode is preferred in the scenario of foreground or high urgent GC, which should be finished ASAP. Let's only use AT_SSR in background GC and not high urgent GC modes. Signed-off-by: Weichao Guo Signed-off-by: Huang Jianan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 3 ++- fs/f2fs/segment.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 86ba8ed0b8a7..d96acc6531f2 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1120,7 +1120,8 @@ static int move_data_block(struct inode *inode, block_t bidx, block_t newaddr; int err = 0; bool lfs_mode = f2fs_lfs_mode(fio.sbi); - int type = fio.sbi->am.atgc_enabled ? + int type = fio.sbi->am.atgc_enabled && (gc_type == BG_GC) && + (fio.sbi->gc_mode != GC_URGENT_HIGH) ?
CURSEG_ALL_DATA_ATGC : CURSEG_COLD_DATA; /* do not read out */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5092917b8fcf..34c8e201d6a1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3263,7 +3263,9 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) struct inode *inode = fio->page->mapping->host; if (is_cold_data(fio->page)) { - if (fio->sbi->am.atgc_enabled) + if (fio->sbi->am.atgc_enabled && + (fio->io_type == FS_DATA_IO) && + (fio->sbi->gc_mode != GC_URGENT_HIGH)) return CURSEG_ALL_DATA_ATGC; else return CURSEG_COLD_DATA; From a940a9ad962a7da8224ebb458f6eaf589a35dcb3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 17 Mar 2021 17:56:03 +0800 Subject: [PATCH 22/58] f2fs: don't start checkpoint thread in readonly mountpoint In readonly mountpoint, there should be no write IOs include checkpoint IO, so that it's not needed to create kernel checkpoint thread. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8fdcbabe43e8..47ec2ee2b7c9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2072,8 +2072,10 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } } - if (!test_opt(sbi, DISABLE_CHECKPOINT) && - test_opt(sbi, MERGE_CHECKPOINT)) { + if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) || + !test_opt(sbi, MERGE_CHECKPOINT)) { + f2fs_stop_ckpt_thread(sbi); + } else { err = f2fs_start_ckpt_thread(sbi); if (err) { f2fs_err(sbi, @@ -2081,8 +2083,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) err); goto restore_gc; } - } else { - f2fs_stop_ckpt_thread(sbi); } /* @@ -3838,7 +3838,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* setup checkpoint request control and start checkpoint issue thread */ f2fs_init_ckpt_req_control(sbi); - if (!test_opt(sbi, DISABLE_CHECKPOINT) && + if (!f2fs_readonly(sb) && 
!test_opt(sbi, DISABLE_CHECKPOINT) && test_opt(sbi, MERGE_CHECKPOINT)) { err = f2fs_start_ckpt_thread(sbi); if (err) { From 0d36b38e1e9262c3700613401bd4549a8cd24fb3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 22 Mar 2021 19:47:30 +0800 Subject: [PATCH 23/58] f2fs: fix to avoid out-of-bounds memory access butt3rflyh4ck reported a bug found by syzkaller fuzzer with custom modifications in 5.12.0-rc3+ [1]: dump_stack+0xfa/0x151 lib/dump_stack.c:120 print_address_description.constprop.0.cold+0x82/0x32c mm/kasan/report.c:232 __kasan_report mm/kasan/report.c:399 [inline] kasan_report.cold+0x7c/0xd8 mm/kasan/report.c:416 f2fs_test_bit fs/f2fs/f2fs.h:2572 [inline] current_nat_addr fs/f2fs/node.h:213 [inline] get_next_nat_page fs/f2fs/node.c:123 [inline] __flush_nat_entry_set fs/f2fs/node.c:2888 [inline] f2fs_flush_nat_entries+0x258e/0x2960 fs/f2fs/node.c:2991 f2fs_write_checkpoint+0x1372/0x6a70 fs/f2fs/checkpoint.c:1640 f2fs_issue_checkpoint+0x149/0x410 fs/f2fs/checkpoint.c:1807 f2fs_sync_fs+0x20f/0x420 fs/f2fs/super.c:1454 __sync_filesystem fs/sync.c:39 [inline] sync_filesystem fs/sync.c:67 [inline] sync_filesystem+0x1b5/0x260 fs/sync.c:48 generic_shutdown_super+0x70/0x370 fs/super.c:448 kill_block_super+0x97/0xf0 fs/super.c:1394 The root cause is, if nat entry in checkpoint journal area is corrupted, e.g. nid of journalled nat entry exceeds max nid value, during checkpoint, once it tries to flush nat journal to NAT area, get_next_nat_page() may access out-of-bounds memory on nat_bitmap due to it uses wrong nid value as bitmap offset. 
[1] https://lore.kernel.org/lkml/CAFcO6XOMWdr8pObek6eN6-fs58KG9doRFadgJj-FnF-1x43s2g@mail.gmail.com/T/#u Reported-and-tested-by: butt3rflyh4ck Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a8a0fb890e8d..ee0da46eb309 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2785,6 +2785,9 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) struct f2fs_nat_entry raw_ne; nid_t nid = le32_to_cpu(nid_in_journal(journal, i)); + if (f2fs_check_nid_range(sbi, nid)) + continue; + raw_ne = nat_in_journal(journal, i); ne = __lookup_nat_cache(nm_i, nid); From fcbab6e80beb70e18bb98d679ba85b88fed65e8d Mon Sep 17 00:00:00 2001 From: qiulaibin Date: Tue, 23 Mar 2021 19:41:30 +0800 Subject: [PATCH 24/58] f2fs: fix wrong comment of nat_tree_lock Do trivial comment fix of nat_tree_lock. Signed-off-by: qiulaibin Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 85e53ee770d0..c3ecea143613 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -865,7 +865,7 @@ struct f2fs_nm_info { /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ struct radix_tree_root nat_set_root;/* root of the nat set cache */ - struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ + struct rw_semaphore nat_tree_lock; /* protect nat entry tree */ struct list_head nat_entries; /* cached nat entry list (clean) */ spinlock_t nat_list_lock; /* protect clean nat entry list */ unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */ From c4346bbb51102cafe60ce8da92d74b8e07e469f6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 17 Mar 2021 17:56:04 +0800 Subject: [PATCH 25/58] f2fs: fix error path of f2fs_remount() In error path of f2fs_remount(), it missed to restart/stop kernel thread or enable/disable checkpoint, then 
mount option status may not be consistent with real condition of filesystem, so let's reorder remount flow a bit as below and do recovery correctly in error path: 1) handle gc thread 2) handle ckpt thread 3) handle flush thread 4) handle checkpoint disabling Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 47 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 47ec2ee2b7c9..4447295a008d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1930,8 +1930,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) struct f2fs_mount_info org_mount_opt; unsigned long old_sb_flags; int err; - bool need_restart_gc = false; - bool need_stop_gc = false; + bool need_restart_gc = false, need_stop_gc = false; + bool need_restart_ckpt = false, need_stop_ckpt = false; + bool need_restart_flush = false, need_stop_flush = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); @@ -2062,19 +2063,10 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) clear_sbi_flag(sbi, SBI_IS_CLOSE); } - if (checkpoint_changed) { - if (test_opt(sbi, DISABLE_CHECKPOINT)) { - err = f2fs_disable_checkpoint(sbi); - if (err) - goto restore_gc; - } else { - f2fs_enable_checkpoint(sbi); - } - } - if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) || !test_opt(sbi, MERGE_CHECKPOINT)) { f2fs_stop_ckpt_thread(sbi); + need_restart_ckpt = true; } else { err = f2fs_start_ckpt_thread(sbi); if (err) { @@ -2083,6 +2075,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) err); goto restore_gc; } + need_stop_ckpt = true; } /* @@ -2092,11 +2085,24 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if ((*flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { clear_opt(sbi, FLUSH_MERGE); 
f2fs_destroy_flush_cmd_control(sbi, false); + need_restart_flush = true; } else { err = f2fs_create_flush_cmd_control(sbi); if (err) - goto restore_gc; + goto restore_ckpt; + need_stop_flush = true; } + + if (checkpoint_changed) { + if (test_opt(sbi, DISABLE_CHECKPOINT)) { + err = f2fs_disable_checkpoint(sbi); + if (err) + goto restore_flush; + } else { + f2fs_enable_checkpoint(sbi); + } + } + skip: #ifdef CONFIG_QUOTA /* Release old quota file names */ @@ -2111,6 +2117,21 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) adjust_unusable_cap_perc(sbi); *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); return 0; +restore_flush: + if (need_restart_flush) { + if (f2fs_create_flush_cmd_control(sbi)) + f2fs_warn(sbi, "background flush thread has stopped"); + } else if (need_stop_flush) { + clear_opt(sbi, FLUSH_MERGE); + f2fs_destroy_flush_cmd_control(sbi, false); + } +restore_ckpt: + if (need_restart_ckpt) { + if (f2fs_start_ckpt_thread(sbi)) + f2fs_warn(sbi, "background ckpt thread has stopped"); + } else if (need_stop_ckpt) { + f2fs_stop_ckpt_thread(sbi); + } restore_gc: if (need_restart_gc) { if (f2fs_start_gc_thread(sbi)) From e457ef67093a8b232761a1af0dc6d5434d988fe7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 24 Mar 2021 11:24:33 +0800 Subject: [PATCH 26/58] f2fs: fix to update last i_size if fallocate partially succeeds In the case of expanding pinned file, map.m_lblk and map.m_len will update in each round of section allocation, so in error path, last i_size will be calculated with wrong m_lblk and m_len, fix it. 
Fixes: f5a53edcf01e ("f2fs: support aligned pinned file") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8c46ff4de1b7..a39667139a31 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1617,9 +1617,10 @@ static int expand_inode_data(struct inode *inode, loff_t offset, struct f2fs_map_blocks map = { .m_next_pgofs = NULL, .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, .m_may_create = true }; - pgoff_t pg_end; + pgoff_t pg_start, pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; + block_t expanded = 0; int err; err = inode_newsize_ok(inode, (len + offset)); @@ -1632,11 +1633,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, f2fs_balance_fs(sbi, true); + pg_start = ((unsigned long long)offset) >> PAGE_SHIFT; pg_end = ((unsigned long long)offset + len) >> PAGE_SHIFT; off_end = (offset + len) & (PAGE_SIZE - 1); - map.m_lblk = ((unsigned long long)offset) >> PAGE_SHIFT; - map.m_len = pg_end - map.m_lblk; + map.m_lblk = pg_start; + map.m_len = pg_end - pg_start; if (off_end) map.m_len++; @@ -1646,7 +1648,6 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (f2fs_is_pinned_file(inode)) { block_t sec_blks = BLKS_PER_SEC(sbi); block_t sec_len = roundup(map.m_len, sec_blks); - block_t done = 0; map.m_len = sec_blks; next_alloc: @@ -1654,10 +1655,8 @@ static int expand_inode_data(struct inode *inode, loff_t offset, GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { down_write(&sbi->gc_lock); err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); - if (err && err != -ENODATA && err != -EAGAIN) { - map.m_len = done; + if (err && err != -ENODATA && err != -EAGAIN) goto out_err; - } } down_write(&sbi->pin_sem); @@ -1671,24 +1670,25 @@ static int expand_inode_data(struct inode *inode, loff_t offset, up_write(&sbi->pin_sem); - done += map.m_len; + expanded += map.m_len; sec_len 
-= map.m_len; map.m_lblk += map.m_len; if (!err && sec_len) goto next_alloc; - map.m_len = done; + map.m_len = expanded; } else { err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + expanded = map.m_len; } out_err: if (err) { pgoff_t last_off; - if (!map.m_len) + if (!expanded) return err; - last_off = map.m_lblk + map.m_len - 1; + last_off = pg_start + expanded - 1; /* update new size to the failed position */ new_size = (last_off == pg_end) ? offset + len : From adbeec301c575d778082155a8e9cda6af269772d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 24 Mar 2021 11:18:28 +0800 Subject: [PATCH 27/58] f2fs: fix to avoid touching checkpointed data in get_victim() In CP disabling mode, there are two issues when using LFS or SSR | AT_SSR mode to select victim: 1. LFS is set to find source section during GC, the victim should have no checkpointed data, since after GC, section could not be set free for reuse. Previously, we only check valid ckpt blocks in current segment rather than section, fix it. 2. SSR | AT_SSR are set to find target segment for writes which can be fully filled by checkpointed and newly written blocks, we should never select such segment, otherwise it can cause panic or data corruption during allocation, potential case is described as below: a) target segment has 128 ckpt valid blocks b) GC migrates 'n' (n < 512) valid blocks to other segment (segment is still in dirty list) c) GC migrates '512 - n' blocks to target segment (segment has 'n' cp_vblocks and '512 - n' vblocks) d) If GC selects target segment via {AT,}SSR allocator, however there is no free space in target segment.
Fixes: 4354994f097d ("f2fs: checkpoint disabling") Fixes: 093749e296e2 ("f2fs: support age threshold based garbage collection") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/gc.c | 28 ++++++++++++++++++++-------- fs/f2fs/segment.c | 36 +++++++++++++++++++++--------------- fs/f2fs/segment.h | 14 +++++++++++++- 4 files changed, 55 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c3ecea143613..8dc8453320b6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3386,6 +3386,7 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi); int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable); void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi); int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); +bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno); void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi); void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi); void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d96acc6531f2..a2ca483f9855 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -392,10 +392,6 @@ static void add_victim_entry(struct f2fs_sb_info *sbi, if (p->gc_mode == GC_AT && get_valid_blocks(sbi, segno, true) == 0) return; - - if (p->alloc_mode == AT_SSR && - get_seg_entry(sbi, segno)->ckpt_valid_blocks == 0) - return; } for (i = 0; i < sbi->segs_per_sec; i++) @@ -728,11 +724,27 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, if (sec_usage_check(sbi, secno)) goto next; + /* Don't touch checkpointed data */ - if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) && - get_ckpt_valid_blocks(sbi, segno) && - p.alloc_mode == LFS)) - goto next; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (p.alloc_mode == LFS) { + /* + * LFS is set to find source section during GC. + * The victim should have no checkpointed data. 
+ */ + if (get_ckpt_valid_blocks(sbi, segno, true)) + goto next; + } else { + /* + * SSR | AT_SSR are set to find target segment + * for writes which can be full by checkpointed + * and newly written blocks. + */ + if (!f2fs_segment_has_free_slot(sbi, segno)) + goto next; + } + } + if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 34c8e201d6a1..b539b1aa5de1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -865,7 +865,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_lock(&dirty_i->seglist_lock); valid_blocks = get_valid_blocks(sbi, segno, false); - ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno); + ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno, false); if (valid_blocks == 0 && (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) || ckpt_valid_blocks == usable_blocks)) { @@ -950,7 +950,7 @@ static unsigned int get_free_segment(struct f2fs_sb_info *sbi) for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { if (get_valid_blocks(sbi, segno, false)) continue; - if (get_ckpt_valid_blocks(sbi, segno)) + if (get_ckpt_valid_blocks(sbi, segno, false)) continue; mutex_unlock(&dirty_i->seglist_lock); return segno; @@ -2642,6 +2642,23 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, seg->next_blkoff++; } +bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) +{ + struct seg_entry *se = get_seg_entry(sbi, segno); + int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); + unsigned long *target_map = SIT_I(sbi)->tmp_map; + unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; + unsigned long *cur_map = (unsigned long *)se->cur_valid_map; + int i, pos; + + for (i = 0; i < entries; i++) + target_map[i] = ckpt_map[i] | cur_map[i]; + + pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, 0); + + return pos < sbi->blocks_per_seg; +} + /* * This function always allocates a used 
segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks @@ -2912,19 +2929,8 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, get_valid_blocks(sbi, curseg->segno, new_sec)) goto alloc; - if (new_sec) { - unsigned int segno = START_SEGNO(curseg->segno); - int i; - - for (i = 0; i < sbi->segs_per_sec; i++, segno++) { - if (get_ckpt_valid_blocks(sbi, segno)) - goto alloc; - } - } else { - if (!get_ckpt_valid_blocks(sbi, curseg->segno)) - return; - } - + if (!get_ckpt_valid_blocks(sbi, curseg->segno, new_sec)) + return; alloc: old_segno = curseg->segno; SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 144980b62f9e..dab87ecba2b5 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -359,8 +359,20 @@ static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, } static inline unsigned int get_ckpt_valid_blocks(struct f2fs_sb_info *sbi, - unsigned int segno) + unsigned int segno, bool use_section) { + if (use_section && __is_large_section(sbi)) { + unsigned int start_segno = START_SEGNO(segno); + unsigned int blocks = 0; + int i; + + for (i = 0; i < sbi->segs_per_sec; i++, start_segno++) { + struct seg_entry *se = get_seg_entry(sbi, start_segno); + + blocks += se->ckpt_valid_blocks; + } + return blocks; + } return get_seg_entry(sbi, segno)->ckpt_valid_blocks; } From 528611246fcbd537fd6b545a149905a678b2727c Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Tue, 16 Mar 2021 14:59:18 +0530 Subject: [PATCH 28/58] f2fs: allow to change discard policy based on cached discard cmds With the default DPOLICY_BG discard thread is ioaware, which prevents the discard thread from issuing the discard commands. On low RAM setups, it is observed that these discard commands in the cache are consuming high memory. 
This patch aims to relax the memory pressure on the system due to f2fs pending discard cmds by changing the policy to DPOLICY_FORCE based on the nm_i->ram_thresh configured. Signed-off-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 8 ++++++++ fs/f2fs/node.h | 1 + fs/f2fs/segment.c | 3 ++- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ee0da46eb309..298b6a993d1b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -43,11 +43,15 @@ int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) { struct f2fs_nm_info *nm_i = NM_I(sbi); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct sysinfo val; unsigned long avail_ram; unsigned long mem_size = 0; bool res = false; + if (!nm_i) + return true; + si_meminfo(&val); /* only uses low memory */ @@ -89,6 +93,10 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) /* it allows 20% / total_ram for inmemory pages */ mem_size = get_pages(sbi, F2FS_INMEM_PAGES); res = mem_size < (val.totalram / 5); + } else if (type == DISCARD_CACHE) { + mem_size = (atomic_read(&dcc->discard_cmd_cnt) * + sizeof(struct discard_cmd)) >> PAGE_SHIFT; + res = mem_size < (avail_ram * nm_i->ram_thresh / 100); } else { if (!sbi->sb->s_bdi->wb.dirty_exceeded) return true; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index f84541b57acb..7a45c0f10629 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -147,6 +147,7 @@ enum mem_type { INO_ENTRIES, /* indicates inode entries */ EXTENT_CACHE, /* indicates extent cache */ INMEM_PAGES, /* indicates inmemory pages */ + DISCARD_CACHE, /* indicates memory of cached discard cmds */ BASE_CHECK, /* check kernel status */ }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b539b1aa5de1..67ef40c21d7a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1762,7 +1762,8 @@ static int issue_discard_thread(void *data) if 
(!atomic_read(&dcc->discard_cmd_cnt)) continue; - if (sbi->gc_mode == GC_URGENT_HIGH) + if (sbi->gc_mode == GC_URGENT_HIGH || + !f2fs_available_free_memory(sbi, DISCARD_CACHE)) __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); sb_start_intwrite(sbi->sb); From cc4ff1f3de7773db2c29a151687e6b0e8c88c654 Mon Sep 17 00:00:00 2001 From: Ruiqi Gong Date: Thu, 25 Mar 2021 02:38:11 -0400 Subject: [PATCH 29/58] f2fs: fix a typo in inode.c Do a trivial typo fix. s/runing/running Reported-by: Hulk Robot Signed-off-by: Ruiqi Gong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 349d9cb933ee..5d2253d53f17 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -698,7 +698,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) /* * We need to balance fs here to prevent from producing dirty node pages - * during the urgent cleaning time when runing out of free sections. + * during the urgent cleaning time when running out of free sections. */ f2fs_update_inode_page(inode); if (wbc && wbc->nr_to_write) From f74afaf1bff03c8fbbfd295275154f256856a72c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 26 Mar 2021 22:41:43 +0800 Subject: [PATCH 30/58] f2fs: delete empty compress.h Commit 75e91c888989 ("f2fs: compress: fix compression chksum") wrongly introduced empty compress.h, delete it. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.h | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 fs/f2fs/compress.h diff --git a/fs/f2fs/compress.h b/fs/f2fs/compress.h deleted file mode 100644 index e69de29bb2d1..000000000000 From 6defd530060e05662673a075f7a57659ff788a3b Mon Sep 17 00:00:00 2001 From: Wang Xiaojun Date: Thu, 25 Mar 2021 10:19:20 -0400 Subject: [PATCH 31/58] f2fs: fix wrong alloc_type in f2fs_do_replace_block If the alloc_type of the original curseg is LFS, when we change_curseg and then do recover curseg, the alloc_type becomes SSR. Signed-off-by: Wang Xiaojun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 67ef40c21d7a..9bf30c40a3ac 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3572,6 +3572,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct seg_entry *se; int type; unsigned short old_blkoff; + unsigned char old_alloc_type; segno = GET_SEGNO(sbi, new_blkaddr); se = get_seg_entry(sbi, segno); @@ -3605,6 +3606,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, old_cursegno = curseg->segno; old_blkoff = curseg->next_blkoff; + old_alloc_type = curseg->alloc_type; /* change the current segment */ if (segno != curseg->segno) { @@ -3639,6 +3641,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, change_curseg(sbi, type, true); } curseg->next_blkoff = old_blkoff; + curseg->alloc_type = old_alloc_type; } up_write(&sit_i->sentry_lock); From 79d06e20a471b3bf371f373c732c532fa1a815c9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 26 Mar 2021 09:46:22 +0800 Subject: [PATCH 32/58] f2fs: fix to cover __allocate_new_section() with curseg_lock In order to avoid race with f2fs_do_replace_block(). 
Fixes: f5a53edcf01e ("f2fs: support aligned pinned file") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9bf30c40a3ac..a9597b2ee679 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2945,19 +2945,23 @@ static void __allocate_new_section(struct f2fs_sb_info *sbi, int type) void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type) { + down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); __allocate_new_section(sbi, type); up_write(&SIT_I(sbi)->sentry_lock); + up_read(&SM_I(sbi)->curseg_lock); } void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) { int i; + down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) __allocate_new_segment(sbi, i, false); up_write(&SIT_I(sbi)->sentry_lock); + up_read(&SM_I(sbi)->curseg_lock); } static const struct segment_allocation default_salloc_ops = { From b667340b250853dd613c8271bb9f3f1088d5bcec Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 27 Mar 2021 17:57:06 +0800 Subject: [PATCH 33/58] f2fs: introduce gc_merge mount option In this patch, we will add two new mount options: "gc_merge" and "nogc_merge", when background_gc is on, "gc_merge" option can be set to let background GC thread to handle foreground GC requests, it can eliminate the sluggish issue caused by slow foreground GC operation when GC is triggered from a process with limited I/O and CPU resources. Original idea is from Xiang. 
Signed-off-by: Gao Xiang Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 6 ++++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/gc.c | 26 ++++++++++++++++++++++---- fs/f2fs/gc.h | 6 ++++++ fs/f2fs/segment.c | 15 +++++++++++++-- fs/f2fs/super.c | 19 +++++++++++++++++-- 6 files changed, 65 insertions(+), 8 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 35ed01a5fbc9..63c0c49b726d 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -110,6 +110,12 @@ background_gc=%s Turn on/off cleaning operations, namely garbage on synchronous garbage collection running in background. Default value for this option is on. So garbage collection is on by default. +gc_merge When background_gc is on, this option can be enabled to + let background GC thread to handle foreground GC requests, + it can eliminate the sluggish issue caused by slow foreground + GC operation when GC is triggered from a process with limited + I/O and CPU resources. +nogc_merge Disable GC merge feature. 
disable_roll_forward Disable the roll-forward recovery routine norecovery Disable the roll-forward recovery routine, mounted read- only (i.e., -o ro,disable_roll_forward) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8dc8453320b6..5637deb31776 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -97,6 +97,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_NORECOVERY 0x04000000 #define F2FS_MOUNT_ATGC 0x08000000 #define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000 +#define F2FS_MOUNT_GC_MERGE 0x20000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index a2ca483f9855..5c48825fd12d 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -31,19 +31,24 @@ static int gc_thread_func(void *data) struct f2fs_sb_info *sbi = data; struct f2fs_gc_kthread *gc_th = sbi->gc_thread; wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; + wait_queue_head_t *fggc_wq = &sbi->gc_thread->fggc_wq; unsigned int wait_ms; wait_ms = gc_th->min_sleep_time; set_freezable(); do { - bool sync_mode; + bool sync_mode, foreground = false; wait_event_interruptible_timeout(*wq, kthread_should_stop() || freezing(current) || + waitqueue_active(fggc_wq) || gc_th->gc_wake, msecs_to_jiffies(wait_ms)); + if (test_opt(sbi, GC_MERGE) && waitqueue_active(fggc_wq)) + foreground = true; + /* give it a try one time */ if (gc_th->gc_wake) gc_th->gc_wake = 0; @@ -90,7 +95,10 @@ static int gc_thread_func(void *data) goto do_gc; } - if (!down_write_trylock(&sbi->gc_lock)) { + if (foreground) { + down_write(&sbi->gc_lock); + goto do_gc; + } else if (!down_write_trylock(&sbi->gc_lock)) { stat_other_skip_bggc_count(sbi); goto next; } @@ -107,14 +115,22 @@ static int gc_thread_func(void *data) else increase_sleep_time(gc_th, &wait_ms); do_gc: - stat_inc_bggc_count(sbi->stat_info); + if (!foreground) + stat_inc_bggc_count(sbi->stat_info); sync_mode = F2FS_OPTION(sbi).bggc_mode == 
BGGC_MODE_SYNC; + /* foreground GC was been triggered via f2fs_balance_fs() */ + if (foreground) + sync_mode = false; + /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, sync_mode, true, false, NULL_SEGNO)) + if (f2fs_gc(sbi, sync_mode, !foreground, false, NULL_SEGNO)) wait_ms = gc_th->no_gc_sleep_time; + if (foreground) + wake_up_all(&gc_th->fggc_wq); + trace_f2fs_background_gc(sbi->sb, wait_ms, prefree_segments(sbi), free_segments(sbi)); @@ -148,6 +164,7 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); + init_waitqueue_head(&sbi->gc_thread->fggc_wq); sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { @@ -165,6 +182,7 @@ void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) if (!gc_th) return; kthread_stop(gc_th->f2fs_gc_task); + wake_up_all(&gc_th->fggc_wq); kfree(gc_th); sbi->gc_thread = NULL; } diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 0c8dae12dc51..3fe145e8e594 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -42,6 +42,12 @@ struct f2fs_gc_kthread { /* for changing gc mode */ unsigned int gc_wake; + + /* for GC_MERGE mount option */ + wait_queue_head_t fggc_wq; /* + * caller of f2fs_balance_fs() + * will wait on this wait queue. + */ }; struct gc_inode_list { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a9597b2ee679..18ca9bb2c062 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -503,8 +503,19 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) * dir/node pages without enough free segments. 
*/ if (has_not_enough_free_secs(sbi, 0, 0)) { - down_write(&sbi->gc_lock); - f2fs_gc(sbi, false, false, false, NULL_SEGNO); + if (test_opt(sbi, GC_MERGE) && sbi->gc_thread && + sbi->gc_thread->f2fs_gc_task) { + DEFINE_WAIT(wait); + + prepare_to_wait(&sbi->gc_thread->fggc_wq, &wait, + TASK_UNINTERRUPTIBLE); + wake_up(&sbi->gc_thread->gc_wait_queue_head); + io_schedule(); + finish_wait(&sbi->gc_thread->fggc_wq, &wait); + } else { + down_write(&sbi->gc_lock); + f2fs_gc(sbi, false, false, false, NULL_SEGNO); + } } } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4447295a008d..926201176f5e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -151,6 +151,8 @@ enum { Opt_compress_chksum, Opt_compress_mode, Opt_atgc, + Opt_gc_merge, + Opt_nogc_merge, Opt_err, }; @@ -223,6 +225,8 @@ static match_table_t f2fs_tokens = { {Opt_compress_chksum, "compress_chksum"}, {Opt_compress_mode, "compress_mode=%s"}, {Opt_atgc, "atgc"}, + {Opt_gc_merge, "gc_merge"}, + {Opt_nogc_merge, "nogc_merge"}, {Opt_err, NULL}, }; @@ -1073,6 +1077,12 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_atgc: set_opt(sbi, ATGC); break; + case Opt_gc_merge: + set_opt(sbi, GC_MERGE); + break; + case Opt_nogc_merge: + clear_opt(sbi, GC_MERGE); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -1678,6 +1688,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF) seq_printf(seq, ",background_gc=%s", "off"); + if (test_opt(sbi, GC_MERGE)) + seq_puts(seq, ",gc_merge"); + if (test_opt(sbi, DISABLE_ROLL_FORWARD)) seq_puts(seq, ",disable_roll_forward"); if (test_opt(sbi, NORECOVERY)) @@ -2041,7 +2054,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * option. Also sync the filesystem. 
*/ if ((*flags & SB_RDONLY) || - F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF) { + (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF && + !test_opt(sbi, GC_MERGE))) { if (sbi->gc_thread) { f2fs_stop_gc_thread(sbi); need_restart_gc = true; @@ -4015,7 +4029,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) * If filesystem is not mounted as read-only then * do start the gc_thread. */ - if (F2FS_OPTION(sbi).bggc_mode != BGGC_MODE_OFF && !f2fs_readonly(sb)) { + if ((F2FS_OPTION(sbi).bggc_mode != BGGC_MODE_OFF || + test_opt(sbi, GC_MERGE)) && !f2fs_readonly(sb)) { /* After POR, we can run background GC thread.*/ err = f2fs_start_gc_thread(sbi); if (err) From e4491f813476eb1e31c14e38ac2b56682f8ef861 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 31 Mar 2021 11:16:32 +0800 Subject: [PATCH 34/58] f2fs: fix to restrict mount condition on readonly block device When we mount an unclean f2fs image in a readonly block device, let's make mount() succeed only when there is no recoverable data in that image, otherwise after mount(), fsynced files won't be recovered as the user expects. Fixes: 938a184265d7 ("f2fs: give a warning only for readonly partition") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 926201176f5e..4df964c362c4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3969,10 +3969,18 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) * previous checkpoint was not done by clean system shutdown.
*/ if (f2fs_hw_is_readonly(sbi)) { - if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) - f2fs_err(sbi, "Need to recover fsync data, but write access unavailable"); - else - f2fs_info(sbi, "write access unavailable, skipping recovery"); + if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { + err = f2fs_recover_fsync_data(sbi, true); + if (err > 0) { + err = -EROFS; + f2fs_err(sbi, "Need to recover fsync data, but " + "write access unavailable, please try " + "mount w/ disable_roll_forward or norecovery"); + } + if (err < 0) + goto free_meta; + } + f2fs_info(sbi, "write access unavailable, skipping recovery"); goto reset_checkpoint; } From e15d220552aedbe2542f335be315e36747751d00 Mon Sep 17 00:00:00 2001 From: Yi Zhuang Date: Wed, 31 Mar 2021 17:34:14 +0800 Subject: [PATCH 35/58] f2fs: Fix a hungtask problem in atomic write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the cache writing process, if it is an atomic file, increase the page count of F2FS_WB_CP_DATA, otherwise increase the page count of F2FS_WB_DATA. When you step into the hook branch due to insufficient memory in f2fs_write_begin, f2fs_drop_inmem_pages_all will be called to traverse all atomic inodes and clear the FI_ATOMIC_FILE mark of all atomic files. In f2fs_drop_inmem_pages, first acquire the inmem_lock, revoke all the inmem_pages, and then clear the FI_ATOMIC_FILE mark. Before this mark is cleared, other threads may hold inmem_lock to add inmem_pages to the inode whose inmem_pages have just been emptied, and increase the page count of F2FS_WB_CP_DATA. When the IO returns, it is found that the FI_ATOMIC_FILE flag is cleared by f2fs_drop_inmem_pages_all, and f2fs_is_atomic_file returns false, which causes the page count of F2FS_WB_DATA to be decremented. The page count of F2FS_WB_CP_DATA cannot be cleared. Finally, hungtask is triggered in f2fs_wait_on_all_pages because get_pages will never return zero.
process A: process B: f2fs_drop_inmem_pages_all ->f2fs_drop_inmem_pages of inode#1 ->mutex_lock(&fi->inmem_lock) ->__revoke_inmem_pages of inode#1 f2fs_ioc_commit_atomic_write ->mutex_unlock(&fi->inmem_lock) ->f2fs_commit_inmem_pages of inode#1 ->mutex_lock(&fi->inmem_lock) ->__f2fs_commit_inmem_pages ->f2fs_do_write_data_page ->f2fs_outplace_write_data ->do_write_page ->f2fs_submit_page_write ->inc_page_count(sbi, F2FS_WB_CP_DATA ) ->mutex_unlock(&fi->inmem_lock) ->spin_lock(&sbi->inode_lock[ATOMIC_FILE]); ->clear_inode_flag(inode, FI_ATOMIC_FILE) ->spin_unlock(&sbi->inode_lock[ATOMIC_FILE]) f2fs_write_end_io ->dec_page_count(sbi, F2FS_WB_DATA ); We can fix the problem by putting the action of clearing the FI_ATOMIC_FILE mark into the inmem_lock lock. This operation can ensure that no one will submit the inmem pages before the FI_ATOMIC_FILE mark is cleared, so that there will be no atomic writes waiting for writeback. Fixes: 57864ae5ce3a ("f2fs: limit # of inmemory pages") Signed-off-by: Yi Zhuang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 18ca9bb2c062..b198f793af8f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -324,23 +324,27 @@ void f2fs_drop_inmem_pages(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - while (!list_empty(&fi->inmem_pages)) { + do { mutex_lock(&fi->inmem_lock); + if (list_empty(&fi->inmem_pages)) { + fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; + + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + if (f2fs_is_atomic_file(inode)) { + clear_inode_flag(inode, FI_ATOMIC_FILE); + sbi->atomic_files--; + } + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + + mutex_unlock(&fi->inmem_lock); + break; + } __revoke_inmem_pages(inode, &fi->inmem_pages, true, 
false, true); mutex_unlock(&fi->inmem_lock); - } - - fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; - - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(inode, FI_ATOMIC_FILE); - sbi->atomic_files--; - } - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + } while (1); } void f2fs_drop_inmem_page(struct inode *inode, struct page *page) From 7361785b8fafd3dfb74735632c5654b74fde1ba0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 1 Apr 2021 17:25:20 -0700 Subject: [PATCH 36/58] f2fs: set checkpoint_merge by default Once we introduced checkpoint_merge, we've seen some contention w/o the option. In order to avoid it, let's set it by default. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4df964c362c4..4cc6536d5e70 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1842,6 +1842,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, EXTENT_CACHE); set_opt(sbi, NOHEAP); clear_opt(sbi, DISABLE_CHECKPOINT); + set_opt(sbi, MERGE_CHECKPOINT); F2FS_OPTION(sbi).unusable_cap = 0; sbi->sb->s_flags |= SB_LAZYTIME; set_opt(sbi, FLUSH_MERGE); From 380faaa0aa40b8aec91b3885688c828b19bc2109 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 1 Apr 2021 11:01:53 +0800 Subject: [PATCH 37/58] f2fs: fix to avoid GC/mmap race with f2fs_truncate() f2fs_file_write_iter() missed holding i_gc_rwsem and i_mmap_sem around f2fs_truncate(), which is needed to avoid racing with background GC and mmap; fix it. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a39667139a31..051167ce2630 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4441,8 +4441,13 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) clear_inode_flag(inode, FI_NO_PREALLOC); /* if we couldn't write data, we should deallocate blocks. */ - if (preallocated && i_size_read(inode) < target_size) + if (preallocated && i_size_read(inode) < target_size) { + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); f2fs_truncate(inode); + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + } if (ret > 0) f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); From d731e43cfcffa3dc8bed8fb37d38f69e4a8fdb97 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 2 Apr 2021 17:22:23 +0800 Subject: [PATCH 38/58] f2fs: fix to avoid accessing invalid fio in f2fs_allocate_data_block() Callers may pass fio parameter with NULL value to f2fs_allocate_data_block(), so we should make sure accessing fio's field after fio's validation check. 
Fixes: f608c38c59c6 ("f2fs: clean up parameter of f2fs_allocate_data_block()") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b198f793af8f..0b4b1c486035 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3417,12 +3417,12 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, f2fs_inode_chksum_set(sbi, page); } - if (F2FS_IO_ALIGNED(sbi)) - fio->retry = false; - if (fio) { struct f2fs_bio_info *io; + if (F2FS_IO_ALIGNED(sbi)) + fio->retry = false; + INIT_LIST_HEAD(&fio->list); fio->in_list = true; io = sbi->write_io[fio->type] + fio->temp; From 1757598ffa173214855edee65a7e88223747c42f Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Tue, 6 Apr 2021 14:39:16 +0530 Subject: [PATCH 39/58] f2fs: fix the periodic wakeups of discard thread Fix the unnecessary periodic wakeups of discard thread that happens under below two conditions - 1. When f2fs is heavily utilized over 80%, the current discard policy sets the max sleep timeout of discard thread as 50ms (DEF_MIN_DISCARD_ISSUE_TIME). But this is set even when there are no pending discard commands to be issued. 2. In the issue_discard_thread() path when there are no pending discard commands, it fails to reset the wait_ms to max timeout value. 
Signed-off-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0b4b1c486035..b130d0be5347 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1114,6 +1114,8 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, int discard_type, unsigned int granularity) { + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + /* common policy */ dpolicy->type = discard_type; dpolicy->sync = true; @@ -1133,7 +1135,9 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->ordered = true; if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { dpolicy->granularity = 1; - dpolicy->max_interval = DEF_MIN_DISCARD_ISSUE_TIME; + if (atomic_read(&dcc->discard_cmd_cnt)) + dpolicy->max_interval = + DEF_MIN_DISCARD_ISSUE_TIME; } } else if (discard_type == DPOLICY_FORCE) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; @@ -1749,8 +1753,15 @@ static int issue_discard_thread(void *data) set_freezable(); do { - __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, - dcc->discard_granularity); + if (sbi->gc_mode == GC_URGENT_HIGH || + !f2fs_available_free_memory(sbi, DISCARD_CACHE)) + __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); + else + __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, + dcc->discard_granularity); + + if (!atomic_read(&dcc->discard_cmd_cnt)) + wait_ms = dpolicy.max_interval; wait_event_interruptible_timeout(*q, kthread_should_stop() || freezing(current) || @@ -1777,10 +1788,6 @@ static int issue_discard_thread(void *data) if (!atomic_read(&dcc->discard_cmd_cnt)) continue; - if (sbi->gc_mode == GC_URGENT_HIGH || - !f2fs_available_free_memory(sbi, DISCARD_CACHE)) - __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); - sb_start_intwrite(sbi->sb); issued = __issue_discard_cmd(sbi, &dpolicy); From 9b42d1c8d455be23ed255ecdea3cb4f6d375de47 Mon 
Sep 17 00:00:00 2001 From: Yi Zhuang Date: Tue, 6 Apr 2021 09:47:35 +0800 Subject: [PATCH 40/58] f2fs: modify open brace '{' following function definitions Made suggested modifications from checkpatch in reference to ERROR: open brace '{' following function definitions go on the next line Signed-off-by: Yi Zhuang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b130d0be5347..2f2e42d9e372 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4793,7 +4793,8 @@ static struct f2fs_dev_info *get_target_zoned_dev(struct f2fs_sb_info *sbi, } static int report_one_zone_cb(struct blk_zone *zone, unsigned int idx, - void *data) { + void *data) +{ memcpy(data, zone, sizeof(struct blk_zone)); return 0; } @@ -4909,7 +4910,8 @@ struct check_zone_write_pointer_args { }; static int check_zone_write_pointer_cb(struct blk_zone *zone, unsigned int idx, - void *data) { + void *data) +{ struct check_zone_write_pointer_args *args; args = (struct check_zone_write_pointer_args *)data; From f10ea3ce7455e4193f4762fbef265e187b1fc50e Mon Sep 17 00:00:00 2001 From: Yi Zhuang Date: Tue, 6 Apr 2021 09:47:35 +0800 Subject: [PATCH 41/58] f2fs: clean up build warnings This patch combined the below three clean-up patches. 
- modify open brace '{' following function definitions - ERROR: spaces required around that ':' - ERROR: spaces required before the open parenthesis '(' - ERROR: spaces prohibited before that ',' - Made suggested modifications from checkpatch in reference to WARNING: Missing a blank line after declarations Signed-off-by: Yi Zhuang Signed-off-by: Jia Yang Signed-off-by: Jack Qiu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 1 + fs/f2fs/checkpoint.c | 1 + fs/f2fs/data.c | 2 ++ fs/f2fs/debug.c | 3 +++ fs/f2fs/dir.c | 1 + fs/f2fs/file.c | 2 +- fs/f2fs/gc.c | 6 +++++- fs/f2fs/inode.c | 1 + fs/f2fs/namei.c | 3 +++ fs/f2fs/node.c | 8 +++++--- fs/f2fs/recovery.c | 3 ++- fs/f2fs/segment.c | 12 +++++++++++- fs/f2fs/super.c | 5 +++-- fs/f2fs/xattr.c | 1 + 14 files changed, 40 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 732ec10e7890..b21b98f924e1 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -29,6 +29,7 @@ static inline size_t f2fs_acl_size(int count) static inline int f2fs_acl_count(size_t size) { ssize_t s; + size -= sizeof(struct f2fs_acl_header); s = size - 4 * sizeof(struct f2fs_acl_entry_short); if (s < 0) { diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 31f951e31dca..134f8ffb6ee3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -719,6 +719,7 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) orphan_blk = (struct f2fs_orphan_block *)page_address(page); for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) { nid_t ino = le32_to_cpu(orphan_blk->ino[j]); + err = recover_orphan_inode(sbi, ino); if (err) { f2fs_put_page(page, 1); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ef208b36bf5c..d3e4f8315b40 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1097,6 +1097,7 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) for (; count > 0; dn->ofs_in_node++) { block_t blkaddr = f2fs_data_blkaddr(dn); + if (blkaddr == NULL_ADDR) { dn->data_blkaddr = 
NEW_ADDR; __set_data_blkaddr(dn); @@ -3776,6 +3777,7 @@ int f2fs_migrate_page(struct address_space *mapping, if (atomic_written) { struct inmem_pages *cur; + list_for_each_entry(cur, &fi->inmem_pages, list) if (cur->page == page) { cur->page = newpage; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 91855d5721cd..c03949a7ccff 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -173,6 +173,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->util_invalid = 50 - si->util_free - si->util_valid; for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); + si->curseg[i] = curseg->segno; si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno); si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]); @@ -300,10 +301,12 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->page_mem = 0; if (sbi->node_inode) { unsigned npages = NODE_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } if (sbi->meta_inode) { unsigned npages = META_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index e6270a867be1..ebf65c5fac40 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -473,6 +473,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode) { enum page_type type = f2fs_has_inline_dentry(dir) ? 
NODE : DATA; + lock_page(page); f2fs_wait_on_page_writeback(page, type, true, true); de->ino = cpu_to_le32(inode->i_ino); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 051167ce2630..da08a8db248f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2580,7 +2580,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, { struct inode *inode = file_inode(filp); struct f2fs_map_blocks map = { .m_next_extent = NULL, - .m_seg_type = NO_CHECK_TYPE , + .m_seg_type = NO_CHECK_TYPE, .m_may_create = false }; struct extent_info ei = {0, 0, 0}; pgoff_t pg_start, pg_end, next_pgofs; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 5c48825fd12d..8d1f17ab94d8 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -160,7 +160,7 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; - gc_th->gc_wake= 0; + gc_th->gc_wake = 0; sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); @@ -179,6 +179,7 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th = sbi->gc_thread; + if (!gc_th) return; kthread_stop(gc_th->f2fs_gc_task); @@ -858,6 +859,7 @@ static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode) static void put_gc_inode(struct gc_inode_list *gc_list) { struct inode_entry *ie, *next_ie; + list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) { radix_tree_delete(&gc_list->iroot, ie->inode->i_ino); iput(ie->inode); @@ -982,9 +984,11 @@ block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode) bidx = node_ofs - 1; } else if (node_ofs <= indirect_blks) { int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1); + bidx = node_ofs - 2 - dec; } else { int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); + bidx = node_ofs - 5 - dec; } return bidx * ADDRS_PER_BLOCK(inode) + ADDRS_PER_INODE(inode); diff --git a/fs/f2fs/inode.c 
b/fs/f2fs/inode.c index 5d2253d53f17..b401f08569f7 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -666,6 +666,7 @@ void f2fs_update_inode_page(struct inode *inode) node_page = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { int err = PTR_ERR(node_page); + if (err == -ENOMEM) { cond_resched(); goto retry; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 887804968576..91f244279ac4 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -419,6 +419,7 @@ struct dentry *f2fs_get_parent(struct dentry *child) struct qstr dotdot = QSTR_INIT("..", 2); struct page *page; unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot, &page); + if (!ino) { if (IS_ERR(page)) return ERR_CAST(page); @@ -628,6 +629,7 @@ static const char *f2fs_get_link(struct dentry *dentry, struct delayed_call *done) { const char *link = page_get_link(dentry, inode, done); + if (!IS_ERR(link) && !*link) { /* this is broken symlink case */ do_delayed_call(done); @@ -765,6 +767,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); + if (f2fs_empty_dir(inode)) return f2fs_unlink(dir, dentry); return -ENOTEMPTY; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 298b6a993d1b..965bcb3edac2 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -470,6 +470,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, /* increment version no as node is removed */ if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) { unsigned char version = nat_get_version(e); + nat_set_version(e, inc_node_version(version)); } @@ -1391,7 +1392,7 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, goto out_err; } page_hit: - if(unlikely(nid != nid_of_node(page))) { + if (unlikely(nid != nid_of_node(page))) { f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", 
nid, nid_of_node(page), ino_of_node(page), ofs_of_node(page), cpver_of_node(page), @@ -1783,7 +1784,7 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, out: if (nwritten) f2fs_submit_merged_write_cond(sbi, NULL, NULL, ino, NODE); - return ret ? -EIO: 0; + return ret ? -EIO : 0; } static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data) @@ -2125,8 +2126,8 @@ static int __insert_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i) { struct f2fs_nm_info *nm_i = NM_I(sbi); - int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); + if (err) return err; @@ -2991,6 +2992,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) while ((found = __gang_lookup_nat_set(nm_i, set_idx, SETVEC_SIZE, setvec))) { unsigned idx; + set_idx = setvec[found - 1]->set + 1; for (idx = 0; idx < found; idx++) __adjust_nat_entry_set(setvec[idx], &sets, diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index da75d5d52f0a..422146c6d866 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -458,6 +458,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, /* Get the previous summary */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); + if (curseg->segno == segno) { sum = curseg->sum_blk->entries[blkoff]; goto got_it; @@ -875,5 +876,5 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) #endif sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ - return ret ? ret: err; + return ret ? 
ret : err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 2f2e42d9e372..6f86f52880ee 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1794,7 +1794,7 @@ static int issue_discard_thread(void *data) if (issued > 0) { __wait_all_discard_cmd(sbi, &dpolicy); wait_ms = dpolicy.min_interval; - } else if (issued == -1){ + } else if (issued == -1) { wait_ms = f2fs_time_to_wait(sbi, DISCARD_TIME); if (!wait_ms) wait_ms = dpolicy.mid_interval; @@ -2171,6 +2171,7 @@ static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type, unsigned int segno, int modified) { struct seg_entry *se = get_seg_entry(sbi, segno); + se->type = type; if (modified) __mark_sit_entry_dirty(sbi, segno); @@ -2362,6 +2363,7 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, { struct curseg_info *curseg = CURSEG_I(sbi, type); void *addr = curseg->sum_blk; + addr += curseg->next_blkoff * sizeof(struct f2fs_summary); memcpy(addr, sum, sizeof(struct f2fs_summary)); } @@ -3779,6 +3781,7 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) for (j = 0; j < blk_off; j++) { struct f2fs_summary *s; + s = (struct f2fs_summary *)(kaddr + offset); seg_i->sum_blk->entries[j] = *s; offset += SUMMARY_SIZE; @@ -3841,6 +3844,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) if (__exist_node_summaries(sbi)) { struct f2fs_summary *ns = &sum->entries[0]; int i; + for (i = 0; i < sbi->blocks_per_seg; i++, ns++) { ns->version = 0; ns->ofs_in_node = 0; @@ -3942,6 +3946,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) /* Step 3: write summary entries */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { unsigned short blkoff; + seg_i = CURSEG_I(sbi, i); if (sbi->ckpt->alloc_type[i] == SSR) blkoff = sbi->blocks_per_seg; @@ -3978,6 +3983,7 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { int i, end; + if (IS_DATASEG(type)) end = type + NR_CURSEG_DATA_TYPE; else @@ 
-4561,6 +4567,7 @@ static void init_free_segmap(struct f2fs_sb_info *sbi) /* set use the current segments */ for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_NODE; type++) { struct curseg_info *curseg_t = CURSEG_I(sbi, type); + __set_test_and_inuse(sbi, curseg_t->segno); } } @@ -4913,6 +4920,7 @@ static int check_zone_write_pointer_cb(struct blk_zone *zone, unsigned int idx, void *data) { struct check_zone_write_pointer_args *args; + args = (struct check_zone_write_pointer_args *)data; return check_zone_write_pointer(args->sbi, args->fdev, zone); @@ -5191,6 +5199,7 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi, static void destroy_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + kvfree(dirty_i->victim_secmap); } @@ -5235,6 +5244,7 @@ static void destroy_curseg(struct f2fs_sb_info *sbi) static void destroy_free_segmap(struct f2fs_sb_info *sbi) { struct free_segmap_info *free_i = SM_I(sbi)->free_info; + if (!free_i) return; SM_I(sbi)->free_info = NULL; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4cc6536d5e70..254227b82b2b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -559,6 +559,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) while ((p = strsep(&options, ",")) != NULL) { int token; + if (!*p) continue; /* @@ -1895,7 +1896,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) ret = sync_filesystem(sbi->sb); if (ret || err) { - err = ret ? ret: err; + err = ret ? ret : err; goto restore_flag; } @@ -3760,7 +3761,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sbi->iostat_period_ms = DEFAULT_IOSTAT_PERIOD_MS; for (i = 0; i < NR_PAGE_TYPE; i++) { - int n = (i == META) ? 1: NR_TEMP_TYPE; + int n = (i == META) ? 
1 : NR_TEMP_TYPE; int j; sbi->write_io[i] = diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 8159fae74b9a..3d43b27bad95 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -486,6 +486,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, f2fs_wait_on_page_writeback(xpage, NODE, true, true); } else { struct dnode_of_data dn; + set_new_dnode(&dn, inode, NULL, NULL, new_nid); xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { From e882e7aeff55cf7059a3f68ef175053264d29341 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 13 Apr 2021 17:56:53 +0800 Subject: [PATCH 42/58] f2fs: document: add description about compressed space handling User or developer may still be confused about why f2fs doesn't expose compressed space to userspace, add description about compressed space handling policy into f2fs documentation. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 63c0c49b726d..992bf91eeec8 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -819,6 +819,14 @@ Compression implementation * chattr +c file * chattr +c dir; touch dir/file * mount w/ -o compress_extension=ext; touch file.ext + * mount w/ -o compress_extension=*; touch any_file + +- At this point, compression feature doesn't expose compressed space to user + directly in order to guarantee potential data updates later to the space. + Instead, the main goal is to reduce data writes to flash disk as much as + possible, resulting in extending disk life time as well as relaxing IO + congestion. Alternatively, we've added ioctl interface to reclaim compressed + space and show it to user after putting the immutable bit. 
Compress metadata layout:: From e4947f414c4062d06e9117dd0b4c066948dc5d1a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 13 Apr 2021 17:56:18 +0800 Subject: [PATCH 43/58] f2fs: avoid duplicated codes for cleanup f2fs_segment_has_free_slot() was copied and modified from __next_free_blkoff(), they are almost the same, clean up to reuse common code as much as possible. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6f86f52880ee..5a6c23999f16 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2635,22 +2635,20 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) curseg->alloc_type = LFS; } -static void __next_free_blkoff(struct f2fs_sb_info *sbi, - struct curseg_info *seg, block_t start) +static int __next_free_blkoff(struct f2fs_sb_info *sbi, + int segno, block_t start) { - struct seg_entry *se = get_seg_entry(sbi, seg->segno); + struct seg_entry *se = get_seg_entry(sbi, segno); int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); unsigned long *target_map = SIT_I(sbi)->tmp_map; unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; unsigned long *cur_map = (unsigned long *)se->cur_valid_map; - int i, pos; + int i; for (i = 0; i < entries; i++) target_map[i] = ckpt_map[i] | cur_map[i]; - pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start); - - seg->next_blkoff = pos; + return __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start); } /* @@ -2662,26 +2660,16 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, struct curseg_info *seg) { if (seg->alloc_type == SSR) - __next_free_blkoff(sbi, seg, seg->next_blkoff + 1); + seg->next_blkoff = + __next_free_blkoff(sbi, seg->segno, + seg->next_blkoff + 1); else seg->next_blkoff++; } bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) { - struct seg_entry *se = 
get_seg_entry(sbi, segno); - int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); - unsigned long *target_map = SIT_I(sbi)->tmp_map; - unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; - unsigned long *cur_map = (unsigned long *)se->cur_valid_map; - int i, pos; - - for (i = 0; i < entries; i++) - target_map[i] = ckpt_map[i] | cur_map[i]; - - pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, 0); - - return pos < sbi->blocks_per_seg; + return __next_free_blkoff(sbi, segno, 0) < sbi->blocks_per_seg; } /* @@ -2709,7 +2697,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush) reset_curseg(sbi, type, 1); curseg->alloc_type = SSR; - __next_free_blkoff(sbi, curseg, 0); + curseg->next_blkoff = __next_free_blkoff(sbi, curseg->segno, 0); sum_page = f2fs_get_sum_page(sbi, new_segno); if (IS_ERR(sum_page)) { From 6125b201cc487c3051c686c6cd17c630eb2ca7b5 Mon Sep 17 00:00:00 2001 From: Yi Chen Date: Tue, 13 Apr 2021 17:30:50 +0800 Subject: [PATCH 44/58] f2fs: fix to avoid NULL pointer dereference Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 pc : f2fs_put_page+0x1c/0x26c lr : __revoke_inmem_pages+0x544/0x75c f2fs_put_page+0x1c/0x26c __revoke_inmem_pages+0x544/0x75c __f2fs_commit_inmem_pages+0x364/0x3c0 f2fs_commit_inmem_pages+0xc8/0x1a0 f2fs_ioc_commit_atomic_write+0xa4/0x15c f2fs_ioctl+0x5b0/0x1574 file_ioctl+0x154/0x320 do_vfs_ioctl+0x164/0x740 __arm64_sys_ioctl+0x78/0xa4 el0_svc_common+0xbc/0x1d0 el0_svc_handler+0x74/0x98 el0_svc+0x8/0xc In f2fs_put_page, the page->mapping we access is NULL. The root cause is: in some cases, the page refcount and the ATOMIC_WRITTEN_PAGE flag fail to be set because the page-private flag has already been set. We add f2fs_bug_on like this: f2fs_register_inmem_page() { ... f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE); f2fs_bug_on(F2FS_I_SB(inode), !IS_ATOMIC_WRITTEN_PAGE(page)); ... 
} The f2fs_bug_on call stack looks like this: PC is at f2fs_register_inmem_page+0x238/0x2b4 LR is at f2fs_register_inmem_page+0x2a8/0x2b4 f2fs_register_inmem_page+0x238/0x2b4 f2fs_set_data_page_dirty+0x104/0x164 set_page_dirty+0x78/0xc8 f2fs_write_end+0x1b4/0x444 generic_perform_write+0x144/0x1cc __generic_file_write_iter+0xc4/0x174 f2fs_file_write_iter+0x2c0/0x350 __vfs_write+0x104/0x134 vfs_write+0xe8/0x19c SyS_pwrite64+0x78/0xb8 To fix this issue, let's add page refcount and page-private flag. The page-private flag is not cleared and needs further analysis. Signed-off-by: Chao Yu Signed-off-by: Ge Qiu Signed-off-by: Dehe Gu Signed-off-by: Yi Chen Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5a6c23999f16..ff3df676b2dd 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -186,7 +186,10 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) { struct inmem_pages *new; - f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE); + if (PagePrivate(page)) + set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); + else + f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE); new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); From 12a68f402efc15d06cbeeb59dc8d60cf8a76f91b Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Mon, 19 Apr 2021 10:20:03 +0800 Subject: [PATCH 45/58] f2fs: remove unnecessary struct declaration struct dnode_of_data is defined at 897th line. The declaration here is unnecessary. Remove it. 
Signed-off-by: Wan Jiabing Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5637deb31776..116bc853440b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3310,7 +3310,6 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname); /* * node.c */ -struct dnode_of_data; struct node_info; int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); From f35a48edeb85b70a0fb479e1ea16cb5b228513a9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 21 Apr 2021 09:54:55 +0800 Subject: [PATCH 46/58] f2fs: avoid using native allocate_segment_by_default() As we did for other cases, in fix_curseg_write_pointer(), let's use wrapped f2fs_allocate_new_section() instead of native allocate_segment_by_default(), by this way, it fixes to cover segment allocation with curseg_lock and sentry_lock. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 2 +- fs/f2fs/segment.c | 18 ++++++++++-------- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 116bc853440b..ea4ee76e8dcf 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3394,7 +3394,7 @@ void f2fs_get_new_segment(struct f2fs_sb_info *sbi, unsigned int *newseg, bool new_sec, int dir); void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end); -void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type); +void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force); void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi); int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index da08a8db248f..3b612212917f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1662,7 +1662,7 @@ static int expand_inode_data(struct inode 
*inode, loff_t offset, down_write(&sbi->pin_sem); f2fs_lock_op(sbi); - f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED); + f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); f2fs_unlock_op(sbi); map.m_seg_type = CURSEG_COLD_DATA_PINNED; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ff3df676b2dd..3bf11d6fac67 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2933,7 +2933,7 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, } static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, - bool new_sec) + bool new_sec, bool force) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int old_segno; @@ -2941,7 +2941,7 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, if (!curseg->inited) goto alloc; - if (curseg->next_blkoff || + if (force || curseg->next_blkoff || get_valid_blocks(sbi, curseg->segno, new_sec)) goto alloc; @@ -2953,16 +2953,17 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, locate_dirty_segment(sbi, old_segno); } -static void __allocate_new_section(struct f2fs_sb_info *sbi, int type) +static void __allocate_new_section(struct f2fs_sb_info *sbi, + int type, bool force) { - __allocate_new_segment(sbi, type, true); + __allocate_new_segment(sbi, type, true, force); } -void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type) +void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force) { down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); - __allocate_new_section(sbi, type); + __allocate_new_section(sbi, type, force); up_write(&SIT_I(sbi)->sentry_lock); up_read(&SM_I(sbi)->curseg_lock); } @@ -2974,7 +2975,7 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) - __allocate_new_segment(sbi, i, false); + __allocate_new_segment(sbi, i, false, 
false); up_write(&SIT_I(sbi)->sentry_lock); up_read(&SM_I(sbi)->curseg_lock); } @@ -4844,7 +4845,8 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) f2fs_notice(sbi, "Assign new section to curseg[%d]: " "curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff); - allocate_segment_by_default(sbi, type, true); + + f2fs_allocate_new_section(sbi, type, true); /* check consistency of the zone curseg pointed to */ if (check_zone_write_pointer(sbi, zbd, &zone)) From 28a1e40c06a152b3b9f269d8e5ce085eefaa0aa8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 23 Apr 2021 14:09:38 +0800 Subject: [PATCH 47/58] f2fs: clean up left deprecated IO trace codes Commit d5f7bc0064e0 ("f2fs: deprecate f2fs_trace_io") left some dead codes, delete them. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 6 ------ fs/f2fs/f2fs.h | 8 -------- 2 files changed, 14 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 3c9d797dbdd6..6e46a00c1930 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -76,12 +76,6 @@ bool f2fs_is_compressed_page(struct page *page) return false; if (IS_ATOMIC_WRITTEN_PAGE(page) || IS_DUMMY_WRITTEN_PAGE(page)) return false; - /* - * page->private may be set with pid. - * pid_max is enough to check if it is traced. 
- */ - if (IS_IO_TRACED_PAGE(page)) - return false; f2fs_bug_on(F2FS_M_SB(page->mapping), *((u32 *)page_private(page)) != F2FS_COMPRESSED_PAGE_MAGIC); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ea4ee76e8dcf..0ad5b73fdf5d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1303,14 +1303,6 @@ enum { #define IS_DUMMY_WRITTEN_PAGE(page) \ (page_private(page) == DUMMY_WRITTEN_PAGE) -#ifdef CONFIG_F2FS_IO_TRACE -#define IS_IO_TRACED_PAGE(page) \ - (page_private(page) > 0 && \ - page_private(page) < (unsigned long)PID_MAX_LIMIT) -#else -#define IS_IO_TRACED_PAGE(page) (0) -#endif - /* For compression */ enum compress_algorithm_type { COMPRESS_LZO, From 99139afb1f7b63e9ee95348bc047854dd4b8ad5f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 21 Apr 2021 16:39:41 +0800 Subject: [PATCH 48/58] f2fs: compress: remove unneed check condition In only call path of __cluster_may_compress(), __f2fs_write_data_pages() has checked SBI_POR_DOING condition, and also cluster_may_compress() has checked CP_ERROR_FLAG condition, so remove redundant check condition in __cluster_may_compress() for cleanup. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 6e46a00c1930..53b13787eb2c 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -890,7 +890,6 @@ bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index) static bool __cluster_may_compress(struct compress_ctx *cc) { - struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); loff_t i_size = i_size_read(cc->inode); unsigned nr_pages = DIV_ROUND_UP(i_size, PAGE_SIZE); int i; @@ -898,12 +897,7 @@ static bool __cluster_may_compress(struct compress_ctx *cc) for (i = 0; i < cc->cluster_size; i++) { struct page *page = cc->rpages[i]; - f2fs_bug_on(sbi, !page); - - if (unlikely(f2fs_cp_error(sbi))) - return false; - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - return false; + f2fs_bug_on(F2FS_I_SB(cc->inode), !page); /* beyond EOF */ if (page->index >= nr_pages) From c8bf55afd95fea84c2b8f6eb8f99f1a7b7291e44 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 22 Apr 2021 18:19:25 +0800 Subject: [PATCH 49/58] f2fs: drop inplace IO if fs status is abnormal If filesystem has cp_error or need_fsck status, let's drop inplace IO to avoid further corruption of fs data. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3bf11d6fac67..1f93fecfd7a4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3552,7 +3552,13 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: incorrect segment(%u) type, run fsck to fix.", __func__, segno); - return -EFSCORRUPTED; + err = -EFSCORRUPTED; + goto drop_bio; + } + + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || f2fs_cp_error(sbi)) { + err = -EIO; + goto drop_bio; } stat_inc_inplace_blocks(fio->sbi); @@ -3566,6 +3572,15 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); } + return err; +drop_bio: + if (fio->bio) { + struct bio *bio = *(fio->bio); + + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + fio->bio = NULL; + } return err; } From 5e76a34a2607977a35bcdd2f8da1c8b38a06babb Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 21 Apr 2021 09:55:10 +0200 Subject: [PATCH 50/58] fscrypt: relax Kconfig dependencies for crypto API algorithms Even if FS encryption has strict functional dependencies on various crypto algorithms and chaining modes, those dependencies could potentially be satisfied by other implementations than the generic ones, and no link time dependency exists on the 'depends on' clauses defined by CONFIG_FS_ENCRYPTION_ALGS. So let's relax these clauses to 'imply', so that the default behavior is still to pull in those generic algorithms, but in a way that permits them to be disabled again in Kconfig.
Signed-off-by: Ard Biesheuvel Acked-by: Eric Biggers Signed-off-by: Herbert Xu --- fs/crypto/Kconfig | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index a5f5c30368a2..2d0c8922f635 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -14,16 +14,30 @@ config FS_ENCRYPTION F2FS and UBIFS make use of this feature. # Filesystems supporting encryption must select this if FS_ENCRYPTION. This -# allows the algorithms to be built as modules when all the filesystems are. +# allows the algorithms to be built as modules when all the filesystems are, +# whereas selecting them from FS_ENCRYPTION would force them to be built-in. +# +# Note: this option only pulls in the algorithms that filesystem encryption +# needs "by default". If userspace will use "non-default" encryption modes such +# as Adiantum encryption, then those other modes need to be explicitly enabled +# in the crypto API; see Documentation/filesystems/fscrypt.rst for details. +# +# Also note that this option only pulls in the generic implementations of the +# algorithms, not any per-architecture optimized implementations. It is +# strongly recommended to enable optimized implementations too. It is safe to +# disable these generic implementations if corresponding optimized +# implementations will always be available too; for this reason, these are soft +# dependencies ('imply' rather than 'select'). Only disable these generic +# implementations if you're sure they will never be needed, though. 
config FS_ENCRYPTION_ALGS tristate - select CRYPTO_AES - select CRYPTO_CBC - select CRYPTO_CTS - select CRYPTO_ECB - select CRYPTO_HMAC - select CRYPTO_SHA512 - select CRYPTO_XTS + imply CRYPTO_AES + imply CRYPTO_CBC + imply CRYPTO_CTS + imply CRYPTO_ECB + imply CRYPTO_HMAC + imply CRYPTO_SHA512 + imply CRYPTO_XTS config FS_ENCRYPTION_INLINE_CRYPT bool "Enable fscrypt to use inline crypto" From 70d78348614c46c4365f834c44a1b788221172fc Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 21 Apr 2021 09:55:11 +0200 Subject: [PATCH 51/58] fsverity: relax build time dependency on CRYPTO_SHA256 CONFIG_CRYPTO_SHA256 denotes the generic C implementation of the SHA-256 shash algorithm, which is selected as the default crypto shash provider for fsverity. However, fsverity has no strict link time dependency, and the same shash could be exposed by an optimized implementation, and arm64 has a number of those (scalar, NEON-based and one based on special crypto instructions). In such cases, it makes little sense to require that the generic C implementation is incorporated as well, given that it will never be called. To address this, relax the 'select' clause to 'imply' so that the generic driver can be omitted from the build if desired. Acked-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- fs/verity/Kconfig | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig index 88fb25119899..24d1b54de807 100644 --- a/fs/verity/Kconfig +++ b/fs/verity/Kconfig @@ -3,9 +3,13 @@ config FS_VERITY bool "FS Verity (read-only file-based authenticity protection)" select CRYPTO - # SHA-256 is selected as it's intended to be the default hash algorithm. + # SHA-256 is implied as it's intended to be the default hash algorithm. # To avoid bloat, other wanted algorithms must be selected explicitly. 
- select CRYPTO_SHA256 + # Note that CRYPTO_SHA256 denotes the generic C implementation, but + # some architectures provided optimized implementations of the same + # algorithm that may be used instead. In this case, CRYPTO_SHA256 may + # be omitted even if SHA-256 is being used. + imply CRYPTO_SHA256 help This option enables fs-verity. fs-verity is the dm-verity mechanism implemented at the file level. On supported From fd97226b07542c25781de2f9f19edf7a79b0b279 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 9 May 2021 21:53:03 -0700 Subject: [PATCH 52/58] f2fs: avoid null pointer access when handling IPU error Unable to handle kernel NULL pointer dereference at virtual address 000000000000001a pc : f2fs_inplace_write_data+0x144/0x208 lr : f2fs_inplace_write_data+0x134/0x208 Call trace: f2fs_inplace_write_data+0x144/0x208 f2fs_do_write_data_page+0x270/0x770 f2fs_write_single_data_page+0x47c/0x830 __f2fs_write_data_pages+0x444/0x98c f2fs_write_data_pages.llvm.16514453770497736882+0x2c/0x38 do_writepages+0x58/0x118 __writeback_single_inode+0x44/0x300 writeback_sb_inodes+0x4b8/0x9c8 wb_writeback+0x148/0x42c wb_do_writeback+0xc8/0x390 wb_workfn+0xb0/0x2f4 process_one_work+0x1fc/0x444 worker_thread+0x268/0x4b4 kthread+0x13c/0x158 ret_from_fork+0x10/0x18 Fixes: 955772787667 ("f2fs: drop inplace IO if fs status is abnormal") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1f93fecfd7a4..2667e26d9b2d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3574,12 +3574,12 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) return err; drop_bio: - if (fio->bio) { + if (fio->bio && *(fio->bio)) { struct bio *bio = *(fio->bio); bio->bi_status = BLK_STS_IOERR; bio_endio(bio); - fio->bio = NULL; + *(fio->bio) = NULL; } return err; } From 4b1ceab67b1cc12c4c9c1130d7f5189c12c2bf9f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 
6 May 2021 12:11:14 -0700 Subject: [PATCH 53/58] f2fs: support iflag change given the mask In f2fs_fileattr_set(), if (!fa->flags_valid) mask &= FS_COMMON_FL; In this case, we can set supported flags by mask only instead of BUG_ON. /* Flags shared betwen flags/xflags */ (FS_SYNC_FL | FS_IMMUTABLE_FL | FS_APPEND_FL | \ FS_NODUMP_FL | FS_NOATIME_FL | FS_DAX_FL | \ FS_PROJINHERIT_FL) Fixes: 9b1bb01c8ae7 ("f2fs: convert to fileattr") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3b612212917f..baed3905bae6 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1812,7 +1812,8 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) struct f2fs_inode_info *fi = F2FS_I(inode); u32 masked_flags = fi->i_flags & mask; - f2fs_bug_on(F2FS_I_SB(inode), (iflags & ~mask)); + /* mask can be shrunk by flags_valid selector */ + iflags &= mask; /* Is it quota file? Do not allow user to mess with it */ if (IS_NOQUOTA(inode)) From 4de3199a4a8e48ebf3881cbc00a65d86f5872642 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 6 May 2021 17:00:43 +0800 Subject: [PATCH 54/58] f2fs: compress: fix to free compress page correctly In error path of f2fs_write_compressed_pages(), it needs to call f2fs_compress_free_page() to release temporary page. 
Fixes: 5e6bbde95982 ("f2fs: introduce mempool for {,de}compress intermediate page allocation") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 53b13787eb2c..2acaefa10036 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1372,7 +1372,8 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, for (i = 0; i < cc->nr_cpages; i++) { if (!cc->cpages[i]) continue; - f2fs_put_page(cc->cpages[i], 1); + f2fs_compress_free_page(cc->cpages[i]); + cc->cpages[i] = NULL; } out_put_cic: kmem_cache_free(cic_entry_slab, cic); From 7c6c8dacd48a6a15193315c25794f317264085ae Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 10 May 2021 17:30:31 +0800 Subject: [PATCH 55/58] f2fs: compress: fix race condition of overwrite vs truncate pos_fsstress testcase complains of a panic as below: ------------[ cut here ]------------ kernel BUG at fs/f2fs/compress.c:1082! invalid opcode: 0000 [#1] SMP PTI CPU: 4 PID: 2753477 Comm: kworker/u16:2 Tainted: G OE 5.12.0-rc1-custom #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 Workqueue: writeback wb_workfn (flush-252:16) RIP: 0010:prepare_compress_overwrite+0x4c0/0x760 [f2fs] Call Trace: f2fs_prepare_compress_overwrite+0x5f/0x80 [f2fs] f2fs_write_cache_pages+0x468/0x8a0 [f2fs] f2fs_write_data_pages+0x2a4/0x2f0 [f2fs] do_writepages+0x38/0xc0 __writeback_single_inode+0x44/0x2a0 writeback_sb_inodes+0x223/0x4d0 __writeback_inodes_wb+0x56/0xf0 wb_writeback+0x1dd/0x290 wb_workfn+0x309/0x500 process_one_work+0x220/0x3c0 worker_thread+0x53/0x420 kthread+0x12f/0x150 ret_from_fork+0x22/0x30 The root cause is that truncate() may race with overwrite as below, so that the one reference count left on the page cannot guarantee that the page stays attached to the mapping tree all the time; after truncation, a later find_lock_page() may return a NULL pointer.
- prepare_compress_overwrite - f2fs_pagecache_get_page - unlock_page - f2fs_setattr - truncate_setsize - truncate_inode_page - delete_from_page_cache - find_lock_page Fix this by avoiding referencing updated page. Fixes: 4c8ff7095bef ("f2fs: support data compression") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 2acaefa10036..79348bc56e35 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -117,19 +117,6 @@ static void f2fs_unlock_rpages(struct compress_ctx *cc, int len) f2fs_drop_rpages(cc, len, true); } -static void f2fs_put_rpages_mapping(struct address_space *mapping, - pgoff_t start, int len) -{ - int i; - - for (i = 0; i < len; i++) { - struct page *page = find_get_page(mapping, start + i); - - put_page(page); - put_page(page); - } -} - static void f2fs_put_rpages_wbc(struct compress_ctx *cc, struct writeback_control *wbc, bool redirty, int unlock) { @@ -1036,7 +1023,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, } if (PageUptodate(page)) - unlock_page(page); + f2fs_put_page(page, 1); else f2fs_compress_ctx_add_page(cc, page); } @@ -1046,32 +1033,34 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size, &last_block_in_bio, false, true); + f2fs_put_rpages(cc); f2fs_destroy_compress_ctx(cc); if (ret) - goto release_pages; + goto out; if (bio) f2fs_submit_bio(sbi, bio, DATA); ret = f2fs_init_compress_ctx(cc); if (ret) - goto release_pages; + goto out; } for (i = 0; i < cc->cluster_size; i++) { f2fs_bug_on(sbi, cc->rpages[i]); page = find_lock_page(mapping, start_idx + i); - f2fs_bug_on(sbi, !page); + if (!page) { + /* page can be truncated */ + goto release_and_retry; + } f2fs_wait_on_page_writeback(page, DATA, true, true); - f2fs_compress_ctx_add_page(cc, page); - f2fs_put_page(page, 0); if 
(!PageUptodate(page)) { +release_and_retry: + f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i + 1); - f2fs_put_rpages_mapping(mapping, start_idx, - cc->cluster_size); f2fs_destroy_compress_ctx(cc); goto retry; } @@ -1103,10 +1092,10 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, } unlock_pages: + f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i); -release_pages: - f2fs_put_rpages_mapping(mapping, start_idx, i); f2fs_destroy_compress_ctx(cc); +out: return ret; } From 4c4dcb8c2420b73670c1c91087ae582c29092713 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 10 May 2021 17:30:32 +0800 Subject: [PATCH 56/58] f2fs: compress: fix to assign cc.cluster_idx correctly In f2fs_destroy_compress_ctx(), after f2fs_destroy_compress_ctx(), cc.cluster_idx will be cleared w/ NULL_CLUSTER, f2fs_cluster_blocks() may check wrong cluster metadata, fix it. Fixes: 4c8ff7095bef ("f2fs: support data compression") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 17 +++++++++-------- fs/f2fs/data.c | 6 +++--- fs/f2fs/f2fs.h | 2 +- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 79348bc56e35..925a5ca3744a 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -145,13 +145,14 @@ int f2fs_init_compress_ctx(struct compress_ctx *cc) return cc->rpages ? 
0 : -ENOMEM; } -void f2fs_destroy_compress_ctx(struct compress_ctx *cc) +void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse) { page_array_free(cc->inode, cc->rpages, cc->cluster_size); cc->rpages = NULL; cc->nr_rpages = 0; cc->nr_cpages = 0; - cc->cluster_idx = NULL_CLUSTER; + if (!reuse) + cc->cluster_idx = NULL_CLUSTER; } void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page) @@ -1034,7 +1035,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size, &last_block_in_bio, false, true); f2fs_put_rpages(cc); - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, true); if (ret) goto out; if (bio) @@ -1061,7 +1062,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, release_and_retry: f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i + 1); - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, true); goto retry; } } @@ -1094,7 +1095,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, unlock_pages: f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i); - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, true); out: return ret; } @@ -1130,7 +1131,7 @@ bool f2fs_compress_write_end(struct inode *inode, void *fsdata, set_cluster_dirty(&cc); f2fs_put_rpages_wbc(&cc, NULL, false, 1); - f2fs_destroy_compress_ctx(&cc); + f2fs_destroy_compress_ctx(&cc, false); return first_index; } @@ -1350,7 +1351,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, f2fs_put_rpages(cc); page_array_free(cc->inode, cc->cpages, cc->nr_cpages); cc->cpages = NULL; - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, false); return 0; out_destroy_crypt: @@ -1512,7 +1513,7 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, err = f2fs_write_raw_pages(cc, submitted, wbc, io_type); f2fs_put_rpages_wbc(cc, wbc, false, 0); destroy_out: - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, false); return err; } 
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d3e4f8315b40..75a6c0f86602 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2298,7 +2298,7 @@ static int f2fs_mpage_readpages(struct inode *inode, max_nr_pages, &last_block_in_bio, rac != NULL, false); - f2fs_destroy_compress_ctx(&cc); + f2fs_destroy_compress_ctx(&cc, false); if (ret) goto set_error_page; } @@ -2343,7 +2343,7 @@ static int f2fs_mpage_readpages(struct inode *inode, max_nr_pages, &last_block_in_bio, rac != NULL, false); - f2fs_destroy_compress_ctx(&cc); + f2fs_destroy_compress_ctx(&cc, false); } } #endif @@ -3044,7 +3044,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, } } if (f2fs_compressed_file(inode)) - f2fs_destroy_compress_ctx(&cc); + f2fs_destroy_compress_ctx(&cc, false); #endif if (retry) { index = 0; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0ad5b73fdf5d..3a86bca01fff 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3952,7 +3952,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed); void f2fs_put_page_dic(struct page *page); int f2fs_init_compress_ctx(struct compress_ctx *cc); -void f2fs_destroy_compress_ctx(struct compress_ctx *cc); +void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse); void f2fs_init_compress_info(struct f2fs_sb_info *sbi); int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi); void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi); From 2f998899f3cc46accdfe50edb28179d950fbdc3e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 11 May 2021 14:38:47 -0700 Subject: [PATCH 57/58] f2fs: avoid swapon failure by giving a warning first The final solution can be migrating blocks to form a section-aligned file internally. 
Meanwhile, let's ask users to do that when preparing the swap file initially like: 1) create() 2) ioctl(F2FS_IOC_SET_PIN_FILE) 3) fallocate() Reported-by: kernel test robot Fixes: 36e4d95891ed ("f2fs: check if swapfile is section-alligned") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 75a6c0f86602..0364f9384f5a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3812,6 +3812,7 @@ static int f2fs_is_file_aligned(struct inode *inode) block_t pblock; unsigned long nr_pblocks; unsigned int blocks_per_sec = BLKS_PER_SEC(sbi); + unsigned int not_aligned = 0; int ret = 0; cur_lblock = 0; @@ -3844,13 +3845,20 @@ static int f2fs_is_file_aligned(struct inode *inode) if ((pblock - main_blkaddr) & (blocks_per_sec - 1) || nr_pblocks & (blocks_per_sec - 1)) { - f2fs_err(sbi, "Swapfile does not align to section"); - ret = -EINVAL; - goto out; + if (f2fs_is_pinned_file(inode)) { + f2fs_err(sbi, "Swapfile does not align to section"); + ret = -EINVAL; + goto out; + } + not_aligned++; } cur_lblock += nr_pblocks; } + if (not_aligned) + f2fs_warn(sbi, "Swapfile (%u) is not align to section: \n" + "\t1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate()", + not_aligned); out: return ret; } @@ -3869,6 +3877,7 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, int nr_extents = 0; unsigned long nr_pblocks; unsigned int blocks_per_sec = BLKS_PER_SEC(sbi); + unsigned int not_aligned = 0; int ret = 0; /* @@ -3907,9 +3916,12 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, if ((pblock - SM_I(sbi)->main_blkaddr) & (blocks_per_sec - 1) || nr_pblocks & (blocks_per_sec - 1)) { - f2fs_err(sbi, "Swapfile does not align to section"); - ret = -EINVAL; - goto out; + if (f2fs_is_pinned_file(inode)) { + f2fs_err(sbi, "Swapfile does not align to section"); + ret = -EINVAL; + goto out; + } + not_aligned++; } 
if (cur_lblock + nr_pblocks >= sis->max) @@ -3938,6 +3950,11 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, sis->max = cur_lblock; sis->pages = cur_lblock - 1; sis->highest_bit = cur_lblock - 1; + + if (not_aligned) + f2fs_warn(sbi, "Swapfile (%u) is not align to section: \n" + "\t1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate()", + not_aligned); out: return ret; } From 030ee5e654f0dd51de7e86ba6d2136fe7b055508 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 May 2021 07:38:00 -0700 Subject: [PATCH 58/58] f2fs: return EINVAL for hole cases in swap file This tries to fix xfstests/generic/495. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0364f9384f5a..630fed85cb89 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3907,7 +3907,7 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, /* hole */ if (!(map.m_flags & F2FS_MAP_FLAGS)) { f2fs_err(sbi, "Swapfile has holes\n"); - ret = -ENOENT; + ret = -EINVAL; goto out; } @@ -4063,7 +4063,7 @@ static int check_swap_activate(struct swap_info_struct *sis, return ret; bad_bmap: f2fs_err(sbi, "Swapfile has holes\n"); - return -ENOENT; + return -EINVAL; } static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,