diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 62c091b52bac..e0ed273e2e8a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -707,15 +707,6 @@ enum { * found an unwritten extent, we need to split it. */ #define EXT4_GET_BLOCKS_SPLIT_NOMERGE 0x0008 - /* - * Caller is from the dio or dioread_nolock buffered IO, reqest to - * create an unwritten extent if it does not exist or split the - * found unwritten extent. Also do not merge the newly created - * unwritten extent, io end will convert unwritten to written, - * and try to merge the written extent. - */ -#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_SPLIT_NOMERGE|\ - EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) /* Convert unwritten extent to initialized. */ #define EXT4_GET_BLOCKS_CONVERT 0x0010 /* Eventual metadata allocation (due to growing extent tree) @@ -1692,6 +1683,8 @@ struct ext4_sb_info { /* timer for periodic error stats printing */ struct timer_list s_err_report; + /* timeout in seconds for s_err_report; 0 disables the timer. */ + unsigned long s_err_report_sec; /* Lazy inode table initialization info */ struct ext4_li_request *s_li_request; @@ -1795,6 +1788,10 @@ struct ext4_sb_info { * Main fast commit lock. This lock protects accesses to the * following fields: * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh. + * + * s_fc_lock can be taken from reclaim context (inode eviction) and is + * thus reclaim unsafe. Use ext4_fc_lock()/ext4_fc_unlock() helpers + * when acquiring / releasing the lock. 
*/ struct mutex s_fc_lock; struct buffer_head *s_fc_bh; @@ -1839,6 +1836,18 @@ static inline void ext4_writepages_up_write(struct super_block *sb, int ctx) percpu_up_write(&EXT4_SB(sb)->s_writepages_rwsem); } +static inline int ext4_fc_lock(struct super_block *sb) +{ + mutex_lock(&EXT4_SB(sb)->s_fc_lock); + return memalloc_nofs_save(); +} + +static inline void ext4_fc_unlock(struct super_block *sb, int ctx) +{ + memalloc_nofs_restore(ctx); + mutex_unlock(&EXT4_SB(sb)->s_fc_lock); +} + static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || @@ -2373,7 +2382,6 @@ static inline int ext4_emergency_state(struct super_block *sb) #define EXT4_DEF_SB_UPDATE_INTERVAL_SEC (3600) /* seconds (1 hour) */ #define EXT4_DEF_SB_UPDATE_INTERVAL_KB (16384) /* kilobytes (16MB) */ - /* * Minimum number of groups in a flexgroup before we separate out * directories into the first block group of a flexgroup @@ -3199,6 +3207,7 @@ extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, unsigned int flags); extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb, ext4_group_t block_group); +extern void print_daily_error_info(struct timer_list *t); extern __printf(7, 8) void __ext4_error(struct super_block *, const char *, unsigned int, bool, @@ -3795,6 +3804,10 @@ extern int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); +extern int ext4_map_query_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_map_create_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, struct ext4_ext_path *path); @@ -3909,7 +3922,6 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) } extern const struct iomap_ops 
ext4_iomap_ops; -extern const struct iomap_ops ext4_iomap_overwrite_ops; extern const struct iomap_ops ext4_iomap_report_ops; static inline int ext4_buffer_uptodate(struct buffer_head *bh) diff --git a/fs/ext4/extents-test.c b/fs/ext4/extents-test.c new file mode 100644 index 000000000000..4879e68e465d --- /dev/null +++ b/fs/ext4/extents-test.c @@ -0,0 +1,1027 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Written by Ojaswin Mujoo (IBM) + * + * These Kunit tests are designed to test the functionality of + * extent split and conversion in ext4. + * + * Currently, ext4 can split extents in 2 ways: + * 1. By splitting the extents in the extent tree and optionally converting them + * to written or unwritten based on flags passed. + * 2. In case 1 encounters an error, ext4 instead zerooes out the unwritten + * areas of the extent and marks the complete extent written. + * + * The primary function that handles this is ext4_split_convert_extents(). + * + * We test both of the methods of split. The behavior we try to enforce is: + * 1. When passing EXT4_GET_BLOCKS_CONVERT flag to ext4_split_convert_extents(), + * the split extent should be converted to initialized. + * 2. When passing EXT4_GET_BLOCKS_CONVERT_UNWRITTEN flag to + * ext4_split_convert_extents(), the split extent should be converted to + * uninitialized. + * 3. In case we use the zeroout method, then we should correctly write zeroes + * to the unwritten areas of the extent and we should not corrupt/leak any + * data. + * + * Enforcing 1 and 2 is straight forward, we just setup a minimal inode with + * extent tree, call ext4_split_convert_extents() and check the final state of + * the extent tree. + * + * For zeroout testing, we maintain a separate buffer which represents the disk + * data corresponding to the extents. We then override ext4's zeroout functions + * to instead write zeroes to our buffer. Then, we override + * ext4_ext_insert_extent() to return -ENOSPC, which triggers the zeroout. 
+ * Finally, we check the state of the extent tree and zeroout buffer to confirm + * everything went well. + */ + +#include +#include +#include +#include + +#include "ext4.h" +#include "ext4_extents.h" + +#define EXT_DATA_PBLK 100 +#define EXT_DATA_LBLK 10 +#define EXT_DATA_LEN 3 + +struct kunit_ctx { + /* + * Ext4 inode which has only 1 unwrit extent + */ + struct ext4_inode_info *k_ei; + /* + * Represents the underlying data area (used for zeroout testing) + */ + char *k_data; +} k_ctx; + +/* + * describes the state of an expected extent in extent tree. + */ +struct kunit_ext_state { + ext4_lblk_t ex_lblk; + ext4_lblk_t ex_len; + bool is_unwrit; +}; + +/* + * describes the state of the data area of a writ extent. Used for testing + * correctness of zeroout. + */ +struct kunit_ext_data_state { + char exp_char; + ext4_lblk_t off_blk; + ext4_lblk_t len_blk; +}; + +enum kunit_test_types { + TEST_SPLIT_CONVERT, + TEST_CREATE_BLOCKS, +}; + +struct kunit_ext_test_param { + /* description of test */ + char *desc; + + /* determines which function will be tested */ + int type; + + /* is extent unwrit at beginning of test */ + bool is_unwrit_at_start; + + /* flags to pass while splitting */ + int split_flags; + + /* map describing range to split */ + struct ext4_map_blocks split_map; + + /* disable zeroout */ + bool disable_zeroout; + + /* no of extents expected after split */ + int nr_exp_ext; + + /* + * expected state of extents after split. We will never split into more + * than 3 extents + */ + struct kunit_ext_state exp_ext_state[3]; + + /* Below fields used for zeroout tests */ + + bool is_zeroout_test; + /* + * no of expected data segments (zeroout tests). Example, if we expect + * data to be 4kb 0s, followed by 8kb non-zero, then nr_exp_data_segs==2 + */ + int nr_exp_data_segs; + + /* + * expected state of data area after zeroout. 
+ */ + struct kunit_ext_data_state exp_data_state[3]; +}; + +static void ext_kill_sb(struct super_block *sb) +{ + generic_shutdown_super(sb); +} + +static int ext_set(struct super_block *sb, void *data) +{ + return 0; +} + +static struct file_system_type ext_fs_type = { + .name = "extents test", + .kill_sb = ext_kill_sb, +}; + +static void extents_kunit_exit(struct kunit *test) +{ + struct ext4_sb_info *sbi = k_ctx.k_ei->vfs_inode.i_sb->s_fs_info; + + kfree(sbi); + kfree(k_ctx.k_ei); + kfree(k_ctx.k_data); +} + +static int __ext4_ext_dirty_stub(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + return 0; +} + +static struct ext4_ext_path * +ext4_ext_insert_extent_stub(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *newext, int gb_flags) +{ + return ERR_PTR(-ENOSPC); +} + +/* + * We will zeroout the equivalent range in the data area + */ +static int ext4_ext_zeroout_stub(struct inode *inode, struct ext4_extent *ex) +{ + ext4_lblk_t ee_block, off_blk; + loff_t ee_len; + loff_t off_bytes; + struct kunit *test = kunit_get_current_test(); + + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + + KUNIT_EXPECT_EQ_MSG(test, 1, ee_block >= EXT_DATA_LBLK, "ee_block=%d", + ee_block); + KUNIT_EXPECT_EQ(test, 1, + ee_block + ee_len <= EXT_DATA_LBLK + EXT_DATA_LEN); + + off_blk = ee_block - EXT_DATA_LBLK; + off_bytes = off_blk << inode->i_sb->s_blocksize_bits; + memset(k_ctx.k_data + off_bytes, 0, + ee_len << inode->i_sb->s_blocksize_bits); + + return 0; +} + +static int ext4_issue_zeroout_stub(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, ext4_lblk_t len) +{ + ext4_lblk_t off_blk; + loff_t off_bytes; + struct kunit *test = kunit_get_current_test(); + + kunit_log(KERN_ALERT, test, + "%s: lblk=%u pblk=%llu len=%u", __func__, lblk, pblk, len); + KUNIT_EXPECT_EQ(test, 1, lblk >= EXT_DATA_LBLK); + KUNIT_EXPECT_EQ(test, 1, lblk + len <= 
EXT_DATA_LBLK + EXT_DATA_LEN); + KUNIT_EXPECT_EQ(test, 1, lblk - EXT_DATA_LBLK == pblk - EXT_DATA_PBLK); + + off_blk = lblk - EXT_DATA_LBLK; + off_bytes = off_blk << inode->i_sb->s_blocksize_bits; + memset(k_ctx.k_data + off_bytes, 0, + len << inode->i_sb->s_blocksize_bits); + + return 0; +} + +static int extents_kunit_init(struct kunit *test) +{ + struct ext4_extent_header *eh = NULL; + struct ext4_inode_info *ei; + struct inode *inode; + struct super_block *sb; + struct ext4_sb_info *sbi = NULL; + struct kunit_ext_test_param *param = + (struct kunit_ext_test_param *)(test->param_value); + int err; + + sb = sget(&ext_fs_type, NULL, ext_set, 0, NULL); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + sb->s_blocksize = 4096; + sb->s_blocksize_bits = 12; + + sbi = kzalloc(sizeof(struct ext4_sb_info), GFP_KERNEL); + if (sbi == NULL) + return -ENOMEM; + + sbi->s_sb = sb; + sb->s_fs_info = sbi; + + if (!param || !param->disable_zeroout) + sbi->s_extent_max_zeroout_kb = 32; + + /* setup the mock inode */ + k_ctx.k_ei = kzalloc(sizeof(struct ext4_inode_info), GFP_KERNEL); + if (k_ctx.k_ei == NULL) + return -ENOMEM; + ei = k_ctx.k_ei; + inode = &ei->vfs_inode; + + err = ext4_es_register_shrinker(sbi); + if (err) + return err; + + ext4_es_init_tree(&ei->i_es_tree); + rwlock_init(&ei->i_es_lock); + INIT_LIST_HEAD(&ei->i_es_list); + ei->i_es_all_nr = 0; + ei->i_es_shk_nr = 0; + ei->i_es_shrink_lblk = 0; + + ei->i_disksize = (EXT_DATA_LBLK + EXT_DATA_LEN + 10) + << sb->s_blocksize_bits; + ei->i_flags = 0; + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); + inode->i_sb = sb; + + k_ctx.k_data = kzalloc(EXT_DATA_LEN * 4096, GFP_KERNEL); + if (k_ctx.k_data == NULL) + return -ENOMEM; + + /* + * set the data area to a junk value + */ + memset(k_ctx.k_data, 'X', EXT_DATA_LEN * 4096); + + /* create a tree with depth 0 */ + eh = (struct ext4_extent_header *)k_ctx.k_ei->i_data; + + /* Fill extent header */ + eh = ext_inode_hdr(&k_ctx.k_ei->vfs_inode); + eh->eh_depth = 0; + eh->eh_entries = 
cpu_to_le16(1); + eh->eh_magic = EXT4_EXT_MAGIC; + eh->eh_max = + cpu_to_le16(ext4_ext_space_root_idx(&k_ctx.k_ei->vfs_inode, 0)); + eh->eh_generation = 0; + + /* + * add 1 extent in leaf node covering: + * - lblks: [EXT_DATA_LBLK, EXT_DATA_LBLK * + EXT_DATA_LEN) + * - pblks: [EXT_DATA_PBLK, EXT_DATA_PBLK + EXT_DATA_LEN) + */ + EXT_FIRST_EXTENT(eh)->ee_block = cpu_to_le32(EXT_DATA_LBLK); + EXT_FIRST_EXTENT(eh)->ee_len = cpu_to_le16(EXT_DATA_LEN); + ext4_ext_store_pblock(EXT_FIRST_EXTENT(eh), EXT_DATA_PBLK); + if (!param || param->is_unwrit_at_start) + ext4_ext_mark_unwritten(EXT_FIRST_EXTENT(eh)); + + ext4_es_insert_extent(inode, EXT_DATA_LBLK, EXT_DATA_LEN, EXT_DATA_PBLK, + ext4_ext_is_unwritten(EXT_FIRST_EXTENT(eh)) ? + EXTENT_STATUS_UNWRITTEN : + EXTENT_STATUS_WRITTEN, + 0); + + /* Add stubs */ + kunit_activate_static_stub(test, __ext4_ext_dirty, + __ext4_ext_dirty_stub); + kunit_activate_static_stub(test, ext4_ext_zeroout, ext4_ext_zeroout_stub); + kunit_activate_static_stub(test, ext4_issue_zeroout, + ext4_issue_zeroout_stub); + return 0; +} + +/* + * Return 1 if all bytes in the buf equal to c, else return the offset of first mismatch + */ +static int check_buffer(char *buf, int c, int size) +{ + void *ret = NULL; + + ret = memchr_inv(buf, c, size); + if (ret == NULL) + return 0; + + kunit_log(KERN_ALERT, kunit_get_current_test(), + "# %s: wrong char found at offset %u (expected:%d got:%d)", __func__, + (u32)((char *)ret - buf), c, *((char *)ret)); + return 1; +} + +/* + * Simulate a map block call by first calling ext4_map_query_blocks() to + * correctly populate map flags and pblk and then call the + * ext4_map_create_blocks() to do actual split and conversion. This is easier + * than calling ext4_map_blocks() because that needs mocking a lot of unrelated + * functions. 
+ */ +static void ext4_map_create_blocks_helper(struct kunit *test, + struct inode *inode, + struct ext4_map_blocks *map, + int flags) +{ + int retval = 0; + + retval = ext4_map_query_blocks(NULL, inode, map, flags); + if (retval < 0) { + KUNIT_FAIL(test, + "ext4_map_query_blocks() failed. Cannot proceed\n"); + return; + } + + ext4_map_create_blocks(NULL, inode, map, flags); +} + +static void test_split_convert(struct kunit *test) +{ + struct ext4_ext_path *path; + struct inode *inode = &k_ctx.k_ei->vfs_inode; + struct ext4_extent *ex; + struct ext4_map_blocks map; + const struct kunit_ext_test_param *param = + (const struct kunit_ext_test_param *)(test->param_value); + int blkbits = inode->i_sb->s_blocksize_bits; + + if (param->is_zeroout_test) + /* + * Force zeroout by making ext4_ext_insert_extent return ENOSPC + */ + kunit_activate_static_stub(test, ext4_ext_insert_extent, + ext4_ext_insert_extent_stub); + + path = ext4_find_extent(inode, EXT_DATA_LBLK, NULL, EXT4_EX_NOCACHE); + ex = path->p_ext; + KUNIT_EXPECT_EQ(test, EXT_DATA_LBLK, le32_to_cpu(ex->ee_block)); + KUNIT_EXPECT_EQ(test, EXT_DATA_LEN, ext4_ext_get_actual_len(ex)); + KUNIT_EXPECT_EQ(test, param->is_unwrit_at_start, + ext4_ext_is_unwritten(ex)); + if (param->is_zeroout_test) + KUNIT_EXPECT_EQ(test, 0, + check_buffer(k_ctx.k_data, 'X', + EXT_DATA_LEN << blkbits)); + + map.m_lblk = param->split_map.m_lblk; + map.m_len = param->split_map.m_len; + + switch (param->type) { + case TEST_SPLIT_CONVERT: + path = ext4_split_convert_extents(NULL, inode, &map, path, + param->split_flags, NULL); + break; + case TEST_CREATE_BLOCKS: + ext4_map_create_blocks_helper(test, inode, &map, param->split_flags); + break; + default: + KUNIT_FAIL(test, "param->type %d not support.", param->type); + } + + path = ext4_find_extent(inode, EXT_DATA_LBLK, NULL, EXT4_EX_NOCACHE); + ex = path->p_ext; + + for (int i = 0; i < param->nr_exp_ext; i++) { + struct kunit_ext_state exp_ext = param->exp_ext_state[i]; + bool es_check_needed 
= param->type != TEST_SPLIT_CONVERT; + struct extent_status es; + int contains_ex, ex_end, es_end, es_pblk; + + KUNIT_EXPECT_EQ(test, exp_ext.ex_lblk, + le32_to_cpu(ex->ee_block)); + KUNIT_EXPECT_EQ(test, exp_ext.ex_len, + ext4_ext_get_actual_len(ex)); + KUNIT_EXPECT_EQ(test, exp_ext.is_unwrit, + ext4_ext_is_unwritten(ex)); + /* + * Confirm extent cache is in sync. Note that es cache can be + * merged even when on-disk extents are not so take that into + * account. + * + * Also, ext4_split_convert_extents() forces EXT4_EX_NOCACHE hence + * es status are ignored for that case. + */ + if (es_check_needed) { + ext4_es_lookup_extent(inode, le32_to_cpu(ex->ee_block), + NULL, &es, NULL); + + ex_end = exp_ext.ex_lblk + exp_ext.ex_len; + es_end = es.es_lblk + es.es_len; + contains_ex = es.es_lblk <= exp_ext.ex_lblk && + es_end >= ex_end; + es_pblk = ext4_es_pblock(&es) + + (exp_ext.ex_lblk - es.es_lblk); + + KUNIT_EXPECT_EQ(test, contains_ex, 1); + KUNIT_EXPECT_EQ(test, ext4_ext_pblock(ex), es_pblk); + KUNIT_EXPECT_EQ(test, 1, + (exp_ext.is_unwrit && + ext4_es_is_unwritten(&es)) || + (!exp_ext.is_unwrit && + ext4_es_is_written(&es))); + } + + /* Only printed on failure */ + kunit_log(KERN_INFO, test, + "# [extent %d] exp: lblk:%d len:%d unwrit:%d \n", i, + exp_ext.ex_lblk, exp_ext.ex_len, exp_ext.is_unwrit); + kunit_log(KERN_INFO, test, + "# [extent %d] got: lblk:%d len:%d unwrit:%d\n", i, + le32_to_cpu(ex->ee_block), + ext4_ext_get_actual_len(ex), + ext4_ext_is_unwritten(ex)); + if (es_check_needed) + kunit_log( + KERN_INFO, test, + "# [extent %d] es: lblk:%d len:%d pblk:%lld type:0x%x\n", + i, es.es_lblk, es.es_len, ext4_es_pblock(&es), + ext4_es_type(&es)); + kunit_log(KERN_INFO, test, "------------------\n"); + + ex = ex + 1; + } + + if (!param->is_zeroout_test) + return; + + /* + * Check that then data area has been zeroed out correctly + */ + for (int i = 0; i < param->nr_exp_data_segs; i++) { + loff_t off, len; + struct kunit_ext_data_state exp_data_seg = 
param->exp_data_state[i]; + + off = exp_data_seg.off_blk << blkbits; + len = exp_data_seg.len_blk << blkbits; + KUNIT_EXPECT_EQ_MSG(test, 0, + check_buffer(k_ctx.k_data + off, + exp_data_seg.exp_char, len), + "# corruption in byte range [%lld, %lld)", + off, len); + } + + return; +} + +static const struct kunit_ext_test_param test_split_convert_params[] = { + /* unwrit to writ splits */ + { .desc = "split unwrit extent to 2 extents and convert 1st half writ", + .type = TEST_SPLIT_CONVERT, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CONVERT, + .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 }, + .nr_exp_ext = 2, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 0 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 1, + .is_unwrit = 1 } }, + .is_zeroout_test = 0 }, + { .desc = "split unwrit extent to 2 extents and convert 2nd half writ", + .type = TEST_SPLIT_CONVERT, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CONVERT, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 }, + .nr_exp_ext = 2, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 1 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 1, + .is_unwrit = 0 } }, + .is_zeroout_test = 0 }, + { .desc = "split unwrit extent to 3 extents and convert 2nd half to writ", + .type = TEST_SPLIT_CONVERT, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CONVERT, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 }, + .nr_exp_ext = 3, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 1 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 2, + .is_unwrit = 0 }, + { .ex_lblk = EXT_DATA_LBLK + 1 + (EXT_DATA_LEN - 2), + .ex_len = 1, + .is_unwrit = 1 } }, + .is_zeroout_test = 0 }, + + /* writ to unwrit splits */ + { .desc = "split writ extent to 2 extents and convert 1st half unwrit", + .type = TEST_SPLIT_CONVERT, + 
.is_unwrit_at_start = 0, + .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, + .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 }, + .nr_exp_ext = 2, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 1 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 1, + .is_unwrit = 0 } }, + .is_zeroout_test = 0 }, + { .desc = "split writ extent to 2 extents and convert 2nd half unwrit", + .type = TEST_SPLIT_CONVERT, + .is_unwrit_at_start = 0, + .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 }, + .nr_exp_ext = 2, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 0 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 1, + .is_unwrit = 1 } }, + .is_zeroout_test = 0 }, + { .desc = "split writ extent to 3 extents and convert 2nd half to unwrit", + .type = TEST_SPLIT_CONVERT, + .is_unwrit_at_start = 0, + .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 }, + .nr_exp_ext = 3, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 0 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 2, + .is_unwrit = 1 }, + { .ex_lblk = EXT_DATA_LBLK + 1 + (EXT_DATA_LEN - 2), + .ex_len = 1, + .is_unwrit = 0 } }, + .is_zeroout_test = 0 }, + + /* + * ***** zeroout tests ***** + */ + /* unwrit to writ splits */ + { .desc = "split unwrit extent to 2 extents and convert 1st half writ (zeroout)", + .type = TEST_SPLIT_CONVERT, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CONVERT, + .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 2, + .exp_data_state = { { .exp_char = 'X', .off_blk = 0, .len_blk = 1 }, + { .exp_char = 0, + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 1 
} } }, + { .desc = "split unwrit extent to 2 extents and convert 2nd half writ (zeroout)", + .type = TEST_SPLIT_CONVERT, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CONVERT, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 2, + .exp_data_state = { { .exp_char = 0, .off_blk = 0, .len_blk = 1 }, + { .exp_char = 'X', + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 1 } } }, + { .desc = "split unwrit extent to 3 extents and convert 2nd half writ (zeroout)", + .type = TEST_SPLIT_CONVERT, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CONVERT, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 3, + .exp_data_state = { { .exp_char = 0, .off_blk = 0, .len_blk = 1 }, + { .exp_char = 'X', .off_blk = 1, .len_blk = EXT_DATA_LEN - 2 }, + { .exp_char = 0, .off_blk = EXT_DATA_LEN - 1, .len_blk = 1 } } }, + + /* writ to unwrit splits */ + { .desc = "split writ extent to 2 extents and convert 1st half unwrit (zeroout)", + .type = TEST_SPLIT_CONVERT, + .is_unwrit_at_start = 0, + .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, + .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 2, + .exp_data_state = { { .exp_char = 0, .off_blk = 0, .len_blk = 1 }, + { .exp_char = 'X', + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 1 } } }, + { .desc = "split writ extent to 2 extents and convert 2nd half unwrit (zeroout)", + .type = TEST_SPLIT_CONVERT, + .is_unwrit_at_start = 0, + .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, + .split_map = { 
.m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 2, + .exp_data_state = { { .exp_char = 'X', .off_blk = 0, .len_blk = 1 }, + { .exp_char = 0, + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 1 } } }, + { .desc = "split writ extent to 3 extents and convert 2nd half unwrit (zeroout)", + .type = TEST_SPLIT_CONVERT, + .is_unwrit_at_start = 0, + .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 3, + .exp_data_state = { { .exp_char = 'X', .off_blk = 0, .len_blk = 1 }, + { .exp_char = 0, + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 2 }, + { .exp_char = 'X', + .off_blk = EXT_DATA_LEN - 1, + .len_blk = 1 } } }, +}; + +/* Tests to trigger ext4_ext_map_blocks() -> convert_initialized_extent() */ +static const struct kunit_ext_test_param test_convert_initialized_params[] = { + /* writ to unwrit splits */ + { .desc = "split writ extent to 2 extents and convert 1st half unwrit", + .type = TEST_CREATE_BLOCKS, + .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, + .is_unwrit_at_start = 0, + .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 }, + .nr_exp_ext = 2, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 1 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 1, + .is_unwrit = 0 } }, + .is_zeroout_test = 0 }, + { .desc = "split writ extent to 2 extents and convert 2nd half unwrit", + .type = TEST_CREATE_BLOCKS, + .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, + .is_unwrit_at_start = 0, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 }, + .nr_exp_ext = 2, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 
1, + .is_unwrit = 0 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 1, + .is_unwrit = 1 } }, + .is_zeroout_test = 0 }, + { .desc = "split writ extent to 3 extents and convert 2nd half to unwrit", + .type = TEST_CREATE_BLOCKS, + .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, + .is_unwrit_at_start = 0, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 }, + .nr_exp_ext = 3, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 0 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 2, + .is_unwrit = 1 }, + { .ex_lblk = EXT_DATA_LBLK + 1 + (EXT_DATA_LEN - 2), + .ex_len = 1, + .is_unwrit = 0 } }, + .is_zeroout_test = 0 }, + + /* writ to unwrit splits (zeroout) */ + { .desc = "split writ extent to 2 extents and convert 1st half unwrit (zeroout)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 0, + .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, + .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 2, + .exp_data_state = { { .exp_char = 0, .off_blk = 0, .len_blk = 1 }, + { .exp_char = 'X', + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 1 } } }, + { .desc = "split writ extent to 2 extents and convert 2nd half unwrit (zeroout)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 0, + .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 2, + .exp_data_state = { { .exp_char = 'X', .off_blk = 0, .len_blk = 1 }, + { .exp_char = 0, + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 1 } } }, + { .desc = "split writ extent to 3 extents and convert 2nd half unwrit (zeroout)", + .type = TEST_CREATE_BLOCKS, + 
.is_unwrit_at_start = 0, + .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 3, + .exp_data_state = { { .exp_char = 'X', .off_blk = 0, .len_blk = 1 }, + { .exp_char = 0, + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 2 }, + { .exp_char = 'X', + .off_blk = EXT_DATA_LEN - 1, + .len_blk = 1 } } }, +}; + +/* Tests to trigger ext4_ext_map_blocks() -> ext4_ext_handle_unwritten_exntents() */ +static const struct kunit_ext_test_param test_handle_unwritten_params[] = { + /* unwrit to writ splits via endio path */ + { .desc = "split unwrit extent to 2 extents and convert 1st half writ (endio)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CONVERT, + .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 }, + .nr_exp_ext = 2, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 0 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 1, + .is_unwrit = 1 } }, + .is_zeroout_test = 0 }, + { .desc = "split unwrit extent to 2 extents and convert 2nd half writ (endio)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CONVERT, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 }, + .nr_exp_ext = 2, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 1 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 1, + .is_unwrit = 0 } }, + .is_zeroout_test = 0 }, + { .desc = "split unwrit extent to 3 extents and convert 2nd half to writ (endio)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CONVERT, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 }, + .nr_exp_ext = 3, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + 
.ex_len = 1, + .is_unwrit = 1 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 2, + .is_unwrit = 0 }, + { .ex_lblk = EXT_DATA_LBLK + 1 + (EXT_DATA_LEN - 2), + .ex_len = 1, + .is_unwrit = 1 } }, + .is_zeroout_test = 0 }, + + /* unwrit to writ splits via non-endio path */ + { .desc = "split unwrit extent to 2 extents and convert 1st half writ (non endio)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CREATE, + .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 }, + .nr_exp_ext = 2, + .disable_zeroout = true, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 0 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 1, + .is_unwrit = 1 } }, + .is_zeroout_test = 0 }, + { .desc = "split unwrit extent to 2 extents and convert 2nd half writ (non endio)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CREATE, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 }, + .nr_exp_ext = 2, + .disable_zeroout = true, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 1 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 1, + .is_unwrit = 0 } }, + .is_zeroout_test = 0 }, + { .desc = "split unwrit extent to 3 extents and convert 2nd half to writ (non endio)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CREATE, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 }, + .nr_exp_ext = 3, + .disable_zeroout = true, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 1 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 2, + .is_unwrit = 0 }, + { .ex_lblk = EXT_DATA_LBLK + 1 + (EXT_DATA_LEN - 2), + .ex_len = 1, + .is_unwrit = 1 } }, + .is_zeroout_test = 0 }, + + /* + * ***** zeroout tests ***** + */ + /* unwrit to writ splits (endio)*/ + { .desc = "split unwrit extent to 2 extents and convert 
1st half writ (endio, zeroout)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CONVERT, + .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 2, + .exp_data_state = { { .exp_char = 'X', .off_blk = 0, .len_blk = 1 }, + { .exp_char = 0, + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 1 } } }, + { .desc = "split unwrit extent to 2 extents and convert 2nd half writ (endio, zeroout)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CONVERT, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 2, + .exp_data_state = { { .exp_char = 0, .off_blk = 0, .len_blk = 1 }, + { .exp_char = 'X', + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 1 } } }, + { .desc = "split unwrit extent to 3 extents and convert 2nd half writ (endio, zeroout)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CONVERT, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 3, + .exp_data_state = { { .exp_char = 0, .off_blk = 0, .len_blk = 1 }, + { .exp_char = 'X', + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 2 }, + { .exp_char = 0, + .off_blk = EXT_DATA_LEN - 1, + .len_blk = 1 } } }, + + /* unwrit to writ splits (non-endio)*/ + { .desc = "split unwrit extent to 2 extents and convert 1st half writ (non-endio, zeroout)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CREATE, + .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 
}, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 2, + .exp_data_state = { { .exp_char = 'X', .off_blk = 0, .len_blk = 1 }, + { .exp_char = 0, + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 1 } } }, + { .desc = "split unwrit extent to 2 extents and convert 2nd half writ (non-endio, zeroout)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CREATE, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 2, + .exp_data_state = { { .exp_char = 0, .off_blk = 0, .len_blk = 1 }, + { .exp_char = 'X', + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 1 } } }, + { .desc = "split unwrit extent to 3 extents and convert 2nd half writ (non-endio, zeroout)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CREATE, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 3, + .exp_data_state = { { .exp_char = 0, .off_blk = 0, .len_blk = 1 }, + { .exp_char = 'X', + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 2 }, + { .exp_char = 0, + .off_blk = EXT_DATA_LEN - 1, + .len_blk = 1 } } }, +}; + +static void ext_get_desc(struct kunit *test, const void *p, char *desc) + +{ + struct kunit_ext_test_param *param = (struct kunit_ext_test_param *)p; + + snprintf(desc, KUNIT_PARAM_DESC_SIZE, "%s %s\n", param->desc, + (param->type & TEST_CREATE_BLOCKS) ? 
"(highlevel)" : ""); +} + +static int test_split_convert_param_init(struct kunit *test) +{ + size_t arr_size = ARRAY_SIZE(test_split_convert_params); + + kunit_register_params_array(test, test_split_convert_params, arr_size, + ext_get_desc); + return 0; +} + +static int test_convert_initialized_param_init(struct kunit *test) +{ + size_t arr_size = ARRAY_SIZE(test_convert_initialized_params); + + kunit_register_params_array(test, test_convert_initialized_params, + arr_size, ext_get_desc); + return 0; +} + +static int test_handle_unwritten_init(struct kunit *test) +{ + size_t arr_size = ARRAY_SIZE(test_handle_unwritten_params); + + kunit_register_params_array(test, test_handle_unwritten_params, + arr_size, ext_get_desc); + return 0; +} + +/* + * Note that we use KUNIT_CASE_PARAM_WITH_INIT() instead of the more compact + * KUNIT_ARRAY_PARAM() because the later currently has a limitation causing the + * output parsing to be prone to error. For more context: + * + * https://lore.kernel.org/linux-kselftest/aULJpTvJDw9ctUDe@li-dc0c254c-257c-11b2-a85c-98b6c1322444.ibm.com/ + */ +static struct kunit_case extents_test_cases[] = { + KUNIT_CASE_PARAM_WITH_INIT(test_split_convert, kunit_array_gen_params, + test_split_convert_param_init, NULL), + KUNIT_CASE_PARAM_WITH_INIT(test_split_convert, kunit_array_gen_params, + test_convert_initialized_param_init, NULL), + KUNIT_CASE_PARAM_WITH_INIT(test_split_convert, kunit_array_gen_params, + test_handle_unwritten_init, NULL), + {} +}; + +static struct kunit_suite extents_test_suite = { + .name = "ext4_extents_test", + .init = extents_kunit_init, + .exit = extents_kunit_exit, + .test_cases = extents_test_cases, +}; + +kunit_test_suites(&extents_test_suite); + +MODULE_LICENSE("GPL"); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2cf5759ba689..3630b27e4fd7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -32,6 +32,7 @@ #include "ext4_jbd2.h" #include "ext4_extents.h" #include "xattr.h" +#include #include @@ -40,11 
+41,9 @@ */ #define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ due to ENOSPC */ -#define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */ -#define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */ - -#define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */ -#define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */ +static struct ext4_ext_path *ext4_split_convert_extents( + handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, + struct ext4_ext_path *path, int flags, unsigned int *allocated); static __le32 ext4_extent_block_csum(struct inode *inode, struct ext4_extent_header *eh) @@ -86,8 +85,7 @@ static void ext4_extent_block_csum_set(struct inode *inode, static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - ext4_lblk_t split, - int split_flag, int flags); + ext4_lblk_t split, int flags); static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) { @@ -192,6 +190,9 @@ static int __ext4_ext_dirty(const char *where, unsigned int line, { int err; + KUNIT_STATIC_STUB_REDIRECT(__ext4_ext_dirty, where, line, handle, inode, + path); + WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); if (path->p_bh) { ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh)); @@ -332,15 +333,12 @@ ext4_force_split_extent_at(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t lblk, int nofail) { - int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext); int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_SPLIT_NOMERGE; if (nofail) flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL; - return ext4_split_extent_at(handle, inode, path, lblk, unwritten ? 
- EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0, - flags); + return ext4_split_extent_at(handle, inode, path, lblk, flags); } static int @@ -530,6 +528,8 @@ static void ext4_cache_extents(struct inode *inode, ext4_lblk_t prev = 0; int i; + KUNIT_STATIC_STUB_REDIRECT(ext4_cache_extents, inode, eh); + for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { unsigned int status = EXTENT_STATUS_WRITTEN; ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); @@ -893,6 +893,8 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, int ret; gfp_t gfp_flags = GFP_NOFS; + KUNIT_STATIC_STUB_REDIRECT(ext4_find_extent, inode, block, path, flags); + if (flags & EXT4_EX_NOFAIL) gfp_flags |= __GFP_NOFAIL; @@ -1985,6 +1987,9 @@ ext4_ext_insert_extent(handle_t *handle, struct inode *inode, ext4_lblk_t next; int mb_flags = 0, unwritten; + KUNIT_STATIC_STUB_REDIRECT(ext4_ext_insert_extent, handle, inode, path, + newext, gb_flags); + if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) mb_flags |= EXT4_MB_DELALLOC_RESERVED; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { @@ -2944,10 +2949,6 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, } else { path = kcalloc(depth + 1, sizeof(struct ext4_ext_path), GFP_NOFS | __GFP_NOFAIL); - if (path == NULL) { - ext4_journal_stop(handle); - return -ENOMEM; - } path[0].p_maxdepth = path[0].p_depth = depth; path[0].p_hdr = ext_inode_hdr(inode); i = 0; @@ -3133,8 +3134,8 @@ static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) ext4_fsblk_t ee_pblock; unsigned int ee_len; - ee_block = le32_to_cpu(ex->ee_block); - ee_len = ext4_ext_get_actual_len(ex); + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); if (ee_len == 0) @@ -3150,6 +3151,8 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) ext4_fsblk_t ee_pblock; unsigned int ee_len; + KUNIT_STATIC_STUB_REDIRECT(ext4_ext_zeroout, inode, ex); + ee_len = 
ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock, @@ -3163,35 +3166,30 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) * @inode: the file inode * @path: the path to the extent * @split: the logical block where the extent is splitted. - * @split_flags: indicates if the extent could be zeroout if split fails, and - * the states(init or unwritten) of new extents. * @flags: flags used to insert new extent to extent tree. * * * Splits extent [a, b] into two extents [a, @split) and [@split, b], states - * of which are determined by split_flag. + * of which are same as the original extent. No conversion is performed. * - * There are two cases: - * a> the extent are splitted into two extent. - * b> split is not needed, and just mark the extent. - * - * Return an extent path pointer on success, or an error pointer on failure. + * Return an extent path pointer on success, or an error pointer on failure. On + * failure, the extent is restored to original state. */ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t split, - int split_flag, int flags) + int flags) { ext4_fsblk_t newblock; ext4_lblk_t ee_block; - struct ext4_extent *ex, newex, orig_ex, zero_ex; + struct ext4_extent *ex, newex, orig_ex; struct ext4_extent *ex2 = NULL; unsigned int ee_len, depth; - int err = 0; + int err = 0, insert_err = 0, is_unwrit = 0; - BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) == - (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)); + /* Do not cache extents that are in the process of being modified. 
*/ + flags |= EXT4_EX_NOCACHE; ext_debug(inode, "logical block %llu\n", (unsigned long long)split); @@ -3202,39 +3200,24 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); newblock = split - ee_block + ext4_ext_pblock(ex); + is_unwrit = ext4_ext_is_unwritten(ex); BUG_ON(split < ee_block || split >= (ee_block + ee_len)); - BUG_ON(!ext4_ext_is_unwritten(ex) && - split_flag & (EXT4_EXT_MAY_ZEROOUT | - EXT4_EXT_MARK_UNWRIT1 | - EXT4_EXT_MARK_UNWRIT2)); + + /* + * No split needed + */ + if (split == ee_block) + goto out; err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; - if (split == ee_block) { - /* - * case b: block @split is the block that the extent begins with - * then we just change the state of the extent, and splitting - * is not needed. - */ - if (split_flag & EXT4_EXT_MARK_UNWRIT2) - ext4_ext_mark_unwritten(ex); - else - ext4_ext_mark_initialized(ex); - - if (!(flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) - ext4_ext_try_to_merge(handle, inode, path, ex); - - err = ext4_ext_dirty(handle, inode, path + path->p_depth); - goto out; - } - /* case a */ memcpy(&orig_ex, ex, sizeof(orig_ex)); ex->ee_len = cpu_to_le16(split - ee_block); - if (split_flag & EXT4_EXT_MARK_UNWRIT1) + if (is_unwrit) ext4_ext_mark_unwritten(ex); /* @@ -3249,17 +3232,16 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, ex2->ee_block = cpu_to_le32(split); ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); ext4_ext_store_pblock(ex2, newblock); - if (split_flag & EXT4_EXT_MARK_UNWRIT2) + if (is_unwrit) ext4_ext_mark_unwritten(ex2); path = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (!IS_ERR(path)) - goto out; - - err = PTR_ERR(path); - if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM) return path; + insert_err = PTR_ERR(path); + err = 0; + /* * Get a new path to try to zeroout or fix the extent length. 
* Using EXT4_EX_NOFAIL guarantees that ext4_find_extent() @@ -3272,70 +3254,124 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, if (IS_ERR(path)) { EXT4_ERROR_INODE(inode, "Failed split extent on %u, err %ld", split, PTR_ERR(path)); - return path; + goto out_path; } + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + depth = ext_depth(inode); ex = path[depth].p_ext; - if (EXT4_EXT_MAY_ZEROOUT & split_flag) { - if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { - if (split_flag & EXT4_EXT_DATA_VALID1) { - err = ext4_ext_zeroout(inode, ex2); - zero_ex.ee_block = ex2->ee_block; - zero_ex.ee_len = cpu_to_le16( - ext4_ext_get_actual_len(ex2)); - ext4_ext_store_pblock(&zero_ex, - ext4_ext_pblock(ex2)); - } else { - err = ext4_ext_zeroout(inode, ex); - zero_ex.ee_block = ex->ee_block; - zero_ex.ee_len = cpu_to_le16( - ext4_ext_get_actual_len(ex)); - ext4_ext_store_pblock(&zero_ex, - ext4_ext_pblock(ex)); - } - } else { - err = ext4_ext_zeroout(inode, &orig_ex); - zero_ex.ee_block = orig_ex.ee_block; - zero_ex.ee_len = cpu_to_le16( - ext4_ext_get_actual_len(&orig_ex)); - ext4_ext_store_pblock(&zero_ex, - ext4_ext_pblock(&orig_ex)); - } - - if (!err) { - /* update the extent length and mark as initialized */ - ex->ee_len = cpu_to_le16(ee_len); - ext4_ext_try_to_merge(handle, inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + path->p_depth); - if (!err) - /* update extent status tree */ - ext4_zeroout_es(inode, &zero_ex); - /* If we failed at this point, we don't know in which - * state the extent tree exactly is so don't try to fix - * length of the original extent as it may do even more - * damage. - */ - goto out; - } - } - fix_extent_len: ex->ee_len = orig_ex.ee_len; - /* - * Ignore ext4_ext_dirty return value since we are already in error path - * and err is a non-zero error code. 
- */ - ext4_ext_dirty(handle, inode, path + path->p_depth); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); out: - if (err) { + if (err || insert_err) { ext4_free_ext_path(path); - path = ERR_PTR(err); + path = err ? ERR_PTR(err) : ERR_PTR(insert_err); } +out_path: + if (IS_ERR(path)) + /* Remove all remaining potentially stale extents. */ + ext4_es_remove_extent(inode, ee_block, ee_len); ext4_ext_show_leaf(inode, path); return path; } +static int ext4_split_extent_zeroout(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_map_blocks *map, int flags) +{ + struct ext4_extent *ex; + unsigned int ee_len, depth; + ext4_lblk_t ee_block; + uint64_t lblk, pblk, len; + int is_unwrit; + int err = 0; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + is_unwrit = ext4_ext_is_unwritten(ex); + + if (flags & EXT4_GET_BLOCKS_CONVERT) { + /* + * EXT4_GET_BLOCKS_CONVERT: Caller wants the range specified by + * map to be initialized. Zeroout everything except the map + * range. + */ + + loff_t map_end = (loff_t) map->m_lblk + map->m_len; + loff_t ex_end = (loff_t) ee_block + ee_len; + + if (!is_unwrit) + /* Shouldn't happen. 
Just exit */ + return -EINVAL; + + /* zeroout left */ + if (map->m_lblk > ee_block) { + lblk = ee_block; + len = map->m_lblk - ee_block; + pblk = ext4_ext_pblock(ex); + err = ext4_issue_zeroout(inode, lblk, pblk, len); + if (err) + /* ZEROOUT failed, just return original error */ + return err; + } + + /* zeroout right */ + if (map_end < ex_end) { + lblk = map_end; + len = ex_end - map_end; + pblk = ext4_ext_pblock(ex) + (map_end - ee_block); + err = ext4_issue_zeroout(inode, lblk, pblk, len); + if (err) + /* ZEROOUT failed, just return original error */ + return err; + } + } else if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { + /* + * EXT4_GET_BLOCKS_CONVERT_UNWRITTEN: Caller wants the + * range specified by map to be marked unwritten. + * Zeroout the map range leaving rest as it is. + */ + + if (is_unwrit) + /* Shouldn't happen. Just exit */ + return -EINVAL; + + lblk = map->m_lblk; + len = map->m_len; + pblk = ext4_ext_pblock(ex) + (map->m_lblk - ee_block); + err = ext4_issue_zeroout(inode, lblk, pblk, len); + if (err) + /* ZEROOUT failed, just return original error */ + return err; + } else { + /* + * We no longer perform unwritten to unwritten splits in IO paths. + * Hence this should not happen. 
+ */ + WARN_ON_ONCE(true); + return -EINVAL; + } + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + return err; + + ext4_ext_mark_initialized(ex); + + ext4_ext_dirty(handle, inode, path + depth); + if (err) + return err; + + return 0; +} + /* * ext4_split_extent() splits an extent and mark extent which is covered * by @map as split_flags indicates @@ -3352,13 +3388,13 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle, struct ext4_ext_path *path, struct ext4_map_blocks *map, int split_flag, int flags, - unsigned int *allocated) + unsigned int *allocated, bool *did_zeroout) { - ext4_lblk_t ee_block; + ext4_lblk_t ee_block, orig_ee_block; struct ext4_extent *ex; - unsigned int ee_len, depth; - int unwritten; - int split_flag1, flags1; + unsigned int ee_len, orig_ee_len, depth; + int unwritten, orig_unwritten; + int orig_err = 0; depth = ext_depth(inode); ex = path[depth].p_ext; @@ -3366,25 +3402,27 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle, ee_len = ext4_ext_get_actual_len(ex); unwritten = ext4_ext_is_unwritten(ex); + orig_ee_block = ee_block; + orig_ee_len = ee_len; + orig_unwritten = unwritten; + + /* Do not cache extents that are in the process of being modified. */ + flags |= EXT4_EX_NOCACHE; + if (map->m_lblk + map->m_len < ee_block + ee_len) { - split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; - flags1 = flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE; - if (unwritten) - split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | - EXT4_EXT_MARK_UNWRIT2; - if (split_flag & EXT4_EXT_DATA_VALID2) - split_flag1 |= EXT4_EXT_DATA_VALID1; path = ext4_split_extent_at(handle, inode, path, - map->m_lblk + map->m_len, split_flag1, flags1); + map->m_lblk + map->m_len, flags); if (IS_ERR(path)) - return path; + goto try_zeroout; + /* * Update path is required because previous ext4_split_extent_at * may result in split of original leaf or extent zeroout. 
*/ path = ext4_find_extent(inode, map->m_lblk, path, flags); if (IS_ERR(path)) - return path; + goto try_zeroout; + depth = ext_depth(inode); ex = path[depth].p_ext; if (!ex) { @@ -3393,22 +3431,69 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle, ext4_free_ext_path(path); return ERR_PTR(-EFSCORRUPTED); } - unwritten = ext4_ext_is_unwritten(ex); + + /* extent would have changed so update original values */ + orig_ee_block = le32_to_cpu(ex->ee_block); + orig_ee_len = ext4_ext_get_actual_len(ex); + orig_unwritten = ext4_ext_is_unwritten(ex); } if (map->m_lblk >= ee_block) { - split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; - if (unwritten) { - split_flag1 |= EXT4_EXT_MARK_UNWRIT1; - split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | - EXT4_EXT_MARK_UNWRIT2); - } - path = ext4_split_extent_at(handle, inode, path, - map->m_lblk, split_flag1, flags); + path = ext4_split_extent_at(handle, inode, path, map->m_lblk, + flags); if (IS_ERR(path)) - return path; + goto try_zeroout; } + goto success; + +try_zeroout: + /* + * There was an error in splitting the extent. So instead, just zeroout + * unwritten portions and convert it to initialized as a last resort. If + * there is any failure here we just return the original error + */ + + orig_err = PTR_ERR(path); + if (orig_err != -ENOSPC && orig_err != -EDQUOT && orig_err != -ENOMEM) + goto out_orig_err; + + /* we can't zeroout? 
just return the original err */ + if (!(split_flag & EXT4_EXT_MAY_ZEROOUT)) + goto out_orig_err; + + if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { + int max_zeroout_blks = + EXT4_SB(inode->i_sb)->s_extent_max_zeroout_kb >> + (inode->i_sb->s_blocksize_bits - 10); + + if (map->m_len > max_zeroout_blks) + goto out_orig_err; + } + + path = ext4_find_extent(inode, map->m_lblk, NULL, flags); + if (IS_ERR(path)) + goto out_orig_err; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + unwritten = ext4_ext_is_unwritten(ex); + + /* extent to zeroout should have been unchanged but its not */ + if (WARN_ON(ee_block != orig_ee_block || ee_len != orig_ee_len || + unwritten != orig_unwritten)) + goto out_free_path; + + if (ext4_split_extent_zeroout(handle, inode, path, map, flags)) + goto out_free_path; + + /* zeroout succeeded */ + if (did_zeroout) + *did_zeroout = true; + +success: if (allocated) { if (map->m_lblk + map->m_len > ee_block + ee_len) *allocated = ee_len - (map->m_lblk - ee_block); @@ -3417,6 +3502,12 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle, } ext4_ext_show_leaf(inode, path); return path; + +out_free_path: + ext4_free_ext_path(path); +out_orig_err: + return ERR_PTR(orig_err); + } /* @@ -3452,7 +3543,6 @@ ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, ext4_lblk_t ee_block, eof_block; unsigned int ee_len, depth, map_len = map->m_len; int err = 0; - int split_flag = EXT4_EXT_DATA_VALID2; unsigned int max_zeroout = 0; ext_debug(inode, "logical block %llu, max_blocks %u\n", @@ -3604,9 +3694,7 @@ ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, * It is safe to convert extent to initialized via explicit * zeroout only if extent is fully inside i_size or new_size. */ - split_flag |= ee_block + ee_len <= eof_block ? 
EXT4_EXT_MAY_ZEROOUT : 0; - - if (EXT4_EXT_MAY_ZEROOUT & split_flag) + if (ee_block + ee_len <= eof_block) max_zeroout = sbi->s_extent_max_zeroout_kb >> (inode->i_sb->s_blocksize_bits - 10); @@ -3661,8 +3749,8 @@ ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, } fallback: - path = ext4_split_extent(handle, inode, path, &split_map, split_flag, - flags, NULL); + path = ext4_split_convert_extents(handle, inode, &split_map, path, + flags | EXT4_GET_BLOCKS_CONVERT, NULL); if (IS_ERR(path)) return path; out: @@ -3712,7 +3800,8 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle, ext4_lblk_t ee_block; struct ext4_extent *ex; unsigned int ee_len; - int split_flag = 0, depth; + int split_flag = 0, depth, err = 0; + bool did_zeroout = false; ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)map->m_lblk, map->m_len); @@ -3726,34 +3815,87 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - /* Convert to unwritten */ - if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { - split_flag |= EXT4_EXT_DATA_VALID1; - /* Convert to initialized */ - } else if (flags & EXT4_GET_BLOCKS_CONVERT) { - /* - * It is safe to convert extent to initialized via explicit - * zeroout only if extent is fully inside i_size or new_size. - */ - split_flag |= ee_block + ee_len <= eof_block ? - EXT4_EXT_MAY_ZEROOUT : 0; - split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); + /* No split needed */ + if (ee_block == map->m_lblk && ee_len == map->m_len) + goto convert; + + /* + * It is only safe to convert extent to initialized via explicit + * zeroout only if extent is fully inside i_size or new_size. + */ + split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; + + /* + * pass SPLIT_NOMERGE explicitly so we don't end up merging extents we + * just split. 
+ */ + path = ext4_split_extent(handle, inode, path, map, split_flag, + flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE, + allocated, &did_zeroout); + if (IS_ERR(path)) + return path; + +convert: + path = ext4_find_extent(inode, map->m_lblk, path, flags); + if (IS_ERR(path)) + return path; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + + /* + * Conversion is already handled in case of zeroout + */ + if (!did_zeroout) { + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto err; + + if (flags & EXT4_GET_BLOCKS_CONVERT) + ext4_ext_mark_initialized(ex); + else if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) + ext4_ext_mark_unwritten(ex); + + if (!(flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) + /* + * note: ext4_ext_correct_indexes() isn't needed here because + * borders are not changed + */ + ext4_ext_try_to_merge(handle, inode, path, ex); + + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto err; } - flags |= EXT4_GET_BLOCKS_SPLIT_NOMERGE; - return ext4_split_extent(handle, inode, path, map, split_flag, flags, - allocated); + + /* Lets update the extent status tree after conversion */ + if (!(flags & EXT4_EX_NOCACHE)) + ext4_es_insert_extent(inode, le32_to_cpu(ex->ee_block), + ext4_ext_get_actual_len(ex), + ext4_ext_pblock(ex), + ext4_ext_is_unwritten(ex) ? 
+ EXTENT_STATUS_UNWRITTEN : + EXTENT_STATUS_WRITTEN, + false); + +err: + if (err) { + ext4_free_ext_path(path); + return ERR_PTR(err); + } + + return path; } static struct ext4_ext_path * ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path *path) + struct ext4_ext_path *path, int flags) { struct ext4_extent *ex; ext4_lblk_t ee_block; unsigned int ee_len; int depth; - int err = 0; depth = ext_depth(inode); ex = path[depth].p_ext; @@ -3763,66 +3905,21 @@ ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)ee_block, ee_len); - /* If extent is larger than requested it is a clear sign that we still - * have some extent state machine issues left. So extent_split is still - * required. - * TODO: Once all related issues will be fixed this situation should be - * illegal. - */ - if (ee_block != map->m_lblk || ee_len > map->m_len) { -#ifdef CONFIG_EXT4_DEBUG - ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu," - " len %u; IO logical block %llu, len %u", - inode->i_ino, (unsigned long long)ee_block, ee_len, - (unsigned long long)map->m_lblk, map->m_len); -#endif - path = ext4_split_convert_extents(handle, inode, map, path, - EXT4_GET_BLOCKS_CONVERT, NULL); - if (IS_ERR(path)) - return path; - - path = ext4_find_extent(inode, map->m_lblk, path, 0); - if (IS_ERR(path)) - return path; - depth = ext_depth(inode); - ex = path[depth].p_ext; - } - - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto errout; - /* first mark the extent as initialized */ - ext4_ext_mark_initialized(ex); - - /* note: ext4_ext_correct_indexes() isn't needed here because - * borders are not changed - */ - ext4_ext_try_to_merge(handle, inode, path, ex); - - /* Mark modified extent as dirty */ - err = ext4_ext_dirty(handle, inode, path + path->p_depth); - if (err) - goto errout; - - 
ext4_ext_show_leaf(inode, path); - return path; - -errout: - ext4_free_ext_path(path); - return ERR_PTR(err); + return ext4_split_convert_extents(handle, inode, map, path, flags, + NULL); } static struct ext4_ext_path * convert_initialized_extent(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path, + int flags, unsigned int *allocated) { struct ext4_extent *ex; ext4_lblk_t ee_block; unsigned int ee_len; int depth; - int err = 0; /* * Make sure that the extent is no bigger than we support with @@ -3839,53 +3936,33 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)ee_block, ee_len); - if (ee_block != map->m_lblk || ee_len > map->m_len) { - path = ext4_split_convert_extents(handle, inode, map, path, - EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, NULL); - if (IS_ERR(path)) - return path; + path = ext4_split_convert_extents(handle, inode, map, path, flags, + NULL); + if (IS_ERR(path)) + return path; - path = ext4_find_extent(inode, map->m_lblk, path, 0); - if (IS_ERR(path)) - return path; - depth = ext_depth(inode); - ex = path[depth].p_ext; - if (!ex) { - EXT4_ERROR_INODE(inode, "unexpected hole at %lu", - (unsigned long) map->m_lblk); - err = -EFSCORRUPTED; - goto errout; - } - } - - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto errout; - /* first mark the extent as unwritten */ - ext4_ext_mark_unwritten(ex); - - /* note: ext4_ext_correct_indexes() isn't needed here because - * borders are not changed - */ - ext4_ext_try_to_merge(handle, inode, path, ex); - - /* Mark modified extent as dirty */ - err = ext4_ext_dirty(handle, inode, path + path->p_depth); - if (err) - goto errout; ext4_ext_show_leaf(inode, path); ext4_update_inode_fsync_trans(handle, inode, 1); - map->m_flags |= EXT4_MAP_UNWRITTEN; + /* + * The extent might be initialized in case of zeroout. 
+ */ + path = ext4_find_extent(inode, map->m_lblk, path, flags); + if (IS_ERR(path)) + return path; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + + if (ext4_ext_is_unwritten(ex)) + map->m_flags |= EXT4_MAP_UNWRITTEN; + else + map->m_flags |= EXT4_MAP_MAPPED; if (*allocated > map->m_len) *allocated = map->m_len; map->m_len = *allocated; return path; - -errout: - ext4_free_ext_path(path); - return ERR_PTR(err); } static struct ext4_ext_path * @@ -3910,30 +3987,10 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, trace_ext4_ext_handle_unwritten_extents(inode, map, flags, *allocated, newblock); - /* get_block() before submitting IO, split the extent */ - if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE) { - path = ext4_split_convert_extents(handle, inode, map, path, - flags | EXT4_GET_BLOCKS_CONVERT, allocated); - if (IS_ERR(path)) - return path; - /* - * shouldn't get a 0 allocated when splitting an extent unless - * m_len is 0 (bug) or extent has been corrupted - */ - if (unlikely(*allocated == 0)) { - EXT4_ERROR_INODE(inode, - "unexpected allocated == 0, m_len = %u", - map->m_len); - err = -EFSCORRUPTED; - goto errout; - } - map->m_flags |= EXT4_MAP_UNWRITTEN; - goto out; - } /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { path = ext4_convert_unwritten_extents_endio(handle, inode, - map, path); + map, path, flags); if (IS_ERR(path)) return path; ext4_update_inode_fsync_trans(handle, inode, 1); @@ -3983,7 +4040,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, goto errout; } -out: map->m_flags |= EXT4_MAP_NEW; map_out: map->m_flags |= EXT4_MAP_MAPPED; @@ -4160,8 +4216,7 @@ static ext4_lblk_t ext4_ext_determine_insert_hole(struct inode *inode, insert_hole: /* Put just found gap into cache to speed up subsequent requests */ ext_debug(inode, " -> %u:%u\n", hole_start, len); - ext4_es_insert_extent(inode, hole_start, len, ~0, - EXTENT_STATUS_HOLE, false); + 
ext4_es_cache_extent(inode, hole_start, len, ~0, EXTENT_STATUS_HOLE); /* Update hole_len to reflect hole size after lblk */ if (hole_start != lblk) @@ -4257,7 +4312,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if ((!ext4_ext_is_unwritten(ex)) && (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { path = convert_initialized_extent(handle, - inode, map, path, &allocated); + inode, map, path, flags, &allocated); if (IS_ERR(path)) err = PTR_ERR(path); goto out; @@ -5375,7 +5430,8 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, if (!extent) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) *iterator); - return -EFSCORRUPTED; + ret = -EFSCORRUPTED; + goto out; } if (SHIFT == SHIFT_LEFT && *iterator > le32_to_cpu(extent->ee_block)) { @@ -5541,7 +5597,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) struct ext4_extent *extent; ext4_lblk_t start_lblk, len_lblk, ee_start_lblk = 0; unsigned int credits, ee_len; - int ret, depth, split_flag = 0; + int ret, depth; loff_t start; trace_ext4_insert_range(inode, offset, len); @@ -5612,12 +5668,8 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) */ if ((start_lblk > ee_start_lblk) && (start_lblk < (ee_start_lblk + ee_len))) { - if (ext4_ext_is_unwritten(extent)) - split_flag = EXT4_EXT_MARK_UNWRIT1 | - EXT4_EXT_MARK_UNWRIT2; path = ext4_split_extent_at(handle, inode, path, - start_lblk, split_flag, - EXT4_EX_NOCACHE | + start_lblk, EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_SPLIT_NOMERGE | EXT4_GET_BLOCKS_METADATA_NOFAIL); } @@ -6187,3 +6239,7 @@ int ext4_ext_clear_bb(struct inode *inode) ext4_free_ext_path(path); return 0; } + +#ifdef CONFIG_EXT4_KUNIT_TESTS +#include "extents-test.c" +#endif diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index e04fbf10fe4f..a1538bac51c6 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -16,6 +16,7 @@ #include "ext4.h" #include +#include /* * According to 
previous discussion in Ext4 Developer Workshop, we @@ -178,7 +179,8 @@ static struct kmem_cache *ext4_pending_cachep; static int __es_insert_extent(struct inode *inode, struct extent_status *newes, struct extent_status *prealloc); static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t end, int *reserved, + ext4_lblk_t end, unsigned int status, + int *reserved, struct extent_status *res, struct extent_status *prealloc); static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, @@ -242,6 +244,21 @@ static inline void ext4_es_inc_seq(struct inode *inode) WRITE_ONCE(ei->i_es_seq, ei->i_es_seq + 1); } +static inline int __es_check_extent_status(struct extent_status *es, + unsigned int status, + struct extent_status *res) +{ + if (ext4_es_type(es) & status) + return 0; + + if (res) { + res->es_lblk = es->es_lblk; + res->es_len = es->es_len; + res->es_pblk = es->es_pblk; + } + return -EINVAL; +} + /* * search through the tree for an delayed extent with a given offset. If * it can't be found, try to find next extent. @@ -882,7 +899,8 @@ static int __es_insert_extent(struct inode *inode, struct extent_status *newes, /* * ext4_es_insert_extent() adds information to an inode's extent - * status tree. + * status tree. This interface is used for modifying extents. To cache + * on-disk extents, use ext4_es_cache_extent() instead. */ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, @@ -929,7 +947,7 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, pr = __alloc_pending(true); write_lock(&EXT4_I(inode)->i_es_lock); - err1 = __es_remove_extent(inode, lblk, end, &resv_used, es1); + err1 = __es_remove_extent(inode, lblk, end, 0, &resv_used, NULL, es1); if (err1 != 0) goto error; /* Free preallocated extent if it didn't get used. 
*/ @@ -961,10 +979,6 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, } pending = err3; } - /* - * TODO: For cache on-disk extents, there is no need to increment - * the sequence counter, this requires future optimization. - */ ext4_es_inc_seq(inode); error: write_unlock(&EXT4_I(inode)->i_es_lock); @@ -998,17 +1012,24 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, } /* - * ext4_es_cache_extent() inserts information into the extent status - * tree if and only if there isn't information about the range in - * question already. + * ext4_es_cache_extent() inserts information into the extent status tree + * only if there is no existing information about the specified range or + * if the existing extents have the same status. + * + * Note that this interface is only used for caching on-disk extent + * information and cannot be used to convert existing extents in the extent + * status tree. To convert existing extents, use ext4_es_insert_extent() + * instead. */ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status) { struct extent_status *es; - struct extent_status newes; + struct extent_status chkes, newes; ext4_lblk_t end = lblk + len - 1; + bool conflict = false; + int err; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; @@ -1016,7 +1037,6 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, newes.es_lblk = lblk; newes.es_len = len; ext4_es_store_pblock_status(&newes, pblk, status); - trace_ext4_es_cache_extent(inode, &newes); if (!len) return; @@ -1024,11 +1044,42 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, BUG_ON(end < lblk); write_lock(&EXT4_I(inode)->i_es_lock); - es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk); - if (!es || es->es_lblk > end) - __es_insert_extent(inode, &newes, NULL); + if (es && es->es_lblk <= end) { + /* Found an extent that covers the entire range. 
*/ + if (es->es_lblk <= lblk && es->es_lblk + es->es_len > end) { + if (__es_check_extent_status(es, status, &chkes)) + conflict = true; + goto unlock; + } + /* Check and remove all extents in range. */ + err = __es_remove_extent(inode, lblk, end, status, NULL, + &chkes, NULL); + if (err) { + if (err == -EINVAL) + conflict = true; + goto unlock; + } + } + __es_insert_extent(inode, &newes, NULL); + trace_ext4_es_cache_extent(inode, &newes); + ext4_es_print_tree(inode); +unlock: write_unlock(&EXT4_I(inode)->i_es_lock); + if (!conflict) + return; + /* + * A hole in the on-disk extent but a delayed extent in the extent + * status tree is allowed. + */ + if (status == EXTENT_STATUS_HOLE && + ext4_es_type(&chkes) == EXTENT_STATUS_DELAYED) + return; + + ext4_warning_inode(inode, + "ES cache extent failed: add [%d,%d,%llu,0x%x] conflict with existing [%d,%d,%llu,0x%x]\n", + lblk, len, pblk, status, chkes.es_lblk, chkes.es_len, + ext4_es_pblock(&chkes), ext4_es_status(&chkes)); } /* @@ -1409,23 +1460,27 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, return rc->ndelayed; } - /* * __es_remove_extent - removes block range from extent status tree * * @inode - file containing range * @lblk - first block in range * @end - last block in range + * @status - the extent status to be checked * @reserved - number of cluster reservations released + * @res - return the extent if the status does not match * @prealloc - pre-allocated es to avoid memory allocation failures * * If @reserved is not NULL and delayed allocation is enabled, counts * block/cluster reservations freed by removing range and if bigalloc - * enabled cancels pending reservations as needed. Returns 0 on success, - * error code on failure. + * enabled cancels pending reservations as needed. If @status is not + * zero, check extent status type while removing extent, return -EINVAL + * and pass out the extent through @res if it does not match. Returns 0 on + * success, error code on failure. 
*/ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t end, int *reserved, + ext4_lblk_t end, unsigned int status, + int *reserved, struct extent_status *res, struct extent_status *prealloc) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; @@ -1434,18 +1489,24 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status orig_es; ext4_lblk_t len1, len2; ext4_fsblk_t block; - int err = 0; + int err; bool count_reserved = true; struct rsvd_count rc; if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC)) count_reserved = false; + if (status == 0) + status = ES_TYPE_MASK; es = __es_tree_search(&tree->root, lblk); if (!es) - goto out; + return 0; if (es->es_lblk > end) - goto out; + return 0; + + err = __es_check_extent_status(es, status, res); + if (err) + return err; /* Simply invalidate cache_es. */ tree->cache_es = NULL; @@ -1480,7 +1541,7 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, es->es_lblk = orig_es.es_lblk; es->es_len = orig_es.es_len; - goto out; + return err; } } else { es->es_lblk = end + 1; @@ -1494,7 +1555,7 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, if (count_reserved) count_rsvd(inode, orig_es.es_lblk + len1, orig_es.es_len - len1 - len2, &orig_es, &rc); - goto out_get_reserved; + goto out; } if (len1 > 0) { @@ -1509,6 +1570,9 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, } while (es && ext4_es_end(es) <= end) { + err = __es_check_extent_status(es, status, res); + if (err) + return err; if (count_reserved) count_rsvd(inode, es->es_lblk, es->es_len, es, &rc); node = rb_next(&es->rb_node); @@ -1524,6 +1588,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, if (es && es->es_lblk < end + 1) { ext4_lblk_t orig_len = es->es_len; + err = __es_check_extent_status(es, status, res); + if (err) + return err; + len1 = ext4_es_end(es) - end; if (count_reserved) count_rsvd(inode, 
es->es_lblk, orig_len - len1, @@ -1536,11 +1604,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, } } -out_get_reserved: +out: if (count_reserved) *reserved = get_rsvd(inode, end, es, &rc); -out: - return err; + return 0; } /* @@ -1582,7 +1649,7 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, * is reclaimed. */ write_lock(&EXT4_I(inode)->i_es_lock); - err = __es_remove_extent(inode, lblk, end, &reserved, es); + err = __es_remove_extent(inode, lblk, end, 0, &reserved, NULL, es); if (err) goto error; /* Free preallocated extent if it didn't get used. */ @@ -2174,7 +2241,7 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk, } write_lock(&EXT4_I(inode)->i_es_lock); - err1 = __es_remove_extent(inode, lblk, end, NULL, es1); + err1 = __es_remove_extent(inode, lblk, end, 0, NULL, NULL, es1); if (err1 != 0) goto error; /* Free preallocated extent if it didn't get used. */ diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index fa66b08de999..f575751f1cae 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -231,16 +231,16 @@ static bool ext4_fc_disabled(struct super_block *sb) void ext4_fc_del(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_fc_dentry_update *fc_dentry; wait_queue_head_t *wq; + int alloc_ctx; if (ext4_fc_disabled(inode->i_sb)) return; - mutex_lock(&sbi->s_fc_lock); + alloc_ctx = ext4_fc_lock(inode->i_sb); if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { - mutex_unlock(&sbi->s_fc_lock); + ext4_fc_unlock(inode->i_sb, alloc_ctx); return; } @@ -275,9 +275,9 @@ void ext4_fc_del(struct inode *inode) #endif prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { - mutex_unlock(&sbi->s_fc_lock); + ext4_fc_unlock(inode->i_sb, alloc_ctx); schedule(); - mutex_lock(&sbi->s_fc_lock); + alloc_ctx = 
ext4_fc_lock(inode->i_sb); } finish_wait(wq, &wait.wq_entry); } @@ -288,7 +288,7 @@ void ext4_fc_del(struct inode *inode) * dentry create references, since it is not needed to log it anyways. */ if (list_empty(&ei->i_fc_dilist)) { - mutex_unlock(&sbi->s_fc_lock); + ext4_fc_unlock(inode->i_sb, alloc_ctx); return; } @@ -298,7 +298,7 @@ void ext4_fc_del(struct inode *inode) list_del_init(&fc_dentry->fcd_dilist); WARN_ON(!list_empty(&ei->i_fc_dilist)); - mutex_unlock(&sbi->s_fc_lock); + ext4_fc_unlock(inode->i_sb, alloc_ctx); release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); @@ -315,6 +315,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl tid_t tid; bool has_transaction = true; bool is_ineligible; + int alloc_ctx; if (ext4_fc_disabled(sb)) return; @@ -329,12 +330,12 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl has_transaction = false; read_unlock(&sbi->s_journal->j_state_lock); } - mutex_lock(&sbi->s_fc_lock); + alloc_ctx = ext4_fc_lock(sb); is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid))) sbi->s_fc_ineligible_tid = tid; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); - mutex_unlock(&sbi->s_fc_lock); + ext4_fc_unlock(sb, alloc_ctx); WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } @@ -358,6 +359,7 @@ static int ext4_fc_track_template( struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); tid_t tid = 0; + int alloc_ctx; int ret; tid = handle->h_transaction->t_tid; @@ -373,14 +375,14 @@ static int ext4_fc_track_template( if (!enqueue) return ret; - mutex_lock(&sbi->s_fc_lock); + alloc_ctx = ext4_fc_lock(inode->i_sb); if (list_empty(&EXT4_I(inode)->i_fc_list)) list_add_tail(&EXT4_I(inode)->i_fc_list, (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || 
sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? &sbi->s_fc_q[FC_Q_STAGING] : &sbi->s_fc_q[FC_Q_MAIN]); - mutex_unlock(&sbi->s_fc_lock); + ext4_fc_unlock(inode->i_sb, alloc_ctx); return ret; } @@ -402,6 +404,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, struct inode *dir = dentry->d_parent->d_inode; struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); + int alloc_ctx; spin_unlock(&ei->i_fc_lock); @@ -425,7 +428,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, take_dentry_name_snapshot(&node->fcd_name, dentry); INIT_LIST_HEAD(&node->fcd_dilist); INIT_LIST_HEAD(&node->fcd_list); - mutex_lock(&sbi->s_fc_lock); + alloc_ctx = ext4_fc_lock(sb); if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) list_add_tail(&node->fcd_list, @@ -446,7 +449,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, WARN_ON(!list_empty(&ei->i_fc_dilist)); list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist); } - mutex_unlock(&sbi->s_fc_lock); + ext4_fc_unlock(sb, alloc_ctx); spin_lock(&ei->i_fc_lock); return 0; @@ -1046,18 +1049,19 @@ static int ext4_fc_perform_commit(journal_t *journal) struct blk_plug plug; int ret = 0; u32 crc = 0; + int alloc_ctx; /* * Step 1: Mark all inodes on s_fc_q[MAIN] with * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being * freed until the data flush is over. */ - mutex_lock(&sbi->s_fc_lock); + alloc_ctx = ext4_fc_lock(sb); list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_set_inode_state(&iter->vfs_inode, EXT4_STATE_FC_FLUSHING_DATA); } - mutex_unlock(&sbi->s_fc_lock); + ext4_fc_unlock(sb, alloc_ctx); /* Step 2: Flush data for all the eligible inodes. */ ret = ext4_fc_flush_data(journal); @@ -1067,7 +1071,7 @@ static int ext4_fc_perform_commit(journal_t *journal) * any error from step 2. 
This ensures that waiters waiting on * EXT4_STATE_FC_FLUSHING_DATA can resume. */ - mutex_lock(&sbi->s_fc_lock); + alloc_ctx = ext4_fc_lock(sb); list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_clear_inode_state(&iter->vfs_inode, EXT4_STATE_FC_FLUSHING_DATA); @@ -1084,7 +1088,7 @@ static int ext4_fc_perform_commit(journal_t *journal) * prepare_to_wait() in ext4_fc_del(). */ smp_mb(); - mutex_unlock(&sbi->s_fc_lock); + ext4_fc_unlock(sb, alloc_ctx); /* * If we encountered error in Step 2, return it now after clearing @@ -1101,12 +1105,12 @@ static int ext4_fc_perform_commit(journal_t *journal) * previous handles are now drained. We now mark the inodes on the * commit queue as being committed. */ - mutex_lock(&sbi->s_fc_lock); + alloc_ctx = ext4_fc_lock(sb); list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_set_inode_state(&iter->vfs_inode, EXT4_STATE_FC_COMMITTING); } - mutex_unlock(&sbi->s_fc_lock); + ext4_fc_unlock(sb, alloc_ctx); jbd2_journal_unlock_updates(journal); /* @@ -1117,6 +1121,7 @@ static int ext4_fc_perform_commit(journal_t *journal) blkdev_issue_flush(journal->j_fs_dev); blk_start_plug(&plug); + alloc_ctx = ext4_fc_lock(sb); /* Step 6: Write fast commit blocks to disk. */ if (sbi->s_fc_bytes == 0) { /* @@ -1134,7 +1139,6 @@ static int ext4_fc_perform_commit(journal_t *journal) } /* Step 6.2: Now write all the dentry updates. 
*/ - mutex_lock(&sbi->s_fc_lock); ret = ext4_fc_commit_dentry_updates(journal, &crc); if (ret) goto out; @@ -1156,7 +1160,7 @@ static int ext4_fc_perform_commit(journal_t *journal) ret = ext4_fc_write_tail(sb, crc); out: - mutex_unlock(&sbi->s_fc_lock); + ext4_fc_unlock(sb, alloc_ctx); blk_finish_plug(&plug); return ret; } @@ -1290,6 +1294,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *ei; struct ext4_fc_dentry_update *fc_dentry; + int alloc_ctx; if (full && sbi->s_fc_bh) sbi->s_fc_bh = NULL; @@ -1297,7 +1302,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) trace_ext4_fc_cleanup(journal, full, tid); jbd2_fc_release_bufs(journal); - mutex_lock(&sbi->s_fc_lock); + alloc_ctx = ext4_fc_lock(sb); while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) { ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN], struct ext4_inode_info, @@ -1356,7 +1361,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) if (full) sbi->s_fc_bytes = 0; - mutex_unlock(&sbi->s_fc_lock); + ext4_fc_unlock(sb, alloc_ctx); trace_ext4_fc_stats(sb); } @@ -2302,6 +2307,9 @@ static const char * const fc_ineligible_reasons[] = { [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op", [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling", [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename", + [EXT4_FC_REASON_MIGRATE] = "Inode format migration", + [EXT4_FC_REASON_VERITY] = "fs-verity enable", + [EXT4_FC_REASON_MOVE_EXT] = "Move extents", }; int ext4_fc_info_show(struct seq_file *seq, void *v) diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 3bd534e4dbbf..2f77a37fb101 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -97,6 +97,9 @@ enum { EXT4_FC_REASON_FALLOC_RANGE, EXT4_FC_REASON_INODE_JOURNAL_DATA, EXT4_FC_REASON_ENCRYPTED_FILENAME, + EXT4_FC_REASON_MIGRATE, + EXT4_FC_REASON_VERITY, + EXT4_FC_REASON_MOVE_EXT, EXT4_FC_REASON_MAX }; diff --git 
a/fs/ext4/file.c b/fs/ext4/file.c index 534cf864101f..4320ebff74f3 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -419,22 +419,20 @@ static const struct iomap_dio_ops ext4_dio_write_ops = { * updating inode i_disksize and/or orphan handling with exclusive lock. * * - shared locking will only be true mostly with overwrites, including - * initialized blocks and unwritten blocks. For overwrite unwritten blocks - * we protect splitting extents by i_data_sem in ext4_inode_info, so we can - * also release exclusive i_rwsem lock. + * initialized blocks and unwritten blocks. * * - Otherwise we will switch to exclusive i_rwsem lock. */ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, bool *ilock_shared, bool *extend, - bool *unwritten, int *dio_flags) + int *dio_flags) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); loff_t offset; size_t count; ssize_t ret; - bool overwrite, unaligned_io; + bool overwrite, unaligned_io, unwritten; restart: ret = ext4_generic_write_checks(iocb, from); @@ -446,7 +444,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, unaligned_io = ext4_unaligned_io(inode, from, offset); *extend = ext4_extending_io(inode, offset, count); - overwrite = ext4_overwrite_io(inode, offset, count, unwritten); + overwrite = ext4_overwrite_io(inode, offset, count, &unwritten); /* * Determine whether we need to upgrade to an exclusive lock. 
This is @@ -461,7 +459,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, */ if (*ilock_shared && ((!IS_NOSEC(inode) || *extend || !overwrite || - (unaligned_io && *unwritten)))) { + (unaligned_io && unwritten)))) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; @@ -484,7 +482,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, ret = -EAGAIN; goto out; } - if (unaligned_io && (!overwrite || *unwritten)) + if (unaligned_io && (!overwrite || unwritten)) inode_dio_wait(inode); *dio_flags = IOMAP_DIO_FORCE_WAIT; } @@ -509,8 +507,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); loff_t offset = iocb->ki_pos; size_t count = iov_iter_count(from); - const struct iomap_ops *iomap_ops = &ext4_iomap_ops; - bool extend = false, unwritten = false; + bool extend = false; bool ilock_shared = true; int dio_flags = 0; @@ -556,7 +553,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend, - &unwritten, &dio_flags); + &dio_flags); if (ret <= 0) return ret; @@ -576,9 +573,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; } - if (ilock_shared && !unwritten) - iomap_ops = &ext4_iomap_overwrite_ops; - ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, + ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops, dio_flags, NULL, 0); if (ret == -ENOTBLK) ret = 0; @@ -859,7 +854,6 @@ static int ext4_sample_last_mounted(struct super_block *sb, * when trying to sort through large numbers of block * devices or filesystem images. 
*/ - memset(buf, 0, sizeof(buf)); path.mnt = mnt; path.dentry = mnt->mnt_root; cp = d_path(&path, buf, sizeof(buf)); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0c466ccbed69..15ba4d42982f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -48,6 +48,8 @@ #include "acl.h" #include "truncate.h" +#include + #include static void ext4_journalled_zero_new_buffers(handle_t *handle, @@ -400,6 +402,8 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, { int ret; + KUNIT_STATIC_STUB_REDIRECT(ext4_issue_zeroout, inode, lblk, pblk, len); + if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) return fscrypt_zeroout_range(inode, lblk, pblk, len); @@ -503,8 +507,8 @@ static int ext4_map_query_blocks_next_in_leaf(handle_t *handle, retval = ext4_ext_map_blocks(handle, inode, &map2, 0); if (retval <= 0) { - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status, false); + ext4_es_cache_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); return map->m_len; } @@ -525,20 +529,20 @@ static int ext4_map_query_blocks_next_in_leaf(handle_t *handle, */ if (map->m_pblk + map->m_len == map2.m_pblk && status == status2) { - ext4_es_insert_extent(inode, map->m_lblk, - map->m_len + map2.m_len, map->m_pblk, - status, false); + ext4_es_cache_extent(inode, map->m_lblk, + map->m_len + map2.m_len, map->m_pblk, + status); map->m_len += map2.m_len; } else { - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status, false); + ext4_es_cache_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); } return map->m_len; } -static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map, int flags) +int ext4_map_query_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) { unsigned int status; int retval; @@ -573,8 +577,8 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, map->m_len == orig_mlen) { status = 
map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status, false); + ext4_es_cache_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); } else { retval = ext4_map_query_blocks_next_in_leaf(handle, inode, map, orig_mlen); @@ -584,10 +588,9 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, return retval; } -static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map, int flags) +int ext4_map_create_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) { - struct extent_status es; unsigned int status; int err, retval = 0; @@ -648,16 +651,6 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, return err; } - /* - * If the extent has been zeroed out, we don't need to update - * extent status tree. - */ - if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE && - ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) { - if (ext4_es_is_written(&es)) - return retval; - } - status = map->m_flags & EXT4_MAP_UNWRITTEN ? 
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, @@ -2375,7 +2368,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) - get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; + get_blocks_flags |= EXT4_GET_BLOCKS_UNWRIT_EXT; err = ext4_map_blocks(handle, inode, map, get_blocks_flags); if (err < 0) @@ -3740,7 +3733,7 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, else if (EXT4_LBLK_TO_B(inode, map->m_lblk) >= i_size_read(inode)) m_flags = EXT4_GET_BLOCKS_CREATE; else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; + m_flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; if (flags & IOMAP_ATOMIC) ret = ext4_map_blocks_atomic_write(handle, inode, map, m_flags, @@ -3812,22 +3805,25 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (offset + length <= i_size_read(inode)) { ret = ext4_map_blocks(NULL, inode, &map, 0); /* - * For atomic writes the entire requested length should - * be mapped. + * For DAX we convert extents to initialized ones before + * copying the data, otherwise we do it after I/O so + * there's no need to call into ext4_iomap_alloc(). */ - if (map.m_flags & EXT4_MAP_MAPPED) { - if ((!(flags & IOMAP_ATOMIC) && ret > 0) || - (flags & IOMAP_ATOMIC && ret >= orig_mlen)) + if ((map.m_flags & EXT4_MAP_MAPPED) || + (!(flags & IOMAP_DAX) && + (map.m_flags & EXT4_MAP_UNWRITTEN))) { + /* + * For atomic writes the entire requested + * length should be mapped. + */ + if (ret == orig_mlen || + (!(flags & IOMAP_ATOMIC) && ret > 0)) goto out; } map.m_len = orig_mlen; } ret = ext4_iomap_alloc(inode, &map, flags); } else { - /* - * This can be called for overwrites path from - * ext4_iomap_overwrite_begin(). 
- */ ret = ext4_map_blocks(NULL, inode, &map, 0); } @@ -3856,30 +3852,10 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, return 0; } -static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, - loff_t length, unsigned flags, struct iomap *iomap, - struct iomap *srcmap) -{ - int ret; - - /* - * Even for writes we don't need to allocate blocks, so just pretend - * we are reading to save overhead of starting a transaction. - */ - flags &= ~IOMAP_WRITE; - ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap); - WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED); - return ret; -} - const struct iomap_ops ext4_iomap_ops = { .iomap_begin = ext4_iomap_begin, }; -const struct iomap_ops ext4_iomap_overwrite_ops = { - .iomap_begin = ext4_iomap_overwrite_begin, -}; - static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) @@ -4133,9 +4109,13 @@ static int __ext4_block_zero_page_range(handle_t *handle, if (ext4_should_journal_data(inode)) { err = ext4_dirty_journalled_data(handle, bh); } else { - err = 0; mark_buffer_dirty(bh); - if (ext4_should_order_data(inode)) + /* + * Only the written block requires ordered data to prevent + * exposing stale data. 
+ */ + if (!buffer_unwritten(bh) && !buffer_delay(bh) && + ext4_should_order_data(inode)) err = ext4_jbd2_inode_add_write(handle, inode, from, length); } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index ea26cd03d3ce..3ae9cb50a0c0 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -968,6 +968,7 @@ static long ext4_ioctl_group_add(struct file *file, err = ext4_group_add(sb, input); if (EXT4_SB(sb)->s_journal) { + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, NULL); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); @@ -1613,6 +1614,8 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); if (EXT4_SB(sb)->s_journal) { + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, + NULL); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c index a9416b20ff64..4abb40d4561c 100644 --- a/fs/ext4/mballoc-test.c +++ b/fs/ext4/mballoc-test.c @@ -567,7 +567,7 @@ test_mark_diskspace_used_range(struct kunit *test, bitmap = mbt_ctx_bitmap(sb, TEST_GOAL_GROUP); memset(bitmap, 0, sb->s_blocksize); - ret = ext4_mb_mark_diskspace_used(ac, NULL, 0); + ret = ext4_mb_mark_diskspace_used(ac, NULL); KUNIT_ASSERT_EQ(test, ret, 0); max = EXT4_CLUSTERS_PER_GROUP(sb); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e817a758801d..b99d1a7e580e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -892,6 +892,21 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) } } +static ext4_group_t ext4_get_allocation_groups_count( + struct ext4_allocation_context *ac) +{ + ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb); + + /* non-extent files are limited to low blocks/groups */ + if 
(!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) + ngroups = EXT4_SB(ac->ac_sb)->s_blockfile_groups; + + /* Pairs with smp_wmb() in ext4_update_super() */ + smp_rmb(); + + return ngroups; +} + static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac, struct xarray *xa, ext4_group_t start, ext4_group_t end) @@ -899,7 +914,7 @@ static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac, struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); enum criteria cr = ac->ac_criteria; - ext4_group_t ngroups = ext4_get_groups_count(sb); + ext4_group_t ngroups = ext4_get_allocation_groups_count(ac); unsigned long group = start; struct ext4_group_info *grp; @@ -951,7 +966,7 @@ static int ext4_mb_scan_groups_p2_aligned(struct ext4_allocation_context *ac, ext4_group_t start, end; start = group; - end = ext4_get_groups_count(ac->ac_sb); + end = ext4_get_allocation_groups_count(ac); wrap_around: for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { ret = ext4_mb_scan_groups_largest_free_order_range(ac, i, @@ -1001,7 +1016,7 @@ static int ext4_mb_scan_groups_goal_fast(struct ext4_allocation_context *ac, ext4_group_t start, end; start = group; - end = ext4_get_groups_count(ac->ac_sb); + end = ext4_get_allocation_groups_count(ac); wrap_around: i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); for (; i < MB_NUM_ORDERS(ac->ac_sb); i++) { @@ -1083,7 +1098,7 @@ static int ext4_mb_scan_groups_best_avail(struct ext4_allocation_context *ac, min_order = fls(ac->ac_o_ex.fe_len); start = group; - end = ext4_get_groups_count(ac->ac_sb); + end = ext4_get_allocation_groups_count(ac); wrap_around: for (i = order; i >= min_order; i--) { int frag_order; @@ -1133,8 +1148,6 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac) return 0; if (ac->ac_criteria >= CR_GOAL_LEN_SLOW) return 0; - if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) - return 0; return 1; } @@ -1182,11 +1195,7 @@ 
static int ext4_mb_scan_groups(struct ext4_allocation_context *ac) int ret = 0; ext4_group_t start; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb); - - /* non-extent files are limited to low blocks/groups */ - if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) - ngroups = sbi->s_blockfile_groups; + ext4_group_t ngroups = ext4_get_allocation_groups_count(ac); /* searching for the right group start from the goal value specified */ start = ac->ac_g_ex.fe_group; @@ -1706,16 +1715,17 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, /* Avoid locking the folio in the fast path ... */ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); - if (IS_ERR(folio) || !folio_test_uptodate(folio)) { + if (IS_ERR(folio) || !folio_test_uptodate(folio) || folio_test_locked(folio)) { + /* + * folio_test_locked is employed to detect ongoing folio + * migrations, since concurrent migrations can lead to + * bitmap inconsistency. And if we are not uptodate that + * implies somebody just created the folio but is yet to + * initialize it. We can drop the folio reference and + * try to get the folio with lock in both cases to avoid + * concurrency. + */ if (!IS_ERR(folio)) - /* - * drop the folio reference and try - * to get the folio with lock. If we - * are not uptodate that implies - * somebody just created the folio but - * is yet to initialize it. So - * wait for it to initialize. 
- */ folio_put(folio); folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); @@ -1764,7 +1774,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, /* we need another folio for the buddy */ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); - if (IS_ERR(folio) || !folio_test_uptodate(folio)) { + if (IS_ERR(folio) || !folio_test_uptodate(folio) || folio_test_locked(folio)) { if (!IS_ERR(folio)) folio_put(folio); folio = __filemap_get_folio(inode->i_mapping, pnum, @@ -4185,8 +4195,7 @@ ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state, * Returns 0 if success or error code */ static noinline_for_stack int -ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, - handle_t *handle, unsigned int reserv_clstrs) +ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, handle_t *handle) { struct ext4_group_desc *gdp; struct ext4_sb_info *sbi; @@ -4241,13 +4250,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, BUG_ON(changed != ac->ac_b_ex.fe_len); #endif percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); - /* - * Now reduce the dirty block count also. 
Should not go negative - */ - if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) - /* release all the reserved blocks if non delalloc */ - percpu_counter_sub(&sbi->s_dirtyclusters_counter, - reserv_clstrs); return err; } @@ -6331,7 +6333,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ext4_mb_pa_put_free(ac); } if (likely(ac->ac_status == AC_STATUS_FOUND)) { - *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); + *errp = ext4_mb_mark_diskspace_used(ac, handle); if (*errp) { ext4_discard_allocated_blocks(ac); goto errout; @@ -6362,12 +6364,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, out: if (inquota && ar->len < inquota) dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); - if (!ar->len) { - if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) - /* release all the reserved blocks if non delalloc */ - percpu_counter_sub(&sbi->s_dirtyclusters_counter, - reserv_clstrs); - } + /* release any reserved blocks */ + if (reserv_clstrs) + percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs); trace_ext4_allocate_blocks(ar, (unsigned long long)block); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 1b0dfd963d3f..96ab95167bd6 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -449,6 +449,12 @@ int ext4_ext_migrate(struct inode *inode) retval = PTR_ERR(handle); goto out_unlock; } + /* + * This operation rewrites the inode's block mapping layout + * (indirect to extents) and is not tracked in the fast commit + * log, so disable fast commits for this transaction. 
+ */ + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_MIGRATE, handle); goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; owner[0] = i_uid_read(inode); @@ -630,6 +636,12 @@ int ext4_ind_migrate(struct inode *inode) ret = PTR_ERR(handle); goto out_unlock; } + /* + * This operation rewrites the inode's block mapping layout + * (extents to indirect blocks) and is not tracked in the fast + * commit log, so disable fast commits for this transaction. + */ + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_MIGRATE, handle); down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_ext_check_inode(inode); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 635fb8a52e0c..ce1f738dff93 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -321,6 +321,8 @@ static int mext_move_extent(struct mext_data *mext, u64 *m_len) ret = PTR_ERR(handle); goto out; } + ext4_fc_mark_ineligible(orig_inode->i_sb, EXT4_FC_REASON_MOVE_EXT, + handle); ret = mext_move_begin(mext, folio, &move_type); if (ret) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3c73b982a4f7..69eb63dde983 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3650,10 +3650,12 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly) } /* - * This function is called once a day if we have errors logged - * on the file system + * This function is called once a day by default if we have errors logged + * on the file system. + * Use the err_report_sec sysfs attribute to disable or adjust its call + * frequency. 
*/ -static void print_daily_error_info(struct timer_list *t) +void print_daily_error_info(struct timer_list *t) { struct ext4_sb_info *sbi = timer_container_of(sbi, t, s_err_report); struct super_block *sb = sbi->s_sb; @@ -3693,7 +3695,9 @@ static void print_daily_error_info(struct timer_list *t) le64_to_cpu(es->s_last_error_block)); printk(KERN_CONT "\n"); } - mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ + + if (sbi->s_err_report_sec) + mod_timer(&sbi->s_err_report, jiffies + secs_to_jiffies(sbi->s_err_report_sec)); } /* Find next suitable group and run ext4_init_inode_table */ @@ -5616,6 +5620,10 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) clear_opt2(sb, MB_OPTIMIZE_SCAN); } + err = ext4_percpu_param_init(sbi); + if (err) + goto failed_mount5; + err = ext4_mb_init(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", @@ -5631,10 +5639,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; - err = ext4_percpu_param_init(sbi); - if (err) - goto failed_mount6; - if (ext4_has_feature_flex_bg(sb)) if (!ext4_fill_flex_info(sb)) { ext4_msg(sb, KERN_ERR, @@ -5690,8 +5694,12 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) clear_opt(sb, DISCARD); } - if (es->s_error_count) - mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ + if (es->s_error_count) { + sbi->s_err_report_sec = 5*60; /* first time 5 minutes */ + mod_timer(&sbi->s_err_report, + jiffies + secs_to_jiffies(sbi->s_err_report_sec)); + } + sbi->s_err_report_sec = 24*60*60; /* Once a day */ /* Enable message ratelimiting. Default is 10 messages per 5 secs. 
*/ ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10); @@ -5716,8 +5724,8 @@ failed_mount8: __maybe_unused failed_mount6: ext4_mb_release(sb); ext4_flex_groups_free(sbi); - ext4_percpu_param_destroy(sbi); failed_mount5: + ext4_percpu_param_destroy(sbi); ext4_ext_release(sb); ext4_release_system_zone(sb); failed_mount4a: @@ -6237,10 +6245,11 @@ static void ext4_update_super(struct super_block *sb) ext4_errno_to_code(sbi->s_last_error_code); /* * Start the daily error reporting function if it hasn't been - * started already + * started already and sbi->s_err_report_sec is not zero */ - if (!es->s_error_count) - mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); + if (!es->s_error_count && sbi->s_err_report_sec) + mod_timer(&sbi->s_err_report, + jiffies + secs_to_jiffies(sbi->s_err_report_sec)); le32_add_cpu(&es->s_error_count, sbi->s_add_error_count); sbi->s_add_error_count = 0; } diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 0018e09b867e..d2ecc1026c0c 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -40,6 +40,7 @@ typedef enum { attr_pointer_string, attr_pointer_atomic, attr_journal_task, + attr_err_report_sec, } attr_id_t; typedef enum { @@ -130,6 +131,36 @@ static ssize_t trigger_test_error(struct ext4_sb_info *sbi, return count; } +static ssize_t err_report_sec_store(struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned long t; + int ret; + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + + /* the maximum time interval must not exceed one year. */ + if (t > (365*24*60*60)) + return -EINVAL; + + if (sbi->s_err_report_sec == t) /* nothing to do */ + goto out; + else if (!sbi->s_err_report_sec && t) { + timer_setup(&sbi->s_err_report, print_daily_error_info, 0); + } else if (sbi->s_err_report_sec && !t) { + timer_delete_sync(&sbi->s_err_report); + goto out; + } + + sbi->s_err_report_sec = t; + mod_timer(&sbi->s_err_report, jiffies + secs_to_jiffies(sbi->s_err_report_sec)); + +out: + return count; +} + 
static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf) { if (!sbi->s_journal) @@ -217,6 +248,7 @@ EXT4_ATTR_OFFSET(mb_group_prealloc, 0644, clusters_in_group, ext4_sb_info, s_mb_group_prealloc); EXT4_ATTR_OFFSET(mb_best_avail_max_trim_order, 0644, mb_order, ext4_sb_info, s_mb_best_avail_max_trim_order); +EXT4_ATTR_OFFSET(err_report_sec, 0644, err_report_sec, ext4_sb_info, s_err_report_sec); EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); @@ -309,6 +341,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(last_trim_minblks), ATTR_LIST(sb_update_sec), ATTR_LIST(sb_update_kb), + ATTR_LIST(err_report_sec), NULL, }; ATTRIBUTE_GROUPS(ext4); @@ -402,6 +435,7 @@ static ssize_t ext4_generic_attr_show(struct ext4_attr *a, return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr)); return sysfs_emit(buf, "%u\n", *((unsigned int *) ptr)); case attr_pointer_ul: + case attr_err_report_sec: return sysfs_emit(buf, "%lu\n", *((unsigned long *) ptr)); case attr_pointer_u8: return sysfs_emit(buf, "%u\n", *((unsigned char *) ptr)); @@ -525,6 +559,8 @@ static ssize_t ext4_attr_store(struct kobject *kobj, return inode_readahead_blks_store(sbi, buf, len); case attr_trigger_test_error: return trigger_test_error(sbi, buf, len); + case attr_err_report_sec: + return err_report_sec_store(sbi, buf, len); default: return ext4_generic_attr_store(a, sbi, buf, len); } diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index 415d9c4d8a32..667f9e8d4da9 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -231,6 +231,8 @@ static int ext4_end_enable_verity(struct file *filp, const void *desc, goto cleanup; } + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_VERITY, handle); + err = ext4_orphan_del(handle, inode); if (err) goto stop_and_cleanup; diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index fd76d14c2776..a3e8fe414df8 100644 --- a/include/trace/events/ext4.h +++ 
b/include/trace/events/ext4.h @@ -102,6 +102,9 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_RENAME_DIR); TRACE_DEFINE_ENUM(EXT4_FC_REASON_FALLOC_RANGE); TRACE_DEFINE_ENUM(EXT4_FC_REASON_INODE_JOURNAL_DATA); TRACE_DEFINE_ENUM(EXT4_FC_REASON_ENCRYPTED_FILENAME); +TRACE_DEFINE_ENUM(EXT4_FC_REASON_MIGRATE); +TRACE_DEFINE_ENUM(EXT4_FC_REASON_VERITY); +TRACE_DEFINE_ENUM(EXT4_FC_REASON_MOVE_EXT); TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX); #define show_fc_reason(reason) \ @@ -115,7 +118,10 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX); { EXT4_FC_REASON_RENAME_DIR, "RENAME_DIR"}, \ { EXT4_FC_REASON_FALLOC_RANGE, "FALLOC_RANGE"}, \ { EXT4_FC_REASON_INODE_JOURNAL_DATA, "INODE_JOURNAL_DATA"}, \ - { EXT4_FC_REASON_ENCRYPTED_FILENAME, "ENCRYPTED_FILENAME"}) + { EXT4_FC_REASON_ENCRYPTED_FILENAME, "ENCRYPTED_FILENAME"}, \ + { EXT4_FC_REASON_MIGRATE, "MIGRATE"}, \ + { EXT4_FC_REASON_VERITY, "VERITY"}, \ + { EXT4_FC_REASON_MOVE_EXT, "MOVE_EXT"}) TRACE_DEFINE_ENUM(CR_POWER2_ALIGNED); TRACE_DEFINE_ENUM(CR_GOAL_LEN_FAST);