From 22784ca541c0f01c5ebad14e8228298dc0a390ed Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Sat, 29 Nov 2025 18:32:33 +0800 Subject: [PATCH 01/45] ext4: subdivide EXT4_EXT_DATA_VALID1 When splitting an extent, if the EXT4_GET_BLOCKS_CONVERT flag is set and it is necessary to split the target extent in the middle, ext4_split_extent() first handles splitting the latter half of the extent and passes the EXT4_EXT_DATA_VALID1 flag. This flag implies that all blocks before the split point contain valid data; however, this assumption is incorrect. Therefore, subdivid EXT4_EXT_DATA_VALID1 into EXT4_EXT_DATA_ENTIRE_VALID1 and EXT4_EXT_DATA_PARTIAL_VALID1, which indicate that the first half of the extent is either entirely valid or only partially valid, respectively. These two flags cannot be set simultaneously. This patch does not use EXT4_EXT_DATA_PARTIAL_VALID1, it only replaces EXT4_EXT_DATA_VALID1 with EXT4_EXT_DATA_ENTIRE_VALID1 at the location where it is set, no logical changes. Signed-off-by: Zhang Yi Reviewed-by: Ojaswin Mujoo Reviewed-by: Baokun Li Cc: stable@kernel.org Message-ID: <20251129103247.686136-2-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2cf5759ba689..8d5ca450aa5d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -43,8 +43,13 @@ #define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */ #define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */ -#define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */ -#define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */ +/* first half contains valid data */ +#define EXT4_EXT_DATA_ENTIRE_VALID1 0x8 /* has entirely valid data */ +#define EXT4_EXT_DATA_PARTIAL_VALID1 0x10 /* has partially valid data */ +#define EXT4_EXT_DATA_VALID1 (EXT4_EXT_DATA_ENTIRE_VALID1 | \ + EXT4_EXT_DATA_PARTIAL_VALID1) + +#define 
EXT4_EXT_DATA_VALID2 0x20 /* second half contains valid data */ static __le32 ext4_extent_block_csum(struct inode *inode, struct ext4_extent_header *eh) @@ -3190,8 +3195,9 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, unsigned int ee_len, depth; int err = 0; - BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) == - (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)); + BUG_ON((split_flag & EXT4_EXT_DATA_VALID1) == EXT4_EXT_DATA_VALID1); + BUG_ON((split_flag & EXT4_EXT_DATA_VALID1) && + (split_flag & EXT4_EXT_DATA_VALID2)); ext_debug(inode, "logical block %llu\n", (unsigned long long)split); @@ -3373,7 +3379,7 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle, split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2; if (split_flag & EXT4_EXT_DATA_VALID2) - split_flag1 |= EXT4_EXT_DATA_VALID1; + split_flag1 |= EXT4_EXT_DATA_ENTIRE_VALID1; path = ext4_split_extent_at(handle, inode, path, map->m_lblk + map->m_len, split_flag1, flags1); if (IS_ERR(path)) @@ -3728,7 +3734,7 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle, /* Convert to unwritten */ if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { - split_flag |= EXT4_EXT_DATA_VALID1; + split_flag |= EXT4_EXT_DATA_ENTIRE_VALID1; /* Convert to initialized */ } else if (flags & EXT4_GET_BLOCKS_CONVERT) { /* From 1bf6974822d1dba86cf11b5f05498581cf3488a2 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Sat, 29 Nov 2025 18:32:34 +0800 Subject: [PATCH 02/45] ext4: don't zero the entire extent if EXT4_EXT_DATA_PARTIAL_VALID1 When allocating initialized blocks from a large unwritten extent, or when splitting an unwritten extent during end I/O and converting it to initialized, there is currently a potential issue of stale data if the extent needs to be split in the middle. 
0 A B N [UUUUUUUUUUUU] U: unwritten extent [--DDDDDDDD--] D: valid data |<- ->| ----> this range needs to be initialized ext4_split_extent() first try to split this extent at B with EXT4_EXT_DATA_ENTIRE_VALID1 and EXT4_EXT_MAY_ZEROOUT flag set, but ext4_split_extent_at() failed to split this extent due to temporary lack of space. It zeroout B to N and mark the entire extent from 0 to N as written. 0 A B N [WWWWWWWWWWWW] W: written extent [SSDDDDDDDDZZ] Z: zeroed, S: stale data ext4_split_extent() then try to split this extent at A with EXT4_EXT_DATA_VALID2 flag set. This time, it split successfully and left a stale written extent from 0 to A. 0 A B N [WW|WWWWWWWWWW] [SS|DDDDDDDDZZ] Fix this by pass EXT4_EXT_DATA_PARTIAL_VALID1 to ext4_split_extent_at() when splitting at B, don't convert the entire extent to written and left it as unwritten after zeroing out B to N. The remaining work is just like the standard two-part split. ext4_split_extent() will pass the EXT4_EXT_DATA_VALID2 flag when it calls ext4_split_extent_at() for the second time, allowing it to properly handle the split. If the split is successful, it will keep extent from 0 to A as unwritten. Signed-off-by: Zhang Yi Reviewed-by: Ojaswin Mujoo Reviewed-by: Baokun Li Cc: stable@kernel.org Message-ID: <20251129103247.686136-3-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 8d5ca450aa5d..1fee84ea20af 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3310,6 +3310,15 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, } if (!err) { + /* + * The first half contains partially valid data, the + * splitting of this extent has not been completed, fix + * extent length and ext4_split_extent() split will the + * first half again. 
+ */ + if (split_flag & EXT4_EXT_DATA_PARTIAL_VALID1) + goto fix_extent_len; + /* update the extent length and mark as initialized */ ex->ee_len = cpu_to_le16(ee_len); ext4_ext_try_to_merge(handle, inode, path, ex); @@ -3379,7 +3388,9 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle, split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2; if (split_flag & EXT4_EXT_DATA_VALID2) - split_flag1 |= EXT4_EXT_DATA_ENTIRE_VALID1; + split_flag1 |= map->m_lblk > ee_block ? + EXT4_EXT_DATA_PARTIAL_VALID1 : + EXT4_EXT_DATA_ENTIRE_VALID1; path = ext4_split_extent_at(handle, inode, path, map->m_lblk + map->m_len, split_flag1, flags1); if (IS_ERR(path)) From feaf2a80e78f89ee8a3464126077ba8683b62791 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Sat, 29 Nov 2025 18:32:35 +0800 Subject: [PATCH 03/45] ext4: don't set EXT4_GET_BLOCKS_CONVERT when splitting before submitting I/O When allocating blocks during within-EOF DIO and writeback with dioread_nolock enabled, EXT4_GET_BLOCKS_PRE_IO was set to split an existing large unwritten extent. However, EXT4_GET_BLOCKS_CONVERT was set when calling ext4_split_convert_extents(), which may potentially result in stale data issues. Assume we have an unwritten extent, and then DIO writes the second half. [UUUUUUUUUUUUUUUU] on-disk extent U: unwritten extent [UUUUUUUUUUUUUUUU] extent status tree |<- ->| ----> dio write this range First, ext4_iomap_alloc() call ext4_map_blocks() with EXT4_GET_BLOCKS_PRE_IO, EXT4_GET_BLOCKS_UNWRIT_EXT and EXT4_GET_BLOCKS_CREATE flags set. ext4_map_blocks() find this extent and call ext4_split_convert_extents() with EXT4_GET_BLOCKS_CONVERT and the above flags set. Then, ext4_split_convert_extents() calls ext4_split_extent() with EXT4_EXT_MAY_ZEROOUT, EXT4_EXT_MARK_UNWRIT2 and EXT4_EXT_DATA_VALID2 flags set, and it calls ext4_split_extent_at() to split the second half with EXT4_EXT_DATA_VALID2, EXT4_EXT_MARK_UNWRIT1, EXT4_EXT_MAY_ZEROOUT and EXT4_EXT_MARK_UNWRIT2 flags set. 
However, ext4_split_extent_at() failed to insert extent since a temporary lack -ENOSPC. It zeroes out the first half but convert the entire on-disk extent to written since the EXT4_EXT_DATA_VALID2 flag set, but left the second half as unwritten in the extent status tree. [0000000000SSSSSS] data S: stale data, 0: zeroed [WWWWWWWWWWWWWWWW] on-disk extent W: written extent [WWWWWWWWWWUUUUUU] extent status tree Finally, if the DIO failed to write data to the disk, the stale data in the second half will be exposed once the cached extent entry is gone. Fix this issue by not passing EXT4_GET_BLOCKS_CONVERT when splitting an unwritten extent before submitting I/O, and make ext4_split_convert_extents() to zero out the entire extent range to zero for this case, and also mark the extent in the extent status tree for consistency. Fixes: b8a8684502a0 ("ext4: Introduce FALLOC_FL_ZERO_RANGE flag for fallocate") Signed-off-by: Zhang Yi Reviewed-by: Ojaswin Mujoo Reviewed-by: Baokun Li Cc: stable@kernel.org Message-ID: <20251129103247.686136-4-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 1fee84ea20af..91b56de60c90 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3746,15 +3746,19 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle, /* Convert to unwritten */ if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { split_flag |= EXT4_EXT_DATA_ENTIRE_VALID1; - /* Convert to initialized */ - } else if (flags & EXT4_GET_BLOCKS_CONVERT) { + /* Split the existing unwritten extent */ + } else if (flags & (EXT4_GET_BLOCKS_UNWRIT_EXT | + EXT4_GET_BLOCKS_CONVERT)) { /* * It is safe to convert extent to initialized via explicit * zeroout only if extent is fully inside i_size or new_size. */ split_flag |= ee_block + ee_len <= eof_block ? 
EXT4_EXT_MAY_ZEROOUT : 0; - split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); + split_flag |= EXT4_EXT_MARK_UNWRIT2; + /* Convert to initialized */ + if (flags & EXT4_GET_BLOCKS_CONVERT) + split_flag |= EXT4_EXT_DATA_VALID2; } flags |= EXT4_GET_BLOCKS_SPLIT_NOMERGE; return ext4_split_extent(handle, inode, path, map, split_flag, flags, @@ -3930,7 +3934,7 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, /* get_block() before submitting IO, split the extent */ if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE) { path = ext4_split_convert_extents(handle, inode, map, path, - flags | EXT4_GET_BLOCKS_CONVERT, allocated); + flags, allocated); if (IS_ERR(path)) return path; /* From 5f1a1cccebf87e6c51f981908cc3cc10c3bc936b Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Sat, 29 Nov 2025 18:32:36 +0800 Subject: [PATCH 04/45] ext4: correct the mapping status if the extent has been zeroed Before submitting I/O and allocating blocks with the EXT4_GET_BLOCKS_PRE_IO flag set, ext4_split_convert_extents() may convert the target extent range to initialized due to ENOSPC, ENOMEM, or EQUOTA errors. However, it still marks the mapping as incorrectly unwritten. Although this may not seem to cause any practical problems, it will result in an unnecessary extent conversion operation after I/O completion. Therefore, it's better to correct the returned mapping status. 
Signed-off-by: Zhang Yi Reviewed-by: Ojaswin Mujoo Reviewed-by: Baokun Li Message-ID: <20251129103247.686136-5-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 91b56de60c90..daecf3f0b367 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3933,6 +3933,8 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, /* get_block() before submitting IO, split the extent */ if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE) { + int depth; + path = ext4_split_convert_extents(handle, inode, map, path, flags, allocated); if (IS_ERR(path)) @@ -3948,7 +3950,13 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, err = -EFSCORRUPTED; goto errout; } - map->m_flags |= EXT4_MAP_UNWRITTEN; + /* Don't mark unwritten if the extent has been zeroed out. */ + path = ext4_find_extent(inode, map->m_lblk, path, flags); + if (IS_ERR(path)) + return path; + depth = ext_depth(inode); + if (ext4_ext_is_unwritten(path[depth].p_ext)) + map->m_flags |= EXT4_MAP_UNWRITTEN; goto out; } /* IO end_io complete, convert the filled extent to written */ From 8b4b19a2f96348d70bfa306ef7d4a13b0bcbea79 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Sat, 29 Nov 2025 18:32:37 +0800 Subject: [PATCH 05/45] ext4: don't cache extent during splitting extent Caching extents during the splitting process is risky, as it may result in stale extents remaining in the status tree. Moreover, in most cases, the corresponding extent block entries are likely already cached before the split happens, making caching here not particularly useful. Assume we have an unwritten extent, and then DIO writes the first half. 
[UUUUUUUUUUUUUUUU] on-disk extent U: unwritten extent [UUUUUUUUUUUUUUUU] extent status tree |<- ->| ----> dio write this range First, when ext4_split_extent_at() splits this extent, it truncates the existing extent and then inserts a new one. During this process, this extent status entry may be shrunk, and calls to ext4_find_extent() and ext4_cache_extents() may occur, which could potentially insert the truncated range as a hole into the extent status tree. After the split is completed, this hole is not replaced with the correct status. [UUUUUUU|UUUUUUUU] on-disk extent U: unwritten extent [UUUUUUU|HHHHHHHH] extent status tree H: hole Then, the outer calling functions will not correct this remaining hole extent either. Finally, if we perform a delayed buffer write on this latter part, it will re-insert the delayed extent and cause an error in space accounting. In adition, if the unwritten extent cache is not shrunk during the splitting, ext4_cache_extents() also conflicts with existing extents when caching extents. In the future, we will add checks when caching extents, which will trigger a warning. Therefore, Do not cache extents that are being split. Signed-off-by: Zhang Yi Reviewed-by: Ojaswin Mujoo Reviewed-by: Baokun Li Cc: stable@kernel.org Message-ID: <20251129103247.686136-6-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index daecf3f0b367..be9fd2ab8667 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3199,6 +3199,9 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, BUG_ON((split_flag & EXT4_EXT_DATA_VALID1) && (split_flag & EXT4_EXT_DATA_VALID2)); + /* Do not cache extents that are in the process of being modified. 
*/ + flags |= EXT4_EX_NOCACHE; + ext_debug(inode, "logical block %llu\n", (unsigned long long)split); ext4_ext_show_leaf(inode, path); @@ -3381,6 +3384,9 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle, ee_len = ext4_ext_get_actual_len(ex); unwritten = ext4_ext_is_unwritten(ex); + /* Do not cache extents that are in the process of being modified. */ + flags |= EXT4_EX_NOCACHE; + if (map->m_lblk + map->m_len < ee_block + ee_len) { split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; flags1 = flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE; From 6d882ea3b0931b43530d44149b79fcd4ffc13030 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Sat, 29 Nov 2025 18:32:38 +0800 Subject: [PATCH 06/45] ext4: drop extent cache after doing PARTIAL_VALID1 zeroout When splitting an unwritten extent in the middle and converting it to initialized in ext4_split_extent() with the EXT4_EXT_MAY_ZEROOUT and EXT4_EXT_DATA_VALID2 flags set, it could leave a stale unwritten extent. Assume we have an unwritten file and buffered write in the middle of it without dioread_nolock enabled, it will allocate blocks as written extent. 0 A B N [UUUUUUUUUUUU] on-disk extent U: unwritten extent [UUUUUUUUUUUU] extent status tree [--DDDDDDDD--] D: valid data |<- ->| ----> this range needs to be initialized ext4_split_extent() first try to split this extent at B with EXT4_EXT_DATA_PARTIAL_VALID1 and EXT4_EXT_MAY_ZEROOUT flag set, but ext4_split_extent_at() failed to split this extent due to temporary lack of space. It zeroout B to N and leave the entire extent as unwritten. 0 A B N [UUUUUUUUUUUU] on-disk extent [UUUUUUUUUUUU] extent status tree [--DDDDDDDDZZ] Z: zeroed data ext4_split_extent() then try to split this extent at A with EXT4_EXT_DATA_VALID2 flag set. This time, it split successfully and leave an written extent from A to N. 
0 A B N [UUWWWWWWWWWW] on-disk extent W: written extent [UUUUUUUUUUUU] extent status tree [--DDDDDDDDZZ] Finally ext4_map_create_blocks() only insert extent A to B to the extent status tree, and leave an stale unwritten extent in the status tree. 0 A B N [UUWWWWWWWWWW] on-disk extent W: written extent [UUWWWWWWWWUU] extent status tree [--DDDDDDDDZZ] Fix this issue by always cached extent status entry after zeroing out the second part. Signed-off-by: Zhang Yi Reviewed-by: Baokun Li Cc: stable@kernel.org Reviewed-by: Ojaswin Mujoo Message-ID: <20251129103247.686136-7-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index be9fd2ab8667..1094e4923451 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3319,8 +3319,16 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, * extent length and ext4_split_extent() split will the * first half again. */ - if (split_flag & EXT4_EXT_DATA_PARTIAL_VALID1) + if (split_flag & EXT4_EXT_DATA_PARTIAL_VALID1) { + /* + * Drop extent cache to prevent stale unwritten + * extents remaining after zeroing out. + */ + ext4_es_remove_extent(inode, + le32_to_cpu(zero_ex.ee_block), + ext4_ext_get_actual_len(&zero_ex)); goto fix_extent_len; + } /* update the extent length and mark as initialized */ ex->ee_len = cpu_to_le16(ee_len); From 79b592e8f1b435796cbc2722190368e3e8ffd7a1 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Sat, 29 Nov 2025 18:32:39 +0800 Subject: [PATCH 07/45] ext4: drop extent cache when splitting extent fails When the split extent fails, we might leave some extents still being processed and return an error directly, which will result in stale extent entries remaining in the extent status tree. So drop all of the remaining potentially stale extents if the splitting fails. 
Signed-off-by: Zhang Yi Reviewed-by: Baokun Li Cc: stable@kernel.org Reviewed-by: Ojaswin Mujoo Message-ID: <20251129103247.686136-8-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 1094e4923451..945995d68c4d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3267,7 +3267,7 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, err = PTR_ERR(path); if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM) - return path; + goto out_path; /* * Get a new path to try to zeroout or fix the extent length. @@ -3281,7 +3281,7 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, if (IS_ERR(path)) { EXT4_ERROR_INODE(inode, "Failed split extent on %u, err %ld", split, PTR_ERR(path)); - return path; + goto out_path; } depth = ext_depth(inode); ex = path[depth].p_ext; @@ -3358,6 +3358,10 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, ext4_free_ext_path(path); path = ERR_PTR(err); } +out_path: + if (IS_ERR(path)) + /* Remove all remaining potentially stale extents. */ + ext4_es_remove_extent(inode, ee_block, ee_len); ext4_ext_show_leaf(inode, path); return path; } From c0329d0288dee6be26e5eca71fdc520c1c5e6acf Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Sat, 29 Nov 2025 18:32:40 +0800 Subject: [PATCH 08/45] ext4: cleanup zeroout in ext4_split_extent_at() zero_ex is a temporary variable used only for writing zeros and inserting extent status entry, it will not be directly inserted into the tree. Therefore, it can be assigned values from the target extent in various scenarios, eliminating the need to explicitly assign values to each variable individually. 
Signed-off-by: Zhang Yi Reviewed-by: Ojaswin Mujoo Reviewed-by: Jan Kara Reviewed-by: Baokun Li Message-ID: <20251129103247.686136-9-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 87 ++++++++++++++++++++--------------------------- 1 file changed, 36 insertions(+), 51 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 945995d68c4d..2cfce2c01208 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3287,63 +3287,48 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, ex = path[depth].p_ext; if (EXT4_EXT_MAY_ZEROOUT & split_flag) { - if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { - if (split_flag & EXT4_EXT_DATA_VALID1) { - err = ext4_ext_zeroout(inode, ex2); - zero_ex.ee_block = ex2->ee_block; - zero_ex.ee_len = cpu_to_le16( - ext4_ext_get_actual_len(ex2)); - ext4_ext_store_pblock(&zero_ex, - ext4_ext_pblock(ex2)); - } else { - err = ext4_ext_zeroout(inode, ex); - zero_ex.ee_block = ex->ee_block; - zero_ex.ee_len = cpu_to_le16( - ext4_ext_get_actual_len(ex)); - ext4_ext_store_pblock(&zero_ex, - ext4_ext_pblock(ex)); - } - } else { - err = ext4_ext_zeroout(inode, &orig_ex); - zero_ex.ee_block = orig_ex.ee_block; - zero_ex.ee_len = cpu_to_le16( - ext4_ext_get_actual_len(&orig_ex)); - ext4_ext_store_pblock(&zero_ex, - ext4_ext_pblock(&orig_ex)); - } + if (split_flag & EXT4_EXT_DATA_VALID1) + memcpy(&zero_ex, ex2, sizeof(zero_ex)); + else if (split_flag & EXT4_EXT_DATA_VALID2) + memcpy(&zero_ex, ex, sizeof(zero_ex)); + else + memcpy(&zero_ex, &orig_ex, sizeof(zero_ex)); + ext4_ext_mark_initialized(&zero_ex); - if (!err) { + err = ext4_ext_zeroout(inode, &zero_ex); + if (err) + goto fix_extent_len; + + /* + * The first half contains partially valid data, the splitting + * of this extent has not been completed, fix extent length + * and ext4_split_extent() split will the first half again. 
+ */ + if (split_flag & EXT4_EXT_DATA_PARTIAL_VALID1) { /* - * The first half contains partially valid data, the - * splitting of this extent has not been completed, fix - * extent length and ext4_split_extent() split will the - * first half again. + * Drop extent cache to prevent stale unwritten + * extents remaining after zeroing out. */ - if (split_flag & EXT4_EXT_DATA_PARTIAL_VALID1) { - /* - * Drop extent cache to prevent stale unwritten - * extents remaining after zeroing out. - */ - ext4_es_remove_extent(inode, + ext4_es_remove_extent(inode, le32_to_cpu(zero_ex.ee_block), ext4_ext_get_actual_len(&zero_ex)); - goto fix_extent_len; - } - - /* update the extent length and mark as initialized */ - ex->ee_len = cpu_to_le16(ee_len); - ext4_ext_try_to_merge(handle, inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + path->p_depth); - if (!err) - /* update extent status tree */ - ext4_zeroout_es(inode, &zero_ex); - /* If we failed at this point, we don't know in which - * state the extent tree exactly is so don't try to fix - * length of the original extent as it may do even more - * damage. - */ - goto out; + goto fix_extent_len; } + + /* update the extent length and mark as initialized */ + ex->ee_len = cpu_to_le16(ee_len); + ext4_ext_try_to_merge(handle, inode, path, ex); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (!err) + /* update extent status tree */ + ext4_zeroout_es(inode, &zero_ex); + /* + * If we failed at this point, we don't know in which + * state the extent tree exactly is so don't try to fix + * length of the original extent as it may do even more + * damage. + */ + goto out; } fix_extent_len: From ef46e0d70014cad87dc18e7d292e7c75c0ef30c8 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Sat, 29 Nov 2025 18:32:41 +0800 Subject: [PATCH 09/45] ext4: cleanup useless out label in __es_remove_extent() The out label in __es_remove_extent() is just return err value, we can return it directly if something bad happens. 
Therefore, remove the useless out label and rename out_get_reserved to out. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Reviewed-by: Baokun Li Message-ID: <20251129103247.686136-10-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index e04fbf10fe4f..04d56f8f6c0c 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1434,7 +1434,7 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status orig_es; ext4_lblk_t len1, len2; ext4_fsblk_t block; - int err = 0; + int err; bool count_reserved = true; struct rsvd_count rc; @@ -1443,9 +1443,9 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, es = __es_tree_search(&tree->root, lblk); if (!es) - goto out; + return 0; if (es->es_lblk > end) - goto out; + return 0; /* Simply invalidate cache_es. 
*/ tree->cache_es = NULL; @@ -1480,7 +1480,7 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, es->es_lblk = orig_es.es_lblk; es->es_len = orig_es.es_len; - goto out; + return err; } } else { es->es_lblk = end + 1; @@ -1494,7 +1494,7 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, if (count_reserved) count_rsvd(inode, orig_es.es_lblk + len1, orig_es.es_len - len1 - len2, &orig_es, &rc); - goto out_get_reserved; + goto out; } if (len1 > 0) { @@ -1536,11 +1536,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, } } -out_get_reserved: +out: if (count_reserved) *reserved = get_rsvd(inode, end, es, &rc); -out: - return err; + return 0; } /* From 42ad7b23b335835548fecc757d6627b67cd5f0b4 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Sat, 29 Nov 2025 18:32:42 +0800 Subject: [PATCH 10/45] ext4: make __es_remove_extent() check extent status Currently, __es_remove_extent() unconditionally removes extent status entries within the specified range. In order to prepare for extending the ext4_es_cache_extent() function to cache on-disk extents, which may overwrite some existing short-length extents with the same status, allow __es_remove_extent() to check the specified extent type before removing it, and return error and pass out the conflicting extent if the status does not match. 
Signed-off-by: Zhang Yi Reviewed-by: Baokun Li Message-ID: <20251129103247.686136-11-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 49 +++++++++++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 04d56f8f6c0c..818007bb613f 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -178,7 +178,8 @@ static struct kmem_cache *ext4_pending_cachep; static int __es_insert_extent(struct inode *inode, struct extent_status *newes, struct extent_status *prealloc); static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t end, int *reserved, + ext4_lblk_t end, unsigned int status, + int *reserved, struct extent_status *res, struct extent_status *prealloc); static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, @@ -242,6 +243,21 @@ static inline void ext4_es_inc_seq(struct inode *inode) WRITE_ONCE(ei->i_es_seq, ei->i_es_seq + 1); } +static inline int __es_check_extent_status(struct extent_status *es, + unsigned int status, + struct extent_status *res) +{ + if (ext4_es_type(es) & status) + return 0; + + if (res) { + res->es_lblk = es->es_lblk; + res->es_len = es->es_len; + res->es_pblk = es->es_pblk; + } + return -EINVAL; +} + /* * search through the tree for an delayed extent with a given offset. If * it can't be found, try to find next extent. @@ -929,7 +945,7 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, pr = __alloc_pending(true); write_lock(&EXT4_I(inode)->i_es_lock); - err1 = __es_remove_extent(inode, lblk, end, &resv_used, es1); + err1 = __es_remove_extent(inode, lblk, end, 0, &resv_used, NULL, es1); if (err1 != 0) goto error; /* Free preallocated extent if it didn't get used. 
*/ @@ -1409,23 +1425,27 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, return rc->ndelayed; } - /* * __es_remove_extent - removes block range from extent status tree * * @inode - file containing range * @lblk - first block in range * @end - last block in range + * @status - the extent status to be checked * @reserved - number of cluster reservations released + * @res - return the extent if the status is not match * @prealloc - pre-allocated es to avoid memory allocation failures * * If @reserved is not NULL and delayed allocation is enabled, counts * block/cluster reservations freed by removing range and if bigalloc - * enabled cancels pending reservations as needed. Returns 0 on success, - * error code on failure. + * enabled cancels pending reservations as needed. If @status is not + * zero, check extent status type while removing extent, return -EINVAL + * and pass out the extent through @res if not match. Returns 0 on + * success, error code on failure. */ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t end, int *reserved, + ext4_lblk_t end, unsigned int status, + int *reserved, struct extent_status *res, struct extent_status *prealloc) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; @@ -1440,6 +1460,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC)) count_reserved = false; + if (status == 0) + status = ES_TYPE_MASK; es = __es_tree_search(&tree->root, lblk); if (!es) @@ -1447,6 +1469,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, if (es->es_lblk > end) return 0; + err = __es_check_extent_status(es, status, res); + if (err) + return err; + /* Simply invalidate cache_es. 
*/ tree->cache_es = NULL; if (count_reserved) @@ -1509,6 +1535,9 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, } while (es && ext4_es_end(es) <= end) { + err = __es_check_extent_status(es, status, res); + if (err) + return err; if (count_reserved) count_rsvd(inode, es->es_lblk, es->es_len, es, &rc); node = rb_next(&es->rb_node); @@ -1524,6 +1553,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, if (es && es->es_lblk < end + 1) { ext4_lblk_t orig_len = es->es_len; + err = __es_check_extent_status(es, status, res); + if (err) + return err; + len1 = ext4_es_end(es) - end; if (count_reserved) count_rsvd(inode, es->es_lblk, orig_len - len1, @@ -1581,7 +1614,7 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, * is reclaimed. */ write_lock(&EXT4_I(inode)->i_es_lock); - err = __es_remove_extent(inode, lblk, end, &reserved, es); + err = __es_remove_extent(inode, lblk, end, 0, &reserved, NULL, es); if (err) goto error; /* Free preallocated extent if it didn't get used. */ @@ -2173,7 +2206,7 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk, } write_lock(&EXT4_I(inode)->i_es_lock); - err1 = __es_remove_extent(inode, lblk, end, NULL, es1); + err1 = __es_remove_extent(inode, lblk, end, 0, NULL, NULL, es1); if (err1 != 0) goto error; /* Free preallocated extent if it didn't get used. */ From ae3e5ebbdd77a9b2b2eccd6a96ff656dfa3d718f Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Sat, 29 Nov 2025 18:32:43 +0800 Subject: [PATCH 11/45] ext4: make ext4_es_cache_extent() support overwrite existing extents Currently, ext4_es_cache_extent() is used to load extents into the extent status tree when reading on-disk extent blocks. But it inserts information into the extent status tree if and only if there isn't information about the specified range already. So it only used for the initial loading and does not support overwrit extents. 
However, there are many other places in ext4 where on-disk extents are inserted into the extent status tree, such as in ext4_map_query_blocks(). Currently, they call ext4_es_insert_extent() to perform the insertion, but they don't modify the extents, so ext4_es_cache_extent() would be a more appropriate choice. However, when ext4_map_query_blocks() inserts an extent, it may overwrite a short existing extent of the same type. Therefore, to prepare for the replacements, we need to extend ext4_es_cache_extent() to allow it to overwrite existing extents with the same status. So it checks the found extents before removing and inserting. (There is one exception, a hole in the on-disk extent but a delayed extent in the extent status tree is allowed.) In addition, since cached extents can be more lenient than the extents they modify and do not involve modifying reserved blocks, it is not necessary to ensure that the insertion operation succeeds as strictly as in the ext4_es_insert_extent() function. Signed-off-by: Zhang Yi Reviewed-by: Baokun Li Message-ID: <20251129103247.686136-12-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 50 ++++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 818007bb613f..48f04aef2f2e 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1014,17 +1014,24 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, } /* - * ext4_es_cache_extent() inserts information into the extent status - * tree if and only if there isn't information about the range in - * question already. + * ext4_es_cache_extent() inserts information into the extent status tree + * only if there is no existing information about the specified range or + * if the existing extents have the same status. 
+ * + * Note that this interface is only used for caching on-disk extent + * information and cannot be used to convert existing extents in the extent + * status tree. To convert existing extents, use ext4_es_insert_extent() + * instead. */ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status) { struct extent_status *es; - struct extent_status newes; + struct extent_status chkes, newes; ext4_lblk_t end = lblk + len - 1; + bool conflict = false; + int err; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; @@ -1040,11 +1047,40 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, BUG_ON(end < lblk); write_lock(&EXT4_I(inode)->i_es_lock); - es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk); - if (!es || es->es_lblk > end) - __es_insert_extent(inode, &newes, NULL); + if (es && es->es_lblk <= end) { + /* Found an extent that covers the entire range. */ + if (es->es_lblk <= lblk && es->es_lblk + es->es_len > end) { + if (__es_check_extent_status(es, status, &chkes)) + conflict = true; + goto unlock; + } + /* Check and remove all extents in range. */ + err = __es_remove_extent(inode, lblk, end, status, NULL, + &chkes, NULL); + if (err) { + if (err == -EINVAL) + conflict = true; + goto unlock; + } + } + __es_insert_extent(inode, &newes, NULL); +unlock: write_unlock(&EXT4_I(inode)->i_es_lock); + if (!conflict) + return; + /* + * A hole in the on-disk extent but a delayed extent in the extent + * status tree, is allowed. 
+ */ + if (status == EXTENT_STATUS_HOLE && + ext4_es_type(&chkes) == EXTENT_STATUS_DELAYED) + return; + + ext4_warning_inode(inode, + "ES cache extent failed: add [%d,%d,%llu,0x%x] conflict with existing [%d,%d,%llu,0x%x]\n", + lblk, len, pblk, status, chkes.es_lblk, chkes.es_len, + ext4_es_pblock(&chkes), ext4_es_status(&chkes)); } /* From b32e61cf6470ced0a70f20c9b513511a4e79b390 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Sat, 29 Nov 2025 18:32:44 +0800 Subject: [PATCH 12/45] ext4: adjust the debug info in ext4_es_cache_extent() Print a trace point after successfully inserting an extent in the ext4_es_cache_extent() function. Additionally, similar to other extent cache operation functions, call ext4_print_pending_tree() to display the extent debug information of the inode when in ES_DEBUG mode. Signed-off-by: Zhang Yi Reviewed-by: Baokun Li Message-ID: <20251129103247.686136-13-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 48f04aef2f2e..0529c603ee88 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1039,7 +1039,6 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, newes.es_lblk = lblk; newes.es_len = len; ext4_es_store_pblock_status(&newes, pblk, status); - trace_ext4_es_cache_extent(inode, &newes); if (!len) return; @@ -1065,6 +1064,8 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, } } __es_insert_extent(inode, &newes, NULL); + trace_ext4_es_cache_extent(inode, &newes); + ext4_es_print_tree(inode); unlock: write_unlock(&EXT4_I(inode)->i_es_lock); if (!conflict) From a5567347b6f5e888c56274d710b8f9525dbd57d0 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Sat, 29 Nov 2025 18:32:45 +0800 Subject: [PATCH 13/45] ext4: replace ext4_es_insert_extent() when caching on-disk extents In ext4, the remaining places for inserting extents into the extent 
status tree within ext4_ext_determine_insert_hole() and ext4_map_query_blocks() directly cache on-disk extents. We can use ext4_es_cache_extent() instead of ext4_es_insert_extent() in these cases. This will help reduce unnecessary increases in extent sequence numbers and cache invalidations after supporting IOMAP in the future. Suggested-by: Jan Kara Signed-off-by: Zhang Yi Reviewed-by: Baokun Li Message-ID: <20251129103247.686136-14-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 3 +-- fs/ext4/inode.c | 18 +++++++++--------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2cfce2c01208..27eb2c1df012 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4192,8 +4192,7 @@ static ext4_lblk_t ext4_ext_determine_insert_hole(struct inode *inode, insert_hole: /* Put just found gap into cache to speed up subsequent requests */ ext_debug(inode, " -> %u:%u\n", hole_start, len); - ext4_es_insert_extent(inode, hole_start, len, ~0, - EXTENT_STATUS_HOLE, false); + ext4_es_cache_extent(inode, hole_start, len, ~0, EXTENT_STATUS_HOLE); /* Update hole_len to reflect hole size after lblk */ if (hole_start != lblk) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0c466ccbed69..08a7f2b54a9e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -503,8 +503,8 @@ static int ext4_map_query_blocks_next_in_leaf(handle_t *handle, retval = ext4_ext_map_blocks(handle, inode, &map2, 0); if (retval <= 0) { - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status, false); + ext4_es_cache_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); return map->m_len; } @@ -525,13 +525,13 @@ static int ext4_map_query_blocks_next_in_leaf(handle_t *handle, */ if (map->m_pblk + map->m_len == map2.m_pblk && status == status2) { - ext4_es_insert_extent(inode, map->m_lblk, - map->m_len + map2.m_len, map->m_pblk, - status, false); + ext4_es_cache_extent(inode, map->m_lblk, + map->m_len 
+ map2.m_len, map->m_pblk, + status); map->m_len += map2.m_len; } else { - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status, false); + ext4_es_cache_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); } return map->m_len; @@ -573,8 +573,8 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, map->m_len == orig_mlen) { status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status, false); + ext4_es_cache_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); } else { retval = ext4_map_query_blocks_next_in_leaf(handle, inode, map, orig_mlen); From 382dd788cb8b25dff007fb45d064ba8b1095685a Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Sat, 29 Nov 2025 18:32:46 +0800 Subject: [PATCH 14/45] ext4: drop the TODO comment in ext4_es_insert_extent() Now we have ext4_es_cache_extent() to cache on-disk extents instead of ext4_es_insert_extent(), so drop the TODO comment. Signed-off-by: Zhang Yi Reviewed-by: Baokun Li Message-ID: <20251129103247.686136-15-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 0529c603ee88..fc83e7e2ca9e 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -898,7 +898,8 @@ static int __es_insert_extent(struct inode *inode, struct extent_status *newes, /* * ext4_es_insert_extent() adds information to an inode's extent - * status tree. + * status tree. This interface is used for modifying extents. To cache + * on-disk extents, use ext4_es_cache_extent() instead. 
*/ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, @@ -977,10 +978,6 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, } pending = err3; } - /* - * TODO: For cache on-disk extents, there is no need to increment - * the sequence counter, this requires future optimization. - */ ext4_es_inc_seq(inode); error: write_unlock(&EXT4_I(inode)->i_es_lock); From 270564513489d98b721a1e4a10017978d5213bff Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 9 Dec 2025 21:31:16 +0800 Subject: [PATCH 15/45] ext4: move ext4_percpu_param_init() before ext4_mb_init() When running `kvm-xfstests -c ext4/1k -C 1 generic/383` with the `DOUBLE_CHECK` macro defined, the following panic is triggered: ================================================================== EXT4-fs error (device vdc): ext4_validate_block_bitmap:423: comm mount: bg 0: bad block bitmap checksum BUG: unable to handle page fault for address: ff110000fa2cc000 PGD 3e01067 P4D 3e02067 PUD 0 Oops: Oops: 0000 [#1] SMP NOPTI CPU: 0 UID: 0 PID: 2386 Comm: mount Tainted: G W 6.18.0-gba65a4e7120a-dirty #1152 PREEMPT(none) RIP: 0010:percpu_counter_add_batch+0x13/0xa0 Call Trace: ext4_mark_group_bitmap_corrupted+0xcb/0xe0 ext4_validate_block_bitmap+0x2a1/0x2f0 ext4_read_block_bitmap+0x33/0x50 mb_group_bb_bitmap_alloc+0x33/0x80 ext4_mb_add_groupinfo+0x190/0x250 ext4_mb_init_backend+0x87/0x290 ext4_mb_init+0x456/0x640 __ext4_fill_super+0x1072/0x1680 ext4_fill_super+0xd3/0x280 get_tree_bdev_flags+0x132/0x1d0 vfs_get_tree+0x29/0xd0 vfs_cmd_create+0x59/0xe0 __do_sys_fsconfig+0x4f6/0x6b0 do_syscall_64+0x50/0x1f0 entry_SYSCALL_64_after_hwframe+0x76/0x7e ================================================================== This issue can be reproduced using the following commands: mkfs.ext4 -F -q -b 1024 /dev/sda 5G tune2fs -O quota,project /dev/sda mount /dev/sda /tmp/test With DOUBLE_CHECK defined, mb_group_bb_bitmap_alloc() reads and validates the block bitmap. 
When the validation fails, ext4_mark_group_bitmap_corrupted() attempts to update sbi->s_freeclusters_counter. However, this percpu_counter has not been initialized yet at this point, which leads to the panic described above. Fix this by moving the execution of ext4_percpu_param_init() to occur before ext4_mb_init(), ensuring the per-CPU counters are initialized before they are used. Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20251209133116.731350-1-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 87205660c5d0..5c2e931d8a53 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5604,6 +5604,10 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) clear_opt2(sb, MB_OPTIMIZE_SCAN); } + err = ext4_percpu_param_init(sbi); + if (err) + goto failed_mount5; + err = ext4_mb_init(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", @@ -5619,10 +5623,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; - err = ext4_percpu_param_init(sbi); - if (err) - goto failed_mount6; - if (ext4_has_feature_flex_bg(sb)) if (!ext4_fill_flex_info(sb)) { ext4_msg(sb, KERN_ERR, @@ -5704,8 +5704,8 @@ failed_mount8: __maybe_unused failed_mount6: ext4_mb_release(sb); ext4_flex_groups_free(sbi); - ext4_percpu_param_destroy(sbi); failed_mount5: + ext4_percpu_param_destroy(sbi); ext4_ext_release(sb); ext4_release_system_zone(sb); failed_mount4a: From d518215c27194486fe13136a8dbbbabeefb5c9b6 Mon Sep 17 00:00:00 2001 From: Baolin Liu Date: Thu, 11 Dec 2025 11:02:56 +0800 Subject: [PATCH 16/45] ext4: add sysfs attribute err_report_sec to control s_err_report timer Add a new sysfs attribute "err_report_sec" to control the s_err_report timer in ext4_sb_info. 
Writing '0' disables the timer, while writing a non-zero value enables the timer and sets the timeout in seconds. Signed-off-by: Baolin Liu Link: https://patch.msgid.link/20251211030256.28613-1-liubaolin12138@163.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 +++- fs/ext4/super.c | 27 ++++++++++++++++++--------- fs/ext4/sysfs.c | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 10 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 56112f201cac..046706c1f01f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1692,6 +1692,8 @@ struct ext4_sb_info { /* timer for periodic error stats printing */ struct timer_list s_err_report; + /* timeout in seconds for s_err_report; 0 disables the timer. */ + unsigned long s_err_report_sec; /* Lazy inode table initialization info */ struct ext4_li_request *s_li_request; @@ -2373,7 +2375,6 @@ static inline int ext4_emergency_state(struct super_block *sb) #define EXT4_DEF_SB_UPDATE_INTERVAL_SEC (3600) /* seconds (1 hour) */ #define EXT4_DEF_SB_UPDATE_INTERVAL_KB (16384) /* kilobytes (16MB) */ - /* * Minimum number of groups in a flexgroup before we separate out * directories into the first block group of a flexgroup @@ -3199,6 +3200,7 @@ extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, unsigned int flags); extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb, ext4_group_t block_group); +extern void print_daily_error_info(struct timer_list *t); extern __printf(7, 8) void __ext4_error(struct super_block *, const char *, unsigned int, bool, diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5c2e931d8a53..752f414aa06b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3638,10 +3638,12 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly) } /* - * This function is called once a day if we have errors logged - * on the file system + * This function is called once a day by default if we have errors logged + * on the file system. 
+ * Use the err_report_sec sysfs attribute to disable or adjust its call + * frequency. */ -static void print_daily_error_info(struct timer_list *t) +void print_daily_error_info(struct timer_list *t) { struct ext4_sb_info *sbi = timer_container_of(sbi, t, s_err_report); struct super_block *sb = sbi->s_sb; @@ -3681,7 +3683,9 @@ static void print_daily_error_info(struct timer_list *t) le64_to_cpu(es->s_last_error_block)); printk(KERN_CONT "\n"); } - mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ + + if (sbi->s_err_report_sec) + mod_timer(&sbi->s_err_report, jiffies + secs_to_jiffies(sbi->s_err_report_sec)); } /* Find next suitable group and run ext4_init_inode_table */ @@ -5678,8 +5682,12 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) clear_opt(sb, DISCARD); } - if (es->s_error_count) - mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ + if (es->s_error_count) { + sbi->s_err_report_sec = 5*60; /* first time 5 minutes */ + mod_timer(&sbi->s_err_report, + jiffies + secs_to_jiffies(sbi->s_err_report_sec)); + } + sbi->s_err_report_sec = 24*60*60; /* Once a day */ /* Enable message ratelimiting. Default is 10 messages per 5 secs. 
*/ ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10); @@ -6225,10 +6233,11 @@ static void ext4_update_super(struct super_block *sb) ext4_errno_to_code(sbi->s_last_error_code); /* * Start the daily error reporting function if it hasn't been - * started already + * started already and sbi->s_err_report_sec is not zero */ - if (!es->s_error_count) - mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); + if (!es->s_error_count && !sbi->s_err_report_sec) + mod_timer(&sbi->s_err_report, + jiffies + secs_to_jiffies(sbi->s_err_report_sec)); le32_add_cpu(&es->s_error_count, sbi->s_add_error_count); sbi->s_add_error_count = 0; } diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 0018e09b867e..d2ecc1026c0c 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -40,6 +40,7 @@ typedef enum { attr_pointer_string, attr_pointer_atomic, attr_journal_task, + attr_err_report_sec, } attr_id_t; typedef enum { @@ -130,6 +131,36 @@ static ssize_t trigger_test_error(struct ext4_sb_info *sbi, return count; } +static ssize_t err_report_sec_store(struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned long t; + int ret; + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + + /*the maximum time interval must not exceed one year.*/ + if (t > (365*24*60*60)) + return -EINVAL; + + if (sbi->s_err_report_sec == t) /*nothing to do*/ + goto out; + else if (!sbi->s_err_report_sec && t) { + timer_setup(&sbi->s_err_report, print_daily_error_info, 0); + } else if (sbi->s_err_report_sec && !t) { + timer_delete_sync(&sbi->s_err_report); + goto out; + } + + sbi->s_err_report_sec = t; + mod_timer(&sbi->s_err_report, jiffies + secs_to_jiffies(sbi->s_err_report_sec)); + +out: + return count; +} + static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf) { if (!sbi->s_journal) @@ -217,6 +248,7 @@ EXT4_ATTR_OFFSET(mb_group_prealloc, 0644, clusters_in_group, ext4_sb_info, s_mb_group_prealloc); EXT4_ATTR_OFFSET(mb_best_avail_max_trim_order, 0644, 
mb_order, ext4_sb_info, s_mb_best_avail_max_trim_order); +EXT4_ATTR_OFFSET(err_report_sec, 0644, err_report_sec, ext4_sb_info, s_err_report_sec); EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); @@ -309,6 +341,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(last_trim_minblks), ATTR_LIST(sb_update_sec), ATTR_LIST(sb_update_kb), + ATTR_LIST(err_report_sec), NULL, }; ATTRIBUTE_GROUPS(ext4); @@ -402,6 +435,7 @@ static ssize_t ext4_generic_attr_show(struct ext4_attr *a, return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr)); return sysfs_emit(buf, "%u\n", *((unsigned int *) ptr)); case attr_pointer_ul: + case attr_err_report_sec: return sysfs_emit(buf, "%lu\n", *((unsigned long *) ptr)); case attr_pointer_u8: return sysfs_emit(buf, "%u\n", *((unsigned char *) ptr)); @@ -525,6 +559,8 @@ static ssize_t ext4_attr_store(struct kobject *kobj, return inode_readahead_blks_store(sbi, buf, len); case attr_trigger_test_error: return trigger_test_error(sbi, buf, len); + case attr_err_report_sec: + return err_report_sec_store(sbi, buf, len); default: return ext4_generic_attr_store(a, sbi, buf, len); } From 87e79fa122bc9a6576f1690ee264fcbd77d3ab58 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Thu, 11 Dec 2025 19:51:38 +0800 Subject: [PATCH 17/45] ext4: mark inode format migration fast-commit ineligible Fast commits only log operations that have dedicated replay support. Inode format migration (indirect<->extent layout changes via EXT4_IOC_MIGRATE or toggling EXT4_EXTENTS_FL) rewrites the block mapping representation without going through the fast commit tracking paths. In practice these migrations are rare and usually followed by further updates, but mixing them into a fast commit makes the overall semantics harder to reason about and risks replay gaps if new call sites appear. 
Teach ext4 to mark the filesystem fast-commit ineligible when ext4_ext_migrate() or ext4_ind_migrate() start their journal transactions. This forces those transactions to fall back to a full commit, ensuring that the entire inode layout change is captured by the normal journal rather than partially encoded in fast commit TLVs. This change should not affect common workloads but makes format migrations safer and easier to reason about under fast commit. Testing: 1. prepare: dd if=/dev/zero of=/root/fc.img bs=1M count=0 seek=128 mkfs.ext4 -O fast_commit -F /root/fc.img mkdir -p /mnt/fc && mount -t ext4 -o loop /root/fc.img /mnt/fc 2. Created a test file and toggled the extents flag to exercise both ext4_ind_migrate() and ext4_ext_migrate(): touch /mnt/fc/migtest chattr -e /mnt/fc/migtest chattr +e /mnt/fc/migtest 3. Verified fast-commit ineligible statistics: tail -n 1 /proc/fs/ext4/loop0/fc_info "Inode format migration": 2 Signed-off-by: Li Chen Link: https://patch.msgid.link/20251211115146.897420-2-me@linux.beauty Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 1 + fs/ext4/fast_commit.h | 1 + fs/ext4/migrate.c | 12 ++++++++++++ include/trace/events/ext4.h | 4 +++- 4 files changed, 17 insertions(+), 1 deletion(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index fa66b08de999..afb28b3e52bb 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -2302,6 +2302,7 @@ static const char * const fc_ineligible_reasons[] = { [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op", [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling", [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename", + [EXT4_FC_REASON_MIGRATE] = "Inode format migration", }; int ext4_fc_info_show(struct seq_file *seq, void *v) diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 3bd534e4dbbf..be3b84a74c32 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -97,6 +97,7 @@ enum { EXT4_FC_REASON_FALLOC_RANGE, EXT4_FC_REASON_INODE_JOURNAL_DATA, 
EXT4_FC_REASON_ENCRYPTED_FILENAME, + EXT4_FC_REASON_MIGRATE, EXT4_FC_REASON_MAX }; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 1b0dfd963d3f..96ab95167bd6 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -449,6 +449,12 @@ int ext4_ext_migrate(struct inode *inode) retval = PTR_ERR(handle); goto out_unlock; } + /* + * This operation rewrites the inode's block mapping layout + * (indirect to extents) and is not tracked in the fast commit + * log, so disable fast commits for this transaction. + */ + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_MIGRATE, handle); goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; owner[0] = i_uid_read(inode); @@ -630,6 +636,12 @@ int ext4_ind_migrate(struct inode *inode) ret = PTR_ERR(handle); goto out_unlock; } + /* + * This operation rewrites the inode's block mapping layout + * (extents to indirect blocks) and is not tracked in the fast + * commit log, so disable fast commits for this transaction. 
+ */ + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_MIGRATE, handle); down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_ext_check_inode(inode); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index fd76d14c2776..31598a7810d6 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -102,6 +102,7 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_RENAME_DIR); TRACE_DEFINE_ENUM(EXT4_FC_REASON_FALLOC_RANGE); TRACE_DEFINE_ENUM(EXT4_FC_REASON_INODE_JOURNAL_DATA); TRACE_DEFINE_ENUM(EXT4_FC_REASON_ENCRYPTED_FILENAME); +TRACE_DEFINE_ENUM(EXT4_FC_REASON_MIGRATE); TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX); #define show_fc_reason(reason) \ @@ -115,7 +116,8 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX); { EXT4_FC_REASON_RENAME_DIR, "RENAME_DIR"}, \ { EXT4_FC_REASON_FALLOC_RANGE, "FALLOC_RANGE"}, \ { EXT4_FC_REASON_INODE_JOURNAL_DATA, "INODE_JOURNAL_DATA"}, \ - { EXT4_FC_REASON_ENCRYPTED_FILENAME, "ENCRYPTED_FILENAME"}) + { EXT4_FC_REASON_ENCRYPTED_FILENAME, "ENCRYPTED_FILENAME"}, \ + { EXT4_FC_REASON_MIGRATE, "MIGRATE"}) TRACE_DEFINE_ENUM(CR_POWER2_ALIGNED); TRACE_DEFINE_ENUM(CR_GOAL_LEN_FAST); From 16d43b9748c655b36a675cc55789f40fd827e9b1 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Thu, 11 Dec 2025 19:51:39 +0800 Subject: [PATCH 18/45] ext4: mark fs-verity enable fast-commit ineligible Fast commits only log operations that have dedicated replay support. Enabling fs-verity builds a Merkle tree and updates inode and orphan state in ways that are not described by the fast commit replay tags. In practice these operations are rare and usually followed by further updates, but mixing them into a fast commit makes the overall semantics harder to reason about and risks replay gaps if new call sites appear. Teach ext4 to mark the filesystem fast-commit ineligible when ext4_end_enable_verity() starts its journal transaction. 
This forces that transaction to fall back to a full commit, ensuring that the fs-verity enable changes are captured by the normal journal rather than partially encoded in fast commit TLVs. This change should not affect common workloads but makes fs-verity enable safer and easier to reason about under fast commit. Testing: 1. prepare: dd if=/dev/zero of=/root/fc_verity.img bs=1M count=0 seek=128 mkfs.ext4 -O fast_commit,verity -F /root/fc_verity.img mkdir -p /mnt/fc_verity && mount -t ext4 -o loop /root/fc_verity.img /mnt/fc_verity 2. Enabled fs-verity on a file and verified reason accounting: echo "data" > /mnt/fc_verity/verityfile /root/enable_verity /mnt/fc_verity/verityfile sync tail -n 1 /proc/fs/ext4/loop0/fc_info "fs-verity enable": 1 3. Enabled fs-verity on a second file, fsynced it, and checked that the ineligible commit counter is updated too: echo "data2" > /mnt/fc_verity/verityfile2 /root/enable_verity /mnt/fc_verity/verityfile2 /root/fsync_file /mnt/fc_verity/verityfile2 sync /proc/fs/ext4/loop0/fc_info shows "fs-verity enable" incremented and fc stats ineligible increased accordingly. 
Signed-off-by: Li Chen Link: https://patch.msgid.link/20251211115146.897420-3-me@linux.beauty Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 1 + fs/ext4/fast_commit.h | 1 + fs/ext4/verity.c | 2 ++ include/trace/events/ext4.h | 4 +++- 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index afb28b3e52bb..242b69e5fe13 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -2303,6 +2303,7 @@ static const char * const fc_ineligible_reasons[] = { [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling", [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename", [EXT4_FC_REASON_MIGRATE] = "Inode format migration", + [EXT4_FC_REASON_VERITY] = "fs-verity enable", }; int ext4_fc_info_show(struct seq_file *seq, void *v) diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index be3b84a74c32..20f65135208f 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -98,6 +98,7 @@ enum { EXT4_FC_REASON_INODE_JOURNAL_DATA, EXT4_FC_REASON_ENCRYPTED_FILENAME, EXT4_FC_REASON_MIGRATE, + EXT4_FC_REASON_VERITY, EXT4_FC_REASON_MAX }; diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index 415d9c4d8a32..667f9e8d4da9 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -231,6 +231,8 @@ static int ext4_end_enable_verity(struct file *filp, const void *desc, goto cleanup; } + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_VERITY, handle); + err = ext4_orphan_del(handle, inode); if (err) goto stop_and_cleanup; diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 31598a7810d6..726705bce9d3 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -103,6 +103,7 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_FALLOC_RANGE); TRACE_DEFINE_ENUM(EXT4_FC_REASON_INODE_JOURNAL_DATA); TRACE_DEFINE_ENUM(EXT4_FC_REASON_ENCRYPTED_FILENAME); TRACE_DEFINE_ENUM(EXT4_FC_REASON_MIGRATE); +TRACE_DEFINE_ENUM(EXT4_FC_REASON_VERITY); TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX); #define 
show_fc_reason(reason) \ @@ -117,7 +118,8 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX); { EXT4_FC_REASON_FALLOC_RANGE, "FALLOC_RANGE"}, \ { EXT4_FC_REASON_INODE_JOURNAL_DATA, "INODE_JOURNAL_DATA"}, \ { EXT4_FC_REASON_ENCRYPTED_FILENAME, "ENCRYPTED_FILENAME"}, \ - { EXT4_FC_REASON_MIGRATE, "MIGRATE"}) + { EXT4_FC_REASON_MIGRATE, "MIGRATE"}, \ + { EXT4_FC_REASON_VERITY, "VERITY"}) TRACE_DEFINE_ENUM(CR_POWER2_ALIGNED); TRACE_DEFINE_ENUM(CR_GOAL_LEN_FAST); From 690558921d9f9388c6bc83610451d8cb393e4d88 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Thu, 11 Dec 2025 19:51:40 +0800 Subject: [PATCH 19/45] ext4: mark move extents fast-commit ineligible Fast commits only log operations that have dedicated replay support. EXT4_IOC_MOVE_EXT swaps extents between regular files and may copy data, rewriting the affected inodes' block mapping layout without going through the fast commit tracking paths. In practice these operations are rare and usually followed by further updates, but mixing them into a fast commit makes the overall semantics harder to reason about and risks replay gaps if new call sites appear. Teach ext4 to mark the filesystem fast-commit ineligible for the journal transactions used by move_extent_per_page() when EXT4_IOC_MOVE_EXT runs. This forces those transactions to fall back to a full commit, ensuring that these multi-inode extent swaps are captured by the normal journal rather than partially encoded in fast commit TLVs. This change should not affect common workloads but makes online defragmentation safer and easier to reason about under fast commit. Testing: 1. prepare: dd if=/dev/zero of=/root/fc_move.img bs=1M count=0 seek=256 mkfs.ext4 -O fast_commit -F /root/fc_move.img mkdir -p /mnt/fc_move && mount -t ext4 -o loop \ /root/fc_move.img /mnt/fc_move 2. 
Created two files, ran EXT4_IOC_MOVE_EXT via e4defrag, and checked the ineligible reason statistics: fallocate -l 64M /mnt/fc_move/file1 cp /mnt/fc_move/file1 /mnt/fc_move/file2 e4defrag /mnt/fc_move/file1 cat /proc/fs/ext4/loop0/fc_info shows "Move extents": > 0 and fc stats ineligible > 0. Signed-off-by: Li Chen Link: https://patch.msgid.link/20251211115146.897420-4-me@linux.beauty Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 1 + fs/ext4/fast_commit.h | 1 + fs/ext4/move_extent.c | 2 ++ include/trace/events/ext4.h | 4 +++- 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 242b69e5fe13..0ef2154a2b1f 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -2304,6 +2304,7 @@ static const char * const fc_ineligible_reasons[] = { [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename", [EXT4_FC_REASON_MIGRATE] = "Inode format migration", [EXT4_FC_REASON_VERITY] = "fs-verity enable", + [EXT4_FC_REASON_MOVE_EXT] = "Move extents", }; int ext4_fc_info_show(struct seq_file *seq, void *v) diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 20f65135208f..2f77a37fb101 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -99,6 +99,7 @@ enum { EXT4_FC_REASON_ENCRYPTED_FILENAME, EXT4_FC_REASON_MIGRATE, EXT4_FC_REASON_VERITY, + EXT4_FC_REASON_MOVE_EXT, EXT4_FC_REASON_MAX }; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 635fb8a52e0c..ce1f738dff93 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -321,6 +321,8 @@ static int mext_move_extent(struct mext_data *mext, u64 *m_len) ret = PTR_ERR(handle); goto out; } + ext4_fc_mark_ineligible(orig_inode->i_sb, EXT4_FC_REASON_MOVE_EXT, + handle); ret = mext_move_begin(mext, folio, &move_type); if (ret) diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 726705bce9d3..a3e8fe414df8 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -104,6 +104,7 
@@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_INODE_JOURNAL_DATA); TRACE_DEFINE_ENUM(EXT4_FC_REASON_ENCRYPTED_FILENAME); TRACE_DEFINE_ENUM(EXT4_FC_REASON_MIGRATE); TRACE_DEFINE_ENUM(EXT4_FC_REASON_VERITY); +TRACE_DEFINE_ENUM(EXT4_FC_REASON_MOVE_EXT); TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX); #define show_fc_reason(reason) \ @@ -119,7 +120,8 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX); { EXT4_FC_REASON_INODE_JOURNAL_DATA, "INODE_JOURNAL_DATA"}, \ { EXT4_FC_REASON_ENCRYPTED_FILENAME, "ENCRYPTED_FILENAME"}, \ { EXT4_FC_REASON_MIGRATE, "MIGRATE"}, \ - { EXT4_FC_REASON_VERITY, "VERITY"}) + { EXT4_FC_REASON_VERITY, "VERITY"}, \ + { EXT4_FC_REASON_MOVE_EXT, "MOVE_EXT"}) TRACE_DEFINE_ENUM(CR_POWER2_ALIGNED); TRACE_DEFINE_ENUM(CR_GOAL_LEN_FAST); From 89b4336fd5ec78f51f9d3a1d100f3ffa3228e604 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Thu, 11 Dec 2025 19:51:41 +0800 Subject: [PATCH 20/45] ext4: mark group add fast-commit ineligible Fast commits only log operations that have dedicated replay support. Online resize via EXT4_IOC_GROUP_ADD updates the superblock and group descriptor metadata without going through the fast commit tracking paths. In practice these operations are rare and usually followed by further updates, but mixing them into a fast commit makes the overall semantics harder to reason about and risks replay gaps if new call sites appear. Teach ext4 to mark the filesystem fast-commit ineligible when ext4_ioctl_group_add() adds new block groups. This forces those transactions to fall back to a full commit, ensuring that the filesystem geometry updates are captured by the normal journal rather than partially encoded in fast commit TLVs. This change should not affect common workloads but makes online resize via GROUP_ADD safer and easier to reason about under fast commit. Testing: 1. 
prepare: dd if=/dev/zero of=/root/fc_resize.img bs=1M count=0 seek=256 mkfs.ext4 -O fast_commit -F /root/fc_resize.img mkdir -p /mnt/fc_resize && mount -t ext4 -o loop /root/fc_resize.img /mnt/fc_resize 2. Ran a helper that issues EXT4_IOC_GROUP_ADD on the mounted filesystem and checked the resize ineligible reason: ./group_add_helper /mnt/fc_resize cat /proc/fs/ext4/loop0/fc_info shows "Resize": > 0. 3. Fsynced a file on the resized filesystem and verified that the fast commit stats report at least one ineligible commit: touch /mnt/fc_resize/file /root/fsync_file /mnt/fc_resize/file sync cat /proc/fs/ext4/loop0/fc_info shows fc stats ineligible > 0. Signed-off-by: Li Chen Link: https://patch.msgid.link/20251211115146.897420-5-me@linux.beauty Signed-off-by: Theodore Ts'o --- fs/ext4/ioctl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 7ce0fc40aec2..5109b005e028 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -966,6 +966,7 @@ static long ext4_ioctl_group_add(struct file *file, err = ext4_group_add(sb, input); if (EXT4_SB(sb)->s_journal) { + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, NULL); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); From 1f8dd813a1c771b13c303f73d876164bc9b327cc Mon Sep 17 00:00:00 2001 From: Li Chen Date: Thu, 11 Dec 2025 19:51:42 +0800 Subject: [PATCH 21/45] ext4: mark group extend fast-commit ineligible Fast commits only log operations that have dedicated replay support. EXT4_IOC_GROUP_EXTEND grows the filesystem to the end of the last block group and updates the same on-disk metadata without going through the fast commit tracking paths. In practice these operations are rare and usually followed by further updates, but mixing them into a fast commit makes the overall semantics harder to reason about and risks replay gaps if new call sites appear. 
Teach ext4 to mark the filesystem fast-commit ineligible when EXT4_IOC_GROUP_EXTEND grows the filesystem. This forces those transactions to fall back to a full commit, ensuring that the group extension changes are captured by the normal journal rather than partially encoded in fast commit TLVs. This change should not affect common workloads but makes online resize via GROUP_EXTEND safer and easier to reason about under fast commit. Testing: 1. prepare: dd if=/dev/zero of=/root/fc_resize.img bs=1M count=0 seek=256 mkfs.ext4 -O fast_commit -F /root/fc_resize.img mkdir -p /mnt/fc_resize && mount -t ext4 -o loop /root/fc_resize.img /mnt/fc_resize 2. Extended the filesystem to the end of the last block group using a helper that calls EXT4_IOC_GROUP_EXTEND on the mounted filesystem and checked fc_info: ./group_extend_helper /mnt/fc_resize cat /proc/fs/ext4/loop0/fc_info shows the "Resize" ineligible reason increased. 3. Fsynced a file on the resized filesystem and confirmed that the fast commit ineligible counter incremented for the resize transaction: touch /mnt/fc_resize/file /root/fsync_file /mnt/fc_resize/file sync cat /proc/fs/ext4/loop0/fc_info Signed-off-by: Li Chen Link: https://patch.msgid.link/20251211115146.897420-6-me@linux.beauty Signed-off-by: Theodore Ts'o --- fs/ext4/ioctl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 5109b005e028..e5e197ac7d88 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1612,6 +1612,8 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); if (EXT4_SB(sb)->s_journal) { + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, + NULL); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); From 26f260ce5828fc7897a70629884916301f5825d0 Mon Sep 17 00:00:00 2001 From: pengdonglin Date: Thu, 11 Dec 
2025 20:38:29 +0800 Subject: [PATCH 22/45] ext4: remove unnecessary zero-initialization via memset The d_path function does not require the caller to pre-zero the buffer. Signed-off-by: pengdonglin Reviewed-by: Zhang Yi Reviewed-by: Baokun Li Link: https://patch.msgid.link/20251211123829.2777009-1-dolinux.peng@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 7a8b30932189..484cb7388802 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -858,7 +858,6 @@ static int ext4_sample_last_mounted(struct super_block *sb, * when trying to sort through large numbers of block * devices or filesystem images. */ - memset(buf, 0, sizeof(buf)); path.mnt = mnt; path.dentry = mnt->mnt_root; cp = d_path(&path, buf, sizeof(buf)); From 154922b34da9770223d9883ac6976635a786b5ba Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 23 Dec 2025 09:19:27 +0800 Subject: [PATCH 23/45] ext4: don't order data when zeroing unwritten or delayed block When zeroing out a written partial block, it is necessary to order the data to prevent exposing stale data on disk. However, if the buffer is unwritten or delayed, it is not allocated as written, so ordering the data is not required. This can prevent strange and unnecessary ordered writes when appending data across a region within a block. Assume we have a 2K unwritten file on a filesystem with 4K blocksize, and buffered write from 3K to 4K. Before this patch, __ext4_block_zero_page_range() would add the range [2k,3k) to the ordered range, and then the JBD2 commit process would write back this block. However, it does nothing since the block is not mapped as written, this folio will be redirtied and written back again through the normal write back process. 
Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Baokun Li Link: https://patch.msgid.link/20251223011927.34042-1-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 08a7f2b54a9e..6ad1a6e3d133 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4133,9 +4133,13 @@ static int __ext4_block_zero_page_range(handle_t *handle, if (ext4_should_journal_data(inode)) { err = ext4_dirty_journalled_data(handle, bh); } else { - err = 0; mark_buffer_dirty(bh); - if (ext4_should_order_data(inode)) + /* + * Only the written block requires ordered data to prevent + * exposing stale data. + */ + if (!buffer_unwritten(bh) && !buffer_delay(bh) && + ext4_should_order_data(inode)) err = ext4_jbd2_inode_add_write(handle, inode, from, length); } From ca81109d4a8f192dc1cbad4a1ee25246363c2833 Mon Sep 17 00:00:00 2001 From: Zilin Guan Date: Thu, 25 Dec 2025 08:48:00 +0000 Subject: [PATCH 24/45] ext4: fix memory leak in ext4_ext_shift_extents() In ext4_ext_shift_extents(), if the extent is NULL in the while loop, the function returns immediately without releasing the path obtained via ext4_find_extent(), leading to a memory leak. Fix this by jumping to the out label to ensure the path is properly released. 
Fixes: a18ed359bdddc ("ext4: always check ext4_ext_find_extent result") Signed-off-by: Zilin Guan Reviewed-by: Zhang Yi Reviewed-by: Baokun Li Link: https://patch.msgid.link/20251225084800.905701-1-zilin@seu.edu.cn Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/extents.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 27eb2c1df012..e0295e0339b4 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5406,7 +5406,8 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, if (!extent) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) *iterator); - return -EFSCORRUPTED; + ret = -EFSCORRUPTED; + goto out; } if (SHIFT == SHIFT_LEFT && *iterator > le32_to_cpu(extent->ee_block)) { From 01942af95ab6c9d98e64ae01fdc243a03e4b973f Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 5 Jan 2026 09:45:16 +0800 Subject: [PATCH 25/45] ext4: use reserved metadata blocks when splitting extent on endio When performing buffered writes, we may need to split and convert an unwritten extent into a written one during the end I/O process. However, we do not reserve space specifically for these metadata changes, we only reserve 2% of space or 4096 blocks. To address this, we use EXT4_GET_BLOCKS_PRE_IO to potentially split extents in advance and EXT4_GET_BLOCKS_METADATA_NOFAIL to utilize reserved space if necessary. These two approaches can reduce the likelihood of running out of space and losing data. However, these methods are merely best efforts, we could still run out of space, and there is not much difference between converting an extent during the writeback process and the end I/O process, it won't increase the risk of losing data if we postpone the conversion. Therefore, also use EXT4_GET_BLOCKS_METADATA_NOFAIL in ext4_convert_unwritten_extents_endio() to prepare for the buffered I/O iomap conversion, which may perform extent conversion during the end I/O process. 
Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Baokun Li Reviewed-by: Ojaswin Mujoo Link: https://patch.msgid.link/20260105014522.1937690-2-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e0295e0339b4..671cb038a417 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3794,6 +3794,8 @@ ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, * illegal. */ if (ee_block != map->m_lblk || ee_len > map->m_len) { + int flags = EXT4_GET_BLOCKS_CONVERT | + EXT4_GET_BLOCKS_METADATA_NOFAIL; #ifdef CONFIG_EXT4_DEBUG ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu," " len %u; IO logical block %llu, len %u", @@ -3801,7 +3803,7 @@ ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, (unsigned long long)map->m_lblk, map->m_len); #endif path = ext4_split_convert_extents(handle, inode, map, path, - EXT4_GET_BLOCKS_CONVERT, NULL); + flags, NULL); if (IS_ERR(path)) return path; From ea96cb5c4ae3ab9516a78b6b435721a6c701eff4 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 5 Jan 2026 09:45:17 +0800 Subject: [PATCH 26/45] ext4: don't split extent before submitting I/O Currently, when writing back dirty pages to the filesystem with the dioread_nolock feature enabled and when doing DIO, if the area to be written back is part of an unwritten extent, the EXT4_GET_BLOCKS_IO_CREATE_EXT flag is set during block allocation before submitting I/O. The function ext4_split_convert_extents() then attempts to split this extent in advance. This approach is designed to prevents extent splitting and conversion to the written type from failing due to insufficient disk space at the time of I/O completion, which could otherwise result in data loss. However, we already have two mechanisms to ensure successful extent conversion. 
The first is the EXT4_GET_BLOCKS_METADATA_NOFAIL flag, which is a best effort, it permits the use of 2% of the reserved space or 4,096 blocks in the file system when splitting extents. This flag covers most scenarios where extent splitting might fail. The second is the EXT4_EXT_MAY_ZEROOUT flag, which is also set during extent splitting. If the reserved space is insufficient and splitting fails, it does not retry the allocation. Instead, it directly zeros out the extra part of the extent, thereby avoiding splitting and directly converting the entire extent to the written type. These two mechanisms also exist when I/Os are completed because there is a concurrency window between write-back and fallocate, which may still require us to split extents upon I/O completion. There is not much difference between splitting extents before submitting I/O. Therefore, it seems possible to defer the splitting until I/O completion, it won't increase the risk of I/O failure and data loss. On the contrary, if some I/Os can be merged at I/O completion, it can also reduce unnecessary splitting operations, thereby alleviating the pressure on reserved space. In addition, deferring extent splitting until I/O completion can also simplify the IO submission process and avoid initiating unnecessary journal handles when writing unwritten extents. 
Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Baokun Li Reviewed-by: Ojaswin Mujoo Link: https://patch.msgid.link/20260105014522.1937690-3-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 13 +------------ fs/ext4/inode.c | 4 ++-- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 671cb038a417..4e7baac24a90 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3787,21 +3787,10 @@ ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)ee_block, ee_len); - /* If extent is larger than requested it is a clear sign that we still - * have some extent state machine issues left. So extent_split is still - * required. - * TODO: Once all related issues will be fixed this situation should be - * illegal. - */ if (ee_block != map->m_lblk || ee_len > map->m_len) { int flags = EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_METADATA_NOFAIL; -#ifdef CONFIG_EXT4_DEBUG - ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu," - " len %u; IO logical block %llu, len %u", - inode->i_ino, (unsigned long long)ee_block, ee_len, - (unsigned long long)map->m_lblk, map->m_len); -#endif + path = ext4_split_convert_extents(handle, inode, map, path, flags, NULL); if (IS_ERR(path)) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6ad1a6e3d133..b86ddc9e85ce 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2375,7 +2375,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) - get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; + get_blocks_flags |= EXT4_GET_BLOCKS_UNWRIT_EXT; err = ext4_map_blocks(handle, inode, map, get_blocks_flags); if (err < 0) @@ -3740,7 +3740,7 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, else if (EXT4_LBLK_TO_B(inode, 
map->m_lblk) >= i_size_read(inode)) m_flags = EXT4_GET_BLOCKS_CREATE; else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; + m_flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; if (flags & IOMAP_ATOMIC) ret = ext4_map_blocks_atomic_write(handle, inode, map, m_flags, From 5d87c7fca2c1f51537052c791df694dc3c4261cb Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 5 Jan 2026 09:45:18 +0800 Subject: [PATCH 27/45] ext4: avoid starting handle when dio writing an unwritten extent Since we have deferred the split of the unwritten extent until after I/O completion, it is not necessary to initiate the journal handle when submitting the I/O. This can improve the write performance of concurrent DIO for multiple files. The fio tests below show a ~25% performance improvement when writing to unwritten files on my VM with a mem disk. [unwritten] direct=1 ioengine=psync numjobs=16 rw=write # write/randwrite bs=4K iodepth=1 directory=/mnt size=5G runtime=30s overwrite=0 norandommap=1 fallocate=native ramp_time=5s group_reporting=1 [w/o] w: IOPS=62.5k, BW=244MiB/s rw: IOPS=56.7k, BW=221MiB/s [w] w: IOPS=79.6k, BW=311MiB/s rw: IOPS=70.2k, BW=274MiB/s Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Baokun Li Reviewed-by: Ojaswin Mujoo Link: https://patch.msgid.link/20260105014522.1937690-4-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 4 +--- fs/ext4/inode.c | 9 +++++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 484cb7388802..4fadc476d645 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -418,9 +418,7 @@ static const struct iomap_dio_ops ext4_dio_write_ops = { * updating inode i_disksize and/or orphan handling with exclusive lock. * * - shared locking will only be true mostly with overwrites, including - * initialized blocks and unwritten blocks. 
For overwrite unwritten blocks - * we protect splitting extents by i_data_sem in ext4_inode_info, so we can - * also release exclusive i_rwsem lock. + * initialized blocks and unwritten blocks. * * - Otherwise we will switch to exclusive i_rwsem lock. */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b86ddc9e85ce..1cb7775c5e41 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3813,9 +3813,14 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, ret = ext4_map_blocks(NULL, inode, &map, 0); /* * For atomic writes the entire requested length should - * be mapped. + * be mapped. For DAX we convert extents to initialized + * ones before copying the data, otherwise we do it + * after I/O so there's no need to call into + * ext4_iomap_alloc(). */ - if (map.m_flags & EXT4_MAP_MAPPED) { + if ((map.m_flags & EXT4_MAP_MAPPED) || + (!(flags & IOMAP_DAX) && + (map.m_flags & EXT4_MAP_UNWRITTEN))) { if ((!(flags & IOMAP_ATOMIC) && ret > 0) || (flags & IOMAP_ATOMIC && ret >= orig_mlen)) goto out; From 012924f0eeef84f9bdb71896265f8303245065a8 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 5 Jan 2026 09:45:19 +0800 Subject: [PATCH 28/45] ext4: remove useless ext4_iomap_overwrite_ops ext4_iomap_overwrite_ops was introduced in commit 8cd115bdda17 ("ext4: Optimize ext4 DIO overwrites"), which can optimize pure overwrite performance by dropping the IOMAP_WRITE flag to only query the mapped mapping information. This avoids starting a new journal handle, thereby improving speed. Later, commit 9faac62d4013 ("ext4: optimize file overwrites") also optimized similar scenarios, but it performs the check later, examining the mappings status only when the actual block mapping is needed. Thus, it can handle the previous commit scenario. That means in the case of an overwrite scenario, the condition "offset + length <= i_size_read(inode)" in the write path must always be true. 
Therefore, it is acceptable to remove the ext4_iomap_overwrite_ops, which will also clarify the write and read paths of ext4_iomap_begin. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Baokun Li Reviewed-by: Ojaswin Mujoo Link: https://patch.msgid.link/20260105014522.1937690-5-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 - fs/ext4/file.c | 5 +---- fs/ext4/inode.c | 24 ------------------------ 3 files changed, 1 insertion(+), 29 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 046706c1f01f..146d694beb02 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3911,7 +3911,6 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) } extern const struct iomap_ops ext4_iomap_ops; -extern const struct iomap_ops ext4_iomap_overwrite_ops; extern const struct iomap_ops ext4_iomap_report_ops; static inline int ext4_buffer_uptodate(struct buffer_head *bh) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 4fadc476d645..43a24c776bf0 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -506,7 +506,6 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); loff_t offset = iocb->ki_pos; size_t count = iov_iter_count(from); - const struct iomap_ops *iomap_ops = &ext4_iomap_ops; bool extend = false, unwritten = false; bool ilock_shared = true; int dio_flags = 0; @@ -573,9 +572,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; } - if (ilock_shared && !unwritten) - iomap_ops = &ext4_iomap_overwrite_ops; - ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, + ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops, dio_flags, NULL, 0); if (ret == -ENOTBLK) ret = 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1cb7775c5e41..f346302709b9 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3829,10 +3829,6 @@ static int ext4_iomap_begin(struct inode *inode, loff_t 
offset, loff_t length, } ret = ext4_iomap_alloc(inode, &map, flags); } else { - /* - * This can be called for overwrites path from - * ext4_iomap_overwrite_begin(). - */ ret = ext4_map_blocks(NULL, inode, &map, 0); } @@ -3861,30 +3857,10 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, return 0; } -static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, - loff_t length, unsigned flags, struct iomap *iomap, - struct iomap *srcmap) -{ - int ret; - - /* - * Even for writes we don't need to allocate blocks, so just pretend - * we are reading to save overhead of starting a transaction. - */ - flags &= ~IOMAP_WRITE; - ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap); - WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED); - return ret; -} - const struct iomap_ops ext4_iomap_ops = { .iomap_begin = ext4_iomap_begin, }; -const struct iomap_ops ext4_iomap_overwrite_ops = { - .iomap_begin = ext4_iomap_overwrite_begin, -}; - static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) From 8bd1f257af1c21d34f8758f4e36854970e1dc2f5 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 5 Jan 2026 09:45:20 +0800 Subject: [PATCH 29/45] ext4: remove unused unwritten parameter in ext4_dio_write_iter() The parameter unwritten in ext4_dio_write_iter() is no longer needed, simply remove it. 
Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Baokun Li Reviewed-by: Ojaswin Mujoo Link: https://patch.msgid.link/20260105014522.1937690-6-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 43a24c776bf0..5ef5a855ce9f 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -424,14 +424,14 @@ static const struct iomap_dio_ops ext4_dio_write_ops = { */ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, bool *ilock_shared, bool *extend, - bool *unwritten, int *dio_flags) + int *dio_flags) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); loff_t offset; size_t count; ssize_t ret; - bool overwrite, unaligned_io; + bool overwrite, unaligned_io, unwritten; restart: ret = ext4_generic_write_checks(iocb, from); @@ -443,7 +443,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, unaligned_io = ext4_unaligned_io(inode, from, offset); *extend = ext4_extending_io(inode, offset, count); - overwrite = ext4_overwrite_io(inode, offset, count, unwritten); + overwrite = ext4_overwrite_io(inode, offset, count, &unwritten); /* * Determine whether we need to upgrade to an exclusive lock. 
This is @@ -458,7 +458,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, */ if (*ilock_shared && ((!IS_NOSEC(inode) || *extend || !overwrite || - (unaligned_io && *unwritten)))) { + (unaligned_io && unwritten)))) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; @@ -481,7 +481,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, ret = -EAGAIN; goto out; } - if (unaligned_io && (!overwrite || *unwritten)) + if (unaligned_io && (!overwrite || unwritten)) inode_dio_wait(inode); *dio_flags = IOMAP_DIO_FORCE_WAIT; } @@ -506,7 +506,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); loff_t offset = iocb->ki_pos; size_t count = iov_iter_count(from); - bool extend = false, unwritten = false; + bool extend = false; bool ilock_shared = true; int dio_flags = 0; @@ -552,7 +552,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend, - &unwritten, &dio_flags); + &dio_flags); if (ret <= 0) return ret; From 5ca28af074ad506c95a8f86a5d260562f3caa39b Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 5 Jan 2026 09:45:21 +0800 Subject: [PATCH 30/45] ext4: simplify the mapping query logic in ext4_iomap_begin() In the write path mapping check of ext4_iomap_begin(), the return value 'ret' should never greater than orig_mlen. If 'ret' equals 'orig_mlen', it can be returned directly without checking IOMAP_ATOMIC. 
Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Baokun Li Reviewed-by: Ojaswin Mujoo Link: https://patch.msgid.link/20260105014522.1937690-7-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f346302709b9..d914f612da10 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3812,17 +3812,19 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (offset + length <= i_size_read(inode)) { ret = ext4_map_blocks(NULL, inode, &map, 0); /* - * For atomic writes the entire requested length should - * be mapped. For DAX we convert extents to initialized - * ones before copying the data, otherwise we do it - * after I/O so there's no need to call into - * ext4_iomap_alloc(). + * For DAX we convert extents to initialized ones before + * copying the data, otherwise we do it after I/O so + * there's no need to call into ext4_iomap_alloc(). */ if ((map.m_flags & EXT4_MAP_MAPPED) || (!(flags & IOMAP_DAX) && (map.m_flags & EXT4_MAP_UNWRITTEN))) { - if ((!(flags & IOMAP_ATOMIC) && ret > 0) || - (flags & IOMAP_ATOMIC && ret >= orig_mlen)) + /* + * For atomic writes the entire requested + * length should be mapped. + */ + if (ret == orig_mlen || + (!(flags & IOMAP_ATOMIC) && ret > 0)) goto out; } map.m_len = orig_mlen; From 5f18f60d56c0cedd17826882b66b94f1a52f65ef Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 5 Jan 2026 09:45:22 +0800 Subject: [PATCH 31/45] ext4: remove EXT4_GET_BLOCKS_IO_CREATE_EXT We do not use EXT4_GET_BLOCKS_IO_CREATE_EXT or split extents before submitting I/O; therefore, remove the related code. 
Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Baokun Li Reviewed-by: Ojaswin Mujoo Link: https://patch.msgid.link/20260105014522.1937690-8-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 9 --------- fs/ext4/extents.c | 29 ----------------------------- fs/ext4/inode.c | 11 ----------- 3 files changed, 49 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 146d694beb02..3b5454208305 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -707,15 +707,6 @@ enum { * found an unwritten extent, we need to split it. */ #define EXT4_GET_BLOCKS_SPLIT_NOMERGE 0x0008 - /* - * Caller is from the dio or dioread_nolock buffered IO, reqest to - * create an unwritten extent if it does not exist or split the - * found unwritten extent. Also do not merge the newly created - * unwritten extent, io end will convert unwritten to written, - * and try to merge the written extent. - */ -#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_SPLIT_NOMERGE|\ - EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) /* Convert unwritten extent to initialized. 
*/ #define EXT4_GET_BLOCKS_CONVERT 0x0010 /* Eventual metadata allocation (due to growing extent tree) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4e7baac24a90..8597110f50ef 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3925,34 +3925,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, trace_ext4_ext_handle_unwritten_extents(inode, map, flags, *allocated, newblock); - /* get_block() before submitting IO, split the extent */ - if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE) { - int depth; - - path = ext4_split_convert_extents(handle, inode, map, path, - flags, allocated); - if (IS_ERR(path)) - return path; - /* - * shouldn't get a 0 allocated when splitting an extent unless - * m_len is 0 (bug) or extent has been corrupted - */ - if (unlikely(*allocated == 0)) { - EXT4_ERROR_INODE(inode, - "unexpected allocated == 0, m_len = %u", - map->m_len); - err = -EFSCORRUPTED; - goto errout; - } - /* Don't mark unwritten if the extent has been zeroed out. 
*/ - path = ext4_find_extent(inode, map->m_lblk, path, flags); - if (IS_ERR(path)) - return path; - depth = ext_depth(inode); - if (ext4_ext_is_unwritten(path[depth].p_ext)) - map->m_flags |= EXT4_MAP_UNWRITTEN; - goto out; - } /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { path = ext4_convert_unwritten_extents_endio(handle, inode, @@ -4006,7 +3978,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, goto errout; } -out: map->m_flags |= EXT4_MAP_NEW; map_out: map->m_flags |= EXT4_MAP_MAPPED; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d914f612da10..cf0ffd799501 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -587,7 +587,6 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { - struct extent_status es; unsigned int status; int err, retval = 0; @@ -648,16 +647,6 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, return err; } - /* - * If the extent has been zeroed out, we don't need to update - * extent status tree. - */ - if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE && - ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) { - if (ext4_es_is_written(&es)) - return retval; - } - status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, From 591a4ab9b8b125bf72a345ca6f5c0ee4481db02b Mon Sep 17 00:00:00 2001 From: Baolin Liu Date: Tue, 6 Jan 2026 14:20:16 +0800 Subject: [PATCH 32/45] ext4: remove redundant NULL check after __GFP_NOFAIL Remove redundant NULL check after kcalloc() with GFP_NOFS | __GFP_NOFAIL. 
Signed-off-by: Baolin Liu Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20260106062016.154573-1-liubaolin12138@163.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 8597110f50ef..3c288d4b19d2 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2949,10 +2949,6 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, } else { path = kcalloc(depth + 1, sizeof(struct ext4_ext_path), GFP_NOFS | __GFP_NOFAIL); - if (path == NULL) { - ext4_journal_stop(handle); - return -ENOMEM; - } path[0].p_maxdepth = path[0].p_depth = depth; path[0].p_hdr = ext_inode_hdr(inode); i = 0; From bdc56a9c46b2a99c12313122b9352b619a2e719e Mon Sep 17 00:00:00 2001 From: Yongjian Sun Date: Tue, 6 Jan 2026 17:08:20 +0800 Subject: [PATCH 33/45] ext4: fix e4b bitmap inconsistency reports A bitmap inconsistency issue was observed during stress tests under mixed huge-page workloads. Ext4 reported multiple e4b bitmap check failures like: ext4_mb_complex_scan_group:2508: group 350, 8179 free clusters as per group info. But got 8192 blocks Analysis and experimentation confirmed that the issue is caused by a race condition between page migration and bitmap modification. Although this timing window is extremely narrow, it is still hit in practice: folio_lock ext4_mb_load_buddy __migrate_folio check ref count folio_mc_copy __filemap_get_folio folio_try_get(folio) ...... mb_mark_used ext4_mb_unload_buddy __folio_migrate_mapping folio_ref_freeze folio_unlock The root cause of this issue is that the fast path of load_buddy only increments the folio's reference count, which is insufficient to prevent concurrent folio migration. We observed that the folio migration process acquires the folio lock. Therefore, we can determine whether to take the fast path in load_buddy by checking the lock status. 
If the folio is locked, we opt for the slow path (which acquires the lock) to close this concurrency window. Additionally, this change addresses the following issues: When the DOUBLE_CHECK macro is enabled to inspect bitmap-related issues, the following error may be triggered: corruption in group 324 at byte 784(6272): f in copy != ff on disk/prealloc Analysis reveals that this is a false positive. There is a specific race window where the bitmap and the group descriptor become momentarily inconsistent, leading to this error report: ext4_mb_load_buddy ext4_mb_load_buddy __filemap_get_folio(create|lock) folio_lock ext4_mb_init_cache folio_mark_uptodate __filemap_get_folio(no lock) ...... mb_mark_used mb_mark_used_double mb_cmp_bitmaps mb_set_bits(e4b->bd_bitmap) folio_unlock The original logic assumed that since mb_cmp_bitmaps is called when the bitmap is newly loaded from disk, the folio lock would be sufficient to prevent concurrent access. However, this overlooks a specific race condition: if another process attempts to load buddy and finds the folio is already in an uptodate state, it will immediately begin using it without holding folio lock. Signed-off-by: Yongjian Sun Reviewed-by: Zhang Yi Reviewed-by: Baokun Li Reviewed-by: Jan Kara Link: https://patch.msgid.link/20260106090820.836242-1-sunyongjian@huaweicloud.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/mballoc.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 56d50fd3310b..de4cacb740b3 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1706,16 +1706,17 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, /* Avoid locking the folio in the fast path ... 
*/ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); - if (IS_ERR(folio) || !folio_test_uptodate(folio)) { + if (IS_ERR(folio) || !folio_test_uptodate(folio) || folio_test_locked(folio)) { + /* + * folio_test_locked is employed to detect ongoing folio + * migrations, since concurrent migrations can lead to + * bitmap inconsistency. And if we are not uptodate that + * implies somebody just created the folio but is yet to + * initialize it. We can drop the folio reference and + * try to get the folio with lock in both cases to avoid + * concurrency. + */ if (!IS_ERR(folio)) - /* - * drop the folio reference and try - * to get the folio with lock. If we - * are not uptodate that implies - * somebody just created the folio but - * is yet to initialize it. So - * wait for it to initialize. - */ folio_put(folio); folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); @@ -1764,7 +1765,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, /* we need another folio for the buddy */ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); - if (IS_ERR(folio) || !folio_test_uptodate(folio)) { + if (IS_ERR(folio) || !folio_test_uptodate(folio) || folio_test_locked(folio)) { if (!IS_ERR(folio)) folio_put(folio); folio = __filemap_get_folio(inode->i_mapping, pnum, From 491f2927ae097e2d405afe0b3fe841931ab8aad2 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Tue, 6 Jan 2026 20:06:21 +0800 Subject: [PATCH 34/45] ext4: fast commit: make s_fc_lock reclaim-safe s_fc_lock can be acquired from inode eviction and thus is reclaim unsafe. Since the fast commit path holds s_fc_lock while writing the commit log, allocations under the lock can enter reclaim and invert the lock order with fs_reclaim. Add ext4_fc_lock()/ext4_fc_unlock() helpers which acquire s_fc_lock under memalloc_nofs_save()/restore() context and use them everywhere so allocations under the lock cannot recurse into filesystem reclaim. 
Fixes: 6593714d67ba ("ext4: hold s_fc_lock while during fast commit") Signed-off-by: Li Chen Reviewed-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20260106120621.440126-1-me@linux.beauty Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 16 ++++++++++++++ fs/ext4/fast_commit.c | 51 ++++++++++++++++++++++++------------------- 2 files changed, 44 insertions(+), 23 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3b5454208305..9610602fe37b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1788,6 +1788,10 @@ struct ext4_sb_info { * Main fast commit lock. This lock protects accesses to the * following fields: * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh. + * + * s_fc_lock can be taken from reclaim context (inode eviction) and is + * thus reclaim unsafe. Use ext4_fc_lock()/ext4_fc_unlock() helpers + * when acquiring / releasing the lock. */ struct mutex s_fc_lock; struct buffer_head *s_fc_bh; @@ -1832,6 +1836,18 @@ static inline void ext4_writepages_up_write(struct super_block *sb, int ctx) percpu_up_write(&EXT4_SB(sb)->s_writepages_rwsem); } +static inline int ext4_fc_lock(struct super_block *sb) +{ + mutex_lock(&EXT4_SB(sb)->s_fc_lock); + return memalloc_nofs_save(); +} + +static inline void ext4_fc_unlock(struct super_block *sb, int ctx) +{ + memalloc_nofs_restore(ctx); + mutex_unlock(&EXT4_SB(sb)->s_fc_lock); +} + static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 0ef2154a2b1f..f575751f1cae 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -231,16 +231,16 @@ static bool ext4_fc_disabled(struct super_block *sb) void ext4_fc_del(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_fc_dentry_update *fc_dentry; wait_queue_head_t *wq; + int alloc_ctx; if 
(ext4_fc_disabled(inode->i_sb)) return; - mutex_lock(&sbi->s_fc_lock); + alloc_ctx = ext4_fc_lock(inode->i_sb); if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { - mutex_unlock(&sbi->s_fc_lock); + ext4_fc_unlock(inode->i_sb, alloc_ctx); return; } @@ -275,9 +275,9 @@ void ext4_fc_del(struct inode *inode) #endif prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { - mutex_unlock(&sbi->s_fc_lock); + ext4_fc_unlock(inode->i_sb, alloc_ctx); schedule(); - mutex_lock(&sbi->s_fc_lock); + alloc_ctx = ext4_fc_lock(inode->i_sb); } finish_wait(wq, &wait.wq_entry); } @@ -288,7 +288,7 @@ void ext4_fc_del(struct inode *inode) * dentry create references, since it is not needed to log it anyways. */ if (list_empty(&ei->i_fc_dilist)) { - mutex_unlock(&sbi->s_fc_lock); + ext4_fc_unlock(inode->i_sb, alloc_ctx); return; } @@ -298,7 +298,7 @@ void ext4_fc_del(struct inode *inode) list_del_init(&fc_dentry->fcd_dilist); WARN_ON(!list_empty(&ei->i_fc_dilist)); - mutex_unlock(&sbi->s_fc_lock); + ext4_fc_unlock(inode->i_sb, alloc_ctx); release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); @@ -315,6 +315,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl tid_t tid; bool has_transaction = true; bool is_ineligible; + int alloc_ctx; if (ext4_fc_disabled(sb)) return; @@ -329,12 +330,12 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl has_transaction = false; read_unlock(&sbi->s_journal->j_state_lock); } - mutex_lock(&sbi->s_fc_lock); + alloc_ctx = ext4_fc_lock(sb); is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid))) sbi->s_fc_ineligible_tid = tid; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); - mutex_unlock(&sbi->s_fc_lock); + ext4_fc_unlock(sb, alloc_ctx); WARN_ON(reason >= EXT4_FC_REASON_MAX); 
sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } @@ -358,6 +359,7 @@ static int ext4_fc_track_template( struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); tid_t tid = 0; + int alloc_ctx; int ret; tid = handle->h_transaction->t_tid; @@ -373,14 +375,14 @@ static int ext4_fc_track_template( if (!enqueue) return ret; - mutex_lock(&sbi->s_fc_lock); + alloc_ctx = ext4_fc_lock(inode->i_sb); if (list_empty(&EXT4_I(inode)->i_fc_list)) list_add_tail(&EXT4_I(inode)->i_fc_list, (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? &sbi->s_fc_q[FC_Q_STAGING] : &sbi->s_fc_q[FC_Q_MAIN]); - mutex_unlock(&sbi->s_fc_lock); + ext4_fc_unlock(inode->i_sb, alloc_ctx); return ret; } @@ -402,6 +404,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, struct inode *dir = dentry->d_parent->d_inode; struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); + int alloc_ctx; spin_unlock(&ei->i_fc_lock); @@ -425,7 +428,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, take_dentry_name_snapshot(&node->fcd_name, dentry); INIT_LIST_HEAD(&node->fcd_dilist); INIT_LIST_HEAD(&node->fcd_list); - mutex_lock(&sbi->s_fc_lock); + alloc_ctx = ext4_fc_lock(sb); if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) list_add_tail(&node->fcd_list, @@ -446,7 +449,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, WARN_ON(!list_empty(&ei->i_fc_dilist)); list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist); } - mutex_unlock(&sbi->s_fc_lock); + ext4_fc_unlock(sb, alloc_ctx); spin_lock(&ei->i_fc_lock); return 0; @@ -1046,18 +1049,19 @@ static int ext4_fc_perform_commit(journal_t *journal) struct blk_plug plug; int ret = 0; u32 crc = 0; + int alloc_ctx; /* * Step 1: Mark all inodes on s_fc_q[MAIN] with * EXT4_STATE_FC_FLUSHING_DATA. 
This prevents these inodes from being * freed until the data flush is over. */ - mutex_lock(&sbi->s_fc_lock); + alloc_ctx = ext4_fc_lock(sb); list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_set_inode_state(&iter->vfs_inode, EXT4_STATE_FC_FLUSHING_DATA); } - mutex_unlock(&sbi->s_fc_lock); + ext4_fc_unlock(sb, alloc_ctx); /* Step 2: Flush data for all the eligible inodes. */ ret = ext4_fc_flush_data(journal); @@ -1067,7 +1071,7 @@ static int ext4_fc_perform_commit(journal_t *journal) * any error from step 2. This ensures that waiters waiting on * EXT4_STATE_FC_FLUSHING_DATA can resume. */ - mutex_lock(&sbi->s_fc_lock); + alloc_ctx = ext4_fc_lock(sb); list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_clear_inode_state(&iter->vfs_inode, EXT4_STATE_FC_FLUSHING_DATA); @@ -1084,7 +1088,7 @@ static int ext4_fc_perform_commit(journal_t *journal) * prepare_to_wait() in ext4_fc_del(). */ smp_mb(); - mutex_unlock(&sbi->s_fc_lock); + ext4_fc_unlock(sb, alloc_ctx); /* * If we encountered error in Step 2, return it now after clearing @@ -1101,12 +1105,12 @@ static int ext4_fc_perform_commit(journal_t *journal) * previous handles are now drained. We now mark the inodes on the * commit queue as being committed. */ - mutex_lock(&sbi->s_fc_lock); + alloc_ctx = ext4_fc_lock(sb); list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_set_inode_state(&iter->vfs_inode, EXT4_STATE_FC_COMMITTING); } - mutex_unlock(&sbi->s_fc_lock); + ext4_fc_unlock(sb, alloc_ctx); jbd2_journal_unlock_updates(journal); /* @@ -1117,6 +1121,7 @@ static int ext4_fc_perform_commit(journal_t *journal) blkdev_issue_flush(journal->j_fs_dev); blk_start_plug(&plug); + alloc_ctx = ext4_fc_lock(sb); /* Step 6: Write fast commit blocks to disk. */ if (sbi->s_fc_bytes == 0) { /* @@ -1134,7 +1139,6 @@ static int ext4_fc_perform_commit(journal_t *journal) } /* Step 6.2: Now write all the dentry updates. 
*/ - mutex_lock(&sbi->s_fc_lock); ret = ext4_fc_commit_dentry_updates(journal, &crc); if (ret) goto out; @@ -1156,7 +1160,7 @@ static int ext4_fc_perform_commit(journal_t *journal) ret = ext4_fc_write_tail(sb, crc); out: - mutex_unlock(&sbi->s_fc_lock); + ext4_fc_unlock(sb, alloc_ctx); blk_finish_plug(&plug); return ret; } @@ -1290,6 +1294,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *ei; struct ext4_fc_dentry_update *fc_dentry; + int alloc_ctx; if (full && sbi->s_fc_bh) sbi->s_fc_bh = NULL; @@ -1297,7 +1302,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) trace_ext4_fc_cleanup(journal, full, tid); jbd2_fc_release_bufs(journal); - mutex_lock(&sbi->s_fc_lock); + alloc_ctx = ext4_fc_lock(sb); while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) { ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN], struct ext4_inode_info, @@ -1356,7 +1361,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) if (full) sbi->s_fc_bytes = 0; - mutex_unlock(&sbi->s_fc_lock); + ext4_fc_unlock(sb, alloc_ctx); trace_ext4_fc_stats(sb); } From 94a8cea54cd935c54fa2fba70354757c0fc245e3 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 13 Jan 2026 12:19:05 -0500 Subject: [PATCH 35/45] ext4: fix dirtyclusters double decrement on fs shutdown fstests test generic/388 occasionally reproduces a warning in ext4_put_super() associated with the dirty clusters count: WARNING: CPU: 7 PID: 76064 at fs/ext4/super.c:1324 ext4_put_super+0x48c/0x590 [ext4] Tracing the failure shows that the warning fires due to an s_dirtyclusters_counter value of -1. IOW, this appears to be a spurious decrement as opposed to some sort of leak. Further tracing of the dirty cluster count deltas and an LLM scan of the resulting output identified the cause as a double decrement in the error path between ext4_mb_mark_diskspace_used() and the caller ext4_mb_new_blocks(). 
First, note that generic/388 is a shutdown vs. fsstress test and so produces a random set of operations and shutdown injections. In the problematic case, the shutdown triggers an error return from the ext4_handle_dirty_metadata() call(s) made from ext4_mb_mark_context(). The changed value is non-zero at this point, so ext4_mb_mark_diskspace_used() does not exit after the error bubbles up from ext4_mb_mark_context(). Instead, the former decrements both cluster counters and returns the error up to ext4_mb_new_blocks(). The latter falls into the !ar->len out path which decrements the dirty clusters counter a second time, creating the inconsistency. To avoid this problem and simplify ownership of the cluster reservation in this codepath, lift the counter reduction to a single place in the caller. This makes it more clear that ext4_mb_new_blocks() is responsible for acquiring cluster reservation (via ext4_claim_free_clusters()) in the !delalloc case as well as releasing it, regardless of whether it ends up consumed or returned due to failure. 
Fixes: 0087d9fb3f29 ("ext4: Fix s_dirty_blocks_counter if block allocation failed with nodelalloc") Signed-off-by: Brian Foster Reviewed-by: Baokun Li Link: https://patch.msgid.link/20260113171905.118284-1-bfoster@redhat.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/mballoc-test.c | 2 +- fs/ext4/mballoc.c | 21 +++++---------------- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c index a9416b20ff64..4abb40d4561c 100644 --- a/fs/ext4/mballoc-test.c +++ b/fs/ext4/mballoc-test.c @@ -567,7 +567,7 @@ test_mark_diskspace_used_range(struct kunit *test, bitmap = mbt_ctx_bitmap(sb, TEST_GOAL_GROUP); memset(bitmap, 0, sb->s_blocksize); - ret = ext4_mb_mark_diskspace_used(ac, NULL, 0); + ret = ext4_mb_mark_diskspace_used(ac, NULL); KUNIT_ASSERT_EQ(test, ret, 0); max = EXT4_CLUSTERS_PER_GROUP(sb); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index de4cacb740b3..dd29558ad753 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4186,8 +4186,7 @@ ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state, * Returns 0 if success or error code */ static noinline_for_stack int -ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, - handle_t *handle, unsigned int reserv_clstrs) +ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, handle_t *handle) { struct ext4_group_desc *gdp; struct ext4_sb_info *sbi; @@ -4242,13 +4241,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, BUG_ON(changed != ac->ac_b_ex.fe_len); #endif percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); - /* - * Now reduce the dirty block count also. 
Should not go negative - */ - if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) - /* release all the reserved blocks if non delalloc */ - percpu_counter_sub(&sbi->s_dirtyclusters_counter, - reserv_clstrs); return err; } @@ -6333,7 +6325,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ext4_mb_pa_put_free(ac); } if (likely(ac->ac_status == AC_STATUS_FOUND)) { - *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); + *errp = ext4_mb_mark_diskspace_used(ac, handle); if (*errp) { ext4_discard_allocated_blocks(ac); goto errout; @@ -6364,12 +6356,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, out: if (inquota && ar->len < inquota) dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); - if (!ar->len) { - if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) - /* release all the reserved blocks if non delalloc */ - percpu_counter_sub(&sbi->s_dirtyclusters_counter, - reserv_clstrs); - } + /* release any reserved blocks */ + if (reserv_clstrs) + percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs); trace_ext4_allocate_blocks(ar, (unsigned long long)block); From 4865c768b563deff1b6a6384e74a62f143427b42 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Jan 2026 19:28:18 +0100 Subject: [PATCH 36/45] ext4: always allocate blocks only from groups inode can use For filesystems with more than 2^32 blocks inodes using indirect block based format cannot use blocks beyond the 32-bit limit. ext4_mb_scan_groups_linear() takes care to not select these unsupported groups for such inodes however other functions selecting groups for allocation don't. So far this is harmless because the other selection functions are used only with mb_optimize_scan and this is currently disabled for inodes with indirect blocks however in the following patch we want to enable mb_optimize_scan regardless of inode format. 
Reviewed-by: Baokun Li Reviewed-by: Zhang Yi Signed-off-by: Jan Kara Acked-by: Pedro Falcato Cc: stable@kernel.org Link: https://patch.msgid.link/20260114182836.14120-3-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index dd29558ad753..910b454b4a21 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -892,6 +892,21 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) } } +static ext4_group_t ext4_get_allocation_groups_count( + struct ext4_allocation_context *ac) +{ + ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb); + + /* non-extent files are limited to low blocks/groups */ + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) + ngroups = EXT4_SB(ac->ac_sb)->s_blockfile_groups; + + /* Pairs with smp_wmb() in ext4_update_super() */ + smp_rmb(); + + return ngroups; +} + static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac, struct xarray *xa, ext4_group_t start, ext4_group_t end) @@ -899,7 +914,7 @@ static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac, struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); enum criteria cr = ac->ac_criteria; - ext4_group_t ngroups = ext4_get_groups_count(sb); + ext4_group_t ngroups = ext4_get_allocation_groups_count(ac); unsigned long group = start; struct ext4_group_info *grp; @@ -951,7 +966,7 @@ static int ext4_mb_scan_groups_p2_aligned(struct ext4_allocation_context *ac, ext4_group_t start, end; start = group; - end = ext4_get_groups_count(ac->ac_sb); + end = ext4_get_allocation_groups_count(ac); wrap_around: for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { ret = ext4_mb_scan_groups_largest_free_order_range(ac, i, @@ -1001,7 +1016,7 @@ static int ext4_mb_scan_groups_goal_fast(struct ext4_allocation_context *ac, ext4_group_t start, end; start = group; - 
end = ext4_get_groups_count(ac->ac_sb); + end = ext4_get_allocation_groups_count(ac); wrap_around: i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); for (; i < MB_NUM_ORDERS(ac->ac_sb); i++) { @@ -1083,7 +1098,7 @@ static int ext4_mb_scan_groups_best_avail(struct ext4_allocation_context *ac, min_order = fls(ac->ac_o_ex.fe_len); start = group; - end = ext4_get_groups_count(ac->ac_sb); + end = ext4_get_allocation_groups_count(ac); wrap_around: for (i = order; i >= min_order; i--) { int frag_order; @@ -1182,11 +1197,7 @@ static int ext4_mb_scan_groups(struct ext4_allocation_context *ac) int ret = 0; ext4_group_t start; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb); - - /* non-extent files are limited to low blocks/groups */ - if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) - ngroups = sbi->s_blockfile_groups; + ext4_group_t ngroups = ext4_get_allocation_groups_count(ac); /* searching for the right group start from the goal value specified */ start = ac->ac_g_ex.fe_group; From 3574c322b1d0eb32dbd76b469cb08f9a67641599 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Jan 2026 19:28:19 +0100 Subject: [PATCH 37/45] ext4: use optimized mballoc scanning regardless of inode format Currently we don't use mballoc optimized scanning (using max free extent order and avg free extent order group lists) for inodes with indirect block based format. This is confusing for users and I don't see a good reason for that. Even with indirect block based inode format we can spend big amount of time searching for free blocks for large filesystems with fragmented free space. To add to the confusion before commit 077d0c2c78df ("ext4: make mb_optimize_scan performance mount option work with extents") optimized scanning was applied *only* to indirect block based inodes so that commit appears as a performance regression to some users. Just use optimized scanning whenever it is enabled by mount options. 
Reviewed-by: Baokun Li Reviewed-by: Zhang Yi Signed-off-by: Jan Kara Cc: stable@kernel.org Link: https://patch.msgid.link/20260114182836.14120-4-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 910b454b4a21..dbc82b65f810 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1148,8 +1148,6 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac) return 0; if (ac->ac_criteria >= CR_GOAL_LEN_SLOW) return 0; - if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) - return 0; return 1; } From cb1e0c1d1fad5bfad90d80c74ebdb53796d8a2db Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Fri, 23 Jan 2026 11:55:32 +0530 Subject: [PATCH 38/45] ext4: kunit tests for extent splitting and conversion Add multiple KUnit tests to test various permutations of extent splitting and conversion. We test the following cases: 1. Split of unwritten extent into 2 parts and convert 1 part to written 2. Split of unwritten extent into 3 parts and convert 1 part to written 3. Split of written extent into 2 parts and convert 1 part to unwritten 4. Split of written extent into 3 parts and convert 1 part to unwritten 5. Zeroout fallback for all the above cases except 3-4 because zeroout is not supported for written to unwritten splits The main function we test here is ext4_split_convert_extents(). Currently some of the tests are failing due to issues in implementation. All failures are mitigated at other layers in ext4 [1] but still point out the mismatch in expectation of what the caller wants vs what the function does. The aim is to eventually fix all the failures we see here. More detailed implementation notes can be found in the topmost commit in the test file. [1] for example, EXT4_GET_BLOCKS_CONVERT doesn't really convert the split extent to written, but rather the callers end up doing the conversion. 
Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Signed-off-by: Ojaswin Mujoo Link: https://patch.msgid.link/22bb9d17cd88c1318a2edde48887ca7488cb8a13.1769149131.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents-test.c | 559 +++++++++++++++++++++++++++++++++++++++ fs/ext4/extents.c | 23 +- fs/ext4/extents_status.c | 3 + fs/ext4/inode.c | 4 + 4 files changed, 587 insertions(+), 2 deletions(-) create mode 100644 fs/ext4/extents-test.c diff --git a/fs/ext4/extents-test.c b/fs/ext4/extents-test.c new file mode 100644 index 000000000000..15053c607cfd --- /dev/null +++ b/fs/ext4/extents-test.c @@ -0,0 +1,559 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Written by Ojaswin Mujoo (IBM) + * + * These Kunit tests are designed to test the functionality of + * extent split and conversion in ext4. + * + * Currently, ext4 can split extents in 2 ways: + * 1. By splitting the extents in the extent tree and optionally converting them + * to written or unwritten based on flags passed. + * 2. In case 1 encounters an error, ext4 instead zerooes out the unwritten + * areas of the extent and marks the complete extent written. + * + * The primary function that handles this is ext4_split_convert_extents(). + * + * We test both of the methods of split. The behavior we try to enforce is: + * 1. When passing EXT4_GET_BLOCKS_CONVERT flag to ext4_split_convert_extents(), + * the split extent should be converted to initialized. + * 2. When passing EXT4_GET_BLOCKS_CONVERT_UNWRITTEN flag to + * ext4_split_convert_extents(), the split extent should be converted to + * uninitialized. + * 3. In case we use the zeroout method, then we should correctly write zeroes + * to the unwritten areas of the extent and we should not corrupt/leak any + * data. + * + * Enforcing 1 and 2 is straight forward, we just setup a minimal inode with + * extent tree, call ext4_split_convert_extents() and check the final state of + * the extent tree. 
+ * + * For zeroout testing, we maintain a separate buffer which represents the disk + * data corresponding to the extents. We then override ext4's zeroout functions + * to instead write zeroes to our buffer. Then, we override + * ext4_ext_insert_extent() to return -ENOSPC, which triggers the zeroout. + * Finally, we check the state of the extent tree and zeroout buffer to confirm + * everything went well. + */ + +#include +#include +#include +#include + +#include "ext4.h" +#include "ext4_extents.h" + +#define EXT_DATA_PBLK 100 +#define EXT_DATA_LBLK 10 +#define EXT_DATA_LEN 3 + +struct kunit_ctx { + /* + * Ext4 inode which has only 1 unwrit extent + */ + struct ext4_inode_info *k_ei; + /* + * Represents the underlying data area (used for zeroout testing) + */ + char *k_data; +} k_ctx; + +/* + * describes the state of an expected extent in extent tree. + */ +struct kunit_ext_state { + ext4_lblk_t ex_lblk; + ext4_lblk_t ex_len; + bool is_unwrit; +}; + +/* + * describes the state of the data area of a writ extent. Used for testing + * correctness of zeroout. + */ +struct kunit_ext_data_state { + char exp_char; + ext4_lblk_t off_blk; + ext4_lblk_t len_blk; +}; + +struct kunit_ext_test_param { + /* description of test */ + char *desc; + + /* is extent unwrit at beginning of test */ + bool is_unwrit_at_start; + + /* flags to pass while splitting */ + int split_flags; + + /* map describing range to split */ + struct ext4_map_blocks split_map; + + /* no of extents expected after split */ + int nr_exp_ext; + + /* + * expected state of extents after split. We will never split into more + * than 3 extents + */ + struct kunit_ext_state exp_ext_state[3]; + + /* Below fields used for zeroout tests */ + + bool is_zeroout_test; + /* + * no of expected data segments (zeroout tests). Example, if we expect + * data to be 4kb 0s, followed by 8kb non-zero, then nr_exp_data_segs==2 + */ + int nr_exp_data_segs; + + /* + * expected state of data area after zeroout. 
+ */ + struct kunit_ext_data_state exp_data_state[3]; +}; + +static void ext_kill_sb(struct super_block *sb) +{ + generic_shutdown_super(sb); +} + +static int ext_set(struct super_block *sb, void *data) +{ + return 0; +} + +static struct file_system_type ext_fs_type = { + .name = "extents test", + .kill_sb = ext_kill_sb, +}; + +static void extents_kunit_exit(struct kunit *test) +{ + kfree(k_ctx.k_ei); + kfree(k_ctx.k_data); +} + +static void ext4_cache_extents_stub(struct inode *inode, + struct ext4_extent_header *eh) +{ + return; +} + +static int __ext4_ext_dirty_stub(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + return 0; +} + +static struct ext4_ext_path * +ext4_ext_insert_extent_stub(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *newext, int gb_flags) +{ + return ERR_PTR(-ENOSPC); +} + +static void ext4_es_remove_extent_stub(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) +{ + return; +} + +static void ext4_zeroout_es_stub(struct inode *inode, struct ext4_extent *ex) +{ + return; +} + +/* + * We will zeroout the equivalent range in the data area + */ +static int ext4_ext_zeroout_stub(struct inode *inode, struct ext4_extent *ex) +{ + ext4_lblk_t ee_block, off_blk; + loff_t ee_len; + loff_t off_bytes; + struct kunit *test = kunit_get_current_test(); + + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + + KUNIT_EXPECT_EQ_MSG(test, 1, ee_block >= EXT_DATA_LBLK, "ee_block=%d", + ee_block); + KUNIT_EXPECT_EQ(test, 1, + ee_block + ee_len <= EXT_DATA_LBLK + EXT_DATA_LEN); + + off_blk = ee_block - EXT_DATA_LBLK; + off_bytes = off_blk << inode->i_sb->s_blocksize_bits; + memset(k_ctx.k_data + off_bytes, 0, + ee_len << inode->i_sb->s_blocksize_bits); + + return 0; +} + +static int ext4_issue_zeroout_stub(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, ext4_lblk_t len) +{ + ext4_lblk_t off_blk; + loff_t 
off_bytes; + struct kunit *test = kunit_get_current_test(); + + kunit_log(KERN_ALERT, test, + "%s: lblk=%u pblk=%llu len=%u", __func__, lblk, pblk, len); + KUNIT_EXPECT_EQ(test, 1, lblk >= EXT_DATA_LBLK); + KUNIT_EXPECT_EQ(test, 1, lblk + len <= EXT_DATA_LBLK + EXT_DATA_LEN); + KUNIT_EXPECT_EQ(test, 1, lblk - EXT_DATA_LBLK == pblk - EXT_DATA_PBLK); + + off_blk = lblk - EXT_DATA_LBLK; + off_bytes = off_blk << inode->i_sb->s_blocksize_bits; + memset(k_ctx.k_data + off_bytes, 0, + len << inode->i_sb->s_blocksize_bits); + + return 0; +} + +static int extents_kunit_init(struct kunit *test) +{ + struct ext4_extent_header *eh = NULL; + struct ext4_inode_info *ei; + struct inode *inode; + struct super_block *sb; + struct kunit_ext_test_param *param = + (struct kunit_ext_test_param *)(test->param_value); + + /* setup the mock inode */ + k_ctx.k_ei = kzalloc(sizeof(struct ext4_inode_info), GFP_KERNEL); + if (k_ctx.k_ei == NULL) + return -ENOMEM; + ei = k_ctx.k_ei; + inode = &ei->vfs_inode; + + sb = sget(&ext_fs_type, NULL, ext_set, 0, NULL); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + sb->s_blocksize = 4096; + sb->s_blocksize_bits = 12; + + ei->i_disksize = (EXT_DATA_LBLK + EXT_DATA_LEN + 10) << sb->s_blocksize_bits; + inode->i_sb = sb; + + k_ctx.k_data = kzalloc(EXT_DATA_LEN * 4096, GFP_KERNEL); + if (k_ctx.k_data == NULL) + return -ENOMEM; + + /* + * set the data area to a junk value + */ + memset(k_ctx.k_data, 'X', EXT_DATA_LEN * 4096); + + /* create a tree with depth 0 */ + eh = (struct ext4_extent_header *)k_ctx.k_ei->i_data; + + /* Fill extent header */ + eh = ext_inode_hdr(&k_ctx.k_ei->vfs_inode); + eh->eh_depth = 0; + eh->eh_entries = cpu_to_le16(1); + eh->eh_magic = EXT4_EXT_MAGIC; + eh->eh_max = + cpu_to_le16(ext4_ext_space_root_idx(&k_ctx.k_ei->vfs_inode, 0)); + eh->eh_generation = 0; + + /* + * add 1 extent in leaf node covering lblks [10,13) and pblk [100,103) + */ + EXT_FIRST_EXTENT(eh)->ee_block = cpu_to_le32(EXT_DATA_LBLK); + EXT_FIRST_EXTENT(eh)->ee_len = 
cpu_to_le16(EXT_DATA_LEN); + ext4_ext_store_pblock(EXT_FIRST_EXTENT(eh), EXT_DATA_PBLK); + if (!param || param->is_unwrit_at_start) + ext4_ext_mark_unwritten(EXT_FIRST_EXTENT(eh)); + + /* Add stubs */ + kunit_activate_static_stub(test, ext4_cache_extents, + ext4_cache_extents_stub); + kunit_activate_static_stub(test, __ext4_ext_dirty, + __ext4_ext_dirty_stub); + kunit_activate_static_stub(test, ext4_es_remove_extent, + ext4_es_remove_extent_stub); + kunit_activate_static_stub(test, ext4_zeroout_es, ext4_zeroout_es_stub); + kunit_activate_static_stub(test, ext4_ext_zeroout, ext4_ext_zeroout_stub); + kunit_activate_static_stub(test, ext4_issue_zeroout, + ext4_issue_zeroout_stub); + return 0; +} + +/* + * Return 1 if all bytes in the buf equal to c, else return the offset of first mismatch + */ +static int check_buffer(char *buf, int c, int size) +{ + void *ret = NULL; + + ret = memchr_inv(buf, c, size); + if (ret == NULL) + return 0; + + kunit_log(KERN_ALERT, kunit_get_current_test(), + "# %s: wrong char found at offset %u (expected:%d got:%d)", __func__, + (u32)((char *)ret - buf), c, *((char *)ret)); + return 1; +} + +static void test_split_convert(struct kunit *test) +{ + struct ext4_ext_path *path; + struct inode *inode = &k_ctx.k_ei->vfs_inode; + struct ext4_extent *ex; + struct ext4_map_blocks map; + const struct kunit_ext_test_param *param = + (const struct kunit_ext_test_param *)(test->param_value); + int blkbits = inode->i_sb->s_blocksize_bits; + + if (param->is_zeroout_test) + /* + * Force zeroout by making ext4_ext_insert_extent return ENOSPC + */ + kunit_activate_static_stub(test, ext4_ext_insert_extent, + ext4_ext_insert_extent_stub); + + path = ext4_find_extent(inode, EXT_DATA_LBLK, NULL, 0); + ex = path->p_ext; + KUNIT_EXPECT_EQ(test, EXT_DATA_LBLK, le32_to_cpu(ex->ee_block)); + KUNIT_EXPECT_EQ(test, EXT_DATA_LEN, ext4_ext_get_actual_len(ex)); + KUNIT_EXPECT_EQ(test, param->is_unwrit_at_start, + ext4_ext_is_unwritten(ex)); + if (param->is_zeroout_test) 
+ KUNIT_EXPECT_EQ(test, 0, + check_buffer(k_ctx.k_data, 'X', + EXT_DATA_LEN << blkbits)); + + map.m_lblk = param->split_map.m_lblk; + map.m_len = param->split_map.m_len; + ext4_split_convert_extents(NULL, inode, &map, path, + param->split_flags, NULL); + + path = ext4_find_extent(inode, EXT_DATA_LBLK, NULL, 0); + ex = path->p_ext; + + for (int i = 0; i < param->nr_exp_ext; i++) { + struct kunit_ext_state exp_ext = param->exp_ext_state[i]; + + KUNIT_EXPECT_EQ(test, exp_ext.ex_lblk, + le32_to_cpu(ex->ee_block)); + KUNIT_EXPECT_EQ(test, exp_ext.ex_len, + ext4_ext_get_actual_len(ex)); + KUNIT_EXPECT_EQ(test, exp_ext.is_unwrit, + ext4_ext_is_unwritten(ex)); + + /* Only printed on failure */ + kunit_log(KERN_INFO, test, + "# [extent %d] exp: lblk:%d len:%d unwrit:%d \n", i, + exp_ext.ex_lblk, exp_ext.ex_len, exp_ext.is_unwrit); + kunit_log(KERN_INFO, test, + "# [extent %d] got: lblk:%d len:%d unwrit:%d\n", i, + le32_to_cpu(ex->ee_block), + ext4_ext_get_actual_len(ex), + ext4_ext_is_unwritten(ex)); + kunit_log(KERN_INFO, test, "------------------\n"); + + ex = ex + 1; + } + + if (!param->is_zeroout_test) + return; + + /* + * Check that then data area has been zeroed out correctly + */ + for (int i = 0; i < param->nr_exp_data_segs; i++) { + loff_t off, len; + struct kunit_ext_data_state exp_data_seg = param->exp_data_state[i]; + + off = exp_data_seg.off_blk << blkbits; + len = exp_data_seg.len_blk << blkbits; + KUNIT_EXPECT_EQ_MSG(test, 0, + check_buffer(k_ctx.k_data + off, + exp_data_seg.exp_char, len), + "# corruption in byte range [%lld, %lld)", + off, len); + } + + return; +} + +static const struct kunit_ext_test_param test_split_convert_params[] = { + /* unwrit to writ splits */ + { .desc = "split unwrit extent to 2 extents and convert 1st half writ", + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CONVERT, + .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 }, + .nr_exp_ext = 2, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit 
= 0 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 1, + .is_unwrit = 1 } }, + .is_zeroout_test = 0 }, + { .desc = "split unwrit extent to 2 extents and convert 2nd half writ", + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CONVERT, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 }, + .nr_exp_ext = 2, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 1 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 1, + .is_unwrit = 0 } }, + .is_zeroout_test = 0 }, + { .desc = "split unwrit extent to 3 extents and convert 2nd half to writ", + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CONVERT, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 }, + .nr_exp_ext = 3, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 1 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 2, + .is_unwrit = 0 }, + { .ex_lblk = EXT_DATA_LBLK + 1 + (EXT_DATA_LEN - 2), + .ex_len = 1, + .is_unwrit = 1 } }, + .is_zeroout_test = 0 }, + + /* writ to unwrit splits */ + { .desc = "split writ extent to 2 extents and convert 1st half unwrit", + .is_unwrit_at_start = 0, + .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, + .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 }, + .nr_exp_ext = 2, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 1 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 1, + .is_unwrit = 0 } }, + .is_zeroout_test = 0 }, + { .desc = "split writ extent to 2 extents and convert 2nd half unwrit", + .is_unwrit_at_start = 0, + .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 }, + .nr_exp_ext = 2, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 0 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 1, + .is_unwrit = 1 } }, + .is_zeroout_test = 0 }, + { .desc = "split 
writ extent to 3 extents and convert 2nd half to unwrit", + .is_unwrit_at_start = 0, + .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 }, + .nr_exp_ext = 3, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 0 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 2, + .is_unwrit = 1 }, + { .ex_lblk = EXT_DATA_LBLK + 1 + (EXT_DATA_LEN - 2), + .ex_len = 1, + .is_unwrit = 0 } }, + .is_zeroout_test = 0 }, + + /* + * ***** zeroout tests ***** + */ + /* unwrit to writ splits */ + { .desc = "split unwrit extent to 2 extents and convert 1st half writ (zeroout)", + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CONVERT, + .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 2, + .exp_data_state = { { .exp_char = 'X', .off_blk = 0, .len_blk = 1 }, + { .exp_char = 0, + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 1 } } }, + { .desc = "split unwrit extent to 2 extents and convert 2nd half writ (zeroout)", + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CONVERT, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 2, + .exp_data_state = { { .exp_char = 0, .off_blk = 0, .len_blk = 1 }, + { .exp_char = 'X', + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 1 } } }, + { .desc = "split unwrit extent to 3 extents and convert 2nd half writ (zeroout)", + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CONVERT, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + 
.is_zeroout_test = 1, + .nr_exp_data_segs = 3, + .exp_data_state = { { .exp_char = 0, .off_blk = 0, .len_blk = 1 }, + { .exp_char = 'X', + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 2 }, + { .exp_char = 0, + .off_blk = EXT_DATA_LEN - 1, + .len_blk = 1 } } }, +}; + +static void ext_get_desc(struct kunit *test, const void *p, char *desc) + +{ + struct kunit_ext_test_param *param = (struct kunit_ext_test_param *)p; + + snprintf(desc, KUNIT_PARAM_DESC_SIZE, "%s\n", param->desc); +} + +static int test_split_convert_param_init(struct kunit *test) +{ + size_t arr_size = ARRAY_SIZE(test_split_convert_params); + + kunit_register_params_array(test, test_split_convert_params, arr_size, + ext_get_desc); + return 0; +} + +/* + * Note that we use KUNIT_CASE_PARAM_WITH_INIT() instead of the more compact + * KUNIT_ARRAY_PARAM() because the later currently has a limitation causing the + * output parsing to be prone to error. For more context: + * + * https://lore.kernel.org/linux-kselftest/aULJpTvJDw9ctUDe@li-dc0c254c-257c-11b2-a85c-98b6c1322444.ibm.com/ + */ +static struct kunit_case extents_test_cases[] = { + KUNIT_CASE_PARAM_WITH_INIT(test_split_convert, kunit_array_gen_params, + test_split_convert_param_init, NULL), + {} +}; + +static struct kunit_suite extents_test_suite = { + .name = "ext4_extents_test", + .init = extents_kunit_init, + .exit = extents_kunit_exit, + .test_cases = extents_test_cases, +}; + +kunit_test_suites(&extents_test_suite); + +MODULE_LICENSE("GPL"); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3c288d4b19d2..08aa36854c11 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -32,6 +32,7 @@ #include "ext4_jbd2.h" #include "ext4_extents.h" #include "xattr.h" +#include #include @@ -197,6 +198,9 @@ static int __ext4_ext_dirty(const char *where, unsigned int line, { int err; + KUNIT_STATIC_STUB_REDIRECT(__ext4_ext_dirty, where, line, handle, inode, + path); + WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); if (path->p_bh) { 
ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh)); @@ -535,6 +539,8 @@ static void ext4_cache_extents(struct inode *inode, ext4_lblk_t prev = 0; int i; + KUNIT_STATIC_STUB_REDIRECT(ext4_cache_extents, inode, eh); + for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { unsigned int status = EXTENT_STATUS_WRITTEN; ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); @@ -898,6 +904,8 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, int ret; gfp_t gfp_flags = GFP_NOFS; + KUNIT_STATIC_STUB_REDIRECT(ext4_find_extent, inode, block, path, flags); + if (flags & EXT4_EX_NOFAIL) gfp_flags |= __GFP_NOFAIL; @@ -1990,6 +1998,9 @@ ext4_ext_insert_extent(handle_t *handle, struct inode *inode, ext4_lblk_t next; int mb_flags = 0, unwritten; + KUNIT_STATIC_STUB_REDIRECT(ext4_ext_insert_extent, handle, inode, path, + newext, gb_flags); + if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) mb_flags |= EXT4_MB_DELALLOC_RESERVED; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { @@ -3134,8 +3145,10 @@ static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) ext4_fsblk_t ee_pblock; unsigned int ee_len; - ee_block = le32_to_cpu(ex->ee_block); - ee_len = ext4_ext_get_actual_len(ex); + KUNIT_STATIC_STUB_REDIRECT(ext4_zeroout_es, inode, ex); + + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); if (ee_len == 0) @@ -3151,6 +3164,8 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) ext4_fsblk_t ee_pblock; unsigned int ee_len; + KUNIT_STATIC_STUB_REDIRECT(ext4_ext_zeroout, inode, ex); + ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock, @@ -6177,3 +6192,7 @@ int ext4_ext_clear_bb(struct inode *inode) ext4_free_ext_path(path); return 0; } + +#ifdef CONFIG_EXT4_KUNIT_TESTS +#include "extents-test.c" +#endif diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 
fc83e7e2ca9e..6c1faf7c9f2a 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -16,6 +16,7 @@ #include "ext4.h" #include +#include /* * According to previous discussion in Ext4 Developer Workshop, we @@ -1627,6 +1628,8 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, int reserved = 0; struct extent_status *es = NULL; + KUNIT_STATIC_STUB_REDIRECT(ext4_es_remove_extent, inode, lblk, len); + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cf0ffd799501..fafe673e5e17 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -48,6 +48,8 @@ #include "acl.h" #include "truncate.h" +#include + #include static void ext4_journalled_zero_new_buffers(handle_t *handle, @@ -400,6 +402,8 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, { int ret; + KUNIT_STATIC_STUB_REDIRECT(ext4_issue_zeroout, inode, lblk, pblk, len); + if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) return fscrypt_zeroout_range(inode, lblk, pblk, len); From 4dff18488fe22b743d6e2ee32ca4a533ea237454 Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Fri, 23 Jan 2026 11:55:33 +0530 Subject: [PATCH 39/45] ext4: kunit tests for higher level extent manipulation functions Add more kunit tests to cover the high level caller ext4_map_create_blocks(). We pass flags in a manner that covers the below function: 1. ext4_ext_handle_unwritten_extents() 1.1 - Split/Convert unwritten extent to written in endio convtext. 1.2 - Split/Convert unwritten extent to written in non endio context. 1.3 - Zeroout tests for the above 2 cases 2. 
convert_initialized_extent() - Convert written extent to unwritten during zero range Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Signed-off-by: Ojaswin Mujoo Link: https://patch.msgid.link/9d8ad32cb62f44999c0fe3545b44fc3113546c70.1769149131.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 + fs/ext4/extents-test.c | 359 ++++++++++++++++++++++++++++++++++++++- fs/ext4/extents_status.c | 3 + fs/ext4/inode.c | 8 +- 4 files changed, 365 insertions(+), 9 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9610602fe37b..b76966dc06c0 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3804,6 +3804,10 @@ extern int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); +extern int ext4_map_query_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_map_create_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, struct ext4_ext_path *path); diff --git a/fs/ext4/extents-test.c b/fs/ext4/extents-test.c index 15053c607cfd..4030fa5faca5 100644 --- a/fs/ext4/extents-test.c +++ b/fs/ext4/extents-test.c @@ -77,10 +77,18 @@ struct kunit_ext_data_state { ext4_lblk_t len_blk; }; +enum kunit_test_types { + TEST_SPLIT_CONVERT, + TEST_CREATE_BLOCKS, +}; + struct kunit_ext_test_param { /* description of test */ char *desc; + /* determines which function will be tested */ + int type; + /* is extent unwrit at beginning of test */ bool is_unwrit_at_start; @@ -90,6 +98,9 @@ struct kunit_ext_test_param { /* map describing range to split */ struct ext4_map_blocks split_map; + /* disable zeroout */ + bool disable_zeroout; + /* no of extents expected after split */ int nr_exp_ext; @@ -131,6 +142,9 @@ static struct file_system_type ext_fs_type = { static void 
extents_kunit_exit(struct kunit *test) { + struct ext4_sb_info *sbi = k_ctx.k_ei->vfs_inode.i_sb->s_fs_info; + + kfree(sbi); kfree(k_ctx.k_ei); kfree(k_ctx.k_data); } @@ -162,6 +176,13 @@ static void ext4_es_remove_extent_stub(struct inode *inode, ext4_lblk_t lblk, return; } +void ext4_es_insert_extent_stub(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status, bool delalloc_reserve_used) +{ + return; +} + static void ext4_zeroout_es_stub(struct inode *inode, struct ext4_extent *ex) { return; @@ -220,6 +241,7 @@ static int extents_kunit_init(struct kunit *test) struct ext4_inode_info *ei; struct inode *inode; struct super_block *sb; + struct ext4_sb_info *sbi = NULL; struct kunit_ext_test_param *param = (struct kunit_ext_test_param *)(test->param_value); @@ -237,7 +259,20 @@ static int extents_kunit_init(struct kunit *test) sb->s_blocksize = 4096; sb->s_blocksize_bits = 12; - ei->i_disksize = (EXT_DATA_LBLK + EXT_DATA_LEN + 10) << sb->s_blocksize_bits; + sbi = kzalloc(sizeof(struct ext4_sb_info), GFP_KERNEL); + if (sbi == NULL) + return -ENOMEM; + + sbi->s_sb = sb; + sb->s_fs_info = sbi; + + if (!param || !param->disable_zeroout) + sbi->s_extent_max_zeroout_kb = 32; + + ei->i_disksize = (EXT_DATA_LBLK + EXT_DATA_LEN + 10) + << sb->s_blocksize_bits; + ei->i_flags = 0; + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); inode->i_sb = sb; k_ctx.k_data = kzalloc(EXT_DATA_LEN * 4096, GFP_KERNEL); @@ -262,7 +297,9 @@ static int extents_kunit_init(struct kunit *test) eh->eh_generation = 0; /* - * add 1 extent in leaf node covering lblks [10,13) and pblk [100,103) + * add 1 extent in leaf node covering: + * - lblks: [EXT_DATA_LBLK, EXT_DATA_LBLK * + EXT_DATA_LEN) + * - pblks: [EXT_DATA_PBLK, EXT_DATA_PBLK + EXT_DATA_LEN) */ EXT_FIRST_EXTENT(eh)->ee_block = cpu_to_le32(EXT_DATA_LBLK); EXT_FIRST_EXTENT(eh)->ee_len = cpu_to_le16(EXT_DATA_LEN); @@ -277,6 +314,8 @@ static int extents_kunit_init(struct kunit *test) 
__ext4_ext_dirty_stub); kunit_activate_static_stub(test, ext4_es_remove_extent, ext4_es_remove_extent_stub); + kunit_activate_static_stub(test, ext4_es_insert_extent, + ext4_es_insert_extent_stub); kunit_activate_static_stub(test, ext4_zeroout_es, ext4_zeroout_es_stub); kunit_activate_static_stub(test, ext4_ext_zeroout, ext4_ext_zeroout_stub); kunit_activate_static_stub(test, ext4_issue_zeroout, @@ -301,6 +340,30 @@ static int check_buffer(char *buf, int c, int size) return 1; } +/* + * Simulate a map block call by first calling ext4_map_query_blocks() to + * correctly populate map flags and pblk and then call the + * ext4_map_create_blocks() to do actual split and conversion. This is easier + * than calling ext4_map_blocks() because that needs mocking a lot of unrelated + * functions. + */ +static void ext4_map_create_blocks_helper(struct kunit *test, + struct inode *inode, + struct ext4_map_blocks *map, + int flags) +{ + int retval = 0; + + retval = ext4_map_query_blocks(NULL, inode, map, flags); + if (retval < 0) { + KUNIT_FAIL(test, + "ext4_map_query_blocks() failed. 
Cannot proceed\n"); + return; + } + + ext4_map_create_blocks(NULL, inode, map, flags); +} + static void test_split_convert(struct kunit *test) { struct ext4_ext_path *path; @@ -331,8 +394,18 @@ static void test_split_convert(struct kunit *test) map.m_lblk = param->split_map.m_lblk; map.m_len = param->split_map.m_len; - ext4_split_convert_extents(NULL, inode, &map, path, - param->split_flags, NULL); + + switch (param->type) { + case TEST_SPLIT_CONVERT: + path = ext4_split_convert_extents(NULL, inode, &map, path, + param->split_flags, NULL); + break; + case TEST_CREATE_BLOCKS: + ext4_map_create_blocks_helper(test, inode, &map, param->split_flags); + break; + default: + KUNIT_FAIL(test, "param->type %d not support.", param->type); + } path = ext4_find_extent(inode, EXT_DATA_LBLK, NULL, 0); ex = path->p_ext; @@ -386,6 +459,7 @@ static void test_split_convert(struct kunit *test) static const struct kunit_ext_test_param test_split_convert_params[] = { /* unwrit to writ splits */ { .desc = "split unwrit extent to 2 extents and convert 1st half writ", + .type = TEST_SPLIT_CONVERT, .is_unwrit_at_start = 1, .split_flags = EXT4_GET_BLOCKS_CONVERT, .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 }, @@ -398,6 +472,7 @@ static const struct kunit_ext_test_param test_split_convert_params[] = { .is_unwrit = 1 } }, .is_zeroout_test = 0 }, { .desc = "split unwrit extent to 2 extents and convert 2nd half writ", + .type = TEST_SPLIT_CONVERT, .is_unwrit_at_start = 1, .split_flags = EXT4_GET_BLOCKS_CONVERT, .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 }, @@ -410,6 +485,7 @@ static const struct kunit_ext_test_param test_split_convert_params[] = { .is_unwrit = 0 } }, .is_zeroout_test = 0 }, { .desc = "split unwrit extent to 3 extents and convert 2nd half to writ", + .type = TEST_SPLIT_CONVERT, .is_unwrit_at_start = 1, .split_flags = EXT4_GET_BLOCKS_CONVERT, .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 }, @@ -427,6 +503,7 @@ static const 
struct kunit_ext_test_param test_split_convert_params[] = { /* writ to unwrit splits */ { .desc = "split writ extent to 2 extents and convert 1st half unwrit", + .type = TEST_SPLIT_CONVERT, .is_unwrit_at_start = 0, .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 }, @@ -439,6 +516,7 @@ static const struct kunit_ext_test_param test_split_convert_params[] = { .is_unwrit = 0 } }, .is_zeroout_test = 0 }, { .desc = "split writ extent to 2 extents and convert 2nd half unwrit", + .type = TEST_SPLIT_CONVERT, .is_unwrit_at_start = 0, .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 }, @@ -451,6 +529,7 @@ static const struct kunit_ext_test_param test_split_convert_params[] = { .is_unwrit = 1 } }, .is_zeroout_test = 0 }, { .desc = "split writ extent to 3 extents and convert 2nd half to unwrit", + .type = TEST_SPLIT_CONVERT, .is_unwrit_at_start = 0, .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 }, @@ -471,6 +550,7 @@ static const struct kunit_ext_test_param test_split_convert_params[] = { */ /* unwrit to writ splits */ { .desc = "split unwrit extent to 2 extents and convert 1st half writ (zeroout)", + .type = TEST_SPLIT_CONVERT, .is_unwrit_at_start = 1, .split_flags = EXT4_GET_BLOCKS_CONVERT, .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 }, @@ -485,6 +565,7 @@ static const struct kunit_ext_test_param test_split_convert_params[] = { .off_blk = 1, .len_blk = EXT_DATA_LEN - 1 } } }, { .desc = "split unwrit extent to 2 extents and convert 2nd half writ (zeroout)", + .type = TEST_SPLIT_CONVERT, .is_unwrit_at_start = 1, .split_flags = EXT4_GET_BLOCKS_CONVERT, .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 }, @@ -499,6 +580,7 @@ static const struct kunit_ext_test_param test_split_convert_params[] = { .off_blk = 1, .len_blk = EXT_DATA_LEN - 1 } } }, { .desc = 
"split unwrit extent to 3 extents and convert 2nd half writ (zeroout)", + .type = TEST_SPLIT_CONVERT, .is_unwrit_at_start = 1, .split_flags = EXT4_GET_BLOCKS_CONVERT, .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 }, @@ -517,12 +599,257 @@ static const struct kunit_ext_test_param test_split_convert_params[] = { .len_blk = 1 } } }, }; +/* Tests to trigger ext4_ext_map_blocks() -> convert_initialized_extent() */ +static const struct kunit_ext_test_param test_convert_initialized_params[] = { + /* writ to unwrit splits */ + { .desc = "split writ extent to 2 extents and convert 1st half unwrit", + .type = TEST_CREATE_BLOCKS, + .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, + .is_unwrit_at_start = 0, + .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 }, + .nr_exp_ext = 2, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 1 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 1, + .is_unwrit = 0 } }, + .is_zeroout_test = 0 }, + { .desc = "split writ extent to 2 extents and convert 2nd half unwrit", + .type = TEST_CREATE_BLOCKS, + .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, + .is_unwrit_at_start = 0, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 }, + .nr_exp_ext = 2, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 0 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 1, + .is_unwrit = 1 } }, + .is_zeroout_test = 0 }, + { .desc = "split writ extent to 3 extents and convert 2nd half to unwrit", + .type = TEST_CREATE_BLOCKS, + .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, + .is_unwrit_at_start = 0, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 }, + .nr_exp_ext = 3, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 0 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 2, + .is_unwrit = 1 }, + { .ex_lblk = EXT_DATA_LBLK + 1 + (EXT_DATA_LEN - 2), + .ex_len = 1, + 
.is_unwrit = 0 } }, + .is_zeroout_test = 0 }, +}; + +/* Tests to trigger ext4_ext_map_blocks() -> ext4_ext_handle_unwritten_exntents() */ +static const struct kunit_ext_test_param test_handle_unwritten_params[] = { + /* unwrit to writ splits via endio path */ + { .desc = "split unwrit extent to 2 extents and convert 1st half writ (endio)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CONVERT, + .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 }, + .nr_exp_ext = 2, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 0 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 1, + .is_unwrit = 1 } }, + .is_zeroout_test = 0 }, + { .desc = "split unwrit extent to 2 extents and convert 2nd half writ (endio)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CONVERT, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 }, + .nr_exp_ext = 2, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 1 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 1, + .is_unwrit = 0 } }, + .is_zeroout_test = 0 }, + { .desc = "split unwrit extent to 3 extents and convert 2nd half to writ (endio)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CONVERT, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 }, + .nr_exp_ext = 3, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 1 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 2, + .is_unwrit = 0 }, + { .ex_lblk = EXT_DATA_LBLK + 1 + (EXT_DATA_LEN - 2), + .ex_len = 1, + .is_unwrit = 1 } }, + .is_zeroout_test = 0 }, + + /* unwrit to writ splits via non-endio path */ + { .desc = "split unwrit extent to 2 extents and convert 1st half writ (non endio)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CREATE, + .split_map = { 
.m_lblk = EXT_DATA_LBLK, .m_len = 1 }, + .nr_exp_ext = 2, + .disable_zeroout = true, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 0 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 1, + .is_unwrit = 1 } }, + .is_zeroout_test = 0 }, + { .desc = "split unwrit extent to 2 extents and convert 2nd half writ (non endio)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CREATE, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 }, + .nr_exp_ext = 2, + .disable_zeroout = true, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 1 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 1, + .is_unwrit = 0 } }, + .is_zeroout_test = 0 }, + { .desc = "split unwrit extent to 3 extents and convert 2nd half to writ (non endio)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CREATE, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 }, + .nr_exp_ext = 3, + .disable_zeroout = true, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = 1, + .is_unwrit = 1 }, + { .ex_lblk = EXT_DATA_LBLK + 1, + .ex_len = EXT_DATA_LEN - 2, + .is_unwrit = 0 }, + { .ex_lblk = EXT_DATA_LBLK + 1 + (EXT_DATA_LEN - 2), + .ex_len = 1, + .is_unwrit = 1 } }, + .is_zeroout_test = 0 }, + + /* + * ***** zeroout tests ***** + */ + /* unwrit to writ splits (endio)*/ + { .desc = "split unwrit extent to 2 extents and convert 1st half writ (endio, zeroout)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CONVERT, + .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 2, + .exp_data_state = { { .exp_char = 'X', .off_blk = 0, .len_blk = 1 }, + { .exp_char = 0, + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 
1 } } }, + { .desc = "split unwrit extent to 2 extents and convert 2nd half writ (endio, zeroout)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CONVERT, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 2, + .exp_data_state = { { .exp_char = 0, .off_blk = 0, .len_blk = 1 }, + { .exp_char = 'X', + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 1 } } }, + { .desc = "split unwrit extent to 3 extents and convert 2nd half writ (endio, zeroout)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CONVERT, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 3, + .exp_data_state = { { .exp_char = 0, .off_blk = 0, .len_blk = 1 }, + { .exp_char = 'X', + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 2 }, + { .exp_char = 0, + .off_blk = EXT_DATA_LEN - 1, + .len_blk = 1 } } }, + + /* unwrit to writ splits (non-endio)*/ + { .desc = "split unwrit extent to 2 extents and convert 1st half writ (non-endio, zeroout)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CREATE, + .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 2, + .exp_data_state = { { .exp_char = 'X', .off_blk = 0, .len_blk = 1 }, + { .exp_char = 0, + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 1 } } }, + { .desc = "split unwrit extent to 2 extents and convert 2nd half writ (non-endio, zeroout)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 1, + .split_flags = 
EXT4_GET_BLOCKS_CREATE, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 2, + .exp_data_state = { { .exp_char = 0, .off_blk = 0, .len_blk = 1 }, + { .exp_char = 'X', + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 1 } } }, + { .desc = "split unwrit extent to 3 extents and convert 2nd half writ (non-endio, zeroout)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 1, + .split_flags = EXT4_GET_BLOCKS_CREATE, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 3, + .exp_data_state = { { .exp_char = 0, .off_blk = 0, .len_blk = 1 }, + { .exp_char = 'X', + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 2 }, + { .exp_char = 0, + .off_blk = EXT_DATA_LEN - 1, + .len_blk = 1 } } }, +}; + static void ext_get_desc(struct kunit *test, const void *p, char *desc) { struct kunit_ext_test_param *param = (struct kunit_ext_test_param *)p; - snprintf(desc, KUNIT_PARAM_DESC_SIZE, "%s\n", param->desc); + snprintf(desc, KUNIT_PARAM_DESC_SIZE, "%s %s\n", param->desc, + (param->type & TEST_CREATE_BLOCKS) ? 
"(highlevel)" : ""); } static int test_split_convert_param_init(struct kunit *test) @@ -534,6 +861,24 @@ static int test_split_convert_param_init(struct kunit *test) return 0; } +static int test_convert_initialized_param_init(struct kunit *test) +{ + size_t arr_size = ARRAY_SIZE(test_convert_initialized_params); + + kunit_register_params_array(test, test_convert_initialized_params, + arr_size, ext_get_desc); + return 0; +} + +static int test_handle_unwritten_init(struct kunit *test) +{ + size_t arr_size = ARRAY_SIZE(test_handle_unwritten_params); + + kunit_register_params_array(test, test_handle_unwritten_params, + arr_size, ext_get_desc); + return 0; +} + /* * Note that we use KUNIT_CASE_PARAM_WITH_INIT() instead of the more compact * KUNIT_ARRAY_PARAM() because the later currently has a limitation causing the @@ -544,6 +889,10 @@ static int test_split_convert_param_init(struct kunit *test) static struct kunit_case extents_test_cases[] = { KUNIT_CASE_PARAM_WITH_INIT(test_split_convert, kunit_array_gen_params, test_split_convert_param_init, NULL), + KUNIT_CASE_PARAM_WITH_INIT(test_split_convert, kunit_array_gen_params, + test_convert_initialized_param_init, NULL), + KUNIT_CASE_PARAM_WITH_INIT(test_split_convert, kunit_array_gen_params, + test_handle_unwritten_init, NULL), {} }; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 6c1faf7c9f2a..095ccb7ba4ba 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -916,6 +916,9 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, struct pending_reservation *pr = NULL; bool revise_pending = false; + KUNIT_STATIC_STUB_REDIRECT(ext4_es_insert_extent, inode, lblk, len, + pblk, status, delalloc_reserve_used); + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fafe673e5e17..15ba4d42982f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -541,8 +541,8 @@ static int 
ext4_map_query_blocks_next_in_leaf(handle_t *handle, return map->m_len; } -static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map, int flags) +int ext4_map_query_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) { unsigned int status; int retval; @@ -588,8 +588,8 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, return retval; } -static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map, int flags) +int ext4_map_create_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) { unsigned int status; int err, retval = 0; From 82f80e2e3b23ef7ecdef0a494f7f310a1b1702e4 Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Fri, 23 Jan 2026 11:55:34 +0530 Subject: [PATCH 40/45] ext4: add extent status cache support to kunit tests Add support in Kunit tests to ensure that the extent status cache is also in sync after the extent split and conversion operations. 
Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Signed-off-by: Ojaswin Mujoo Link: https://patch.msgid.link/5f9d2668feeb89a3f3e9d03dadab8c10cbea3741.1769149131.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents-test.c | 103 ++++++++++++++++++++++++--------------- fs/ext4/extents.c | 2 - fs/ext4/extents_status.c | 5 -- 3 files changed, 63 insertions(+), 47 deletions(-) diff --git a/fs/ext4/extents-test.c b/fs/ext4/extents-test.c index 4030fa5faca5..3176bf7686b5 100644 --- a/fs/ext4/extents-test.c +++ b/fs/ext4/extents-test.c @@ -149,12 +149,6 @@ static void extents_kunit_exit(struct kunit *test) kfree(k_ctx.k_data); } -static void ext4_cache_extents_stub(struct inode *inode, - struct ext4_extent_header *eh) -{ - return; -} - static int __ext4_ext_dirty_stub(const char *where, unsigned int line, handle_t *handle, struct inode *inode, struct ext4_ext_path *path) @@ -170,24 +164,6 @@ ext4_ext_insert_extent_stub(handle_t *handle, struct inode *inode, return ERR_PTR(-ENOSPC); } -static void ext4_es_remove_extent_stub(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len) -{ - return; -} - -void ext4_es_insert_extent_stub(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len, ext4_fsblk_t pblk, - unsigned int status, bool delalloc_reserve_used) -{ - return; -} - -static void ext4_zeroout_es_stub(struct inode *inode, struct ext4_extent *ex) -{ - return; -} - /* * We will zeroout the equivalent range in the data area */ @@ -244,13 +220,7 @@ static int extents_kunit_init(struct kunit *test) struct ext4_sb_info *sbi = NULL; struct kunit_ext_test_param *param = (struct kunit_ext_test_param *)(test->param_value); - - /* setup the mock inode */ - k_ctx.k_ei = kzalloc(sizeof(struct ext4_inode_info), GFP_KERNEL); - if (k_ctx.k_ei == NULL) - return -ENOMEM; - ei = k_ctx.k_ei; - inode = &ei->vfs_inode; + int err; sb = sget(&ext_fs_type, NULL, ext_set, 0, NULL); if (IS_ERR(sb)) @@ -269,6 +239,24 @@ static int extents_kunit_init(struct kunit *test) if (!param 
|| !param->disable_zeroout) sbi->s_extent_max_zeroout_kb = 32; + /* setup the mock inode */ + k_ctx.k_ei = kzalloc(sizeof(struct ext4_inode_info), GFP_KERNEL); + if (k_ctx.k_ei == NULL) + return -ENOMEM; + ei = k_ctx.k_ei; + inode = &ei->vfs_inode; + + err = ext4_es_register_shrinker(sbi); + if (err) + return err; + + ext4_es_init_tree(&ei->i_es_tree); + rwlock_init(&ei->i_es_lock); + INIT_LIST_HEAD(&ei->i_es_list); + ei->i_es_all_nr = 0; + ei->i_es_shk_nr = 0; + ei->i_es_shrink_lblk = 0; + ei->i_disksize = (EXT_DATA_LBLK + EXT_DATA_LEN + 10) << sb->s_blocksize_bits; ei->i_flags = 0; @@ -307,16 +295,15 @@ static int extents_kunit_init(struct kunit *test) if (!param || param->is_unwrit_at_start) ext4_ext_mark_unwritten(EXT_FIRST_EXTENT(eh)); + ext4_es_insert_extent(inode, EXT_DATA_LBLK, EXT_DATA_LEN, EXT_DATA_PBLK, + ext4_ext_is_unwritten(EXT_FIRST_EXTENT(eh)) ? + EXTENT_STATUS_UNWRITTEN : + EXTENT_STATUS_WRITTEN, + 0); + /* Add stubs */ - kunit_activate_static_stub(test, ext4_cache_extents, - ext4_cache_extents_stub); kunit_activate_static_stub(test, __ext4_ext_dirty, __ext4_ext_dirty_stub); - kunit_activate_static_stub(test, ext4_es_remove_extent, - ext4_es_remove_extent_stub); - kunit_activate_static_stub(test, ext4_es_insert_extent, - ext4_es_insert_extent_stub); - kunit_activate_static_stub(test, ext4_zeroout_es, ext4_zeroout_es_stub); kunit_activate_static_stub(test, ext4_ext_zeroout, ext4_ext_zeroout_stub); kunit_activate_static_stub(test, ext4_issue_zeroout, ext4_issue_zeroout_stub); @@ -381,7 +368,7 @@ static void test_split_convert(struct kunit *test) kunit_activate_static_stub(test, ext4_ext_insert_extent, ext4_ext_insert_extent_stub); - path = ext4_find_extent(inode, EXT_DATA_LBLK, NULL, 0); + path = ext4_find_extent(inode, EXT_DATA_LBLK, NULL, EXT4_EX_NOCACHE); ex = path->p_ext; KUNIT_EXPECT_EQ(test, EXT_DATA_LBLK, le32_to_cpu(ex->ee_block)); KUNIT_EXPECT_EQ(test, EXT_DATA_LEN, ext4_ext_get_actual_len(ex)); @@ -407,11 +394,14 @@ static void 
test_split_convert(struct kunit *test) KUNIT_FAIL(test, "param->type %d not support.", param->type); } - path = ext4_find_extent(inode, EXT_DATA_LBLK, NULL, 0); + path = ext4_find_extent(inode, EXT_DATA_LBLK, NULL, EXT4_EX_NOCACHE); ex = path->p_ext; for (int i = 0; i < param->nr_exp_ext; i++) { struct kunit_ext_state exp_ext = param->exp_ext_state[i]; + bool es_check_needed = param->type != TEST_SPLIT_CONVERT; + struct extent_status es; + int contains_ex, ex_end, es_end, es_pblk; KUNIT_EXPECT_EQ(test, exp_ext.ex_lblk, le32_to_cpu(ex->ee_block)); @@ -419,6 +409,33 @@ static void test_split_convert(struct kunit *test) ext4_ext_get_actual_len(ex)); KUNIT_EXPECT_EQ(test, exp_ext.is_unwrit, ext4_ext_is_unwritten(ex)); + /* + * Confirm extent cache is in sync. Note that es cache can be + * merged even when on-disk extents are not so take that into + * account. + * + * Also, ext4_split_convert_extents() forces EXT4_EX_NOCACHE hence + * es status are ignored for that case. + */ + if (es_check_needed) { + ext4_es_lookup_extent(inode, le32_to_cpu(ex->ee_block), + NULL, &es, NULL); + + ex_end = exp_ext.ex_lblk + exp_ext.ex_len; + es_end = es.es_lblk + es.es_len; + contains_ex = es.es_lblk <= exp_ext.ex_lblk && + es_end >= ex_end; + es_pblk = ext4_es_pblock(&es) + + (exp_ext.ex_lblk - es.es_lblk); + + KUNIT_EXPECT_EQ(test, contains_ex, 1); + KUNIT_EXPECT_EQ(test, ext4_ext_pblock(ex), es_pblk); + KUNIT_EXPECT_EQ(test, 1, + (exp_ext.is_unwrit && + ext4_es_is_unwritten(&es)) || + (!exp_ext.is_unwrit && + ext4_es_is_written(&es))); + } /* Only printed on failure */ kunit_log(KERN_INFO, test, @@ -429,6 +446,12 @@ static void test_split_convert(struct kunit *test) le32_to_cpu(ex->ee_block), ext4_ext_get_actual_len(ex), ext4_ext_is_unwritten(ex)); + if (es_check_needed) + kunit_log( + KERN_INFO, test, + "# [extent %d] es: lblk:%d len:%d pblk:%lld type:0x%x\n", + i, es.es_lblk, es.es_len, ext4_es_pblock(&es), + ext4_es_type(&es)); kunit_log(KERN_INFO, test, "------------------\n"); 
ex = ex + 1; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 08aa36854c11..16326d7f09b9 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3145,8 +3145,6 @@ static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) ext4_fsblk_t ee_pblock; unsigned int ee_len; - KUNIT_STATIC_STUB_REDIRECT(ext4_zeroout_es, inode, ex); - ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 095ccb7ba4ba..a1538bac51c6 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -916,9 +916,6 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, struct pending_reservation *pr = NULL; bool revise_pending = false; - KUNIT_STATIC_STUB_REDIRECT(ext4_es_insert_extent, inode, lblk, len, - pblk, status, delalloc_reserve_used); - if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; @@ -1631,8 +1628,6 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, int reserved = 0; struct extent_status *es = NULL; - KUNIT_STATIC_STUB_REDIRECT(ext4_es_remove_extent, inode, lblk, len); - if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; From 3fffa44b6ebf65be92a562a5063303979385a1c9 Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Fri, 23 Jan 2026 11:55:35 +0530 Subject: [PATCH 41/45] ext4: propagate flags to convert_initialized_extent() Currently, ext4_zero_range passes EXT4_EX_NOCACHE flag to avoid caching extents however this is not respected by convert_initialized_extent(). Hence, modify it to accept flags from the caller and to pass the flags on to other extent manipulation functions it calls. This makes sure the NOCACHE flag is respected throughout the code path. Also, we no longer explicitly pass CONVERT_UNWRITTEN as the caller takes care of this. 
Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Signed-off-by: Ojaswin Mujoo Link: https://patch.msgid.link/07008fbb14db727fddcaf4c30e2346c49f6c8fe0.1769149131.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 16326d7f09b9..2747af91e78e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3840,6 +3840,7 @@ static struct ext4_ext_path * convert_initialized_extent(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path, + int flags, unsigned int *allocated) { struct ext4_extent *ex; @@ -3865,11 +3866,11 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, if (ee_block != map->m_lblk || ee_len > map->m_len) { path = ext4_split_convert_extents(handle, inode, map, path, - EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, NULL); + flags, NULL); if (IS_ERR(path)) return path; - path = ext4_find_extent(inode, map->m_lblk, path, 0); + path = ext4_find_extent(inode, map->m_lblk, path, flags); if (IS_ERR(path)) return path; depth = ext_depth(inode); @@ -4259,7 +4260,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if ((!ext4_ext_is_unwritten(ex)) && (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { path = convert_initialized_extent(handle, - inode, map, path, &allocated); + inode, map, path, flags, &allocated); if (IS_ERR(path)) err = PTR_ERR(path); goto out; From 6066990c99c48d8955eb4b467c11e14daa8d5ec4 Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Fri, 23 Jan 2026 11:55:36 +0530 Subject: [PATCH 42/45] ext4: propagate flags to ext4_convert_unwritten_extents_endio() Currently, callers like ext4_convert_unwritten_extents() pass EXT4_EX_NOCACHE flag to avoid caching extents however this is not respected by ext4_convert_unwritten_extents_endio(). Hence, modify it to accept flags from the caller and to pass the flags on to other extent manipulation functions it calls. 
This makes sure the NOCACHE flag is respected throughout the code path. Also, since the caller already passes METADATA_NOFAIL and CONVERT flags we don't need to explicitly pass it anymore. Reviewed-by: Jan Kara Reviewed-by: Zhang Yi Signed-off-by: Ojaswin Mujoo Link: https://patch.msgid.link/7c2139e0ad32c49c19b194f72219e15d613de284.1769149131.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2747af91e78e..20939b5526b8 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3780,7 +3780,7 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle, static struct ext4_ext_path * ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path *path) + struct ext4_ext_path *path, int flags) { struct ext4_extent *ex; ext4_lblk_t ee_block; @@ -3797,15 +3797,12 @@ ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, (unsigned long long)ee_block, ee_len); if (ee_block != map->m_lblk || ee_len > map->m_len) { - int flags = EXT4_GET_BLOCKS_CONVERT | - EXT4_GET_BLOCKS_METADATA_NOFAIL; - path = ext4_split_convert_extents(handle, inode, map, path, flags, NULL); if (IS_ERR(path)) return path; - path = ext4_find_extent(inode, map->m_lblk, path, 0); + path = ext4_find_extent(inode, map->m_lblk, path, flags); if (IS_ERR(path)) return path; depth = ext_depth(inode); @@ -3938,7 +3935,7 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { path = ext4_convert_unwritten_extents_endio(handle, inode, - map, path); + map, path, flags); if (IS_ERR(path)) return path; ext4_update_inode_fsync_trans(handle, inode, 1); From a985e07c264552d0aa7c019ca54d075414c56620 Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Fri, 23 
Jan 2026 11:55:37 +0530 Subject: [PATCH 43/45] ext4: refactor zeroout path and handle all cases Currently, zeroout is used as a fallback in case we fail to split/convert extents in the "traditional" modify-the-extent-tree way. This is essential to mitigate failures in critical paths like extent splitting during endio. However, the logic is very messy and not easy to follow. Further, the fragile use of various flags has made it prone to errors. Refactor zeroout out logic by moving it up to ext4_split_extents(). Further, zeroout correctly based on the type of conversion we want, ie: - unwritten to written: Zeroout everything around the mapped range. - written to unwritten: Zeroout only the mapped range. Also, ext4_ext_convert_to_initialized() now passes EXT4_GET_BLOCKS_CONVERT to make the intention clear. Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Signed-off-by: Ojaswin Mujoo Link: https://patch.msgid.link/e1b51dedeca7c0b1f702141d91edfe4230560e7b.1769149131.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 285 ++++++++++++++++++++++++++++++---------------- 1 file changed, 186 insertions(+), 99 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 20939b5526b8..8d709bfd299b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -44,14 +44,6 @@ #define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */ #define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */ -/* first half contains valid data */ -#define EXT4_EXT_DATA_ENTIRE_VALID1 0x8 /* has entirely valid data */ -#define EXT4_EXT_DATA_PARTIAL_VALID1 0x10 /* has partially valid data */ -#define EXT4_EXT_DATA_VALID1 (EXT4_EXT_DATA_ENTIRE_VALID1 | \ - EXT4_EXT_DATA_PARTIAL_VALID1) - -#define EXT4_EXT_DATA_VALID2 0x20 /* second half contains valid data */ - static __le32 ext4_extent_block_csum(struct inode *inode, struct ext4_extent_header *eh) { @@ -3189,7 +3181,8 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) * a> the extent 
are splitted into two extent. * b> split is not needed, and just mark the extent. * - * Return an extent path pointer on success, or an error pointer on failure. + * Return an extent path pointer on success, or an error pointer on failure. On + * failure, the extent is restored to original state. */ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, struct inode *inode, @@ -3199,14 +3192,10 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, { ext4_fsblk_t newblock; ext4_lblk_t ee_block; - struct ext4_extent *ex, newex, orig_ex, zero_ex; + struct ext4_extent *ex, newex, orig_ex; struct ext4_extent *ex2 = NULL; unsigned int ee_len, depth; - int err = 0; - - BUG_ON((split_flag & EXT4_EXT_DATA_VALID1) == EXT4_EXT_DATA_VALID1); - BUG_ON((split_flag & EXT4_EXT_DATA_VALID1) && - (split_flag & EXT4_EXT_DATA_VALID2)); + int err = 0, insert_err = 0; /* Do not cache extents that are in the process of being modified. */ flags |= EXT4_EX_NOCACHE; @@ -3272,11 +3261,10 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, path = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (!IS_ERR(path)) - goto out; + return path; - err = PTR_ERR(path); - if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM) - goto out_path; + insert_err = PTR_ERR(path); + err = 0; /* * Get a new path to try to zeroout or fix the extent length. 
@@ -3292,65 +3280,21 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, split, PTR_ERR(path)); goto out_path; } + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + depth = ext_depth(inode); ex = path[depth].p_ext; - if (EXT4_EXT_MAY_ZEROOUT & split_flag) { - if (split_flag & EXT4_EXT_DATA_VALID1) - memcpy(&zero_ex, ex2, sizeof(zero_ex)); - else if (split_flag & EXT4_EXT_DATA_VALID2) - memcpy(&zero_ex, ex, sizeof(zero_ex)); - else - memcpy(&zero_ex, &orig_ex, sizeof(zero_ex)); - ext4_ext_mark_initialized(&zero_ex); - - err = ext4_ext_zeroout(inode, &zero_ex); - if (err) - goto fix_extent_len; - - /* - * The first half contains partially valid data, the splitting - * of this extent has not been completed, fix extent length - * and ext4_split_extent() split will the first half again. - */ - if (split_flag & EXT4_EXT_DATA_PARTIAL_VALID1) { - /* - * Drop extent cache to prevent stale unwritten - * extents remaining after zeroing out. - */ - ext4_es_remove_extent(inode, - le32_to_cpu(zero_ex.ee_block), - ext4_ext_get_actual_len(&zero_ex)); - goto fix_extent_len; - } - - /* update the extent length and mark as initialized */ - ex->ee_len = cpu_to_le16(ee_len); - ext4_ext_try_to_merge(handle, inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + path->p_depth); - if (!err) - /* update extent status tree */ - ext4_zeroout_es(inode, &zero_ex); - /* - * If we failed at this point, we don't know in which - * state the extent tree exactly is so don't try to fix - * length of the original extent as it may do even more - * damage. - */ - goto out; - } - fix_extent_len: ex->ee_len = orig_ex.ee_len; - /* - * Ignore ext4_ext_dirty return value since we are already in error path - * and err is a non-zero error code. 
- */ - ext4_ext_dirty(handle, inode, path + path->p_depth); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); out: - if (err) { + if (err || insert_err) { ext4_free_ext_path(path); - path = ERR_PTR(err); + path = err ? ERR_PTR(err) : ERR_PTR(insert_err); } out_path: if (IS_ERR(path)) @@ -3360,6 +3304,108 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, return path; } +static int ext4_split_extent_zeroout(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_map_blocks *map, int flags) +{ + struct ext4_extent *ex; + unsigned int ee_len, depth; + ext4_lblk_t ee_block; + uint64_t lblk, pblk, len; + int is_unwrit; + int err = 0; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + is_unwrit = ext4_ext_is_unwritten(ex); + + if (flags & EXT4_GET_BLOCKS_CONVERT) { + /* + * EXT4_GET_BLOCKS_CONVERT: Caller wants the range specified by + * map to be initialized. Zeroout everything except the map + * range. + */ + + loff_t map_end = (loff_t) map->m_lblk + map->m_len; + loff_t ex_end = (loff_t) ee_block + ee_len; + + if (!is_unwrit) + /* Shouldn't happen. Just exit */ + return -EINVAL; + + /* zeroout left */ + if (map->m_lblk > ee_block) { + lblk = ee_block; + len = map->m_lblk - ee_block; + pblk = ext4_ext_pblock(ex); + err = ext4_issue_zeroout(inode, lblk, pblk, len); + if (err) + /* ZEROOUT failed, just return original error */ + return err; + } + + /* zeroout right */ + if (map_end < ex_end) { + lblk = map_end; + len = ex_end - map_end; + pblk = ext4_ext_pblock(ex) + (map_end - ee_block); + err = ext4_issue_zeroout(inode, lblk, pblk, len); + if (err) + /* ZEROOUT failed, just return original error */ + return err; + } + } else if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { + /* + * EXT4_GET_BLOCKS_CONVERT_UNWRITTEN: Caller wants the + * range specified by map to be marked unwritten. 
+ * Zeroout the map range leaving rest as it is. + */ + + if (is_unwrit) + /* Shouldn't happen. Just exit */ + return -EINVAL; + + lblk = map->m_lblk; + len = map->m_len; + pblk = ext4_ext_pblock(ex) + (map->m_lblk - ee_block); + err = ext4_issue_zeroout(inode, lblk, pblk, len); + if (err) + /* ZEROOUT failed, just return original error */ + return err; + } else { + /* + * We no longer perform unwritten to unwritten splits in IO paths. + * Hence this should not happen. + */ + WARN_ON_ONCE(true); + return -EINVAL; + } + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + return err; + + ext4_ext_mark_initialized(ex); + + ext4_ext_dirty(handle, inode, path + path->p_depth); + if (err) + return err; + + /* + * The whole extent is initialized and stable now so it can be added to + * es cache + */ + if (!(flags & EXT4_EX_NOCACHE)) + ext4_es_insert_extent(inode, le32_to_cpu(ex->ee_block), + ext4_ext_get_actual_len(ex), + ext4_ext_pblock(ex), + EXTENT_STATUS_WRITTEN, false); + + return 0; +} + /* * ext4_split_extent() splits an extent and mark extent which is covered * by @map as split_flags indicates @@ -3378,11 +3424,13 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle, int split_flag, int flags, unsigned int *allocated) { - ext4_lblk_t ee_block; + ext4_lblk_t ee_block, orig_ee_block; struct ext4_extent *ex; - unsigned int ee_len, depth; - int unwritten; - int split_flag1, flags1; + unsigned int ee_len, orig_ee_len, depth; + int unwritten, orig_unwritten; + int split_flag1 = 0, flags1 = 0; + int orig_err = 0; + int orig_flags = flags; depth = ext_depth(inode); ex = path[depth].p_ext; @@ -3390,30 +3438,31 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle, ee_len = ext4_ext_get_actual_len(ex); unwritten = ext4_ext_is_unwritten(ex); + orig_ee_block = ee_block; + orig_ee_len = ee_len; + orig_unwritten = unwritten; + /* Do not cache extents that are in the process of being modified. 
*/ flags |= EXT4_EX_NOCACHE; if (map->m_lblk + map->m_len < ee_block + ee_len) { - split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; flags1 = flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE; if (unwritten) split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2; - if (split_flag & EXT4_EXT_DATA_VALID2) - split_flag1 |= map->m_lblk > ee_block ? - EXT4_EXT_DATA_PARTIAL_VALID1 : - EXT4_EXT_DATA_ENTIRE_VALID1; path = ext4_split_extent_at(handle, inode, path, map->m_lblk + map->m_len, split_flag1, flags1); if (IS_ERR(path)) - return path; + goto try_zeroout; + /* * Update path is required because previous ext4_split_extent_at * may result in split of original leaf or extent zeroout. */ path = ext4_find_extent(inode, map->m_lblk, path, flags); if (IS_ERR(path)) - return path; + goto try_zeroout; + depth = ext_depth(inode); ex = path[depth].p_ext; if (!ex) { @@ -3422,22 +3471,61 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle, ext4_free_ext_path(path); return ERR_PTR(-EFSCORRUPTED); } - unwritten = ext4_ext_is_unwritten(ex); + + /* extent would have changed so update original values */ + orig_ee_block = le32_to_cpu(ex->ee_block); + orig_ee_len = ext4_ext_get_actual_len(ex); + orig_unwritten = ext4_ext_is_unwritten(ex); } if (map->m_lblk >= ee_block) { - split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; + split_flag1 = 0; if (unwritten) { split_flag1 |= EXT4_EXT_MARK_UNWRIT1; - split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | - EXT4_EXT_MARK_UNWRIT2); + split_flag1 |= split_flag & EXT4_EXT_MARK_UNWRIT2; } - path = ext4_split_extent_at(handle, inode, path, - map->m_lblk, split_flag1, flags); + path = ext4_split_extent_at(handle, inode, path, map->m_lblk, + split_flag1, flags); if (IS_ERR(path)) - return path; + goto try_zeroout; } + goto success; + +try_zeroout: + /* + * There was an error in splitting the extent. So instead, just zeroout + * unwritten portions and convert it to initialized as a last resort. 
If + * there is any failure here we just return the original error + */ + + orig_err = PTR_ERR(path); + if (orig_err != -ENOSPC && orig_err != -EDQUOT && orig_err != -ENOMEM) + goto out_orig_err; + + /* we can't zeroout? just return the original err */ + if (!(split_flag & EXT4_EXT_MAY_ZEROOUT)) + goto out_orig_err; + + path = ext4_find_extent(inode, map->m_lblk, NULL, flags); + if (IS_ERR(path)) + goto out_orig_err; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + unwritten = ext4_ext_is_unwritten(ex); + + /* extent to zeroout should have been unchanged but its not */ + if (WARN_ON(ee_block != orig_ee_block || ee_len != orig_ee_len || + unwritten != orig_unwritten)) + goto out_free_path; + + if (ext4_split_extent_zeroout(handle, inode, path, map, orig_flags)) + goto out_free_path; + +success: if (allocated) { if (map->m_lblk + map->m_len > ee_block + ee_len) *allocated = ee_len - (map->m_lblk - ee_block); @@ -3446,6 +3534,12 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle, } ext4_ext_show_leaf(inode, path); return path; + +out_free_path: + ext4_free_ext_path(path); +out_orig_err: + return ERR_PTR(orig_err); + } /* @@ -3481,7 +3575,7 @@ ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, ext4_lblk_t ee_block, eof_block; unsigned int ee_len, depth, map_len = map->m_len; int err = 0; - int split_flag = EXT4_EXT_DATA_VALID2; + int split_flag = 0; unsigned int max_zeroout = 0; ext_debug(inode, "logical block %llu, max_blocks %u\n", @@ -3691,7 +3785,7 @@ ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, fallback: path = ext4_split_extent(handle, inode, path, &split_map, split_flag, - flags, NULL); + flags | EXT4_GET_BLOCKS_CONVERT, NULL); if (IS_ERR(path)) return path; out: @@ -3755,11 +3849,7 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle, ee_block = le32_to_cpu(ex->ee_block); ee_len = 
ext4_ext_get_actual_len(ex); - /* Convert to unwritten */ - if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { - split_flag |= EXT4_EXT_DATA_ENTIRE_VALID1; - /* Split the existing unwritten extent */ - } else if (flags & (EXT4_GET_BLOCKS_UNWRIT_EXT | + if (flags & (EXT4_GET_BLOCKS_UNWRIT_EXT | EXT4_GET_BLOCKS_CONVERT)) { /* * It is safe to convert extent to initialized via explicit @@ -3768,9 +3858,6 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle, split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; split_flag |= EXT4_EXT_MARK_UNWRIT2; - /* Convert to initialized */ - if (flags & EXT4_GET_BLOCKS_CONVERT) - split_flag |= EXT4_EXT_DATA_VALID2; } flags |= EXT4_GET_BLOCKS_SPLIT_NOMERGE; return ext4_split_extent(handle, inode, path, map, split_flag, flags, From 716b9c23b86240d419a43c1f211c628ac04acb8f Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Fri, 23 Jan 2026 11:55:38 +0530 Subject: [PATCH 44/45] ext4: refactor split and convert extents ext4_split_convert_extents() has been historically prone to subtle bugs and inconsistent behavior due to the way all the various flags interact with the extent split and conversion process. For example, callers like ext4_convert_unwritten_extents_endio() and convert_initialized_extents() needed to open code extent conversion despite passing CONVERT or CONVERT_UNWRITTEN flags because ext4_split_convert_extents() wasn't performing the conversion. Hence, refactor ext4_split_convert_extents() to clearly enforce the semantics of each flag. The major changes here are: * Clearly separate the split and convert process: * ext4_split_extent() and ext4_split_extent_at() are now only responsible to perform the split. * ext4_split_convert_extents() is now responsible to perform extent conversion after calling ext4_split_extent() for splitting. * This helps get rid of all the MARK_UNWRIT* flags. 
* Clearly enforce the semantics of flags passed to ext4_split_convert_extents(): * EXT4_GET_BLOCKS_CONVERT: Will convert the split extent to written * EXT4_GET_BLOCKS_CONVERT_UNWRITTEN: Will convert the split extent to unwritten * Modify all callers to enforce the above semantics. * Use ext4_split_convert_extents() instead of ext4_split_extents() in ext4_ext_convert_to_initialized() for uniformity. * Now that ext4_split_convert_extents() is handling caching to es, we dont need to do it in ext4_split_extent_zeroout(). * Cleanup all callers open coding the conversion logic. Further, modify kuniy tests to pass flags based on the new semantics. >From an end user point of view, we should not see any changes in behavior of ext4. Reviewed-by: Jan Kara Reviewed-by: Zhang Yi Signed-off-by: Ojaswin Mujoo Link: https://patch.msgid.link/2084a383d69ceefbaa293b8fcf725365eca0a349.1769149131.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 277 +++++++++++++++++++--------------------------- 1 file changed, 112 insertions(+), 165 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 8d709bfd299b..14f38b3cda27 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -41,8 +41,9 @@ */ #define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ due to ENOSPC */ -#define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */ -#define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */ +static struct ext4_ext_path *ext4_split_convert_extents( + handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, + struct ext4_ext_path *path, int flags, unsigned int *allocated); static __le32 ext4_extent_block_csum(struct inode *inode, struct ext4_extent_header *eh) @@ -84,8 +85,7 @@ static void ext4_extent_block_csum_set(struct inode *inode, static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - ext4_lblk_t split, - int split_flag, int flags); + ext4_lblk_t 
split, int flags); static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) { @@ -333,15 +333,12 @@ ext4_force_split_extent_at(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t lblk, int nofail) { - int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext); int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_SPLIT_NOMERGE; if (nofail) flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL; - return ext4_split_extent_at(handle, inode, path, lblk, unwritten ? - EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0, - flags); + return ext4_split_extent_at(handle, inode, path, lblk, flags); } static int @@ -3169,17 +3166,11 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) * @inode: the file inode * @path: the path to the extent * @split: the logical block where the extent is splitted. - * @split_flags: indicates if the extent could be zeroout if split fails, and - * the states(init or unwritten) of new extents. * @flags: flags used to insert new extent to extent tree. * * * Splits extent [a, b] into two extents [a, @split) and [@split, b], states - * of which are determined by split_flag. - * - * There are two cases: - * a> the extent are splitted into two extent. - * b> split is not needed, and just mark the extent. + * of which are same as the original extent. No conversion is performed. * * Return an extent path pointer on success, or an error pointer on failure. On * failure, the extent is restored to original state. 
@@ -3188,14 +3179,14 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t split, - int split_flag, int flags) + int flags) { ext4_fsblk_t newblock; ext4_lblk_t ee_block; struct ext4_extent *ex, newex, orig_ex; struct ext4_extent *ex2 = NULL; unsigned int ee_len, depth; - int err = 0, insert_err = 0; + int err = 0, insert_err = 0, is_unwrit = 0; /* Do not cache extents that are in the process of being modified. */ flags |= EXT4_EX_NOCACHE; @@ -3209,39 +3200,24 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); newblock = split - ee_block + ext4_ext_pblock(ex); + is_unwrit = ext4_ext_is_unwritten(ex); BUG_ON(split < ee_block || split >= (ee_block + ee_len)); - BUG_ON(!ext4_ext_is_unwritten(ex) && - split_flag & (EXT4_EXT_MAY_ZEROOUT | - EXT4_EXT_MARK_UNWRIT1 | - EXT4_EXT_MARK_UNWRIT2)); + + /* + * No split needed + */ + if (split == ee_block) + goto out; err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; - if (split == ee_block) { - /* - * case b: block @split is the block that the extent begins with - * then we just change the state of the extent, and splitting - * is not needed. 
- */ - if (split_flag & EXT4_EXT_MARK_UNWRIT2) - ext4_ext_mark_unwritten(ex); - else - ext4_ext_mark_initialized(ex); - - if (!(flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) - ext4_ext_try_to_merge(handle, inode, path, ex); - - err = ext4_ext_dirty(handle, inode, path + path->p_depth); - goto out; - } - /* case a */ memcpy(&orig_ex, ex, sizeof(orig_ex)); ex->ee_len = cpu_to_le16(split - ee_block); - if (split_flag & EXT4_EXT_MARK_UNWRIT1) + if (is_unwrit) ext4_ext_mark_unwritten(ex); /* @@ -3256,7 +3232,7 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, ex2->ee_block = cpu_to_le32(split); ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); ext4_ext_store_pblock(ex2, newblock); - if (split_flag & EXT4_EXT_MARK_UNWRIT2) + if (is_unwrit) ext4_ext_mark_unwritten(ex2); path = ext4_ext_insert_extent(handle, inode, path, &newex, flags); @@ -3389,20 +3365,10 @@ static int ext4_split_extent_zeroout(handle_t *handle, struct inode *inode, ext4_ext_mark_initialized(ex); - ext4_ext_dirty(handle, inode, path + path->p_depth); + ext4_ext_dirty(handle, inode, path + depth); if (err) return err; - /* - * The whole extent is initialized and stable now so it can be added to - * es cache - */ - if (!(flags & EXT4_EX_NOCACHE)) - ext4_es_insert_extent(inode, le32_to_cpu(ex->ee_block), - ext4_ext_get_actual_len(ex), - ext4_ext_pblock(ex), - EXTENT_STATUS_WRITTEN, false); - return 0; } @@ -3422,15 +3388,13 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle, struct ext4_ext_path *path, struct ext4_map_blocks *map, int split_flag, int flags, - unsigned int *allocated) + unsigned int *allocated, bool *did_zeroout) { ext4_lblk_t ee_block, orig_ee_block; struct ext4_extent *ex; unsigned int ee_len, orig_ee_len, depth; int unwritten, orig_unwritten; - int split_flag1 = 0, flags1 = 0; int orig_err = 0; - int orig_flags = flags; depth = ext_depth(inode); ex = path[depth].p_ext; @@ -3446,12 +3410,8 @@ static struct ext4_ext_path *ext4_split_extent(handle_t 
*handle, flags |= EXT4_EX_NOCACHE; if (map->m_lblk + map->m_len < ee_block + ee_len) { - flags1 = flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE; - if (unwritten) - split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | - EXT4_EXT_MARK_UNWRIT2; path = ext4_split_extent_at(handle, inode, path, - map->m_lblk + map->m_len, split_flag1, flags1); + map->m_lblk + map->m_len, flags); if (IS_ERR(path)) goto try_zeroout; @@ -3479,13 +3439,8 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle, } if (map->m_lblk >= ee_block) { - split_flag1 = 0; - if (unwritten) { - split_flag1 |= EXT4_EXT_MARK_UNWRIT1; - split_flag1 |= split_flag & EXT4_EXT_MARK_UNWRIT2; - } path = ext4_split_extent_at(handle, inode, path, map->m_lblk, - split_flag1, flags); + flags); if (IS_ERR(path)) goto try_zeroout; } @@ -3522,9 +3477,13 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle, unwritten != orig_unwritten)) goto out_free_path; - if (ext4_split_extent_zeroout(handle, inode, path, map, orig_flags)) + if (ext4_split_extent_zeroout(handle, inode, path, map, flags)) goto out_free_path; + /* zeroout succeeded */ + if (did_zeroout) + *did_zeroout = true; + success: if (allocated) { if (map->m_lblk + map->m_len > ee_block + ee_len) @@ -3575,7 +3534,6 @@ ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, ext4_lblk_t ee_block, eof_block; unsigned int ee_len, depth, map_len = map->m_len; int err = 0; - int split_flag = 0; unsigned int max_zeroout = 0; ext_debug(inode, "logical block %llu, max_blocks %u\n", @@ -3727,9 +3685,7 @@ ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, * It is safe to convert extent to initialized via explicit * zeroout only if extent is fully inside i_size or new_size. */ - split_flag |= ee_block + ee_len <= eof_block ? 
EXT4_EXT_MAY_ZEROOUT : 0; - - if (EXT4_EXT_MAY_ZEROOUT & split_flag) + if (ee_block + ee_len <= eof_block) max_zeroout = sbi->s_extent_max_zeroout_kb >> (inode->i_sb->s_blocksize_bits - 10); @@ -3784,8 +3740,8 @@ ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, } fallback: - path = ext4_split_extent(handle, inode, path, &split_map, split_flag, - flags | EXT4_GET_BLOCKS_CONVERT, NULL); + path = ext4_split_convert_extents(handle, inode, &split_map, path, + flags | EXT4_GET_BLOCKS_CONVERT, NULL); if (IS_ERR(path)) return path; out: @@ -3835,7 +3791,8 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle, ext4_lblk_t ee_block; struct ext4_extent *ex; unsigned int ee_len; - int split_flag = 0, depth; + int split_flag = 0, depth, err = 0; + bool did_zeroout = false; ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)map->m_lblk, map->m_len); @@ -3849,19 +3806,81 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - if (flags & (EXT4_GET_BLOCKS_UNWRIT_EXT | - EXT4_GET_BLOCKS_CONVERT)) { - /* - * It is safe to convert extent to initialized via explicit - * zeroout only if extent is fully inside i_size or new_size. - */ + /* No split needed */ + if (ee_block == map->m_lblk && ee_len == map->m_len) + goto convert; + + /* + * We don't use zeroout fallback for written to unwritten conversion as + * it is not as critical as endio and it might take unusually long. + * Also, it is only safe to convert extent to initialized via explicit + * zeroout only if extent is fully inside i_size or new_size. + */ + if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) split_flag |= ee_block + ee_len <= eof_block ? - EXT4_EXT_MAY_ZEROOUT : 0; - split_flag |= EXT4_EXT_MARK_UNWRIT2; + EXT4_EXT_MAY_ZEROOUT : + 0; + + /* + * pass SPLIT_NOMERGE explicitly so we don't end up merging extents we + * just split. 
+ */ + path = ext4_split_extent(handle, inode, path, map, split_flag, + flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE, + allocated, &did_zeroout); + if (IS_ERR(path)) + return path; + +convert: + path = ext4_find_extent(inode, map->m_lblk, path, flags); + if (IS_ERR(path)) + return path; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + + /* + * Conversion is already handled in case of zeroout + */ + if (!did_zeroout) { + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto err; + + if (flags & EXT4_GET_BLOCKS_CONVERT) + ext4_ext_mark_initialized(ex); + else if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) + ext4_ext_mark_unwritten(ex); + + if (!(flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) + /* + * note: ext4_ext_correct_indexes() isn't needed here because + * borders are not changed + */ + ext4_ext_try_to_merge(handle, inode, path, ex); + + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto err; } - flags |= EXT4_GET_BLOCKS_SPLIT_NOMERGE; - return ext4_split_extent(handle, inode, path, map, split_flag, flags, - allocated); + + /* Lets update the extent status tree after conversion */ + if (!(flags & EXT4_EX_NOCACHE)) + ext4_es_insert_extent(inode, le32_to_cpu(ex->ee_block), + ext4_ext_get_actual_len(ex), + ext4_ext_pblock(ex), + ext4_ext_is_unwritten(ex) ? 
+ EXTENT_STATUS_UNWRITTEN : + EXTENT_STATUS_WRITTEN, + false); + +err: + if (err) { + ext4_free_ext_path(path); + return ERR_PTR(err); + } + + return path; } static struct ext4_ext_path * @@ -3873,7 +3892,6 @@ ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, ext4_lblk_t ee_block; unsigned int ee_len; int depth; - int err = 0; depth = ext_depth(inode); ex = path[depth].p_ext; @@ -3883,41 +3901,8 @@ ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)ee_block, ee_len); - if (ee_block != map->m_lblk || ee_len > map->m_len) { - path = ext4_split_convert_extents(handle, inode, map, path, - flags, NULL); - if (IS_ERR(path)) - return path; - - path = ext4_find_extent(inode, map->m_lblk, path, flags); - if (IS_ERR(path)) - return path; - depth = ext_depth(inode); - ex = path[depth].p_ext; - } - - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto errout; - /* first mark the extent as initialized */ - ext4_ext_mark_initialized(ex); - - /* note: ext4_ext_correct_indexes() isn't needed here because - * borders are not changed - */ - ext4_ext_try_to_merge(handle, inode, path, ex); - - /* Mark modified extent as dirty */ - err = ext4_ext_dirty(handle, inode, path + path->p_depth); - if (err) - goto errout; - - ext4_ext_show_leaf(inode, path); - return path; - -errout: - ext4_free_ext_path(path); - return ERR_PTR(err); + return ext4_split_convert_extents(handle, inode, map, path, flags, + NULL); } static struct ext4_ext_path * @@ -3931,7 +3916,6 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, ext4_lblk_t ee_block; unsigned int ee_len; int depth; - int err = 0; /* * Make sure that the extent is no bigger than we support with @@ -3948,40 +3932,11 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)ee_block, ee_len); - if 
(ee_block != map->m_lblk || ee_len > map->m_len) { - path = ext4_split_convert_extents(handle, inode, map, path, - flags, NULL); - if (IS_ERR(path)) - return path; + path = ext4_split_convert_extents(handle, inode, map, path, flags, + NULL); + if (IS_ERR(path)) + return path; - path = ext4_find_extent(inode, map->m_lblk, path, flags); - if (IS_ERR(path)) - return path; - depth = ext_depth(inode); - ex = path[depth].p_ext; - if (!ex) { - EXT4_ERROR_INODE(inode, "unexpected hole at %lu", - (unsigned long) map->m_lblk); - err = -EFSCORRUPTED; - goto errout; - } - } - - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto errout; - /* first mark the extent as unwritten */ - ext4_ext_mark_unwritten(ex); - - /* note: ext4_ext_correct_indexes() isn't needed here because - * borders are not changed - */ - ext4_ext_try_to_merge(handle, inode, path, ex); - - /* Mark modified extent as dirty */ - err = ext4_ext_dirty(handle, inode, path + path->p_depth); - if (err) - goto errout; ext4_ext_show_leaf(inode, path); ext4_update_inode_fsync_trans(handle, inode, 1); @@ -3991,10 +3946,6 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, *allocated = map->m_len; map->m_len = *allocated; return path; - -errout: - ext4_free_ext_path(path); - return ERR_PTR(err); } static struct ext4_ext_path * @@ -5629,7 +5580,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) struct ext4_extent *extent; ext4_lblk_t start_lblk, len_lblk, ee_start_lblk = 0; unsigned int credits, ee_len; - int ret, depth, split_flag = 0; + int ret, depth; loff_t start; trace_ext4_insert_range(inode, offset, len); @@ -5700,12 +5651,8 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) */ if ((start_lblk > ee_start_lblk) && (start_lblk < (ee_start_lblk + ee_len))) { - if (ext4_ext_is_unwritten(extent)) - split_flag = EXT4_EXT_MARK_UNWRIT1 | - EXT4_EXT_MARK_UNWRIT2; path = ext4_split_extent_at(handle, inode, path, - start_lblk, 
split_flag, - EXT4_EX_NOCACHE | + start_lblk, EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_SPLIT_NOMERGE | EXT4_GET_BLOCKS_METADATA_NOFAIL); } From 4f5e8e6f012349a107531b02eed5b5ace6181449 Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Fri, 23 Jan 2026 11:55:39 +0530 Subject: [PATCH 45/45] ext4: allow zeroout when doing written to unwritten split Currently, when we are doing an extent split and convert operation of written to unwritten extent (example, as done by ZERO_RANGE), we don't allow the zeroout fallback in case the extent tree manipulation fails. This is mostly because zeroout might take unusually long and the fact that this code path is more tolerant to failures than endio. Since we have zeroout machinery in place, we might as well use it hence lift this restriction. To mitigate zeroout taking too long respect the max zeroout limit here so that the operation finishes relatively fast. Also, add kunit tests for this case. Reviewed-by: Jan Kara Reviewed-by: Zhang Yi Signed-off-by: Ojaswin Mujoo Link: https://patch.msgid.link/1c3349020b8e098a63f293b84bc8a9b56011cef4.1769149131.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents-test.c | 98 +++++++++++++++++++++++++++++++++++++++++- fs/ext4/extents.c | 33 ++++++++++---- 2 files changed, 122 insertions(+), 9 deletions(-) diff --git a/fs/ext4/extents-test.c b/fs/ext4/extents-test.c index 3176bf7686b5..4879e68e465d 100644 --- a/fs/ext4/extents-test.c +++ b/fs/ext4/extents-test.c @@ -613,11 +613,57 @@ static const struct kunit_ext_test_param test_split_convert_params[] = { .is_zeroout_test = 1, .nr_exp_data_segs = 3, + .exp_data_state = { { .exp_char = 0, .off_blk = 0, .len_blk = 1 }, + { .exp_char = 'X', .off_blk = 1, .len_blk = EXT_DATA_LEN - 2 }, + { .exp_char = 0, .off_blk = EXT_DATA_LEN - 1, .len_blk = 1 } } }, + + /* writ to unwrit splits */ + { .desc = "split writ extent to 2 extents and convert 1st half unwrit (zeroout)", + .type = TEST_SPLIT_CONVERT, + .is_unwrit_at_start 
= 0, + .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, + .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 2, .exp_data_state = { { .exp_char = 0, .off_blk = 0, .len_blk = 1 }, { .exp_char = 'X', .off_blk = 1, - .len_blk = EXT_DATA_LEN - 2 }, + .len_blk = EXT_DATA_LEN - 1 } } }, + { .desc = "split writ extent to 2 extents and convert 2nd half unwrit (zeroout)", + .type = TEST_SPLIT_CONVERT, + .is_unwrit_at_start = 0, + .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 2, + .exp_data_state = { { .exp_char = 'X', .off_blk = 0, .len_blk = 1 }, { .exp_char = 0, + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 1 } } }, + { .desc = "split writ extent to 3 extents and convert 2nd half unwrit (zeroout)", + .type = TEST_SPLIT_CONVERT, + .is_unwrit_at_start = 0, + .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 3, + .exp_data_state = { { .exp_char = 'X', .off_blk = 0, .len_blk = 1 }, + { .exp_char = 0, + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 2 }, + { .exp_char = 'X', .off_blk = EXT_DATA_LEN - 1, .len_blk = 1 } } }, }; @@ -667,6 +713,56 @@ static const struct kunit_ext_test_param test_convert_initialized_params[] = { .ex_len = 1, .is_unwrit = 0 } }, .is_zeroout_test = 0 }, + + /* writ to unwrit splits (zeroout) */ + { .desc = "split writ extent to 2 extents and convert 1st half unwrit (zeroout)", + .type = TEST_CREATE_BLOCKS, + 
.is_unwrit_at_start = 0, + .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, + .split_map = { .m_lblk = EXT_DATA_LBLK, .m_len = 1 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 2, + .exp_data_state = { { .exp_char = 0, .off_blk = 0, .len_blk = 1 }, + { .exp_char = 'X', + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 1 } } }, + { .desc = "split writ extent to 2 extents and convert 2nd half unwrit (zeroout)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 0, + .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 1 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 2, + .exp_data_state = { { .exp_char = 'X', .off_blk = 0, .len_blk = 1 }, + { .exp_char = 0, + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 1 } } }, + { .desc = "split writ extent to 3 extents and convert 2nd half unwrit (zeroout)", + .type = TEST_CREATE_BLOCKS, + .is_unwrit_at_start = 0, + .split_flags = EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, + .split_map = { .m_lblk = EXT_DATA_LBLK + 1, .m_len = EXT_DATA_LEN - 2 }, + .nr_exp_ext = 1, + .exp_ext_state = { { .ex_lblk = EXT_DATA_LBLK, + .ex_len = EXT_DATA_LEN, + .is_unwrit = 0 } }, + .is_zeroout_test = 1, + .nr_exp_data_segs = 3, + .exp_data_state = { { .exp_char = 'X', .off_blk = 0, .len_blk = 1 }, + { .exp_char = 0, + .off_blk = 1, + .len_blk = EXT_DATA_LEN - 2 }, + { .exp_char = 'X', + .off_blk = EXT_DATA_LEN - 1, + .len_blk = 1 } } }, }; /* Tests to trigger ext4_ext_map_blocks() -> ext4_ext_handle_unwritten_exntents() */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 14f38b3cda27..3630b27e4fd7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3462,6 +3462,15 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle, if (!(split_flag & 
EXT4_EXT_MAY_ZEROOUT)) goto out_orig_err; + if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { + int max_zeroout_blks = + EXT4_SB(inode->i_sb)->s_extent_max_zeroout_kb >> + (inode->i_sb->s_blocksize_bits - 10); + + if (map->m_len > max_zeroout_blks) + goto out_orig_err; + } + path = ext4_find_extent(inode, map->m_lblk, NULL, flags); if (IS_ERR(path)) goto out_orig_err; @@ -3811,15 +3820,10 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle, goto convert; /* - * We don't use zeroout fallback for written to unwritten conversion as - * it is not as critical as endio and it might take unusually long. - * Also, it is only safe to convert extent to initialized via explicit + * It is only safe to convert extent to initialized via explicit * zeroout only if extent is fully inside i_size or new_size. */ - if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) - split_flag |= ee_block + ee_len <= eof_block ? - EXT4_EXT_MAY_ZEROOUT : - 0; + split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; /* * pass SPLIT_NOMERGE explicitly so we don't end up merging extents we @@ -3941,7 +3945,20 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, ext4_update_inode_fsync_trans(handle, inode, 1); - map->m_flags |= EXT4_MAP_UNWRITTEN; + /* + * The extent might be initialized in case of zeroout. + */ + path = ext4_find_extent(inode, map->m_lblk, path, flags); + if (IS_ERR(path)) + return path; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + + if (ext4_ext_is_unwritten(ex)) + map->m_flags |= EXT4_MAP_UNWRITTEN; + else + map->m_flags |= EXT4_MAP_MAPPED; if (*allocated > map->m_len) *allocated = map->m_len; map->m_len = *allocated;