From c6ce65cb17aa9321687d1b8a842487f839e1a548 Mon Sep 17 00:00:00 2001 From: Wilfred Mallawa Date: Sun, 1 Mar 2026 10:34:35 +1000 Subject: [PATCH 1/9] xfs: add write pointer to xfs_rtgroup_geometry There is currently no XFS ioctl that allows userspace to retrieve the write pointer for a specific realtime group block for zoned XFS. On zoned block devices, userspace can obtain this information via zone reports from the underlying device. However, for zoned XFS operating on regular block devices, no equivalent mechanism exists. Access to the realtime group write pointer is useful to userspace development and analysis tools such as Zonar [1]. So extend the existing struct xfs_rtgroup_geometry to add a new rg_writepointer field. This field is valid if XFS_RTGROUP_GEOM_WRITEPOINTER flag is set. The rg_writepointer field specifies the location of the current writepointer as a block offset into the respective rtgroup. [1] https://lwn.net/Articles/1059364/ Signed-off-by: Wilfred Mallawa Reviewed-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_fs.h | 5 ++++- fs/xfs/xfs_ioctl.c | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index d165de607d17..185f09f327c0 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -995,7 +995,8 @@ struct xfs_rtgroup_geometry { __u32 rg_sick; /* o: sick things in ag */ __u32 rg_checked; /* o: checked metadata in ag */ __u32 rg_flags; /* i/o: flags for this ag */ - __u32 rg_reserved[27]; /* o: zero */ + __u32 rg_writepointer; /* o: write pointer block offset for zoned */ + __u32 rg_reserved[26]; /* o: zero */ }; #define XFS_RTGROUP_GEOM_SICK_SUPER (1U << 0) /* superblock */ #define XFS_RTGROUP_GEOM_SICK_BITMAP (1U << 1) /* rtbitmap */ @@ -1003,6 +1004,8 @@ struct xfs_rtgroup_geometry { #define XFS_RTGROUP_GEOM_SICK_RMAPBT (1U << 3) /* reverse mappings */ #define XFS_RTGROUP_GEOM_SICK_REFCNTBT (1U << 4) /* reference counts */ +#define XFS_RTGROUP_GEOM_WRITEPOINTER (1U << 0) /* write pointer */ + /* Health monitor event domains */ /* affects the whole fs */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index facffdc8dca8..46e234863644 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -37,12 +37,15 @@ #include "xfs_ioctl.h" #include "xfs_xattr.h" #include "xfs_rtbitmap.h" +#include "xfs_rtrmap_btree.h" #include "xfs_file.h" #include "xfs_exchrange.h" #include "xfs_handle.h" #include "xfs_rtgroup.h" #include "xfs_healthmon.h" #include "xfs_verify_media.h" +#include "xfs_zone_priv.h" +#include "xfs_zone_alloc.h" #include #include @@ -413,6 +416,7 @@ xfs_ioc_rtgroup_geometry( { struct xfs_rtgroup *rtg; struct xfs_rtgroup_geometry rgeo; + xfs_rgblock_t highest_rgbno; int error; if (copy_from_user(&rgeo, arg, sizeof(rgeo))) @@ -433,6 +437,21 @@ xfs_ioc_rtgroup_geometry( if (error) return error; + if (xfs_has_zoned(mp)) { + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + if 
(rtg->rtg_open_zone) { + rgeo.rg_writepointer = rtg->rtg_open_zone->oz_allocated; + } else { + highest_rgbno = xfs_rtrmap_highest_rgbno(rtg); + if (highest_rgbno == NULLRGBLOCK) + rgeo.rg_writepointer = 0; + else + rgeo.rg_writepointer = highest_rgbno + 1; + } + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); + rgeo.rg_flags |= XFS_RTGROUP_GEOM_WRITEPOINTER; + } + if (copy_to_user(arg, &rgeo, sizeof(rgeo))) return -EFAULT; return 0; From db8367f63b301bbdff6eb00c2e09fad4f2ae75e9 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Tue, 10 Mar 2026 18:36:46 +0100 Subject: [PATCH 2/9] xfs: factor out isize updates from xfs_dio_write_end_io This is the only code needed for zoned inodes, so factor it out so we can move zoned inodes ioend to its own callback. Signed-off-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_file.c | 60 +++++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 23 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 6246f34df9fd..fce6be55d90c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -560,6 +560,42 @@ xfs_zoned_write_space_reserve( flags, ac); } +/* + * We need to lock the test/set EOF update as we can be racing with + * other IO completions here to update the EOF. Failing to serialise + * here can result in EOF moving backwards and Bad Things Happen when + * that occurs. + * + * As IO completion only ever extends EOF, we can do an unlocked check + * here to avoid taking the spinlock. If we land within the current EOF, + * then we do not need to do an extending update at all, and we don't + * need to take the lock to check this. If we race with an update moving + * EOF, then we'll either still be beyond EOF and need to take the lock, + * or we'll be within EOF and we don't need to take it at all. 
+ */ +static int +xfs_dio_endio_set_isize( + struct inode *inode, + loff_t offset, + ssize_t size) +{ + struct xfs_inode *ip = XFS_I(inode); + + if (offset + size <= i_size_read(inode)) + return 0; + + spin_lock(&ip->i_flags_lock); + if (offset + size <= i_size_read(inode)) { + spin_unlock(&ip->i_flags_lock); + return 0; + } + + i_size_write(inode, offset + size); + spin_unlock(&ip->i_flags_lock); + + return xfs_setfilesize(ip, offset, size); +} + static int xfs_dio_write_end_io( struct kiocb *iocb, @@ -623,30 +659,8 @@ xfs_dio_write_end_io( * with the on-disk inode size being outside the in-core inode size. We * have no other method of updating EOF for AIO, so always do it here * if necessary. - * - * We need to lock the test/set EOF update as we can be racing with - * other IO completions here to update the EOF. Failing to serialise - * here can result in EOF moving backwards and Bad Things Happen when - * that occurs. - * - * As IO completion only ever extends EOF, we can do an unlocked check - * here to avoid taking the spinlock. If we land within the current EOF, - * then we do not need to do an extending update at all, and we don't - * need to take the lock to check this. If we race with an update moving - * EOF, then we'll either still be beyond EOF and need to take the lock, - * or we'll be within EOF and we don't need to take it at all. 
*/ - if (offset + size <= i_size_read(inode)) - goto out; - - spin_lock(&ip->i_flags_lock); - if (offset + size > i_size_read(inode)) { - i_size_write(inode, offset + size); - spin_unlock(&ip->i_flags_lock); - error = xfs_setfilesize(ip, offset, size); - } else { - spin_unlock(&ip->i_flags_lock); - } + error = xfs_dio_endio_set_isize(inode, offset, size); out: memalloc_nofs_restore(nofs_flag); From 02a5d8993b09fe9a6754e57d0e25399baffe9a06 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Tue, 10 Mar 2026 18:36:47 +0100 Subject: [PATCH 3/9] xfs: factor out xfs_dio_write_zoned_end_io Stop sharing direct IO end_io between regular and zoned devices by factoring out zoned dio end_io to its own function. Signed-off-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_file.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index fce6be55d90c..7918968e1d62 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -596,6 +596,36 @@ xfs_dio_endio_set_isize( return xfs_setfilesize(ip, offset, size); } +static int +xfs_zoned_dio_write_end_io( + struct kiocb *iocb, + ssize_t size, + int error, + unsigned flags) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_inode *ip = XFS_I(inode); + unsigned int nofs_flag; + + ASSERT(!(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW))); + + trace_xfs_end_io_direct_write(ip, iocb->ki_pos, size); + + if (xfs_is_shutdown(ip->i_mount)) + return -EIO; + + if (error || !size) + return error; + + XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size); + + nofs_flag = memalloc_nofs_save(); + error = xfs_dio_endio_set_isize(inode, iocb->ki_pos, size); + memalloc_nofs_restore(nofs_flag); + + return error; +} + static int xfs_dio_write_end_io( struct kiocb *iocb, @@ -608,8 +638,7 @@ xfs_dio_write_end_io( loff_t offset = iocb->ki_pos; unsigned int nofs_flag; - 
ASSERT(!xfs_is_zoned_inode(ip) || - !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW))); + ASSERT(!xfs_is_zoned_inode(ip)); trace_xfs_end_io_direct_write(ip, offset, size); @@ -702,7 +731,7 @@ xfs_dio_zoned_submit_io( static const struct iomap_dio_ops xfs_dio_zoned_write_ops = { .bio_set = &iomap_ioend_bioset, .submit_io = xfs_dio_zoned_submit_io, - .end_io = xfs_dio_write_end_io, + .end_io = xfs_zoned_dio_write_end_io, }; /* From 3bdc20b005c20ce1bf9b098d1ee2caa1d994141e Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Tue, 10 Mar 2026 18:36:48 +0100 Subject: [PATCH 4/9] xfs: factor out xfs_zone_inc_written Move the written blocks increment and full zone check into a new helper. Also add an assert to ensure rmap lock is held here. Signed-off-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_zone_alloc.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index e3d19b6dc64a..97149bfc2512 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -189,6 +189,18 @@ xfs_open_zone_mark_full( xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used); } +static inline void +xfs_zone_inc_written( + struct xfs_open_zone *oz, + xfs_filblks_t len) +{ + xfs_assert_ilocked(rtg_rmap(oz->oz_rtg), XFS_ILOCK_EXCL); + + oz->oz_written += len; + if (oz->oz_written == rtg_blocks(oz->oz_rtg)) + xfs_open_zone_mark_full(oz); +} + static void xfs_zone_record_blocks( struct xfs_trans *tp, @@ -206,9 +218,7 @@ xfs_zone_record_blocks( xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); rmapip->i_used_blocks += len; ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg)); - oz->oz_written += len; - if (oz->oz_written == rtg_blocks(rtg)) - xfs_open_zone_mark_full(oz); + xfs_zone_inc_written(oz, len); xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); } @@ -227,9 +237,7 @@ xfs_zone_skip_blocks( trace_xfs_zone_skip_blocks(oz, 0, len); 
xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); - oz->oz_written += len; - if (oz->oz_written == rtg_blocks(rtg)) - xfs_open_zone_mark_full(oz); + xfs_zone_inc_written(oz, len); xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); xfs_add_frextents(rtg_mount(rtg), len); From 01478f356ff794c7676803c7af04eaeaebfbb455 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Tue, 10 Mar 2026 18:36:49 +0100 Subject: [PATCH 5/9] xfs: opencode xfs_zone_record_blocks We only have a single caller, no need to keep it in its own function. Signed-off-by: Carlos Maiolino [hch: add zone_record_blocks trace back] Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_zone_alloc.c | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index 97149bfc2512..9d02160c5334 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -201,27 +201,6 @@ xfs_zone_inc_written( xfs_open_zone_mark_full(oz); } -static void -xfs_zone_record_blocks( - struct xfs_trans *tp, - struct xfs_open_zone *oz, - xfs_fsblock_t fsbno, - xfs_filblks_t len) -{ - struct xfs_mount *mp = tp->t_mountp; - struct xfs_rtgroup *rtg = oz->oz_rtg; - struct xfs_inode *rmapip = rtg_rmap(rtg); - - trace_xfs_zone_record_blocks(oz, xfs_rtb_to_rgbno(mp, fsbno), len); - - xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); - xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); - rmapip->i_used_blocks += len; - ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg)); - xfs_zone_inc_written(oz, len); - xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); -} - /* * Called for blocks that have been written to disk, but not actually linked to * an inode, which can happen when garbage collection races with user data @@ -252,6 +231,8 @@ xfs_zoned_map_extent( xfs_fsblock_t old_startblock) { struct xfs_bmbt_irec data; + struct xfs_rtgroup *rtg = oz->oz_rtg; + struct xfs_inode *rmapip = rtg_rmap(rtg); int nmaps = 1; int error; 
@@ -310,7 +291,15 @@ xfs_zoned_map_extent( } } - xfs_zone_record_blocks(tp, oz, new->br_startblock, new->br_blockcount); + trace_xfs_zone_record_blocks(oz, + xfs_rtb_to_rgbno(tp->t_mountp, new->br_startblock), + new->br_blockcount); + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); + rmapip->i_used_blocks += new->br_blockcount; + ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg)); + xfs_zone_inc_written(oz, new->br_blockcount); + xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); /* Map the new blocks into the data fork. */ xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new); From 770323d418ed5848cc21af172f77377b2cc0542d Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 16 Mar 2026 20:40:17 +0900 Subject: [PATCH 6/9] xfs: avoid unnecessary open zone check in xfs_select_zone_nowait() When xfs_select_zone_nowait() is called with pack_tight equal to true, the function xfs_select_open_zone_mru() is called if no open zone is returned by xfs_select_open_zone_lru(), that is, when oz is NULL. The open zone pointer return of xfs_select_zone_nowait() is then checked, but this check is outside of the "if (pack_tight)" that triggered the call to xfs_select_open_zone_mru(). In other words, this check is unnecessarily done even when pack_tight is false. Move the check for the return value of the call to xfs_select_open_zone_mru() inside the if that controls the call to this function, so that we do not uselessly test again the value of oz when pack_tight is false. No functional changes. 
Signed-off-by: Damien Le Moal Reviewed-by: Hans Holmberg Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_zone_alloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index 9d02160c5334..612fcafd3a0c 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -678,10 +678,11 @@ xfs_select_zone_nowait( if (oz) goto out_unlock; - if (pack_tight) + if (pack_tight) { oz = xfs_select_open_zone_mru(zi, write_hint); - if (oz) - goto out_unlock; + if (oz) + goto out_unlock; + } /* * See if we can open a new zone and use that so that data for different From 6a82a691b08070ad03b237d7db89aa0bfef389e2 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 16 Mar 2026 20:40:18 +0900 Subject: [PATCH 7/9] xfs: fix a comment typo in xfs_select_zone_nowait() Fix a typo in the comment describing the second call to xfs_select_open_zone_lru() in xfs_select_zone_nowait(). Signed-off-by: Damien Le Moal Reviewed-by: Hans Holmberg Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_zone_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index 612fcafd3a0c..06e2cb79030e 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -693,7 +693,7 @@ xfs_select_zone_nowait( goto out_unlock; /* - * Try to find an zone that is an ok match to colocate data with. + * Try to find a zone that is an ok match to colocate data with. 
*/ oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_OK); if (oz) From 68aa101bf2046aa8365333a3768cece07975ca5f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 16 Mar 2026 20:40:19 +0900 Subject: [PATCH 8/9] xfs: display more zone related information in mountstats Modify xfs_zoned_show_stats() to add to the information displayed with /proc/self/mountstats the total number of zones (RT groups) and the number of open zones together with the maximum number of open zones. Signed-off-by: Damien Le Moal Reviewed-by: Hans Holmberg Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_zone_info.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_zone_info.c b/fs/xfs/xfs_zone_info.c index 53eabbc3334c..a2af44011654 100644 --- a/fs/xfs/xfs_zone_info.c +++ b/fs/xfs/xfs_zone_info.c @@ -90,9 +90,14 @@ xfs_zoned_show_stats( seq_printf(m, "\tRT GC required: %d\n", xfs_zoned_need_gc(mp)); + seq_printf(m, "\ttotal number of zones: %u\n", + mp->m_sb.sb_rgcount); seq_printf(m, "\tfree zones: %d\n", atomic_read(&zi->zi_nr_free_zones)); - seq_puts(m, "\topen zones:\n"); + spin_lock(&zi->zi_open_zones_lock); + seq_printf(m, "\tnumber of open zones: %u / %u\n", + zi->zi_nr_open_zones, mp->m_max_open_zones); + seq_puts(m, "\topen zones:\n"); list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) xfs_show_open_zone(m, oz); if (zi->zi_open_gc_zone) { From c1f955437440f92632e2efca4b591371bb3caefc Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 16 Mar 2026 20:40:20 +0900 Subject: [PATCH 9/9] xfs: avoid unnecessary calculations in xfs_zoned_need_gc() If zonegc_low_space is set to zero (which is the default), the second condition in xfs_zoned_need_gc() that triggers GC never evaluates to true because the calculated threshold will always be 0. So there is no need to calculate the threshold and to evaluate that condition. Return early when zonegc_low_space is zero. 
While at it, add comments to document the intent of each of the 3 tests used to determine the return value to control the execution of garbage collection. Signed-off-by: Damien Le Moal Reviewed-by: Hans Holmberg Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_zone_gc.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index 7efeecd2d85f..aaa0a3119d91 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -171,25 +171,37 @@ xfs_zoned_need_gc( s64 available, free, threshold; s32 remainder; + /* If we have no reclaimable blocks, running GC is useless. */ if (!xfs_zoned_have_reclaimable(mp->m_zone_info)) return false; + /* + * In order to avoid file fragmentation as much as possible, we should + * make sure that we can open enough zones. So trigger GC if the number + * of blocks immediately available for writes is lower than the total + * number of blocks from all possible open zones. + */ available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE); - if (available < xfs_rtgs_to_rfsbs(mp, mp->m_max_open_zones - XFS_OPEN_GC_ZONES)) return true; - free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS); + /* + * For cases where the user wants to be more aggressive with GC, + * the sysfs attribute zonegc_low_space may be set to a non zero value, + * to indicate that GC should try to maintain at least zonegc_low_space + * percent of the free space to be directly available for writing. Check + * this here. + */ + if (!mp->m_zonegc_low_space) + return false; + free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS); threshold = div_s64_rem(free, 100, &remainder); threshold = threshold * mp->m_zonegc_low_space + remainder * div_s64(mp->m_zonegc_low_space, 100); - if (available < threshold) - return true; - - return false; + return available < threshold; } static struct xfs_zone_gc_data *