xfs: fixes for v6.18-rc3

Signed-off-by: Carlos Maiolino <cem@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iJUEABMJAB0WIQSmtYVZ/MfVMGUq1GNcsMJ8RxYuYwUCaPyVuAAKCRBcsMJ8RxYu
 Y7ctAYDUsUnEuqhD2kvGhED6GXXUv1Ibl6vliCqBUbTkaKpXAq1QJrJBYrpwiAh+
 AVQyUoQBgNDi6zY1Ej85luZQEQCpTz6e9Z81Kwq5tBiCQz512SdmtluuzLJhkIt1
 BCbAS69DMg==
 =z2aZ
 -----END PGP SIGNATURE-----

Merge tag 'xfs-fixes-6.18-rc3' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull xfs fixes from Carlos Maiolino:
 "The main highlight here is a fix for a bug brought in by the removal
  of the attr2 mount option, where some installations might actually
  have 'attr2' explicitly configured in fstab, preventing the system
  from booting because the rootfs could not be remounted as RW.

  Besides that, there are a couple of fixes to the zonefs implementation,
  a change making XFS_ONLINE_SCRUB_STATS depend on DEBUG_FS (it was a
  select before), and some other minor changes"

* tag 'xfs-fixes-6.18-rc3' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
  xfs: fix locking in xchk_nlinks_collect_dir
  xfs: loudly complain about defunct mount options
  xfs: always warn about deprecated mount options
  xfs: don't set bt_nr_sectors to a negative number
  xfs: don't use __GFP_NOFAIL in xfs_init_fs_context
  xfs: cache open zone in inode->i_private
  xfs: avoid busy loops in GCD
  xfs: XFS_ONLINE_SCRUB_STATS should depend on DEBUG_FS
  xfs: do not tightly pack-write large files
  xfs: Improve CONFIG_XFS_RT Kconfig help
This commit is contained in:
Linus Torvalds 2025-10-25 09:31:13 -07:00
commit 27c0b5c4f6
9 changed files with 193 additions and 140 deletions

View File

@ -119,6 +119,15 @@ config XFS_RT
See the xfs man page in section 5 for additional information.
This option is mandatory to support zoned block devices. For these
devices, the realtime subvolume must be backed by a zoned block
device and a regular block device used as the main device (for
metadata). If the zoned block device is a host-managed SMR hard-disk
containing conventional zones at the beginning of its address space,
XFS will use the disk conventional zones as the main device and the
remaining sequential write required zones as the backing storage for
the realtime subvolume.
If unsure, say N.
config XFS_DRAIN_INTENTS
@ -156,7 +165,7 @@ config XFS_ONLINE_SCRUB_STATS
bool "XFS online metadata check usage data collection"
default y
depends on XFS_ONLINE_SCRUB
select DEBUG_FS
depends on DEBUG_FS
help
If you say Y here, the kernel will gather usage data about
the online metadata check subsystem. This includes the number

View File

@ -376,6 +376,36 @@ xchk_nlinks_collect_pptr(
return error;
}
/*
 * Lock a directory for the nlinks scan and return the lock flags that were
 * taken, so that the caller can hand the same value to xfs_iunlock().
 */
static uint
xchk_nlinks_ilock_dir(
	struct xfs_inode	*ip)
{
	uint			ilock_flags;

	/*
	 * The scan walks the directory entries, and on parent pointer
	 * filesystems the parent pointers in the attr fork as well.  If the
	 * mappings of either fork still have to be read into memory, the
	 * ILOCK must be held exclusively; otherwise shared is enough.
	 */
	if (xfs_need_iread_extents(&ip->i_df) ||
	    (xfs_has_parent(ip->i_mount) && xfs_inode_has_attr_fork(ip) &&
	     xfs_need_iread_extents(&ip->i_af)))
		ilock_flags = XFS_ILOCK_EXCL;
	else
		ilock_flags = XFS_ILOCK_SHARED;

	/*
	 * The IOLOCK is taken as well so that other threads cannot start a
	 * directory update while the scan is running.
	 */
	ilock_flags |= XFS_IOLOCK_SHARED;

	xfs_ilock(ip, ilock_flags);
	return ilock_flags;
}
/* Walk a directory to bump the observed link counts of the children. */
STATIC int
xchk_nlinks_collect_dir(
@ -394,8 +424,7 @@ xchk_nlinks_collect_dir(
return 0;
/* Prevent anyone from changing this directory while we walk it. */
xfs_ilock(dp, XFS_IOLOCK_SHARED);
lock_mode = xfs_ilock_data_map_shared(dp);
lock_mode = xchk_nlinks_ilock_dir(dp);
/*
* The dotdot entry of an unlinked directory still points to the last
@ -452,7 +481,6 @@ xchk_nlinks_collect_dir(
xchk_iscan_abort(&xnc->collect_iscan);
out_unlock:
xfs_iunlock(dp, lock_mode);
xfs_iunlock(dp, XFS_IOLOCK_SHARED);
return error;
}

View File

@ -1751,7 +1751,7 @@ xfs_init_buftarg(
const char *descr)
{
/* The maximum size of the buftarg is only known once the sb is read. */
btp->bt_nr_sectors = (xfs_daddr_t)-1;
btp->bt_nr_sectors = XFS_BUF_DADDR_MAX;
/* Set up device logical sector size mask */
btp->bt_logical_sectorsize = logical_sectorsize;

View File

@ -22,6 +22,7 @@ extern struct kmem_cache *xfs_buf_cache;
*/
struct xfs_buf;
#define XFS_BUF_DADDR_MAX ((xfs_daddr_t) S64_MAX)
#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
#define XBF_READ (1u << 0) /* buffer intended for reading from device */

View File

@ -236,7 +236,6 @@ typedef struct xfs_mount {
bool m_update_sb; /* sb needs update in mount */
unsigned int m_max_open_zones;
unsigned int m_zonegc_low_space;
struct xfs_mru_cache *m_zone_cache; /* Inode to open zone cache */
/* max_atomic_write mount option value */
unsigned long long m_awu_max_bytes;

View File

@ -102,7 +102,7 @@ static const struct constant_table dax_param_enums[] = {
* Table driven mount option parser.
*/
enum {
Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev,
Op_deprecated, Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev,
Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid,
Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups,
Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32,
@ -114,7 +114,21 @@ enum {
Opt_lifetime, Opt_nolifetime, Opt_max_atomic_write,
};
#define fsparam_dead(NAME) \
__fsparam(NULL, (NAME), Op_deprecated, fs_param_deprecated, NULL)
static const struct fs_parameter_spec xfs_fs_parameters[] = {
/*
* These mount options were supposed to be deprecated in September 2025
* but the deprecation warning was buggy, so not all users were
* notified. The deprecation is now obnoxiously loud and postponed to
* September 2030.
*/
fsparam_dead("attr2"),
fsparam_dead("noattr2"),
fsparam_dead("ikeep"),
fsparam_dead("noikeep"),
fsparam_u32("logbufs", Opt_logbufs),
fsparam_string("logbsize", Opt_logbsize),
fsparam_string("logdev", Opt_logdev),
@ -786,6 +800,12 @@ xfs_fs_evict_inode(
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
if (IS_ENABLED(CONFIG_XFS_RT) &&
S_ISREG(inode->i_mode) && inode->i_private) {
xfs_open_zone_put(inode->i_private);
inode->i_private = NULL;
}
}
static void
@ -1373,16 +1393,25 @@ suffix_kstrtoull(
static inline void
xfs_fs_warn_deprecated(
struct fs_context *fc,
struct fs_parameter *param,
uint64_t flag,
bool value)
struct fs_parameter *param)
{
/* Don't print the warning if reconfiguring and current mount point
* already had the flag set
/*
* Always warn about someone passing in a deprecated mount option.
* Previously we wouldn't print the warning if we were reconfiguring
* and current mount point already had the flag set, but that was not
* the right thing to do.
*
* Many distributions mount the root filesystem with no options in the
* initramfs and rely on mount -a to remount the root fs with the
* options in fstab. However, the old behavior meant that there would
* never be a warning about deprecated mount options for the root fs in
* /etc/fstab. On a single-fs system, that means no warning at all.
*
* Compounding this problem are distribution scripts that copy
* /proc/mounts to fstab, which means that we can't remove mount
* options unless we're 100% sure they have only ever been advertised
* in /proc/mounts in response to explicitly provided mount options.
*/
if ((fc->purpose & FS_CONTEXT_FOR_RECONFIGURE) &&
!!(XFS_M(fc->root->d_sb)->m_features & flag) == value)
return;
xfs_warn(fc->s_fs_info, "%s mount option is deprecated.", param->key);
}
@ -1408,6 +1437,9 @@ xfs_fs_parse_param(
return opt;
switch (opt) {
case Op_deprecated:
xfs_fs_warn_deprecated(fc, param);
return 0;
case Opt_logbufs:
parsing_mp->m_logbufs = result.uint_32;
return 0;
@ -1528,7 +1560,6 @@ xfs_fs_parse_param(
xfs_mount_set_dax_mode(parsing_mp, result.uint_32);
return 0;
#endif
/* Following mount options will be removed in September 2025 */
case Opt_max_open_zones:
parsing_mp->m_max_open_zones = result.uint_32;
return 0;
@ -2221,7 +2252,7 @@ xfs_init_fs_context(
struct xfs_mount *mp;
int i;
mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL | __GFP_NOFAIL);
mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
if (!mp)
return -ENOMEM;

View File

@ -26,14 +26,22 @@
#include "xfs_trace.h"
#include "xfs_mru_cache.h"
static void
xfs_open_zone_free_rcu(
struct callback_head *cb)
{
struct xfs_open_zone *oz = container_of(cb, typeof(*oz), oz_rcu);
xfs_rtgroup_rele(oz->oz_rtg);
kfree(oz);
}
void
xfs_open_zone_put(
struct xfs_open_zone *oz)
{
if (atomic_dec_and_test(&oz->oz_ref)) {
xfs_rtgroup_rele(oz->oz_rtg);
kfree(oz);
}
if (atomic_dec_and_test(&oz->oz_ref))
call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu);
}
static inline uint32_t
@ -614,14 +622,25 @@ static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip)
}
/*
* Try to pack inodes that are written back after they were closed tight instead
* of trying to open new zones for them or spread them to the least recently
* used zone. This optimizes the data layout for workloads that untar or copy
* a lot of small files. Right now this does not separate multiple such
* Try to tightly pack small files that are written back after they were closed
* instead of trying to open new zones for them or spread them to the least
* recently used zone. This optimizes the data layout for workloads that untar
* or copy a lot of small files. Right now this does not separate multiple such
* streams.
*/
static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
size_t zone_capacity =
XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].blocks);
/*
* Do not pack write files that are already using a full zone to avoid
* fragmentation.
*/
if (i_size_read(VFS_I(ip)) >= zone_capacity)
return false;
return !inode_is_open_for_write(VFS_I(ip)) &&
!(ip->i_diflags & XFS_DIFLAG_APPEND);
}
@ -745,98 +764,55 @@ xfs_mark_rtg_boundary(
ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
}
/*
* Cache the last zone written to for an inode so that it is considered first
* for subsequent writes.
*/
struct xfs_zone_cache_item {
struct xfs_mru_cache_elem mru;
struct xfs_open_zone *oz;
};
static inline struct xfs_zone_cache_item *
xfs_zone_cache_item(struct xfs_mru_cache_elem *mru)
{
return container_of(mru, struct xfs_zone_cache_item, mru);
}
static void
xfs_zone_cache_free_func(
void *data,
struct xfs_mru_cache_elem *mru)
{
struct xfs_zone_cache_item *item = xfs_zone_cache_item(mru);
xfs_open_zone_put(item->oz);
kfree(item);
}
/*
* Check if we have a cached last open zone available for the inode and
* if yes return a reference to it.
*/
static struct xfs_open_zone *
xfs_cached_zone(
struct xfs_mount *mp,
struct xfs_inode *ip)
xfs_get_cached_zone(
struct xfs_inode *ip)
{
struct xfs_mru_cache_elem *mru;
struct xfs_open_zone *oz;
struct xfs_open_zone *oz;
mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
if (!mru)
return NULL;
oz = xfs_zone_cache_item(mru)->oz;
rcu_read_lock();
oz = VFS_I(ip)->i_private;
if (oz) {
/*
* GC only steals open zones at mount time, so no GC zones
* should end up in the cache.
*/
ASSERT(!oz->oz_is_gc);
ASSERT(atomic_read(&oz->oz_ref) > 0);
atomic_inc(&oz->oz_ref);
if (!atomic_inc_not_zero(&oz->oz_ref))
oz = NULL;
}
xfs_mru_cache_done(mp->m_zone_cache);
rcu_read_unlock();
return oz;
}
/*
* Update the last used zone cache for a given inode.
* Stash our zone in the inode so that it is reused for future allocations.
*
* The caller must have a reference on the open zone.
* The open_zone structure will be pinned until either the inode is freed or
* until the cached open zone is replaced with a different one because the
* current one was full when we tried to use it. This means we keep any
* open zone around forever as long as any inode that used it for the last
* write is cached, which slightly increases the memory use of cached inodes
* that were ever written to, but significantly simplifies the cached zone
* lookup. Because the open_zone is clearly marked as full when all data
* in the underlying RTG was written, the caching is always safe.
*/
static void
xfs_zone_cache_create_association(
struct xfs_inode *ip,
struct xfs_open_zone *oz)
xfs_set_cached_zone(
struct xfs_inode *ip,
struct xfs_open_zone *oz)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_zone_cache_item *item = NULL;
struct xfs_mru_cache_elem *mru;
struct xfs_open_zone *old_oz;
ASSERT(atomic_read(&oz->oz_ref) > 0);
atomic_inc(&oz->oz_ref);
mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
if (mru) {
/*
* If we have an association already, update it to point to the
* new zone.
*/
item = xfs_zone_cache_item(mru);
xfs_open_zone_put(item->oz);
item->oz = oz;
xfs_mru_cache_done(mp->m_zone_cache);
return;
}
item = kmalloc(sizeof(*item), GFP_KERNEL);
if (!item) {
xfs_open_zone_put(oz);
return;
}
item->oz = oz;
xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru);
old_oz = xchg(&VFS_I(ip)->i_private, oz);
if (old_oz)
xfs_open_zone_put(old_oz);
}
static void
@ -880,15 +856,14 @@ xfs_zone_alloc_and_submit(
* the inode is still associated with a zone and use that if so.
*/
if (!*oz)
*oz = xfs_cached_zone(mp, ip);
*oz = xfs_get_cached_zone(ip);
if (!*oz) {
select_zone:
*oz = xfs_select_zone(mp, write_hint, pack_tight);
if (!*oz)
goto out_error;
xfs_zone_cache_create_association(ip, *oz);
xfs_set_cached_zone(ip, *oz);
}
alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
@ -966,6 +941,12 @@ xfs_free_open_zones(
xfs_open_zone_put(oz);
}
spin_unlock(&zi->zi_open_zones_lock);
/*
* Wait for all open zones to be freed so that they drop the group
* references:
*/
rcu_barrier();
}
struct xfs_init_zones {
@ -1279,14 +1260,6 @@ xfs_mount_zones(
error = xfs_zone_gc_mount(mp);
if (error)
goto out_free_zone_info;
/*
* Set up a mru cache to track inode to open zone for data placement
* purposes. The magic values for group count and life time is the
* same as the defaults for file streams, which seems sane enough.
*/
xfs_mru_cache_create(&mp->m_zone_cache, mp,
5000, 10, xfs_zone_cache_free_func);
return 0;
out_free_zone_info:
@ -1300,5 +1273,4 @@ xfs_unmount_zones(
{
xfs_zone_gc_unmount(mp);
xfs_free_zone_info(mp->m_zone_info);
xfs_mru_cache_destroy(mp->m_zone_cache);
}

View File

@ -491,21 +491,6 @@ xfs_zone_gc_select_victim(
struct xfs_rtgroup *victim_rtg = NULL;
unsigned int bucket;
if (xfs_is_shutdown(mp))
return false;
if (iter->victim_rtg)
return true;
/*
* Don't start new work if we are asked to stop or park.
*/
if (kthread_should_stop() || kthread_should_park())
return false;
if (!xfs_zoned_need_gc(mp))
return false;
spin_lock(&zi->zi_used_buckets_lock);
for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
@ -975,6 +960,27 @@ xfs_zone_gc_reset_zones(
} while (next);
}
/*
 * Decide whether new GC chunks may be started.
 *
 * Returns false if the filesystem is shut down or no GC space is available.
 * If a victim is already selected in data->iter, returns true.  Otherwise a
 * new victim is only picked when the thread is not asked to stop or park and
 * GC is actually needed; the result of that selection is returned.
 */
static bool
xfs_zone_gc_should_start_new_work(
	struct xfs_zone_gc_data	*data)
{
	if (xfs_is_shutdown(data->mp) || !xfs_zone_gc_space_available(data))
		return false;

	/* A victim is already selected, so keep working on it. */
	if (data->iter.victim_rtg)
		return true;

	/* Do not pick a new victim when asked to stop or park. */
	if (kthread_should_stop() || kthread_should_park())
		return false;

	return xfs_zoned_need_gc(data->mp) && xfs_zone_gc_select_victim(data);
}
/*
* Handle the work to read and write data for GC and to reset the zones,
* including handling all completions.
@ -982,7 +988,7 @@ xfs_zone_gc_reset_zones(
* Note that the order of the chunks is preserved so that we don't undo the
* optimal order established by xfs_zone_gc_query().
*/
static bool
static void
xfs_zone_gc_handle_work(
struct xfs_zone_gc_data *data)
{
@ -996,30 +1002,22 @@ xfs_zone_gc_handle_work(
zi->zi_reset_list = NULL;
spin_unlock(&zi->zi_reset_list_lock);
if (!xfs_zone_gc_select_victim(data) ||
!xfs_zone_gc_space_available(data)) {
if (list_empty(&data->reading) &&
list_empty(&data->writing) &&
list_empty(&data->resetting) &&
!reset_list)
return false;
}
__set_current_state(TASK_RUNNING);
try_to_freeze();
if (reset_list)
if (reset_list) {
set_current_state(TASK_RUNNING);
xfs_zone_gc_reset_zones(data, reset_list);
}
list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
break;
set_current_state(TASK_RUNNING);
xfs_zone_gc_finish_reset(chunk);
}
list_for_each_entry_safe(chunk, next, &data->writing, entry) {
if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
break;
set_current_state(TASK_RUNNING);
xfs_zone_gc_finish_chunk(chunk);
}
@ -1027,15 +1025,18 @@ xfs_zone_gc_handle_work(
list_for_each_entry_safe(chunk, next, &data->reading, entry) {
if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
break;
set_current_state(TASK_RUNNING);
xfs_zone_gc_write_chunk(chunk);
}
blk_finish_plug(&plug);
blk_start_plug(&plug);
while (xfs_zone_gc_start_chunk(data))
;
blk_finish_plug(&plug);
return true;
if (xfs_zone_gc_should_start_new_work(data)) {
set_current_state(TASK_RUNNING);
blk_start_plug(&plug);
while (xfs_zone_gc_start_chunk(data))
;
blk_finish_plug(&plug);
}
}
/*
@ -1059,8 +1060,18 @@ xfs_zoned_gcd(
for (;;) {
set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
xfs_set_zonegc_running(mp);
if (xfs_zone_gc_handle_work(data))
xfs_zone_gc_handle_work(data);
/*
* Only sleep if nothing set the state to running. Else check for
* work again as someone might have queued up more work and woken
* us in the meantime.
*/
if (get_current_state() == TASK_RUNNING) {
try_to_freeze();
continue;
}
if (list_empty(&data->reading) &&
list_empty(&data->writing) &&

View File

@ -44,6 +44,8 @@ struct xfs_open_zone {
* the life time of an open zone.
*/
struct xfs_rtgroup *oz_rtg;
struct rcu_head oz_rcu;
};
/*