xfs: reload entire iunlink lists

This is the second part of correcting XFS to reload the incore unlinked
 inode list from the ondisk contents.  Whereas part one tackled failures
 from regular filesystem calls, this part takes on the problem of needing
 to reload the entire incore unlinked inode list on account of somebody
 loading an inode that's in the /middle/ of an unlinked list.  This
 happens during quotacheck, bulkstat, or even opening a file by handle.
 
 In this case we don't know the length of the list that we're reloading,
 so we don't want to create a new unbounded memory load while holding
 resources locked.  Instead, we'll target UNTRUSTED iget calls to reload
 the entire bucket.
 
 Note that this changes the definition of the incore unlinked inode list
 slightly -- i_prev_unlinked == 0 now means "not on the incore list".
 
 This has been lightly tested with fstests.  Enjoy!
 
 Signed-off-by: Darrick J. Wong <djwong@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQQ2qTKExjcn+O1o2YRKO3ySh0YRpgUCZQChOQAKCRBKO3ySh0YR
 ptU6AP48lONiOPzWvF1mXTnDosAtIDsfliMY+qVgVxrghqBFmwEAitHlOadpWonu
 yoQ3cnSqzfA4rKT5MQZCm2iIHH/LMgU=
 =Kp5V
 -----END PGP SIGNATURE-----

Merge tag 'fix-iunlink-list-6.6_2023-09-12' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux into xfs-6.6-fixesA

xfs: reload entire iunlink lists

This is the second part of correcting XFS to reload the incore unlinked
inode list from the ondisk contents.  Whereas part one tackled failures
from regular filesystem calls, this part takes on the problem of needing
to reload the entire incore unlinked inode list on account of somebody
loading an inode that's in the /middle/ of an unlinked list.  This
happens during quotacheck, bulkstat, or even opening a file by handle.

In this case we don't know the length of the list that we're reloading,
so we don't want to create a new unbounded memory load while holding
resources locked.  Instead, we'll target UNTRUSTED iget calls to reload
the entire bucket.

Note that this changes the definition of the incore unlinked inode list
slightly -- i_prev_unlinked == 0 now means "not on the incore list".

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>

* tag 'fix-iunlink-list-6.6_2023-09-12' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux:
  xfs: make inode unlinked bucket recovery work with quotacheck
  xfs: reload entire unlinked bucket lists
  xfs: use i_prev_unlinked to distinguish inodes that are not on the unlinked list
This commit is contained in:
Chandan Babu R 2023-09-13 10:30:12 +05:30
commit abf7c8194f
9 changed files with 195 additions and 9 deletions

View File

@ -333,7 +333,6 @@ xfs_attr_inactive(
int error = 0;
mp = dp->i_mount;
ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
xfs_ilock(dp, lock_mode);
if (!xfs_inode_has_attr_fork(dp))

View File

@ -146,6 +146,12 @@ xfs_nfs_get_inode(
return ERR_PTR(error);
}
error = xfs_inode_reload_unlinked(ip);
if (error) {
xfs_irele(ip);
return ERR_PTR(error);
}
if (VFS_I(ip)->i_generation != generation) {
xfs_irele(ip);
return ERR_PTR(-ESTALE);

View File

@ -113,7 +113,7 @@ xfs_inode_alloc(
INIT_LIST_HEAD(&ip->i_ioend_list);
spin_lock_init(&ip->i_ioend_lock);
ip->i_next_unlinked = NULLAGINO;
ip->i_prev_unlinked = NULLAGINO;
ip->i_prev_unlinked = 0;
return ip;
}

View File

@ -1742,9 +1742,13 @@ xfs_inactive(
ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0))
truncate = 1;
error = xfs_qm_dqattach(ip);
if (error)
goto out;
if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) {
xfs_qm_dqdetach(ip);
} else {
error = xfs_qm_dqattach(ip);
if (error)
goto out;
}
if (S_ISLNK(VFS_I(ip)->i_mode))
error = xfs_inactive_symlink(ip);
@ -1962,6 +1966,8 @@ xfs_iunlink_reload_next(
trace_xfs_iunlink_reload_next(next_ip);
rele:
ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE));
if (xfs_is_quotacheck_running(mp) && next_ip)
xfs_iflags_set(next_ip, XFS_IQUOTAUNCHECKED);
xfs_irele(next_ip);
return error;
}
@ -2014,6 +2020,7 @@ xfs_iunlink_insert_inode(
}
/* Point the head of the list to point to this inode. */
ip->i_prev_unlinked = NULLAGINO;
return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
}
@ -2116,7 +2123,7 @@ xfs_iunlink_remove_inode(
}
ip->i_next_unlinked = NULLAGINO;
ip->i_prev_unlinked = NULLAGINO;
ip->i_prev_unlinked = 0;
return error;
}
@ -3605,3 +3612,103 @@ xfs_iunlock2_io_mmap(
if (ip1 != ip2)
inode_unlock(VFS_I(ip1));
}
/*
* Reload the incore inode list for this inode. Caller should ensure that
* the link count cannot change, either by taking ILOCK_SHARED or otherwise
* preventing other threads from executing.
*/
int
xfs_inode_reload_unlinked_bucket(
struct xfs_trans *tp,
struct xfs_inode *ip)
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_buf *agibp;
struct xfs_agi *agi;
struct xfs_perag *pag;
xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
xfs_agino_t prev_agino, next_agino;
unsigned int bucket;
bool foundit = false;
int error;
/* Grab the first inode in the list */
pag = xfs_perag_get(mp, agno);
error = xfs_ialloc_read_agi(pag, tp, &agibp);
xfs_perag_put(pag);
if (error)
return error;
bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
agi = agibp->b_addr;
trace_xfs_inode_reload_unlinked_bucket(ip);
xfs_info_ratelimited(mp,
"Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating list recovery.",
agino, agno);
prev_agino = NULLAGINO;
next_agino = be32_to_cpu(agi->agi_unlinked[bucket]);
while (next_agino != NULLAGINO) {
struct xfs_inode *next_ip = NULL;
if (next_agino == agino) {
/* Found this inode, set its backlink. */
next_ip = ip;
next_ip->i_prev_unlinked = prev_agino;
foundit = true;
}
if (!next_ip) {
/* Inode already in memory. */
next_ip = xfs_iunlink_lookup(pag, next_agino);
}
if (!next_ip) {
/* Inode not in memory, reload. */
error = xfs_iunlink_reload_next(tp, agibp, prev_agino,
next_agino);
if (error)
break;
next_ip = xfs_iunlink_lookup(pag, next_agino);
}
if (!next_ip) {
/* No incore inode at all? We reloaded it... */
ASSERT(next_ip != NULL);
error = -EFSCORRUPTED;
break;
}
prev_agino = next_agino;
next_agino = next_ip->i_next_unlinked;
}
xfs_trans_brelse(tp, agibp);
/* Should have found this inode somewhere in the iunlinked bucket. */
if (!error && !foundit)
error = -EFSCORRUPTED;
return error;
}
/* Decide if this inode is missing its unlinked list and reload it. */
int
xfs_inode_reload_unlinked(
struct xfs_inode *ip)
{
struct xfs_trans *tp;
int error;
error = xfs_trans_alloc_empty(ip->i_mount, &tp);
if (error)
return error;
xfs_ilock(ip, XFS_ILOCK_SHARED);
if (xfs_inode_unlinked_incomplete(ip))
error = xfs_inode_reload_unlinked_bucket(tp, ip);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
xfs_trans_cancel(tp);
return error;
}

View File

@ -68,8 +68,21 @@ typedef struct xfs_inode {
uint64_t i_diflags2; /* XFS_DIFLAG2_... */
struct timespec64 i_crtime; /* time created */
/* unlinked list pointers */
/*
* Unlinked list pointers. These point to the next and previous inodes
* in the AGI unlinked bucket list, respectively. These fields can
* only be updated with the AGI locked.
*
* i_next_unlinked caches di_next_unlinked.
*/
xfs_agino_t i_next_unlinked;
/*
* If the inode is not on an unlinked list, this field is zero. If the
* inode is the first element in an unlinked list, this field is
* NULLAGINO. Otherwise, i_prev_unlinked points to the previous inode
* in the unlinked list.
*/
xfs_agino_t i_prev_unlinked;
/* VFS inode */
@ -81,6 +94,11 @@ typedef struct xfs_inode {
struct list_head i_ioend_list;
} xfs_inode_t;
static inline bool xfs_inode_on_unlinked_list(const struct xfs_inode *ip)
{
return ip->i_prev_unlinked != 0;
}
static inline bool xfs_inode_has_attr_fork(struct xfs_inode *ip)
{
return ip->i_forkoff > 0;
@ -326,6 +344,9 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
*/
#define XFS_INACTIVATING (1 << 13)
/* Quotacheck is running but inode has not been added to quota counts. */
#define XFS_IQUOTAUNCHECKED (1 << 14)
/* All inode state flags related to inode reclaim. */
#define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \
XFS_IRECLAIM | \
@ -340,7 +361,7 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
#define XFS_IRECLAIM_RESET_FLAGS \
(XFS_IRECLAIMABLE | XFS_IRECLAIM | \
XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | XFS_NEED_INACTIVE | \
XFS_INACTIVATING)
XFS_INACTIVATING | XFS_IQUOTAUNCHECKED)
/*
* Flags for inode locking.
@ -575,4 +596,13 @@ void xfs_end_io(struct work_struct *work);
int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
static inline bool
xfs_inode_unlinked_incomplete(
struct xfs_inode *ip)
{
return VFS_I(ip)->i_nlink == 0 && !xfs_inode_on_unlinked_list(ip);
}
int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip);
int xfs_inode_reload_unlinked(struct xfs_inode *ip);
#endif /* __XFS_INODE_H__ */

View File

@ -80,6 +80,15 @@ xfs_bulkstat_one_int(
if (error)
goto out;
if (xfs_inode_unlinked_incomplete(ip)) {
error = xfs_inode_reload_unlinked_bucket(tp, ip);
if (error) {
xfs_iunlock(ip, XFS_ILOCK_SHARED);
xfs_irele(ip);
return error;
}
}
ASSERT(ip != NULL);
ASSERT(ip->i_imap.im_blkno != 0);
inode = VFS_I(ip);

View File

@ -405,6 +405,8 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
#define XFS_OPSTATE_WARNED_SHRINK 8
/* Kernel has logged a warning about logged xattr updates being used. */
#define XFS_OPSTATE_WARNED_LARP 9
/* Mount time quotacheck is running */
#define XFS_OPSTATE_QUOTACHECK_RUNNING 10
#define __XFS_IS_OPSTATE(name, NAME) \
static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
@ -427,6 +429,11 @@ __XFS_IS_OPSTATE(inode32, INODE32)
__XFS_IS_OPSTATE(readonly, READONLY)
__XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED)
__XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED)
#ifdef CONFIG_XFS_QUOTA
__XFS_IS_OPSTATE(quotacheck_running, QUOTACHECK_RUNNING)
#else
# define xfs_is_quotacheck_running(mp) (false)
#endif
static inline bool
xfs_should_warn(struct xfs_mount *mp, long nr)
@ -444,7 +451,8 @@ xfs_should_warn(struct xfs_mount *mp, long nr)
{ (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \
{ (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \
{ (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \
{ (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }
{ (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }, \
{ (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" }
/*
* Max and min values for mount-option defined I/O

View File

@ -1160,6 +1160,10 @@ xfs_qm_dqusage_adjust(
if (error)
return error;
error = xfs_inode_reload_unlinked(ip);
if (error)
goto error0;
ASSERT(ip->i_delayed_blks == 0);
if (XFS_IS_REALTIME_INODE(ip)) {
@ -1173,6 +1177,7 @@ xfs_qm_dqusage_adjust(
}
nblks = (xfs_qcnt_t)ip->i_nblocks - rtblks;
xfs_iflags_clear(ip, XFS_IQUOTAUNCHECKED);
/*
* Add the (disk blocks and inode) resources occupied by this
@ -1319,8 +1324,10 @@ xfs_qm_quotacheck(
flags |= XFS_PQUOTA_CHKD;
}
xfs_set_quotacheck_running(mp);
error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true,
NULL);
xfs_clear_quotacheck_running(mp);
/*
* On error, the inode walk may have partially populated the dquot

View File

@ -3849,6 +3849,26 @@ TRACE_EVENT(xfs_iunlink_reload_next,
__entry->next_agino)
);
TRACE_EVENT(xfs_inode_reload_unlinked_bucket,
TP_PROTO(struct xfs_inode *ip),
TP_ARGS(ip),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
__field(xfs_agino_t, agino)
),
TP_fast_assign(
__entry->dev = ip->i_mount->m_super->s_dev;
__entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
__entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
),
TP_printk("dev %d:%d agno 0x%x agino 0x%x bucket %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agino,
__entry->agino % XFS_AGI_UNLINKED_BUCKETS)
);
DECLARE_EVENT_CLASS(xfs_ag_inode_class,
TP_PROTO(struct xfs_inode *ip),
TP_ARGS(ip),