From 55b6dd54c3bcb6edf7ad630a4510759f4b0cf1cd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 13 Jan 2026 13:37:39 -0500 Subject: [PATCH 01/83] nfsd/sunrpc: add svc_rqst->rq_private pointer and remove rq_lease_breaker rq_lease_breaker has always been a NFSv4 specific layering violation in svc_rqst. The reason it's there though is that we need a place that is thread-local, and accessible from the svc_rqst pointer. Add a new rq_private pointer to struct svc_rqst. This is intended for use by the threads that are handling the service. sunrpc code doesn't touch it. In nfsd, define a new struct nfsd_thread_local_info. nfsd declares one of these on the stack and puts a pointer to it in rq_private. Add a new ntli_lease_breaker field to the new struct and convert all of the places that access rq_lease_breaker to use the new field instead. Signed-off-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 3 ++- fs/nfsd/nfs4state.c | 9 ++++++--- fs/nfsd/nfsd.h | 4 ++++ fs/nfsd/nfssvc.c | 5 +++++ include/linux/sunrpc/svc.h | 5 ++++- 5 files changed, 21 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 6880c5c520e7..85e94c30285a 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -3043,6 +3043,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) struct svc_fh *current_fh = &cstate->current_fh; struct svc_fh *save_fh = &cstate->save_fh; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct nfsd_thread_local_info *ntli = rqstp->rq_private; __be32 status; resp->xdr = &rqstp->rq_res_stream; @@ -3081,7 +3082,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) } check_if_stalefh_allowed(args); - rqstp->rq_lease_breaker = (void **)&cstate->clp; + ntli->ntli_lease_breaker = &cstate->clp; trace_nfsd_compound(rqstp, args->tag, args->taglen, args->client_opcnt); while (!status && resp->opcnt < args->opcnt) { diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 
6b9c399b89df..d8b0bd8ac842 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5535,13 +5535,15 @@ nfsd_break_deleg_cb(struct file_lease *fl) static bool nfsd_breaker_owns_lease(struct file_lease *fl) { struct nfs4_delegation *dl = fl->c.flc_owner; + struct nfsd_thread_local_info *ntli; struct svc_rqst *rqst; struct nfs4_client *clp; rqst = nfsd_current_rqst(); if (!nfsd_v4client(rqst)) return false; - clp = *(rqst->rq_lease_breaker); + ntli = rqst->rq_private; + clp = *ntli->ntli_lease_breaker; return dl->dl_stid.sc_client == clp; } @@ -9348,13 +9350,14 @@ __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_delegation **pdp) { - __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct nfsd_thread_local_info *ntli = rqstp->rq_private; struct file_lock_context *ctx; struct nfs4_delegation *dp = NULL; struct file_lease *fl; struct nfs4_cb_fattr *ncf; struct inode *inode = d_inode(dentry); + __be32 status; ctx = locks_inode_context(inode); if (!ctx) @@ -9375,7 +9378,7 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, break; } if (dp == NULL || dp == NON_NFSD_LEASE || - dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) { + dp->dl_recall.cb_clp == *(ntli->ntli_lease_breaker)) { spin_unlock(&ctx->flc_lock); if (dp == NON_NFSD_LEASE) { status = nfserrno(nfsd_open_break_lease(inode, diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index a01d70953358..938906c6d10c 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -82,6 +82,10 @@ extern atomic_t nfsd_th_cnt; /* number of available threads */ extern const struct seq_operations nfs_exports_op; +struct nfsd_thread_local_info { + struct nfs4_client **ntli_lease_breaker; +}; + /* * Common void argument and result helpers */ diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 4a04208393b8..fd979e5392a1 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -887,6 +887,7 @@ nfsd(void *vrqstp) struct svc_xprt 
*perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list); struct net *net = perm_sock->xpt_net; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd_thread_local_info ntli = { }; bool have_mutex = false; /* At this point, the thread shares current->fs @@ -901,6 +902,10 @@ nfsd(void *vrqstp) set_freezable(); + /* use dynamic allocation if ntli should ever become large */ + static_assert(sizeof(struct nfsd_thread_local_info) < 256); + rqstp->rq_private = &ntli; + /* * The main request loop */ diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 4dc14c7a711b..ab8237ba9596 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -175,6 +175,9 @@ static inline unsigned long svc_serv_maxpages(const struct svc_serv *serv) /* * The context of a single thread, including the request currently being * processed. + * + * RPC programs are free to use rq_private to stash thread-local information. + * The sunrpc layer will not access it. */ struct svc_rqst { struct list_head rq_all; /* all threads list */ @@ -251,7 +254,7 @@ struct svc_rqst { unsigned long bc_to_initval; unsigned int bc_to_retries; unsigned int rq_status_counter; /* RPC processing counter */ - void **rq_lease_breaker; /* The v4 client breaking a lease */ + void *rq_private; /* For use by the service thread */ }; /* bits for rq_flags */ From 322ecd01bf8ad7e0da21e174679aff1759e68b2c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 13 Jan 2026 13:37:40 -0500 Subject: [PATCH 02/83] nfsd/sunrpc: move rq_cachetype into struct nfsd_thread_local_info The svc_rqst->rq_cachetype field is only accessed by nfsd. Move it into the nfsd_thread_local_info instead. 
Signed-off-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 3 ++- fs/nfsd/nfscache.c | 3 ++- fs/nfsd/nfsd.h | 1 + fs/nfsd/nfssvc.c | 5 +++-- include/linux/sunrpc/svc.h | 1 - 5 files changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 9d234913100b..690f7a3122ec 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2598,6 +2598,7 @@ nfsd4_opnum_in_range(struct nfsd4_compoundargs *argp, struct nfsd4_op *op) static bool nfsd4_decode_compound(struct nfsd4_compoundargs *argp) { + struct nfsd_thread_local_info *ntli = argp->rqstp->rq_private; struct nfsd4_op *op; bool cachethis = false; int auth_slack= argp->rqstp->rq_auth_slack; @@ -2690,7 +2691,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) if (argp->minorversion) cachethis = false; svc_reserve_auth(argp->rqstp, max_reply + readbytes); - argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE; + ntli->ntli_cachetype = cachethis ? 
RC_REPLBUFF : RC_NOCACHE; argp->splice_ok = nfsd_read_splice_ok(argp->rqstp); if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack) diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index ab13ee9c7fd8..154468ceccdc 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -467,10 +467,11 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, unsigned int len, struct nfsd_cacherep **cacherep) { struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct nfsd_thread_local_info *ntli = rqstp->rq_private; struct nfsd_cacherep *rp, *found; __wsum csum; struct nfsd_drc_bucket *b; - int type = rqstp->rq_cachetype; + int type = ntli->ntli_cachetype; LIST_HEAD(dispose); int rtn = RC_DOIT; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 938906c6d10c..a2e35a4fa105 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -84,6 +84,7 @@ extern const struct seq_operations nfs_exports_op; struct nfsd_thread_local_info { struct nfs4_client **ntli_lease_breaker; + int ntli_cachetype; }; /* diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index fd979e5392a1..4f1ab3222a4d 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -972,6 +972,7 @@ nfsd(void *vrqstp) */ int nfsd_dispatch(struct svc_rqst *rqstp) { + struct nfsd_thread_local_info *ntli = rqstp->rq_private; const struct svc_procedure *proc = rqstp->rq_procinfo; __be32 *statp = rqstp->rq_accept_statp; struct nfsd_cacherep *rp; @@ -982,7 +983,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) * Give the xdr decoder a chance to change this if it wants * (necessary in the NFSv4.0 compound case) */ - rqstp->rq_cachetype = proc->pc_cachetype; + ntli->ntli_cachetype = proc->pc_cachetype; /* * ->pc_decode advances the argument stream past the NFS @@ -1027,7 +1028,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) */ smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1); - nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, nfs_reply); + nfsd_cache_update(rqstp, rp, ntli->ntli_cachetype, 
nfs_reply); out_cached_reply: return 1; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index ab8237ba9596..62152e4f3bcc 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -218,7 +218,6 @@ struct svc_rqst { u32 rq_vers; /* program version */ u32 rq_proc; /* procedure number */ u32 rq_prot; /* IP protocol */ - int rq_cachetype; /* catering to nfsd */ unsigned long rq_flags; /* flags field */ ktime_t rq_qtime; /* enqueue time */ From 7b546bd89975cfbd60d4b86f2d1a3b6be5f9e558 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sat, 18 Oct 2025 11:11:23 +1100 Subject: [PATCH 03/83] sunrpc/cache: improve RCU safety in cache_list walking. 1/ consistently use hlist_add_head_rcu() when adding to the cachelist to reflect the fact that it can be concurrently walked using RCU. In fact hlist_add_head() has all the needed barriers so this is no safety issue, primarily a clarity issue. 2/ call cache_get() *before* adding the list with hlist_add_head_rcu(). It is generally safest to inc the refcount before publishing a reference. In this case it doesn't have any behavioural effect as code which does an RCU walk does not depend on precision of the refcount, and it will always be at least one. But it looks more correct to use this order. 3/ avoid possible races between NULL tests and hlist_entry_safe() calls. It is possible that a test will find that .next or .head is not NULL, but hlist_entry_safe() will find that it is NULL. This can lead to incorrect behaviour with the list-walk terminating early. It is safest to always call hlist_entry_safe() and test the result. Also simplify the *ppos calculation by simply assigning the hash shifted 32, rather than masking out low bits and incrementing high bits. 
Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- net/sunrpc/cache.c | 62 ++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index ef8b7e8b1e9c..86b3fd5a429d 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -134,11 +134,11 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail, return tmp; } + cache_get(new); hlist_add_head_rcu(&new->cache_list, head); detail->entries++; if (detail->nextcheck > new->expiry_time) detail->nextcheck = new->expiry_time + 1; - cache_get(new); spin_unlock(&detail->hash_lock); if (freeme) @@ -233,9 +233,9 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, spin_lock(&detail->hash_lock); cache_entry_update(detail, tmp, new); - hlist_add_head(&tmp->cache_list, &detail->hash_table[hash]); - detail->entries++; cache_get(tmp); + hlist_add_head_rcu(&tmp->cache_list, &detail->hash_table[hash]); + detail->entries++; cache_fresh_locked(tmp, new->expiry_time, detail); cache_fresh_locked(old, 0, detail); spin_unlock(&detail->hash_lock); @@ -1378,18 +1378,14 @@ static void *__cache_seq_start(struct seq_file *m, loff_t *pos) hlist_for_each_entry_rcu(ch, &cd->hash_table[hash], cache_list) if (!entry--) return ch; - n &= ~((1LL<<32) - 1); - do { - hash++; - n += 1LL<<32; - } while(hash < cd->hash_size && - hlist_empty(&cd->hash_table[hash])); - if (hash >= cd->hash_size) - return NULL; - *pos = n+1; - return hlist_entry_safe(rcu_dereference_raw( + ch = NULL; + while (!ch && ++hash < cd->hash_size) + ch = hlist_entry_safe(rcu_dereference( hlist_first_rcu(&cd->hash_table[hash])), struct cache_head, cache_list); + + *pos = ((long long)hash << 32) + 1; + return ch; } static void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos) @@ -1398,29 +1394,29 @@ static void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos) int hash = (*pos >> 32); struct cache_detail *cd = m->private; - if (p == 
SEQ_START_TOKEN) + if (p == SEQ_START_TOKEN) { hash = 0; - else if (ch->cache_list.next == NULL) { - hash++; - *pos += 1LL<<32; - } else { - ++*pos; - return hlist_entry_safe(rcu_dereference_raw( + ch = NULL; + } + while (hash < cd->hash_size) { + if (ch) + ch = hlist_entry_safe( + rcu_dereference( hlist_next_rcu(&ch->cache_list)), - struct cache_head, cache_list); - } - *pos &= ~((1LL<<32) - 1); - while (hash < cd->hash_size && - hlist_empty(&cd->hash_table[hash])) { - hash++; - *pos += 1LL<<32; - } - if (hash >= cd->hash_size) - return NULL; - ++*pos; - return hlist_entry_safe(rcu_dereference_raw( - hlist_first_rcu(&cd->hash_table[hash])), struct cache_head, cache_list); + else + ch = hlist_entry_safe( + rcu_dereference( + hlist_first_rcu(&cd->hash_table[hash])), + struct cache_head, cache_list); + if (ch) { + ++*pos; + return ch; + } + hash++; + *pos = (long long)hash << 32; + } + return NULL; } void *cache_seq_start_rcu(struct seq_file *m, loff_t *pos) From a0ed7975de5e47091ab16aaece75d1b64c5709e7 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Mon, 19 Jan 2026 10:41:26 -0500 Subject: [PATCH 04/83] NFSD: Track SCSI Persistent Registration Fencing per Client with xarray When a client holding pNFS SCSI layouts becomes unresponsive, the server revokes access by preempting the client's SCSI persistent reservation key. A layout recall is issued for each layout the client holds; if the client fails to respond, each recall triggers a fence operation. The first preempt for a given device succeeds and removes the client's key registration. Subsequent preempts for the same device fail because the key is no longer registered. Update the NFS server to handle SCSI persistent registration fencing on a per-client and per-device basis by utilizing an xarray associated with the nfs4_client structure. Each xarray entry is indexed by the dev_t of a block device registered by the client. 
The entry maintains a flag indicating whether this device has already been fenced for the corresponding client. When the server issues a persistent registration key to a client, it creates a new xarray entry at the dev_t index with the fenced flag initialized to 0. Before performing a fence via nfsd4_scsi_fence_client, the server checks the corresponding entry using the device's dev_t. If the fenced flag is already set, the fence operation is skipped; otherwise, the flag is set to 1 and fencing proceeds. The xarray is destroyed when the nfs4_client is released in __destroy_client. Signed-off-by: Dai Ngo Reviewed-by: Christoph Hellwig Signed-off-by: Chuck Lever --- fs/nfsd/blocklayout.c | 72 +++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/nfs4state.c | 6 ++++ fs/nfsd/state.h | 3 ++ 3 files changed, 81 insertions(+) diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index a7cfba29990e..8b987fca1e60 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -273,6 +273,51 @@ const struct nfsd4_layout_ops bl_layout_ops = { #endif /* CONFIG_NFSD_BLOCKLAYOUT */ #ifdef CONFIG_NFSD_SCSILAYOUT + +#define NFSD_MDS_PR_FENCED XA_MARK_0 + +/* + * Clear the fence flag if the device already has an entry. This occurs + * when a client re-registers after a previous fence, allowing new + * layouts for this device. + * + * Insert only on first registration. This bounds cl_dev_fences to the + * count of devices this client has accessed, preventing unbounded growth. 
+ */ +static inline int nfsd4_scsi_fence_insert(struct nfs4_client *clp, + dev_t device) +{ + struct xarray *xa = &clp->cl_dev_fences; + int ret; + + xa_lock(xa); + ret = __xa_insert(xa, device, XA_ZERO_ENTRY, GFP_KERNEL); + if (ret == -EBUSY) { + __xa_clear_mark(xa, device, NFSD_MDS_PR_FENCED); + ret = 0; + } + xa_unlock(xa); + return ret; +} + +static inline bool nfsd4_scsi_fence_set(struct nfs4_client *clp, dev_t device) +{ + struct xarray *xa = &clp->cl_dev_fences; + bool skip; + + xa_lock(xa); + skip = xa_get_mark(xa, device, NFSD_MDS_PR_FENCED); + if (!skip) + __xa_set_mark(xa, device, NFSD_MDS_PR_FENCED); + xa_unlock(xa); + return skip; +} + +static inline void nfsd4_scsi_fence_clear(struct nfs4_client *clp, dev_t device) +{ + xa_clear_mark(&clp->cl_dev_fences, device, NFSD_MDS_PR_FENCED); +} + #define NFSD_MDS_PR_KEY 0x0100000000000000ULL /* @@ -342,6 +387,10 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb, goto out_free_dev; } + ret = nfsd4_scsi_fence_insert(clp, sb->s_bdev->bd_dev); + if (ret < 0) + goto out_free_dev; + ret = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true); if (ret) { pr_err("pNFS: failed to register key for device %s.\n", @@ -401,9 +450,32 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; int status; + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) + return; + status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, nfsd4_scsi_pr_key(clp), PR_EXCLUSIVE_ACCESS_REG_ONLY, true); + /* + * Reset to allow retry only when the command could not have + * reached the device. Negative status means a local error + * (e.g., -ENOMEM) prevented the command from being sent. + * PR_STS_PATH_FAILED, PR_STS_PATH_FAST_FAILED, and + * PR_STS_RETRY_PATH_FAILURE indicate transport path failures + * before device delivery. 
+ * + * For all other errors, the command may have reached the device + * and the preempt may have succeeded. Avoid resetting, since + * retrying a successful preempt returns PR_STS_IOERR or + * PR_STS_RESERVATION_CONFLICT, which would cause an infinite + * retry loop. + */ + if (status < 0 || + status == PR_STS_PATH_FAILED || + status == PR_STS_PATH_FAST_FAILED || + status == PR_STS_RETRY_PATH_FAILURE) + nfsd4_scsi_fence_clear(clp, bdev->bd_dev); + trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d8b0bd8ac842..023fd665b899 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2381,6 +2381,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name, INIT_LIST_HEAD(&clp->cl_revoked); #ifdef CONFIG_NFSD_PNFS INIT_LIST_HEAD(&clp->cl_lo_states); +#endif +#ifdef CONFIG_NFSD_SCSILAYOUT + xa_init(&clp->cl_dev_fences); #endif INIT_LIST_HEAD(&clp->async_copies); spin_lock_init(&clp->async_lock); @@ -2543,6 +2546,9 @@ __destroy_client(struct nfs4_client *clp) svc_xprt_put(clp->cl_cb_conn.cb_xprt); atomic_add_unless(&nn->nfs4_client_count, -1, 0); nfsd4_dec_courtesy_client_count(nn, clp); +#ifdef CONFIG_NFSD_SCSILAYOUT + xa_destroy(&clp->cl_dev_fences); +#endif free_client(clp); wake_up_all(&expiry_wq); } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index c0ca115c3b74..99aeaab9cf2b 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -527,6 +527,9 @@ struct nfs4_client { struct nfsd4_cb_recall_any *cl_ra; time64_t cl_ra_time; +#ifdef CONFIG_NFSD_SCSILAYOUT + struct xarray cl_dev_fences; +#endif }; /* struct nfs4_client_reset From ed7f4d323b5c86cfe9ca6eb3a955416aaa335a9c Mon Sep 17 00:00:00 2001 From: Ryota Sakamoto Date: Sat, 24 Jan 2026 14:17:19 +0900 Subject: [PATCH 05/83] SUNRPC: Replace KUnit tests for memcmp() with KUNIT_EXPECT_MEMEQ_MSG() Replace KUnit tests for memcmp() with KUNIT_EXPECT_MEMEQ_MSG() to improve debugging that prints the hex dump of the buffers when the 
assertion fails, whereas memcmp() only returns an integer difference. Signed-off-by: Ryota Sakamoto Signed-off-by: Chuck Lever --- net/sunrpc/auth_gss/gss_krb5_test.c | 93 ++++++++++++++++------------- 1 file changed, 51 insertions(+), 42 deletions(-) diff --git a/net/sunrpc/auth_gss/gss_krb5_test.c b/net/sunrpc/auth_gss/gss_krb5_test.c index a5bff02cd7ba..dde1ee934d0d 100644 --- a/net/sunrpc/auth_gss/gss_krb5_test.c +++ b/net/sunrpc/auth_gss/gss_krb5_test.c @@ -63,10 +63,11 @@ static void kdf_case(struct kunit *test) KUNIT_ASSERT_EQ(test, err, 0); /* Assert */ - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->expected_result->data, - derivedkey.data, derivedkey.len), 0, - "key mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->expected_result->data, + derivedkey.data, + derivedkey.len, + "key mismatch"); } static void checksum_case(struct kunit *test) @@ -111,10 +112,11 @@ static void checksum_case(struct kunit *test) KUNIT_ASSERT_EQ(test, err, 0); /* Assert */ - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->expected_result->data, - checksum.data, checksum.len), 0, - "checksum mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->expected_result->data, + checksum.data, + checksum.len, + "checksum mismatch"); crypto_free_ahash(tfm); } @@ -314,10 +316,11 @@ static void rfc3961_nfold_case(struct kunit *test) param->expected_result->len * 8, result); /* Assert */ - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->expected_result->data, - result, param->expected_result->len), 0, - "result mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->expected_result->data, + result, + param->expected_result->len, + "result mismatch"); } static struct kunit_case rfc3961_test_cases[] = { @@ -569,14 +572,16 @@ static void rfc3962_encrypt_case(struct kunit *test) KUNIT_EXPECT_EQ_MSG(test, param->expected_result->len, buf.len, "ciphertext length mismatch"); - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->expected_result->data, - text, param->expected_result->len), 0, - "ciphertext mismatch"); - 
KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->next_iv->data, iv, - param->next_iv->len), 0, - "IV mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->expected_result->data, + text, + param->expected_result->len, + "ciphertext mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->next_iv->data, + iv, + param->next_iv->len, + "IV mismatch"); crypto_free_sync_skcipher(cts_tfm); crypto_free_sync_skcipher(cbc_tfm); @@ -1194,15 +1199,17 @@ static void rfc6803_encrypt_case(struct kunit *test) KUNIT_EXPECT_EQ_MSG(test, param->expected_result->len, buf.len + checksum.len, "ciphertext length mismatch"); - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->expected_result->data, - buf.head[0].iov_base, buf.len), 0, - "encrypted result mismatch"); - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->expected_result->data + - (param->expected_result->len - checksum.len), - checksum.data, checksum.len), 0, - "HMAC mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->expected_result->data, + buf.head[0].iov_base, + buf.len, + "encrypted result mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->expected_result->data + + (param->expected_result->len - checksum.len), + checksum.data, + checksum.len, + "HMAC mismatch"); crypto_free_ahash(ahash_tfm); crypto_free_sync_skcipher(cts_tfm); @@ -1687,15 +1694,16 @@ static void rfc8009_encrypt_case(struct kunit *test) KUNIT_EXPECT_EQ_MSG(test, param->expected_result->len, buf.len, "ciphertext length mismatch"); - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->expected_result->data, - buf.head[0].iov_base, - param->expected_result->len), 0, - "ciphertext mismatch"); - KUNIT_EXPECT_EQ_MSG(test, memcmp(param->expected_hmac->data, - checksum.data, - checksum.len), 0, - "HMAC mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->expected_result->data, + buf.head[0].iov_base, + param->expected_result->len, + "ciphertext mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->expected_hmac->data, + checksum.data, + checksum.len, + "HMAC mismatch"); crypto_free_ahash(ahash_tfm); 
crypto_free_sync_skcipher(cts_tfm); @@ -1826,10 +1834,11 @@ static void encrypt_selftest_case(struct kunit *test) KUNIT_EXPECT_EQ_MSG(test, param->plaintext->len, buf.len, "length mismatch"); - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->plaintext->data, - buf.head[0].iov_base, buf.len), 0, - "plaintext mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->plaintext->data, + buf.head[0].iov_base, + buf.len, + "plaintext mismatch"); crypto_free_sync_skcipher(cts_tfm); crypto_free_sync_skcipher(cbc_tfm); From 6237a17fb8b150b6f2e5d243b2d4f23f85931c2e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 26 Jan 2026 07:10:13 -0500 Subject: [PATCH 06/83] nfsd: add a runtime switch for disabling delegated timestamps The delegated timestamp code seems to be working well enough now that we want to make it always be built in. In the event that there are problems though, we still want to be able to disable them for debugging purposes. Add a switch to debugfs to enable them at runtime. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/debugfs.c | 4 ++++ fs/nfsd/nfs4state.c | 8 ++++++++ fs/nfsd/nfsd.h | 1 + 3 files changed, 13 insertions(+) diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c index 7f44689e0a53..386fd1c54f52 100644 --- a/fs/nfsd/debugfs.c +++ b/fs/nfsd/debugfs.c @@ -140,4 +140,8 @@ void nfsd_debugfs_init(void) debugfs_create_file("io_cache_write", 0644, nfsd_top_dir, NULL, &nfsd_io_cache_write_fops); +#ifdef CONFIG_NFSD_V4 + debugfs_create_bool("delegated_timestamps", 0644, nfsd_top_dir, + &nfsd_delegts_enabled); +#endif } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 023fd665b899..99ade93ac12e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -76,6 +76,8 @@ static const stateid_t close_stateid = { static u64 current_sessionid = 1; +bool nfsd_delegts_enabled __read_mostly = true; + #define ZERO_STATEID(stateid) (!memcmp((stateid), &zero_stateid, sizeof(stateid_t))) #define ONE_STATEID(stateid) (!memcmp((stateid), 
&one_stateid, sizeof(stateid_t))) #define CURRENT_STATEID(stateid) (!memcmp((stateid), ¤tstateid, sizeof(stateid_t))) @@ -6045,8 +6047,14 @@ nfsd4_verify_setuid_write(struct nfsd4_open *open, struct nfsd_file *nf) } #ifdef CONFIG_NFSD_V4_DELEG_TIMESTAMPS +/* + * Timestamp delegation was introduced in RFC7862. Runtime switch for disabling + * this feature is /sys/kernel/debug/nfsd/delegated_timestamps. + */ static bool nfsd4_want_deleg_timestamps(const struct nfsd4_open *open) { + if (!nfsd_delegts_enabled) + return false; return open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS; } #else /* CONFIG_NFSD_V4_DELEG_TIMESTAMPS */ diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index a2e35a4fa105..7c009f07c90b 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -160,6 +160,7 @@ static inline void nfsd_debugfs_exit(void) {} #endif extern bool nfsd_disable_splice_read __read_mostly; +extern bool nfsd_delegts_enabled __read_mostly; enum { /* Any new NFSD_IO enum value must be added at the end */ From 01afb9008527d2be96046a6859de2951306a93e9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 26 Jan 2026 07:10:14 -0500 Subject: [PATCH 07/83] nfsd: remove NFSD_V4_DELEG_TIMESTAMPS Kconfig option Now that there is a runtime debugfs switch, eliminate the compile-time switch and always build in support for delegated timestamps. Administrators who previously disabled this feature at compile time can disable it at runtime via: echo 0 > /sys/kernel/debug/nfsd/delegated_timestamps Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/Kconfig | 10 ---------- fs/nfsd/nfs4state.c | 7 ------- 2 files changed, 17 deletions(-) diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 4fd6e818565e..fc0e87eaa257 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -177,16 +177,6 @@ config NFSD_LEGACY_CLIENT_TRACKING and will be removed in the future. Say Y here if you need support for them in the interim. 
-config NFSD_V4_DELEG_TIMESTAMPS - bool "Support delegated timestamps" - depends on NFSD_V4 - default n - help - NFSD implements delegated timestamps according to - draft-ietf-nfsv4-delstid-08 "Extending the Opening of Files". This - is currently an experimental feature and is therefore left disabled - by default. - config NFSD_V4_POSIX_ACLS bool "Support NFSv4 POSIX draft ACLs" depends on NFSD_V4 diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 99ade93ac12e..a767b562f991 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6046,7 +6046,6 @@ nfsd4_verify_setuid_write(struct nfsd4_open *open, struct nfsd_file *nf) return 0; } -#ifdef CONFIG_NFSD_V4_DELEG_TIMESTAMPS /* * Timestamp delegation was introduced in RFC7862. Runtime switch for disabling * this feature is /sys/kernel/debug/nfsd/delegated_timestamps. @@ -6057,12 +6056,6 @@ static bool nfsd4_want_deleg_timestamps(const struct nfsd4_open *open) return false; return open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS; } -#else /* CONFIG_NFSD_V4_DELEG_TIMESTAMPS */ -static bool nfsd4_want_deleg_timestamps(const struct nfsd4_open *open) -{ - return false; -} -#endif /* CONFIG NFSD_V4_DELEG_TIMESTAMPS */ static struct nfs4_delegation * nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, From aa772bcc40e1722302b05045d96c0169ac5a2717 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:22 -0500 Subject: [PATCH 08/83] lockd: Simplify cast_status() in svcproc.c Clean up: The svcproc.c file handles only NLM v1 and v3 requests. NLMv4 requests are routed to a separate procedure table in svc4proc.c, so rqstp->rq_vers can never be 4 in this context. Remove the unused vers parameter and the dead "vers != 4" check from cast_to_nlm(). This eliminates the need for the macro wrapper. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svcproc.c | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 5817ef272332..95c6bf7ab757 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -17,32 +17,30 @@ #define NLMDBG_FACILITY NLMDBG_CLIENT #ifdef CONFIG_LOCKD_V4 -static __be32 -cast_to_nlm(__be32 status, u32 vers) +static inline __be32 cast_status(__be32 status) { - /* Note: status is assumed to be in network byte order !!! */ - if (vers != 4){ - switch (status) { - case nlm_granted: - case nlm_lck_denied: - case nlm_lck_denied_nolocks: - case nlm_lck_blocked: - case nlm_lck_denied_grace_period: - case nlm_drop_reply: - break; - case nlm4_deadlock: - status = nlm_lck_denied; - break; - default: - status = nlm_lck_denied_nolocks; - } + switch (status) { + case nlm_granted: + case nlm_lck_denied: + case nlm_lck_denied_nolocks: + case nlm_lck_blocked: + case nlm_lck_denied_grace_period: + case nlm_drop_reply: + break; + case nlm4_deadlock: + status = nlm_lck_denied; + break; + default: + status = nlm_lck_denied_nolocks; } - return (status); + return status; } -#define cast_status(status) (cast_to_nlm(status, rqstp->rq_vers)) #else -#define cast_status(status) (status) +static inline __be32 cast_status(__be32 status) +{ + return status; +} #endif /* From 153b9e025308417d167332c93e1bcc11174178de Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:23 -0500 Subject: [PATCH 09/83] lockd: Relocate and rename nlm_drop_reply The nlm_drop_reply status code is internal to the kernel's lockd implementation and must never appear on the wire. Its previous location in xdr.h grouped it with legitimate NLM protocol status codes, obscuring this critical distinction. Relocate the definition to lockd.h with a comment block for internal status codes, and rename to nlm__int__drop_reply to make its internal-only nature explicit. 
This prepares for adding additional internal status codes in subsequent patches. Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 22 ++++++++++++++-------- fs/lockd/svclock.c | 4 ++-- fs/lockd/svcproc.c | 24 +++++++++++++++--------- fs/nfsd/lockd.c | 2 +- include/linux/lockd/lockd.h | 6 ++++++ include/linux/lockd/xdr.h | 2 -- 6 files changed, 38 insertions(+), 22 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4b6f18d97734..9c756d07223a 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -104,12 +104,13 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp) /* Obtain client and file */ if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; /* Now check for conflicting locks */ resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock); - if (resp->status == nlm_drop_reply) + if (resp->status == nlm__int__drop_reply) rc = rpc_drop_reply; else dprintk("lockd: TEST4 status %d\n", ntohl(resp->status)); @@ -140,13 +141,14 @@ __nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_res *resp) /* Obtain client and file */ if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? 
+ rpc_drop_reply : rpc_success; /* Now try to lock the file */ resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock, argp->block, &argp->cookie, argp->reclaim); - if (resp->status == nlm_drop_reply) + if (resp->status == nlm__int__drop_reply) rc = rpc_drop_reply; else dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); @@ -182,7 +184,8 @@ __nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_res *resp) /* Obtain client and file */ if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; /* Try to cancel request. */ resp->status = nlmsvc_cancel_blocked(SVC_NET(rqstp), file, &argp->lock); @@ -222,7 +225,8 @@ __nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_res *resp) /* Obtain client and file */ if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; /* Now try to remove the lock */ resp->status = nlmsvc_unlock(SVC_NET(rqstp), file, &argp->lock); @@ -369,7 +373,8 @@ nlm4svc_proc_share(struct svc_rqst *rqstp) /* Obtain client and file */ if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; /* Now try to create the share */ resp->status = nlmsvc_share_file(host, file, argp); @@ -404,7 +409,8 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp) /* Obtain client and file */ if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? 
+ rpc_drop_reply : rpc_success; /* Now try to lock the file */ resp->status = nlmsvc_unshare_file(host, file, argp); diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 255a847ca0b6..d86b02153c7c 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -463,7 +463,7 @@ nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block) block->b_deferred_req = rqstp->rq_chandle.defer(block->b_cache_req); if (block->b_deferred_req != NULL) - status = nlm_drop_reply; + status = nlm__int__drop_reply; } dprintk("lockd: nlmsvc_defer_lock_rqst block %p flags %d status %d\n", block, block->b_flags, ntohl(status)); @@ -531,7 +531,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, ret = nlm_lck_denied; goto out; } - ret = nlm_drop_reply; + ret = nlm__int__drop_reply; goto out; } diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 95c6bf7ab757..2a2e48a9bd12 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -25,7 +25,7 @@ static inline __be32 cast_status(__be32 status) case nlm_lck_denied_nolocks: case nlm_lck_blocked: case nlm_lck_denied_grace_period: - case nlm_drop_reply: + case nlm__int__drop_reply: break; case nlm4_deadlock: status = nlm_lck_denied; @@ -122,12 +122,13 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp) /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? 
+ rpc_drop_reply : rpc_success; /* Now check for conflicting locks */ resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock)); - if (resp->status == nlm_drop_reply) + if (resp->status == nlm__int__drop_reply) rc = rpc_drop_reply; else dprintk("lockd: TEST status %d vers %d\n", @@ -159,13 +160,14 @@ __nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_res *resp) /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; /* Now try to lock the file */ resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock, argp->block, &argp->cookie, argp->reclaim)); - if (resp->status == nlm_drop_reply) + if (resp->status == nlm__int__drop_reply) rc = rpc_drop_reply; else dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); @@ -202,7 +204,8 @@ __nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_res *resp) /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; /* Try to cancel request. */ resp->status = cast_status(nlmsvc_cancel_blocked(net, file, &argp->lock)); @@ -243,7 +246,8 @@ __nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_res *resp) /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? 
+ rpc_drop_reply : rpc_success; /* Now try to remove the lock */ resp->status = cast_status(nlmsvc_unlock(net, file, &argp->lock)); @@ -400,7 +404,8 @@ nlmsvc_proc_share(struct svc_rqst *rqstp) /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; /* Now try to create the share */ resp->status = cast_status(nlmsvc_share_file(host, file, argp)); @@ -435,7 +440,8 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp) /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; /* Now try to unshare the file */ resp->status = cast_status(nlmsvc_unshare_file(host, file, argp)); diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index c774ce9aa296..8c230ccd6645 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -71,7 +71,7 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp, * to callback when the delegation is returned but might * not have a proper lock request to block on. */ - return nlm_drop_reply; + return nlm__int__drop_reply; case nfserr_stale: return nlm_stale_fh; default: diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 330e38776bb2..fdefec39553f 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -38,6 +38,12 @@ */ #define LOCKD_DFLT_TIMEO 10 +/* + * Internal-use status codes, not to be placed on the wire. + * Version handlers translate these to appropriate wire values. + */ +#define nlm__int__drop_reply cpu_to_be32(30000) + /* * Lockd host handle (used both by the client and server personality). 
*/ diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h index 17d53165d9f2..292e4e38d17d 100644 --- a/include/linux/lockd/xdr.h +++ b/include/linux/lockd/xdr.h @@ -33,8 +33,6 @@ struct svc_rqst; #define nlm_lck_blocked cpu_to_be32(NLM_LCK_BLOCKED) #define nlm_lck_denied_grace_period cpu_to_be32(NLM_LCK_DENIED_GRACE_PERIOD) -#define nlm_drop_reply cpu_to_be32(30000) - /* Lock info passed via NLM */ struct nlm_lock { char * caller; From 9e0d0c61940796893e0c2200cdc7be0684218238 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:24 -0500 Subject: [PATCH 10/83] lockd: Introduce nlm__int__deadlock The use of CONFIG_LOCKD_V4 in combination with a later cast_status() in the NLMv3 code is difficult to reason about. Instead, replace the use of nlm_deadlock with an implementation-defined status value that version-specific code translates appropriately. The new approach establishes a translation boundary: generic lockd code returns nlm__int__deadlock when posix_lock_file() yields -EDEADLK. Version-specific handlers (svc4proc.c for NLMv4, svcproc.c for NLMv3) translate this internal status to the appropriate wire protocol value. NLMv4 maps to nlm4_deadlock; NLMv3 maps to nlm_lck_denied (since NLMv3 lacks a deadlock-specific status code). Later this modification will also remove the need to include NLMv4 headers in NLMv3 and generic code. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 10 ++++++++-- fs/lockd/svclock.c | 8 +------- fs/lockd/svcproc.c | 4 +++- include/linux/lockd/lockd.h | 1 + 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 9c756d07223a..55b6dcc56db1 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -148,10 +148,16 @@ __nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_res *resp) resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock, argp->block, &argp->cookie, argp->reclaim); - if (resp->status == nlm__int__drop_reply) + switch (resp->status) { + case nlm__int__drop_reply: rc = rpc_drop_reply; - else + break; + case nlm__int__deadlock: + resp->status = nlm4_deadlock; + fallthrough; + default: dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); + } nlmsvc_release_lockowner(&argp->lock); nlmsvc_release_host(host); diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index d86b02153c7c..5edf00751a1e 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -33,12 +33,6 @@ #define NLMDBG_FACILITY NLMDBG_SVCLOCK -#ifdef CONFIG_LOCKD_V4 -#define nlm_deadlock nlm4_deadlock -#else -#define nlm_deadlock nlm_lck_denied -#endif - static void nlmsvc_release_block(struct nlm_block *block); static void nlmsvc_insert_block(struct nlm_block *block, unsigned long); static void nlmsvc_remove_block(struct nlm_block *block); @@ -589,7 +583,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; case -EDEADLK: nlmsvc_remove_block(block); - ret = nlm_deadlock; + ret = nlm__int__deadlock; goto out; default: /* includes ENOLCK */ nlmsvc_remove_block(block); diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 2a2e48a9bd12..27ed71935e45 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -27,7 +27,7 @@ static inline __be32 cast_status(__be32 status) case nlm_lck_denied_grace_period: case nlm__int__drop_reply: break; - case nlm4_deadlock: + case 
nlm__int__deadlock: status = nlm_lck_denied; break; default: @@ -39,6 +39,8 @@ static inline __be32 cast_status(__be32 status) #else static inline __be32 cast_status(__be32 status) { + if (status == nlm__int__deadlock) + status = nlm_lck_denied; return status; } #endif diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index fdefec39553f..793691912137 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -43,6 +43,7 @@ * Version handlers translate these to appropriate wire values. */ #define nlm__int__drop_reply cpu_to_be32(30000) +#define nlm__int__deadlock cpu_to_be32(30001) /* * Lockd host handle (used both by the client and server personality). From 7db001e03d7a668ca6c3789fee42a24236ca90f6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:25 -0500 Subject: [PATCH 11/83] lockd: Have nlm_fopen() return errno values The nlm_fopen() function is part of the API between nfsd and lockd. Currently its return value is an on-the-wire NLM status code. But that forces NFSD to include NLM wire protocol definitions despite having no other dependency on the NLM wire protocol. In addition, a CONFIG_LOCKD_V4 Kconfig symbol appears in the middle of NFSD source code. Refactor: Let's not use on-the-wire values as part of a high-level API between two Linux kernel modules. That's what we have errno for, right? And, instead of simply moving the CONFIG_LOCKD_V4 check, we can get rid of it entirely and let the decision of what actual NLM status code goes on the wire to be left up to NLM version-specific code. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 18 ++++++++++--- fs/lockd/svcproc.c | 14 ++++++++++- fs/lockd/svcsubs.c | 27 +++++++++++++++----- fs/nfsd/lockd.c | 50 +++++++++++++++++++++---------------- include/linux/lockd/bind.h | 8 +++--- include/linux/lockd/lockd.h | 2 ++ 6 files changed, 82 insertions(+), 37 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 55b6dcc56db1..4ceb27cc72e4 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -73,9 +73,21 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, no_locks: nlmsvc_release_host(host); - if (error) - return error; - return nlm_lck_denied_nolocks; + switch (error) { + case nlm_granted: + return nlm_lck_denied_nolocks; + case nlm__int__stale_fh: + return nlm4_stale_fh; + case nlm__int__failed: + return nlm4_failed; + default: + if (be32_to_cpu(error) >= 30000) { + pr_warn_once("lockd: unhandled internal status %u\n", + be32_to_cpu(error)); + return nlm4_failed; + } + return error; + } } /* diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 27ed71935e45..272c8f36ed2a 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -39,8 +39,20 @@ static inline __be32 cast_status(__be32 status) #else static inline __be32 cast_status(__be32 status) { - if (status == nlm__int__deadlock) + switch (status) { + case nlm__int__deadlock: status = nlm_lck_denied; + break; + case nlm__int__stale_fh: + case nlm__int__failed: + status = nlm_lck_denied_nolocks; + break; + default: + if (be32_to_cpu(status) >= 30000) + pr_warn_once("lockd: unhandled internal status %u\n", + be32_to_cpu(status)); + break; + } return status; } #endif diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index dd0214dcb695..967739d2aa90 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -87,14 +87,29 @@ static __be32 nlm_do_fopen(struct svc_rqst *rqstp, struct nlm_file *file, int mode) { struct file **fp = &file->f_file[mode]; - __be32 nfserr; + __be32 
nlmerr = nlm_granted; + int error; if (*fp) - return 0; - nfserr = nlmsvc_ops->fopen(rqstp, &file->f_handle, fp, mode); - if (nfserr) - dprintk("lockd: open failed (error %d)\n", nfserr); - return nfserr; + return nlmerr; + + error = nlmsvc_ops->fopen(rqstp, &file->f_handle, fp, mode); + if (error) { + dprintk("lockd: open failed (errno %d)\n", error); + switch (error) { + case -EWOULDBLOCK: + nlmerr = nlm__int__drop_reply; + break; + case -ESTALE: + nlmerr = nlm__int__stale_fh; + break; + default: + nlmerr = nlm__int__failed; + break; + } + } + + return nlmerr; } /* diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 8c230ccd6645..6fe1325815e0 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -14,19 +14,20 @@ #define NFSDDBG_FACILITY NFSDDBG_LOCKD -#ifdef CONFIG_LOCKD_V4 -#define nlm_stale_fh nlm4_stale_fh -#define nlm_failed nlm4_failed -#else -#define nlm_stale_fh nlm_lck_denied_nolocks -#define nlm_failed nlm_lck_denied_nolocks -#endif -/* - * Note: we hold the dentry use count while the file is open. +/** + * nlm_fopen - Open an NFSD file + * @rqstp: NLM RPC procedure execution context + * @f: NFS file handle to be opened + * @filp: OUT: an opened struct file + * @flags: the POSIX open flags to use + * + * nlm_fopen() holds the dentry reference until nlm_fclose() releases it. + * + * Returns zero on success or a negative errno value if the file + * cannot be opened. */ -static __be32 -nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp, - int mode) +static int nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, + struct file **filp, int flags) { __be32 nfserr; int access; @@ -47,18 +48,17 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp, * if NFSEXP_NOAUTHNLM is set. Some older clients use AUTH_NULL * for NLM requests. */ - access = (mode == O_WRONLY) ? NFSD_MAY_WRITE : NFSD_MAY_READ; + access = (flags == O_WRONLY) ? 
NFSD_MAY_WRITE : NFSD_MAY_READ; access |= NFSD_MAY_NLM | NFSD_MAY_OWNER_OVERRIDE | NFSD_MAY_BYPASS_GSS; nfserr = nfsd_open(rqstp, &fh, S_IFREG, access, filp); fh_put(&fh); - /* We return nlm error codes as nlm doesn't know - * about nfsd, but nfsd does know about nlm.. - */ + switch (nfserr) { case nfs_ok: - return 0; + break; case nfserr_jukebox: - /* this error can indicate a presence of a conflicting + /* + * This error can indicate a presence of a conflicting * delegation to an NLM lock request. Options are: * (1) For now, drop this request and make the client * retry. When delegation is returned, client's lock retry @@ -66,19 +66,25 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp, * (2) NLM4_DENIED as per "spec" signals to the client * that the lock is unavailable now but client can retry. * Linux client implementation does not. It treats - * NLM4_DENIED same as NLM4_FAILED and errors the request. + * NLM4_DENIED same as NLM4_FAILED and fails the request. * (3) For the future, treat this as blocked lock and try * to callback when the delegation is returned but might * not have a proper lock request to block on. 
*/ - return nlm__int__drop_reply; + return -EWOULDBLOCK; case nfserr_stale: - return nlm_stale_fh; + return -ESTALE; default: - return nlm_failed; + return -ENOLCK; } + + return 0; } +/** + * nlm_fclose - Close an NFSD file + * @filp: a struct file that was opened by nlm_fopen() + */ static void nlm_fclose(struct file *filp) { diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index c53c81242e72..2f5dd9e943ee 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -26,11 +26,9 @@ struct rpc_clnt; * This is the set of functions for lockd->nfsd communication */ struct nlmsvc_binding { - __be32 (*fopen)(struct svc_rqst *, - struct nfs_fh *, - struct file **, - int mode); - void (*fclose)(struct file *); + int (*fopen)(struct svc_rqst *rqstp, struct nfs_fh *f, + struct file **filp, int flags); + void (*fclose)(struct file *filp); }; extern const struct nlmsvc_binding *nlmsvc_ops; diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 793691912137..195e6ce28f6e 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -44,6 +44,8 @@ */ #define nlm__int__drop_reply cpu_to_be32(30000) #define nlm__int__deadlock cpu_to_be32(30001) +#define nlm__int__stale_fh cpu_to_be32(30002) +#define nlm__int__failed cpu_to_be32(30003) /* * Lockd host handle (used both by the client and server personality). From efb5b15e3b78f5644dd2d4ddec8880e0c9aa5b5f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:26 -0500 Subject: [PATCH 12/83] lockd: Relocate nlmsvc_unlock API declarations The nlmsvc_unlock_all_by_sb() and nlmsvc_unlock_all_by_ip() functions are part of lockd's external API, consumed by other kernel subsystems. Their declarations currently reside in linux/lockd/lockd.h alongside internal implementation details, which blurs the boundary between lockd's public interface and its private internals. 
Moving these declarations to linux/lockd/bind.h groups them with other external API functions and makes the separation explicit. This clarifies which functions are intended for external use and reduces the risk of internal implementation details leaking into the public API surface. Build-tested with allyesconfig; no functional changes. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfsctl.c | 2 +- include/linux/lockd/bind.h | 7 +++++++ include/linux/lockd/lockd.h | 6 ------ 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 71aabdaa1d15..0bf01ae411c5 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -11,7 +11,7 @@ #include #include -#include +#include #include #include #include diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 2f5dd9e943ee..82eca0a13ccc 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -21,6 +21,7 @@ struct svc_rqst; struct rpc_task; struct rpc_clnt; +struct super_block; /* * This is the set of functions for lockd->nfsd communication @@ -80,4 +81,10 @@ extern int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl, vo extern int lockd_up(struct net *net, const struct cred *cred); extern void lockd_down(struct net *net); +/* + * Cluster failover support + */ +int nlmsvc_unlock_all_by_sb(struct super_block *sb); +int nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr); + #endif /* LINUX_LOCKD_BIND_H */ diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 195e6ce28f6e..0d883f48ec21 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -311,12 +311,6 @@ void nlmsvc_mark_resources(struct net *); void nlmsvc_free_host_resources(struct nlm_host *); void nlmsvc_invalidate_all(void); -/* - * Cluster failover support - */ -int nlmsvc_unlock_all_by_sb(struct super_block *sb); -int nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr); - static inline struct file 
*nlmsvc_file_file(const struct nlm_file *file) { return file->f_file[O_RDONLY] ? From 840621fd2ff23ada8b9262d90477e75232566e6b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:27 -0500 Subject: [PATCH 13/83] NFS: Use nlmclnt_shutdown_rpc_clnt() to safely shut down NLM A race condition exists in shutdown_store() when writing to the sysfs "shutdown" file concurrently with nlm_shutdown_hosts_net(). Without synchronization, the following sequence can occur: 1. shutdown_store() reads server->nlm_host (non-NULL) 2. nlm_shutdown_hosts_net() acquires nlm_host_mutex, calls rpc_shutdown_client(), sets h_rpcclnt to NULL, and potentially frees the host via nlm_gc_hosts() 3. shutdown_store() dereferences the now-stale or freed host Introduce nlmclnt_shutdown_rpc_clnt(), which acquires nlm_host_mutex before accessing h_rpcclnt. This synchronizes with nlm_shutdown_hosts_net() and ensures the rpc_clnt pointer remains valid during the shutdown operation. This change also improves API layering: NFS client code no longer needs to include the internal lockd header to access nlm_host fields. The new helper resides in bind.h alongside other public lockd interfaces. Reported-by: Jeff Layton Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/host.c | 29 +++++++++++++++++++++++++++++ fs/nfs/sysfs.c | 4 ++-- include/linux/lockd/bind.h | 1 + 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 1a9582a10a86..015900d2d4c2 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -306,6 +306,35 @@ void nlmclnt_release_host(struct nlm_host *host) } } +/* Callback for rpc_cancel_tasks() - matches all tasks for cancellation */ +static bool nlmclnt_match_all(const struct rpc_task *task, const void *data) +{ + return true; +} + +/** + * nlmclnt_shutdown_rpc_clnt - safely shut down NLM client RPC operations + * @host: nlm_host to shut down + * + * Cancels outstanding RPC tasks and marks the client as shut down. 
+ * Synchronizes with nlmclnt_release_host() via nlm_host_mutex to prevent + * races between shutdown and host destruction. Safe to call if h_rpcclnt + * is NULL or already shut down. + */ +void nlmclnt_shutdown_rpc_clnt(struct nlm_host *host) +{ + struct rpc_clnt *clnt; + + mutex_lock(&nlm_host_mutex); + clnt = host->h_rpcclnt; + if (clnt) { + clnt->cl_shutdown = 1; + rpc_cancel_tasks(clnt, -EIO, nlmclnt_match_all, NULL); + } + mutex_unlock(&nlm_host_mutex); +} +EXPORT_SYMBOL_GPL(nlmclnt_shutdown_rpc_clnt); + /** * nlmsvc_lookup_host - Find an NLM host handle matching a remote client * @rqstp: incoming NLM request diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index 7d8921f524a6..051da37770d8 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include "internal.h" #include "nfs4_fs.h" @@ -285,7 +285,7 @@ shutdown_store(struct kobject *kobj, struct kobj_attribute *attr, shutdown_client(server->client_acl); if (server->nlm_host) - shutdown_client(server->nlm_host->h_rpcclnt); + nlmclnt_shutdown_rpc_clnt(server->nlm_host); out: shutdown_nfs_client(server->nfs_client); return count; diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 82eca0a13ccc..39c124dcb19c 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -57,6 +57,7 @@ struct nlmclnt_initdata { extern struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init); extern void nlmclnt_done(struct nlm_host *host); extern struct rpc_clnt *nlmclnt_rpc_clnt(struct nlm_host *host); +extern void nlmclnt_shutdown_rpc_clnt(struct nlm_host *host); /* * NLM client operations provide a means to modify RPC processing of NLM From f4d5f8caadd858f11b21e8a9e5c85290fc21a568 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:28 -0500 Subject: [PATCH 14/83] lockd: Move xdr4.h from include/linux/lockd/ to fs/lockd/ The xdr4.h header declares NLMv4-specific XDR encoder/decoder functions and error 
codes that are used exclusively within the lockd subsystem. Moving it from include/linux/lockd/ to fs/lockd/ clarifies the intended scope of these declarations and prevents external code from depending on lockd-internal interfaces. This change reduces the public API surface of the lockd module and makes it easier to refactor NLMv4 internals without risk of breaking out-of-tree consumers. The header's contents are implementation details of the NLMv4 wire protocol handling, not a contract with other kernel subsystems. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/clnt4xdr.c | 2 ++ fs/lockd/svc4proc.c | 2 ++ fs/lockd/xdr4.c | 1 + {include/linux => fs}/lockd/xdr4.h | 15 +++------------ include/linux/lockd/bind.h | 3 --- include/linux/lockd/lockd.h | 7 ++++--- 6 files changed, 12 insertions(+), 18 deletions(-) rename {include/linux => fs}/lockd/xdr4.h (84%) diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c index 527458db4525..23896073c7e5 100644 --- a/fs/lockd/clnt4xdr.c +++ b/fs/lockd/clnt4xdr.c @@ -17,6 +17,8 @@ #include +#include "xdr4.h" + #define NLMDBG_FACILITY NLMDBG_XDR #if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4ceb27cc72e4..51d072a83a49 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -14,6 +14,8 @@ #include #include +#include "xdr4.h" + #define NLMDBG_FACILITY NLMDBG_CLIENT /* diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index e343c820301f..5b1e15977697 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -19,6 +19,7 @@ #include #include "svcxdr.h" +#include "xdr4.h" static inline s64 loff_t_to_s64(loff_t offset) diff --git a/include/linux/lockd/xdr4.h b/fs/lockd/xdr4.h similarity index 84% rename from include/linux/lockd/xdr4.h rename to fs/lockd/xdr4.h index 72831e35dca3..7be318c0512b 100644 --- a/include/linux/lockd/xdr4.h +++ b/fs/lockd/xdr4.h @@ -1,19 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * linux/include/linux/lockd/xdr4.h - * * XDR types for 
the NLM protocol * * Copyright (C) 1996 Olaf Kirch */ -#ifndef LOCKD_XDR4_H -#define LOCKD_XDR4_H - -#include -#include -#include -#include +#ifndef _LOCKD_XDR4_H +#define _LOCKD_XDR4_H /* error codes new to NLMv4 */ #define nlm4_deadlock cpu_to_be32(NLM_DEADLCK) @@ -38,6 +31,4 @@ bool nlm4svc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); bool nlm4svc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); bool nlm4svc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr); -extern const struct rpc_version nlm_version4; - -#endif /* LOCKD_XDR4_H */ +#endif /* _LOCKD_XDR4_H */ diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 39c124dcb19c..077da0696f12 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -13,9 +13,6 @@ #include /* need xdr-encoded error codes too, so... */ #include -#ifdef CONFIG_LOCKD_V4 -#include -#endif /* Dummy declarations */ struct svc_rqst; diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 0d883f48ec21..46f244141645 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -22,9 +22,6 @@ #include #include #include -#ifdef CONFIG_LOCKD_V4 -#include -#endif #include #include @@ -235,6 +232,10 @@ int nlmclnt_reclaim(struct nlm_host *, struct file_lock *, struct nlm_rqst *); void nlmclnt_next_cookie(struct nlm_cookie *); +#ifdef CONFIG_LOCKD_V4 +extern const struct rpc_version nlm_version4; +#endif + /* * Host cache */ From 4db2f8a016dc9f9b357bfbf5c507c2582bb36730 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:29 -0500 Subject: [PATCH 15/83] lockd: Move share.h from include/linux/lockd/ to fs/lockd/ The share.h header defines struct nlm_share and declares the DOS share management functions used by the NLM server to implement NLM_SHARE and NLM_UNSHARE operations. These interfaces are used exclusively within the lockd subsystem. A git grep search confirms no external code references them. 
Relocating this header from include/linux/lockd/ to fs/lockd/ narrows the public API surface of the lockd module. Out-of-tree code cannot depend on these internal interfaces after this change. Future refactoring of the share management implementation thus requires no consideration of external consumers. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- {include/linux => fs}/lockd/share.h | 8 +++----- fs/lockd/svc4proc.c | 2 +- fs/lockd/svcproc.c | 3 ++- fs/lockd/svcshare.c | 3 ++- fs/lockd/svcsubs.c | 3 ++- include/linux/lockd/lockd.h | 2 ++ 6 files changed, 12 insertions(+), 9 deletions(-) rename {include/linux => fs}/lockd/share.h (85%) diff --git a/include/linux/lockd/share.h b/fs/lockd/share.h similarity index 85% rename from include/linux/lockd/share.h rename to fs/lockd/share.h index 1f18a9faf645..d8f4ebd9c278 100644 --- a/include/linux/lockd/share.h +++ b/fs/lockd/share.h @@ -1,14 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * linux/include/linux/lockd/share.h - * * DOS share management for lockd. 
* * Copyright (C) 1996, Olaf Kirch */ -#ifndef LINUX_LOCKD_SHARE_H -#define LINUX_LOCKD_SHARE_H +#ifndef _LOCKD_SHARE_H +#define _LOCKD_SHARE_H /* * DOS share for a specific file @@ -29,4 +27,4 @@ __be32 nlmsvc_unshare_file(struct nlm_host *, struct nlm_file *, void nlmsvc_traverse_shares(struct nlm_host *, struct nlm_file *, nlm_host_match_fn_t); -#endif /* LINUX_LOCKD_SHARE_H */ +#endif /* _LOCKD_SHARE_H */ diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 51d072a83a49..da88b638d90d 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -11,9 +11,9 @@ #include #include #include -#include #include +#include "share.h" #include "xdr4.h" #define NLMDBG_FACILITY NLMDBG_CLIENT diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 272c8f36ed2a..8441fabd019f 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -11,9 +11,10 @@ #include #include #include -#include #include +#include "share.h" + #define NLMDBG_FACILITY NLMDBG_CLIENT #ifdef CONFIG_LOCKD_V4 diff --git a/fs/lockd/svcshare.c b/fs/lockd/svcshare.c index 88c81ce1148d..8e06840834c6 100644 --- a/fs/lockd/svcshare.c +++ b/fs/lockd/svcshare.c @@ -15,7 +15,8 @@ #include #include #include -#include + +#include "share.h" static inline int nlm_cmp_owner(struct nlm_share *share, struct xdr_netobj *oh) diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 967739d2aa90..ce596a17112c 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -16,11 +16,12 @@ #include #include #include -#include #include #include #include +#include "share.h" + #define NLMDBG_FACILITY NLMDBG_SVCSUBS diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 46f244141645..eebcecd12fae 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -155,6 +155,8 @@ struct nlm_rqst { void * a_callback_data; /* sent to nlmclnt_operations callbacks */ }; +struct nlm_share; + /* * This struct describes a file held open by lockd on behalf of * an NFS client. 
From 2c562c6e6715619ce34bb37d8a0a5e40fdcc7a44 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:30 -0500 Subject: [PATCH 16/83] lockd: Relocate include/linux/lockd/lockd.h Headers placed in include/linux/ form part of the kernel's internal API and signal to subsystem maintainers that other parts of the kernel may depend on them. By moving lockd.h into fs/lockd/, lockd becomes a more self-contained module whose internal interfaces are clearly distinguished from its public contract with the rest of the kernel. This relocation addresses a long-standing XXX comment in the header itself that acknowledged the file's misplacement. Future changes to lockd internals can now proceed with confidence that external consumers are not inadvertently coupled to implementation details. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/clnt4xdr.c | 3 ++- fs/lockd/clntlock.c | 2 +- fs/lockd/clntproc.c | 2 +- fs/lockd/clntxdr.c | 3 ++- fs/lockd/host.c | 2 +- {include/linux => fs}/lockd/lockd.h | 12 +++--------- fs/lockd/mon.c | 2 +- fs/lockd/svc.c | 2 +- fs/lockd/svc4proc.c | 2 +- fs/lockd/svclock.c | 3 ++- fs/lockd/svcproc.c | 2 +- fs/lockd/svcshare.c | 2 +- fs/lockd/svcsubs.c | 2 +- fs/lockd/trace.h | 3 ++- fs/lockd/xdr.c | 3 +-- fs/lockd/xdr4.c | 2 +- 16 files changed, 22 insertions(+), 25 deletions(-) rename {include/linux => fs}/lockd/lockd.h (98%) diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c index 23896073c7e5..61ee5fa6dfa4 100644 --- a/fs/lockd/clnt4xdr.c +++ b/fs/lockd/clnt4xdr.c @@ -13,7 +13,8 @@ #include #include #include -#include + +#include "lockd.h" #include diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 85bc0f3e91df..8fa30c42c92a 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -15,9 +15,9 @@ #include #include #include -#include #include +#include "lockd.h" #include "trace.h" #define NLMDBG_FACILITY NLMDBG_CLIENT diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 
fb4d0752c9bb..7f211008a5d2 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -18,8 +18,8 @@ #include #include #include -#include +#include "lockd.h" #include "trace.h" #define NLMDBG_FACILITY NLMDBG_CLIENT diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c index 6ea3448d2d31..65555f5224b1 100644 --- a/fs/lockd/clntxdr.c +++ b/fs/lockd/clntxdr.c @@ -15,7 +15,8 @@ #include #include #include -#include + +#include "lockd.h" #include diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 015900d2d4c2..ea8a8e166f7e 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -16,13 +16,13 @@ #include #include #include -#include #include #include #include +#include "lockd.h" #include "netns.h" #define NLMDBG_FACILITY NLMDBG_HOSTCACHE diff --git a/include/linux/lockd/lockd.h b/fs/lockd/lockd.h similarity index 98% rename from include/linux/lockd/lockd.h rename to fs/lockd/lockd.h index eebcecd12fae..9bcf89765a69 100644 --- a/include/linux/lockd/lockd.h +++ b/fs/lockd/lockd.h @@ -1,16 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * linux/include/linux/lockd/lockd.h - * - * General-purpose lockd include file. - * * Copyright (C) 1996 Olaf Kirch */ -#ifndef LINUX_LOCKD_LOCKD_H -#define LINUX_LOCKD_LOCKD_H - -/* XXX: a lot of this should really be under fs/lockd. 
*/ +#ifndef _LOCKD_LOCKD_H +#define _LOCKD_LOCKD_H #include #include @@ -398,4 +392,4 @@ static inline int nlm_compare_locks(const struct file_lock *fl1, extern const struct lock_manager_operations nlmsvc_lock_operations; -#endif /* LINUX_LOCKD_LOCKD_H */ +#endif /* _LOCKD_LOCKD_H */ diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index b8fc732e1c67..3d3ee88ca4dc 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -16,10 +16,10 @@ #include #include #include -#include #include +#include "lockd.h" #include "netns.h" #define NLMDBG_FACILITY NLMDBG_MONITOR diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index dcd80c4e74c9..9dd7f8e11544 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -36,9 +36,9 @@ #include #include #include -#include #include +#include "lockd.h" #include "netns.h" #include "procfs.h" #include "netlink.h" diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index da88b638d90d..86dfeb6ce68d 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -10,9 +10,9 @@ #include #include -#include #include +#include "lockd.h" #include "share.h" #include "xdr4.h" diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 5edf00751a1e..1c800fffe69c 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -29,7 +29,8 @@ #include #include #include -#include + +#include "lockd.h" #define NLMDBG_FACILITY NLMDBG_SVCLOCK diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 8441fabd019f..e9a6bcc3bf2e 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -10,9 +10,9 @@ #include #include -#include #include +#include "lockd.h" #include "share.h" #define NLMDBG_FACILITY NLMDBG_CLIENT diff --git a/fs/lockd/svcshare.c b/fs/lockd/svcshare.c index 8e06840834c6..8675ac80ab16 100644 --- a/fs/lockd/svcshare.c +++ b/fs/lockd/svcshare.c @@ -14,8 +14,8 @@ #include #include -#include +#include "lockd.h" #include "share.h" static inline int diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index ce596a17112c..71eaec5ed8d7 100644 --- a/fs/lockd/svcsubs.c +++ 
b/fs/lockd/svcsubs.c @@ -15,11 +15,11 @@ #include #include #include -#include #include #include #include +#include "lockd.h" #include "share.h" #define NLMDBG_FACILITY NLMDBG_SVCSUBS diff --git a/fs/lockd/trace.h b/fs/lockd/trace.h index 7461b13b6e74..7214d7e96a42 100644 --- a/fs/lockd/trace.h +++ b/fs/lockd/trace.h @@ -8,7 +8,8 @@ #include #include #include -#include + +#include "lockd.h" #ifdef CONFIG_LOCKD_V4 #define NLM_STATUS_LIST \ diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index adfcce2bf11b..5aac49d1875a 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -15,13 +15,12 @@ #include #include #include -#include #include +#include "lockd.h" #include "svcxdr.h" - static inline loff_t s32_to_loff_t(__s32 offset) { diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index 5b1e15977697..f57d4881d5f1 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -16,8 +16,8 @@ #include #include #include -#include +#include "lockd.h" #include "svcxdr.h" #include "xdr4.h" From 236f3171ac690f632e13d391f47c68c3a8519bd2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:31 -0500 Subject: [PATCH 17/83] lockd: Remove lockd/debug.h The lockd include structure has unnecessary indirection. The header include/linux/lockd/debug.h is consumed only by fs/lockd/lockd.h, creating an extra compilation dependency and making the code harder to navigate. Fold the debug.h definitions directly into lockd.h and remove the now-redundant header. This reduces the include tree depth and makes the debug-related definitions easier to find when working on lockd internals. Build-tested with lockd built as module and built-in. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/lockd.h | 24 +++++++++++++++++++++- include/linux/lockd/debug.h | 40 ------------------------------------- 2 files changed, 23 insertions(+), 41 deletions(-) delete mode 100644 include/linux/lockd/debug.h diff --git a/fs/lockd/lockd.h b/fs/lockd/lockd.h index 9bcf89765a69..460ccb701749 100644 --- a/fs/lockd/lockd.h +++ b/fs/lockd/lockd.h @@ -16,9 +16,31 @@ #include #include #include -#include +#include #include +/* + * Enable lockd debugging. + * Requires CONFIG_SUNRPC_DEBUG. + */ +#undef ifdebug +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define ifdebug(flag) if (unlikely(nlm_debug & NLMDBG_##flag)) +#else +# define ifdebug(flag) if (0) +#endif + +#define NLMDBG_SVC 0x0001 +#define NLMDBG_CLIENT 0x0002 +#define NLMDBG_CLNTLOCK 0x0004 +#define NLMDBG_SVCLOCK 0x0008 +#define NLMDBG_MONITOR 0x0010 +#define NLMDBG_CLNTSUBS 0x0020 +#define NLMDBG_SVCSUBS 0x0040 +#define NLMDBG_HOSTCACHE 0x0080 +#define NLMDBG_XDR 0x0100 +#define NLMDBG_ALL 0x7fff + /* * Version string */ diff --git a/include/linux/lockd/debug.h b/include/linux/lockd/debug.h deleted file mode 100644 index eede2ab5246f..000000000000 --- a/include/linux/lockd/debug.h +++ /dev/null @@ -1,40 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/include/linux/lockd/debug.h - * - * Debugging stuff. - * - * Copyright (C) 1996 Olaf Kirch - */ - -#ifndef LINUX_LOCKD_DEBUG_H -#define LINUX_LOCKD_DEBUG_H - -#include - -/* - * Enable lockd debugging. - * Requires RPC_DEBUG. 
- */ -#undef ifdebug -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define ifdebug(flag) if (unlikely(nlm_debug & NLMDBG_##flag)) -#else -# define ifdebug(flag) if (0) -#endif - -/* - * Debug flags - */ -#define NLMDBG_SVC 0x0001 -#define NLMDBG_CLIENT 0x0002 -#define NLMDBG_CLNTLOCK 0x0004 -#define NLMDBG_SVCLOCK 0x0008 -#define NLMDBG_MONITOR 0x0010 -#define NLMDBG_CLNTSUBS 0x0020 -#define NLMDBG_SVCSUBS 0x0040 -#define NLMDBG_HOSTCACHE 0x0080 -#define NLMDBG_XDR 0x0100 -#define NLMDBG_ALL 0x7fff - -#endif /* LINUX_LOCKD_DEBUG_H */ From 615384a24b1e6b0f091ebc1dfbf7ec8b4c27fa81 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:32 -0500 Subject: [PATCH 18/83] lockd: Move xdr.h from include/linux/lockd/ to fs/lockd/ The lockd subsystem unnecessarily exposes internal NLM XDR type definitions through the global include path. These definitions are not used by any code outside fs/lockd/, making them inappropriate for include/linux/lockd/. Moving xdr.h to fs/lockd/ narrows the API surface and clarifies that these types are internal implementation details. The comment in linux/lockd/bind.h stating xdr.h was needed for "xdr-encoded error codes" is stale: no lockd API consumers use those codes. Forward declarations for struct nfs_fh and struct file_lock are added to bind.h because their definitions were previously pulled in transitively through xdr.h. Additionally, nfs3proc.c and proc.c need explicit includes of filelock.h for FL_CLOSE and for accessing struct file_lock members, respectively. Built and tested with lockd client/server operations. No functional change. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/lockd.h | 2 +- {include/linux => fs}/lockd/xdr.h | 8 +++----- fs/nfs/nfs3proc.c | 1 + fs/nfs/proc.c | 1 + include/linux/lockd/bind.h | 5 ++--- 5 files changed, 8 insertions(+), 9 deletions(-) rename {include/linux => fs}/lockd/xdr.h (96%) diff --git a/fs/lockd/lockd.h b/fs/lockd/lockd.h index 460ccb701749..6f83b9a7257f 100644 --- a/fs/lockd/lockd.h +++ b/fs/lockd/lockd.h @@ -15,7 +15,7 @@ #include #include #include -#include +#include "xdr.h" #include #include diff --git a/include/linux/lockd/xdr.h b/fs/lockd/xdr.h similarity index 96% rename from include/linux/lockd/xdr.h rename to fs/lockd/xdr.h index 292e4e38d17d..af821ecf2a4e 100644 --- a/include/linux/lockd/xdr.h +++ b/fs/lockd/xdr.h @@ -1,14 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * linux/include/linux/lockd/xdr.h - * * XDR types for the NLM protocol * * Copyright (C) 1996 Olaf Kirch */ -#ifndef LOCKD_XDR_H -#define LOCKD_XDR_H +#ifndef _LOCKD_XDR_H +#define _LOCKD_XDR_H #include #include @@ -110,4 +108,4 @@ bool nlmsvc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); bool nlmsvc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); bool nlmsvc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr); -#endif /* LOCKD_XDR_H */ +#endif /* _LOCKD_XDR_H */ diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index be2aebf62056..95d7cd564b74 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 8c3d2efa2636..70795684b8e8 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include "internal.h" diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 077da0696f12..ba9258c96bfd 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -11,10 +11,9 @@ #define LINUX_LOCKD_BIND_H 
#include -/* need xdr-encoded error codes too, so... */ -#include -/* Dummy declarations */ +struct file_lock; +struct nfs_fh; struct svc_rqst; struct rpc_task; struct rpc_clnt; From 5829352e568d24dd04ae112128a4f44748d073bc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:33 -0500 Subject: [PATCH 19/83] lockd: Make linux/lockd/nlm.h an internal header The NLM protocol constants and status codes in nlm.h are needed only by lockd's internal implementation. NFS client code and NFSD interact with lockd through the stable API in bind.h and have no direct use for protocol-level definitions. Exposing these definitions globally via bind.h creates unnecessary coupling between lockd internals and its consumers. Moving nlm.h from include/linux/lockd/ to fs/lockd/ clarifies the API boundary: bind.h provides the lockd service interface, while nlm.h remains available only to code within fs/lockd/ that implements the protocol. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/lockd.h | 1 + {include/linux => fs}/lockd/nlm.h | 8 +++----- fs/lockd/svclock.c | 1 - include/linux/lockd/bind.h | 2 -- 4 files changed, 4 insertions(+), 8 deletions(-) rename {include/linux => fs}/lockd/nlm.h (91%) diff --git a/fs/lockd/lockd.h b/fs/lockd/lockd.h index 6f83b9a7257f..e73c6b348154 100644 --- a/fs/lockd/lockd.h +++ b/fs/lockd/lockd.h @@ -14,6 +14,7 @@ #include #include #include +#include "nlm.h" #include #include "xdr.h" #include diff --git a/include/linux/lockd/nlm.h b/fs/lockd/nlm.h similarity index 91% rename from include/linux/lockd/nlm.h rename to fs/lockd/nlm.h index 6e343ef760dc..47be65d0111f 100644 --- a/include/linux/lockd/nlm.h +++ b/fs/lockd/nlm.h @@ -1,14 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * linux/include/linux/lockd/nlm.h - * * Declarations for the Network Lock Manager protocol. 
* * Copyright (C) 1996, Olaf Kirch */ -#ifndef LINUX_LOCKD_NLM_H -#define LINUX_LOCKD_NLM_H +#ifndef _LOCKD_NLM_H +#define _LOCKD_NLM_H /* Maximum file offset in file_lock.fl_end */ @@ -55,4 +53,4 @@ enum { #define NLMPROC_NM_LOCK 22 #define NLMPROC_FREE_ALL 23 -#endif /* LINUX_LOCKD_NLM_H */ +#endif /* _LOCKD_NLM_H */ diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 1c800fffe69c..e687103e42d1 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -28,7 +28,6 @@ #include #include #include -#include #include "lockd.h" diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index ba9258c96bfd..b614e0deea72 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -10,8 +10,6 @@ #ifndef LINUX_LOCKD_BIND_H #define LINUX_LOCKD_BIND_H -#include - struct file_lock; struct nfs_fh; struct svc_rqst; From b3f76a9b13f014edcba9eaae2bc09a6af7267cee Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:34 -0500 Subject: [PATCH 20/83] lockd: Move nlm4svc_set_file_lock_range() Both client-side and server-side NLMv4 code convert lock byte ranges from the wire format (start, length) to the kernel's file_lock format (start, end). The current nlm4svc_set_file_lock_range() performs this conversion, but the "svc" prefix incorrectly suggests server-only use, and client code must include server-internal headers to access it. Rename to lockd_set_file_lock_range4() and relocate to the shared lockd.h header, making it accessible to both client and server code. This eliminates the need for client code to include xdr4.h, reducing coupling between the XDR implementation files. While relocating the function, add input validation: clamp the starting offset to OFFSET_MAX before use. Without this, a malformed lock request with off > OFFSET_MAX results in fl_start > fl_end, violating file_lock invariants and potentially causing incorrect lock conflict detection. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/clnt4xdr.c | 2 +- fs/lockd/lockd.h | 25 +++++++++++++++++++++++++ fs/lockd/xdr4.c | 13 +------------ fs/lockd/xdr4.h | 1 - 4 files changed, 27 insertions(+), 14 deletions(-) diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c index 61ee5fa6dfa4..c09e67765cac 100644 --- a/fs/lockd/clnt4xdr.c +++ b/fs/lockd/clnt4xdr.c @@ -287,7 +287,7 @@ static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result) fl->c.flc_type = exclusive != 0 ? F_WRLCK : F_RDLCK; p = xdr_decode_hyper(p, &l_offset); xdr_decode_hyper(p, &l_len); - nlm4svc_set_file_lock_range(fl, l_offset, l_len); + lockd_set_file_lock_range4(fl, l_offset, l_len); error = 0; out: return error; diff --git a/fs/lockd/lockd.h b/fs/lockd/lockd.h index e73c6b348154..ef6431b4cac0 100644 --- a/fs/lockd/lockd.h +++ b/fs/lockd/lockd.h @@ -413,6 +413,31 @@ static inline int nlm_compare_locks(const struct file_lock *fl1, &&(fl1->c.flc_type == fl2->c.flc_type || fl2->c.flc_type == F_UNLCK); } +/** + * lockd_set_file_lock_range4 - set the byte range of a file_lock + * @fl: file_lock whose length fields are to be initialized + * @off: starting offset of the lock, in bytes + * @len: length of the byte range, in bytes, or zero + * + * The NLMv4 protocol represents lock byte ranges as (start, length), + * where length zero means "lock to end of file." The kernel's file_lock + * structure uses (start, end) representation. Convert from NLMv4 format + * to file_lock format, clamping the starting offset and treating + * arithmetic overflow as "lock to EOF." + */ +static inline void +lockd_set_file_lock_range4(struct file_lock *fl, u64 off, u64 len) +{ + u64 clamped_off = (off > OFFSET_MAX) ? 
OFFSET_MAX : off; + s64 end = clamped_off + len - 1; + + fl->fl_start = clamped_off; + if (len == 0 || end < 0) + fl->fl_end = OFFSET_MAX; + else + fl->fl_end = end; +} + extern const struct lock_manager_operations nlmsvc_lock_operations; #endif /* _LOCKD_LOCKD_H */ diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index f57d4881d5f1..dbbb2dfcb81b 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -34,17 +34,6 @@ loff_t_to_s64(loff_t offset) return res; } -void nlm4svc_set_file_lock_range(struct file_lock *fl, u64 off, u64 len) -{ - s64 end = off + len - 1; - - fl->fl_start = off; - if (len == 0 || end < 0) - fl->fl_end = OFFSET_MAX; - else - fl->fl_end = end; -} - /* * NLM file handles are defined by specification to be a variable-length * XDR opaque no longer than 1024 bytes. However, this implementation @@ -91,7 +80,7 @@ svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock) locks_init_lock(fl); fl->c.flc_type = F_RDLCK; - nlm4svc_set_file_lock_range(fl, lock->lock_start, lock->lock_len); + lockd_set_file_lock_range4(fl, lock->lock_start, lock->lock_len); return true; } diff --git a/fs/lockd/xdr4.h b/fs/lockd/xdr4.h index 7be318c0512b..4ddf51a2e0ea 100644 --- a/fs/lockd/xdr4.h +++ b/fs/lockd/xdr4.h @@ -15,7 +15,6 @@ #define nlm4_fbig cpu_to_be32(NLM_FBIG) #define nlm4_failed cpu_to_be32(NLM_FAILED) -void nlm4svc_set_file_lock_range(struct file_lock *fl, u64 off, u64 len); bool nlm4svc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); bool nlm4svc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); bool nlm4svc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); From 45cd458b57feeec639af3d7da05ce8c290d0179b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:35 -0500 Subject: [PATCH 21/83] lockd: Relocate svc_version definitions to XDR layer Public RPC server interfaces become cluttered when internal XDR implementation details leak into them. 
The procedure count, maximum XDR buffer size, and per-CPU call counters serve no purpose outside the code that encodes and decodes NLM protocol messages. Exposing these values through global headers creates unnecessary coupling between the RPC dispatch logic and the XDR layer. Relocating the svc_version structure definitions confines this implementation information to the files where XDR encoding and decoding occur. In svc.c, the buffer size computation now reads vs_xdrsize from the version structures rather than relying on a preprocessor constant. This calculation occurs at service initialization, after the linker has resolved the version structure definitions. The dispatch function becomes non-static because both the version structures and the dispatcher reside in different translation units. The NLMSVC_XDRSIZE macro is removed from xdr.h because buffer size is now computed from the union of XDR argument and result structures, matching the pattern used in other RPC services. Version 1 and 3 share the same procedure table but maintain separate counter arrays. Version 4 remains separate due to its distinct procedure definitions. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/lockd.h | 6 ++++-- fs/lockd/svc.c | 48 +++++++++++---------------------------------- fs/lockd/svc4proc.c | 23 +++++++++++++++++++++- fs/lockd/svcproc.c | 38 ++++++++++++++++++++++++++++++++++- fs/lockd/xdr.h | 5 ----- 5 files changed, 74 insertions(+), 46 deletions(-) diff --git a/fs/lockd/lockd.h b/fs/lockd/lockd.h index ef6431b4cac0..ad4c6701b64a 100644 --- a/fs/lockd/lockd.h +++ b/fs/lockd/lockd.h @@ -221,9 +221,10 @@ struct nlm_block { * Global variables */ extern const struct rpc_program nlm_program; -extern const struct svc_procedure nlmsvc_procedures[24]; +extern const struct svc_version nlmsvc_version1; +extern const struct svc_version nlmsvc_version3; #ifdef CONFIG_LOCKD_V4 -extern const struct svc_procedure nlmsvc_procedures4[24]; +extern const struct svc_version nlmsvc_version4; #endif extern int nlmsvc_grace_period; extern unsigned long nlm_timeout; @@ -318,6 +319,7 @@ void nlmsvc_traverse_blocks(struct nlm_host *, struct nlm_file *, void nlmsvc_grant_reply(struct nlm_cookie *, __be32); void nlmsvc_release_call(struct nlm_rqst *); void nlmsvc_locks_init_private(struct file_lock *, struct nlm_host *, pid_t); +int nlmsvc_dispatch(struct svc_rqst *rqstp); /* * File handling for the server personality diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 9dd7f8e11544..490551369ef2 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -44,7 +44,6 @@ #include "netlink.h" #define NLMDBG_FACILITY NLMDBG_SVC -#define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE) static struct svc_program nlmsvc_program; @@ -319,6 +318,7 @@ static struct notifier_block lockd_inet6addr_notifier = { static int lockd_get(void) { struct svc_serv *serv; + unsigned int bufsize; int error; if (nlmsvc_serv) { @@ -334,7 +334,15 @@ static int lockd_get(void) printk(KERN_WARNING "lockd_up: no pid, %d users??\n", nlmsvc_users); - serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, lockd); +#ifdef CONFIG_LOCKD_V4 + bufsize = 1024 + 
max3(nlmsvc_version1.vs_xdrsize, + nlmsvc_version3.vs_xdrsize, + nlmsvc_version4.vs_xdrsize); +#else + bufsize = 1024 + max(nlmsvc_version1.vs_xdrsize, + nlmsvc_version3.vs_xdrsize); +#endif + serv = svc_create(&nlmsvc_program, bufsize, lockd); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); return -ENOMEM; @@ -640,7 +648,7 @@ module_exit(exit_nlm); * %0: Processing complete; do not send a Reply * %1: Processing complete; send Reply in rqstp->rq_res */ -static int nlmsvc_dispatch(struct svc_rqst *rqstp) +int nlmsvc_dispatch(struct svc_rqst *rqstp) { const struct svc_procedure *procp = rqstp->rq_procinfo; __be32 *statp = rqstp->rq_accept_statp; @@ -671,40 +679,6 @@ static int nlmsvc_dispatch(struct svc_rqst *rqstp) /* * Define NLM program and procedures */ -static DEFINE_PER_CPU_ALIGNED(unsigned long, nlmsvc_version1_count[17]); -static const struct svc_version nlmsvc_version1 = { - .vs_vers = 1, - .vs_nproc = 17, - .vs_proc = nlmsvc_procedures, - .vs_count = nlmsvc_version1_count, - .vs_dispatch = nlmsvc_dispatch, - .vs_xdrsize = NLMSVC_XDRSIZE, -}; - -static DEFINE_PER_CPU_ALIGNED(unsigned long, - nlmsvc_version3_count[ARRAY_SIZE(nlmsvc_procedures)]); -static const struct svc_version nlmsvc_version3 = { - .vs_vers = 3, - .vs_nproc = ARRAY_SIZE(nlmsvc_procedures), - .vs_proc = nlmsvc_procedures, - .vs_count = nlmsvc_version3_count, - .vs_dispatch = nlmsvc_dispatch, - .vs_xdrsize = NLMSVC_XDRSIZE, -}; - -#ifdef CONFIG_LOCKD_V4 -static DEFINE_PER_CPU_ALIGNED(unsigned long, - nlmsvc_version4_count[ARRAY_SIZE(nlmsvc_procedures4)]); -static const struct svc_version nlmsvc_version4 = { - .vs_vers = 4, - .vs_nproc = ARRAY_SIZE(nlmsvc_procedures4), - .vs_proc = nlmsvc_procedures4, - .vs_count = nlmsvc_version4_count, - .vs_dispatch = nlmsvc_dispatch, - .vs_xdrsize = NLMSVC_XDRSIZE, -}; -#endif - static const struct svc_version *nlmsvc_version[] = { [1] = &nlmsvc_version1, [3] = &nlmsvc_version3, diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c 
index 86dfeb6ce68d..c99f192bce77 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -530,7 +530,7 @@ struct nlm_void { int dummy; }; #define St 1 /* status */ #define Rg 4 /* range (offset + length) */ -const struct svc_procedure nlmsvc_procedures4[24] = { +static const struct svc_procedure nlm4svc_procedures[24] = { [NLMPROC_NULL] = { .pc_func = nlm4svc_proc_null, .pc_decode = nlm4svc_decode_void, @@ -772,3 +772,24 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_name = "FREE_ALL", }, }; + +/* + * Storage requirements for XDR arguments and results + */ +union nlm4svc_xdrstore { + struct nlm_args args; + struct nlm_res res; + struct nlm_reboot reboot; +}; + +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nlm4svc_call_counters[ARRAY_SIZE(nlm4svc_procedures)]); + +const struct svc_version nlmsvc_version4 = { + .vs_vers = 4, + .vs_nproc = ARRAY_SIZE(nlm4svc_procedures), + .vs_proc = nlm4svc_procedures, + .vs_count = nlm4svc_call_counters, + .vs_dispatch = nlmsvc_dispatch, + .vs_xdrsize = sizeof(union nlm4svc_xdrstore), +}; diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index e9a6bcc3bf2e..75b0dfa1a79a 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -555,7 +555,7 @@ struct nlm_void { int dummy; }; #define No (1+1024/4) /* Net Obj */ #define Rg 2 /* range - offset + size */ -const struct svc_procedure nlmsvc_procedures[24] = { +static const struct svc_procedure nlmsvc_procedures[24] = { [NLMPROC_NULL] = { .pc_func = nlmsvc_proc_null, .pc_decode = nlmsvc_decode_void, @@ -797,3 +797,39 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_name = "FREE_ALL", }, }; + +/* + * Storage requirements for XDR arguments and results + */ +union nlmsvc_xdrstore { + struct nlm_args args; + struct nlm_res res; + struct nlm_reboot reboot; +}; + +/* + * NLMv1 defines only procedures 1 - 15. Linux lockd also implements + * procedures 0 (NULL) and 16 (SM_NOTIFY). 
+ */ +static DEFINE_PER_CPU_ALIGNED(unsigned long, nlm1svc_call_counters[17]); + +const struct svc_version nlmsvc_version1 = { + .vs_vers = 1, + .vs_nproc = 17, + .vs_proc = nlmsvc_procedures, + .vs_count = nlm1svc_call_counters, + .vs_dispatch = nlmsvc_dispatch, + .vs_xdrsize = sizeof(union nlmsvc_xdrstore), +}; + +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nlm3svc_call_counters[ARRAY_SIZE(nlmsvc_procedures)]); + +const struct svc_version nlmsvc_version3 = { + .vs_vers = 3, + .vs_nproc = ARRAY_SIZE(nlmsvc_procedures), + .vs_proc = nlmsvc_procedures, + .vs_count = nlm3svc_call_counters, + .vs_dispatch = nlmsvc_dispatch, + .vs_xdrsize = sizeof(union nlmsvc_xdrstore), +}; diff --git a/fs/lockd/xdr.h b/fs/lockd/xdr.h index af821ecf2a4e..3c60817c4349 100644 --- a/fs/lockd/xdr.h +++ b/fs/lockd/xdr.h @@ -88,11 +88,6 @@ struct nlm_reboot { struct nsm_private priv; }; -/* - * Contents of statd callback when monitored host rebooted - */ -#define NLMSVC_XDRSIZE sizeof(struct nlm_args) - bool nlmsvc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); bool nlmsvc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); bool nlmsvc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); From f83c8dda456ce4863f346aa26d88efa276eda35d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 4 Feb 2026 21:21:49 +0100 Subject: [PATCH 22/83] nfs/blocklayout: Fix compilation error (`make W=1`) in bl_write_pagelist() Clang compiler is not happy about set but unused variable (when dprintk() is no-op): .../blocklayout/blocklayout.c:384:9: error: variable 'count' set but not used [-Werror,-Wunused-but-set-variable] Remove a leftover from the previous cleanup. 
Fixes: 3a6fd1f004fc ("pnfs/blocklayout: remove read-modify-write handling in bl_write_pagelist") Acked-by: Anna Schumaker Reviewed-by: Jeff Layton Signed-off-by: Andy Shevchenko Signed-off-by: Chuck Lever --- fs/nfs/blocklayout/blocklayout.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index cb0a645aeb50..94e85ad9067e 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -381,14 +381,13 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync) sector_t isect, extent_length = 0; struct parallel_io *par = NULL; loff_t offset = header->args.offset; - size_t count = header->args.count; struct page **pages = header->args.pages; int pg_index = header->args.pgbase >> PAGE_SHIFT; unsigned int pg_len; struct blk_plug plug; int i; - dprintk("%s enter, %zu@%lld\n", __func__, count, offset); + dprintk("%s enter, %u@%lld\n", __func__, header->args.count, offset); /* At this point, header->page_aray is a (sequential) list of nfs_pages. * We want to write each, and if there is an error set pnfs_error @@ -429,7 +428,6 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync) } offset += pg_len; - count -= pg_len; isect += (pg_len >> SECTOR_SHIFT); extent_length -= (pg_len >> SECTOR_SHIFT); } From adcc59114ccd402259c089b0fea24da5e4974563 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 4 Feb 2026 21:21:50 +0100 Subject: [PATCH 23/83] sunrpc: Kill RPC_IFDEBUG() RPC_IFDEBUG() is used in only two places. In one the user of the definition is guarded by ifdeffery, in the second one it's implied due to dprintk() usage. Kill the macro and move the ifdeffery to the regular condition with the variable defined inside, while in the second case add the same conditional and move the respective code there. 
Reviewed-by: Jeff Layton Signed-off-by: Andy Shevchenko Signed-off-by: Chuck Lever --- fs/nfsd/nfsfh.c | 9 +++++--- include/linux/sunrpc/debug.h | 2 -- net/sunrpc/xprtrdma/svc_rdma_transport.c | 27 ++++++++++++------------ 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index ed85dd43da18..68b629fbaaeb 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -105,9 +105,12 @@ static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, { /* Check if the request originated from a secure port. */ if (rqstp && !nfsd_originating_port_ok(rqstp, cred, exp)) { - RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); - dprintk("nfsd: request from insecure port %s!\n", - svc_print_addr(rqstp, buf, sizeof(buf))); + if (IS_ENABLED(CONFIG_SUNRPC_DEBUG)) { + char buf[RPC_MAX_ADDRBUFLEN]; + + dprintk("nfsd: request from insecure port %s!\n", + svc_print_addr(rqstp, buf, sizeof(buf))); + } return nfserr_perm; } diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index eb4bd62df319..93d1a11ffbfb 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -49,12 +49,10 @@ do { \ } \ } while (0) -# define RPC_IFDEBUG(x) x #else # define ifdebug(fac) if (0) # define dfprintk(fac, fmt, ...) do {} while (0) # define dfprintk_rcu(fac, fmt, ...) 
do {} while (0) -# define RPC_IFDEBUG(x) #endif /* diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 9b623849723e..f2d72181a6fe 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -414,7 +414,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct ib_qp_init_attr qp_attr; struct ib_device *dev; int ret = 0; - RPC_IFDEBUG(struct sockaddr *sap); listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); clear_bit(XPT_CONN, &xprt->xpt_flags); @@ -560,18 +559,20 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) goto errout; } -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - dprintk("svcrdma: new connection accepted on device %s:\n", dev->name); - sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; - dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap)); - sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; - dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap)); - dprintk(" max_sge : %d\n", newxprt->sc_max_send_sges); - dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth); - dprintk(" rdma_rw_ctxs : %d\n", ctxts); - dprintk(" max_requests : %d\n", newxprt->sc_max_requests); - dprintk(" ord : %d\n", conn_param.initiator_depth); -#endif + if (IS_ENABLED(CONFIG_SUNRPC_DEBUG)) { + struct sockaddr *sap; + + dprintk("svcrdma: new connection accepted on device %s:\n", dev->name); + sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; + dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap)); + sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; + dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap)); + dprintk(" max_sge : %d\n", newxprt->sc_max_send_sges); + dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth); + dprintk(" rdma_rw_ctxs : %d\n", ctxts); + dprintk(" max_requests : %d\n", newxprt->sc_max_requests); + dprintk(" ord : %d\n", 
conn_param.initiator_depth); + } return &newxprt->sc_xprt; From 6f57293abb8d087de830dd3f02e66d94b3e59973 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 4 Feb 2026 21:21:51 +0100 Subject: [PATCH 24/83] sunrpc: Fix compilation error (`make W=1`) when dprintk() is no-op Clang compiler is not happy about set but unused variables: .../flexfilelayout/flexfilelayoutdev.c:56:9: error: variable 'ret' set but not used [-Werror,-Wunused-but-set-variable] .../flexfilelayout/flexfilelayout.c:1505:6: error: variable 'err' set but not used [-Werror,-Wunused-but-set-variable] .../nfs4proc.c:9244:12: error: variable 'ptr' set but not used [-Werror,-Wunused-but-set-variable] Fix these by forwarding parameters of dprintk() to no_printk(). The positive side-effect is a format-string checker enabled even for the cases when dprintk() is no-op. Fixes: d67ae825a59d ("pnfs/flexfiles: Add the FlexFile Layout Driver") Fixes: fc931582c260 ("nfs41: create_session operation") Acked-by: Geert Uytterhoeven Reviewed-by: Jeff Layton Signed-off-by: Andy Shevchenko Signed-off-by: Chuck Lever --- fs/lockd/svclock.c | 5 +++++ include/linux/sunrpc/debug.h | 8 ++++++-- include/linux/sunrpc/sched.h | 3 --- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index e687103e42d1..ee23f5802af1 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -74,6 +74,11 @@ static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) return buf; } +#else +static inline const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) +{ + return "???"; +} #endif /* diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index 93d1a11ffbfb..ab61bed2f7af 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -38,6 +38,8 @@ extern unsigned int nlm_debug; do { \ ifdebug(fac) \ __sunrpc_printk(fmt, ##__VA_ARGS__); \ + else \ + no_printk(fmt, ##__VA_ARGS__); \ } while (0) # define dfprintk_rcu(fac, fmt, ...) 
\ @@ -46,13 +48,15 @@ do { \ rcu_read_lock(); \ __sunrpc_printk(fmt, ##__VA_ARGS__); \ rcu_read_unlock(); \ + } else { \ + no_printk(fmt, ##__VA_ARGS__); \ } \ } while (0) #else # define ifdebug(fac) if (0) -# define dfprintk(fac, fmt, ...) do {} while (0) -# define dfprintk_rcu(fac, fmt, ...) do {} while (0) +# define dfprintk(fac, fmt, ...) no_printk(fmt, ##__VA_ARGS__) +# define dfprintk_rcu(fac, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif /* diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index ccba79ebf893..0dbdf3722537 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -95,10 +95,7 @@ struct rpc_task { int tk_rpc_status; /* Result of last RPC operation */ unsigned short tk_flags; /* misc flags */ unsigned short tk_timeouts; /* maj timeouts */ - -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || IS_ENABLED(CONFIG_TRACEPOINTS) unsigned short tk_pid; /* debugging aid */ -#endif unsigned char tk_priority : 2,/* Task priority */ tk_garb_retry : 2, tk_cred_retry : 2; From b48f44f36e6607b2f818560f19deb86b4a9c717b Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Wed, 4 Feb 2026 13:07:43 -0800 Subject: [PATCH 25/83] NFSD: fix nfs4_file access extra count in nfsd4_add_rdaccess_to_wrdeleg In nfsd4_add_rdaccess_to_wrdeleg, if fp->fi_fds[O_RDONLY] is already set by another thread, __nfs4_file_get_access should not be called to increment the nfs4_file access count since that was already done by the thread that added READ access to the file. The extra fi_access count in nfs4_file can prevent the corresponding nfsd_file from being freed. When stopping nfs-server service, these extra access counts trigger a BUG in kmem_cache_destroy() that shows nfsd_file object remaining on __kmem_cache_shutdown. This problem can be reproduced by running the Git project's test suite over NFS. 
Fixes: 8072e34e1387 ("nfsd: fix nfsd_file reference leak in nfsd4_add_rdaccess_to_wrdeleg()") Signed-off-by: Dai Ngo Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a767b562f991..1b4c101ff04b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6266,12 +6266,12 @@ nfsd4_add_rdaccess_to_wrdeleg(struct svc_rqst *rqstp, struct nfsd4_open *open, return (false); fp = stp->st_stid.sc_file; spin_lock(&fp->fi_lock); - __nfs4_file_get_access(fp, NFS4_SHARE_ACCESS_READ); if (!fp->fi_fds[O_RDONLY]) { + __nfs4_file_get_access(fp, NFS4_SHARE_ACCESS_READ); fp->fi_fds[O_RDONLY] = nf; + fp->fi_rdeleg_file = nfsd_file_get(fp->fi_fds[O_RDONLY]); nf = NULL; } - fp->fi_rdeleg_file = nfsd_file_get(fp->fi_fds[O_RDONLY]); spin_unlock(&fp->fi_lock); if (nf) nfsd_file_put(nf); From f52792f484ba2316853736856dde19b7e7458861 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Fri, 13 Feb 2026 10:36:30 -0800 Subject: [PATCH 26/83] NFSD: Enforce timeout on layout recall and integrate lease manager fencing When a layout conflict triggers a recall, enforcing a timeout is necessary to prevent excessive nfsd threads from being blocked in __break_lease ensuring the server continues servicing incoming requests efficiently. This patch introduces a new function to lease_manager_operations: lm_breaker_timedout: Invoked when a lease recall times out and is about to be disposed of. This function enables the lease manager to inform the caller whether the file_lease should remain on the flc_list or be disposed of. For the NFSD lease manager, this function now handles layout recall timeouts. If the layout type supports fencing and the client has not been fenced, a fence operation is triggered to prevent the client from accessing the block device. 
While the fencing operation is in progress, the conflicting file_lease remains on the flc_list until fencing is complete. This guarantees that no other clients can access the file, and the client with exclusive access is properly blocked before disposal. Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever --- .../admin-guide/nfs/pnfs-block-server.rst | 30 ++++ .../admin-guide/nfs/pnfs-scsi-server.rst | 31 ++++ Documentation/filesystems/locking.rst | 2 + fs/locks.c | 26 ++- fs/nfsd/blocklayout.c | 42 ++++- fs/nfsd/nfs4layouts.c | 152 +++++++++++++++++- fs/nfsd/nfs4state.c | 1 + fs/nfsd/pnfs.h | 5 +- fs/nfsd/state.h | 6 + include/linux/filelock.h | 1 + 10 files changed, 279 insertions(+), 17 deletions(-) diff --git a/Documentation/admin-guide/nfs/pnfs-block-server.rst b/Documentation/admin-guide/nfs/pnfs-block-server.rst index 20fe9f5117fe..b4f5997009af 100644 --- a/Documentation/admin-guide/nfs/pnfs-block-server.rst +++ b/Documentation/admin-guide/nfs/pnfs-block-server.rst @@ -40,3 +40,33 @@ how to translate the device into a serial number from SCSI EVPD 0x80:: echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log EOF + +If the nfsd server needs to fence a non-responding client and the +fencing operation fails, the server logs a warning message in the +system log with the following format: + + FENCE failed client[IP_address] clid[#n] device[dev_name] + + Where: + + IP_address: refers to the IP address of the affected client. + #n: indicates the unique client identifier. + dev_name: specifies the name of the block device related + to the fencing attempt. + +The server will repeatedly retry the operation indefinitely. During +this time, access to the affected file is restricted for all other +clients. This is to prevent potential data corruption if multiple +clients access the same file simultaneously. + +To restore access to the affected file for other clients, the admin +needs to take the following actions: + + . 
shutdown or power off the client being fenced. + . manually expire the client to release all its state on the server: + + echo 'expire' > /proc/fs/nfsd/clients/clid/ctl. + + Where: + + clid: is the unique client identifier displayed in the system log. diff --git a/Documentation/admin-guide/nfs/pnfs-scsi-server.rst b/Documentation/admin-guide/nfs/pnfs-scsi-server.rst index b2eec2288329..db34afbf67a9 100644 --- a/Documentation/admin-guide/nfs/pnfs-scsi-server.rst +++ b/Documentation/admin-guide/nfs/pnfs-scsi-server.rst @@ -22,3 +22,34 @@ option and the underlying SCSI device support persistent reservations. On the client make sure the kernel has the CONFIG_PNFS_BLOCK option enabled, and the file system is mounted using the NFSv4.1 protocol version (mount -o vers=4.1). + +If the nfsd server needs to fence a non-responding client and the +fencing operation fails, the server logs a warning message in the +system log with the following format: + + FENCE failed client[IP_address] clid[#n] device[dev_name] + + Where: + + IP_address: refers to the IP address of the affected client. + #n: indicates the unique client identifier. + dev_name: specifies the name of the block device related + to the fencing attempt. + +The server will repeatedly retry the operation indefinitely. During +this time, access to the affected file is restricted for all other +clients. This is to prevent potential data corruption if multiple +clients access the same file simultaneously. + +To restore access to the affected file for other clients, the admin +needs to take the following actions: + + . shutdown or power off the client being fenced. + . manually expire the client to release all its state on the server: + + echo 'expire' > /proc/fs/nfsd/clients/clid/ctl. + + Where: + + clid: is the unique client identifier displayed in the system log.
+ diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 8025df6e6499..8421ea21bd35 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -398,6 +398,7 @@ prototypes:: bool (*lm_breaker_owns_lease)(struct file_lock *); bool (*lm_lock_expirable)(struct file_lock *); void (*lm_expire_lock)(void); + bool (*lm_breaker_timedout)(struct file_lease *); locking rules: @@ -412,6 +413,7 @@ lm_breaker_owns_lease: yes no no lm_lock_expirable yes no no lm_expire_lock no no yes lm_open_conflict yes no no +lm_breaker_timedout yes no no ====================== ============= ================= ========= buffer_head diff --git a/fs/locks.c b/fs/locks.c index d13ec930b7bb..8e44b1f6c15a 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1534,6 +1534,7 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) { struct file_lock_context *ctx = inode->i_flctx; struct file_lease *fl, *tmp; + bool remove; lockdep_assert_held(&ctx->flc_lock); @@ -1541,8 +1542,19 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) trace_time_out_leases(inode, fl); if (past_time(fl->fl_downgrade_time)) lease_modify(fl, F_RDLCK, dispose); - if (past_time(fl->fl_break_time)) - lease_modify(fl, F_UNLCK, dispose); + + remove = true; + if (past_time(fl->fl_break_time)) { + /* + * Consult the lease manager when a lease break times + * out to determine whether the lease should be disposed + * of. 
+ */ + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) + remove = fl->fl_lmops->lm_breaker_timedout(fl); + if (remove) + lease_modify(fl, F_UNLCK, dispose); + } } } @@ -1670,9 +1682,13 @@ int __break_lease(struct inode *inode, unsigned int flags) restart: fl = list_first_entry(&ctx->flc_lease, struct file_lease, c.flc_list); break_time = fl->fl_break_time; - if (break_time != 0) - break_time -= jiffies; - if (break_time == 0) + if (break_time != 0) { + if (time_after(jiffies, break_time)) { + fl->fl_break_time = jiffies + lease_break_time * HZ; + break_time = lease_break_time * HZ; + } else + break_time -= jiffies; + } else break_time++; locks_insert_block(&fl->c, &new_fl->c, leases_conflict); trace_break_lease_block(inode, new_fl); diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 8b987fca1e60..9d829c84f374 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -297,6 +297,7 @@ static inline int nfsd4_scsi_fence_insert(struct nfs4_client *clp, ret = 0; } xa_unlock(xa); + clp->cl_fence_retry_warn = false; return ret; } @@ -443,15 +444,33 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); } -static void +/* + * Perform the fence operation to prevent the client from accessing the + * block device. If a fence operation is already in progress, wait for + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set, + * update the layout stateid by setting the ls_fenced flag to indicate + * that the client has been fenced. + * + * The cl_fence_mutex ensures that the fence operation has been fully + * completed, rather than just in progress, when returning from this + * function. + * + * Return true if client was fenced otherwise return false. 
+ */ +static bool nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) { struct nfs4_client *clp = ls->ls_stid.sc_client; struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; int status; + bool ret; - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) - return; + mutex_lock(&clp->cl_fence_mutex); + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) { + mutex_unlock(&clp->cl_fence_mutex); + return true; + } status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, nfsd4_scsi_pr_key(clp), @@ -470,13 +489,22 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) * PR_STS_RESERVATION_CONFLICT, which would cause an infinite * retry loop. */ - if (status < 0 || - status == PR_STS_PATH_FAILED || - status == PR_STS_PATH_FAST_FAILED || - status == PR_STS_RETRY_PATH_FAILURE) + switch (status) { + case 0: + case PR_STS_IOERR: + case PR_STS_RESERVATION_CONFLICT: + ret = true; + break; + default: + /* retry-able and other errors */ + ret = false; nfsd4_scsi_fence_clear(clp, bdev->bd_dev); + break; + } + mutex_unlock(&clp->cl_fence_mutex); trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); + return ret; } const struct nfsd4_layout_ops scsi_layout_ops = { diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index ad7af8cfcf1f..69e41105efdd 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -27,6 +27,8 @@ static struct kmem_cache *nfs4_layout_stateid_cache; static const struct nfsd4_callback_ops nfsd4_cb_layout_ops; static const struct lease_manager_operations nfsd4_layouts_lm_ops; +static void nfsd4_layout_fence_worker(struct work_struct *work); + const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { #ifdef CONFIG_NFSD_FLEXFILELAYOUT [LAYOUT_FLEX_FILES] = &ff_layout_ops, @@ -177,6 +179,13 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid) trace_nfsd_layoutstate_free(&ls->ls_stid.sc_stateid); + spin_lock(&ls->ls_lock); + if 
(delayed_work_pending(&ls->ls_fence_work)) { + spin_unlock(&ls->ls_lock); + cancel_delayed_work_sync(&ls->ls_fence_work); + } else + spin_unlock(&ls->ls_lock); + spin_lock(&clp->cl_lock); list_del_init(&ls->ls_perclnt); spin_unlock(&clp->cl_lock); @@ -271,6 +280,10 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, list_add(&ls->ls_perfile, &fp->fi_lo_states); spin_unlock(&fp->fi_lock); + ls->ls_fenced = false; + ls->ls_fence_delay = 0; + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker); + trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); return ls; } @@ -747,11 +760,9 @@ static bool nfsd4_layout_lm_break(struct file_lease *fl) { /* - * We don't want the locks code to timeout the lease for us; - * we'll remove it ourself if a layout isn't returned - * in time: + * Enforce break lease timeout to prevent NFSD + * thread from hanging in __break_lease. */ - fl->fl_break_time = 0; nfsd4_recall_file_layout(fl->c.flc_owner); return false; } @@ -782,10 +793,143 @@ nfsd4_layout_lm_open_conflict(struct file *filp, int arg) return 0; } +static void +nfsd4_layout_fence_worker(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct nfs4_layout_stateid *ls = container_of(dwork, + struct nfs4_layout_stateid, ls_fence_work); + struct nfsd_file *nf; + struct block_device *bdev; + struct nfs4_client *clp; + struct nfsd_net *nn; + + /* + * The workqueue clears WORK_STRUCT_PENDING before invoking + * this callback. Re-arm immediately so that + * delayed_work_pending() returns true while the fence + * operation is in progress, preventing + * lm_breaker_timedout() from taking a duplicate reference. 
+ */ + mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); + + spin_lock(&ls->ls_lock); + if (list_empty(&ls->ls_layouts)) { + spin_unlock(&ls->ls_lock); +dispose: + cancel_delayed_work(&ls->ls_fence_work); + /* unlock the lease so that tasks waiting on it can proceed */ + nfsd4_close_layout(ls); + + ls->ls_fenced = true; + nfs4_put_stid(&ls->ls_stid); + return; + } + spin_unlock(&ls->ls_lock); + + rcu_read_lock(); + nf = nfsd_file_get(ls->ls_file); + rcu_read_unlock(); + if (!nf) + goto dispose; + + clp = ls->ls_stid.sc_client; + nn = net_generic(clp->net, nfsd_net_id); + bdev = nf->nf_file->f_path.mnt->mnt_sb->s_bdev; + if (nfsd4_layout_ops[ls->ls_layout_type]->fence_client(ls, nf)) { + /* fenced ok */ + nfsd_file_put(nf); + pr_warn("%s: FENCED client[%pISpc] clid[%d] to device[%s]\n", + __func__, (struct sockaddr *)&clp->cl_addr, + clp->cl_clientid.cl_id - nn->clientid_base, + bdev->bd_disk->disk_name); + goto dispose; + } + /* fence failed */ + nfsd_file_put(nf); + + if (!clp->cl_fence_retry_warn) { + pr_warn("%s: FENCE failed client[%pISpc] clid[%d] device[%s]\n", + __func__, (struct sockaddr *)&clp->cl_addr, + clp->cl_clientid.cl_id - nn->clientid_base, + bdev->bd_disk->disk_name); + clp->cl_fence_retry_warn = true; + } + /* + * The fence worker retries the fencing operation indefinitely to + * prevent data corruption. The admin needs to take the following + * actions to restore access to the file for other clients: + * + * . shutdown or power off the client being fenced. + * . manually expire the client to release all its state on the server; + * echo 'expire' > /proc/fs/nfsd/clients/clid/ctl'. + * + * Where: + * + * clid: is the unique client identifier displayed in + * the warning message above. 
+ */ + if (!ls->ls_fence_delay) + ls->ls_fence_delay = HZ; + else + ls->ls_fence_delay = min(ls->ls_fence_delay << 1, + MAX_FENCE_DELAY); + mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, ls->ls_fence_delay); +} + +/** + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out. + * @fl: file to check + * + * If the layout type supports a fence operation, schedule a worker to + * fence the client from accessing the block device. + * + * This function runs under the protection of the spin_lock flc_lock. + * At this time, the file_lease associated with the layout stateid is + * on the flc_list. A reference count is incremented on the layout + * stateid to prevent it from being freed while the fence worker is + * executing. Once the fence worker finishes its operation, it releases + * this reference. + * + * The fence worker continues to run until either the client has been + * fenced or the layout becomes invalid. The layout can become invalid + * as a result of a LAYOUTRETURN or when the CB_LAYOUT recall callback + * has completed. + * + * Return true if the file_lease should be disposed of by the caller; + * otherwise, return false. + */ +static bool +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl) +{ + struct nfs4_layout_stateid *ls = fl->c.flc_owner; + + if ((!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) || + ls->ls_fenced) + return true; + if (delayed_work_pending(&ls->ls_fence_work)) + return false; + /* + * Make sure layout has not been returned yet before + * taking a reference count on the layout stateid. 
+ */ + spin_lock(&ls->ls_lock); + if (list_empty(&ls->ls_layouts) || + !refcount_inc_not_zero(&ls->ls_stid.sc_count)) { + spin_unlock(&ls->ls_lock); + return true; + } + spin_unlock(&ls->ls_lock); + + mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); + return false; +} + static const struct lease_manager_operations nfsd4_layouts_lm_ops = { .lm_break = nfsd4_layout_lm_break, .lm_change = nfsd4_layout_lm_change, .lm_open_conflict = nfsd4_layout_lm_open_conflict, + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout, }; int diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1b4c101ff04b..1d31f2bb2162 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2386,6 +2386,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name, #endif #ifdef CONFIG_NFSD_SCSILAYOUT xa_init(&clp->cl_dev_fences); + mutex_init(&clp->cl_fence_mutex); #endif INIT_LIST_HEAD(&clp->async_copies); spin_lock_init(&clp->async_lock); diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h index db9af780438b..f7bee4dc5d3d 100644 --- a/fs/nfsd/pnfs.h +++ b/fs/nfsd/pnfs.h @@ -11,6 +11,9 @@ struct xdr_stream; +/* Cap exponential backoff between fence retries at 3 minutes */ +#define MAX_FENCE_DELAY ((unsigned int)(3 * 60 * HZ)) + struct nfsd4_deviceid_map { struct list_head hash; u64 idx; @@ -38,7 +41,7 @@ struct nfsd4_layout_ops { struct svc_rqst *rqstp, struct nfsd4_layoutcommit *lcp); - void (*fence_client)(struct nfs4_layout_stateid *ls, + bool (*fence_client)(struct nfs4_layout_stateid *ls, struct nfsd_file *file); }; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 99aeaab9cf2b..ec1c5467012e 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -456,6 +456,7 @@ struct nfs4_client { struct list_head cl_lru; /* tail queue */ #ifdef CONFIG_NFSD_PNFS struct list_head cl_lo_states; /* outstanding layout states */ + bool cl_fence_retry_warn; #endif struct xdr_netobj cl_name; /* id generated by client */ nfs4_verifier cl_verifier; /* generated by client */ @@ -529,6 +530,7 
@@ struct nfs4_client { time64_t cl_ra_time; #ifdef CONFIG_NFSD_SCSILAYOUT struct xarray cl_dev_fences; + struct mutex cl_fence_mutex; #endif }; @@ -745,6 +747,10 @@ struct nfs4_layout_stateid { stateid_t ls_recall_sid; bool ls_recalled; struct mutex ls_mutex; + + struct delayed_work ls_fence_work; + unsigned int ls_fence_delay; + bool ls_fenced; }; static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) diff --git a/include/linux/filelock.h b/include/linux/filelock.h index d2c9740e26a8..5f0a2fb31450 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -50,6 +50,7 @@ struct lease_manager_operations { void (*lm_setup)(struct file_lease *, void **); bool (*lm_breaker_owns_lease)(struct file_lease *); int (*lm_open_conflict)(struct file *, int); + bool (*lm_breaker_timedout)(struct file_lease *fl); }; struct lock_manager { From 5bc37b759ec0cdde2c652a2637d704f2d6306617 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:06:53 -0500 Subject: [PATCH 27/83] Documentation: Add the RPC language description of NLM version 4 In order to generate source code to encode and decode NLMv4 protocol elements, include a copy of the RPC language description of NLMv4 for xdrgen to process. The language description is an amalgam of RFC 1813 and the Open Group's XNFS specification: https://pubs.opengroup.org/onlinepubs/9629799/chap10.htm The C code committed here was generated from the new nlm4.x file using tools/net/sunrpc/xdrgen/xdrgen. The goals of replacing hand-written XDR functions with ones that are tool-generated are to improve memory safety and make XDR encoding and decoding less brittle to maintain. The xdrgen utility derives both the type definitions and the encode/decode functions directly from protocol specifications, using names and symbols familiar to anyone who knows those specs. 
Unlike hand-written code that can inadvertently diverge from the specification, xdrgen guarantees that the generated code matches the specification exactly. We would eventually like xdrgen to generate Rust code as well, making the conversion of the kernel's NFS stacks to use Rust just a little easier for us. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- Documentation/sunrpc/xdr/nlm4.x | 211 +++++++++ fs/lockd/Makefile | 30 +- fs/lockd/nlm4xdr_gen.c | 724 +++++++++++++++++++++++++++++ fs/lockd/nlm4xdr_gen.h | 32 ++ include/linux/sunrpc/xdrgen/nlm4.h | 233 ++++++++++ 5 files changed, 1229 insertions(+), 1 deletion(-) create mode 100644 Documentation/sunrpc/xdr/nlm4.x create mode 100644 fs/lockd/nlm4xdr_gen.c create mode 100644 fs/lockd/nlm4xdr_gen.h create mode 100644 include/linux/sunrpc/xdrgen/nlm4.h diff --git a/Documentation/sunrpc/xdr/nlm4.x b/Documentation/sunrpc/xdr/nlm4.x new file mode 100644 index 000000000000..0c44a80ef674 --- /dev/null +++ b/Documentation/sunrpc/xdr/nlm4.x @@ -0,0 +1,211 @@ +/* + * This file was extracted by hand from + * https://www.rfc-editor.org/rfc/rfc1813.html . + * + * Note that RFC 1813 is Informational. Its official date of + * publication (June 1995) is before the IETF required its RFCs to + * carry an explicit copyright or other IP ownership notices. + * + * Note also that RFC 1813 does not specify the whole NLM4 protocol. + * In particular, the argument and result types are not present in + * that document, and had to be reverse-engineered. 
+ */ + +/* + * The NLMv4 protocol + */ + +pragma header nlm4; + +/* + * The following definitions are missing in RFC 1813, + * but can be found in the OpenNetworking Network Lock + * Manager protocol: + * + * https://pubs.opengroup.org/onlinepubs/9629799/chap10.htm + */ + +const LM_MAXSTRLEN = 1024; + +const LM_MAXNAMELEN = 1025; + +const MAXNETOBJ_SZ = 1024; + +typedef opaque netobj; + +enum fsh4_mode { + fsm_DN = 0, /* deny none */ + fsm_DR = 1, /* deny read */ + fsm_DW = 2, /* deny write */ + fsm_DRW = 3 /* deny read/write */ +}; + +enum fsh4_access { + fsa_NONE = 0, /* for completeness */ + fsa_R = 1, /* read-only */ + fsa_W = 2, /* write-only */ + fsa_RW = 3 /* read/write */ +}; + +/* + * The following definitions come from the OpenNetworking + * Network Status Monitor protocol: + * + * https://pubs.opengroup.org/onlinepubs/9629799/chap11.htm + */ + +const SM_MAXSTRLEN = 1024; + +/* + * The NLM protocol as extracted from: + * https://tools.ietf.org/html/rfc1813 Appendix II + */ + +typedef unsigned hyper uint64; + +typedef hyper int64; + +typedef unsigned long uint32; + +typedef long int32; + +enum nlm4_stats { + NLM4_GRANTED = 0, + NLM4_DENIED = 1, + NLM4_DENIED_NOLOCKS = 2, + NLM4_BLOCKED = 3, + NLM4_DENIED_GRACE_PERIOD = 4, + NLM4_DEADLCK = 5, + NLM4_ROFS = 6, + NLM4_STALE_FH = 7, + NLM4_FBIG = 8, + NLM4_FAILED = 9 +}; + +pragma big_endian nlm4_stats; + +struct nlm4_holder { + bool exclusive; + int32 svid; + netobj oh; + uint64 l_offset; + uint64 l_len; +}; + +union nlm4_testrply switch (nlm4_stats stat) { + case NLM4_DENIED: + nlm4_holder holder; + default: + void; +}; + +struct nlm4_stat { + nlm4_stats stat; +}; + +struct nlm4_res { + netobj cookie; + nlm4_stat stat; +}; + +struct nlm4_testres { + netobj cookie; + nlm4_testrply stat; +}; + +struct nlm4_lock { + string caller_name; + netobj fh; + netobj oh; + int32 svid; + uint64 l_offset; + uint64 l_len; +}; + +struct nlm4_lockargs { + netobj cookie; + bool block; + bool exclusive; + nlm4_lock alock; + 
bool reclaim; + int32 state; +}; + +struct nlm4_cancargs { + netobj cookie; + bool block; + bool exclusive; + nlm4_lock alock; +}; + +struct nlm4_testargs { + netobj cookie; + bool exclusive; + nlm4_lock alock; +}; + +struct nlm4_unlockargs { + netobj cookie; + nlm4_lock alock; +}; + +struct nlm4_share { + string caller_name; + netobj fh; + netobj oh; + fsh4_mode mode; + fsh4_access access; +}; + +struct nlm4_shareargs { + netobj cookie; + nlm4_share share; + bool reclaim; +}; + +struct nlm4_shareres { + netobj cookie; + nlm4_stats stat; + int32 sequence; +}; + +struct nlm4_notify { + string name; + int32 state; +}; + +/* + * Argument for the Linux-private SM_NOTIFY procedure + */ +const SM_PRIV_SIZE = 16; + +struct nlm4_notifyargs { + nlm4_notify notify; + opaque private[SM_PRIV_SIZE]; +}; + +program NLM4_PROG { + version NLM4_VERS { + void NLMPROC4_NULL(void) = 0; + nlm4_testres NLMPROC4_TEST(nlm4_testargs) = 1; + nlm4_res NLMPROC4_LOCK(nlm4_lockargs) = 2; + nlm4_res NLMPROC4_CANCEL(nlm4_cancargs) = 3; + nlm4_res NLMPROC4_UNLOCK(nlm4_unlockargs) = 4; + nlm4_res NLMPROC4_GRANTED(nlm4_testargs) = 5; + void NLMPROC4_TEST_MSG(nlm4_testargs) = 6; + void NLMPROC4_LOCK_MSG(nlm4_lockargs) = 7; + void NLMPROC4_CANCEL_MSG(nlm4_cancargs) = 8; + void NLMPROC4_UNLOCK_MSG(nlm4_unlockargs) = 9; + void NLMPROC4_GRANTED_MSG(nlm4_testargs) = 10; + void NLMPROC4_TEST_RES(nlm4_testres) = 11; + void NLMPROC4_LOCK_RES(nlm4_res) = 12; + void NLMPROC4_CANCEL_RES(nlm4_res) = 13; + void NLMPROC4_UNLOCK_RES(nlm4_res) = 14; + void NLMPROC4_GRANTED_RES(nlm4_res) = 15; + void NLMPROC4_SM_NOTIFY(nlm4_notifyargs) = 16; + nlm4_shareres NLMPROC4_SHARE(nlm4_shareargs) = 20; + nlm4_shareres NLMPROC4_UNSHARE(nlm4_shareargs) = 21; + nlm4_res NLMPROC4_NM_LOCK(nlm4_lockargs) = 22; + void NLMPROC4_FREE_ALL(nlm4_notify) = 23; + } = 4; +} = 100021; diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile index 51bbe22d21e3..8e9d18a4348c 100644 --- a/fs/lockd/Makefile +++ b/fs/lockd/Makefile @@ -9,5 +9,33 @@ 
obj-$(CONFIG_LOCKD) += lockd.o lockd-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \ svcshare.o svcproc.o svcsubs.o mon.o trace.o xdr.o netlink.o -lockd-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o +lockd-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o nlm4xdr_gen.o lockd-$(CONFIG_PROC_FS) += procfs.o + +# +# XDR code generation (requires Python and additional packages) +# +# The generated *xdr_gen.{h,c} files are checked into git. Normal kernel +# builds do not require the xdrgen tool or its Python dependencies. +# +# Developers modifying .x files in Documentation/sunrpc/xdr/ should run +# "make xdrgen" to regenerate the affected files. +# +.PHONY: xdrgen + +XDRGEN = ../../tools/net/sunrpc/xdrgen/xdrgen + +XDRGEN_DEFINITIONS = ../../include/linux/sunrpc/xdrgen/nlm4.h +XDRGEN_DECLARATIONS = nlm4xdr_gen.h +XDRGEN_SOURCE = nlm4xdr_gen.c + +xdrgen: $(XDRGEN_DEFINITIONS) $(XDRGEN_DECLARATIONS) $(XDRGEN_SOURCE) + +../../include/linux/sunrpc/xdrgen/nlm4.h: ../../Documentation/sunrpc/xdr/nlm4.x + $(XDRGEN) definitions $< > $@ + +nlm4xdr_gen.h: ../../Documentation/sunrpc/xdr/nlm4.x + $(XDRGEN) declarations $< > $@ + +nlm4xdr_gen.c: ../../Documentation/sunrpc/xdr/nlm4.x + $(XDRGEN) source --peer server $< > $@ diff --git a/fs/lockd/nlm4xdr_gen.c b/fs/lockd/nlm4xdr_gen.c new file mode 100644 index 000000000000..1c8c221db456 --- /dev/null +++ b/fs/lockd/nlm4xdr_gen.c @@ -0,0 +1,724 @@ +// SPDX-License-Identifier: GPL-2.0 +// Generated by xdrgen. Manual edits will be lost. 
+// XDR specification file: ../../Documentation/sunrpc/xdr/nlm4.x +// XDR specification modification time: Thu Dec 25 13:10:19 2025 + +#include + +#include "nlm4xdr_gen.h" + +static bool __maybe_unused +xdrgen_decode_netobj(struct xdr_stream *xdr, netobj *ptr) +{ + return xdrgen_decode_opaque(xdr, ptr, MAXNETOBJ_SZ); +} + +static bool __maybe_unused +xdrgen_decode_fsh4_mode(struct xdr_stream *xdr, fsh4_mode *ptr) +{ + u32 val; + + if (xdr_stream_decode_u32(xdr, &val) < 0) + return false; + *ptr = val; + return true; +} + +static bool __maybe_unused +xdrgen_decode_fsh4_access(struct xdr_stream *xdr, fsh4_access *ptr) +{ + u32 val; + + if (xdr_stream_decode_u32(xdr, &val) < 0) + return false; + *ptr = val; + return true; +} + +static bool __maybe_unused +xdrgen_decode_uint64(struct xdr_stream *xdr, uint64 *ptr) +{ + return xdrgen_decode_unsigned_hyper(xdr, ptr); +} + +static bool __maybe_unused +xdrgen_decode_int64(struct xdr_stream *xdr, int64 *ptr) +{ + return xdrgen_decode_hyper(xdr, ptr); +} + +static bool __maybe_unused +xdrgen_decode_uint32(struct xdr_stream *xdr, uint32 *ptr) +{ + return xdrgen_decode_unsigned_long(xdr, ptr); +} + +static bool __maybe_unused +xdrgen_decode_int32(struct xdr_stream *xdr, int32 *ptr) +{ + return xdrgen_decode_long(xdr, ptr); +} + +static bool __maybe_unused +xdrgen_decode_nlm4_stats(struct xdr_stream *xdr, nlm4_stats *ptr) +{ + return xdr_stream_decode_be32(xdr, ptr) == 0; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_holder(struct xdr_stream *xdr, struct nlm4_holder *ptr) +{ + if (!xdrgen_decode_bool(xdr, &ptr->exclusive)) + return false; + if (!xdrgen_decode_int32(xdr, &ptr->svid)) + return false; + if (!xdrgen_decode_netobj(xdr, &ptr->oh)) + return false; + if (!xdrgen_decode_uint64(xdr, &ptr->l_offset)) + return false; + if (!xdrgen_decode_uint64(xdr, &ptr->l_len)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_testrply(struct xdr_stream *xdr, struct nlm4_testrply *ptr) +{ + if 
(!xdrgen_decode_nlm4_stats(xdr, &ptr->stat)) + return false; + switch (ptr->stat) { + case __constant_cpu_to_be32(NLM4_DENIED): + if (!xdrgen_decode_nlm4_holder(xdr, &ptr->u.holder)) + return false; + break; + default: + break; + } + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_stat(struct xdr_stream *xdr, struct nlm4_stat *ptr) +{ + if (!xdrgen_decode_nlm4_stats(xdr, &ptr->stat)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_res(struct xdr_stream *xdr, struct nlm4_res *ptr) +{ + if (!xdrgen_decode_netobj(xdr, &ptr->cookie)) + return false; + if (!xdrgen_decode_nlm4_stat(xdr, &ptr->stat)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_testres(struct xdr_stream *xdr, struct nlm4_testres *ptr) +{ + if (!xdrgen_decode_netobj(xdr, &ptr->cookie)) + return false; + if (!xdrgen_decode_nlm4_testrply(xdr, &ptr->stat)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_lock(struct xdr_stream *xdr, struct nlm4_lock *ptr) +{ + if (!xdrgen_decode_string(xdr, (string *)ptr, LM_MAXSTRLEN)) + return false; + if (!xdrgen_decode_netobj(xdr, &ptr->fh)) + return false; + if (!xdrgen_decode_netobj(xdr, &ptr->oh)) + return false; + if (!xdrgen_decode_int32(xdr, &ptr->svid)) + return false; + if (!xdrgen_decode_uint64(xdr, &ptr->l_offset)) + return false; + if (!xdrgen_decode_uint64(xdr, &ptr->l_len)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_lockargs(struct xdr_stream *xdr, struct nlm4_lockargs *ptr) +{ + if (!xdrgen_decode_netobj(xdr, &ptr->cookie)) + return false; + if (!xdrgen_decode_bool(xdr, &ptr->block)) + return false; + if (!xdrgen_decode_bool(xdr, &ptr->exclusive)) + return false; + if (!xdrgen_decode_nlm4_lock(xdr, &ptr->alock)) + return false; + if (!xdrgen_decode_bool(xdr, &ptr->reclaim)) + return false; + if (!xdrgen_decode_int32(xdr, &ptr->state)) + return false; + return true; +} + +static bool 
__maybe_unused +xdrgen_decode_nlm4_cancargs(struct xdr_stream *xdr, struct nlm4_cancargs *ptr) +{ + if (!xdrgen_decode_netobj(xdr, &ptr->cookie)) + return false; + if (!xdrgen_decode_bool(xdr, &ptr->block)) + return false; + if (!xdrgen_decode_bool(xdr, &ptr->exclusive)) + return false; + if (!xdrgen_decode_nlm4_lock(xdr, &ptr->alock)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_testargs(struct xdr_stream *xdr, struct nlm4_testargs *ptr) +{ + if (!xdrgen_decode_netobj(xdr, &ptr->cookie)) + return false; + if (!xdrgen_decode_bool(xdr, &ptr->exclusive)) + return false; + if (!xdrgen_decode_nlm4_lock(xdr, &ptr->alock)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_unlockargs(struct xdr_stream *xdr, struct nlm4_unlockargs *ptr) +{ + if (!xdrgen_decode_netobj(xdr, &ptr->cookie)) + return false; + if (!xdrgen_decode_nlm4_lock(xdr, &ptr->alock)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_share(struct xdr_stream *xdr, struct nlm4_share *ptr) +{ + if (!xdrgen_decode_string(xdr, (string *)ptr, LM_MAXSTRLEN)) + return false; + if (!xdrgen_decode_netobj(xdr, &ptr->fh)) + return false; + if (!xdrgen_decode_netobj(xdr, &ptr->oh)) + return false; + if (!xdrgen_decode_fsh4_mode(xdr, &ptr->mode)) + return false; + if (!xdrgen_decode_fsh4_access(xdr, &ptr->access)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_shareargs(struct xdr_stream *xdr, struct nlm4_shareargs *ptr) +{ + if (!xdrgen_decode_netobj(xdr, &ptr->cookie)) + return false; + if (!xdrgen_decode_nlm4_share(xdr, &ptr->share)) + return false; + if (!xdrgen_decode_bool(xdr, &ptr->reclaim)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_shareres(struct xdr_stream *xdr, struct nlm4_shareres *ptr) +{ + if (!xdrgen_decode_netobj(xdr, &ptr->cookie)) + return false; + if (!xdrgen_decode_nlm4_stats(xdr, &ptr->stat)) + return false; + if 
(!xdrgen_decode_int32(xdr, &ptr->sequence)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_notify(struct xdr_stream *xdr, struct nlm4_notify *ptr) +{ + if (!xdrgen_decode_string(xdr, (string *)ptr, LM_MAXNAMELEN)) + return false; + if (!xdrgen_decode_int32(xdr, &ptr->state)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_notifyargs(struct xdr_stream *xdr, struct nlm4_notifyargs *ptr) +{ + if (!xdrgen_decode_nlm4_notify(xdr, &ptr->notify)) + return false; + if (xdr_stream_decode_opaque_fixed(xdr, ptr->private, SM_PRIV_SIZE) < 0) + return false; + return true; +} + +/** + * nlm4_svc_decode_void - Decode a void argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + return xdrgen_decode_void(xdr); +} + +/** + * nlm4_svc_decode_nlm4_testargs - Decode a nlm4_testargs argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_testargs *argp = rqstp->rq_argp; + + return xdrgen_decode_nlm4_testargs(xdr, argp); +} + +/** + * nlm4_svc_decode_nlm4_lockargs - Decode a nlm4_lockargs argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_lockargs *argp = rqstp->rq_argp; + + return xdrgen_decode_nlm4_lockargs(xdr, argp); +} + +/** + * nlm4_svc_decode_nlm4_cancargs - Decode a nlm4_cancargs argument + * @rqstp: RPC transaction context + * @xdr: source XDR 
data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_cancargs *argp = rqstp->rq_argp; + + return xdrgen_decode_nlm4_cancargs(xdr, argp); +} + +/** + * nlm4_svc_decode_nlm4_unlockargs - Decode a nlm4_unlockargs argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_unlockargs *argp = rqstp->rq_argp; + + return xdrgen_decode_nlm4_unlockargs(xdr, argp); +} + +/** + * nlm4_svc_decode_nlm4_testres - Decode a nlm4_testres argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_testres *argp = rqstp->rq_argp; + + return xdrgen_decode_nlm4_testres(xdr, argp); +} + +/** + * nlm4_svc_decode_nlm4_res - Decode a nlm4_res argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_res(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_res *argp = rqstp->rq_argp; + + return xdrgen_decode_nlm4_res(xdr, argp); +} + +/** + * nlm4_svc_decode_nlm4_notifyargs - Decode a nlm4_notifyargs argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_notifyargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_notifyargs *argp = rqstp->rq_argp; + + 
return xdrgen_decode_nlm4_notifyargs(xdr, argp); +} + +/** + * nlm4_svc_decode_nlm4_shareargs - Decode a nlm4_shareargs argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_shareargs *argp = rqstp->rq_argp; + + return xdrgen_decode_nlm4_shareargs(xdr, argp); +} + +/** + * nlm4_svc_decode_nlm4_notify - Decode a nlm4_notify argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_notify *argp = rqstp->rq_argp; + + return xdrgen_decode_nlm4_notify(xdr, argp); +} + +static bool __maybe_unused +xdrgen_encode_netobj(struct xdr_stream *xdr, const netobj value) +{ + return xdr_stream_encode_opaque(xdr, value.data, value.len) >= 0; +} + +static bool __maybe_unused +xdrgen_encode_fsh4_mode(struct xdr_stream *xdr, fsh4_mode value) +{ + return xdr_stream_encode_u32(xdr, value) == XDR_UNIT; +} + +static bool __maybe_unused +xdrgen_encode_fsh4_access(struct xdr_stream *xdr, fsh4_access value) +{ + return xdr_stream_encode_u32(xdr, value) == XDR_UNIT; +} + +static bool __maybe_unused +xdrgen_encode_uint64(struct xdr_stream *xdr, const uint64 value) +{ + return xdrgen_encode_unsigned_hyper(xdr, value); +} + +static bool __maybe_unused +xdrgen_encode_int64(struct xdr_stream *xdr, const int64 value) +{ + return xdrgen_encode_hyper(xdr, value); +} + +static bool __maybe_unused +xdrgen_encode_uint32(struct xdr_stream *xdr, const uint32 value) +{ + return xdrgen_encode_unsigned_long(xdr, value); +} + +static bool __maybe_unused +xdrgen_encode_int32(struct xdr_stream *xdr, const int32 value) +{ + return xdrgen_encode_long(xdr, value); 
+} + +static bool __maybe_unused +xdrgen_encode_nlm4_stats(struct xdr_stream *xdr, nlm4_stats value) +{ + return xdr_stream_encode_be32(xdr, value) == XDR_UNIT; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_holder(struct xdr_stream *xdr, const struct nlm4_holder *value) +{ + if (!xdrgen_encode_bool(xdr, value->exclusive)) + return false; + if (!xdrgen_encode_int32(xdr, value->svid)) + return false; + if (!xdrgen_encode_netobj(xdr, value->oh)) + return false; + if (!xdrgen_encode_uint64(xdr, value->l_offset)) + return false; + if (!xdrgen_encode_uint64(xdr, value->l_len)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_testrply(struct xdr_stream *xdr, const struct nlm4_testrply *ptr) +{ + if (!xdrgen_encode_nlm4_stats(xdr, ptr->stat)) + return false; + switch (ptr->stat) { + case __constant_cpu_to_be32(NLM4_DENIED): + if (!xdrgen_encode_nlm4_holder(xdr, &ptr->u.holder)) + return false; + break; + default: + break; + } + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_stat(struct xdr_stream *xdr, const struct nlm4_stat *value) +{ + if (!xdrgen_encode_nlm4_stats(xdr, value->stat)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_res(struct xdr_stream *xdr, const struct nlm4_res *value) +{ + if (!xdrgen_encode_netobj(xdr, value->cookie)) + return false; + if (!xdrgen_encode_nlm4_stat(xdr, &value->stat)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_testres(struct xdr_stream *xdr, const struct nlm4_testres *value) +{ + if (!xdrgen_encode_netobj(xdr, value->cookie)) + return false; + if (!xdrgen_encode_nlm4_testrply(xdr, &value->stat)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_lock(struct xdr_stream *xdr, const struct nlm4_lock *value) +{ + if (value->caller_name.len > LM_MAXSTRLEN) + return false; + if (xdr_stream_encode_opaque(xdr, value->caller_name.data, value->caller_name.len) < 0) + return 
false; + if (!xdrgen_encode_netobj(xdr, value->fh)) + return false; + if (!xdrgen_encode_netobj(xdr, value->oh)) + return false; + if (!xdrgen_encode_int32(xdr, value->svid)) + return false; + if (!xdrgen_encode_uint64(xdr, value->l_offset)) + return false; + if (!xdrgen_encode_uint64(xdr, value->l_len)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_lockargs(struct xdr_stream *xdr, const struct nlm4_lockargs *value) +{ + if (!xdrgen_encode_netobj(xdr, value->cookie)) + return false; + if (!xdrgen_encode_bool(xdr, value->block)) + return false; + if (!xdrgen_encode_bool(xdr, value->exclusive)) + return false; + if (!xdrgen_encode_nlm4_lock(xdr, &value->alock)) + return false; + if (!xdrgen_encode_bool(xdr, value->reclaim)) + return false; + if (!xdrgen_encode_int32(xdr, value->state)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_cancargs(struct xdr_stream *xdr, const struct nlm4_cancargs *value) +{ + if (!xdrgen_encode_netobj(xdr, value->cookie)) + return false; + if (!xdrgen_encode_bool(xdr, value->block)) + return false; + if (!xdrgen_encode_bool(xdr, value->exclusive)) + return false; + if (!xdrgen_encode_nlm4_lock(xdr, &value->alock)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_testargs(struct xdr_stream *xdr, const struct nlm4_testargs *value) +{ + if (!xdrgen_encode_netobj(xdr, value->cookie)) + return false; + if (!xdrgen_encode_bool(xdr, value->exclusive)) + return false; + if (!xdrgen_encode_nlm4_lock(xdr, &value->alock)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_unlockargs(struct xdr_stream *xdr, const struct nlm4_unlockargs *value) +{ + if (!xdrgen_encode_netobj(xdr, value->cookie)) + return false; + if (!xdrgen_encode_nlm4_lock(xdr, &value->alock)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_share(struct xdr_stream *xdr, const struct nlm4_share *value) +{ + 
if (value->caller_name.len > LM_MAXSTRLEN) + return false; + if (xdr_stream_encode_opaque(xdr, value->caller_name.data, value->caller_name.len) < 0) + return false; + if (!xdrgen_encode_netobj(xdr, value->fh)) + return false; + if (!xdrgen_encode_netobj(xdr, value->oh)) + return false; + if (!xdrgen_encode_fsh4_mode(xdr, value->mode)) + return false; + if (!xdrgen_encode_fsh4_access(xdr, value->access)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_shareargs(struct xdr_stream *xdr, const struct nlm4_shareargs *value) +{ + if (!xdrgen_encode_netobj(xdr, value->cookie)) + return false; + if (!xdrgen_encode_nlm4_share(xdr, &value->share)) + return false; + if (!xdrgen_encode_bool(xdr, value->reclaim)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_shareres(struct xdr_stream *xdr, const struct nlm4_shareres *value) +{ + if (!xdrgen_encode_netobj(xdr, value->cookie)) + return false; + if (!xdrgen_encode_nlm4_stats(xdr, value->stat)) + return false; + if (!xdrgen_encode_int32(xdr, value->sequence)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_notify(struct xdr_stream *xdr, const struct nlm4_notify *value) +{ + if (value->name.len > LM_MAXNAMELEN) + return false; + if (xdr_stream_encode_opaque(xdr, value->name.data, value->name.len) < 0) + return false; + if (!xdrgen_encode_int32(xdr, value->state)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_notifyargs(struct xdr_stream *xdr, const struct nlm4_notifyargs *value) +{ + if (!xdrgen_encode_nlm4_notify(xdr, &value->notify)) + return false; + if (xdr_stream_encode_opaque_fixed(xdr, value->private, SM_PRIV_SIZE) < 0) + return false; + return true; +} + +/** + * nlm4_svc_encode_void - Encode a void result + * @rqstp: RPC transaction context + * @xdr: target XDR data stream + * + * Return values: + * %true: procedure results encoded successfully + * %false: encode failed + */ 
+bool nlm4_svc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + return xdrgen_encode_void(xdr); +} + +/** + * nlm4_svc_encode_nlm4_testres - Encode a nlm4_testres result + * @rqstp: RPC transaction context + * @xdr: target XDR data stream + * + * Return values: + * %true: procedure results encoded successfully + * %false: encode failed + */ +bool nlm4_svc_encode_nlm4_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_testres *resp = rqstp->rq_resp; + + return xdrgen_encode_nlm4_testres(xdr, resp); +} + +/** + * nlm4_svc_encode_nlm4_res - Encode a nlm4_res result + * @rqstp: RPC transaction context + * @xdr: target XDR data stream + * + * Return values: + * %true: procedure results encoded successfully + * %false: encode failed + */ +bool nlm4_svc_encode_nlm4_res(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_res *resp = rqstp->rq_resp; + + return xdrgen_encode_nlm4_res(xdr, resp); +} + +/** + * nlm4_svc_encode_nlm4_shareres - Encode a nlm4_shareres result + * @rqstp: RPC transaction context + * @xdr: target XDR data stream + * + * Return values: + * %true: procedure results encoded successfully + * %false: encode failed + */ +bool nlm4_svc_encode_nlm4_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_shareres *resp = rqstp->rq_resp; + + return xdrgen_encode_nlm4_shareres(xdr, resp); +} diff --git a/fs/lockd/nlm4xdr_gen.h b/fs/lockd/nlm4xdr_gen.h new file mode 100644 index 000000000000..b6008b296a3e --- /dev/null +++ b/fs/lockd/nlm4xdr_gen.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Generated by xdrgen. Manual edits will be lost. 
*/ +/* XDR specification file: ../../Documentation/sunrpc/xdr/nlm4.x */ +/* XDR specification modification time: Thu Dec 25 13:10:19 2025 */ + +#ifndef _LINUX_XDRGEN_NLM4_DECL_H +#define _LINUX_XDRGEN_NLM4_DECL_H + +#include + +#include +#include +#include +#include + +bool nlm4_svc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_notifyargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr); + +bool nlm4_svc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_encode_nlm4_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_encode_nlm4_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_encode_nlm4_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr); + +#endif /* _LINUX_XDRGEN_NLM4_DECL_H */ diff --git a/include/linux/sunrpc/xdrgen/nlm4.h b/include/linux/sunrpc/xdrgen/nlm4.h new file mode 100644 index 000000000000..e95e8f105624 --- /dev/null +++ b/include/linux/sunrpc/xdrgen/nlm4.h @@ -0,0 +1,233 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Generated by xdrgen. Manual edits will be lost. 
*/ +/* XDR specification file: ../../Documentation/sunrpc/xdr/nlm4.x */ +/* XDR specification modification time: Thu Dec 25 13:10:19 2025 */ + +#ifndef _LINUX_XDRGEN_NLM4_DEF_H +#define _LINUX_XDRGEN_NLM4_DEF_H + +#include +#include + +enum { LM_MAXSTRLEN = 1024 }; + +enum { LM_MAXNAMELEN = 1025 }; + +enum { MAXNETOBJ_SZ = 1024 }; + +typedef opaque netobj; + +enum fsh4_mode { + fsm_DN = 0, + fsm_DR = 1, + fsm_DW = 2, + fsm_DRW = 3, +}; + +typedef enum fsh4_mode fsh4_mode; + +enum fsh4_access { + fsa_NONE = 0, + fsa_R = 1, + fsa_W = 2, + fsa_RW = 3, +}; + +typedef enum fsh4_access fsh4_access; + +enum { SM_MAXSTRLEN = 1024 }; + +typedef u64 uint64; + +typedef s64 int64; + +typedef u32 uint32; + +typedef s32 int32; + +enum nlm4_stats { + NLM4_GRANTED = 0, + NLM4_DENIED = 1, + NLM4_DENIED_NOLOCKS = 2, + NLM4_BLOCKED = 3, + NLM4_DENIED_GRACE_PERIOD = 4, + NLM4_DEADLCK = 5, + NLM4_ROFS = 6, + NLM4_STALE_FH = 7, + NLM4_FBIG = 8, + NLM4_FAILED = 9, +}; + +typedef __be32 nlm4_stats; + +struct nlm4_holder { + bool exclusive; + int32 svid; + netobj oh; + uint64 l_offset; + uint64 l_len; +}; + +struct nlm4_testrply { + nlm4_stats stat; + union { + struct nlm4_holder holder; + } u; +}; + +struct nlm4_stat { + nlm4_stats stat; +}; + +struct nlm4_res { + netobj cookie; + struct nlm4_stat stat; +}; + +struct nlm4_testres { + netobj cookie; + struct nlm4_testrply stat; +}; + +struct nlm4_lock { + string caller_name; + netobj fh; + netobj oh; + int32 svid; + uint64 l_offset; + uint64 l_len; +}; + +struct nlm4_lockargs { + netobj cookie; + bool block; + bool exclusive; + struct nlm4_lock alock; + bool reclaim; + int32 state; +}; + +struct nlm4_cancargs { + netobj cookie; + bool block; + bool exclusive; + struct nlm4_lock alock; +}; + +struct nlm4_testargs { + netobj cookie; + bool exclusive; + struct nlm4_lock alock; +}; + +struct nlm4_unlockargs { + netobj cookie; + struct nlm4_lock alock; +}; + +struct nlm4_share { + string caller_name; + netobj fh; + netobj oh; + fsh4_mode mode; 
+ fsh4_access access; +}; + +struct nlm4_shareargs { + netobj cookie; + struct nlm4_share share; + bool reclaim; +}; + +struct nlm4_shareres { + netobj cookie; + nlm4_stats stat; + int32 sequence; +}; + +struct nlm4_notify { + string name; + int32 state; +}; + +enum { SM_PRIV_SIZE = 16 }; + +struct nlm4_notifyargs { + struct nlm4_notify notify; + u8 private[SM_PRIV_SIZE]; +}; + +enum { + NLMPROC4_NULL = 0, + NLMPROC4_TEST = 1, + NLMPROC4_LOCK = 2, + NLMPROC4_CANCEL = 3, + NLMPROC4_UNLOCK = 4, + NLMPROC4_GRANTED = 5, + NLMPROC4_TEST_MSG = 6, + NLMPROC4_LOCK_MSG = 7, + NLMPROC4_CANCEL_MSG = 8, + NLMPROC4_UNLOCK_MSG = 9, + NLMPROC4_GRANTED_MSG = 10, + NLMPROC4_TEST_RES = 11, + NLMPROC4_LOCK_RES = 12, + NLMPROC4_CANCEL_RES = 13, + NLMPROC4_UNLOCK_RES = 14, + NLMPROC4_GRANTED_RES = 15, + NLMPROC4_SM_NOTIFY = 16, + NLMPROC4_SHARE = 20, + NLMPROC4_UNSHARE = 21, + NLMPROC4_NM_LOCK = 22, + NLMPROC4_FREE_ALL = 23, +}; + +#ifndef NLM4_PROG +#define NLM4_PROG (100021) +#endif + +#define NLM4_netobj_sz (XDR_unsigned_int + XDR_QUADLEN(MAXNETOBJ_SZ)) +#define NLM4_fsh4_mode_sz (XDR_int) +#define NLM4_fsh4_access_sz (XDR_int) +#define NLM4_uint64_sz \ + (XDR_unsigned_hyper) +#define NLM4_int64_sz \ + (XDR_hyper) +#define NLM4_uint32_sz \ + (XDR_unsigned_long) +#define NLM4_int32_sz \ + (XDR_long) +#define NLM4_nlm4_stats_sz (XDR_int) +#define NLM4_nlm4_holder_sz \ + (XDR_bool + NLM4_int32_sz + NLM4_netobj_sz + NLM4_uint64_sz + NLM4_uint64_sz) +#define NLM4_nlm4_testrply_sz \ + (NLM4_nlm4_stats_sz + NLM4_nlm4_holder_sz) +#define NLM4_nlm4_stat_sz \ + (NLM4_nlm4_stats_sz) +#define NLM4_nlm4_res_sz \ + (NLM4_netobj_sz + NLM4_nlm4_stat_sz) +#define NLM4_nlm4_testres_sz \ + (NLM4_netobj_sz + NLM4_nlm4_testrply_sz) +#define NLM4_nlm4_lock_sz \ + (XDR_unsigned_int + XDR_QUADLEN(LM_MAXSTRLEN) + NLM4_netobj_sz + NLM4_netobj_sz + NLM4_int32_sz + NLM4_uint64_sz + NLM4_uint64_sz) +#define NLM4_nlm4_lockargs_sz \ + (NLM4_netobj_sz + XDR_bool + XDR_bool + NLM4_nlm4_lock_sz + XDR_bool + 
NLM4_int32_sz) +#define NLM4_nlm4_cancargs_sz \ + (NLM4_netobj_sz + XDR_bool + XDR_bool + NLM4_nlm4_lock_sz) +#define NLM4_nlm4_testargs_sz \ + (NLM4_netobj_sz + XDR_bool + NLM4_nlm4_lock_sz) +#define NLM4_nlm4_unlockargs_sz \ + (NLM4_netobj_sz + NLM4_nlm4_lock_sz) +#define NLM4_nlm4_share_sz \ + (XDR_unsigned_int + XDR_QUADLEN(LM_MAXSTRLEN) + NLM4_netobj_sz + NLM4_netobj_sz + NLM4_fsh4_mode_sz + NLM4_fsh4_access_sz) +#define NLM4_nlm4_shareargs_sz \ + (NLM4_netobj_sz + NLM4_nlm4_share_sz + XDR_bool) +#define NLM4_nlm4_shareres_sz \ + (NLM4_netobj_sz + NLM4_nlm4_stats_sz + NLM4_int32_sz) +#define NLM4_nlm4_notify_sz \ + (XDR_unsigned_int + XDR_QUADLEN(LM_MAXNAMELEN) + NLM4_int32_sz) +#define NLM4_nlm4_notifyargs_sz \ + (NLM4_nlm4_notify_sz + XDR_QUADLEN(SM_PRIV_SIZE)) +#define NLM4_MAX_ARGS_SZ \ + (NLM4_nlm4_lockargs_sz) + +#endif /* _LINUX_XDRGEN_NLM4_DEF_H */ From 3b4839f09ca2615f7f6c99c9f9891a1a5b62071e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:06:54 -0500 Subject: [PATCH 28/83] lockd: Use xdrgen XDR functions for the NLMv4 NULL procedure Hand-written XDR encoders and decoders are difficult to maintain and can inadvertently diverge from protocol specifications. By migrating to xdrgen-generated code, we improve type safety and ensure the implementation exactly matches the NLM version 4 protocol specification. This patch begins the migration by converting the NULL procedure to use nlm4_svc_decode_void and nlm4_svc_encode_void generated from Documentation/sunrpc/xdr/nlm4.x. The NULL procedure is straightforward as it has no arguments or results, making it an ideal starting point for this series. The pc_xdrressize field is set to XDR_void (zero) to reflect that this procedure returns no XDR-encoded data. The argzero field is also set to zero since xdrgen decoders reliably initialize all decoded values. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index c99f192bce77..4fcd66beb4df 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -13,7 +13,17 @@ #include #include "lockd.h" + +/* + * xdr.h defines SM_MAXSTRLEN and SM_PRIV_SIZE as macros. + * nlm4xdr_gen.h defines them as enum constants. Undefine the + * macros to allow the xdrgen enum definitions to be used. + */ +#undef SM_MAXSTRLEN +#undef SM_PRIV_SIZE + #include "share.h" +#include "nlm4xdr_gen.h" #include "xdr4.h" #define NLMDBG_FACILITY NLMDBG_CLIENT @@ -92,13 +102,19 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, } } -/* - * NULL: Test for presence of service +/** + * nlm4svc_proc_null - NULL: Test for presence of service + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully + * + * RPC synopsis: + * void NLMPROC4_NULL(void) = 0; */ static __be32 nlm4svc_proc_null(struct svc_rqst *rqstp) { - dprintk("lockd: NULL called\n"); return rpc_success; } @@ -531,15 +547,15 @@ struct nlm_void { int dummy; }; #define Rg 4 /* range (offset + length) */ static const struct svc_procedure nlm4svc_procedures[24] = { - [NLMPROC_NULL] = { - .pc_func = nlm4svc_proc_null, - .pc_decode = nlm4svc_decode_void, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_void), - .pc_argzero = sizeof(struct nlm_void), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "NULL", + [NLMPROC4_NULL] = { + .pc_func = nlm4svc_proc_null, + .pc_decode = nlm4_svc_decode_void, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = XDR_void, + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "NULL", }, [NLMPROC_TEST] = { .pc_func = nlm4svc_proc_test, From 3de744ee4e4557da0d63be8a97ad44b4dad58912 Mon Sep 17 00:00:00 2001 From: 
Chuck Lever Date: Tue, 17 Feb 2026 17:06:55 -0500 Subject: [PATCH 29/83] lockd: Use xdrgen XDR functions for the NLMv4 TEST procedure The NLM TEST procedure requires host and file lookups to check lock state, operations that will be common across multiple NLM procedures being migrated to xdrgen. By introducing the helper functions nlm4svc_lookup_host() and nlm4svc_lookup_file() now, we establish reusable patterns for subsequent conversions in this series. This patch converts the TEST procedure to use xdrgen functions nlm4_svc_decode_nlm4_testargs and nlm4_svc_encode_nlm4_testres generated from the NLM version 4 protocol specification. The procedure handler is rewritten to use xdrgen types through wrapper structures that bridge between generated code and the legacy nlm_lock representation still used by the core lockd logic. TEST_MSG is to be converted in a subsequent patch. The pc_argzero field is set to zero because xdrgen decoders reliably initialize all arguments in the argp->xdrgen field, making the early defensive memset unnecessary. Remaining argp fields are cleared as needed. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 186 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 174 insertions(+), 12 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4fcd66beb4df..b07ab4d60871 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -28,6 +28,95 @@ #define NLMDBG_FACILITY NLMDBG_CLIENT +/* + * Wrapper structures combine xdrgen types with legacy nlm_lock. + * The xdrgen field must be first so the structure can be cast + * to its XDR type for the RPC dispatch layer.
+ */ +struct nlm4_testargs_wrapper { + struct nlm4_testargs xdrgen; + struct nlm_lock lock; +}; + +static_assert(offsetof(struct nlm4_testargs_wrapper, xdrgen) == 0); + +struct nlm4_testres_wrapper { + struct nlm4_testres xdrgen; + struct nlm_lock lock; +}; + +static_assert(offsetof(struct nlm4_testres_wrapper, xdrgen) == 0); + +static struct nlm_host * +nlm4svc_lookup_host(struct svc_rqst *rqstp, string caller, bool monitored) +{ + struct nlm_host *host; + + if (!nlmsvc_ops) + return NULL; + host = nlmsvc_lookup_host(rqstp, caller.data, caller.len); + if (!host) + return NULL; + if (monitored && nsm_monitor(host) < 0) { + nlmsvc_release_host(host); + return NULL; + } + return host; +} + +static __be32 +nlm4svc_lookup_file(struct svc_rqst *rqstp, struct nlm_host *host, + struct nlm_lock *lock, struct nlm_file **filp, + struct nlm4_lock *xdr_lock, unsigned char type) +{ + struct file_lock *fl = &lock->fl; + struct nlm_file *file = NULL; + __be32 error; + + if (xdr_lock->fh.len > NFS_MAXFHSIZE) + return nlm_lck_denied_nolocks; + lock->fh.size = xdr_lock->fh.len; + memcpy(lock->fh.data, xdr_lock->fh.data, xdr_lock->fh.len); + + lock->oh.len = xdr_lock->oh.len; + lock->oh.data = xdr_lock->oh.data; + + lock->svid = xdr_lock->svid; + lock->lock_start = xdr_lock->l_offset; + lock->lock_len = xdr_lock->l_len; + + if (lock->lock_start > OFFSET_MAX || + (lock->lock_len && ((lock->lock_len - 1) > (OFFSET_MAX - lock->lock_start)))) + return nlm4_fbig; + + locks_init_lock(fl); + fl->c.flc_type = type; + lockd_set_file_lock_range4(fl, lock->lock_start, lock->lock_len); + + error = nlm_lookup_file(rqstp, &file, lock); + switch (error) { + case nlm_granted: + break; + case nlm__int__stale_fh: + return nlm4_stale_fh; + case nlm__int__failed: + return nlm4_failed; + default: + return error; + } + *filp = file; + + fl->c.flc_flags = FL_POSIX; + fl->c.flc_file = file->f_file[lock_to_openmode(fl)]; + fl->c.flc_pid = current->tgid; + fl->fl_lmops = &nlmsvc_lock_operations; + 
nlmsvc_locks_init_private(fl, host, (pid_t)lock->svid); + if (!fl->c.flc_owner) + return nlm_lck_denied_nolocks; + + return nlm_granted; +} + /* * Obtain client and file from arguments */ @@ -151,10 +240,81 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp) return rc; } -static __be32 -nlm4svc_proc_test(struct svc_rqst *rqstp) +/** + * nlm4svc_proc_test - TEST: Check for conflicting lock + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * %rpc_drop_reply: Do not send an RPC reply. + * + * RPC synopsis: + * nlm4_testres NLMPROC4_TEST(nlm4_testargs) = 1; + * + * Permissible procedure status codes: + * %NLM4_GRANTED: The server would be able to grant the + * requested lock. + * %NLM4_DENIED: The requested lock conflicted with existing + * lock reservations for the file. + * %NLM4_DENIED_NOLOCKS: The server could not allocate the resources + * needed to process the request. + * %NLM4_DENIED_GRACE_PERIOD: The server has recently restarted and is + * re-establishing existing locks, and is not + * yet ready to accept normal service requests. + * + * The Linux NLM server implementation also returns: + * %NLM4_STALE_FH: The request specified an invalid file handle. + * %NLM4_FBIG: The request specified a length or offset + * that exceeds the range supported by the + * server. + * %NLM4_FAILED: The request failed for an unspecified reason. + */ +static __be32 nlm4svc_proc_test(struct svc_rqst *rqstp) { - return __nlm4svc_proc_test(rqstp, rqstp->rq_resp); + struct nlm4_testargs_wrapper *argp = rqstp->rq_argp; + unsigned char type = argp->xdrgen.exclusive ? 
F_WRLCK : F_RDLCK; + struct nlm4_testres_wrapper *resp = rqstp->rq_resp; + struct nlm_file *file = NULL; + struct nlm_host *host; + + resp->xdrgen.cookie = argp->xdrgen.cookie; + + resp->xdrgen.stat.stat = nlm_lck_denied_nolocks; + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, false); + if (!host) + goto out; + + resp->xdrgen.stat.stat = nlm4svc_lookup_file(rqstp, host, &argp->lock, + &file, &argp->xdrgen.alock, + type); + if (resp->xdrgen.stat.stat) + goto out; + + resp->xdrgen.stat.stat = nlmsvc_testlock(rqstp, file, host, + &argp->lock, &resp->lock); + nlmsvc_release_lockowner(&argp->lock); + + if (resp->xdrgen.stat.stat == nlm_lck_denied) { + struct nlm_lock *conf = &resp->lock; + struct nlm4_holder *holder = &resp->xdrgen.stat.u.holder; + + holder->exclusive = (conf->fl.c.flc_type != F_RDLCK); + holder->svid = conf->svid; + holder->oh.len = conf->oh.len; + holder->oh.data = conf->oh.data; + holder->l_offset = conf->fl.fl_start; + if (conf->fl.fl_end == OFFSET_MAX) + holder->l_len = 0; + else + holder->l_len = conf->fl.fl_end - conf->fl.fl_start + 1; + } + +out: + if (file) + nlm_release_file(file); + nlmsvc_release_host(host); + return resp->xdrgen.stat.stat == nlm__int__drop_reply ? 
+ rpc_drop_reply : rpc_success; } static __be32 @@ -557,15 +717,15 @@ static const struct svc_procedure nlm4svc_procedures[24] = { .pc_xdrressize = XDR_void, .pc_name = "NULL", }, - [NLMPROC_TEST] = { - .pc_func = nlm4svc_proc_test, - .pc_decode = nlm4svc_decode_testargs, - .pc_encode = nlm4svc_encode_testres, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_res), - .pc_xdrressize = Ck+St+2+No+Rg, - .pc_name = "TEST", + [NLMPROC4_TEST] = { + .pc_func = nlm4svc_proc_test, + .pc_decode = nlm4_svc_decode_nlm4_testargs, + .pc_encode = nlm4_svc_encode_nlm4_testres, + .pc_argsize = sizeof(struct nlm4_testargs_wrapper), + .pc_argzero = 0, + .pc_ressize = sizeof(struct nlm4_testres_wrapper), + .pc_xdrressize = NLM4_nlm4_testres_sz, + .pc_name = "TEST", }, [NLMPROC_LOCK] = { .pc_func = nlm4svc_proc_lock, @@ -793,6 +953,8 @@ static const struct svc_procedure nlm4svc_procedures[24] = { * Storage requirements for XDR arguments and results */ union nlm4svc_xdrstore { + struct nlm4_testargs_wrapper testargs; + struct nlm4_testres_wrapper testres; struct nlm_args args; struct nlm_res res; struct nlm_reboot reboot; From 6cb785ab81bcb76c6ac985f9684c1d9118154427 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:06:56 -0500 Subject: [PATCH 30/83] lockd: Use xdrgen XDR functions for the NLMv4 LOCK procedure Replace legacy XDR handling in the LOCK procedure with xdrgen- generated functions nlm4_svc_decode_nlm4_lockargs and nlm4_svc_encode_nlm4_res. The new nlm4svc_do_lock() handler replaces __nlm4svc_proc_lock() at the NLMPROC4_LOCK entry point. Wrapper structures bridge xdrgen types to the legacy nlm_lock representation used by core lockd. The nlm4svc_lookup_host() and nlm4svc_lookup_file() helpers from the TEST conversion handle host and file lookup. The pc_argzero field is set to zero: xdrgen-generated decoders initialize all fields in argp->xdrgen, so a defensive memset is unnecessary.
The wrapper's cookie and lock fields are cleared by nlm4svc_do_lock() before use. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 127 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 115 insertions(+), 12 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index b07ab4d60871..2cad72562ef2 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -40,6 +40,14 @@ struct nlm4_testargs_wrapper { static_assert(offsetof(struct nlm4_testargs_wrapper, xdrgen) == 0); +struct nlm4_lockargs_wrapper { + struct nlm4_lockargs xdrgen; + struct nlm_cookie cookie; + struct nlm_lock lock; +}; + +static_assert(offsetof(struct nlm4_lockargs_wrapper, xdrgen) == 0); + struct nlm4_testres_wrapper { struct nlm4_testres xdrgen; struct nlm_lock lock; @@ -47,6 +55,22 @@ struct nlm4_testres_wrapper { static_assert(offsetof(struct nlm4_testres_wrapper, xdrgen) == 0); +struct nlm4_res_wrapper { + struct nlm4_res xdrgen; +}; + +static_assert(offsetof(struct nlm4_res_wrapper, xdrgen) == 0); + +static __be32 +nlm4_netobj_to_cookie(struct nlm_cookie *cookie, netobj *object) +{ + if (object->len > NLM_MAXCOOKIELEN) + return nlm_lck_denied_nolocks; + cookie->len = object->len; + memcpy(cookie->data, object->data, object->len); + return nlm_granted; +} + static struct nlm_host * nlm4svc_lookup_host(struct svc_rqst *rqstp, string caller, bool monitored) { @@ -355,10 +379,88 @@ __nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_res *resp) return rc; } +static __be32 +nlm4svc_do_lock(struct svc_rqst *rqstp, bool monitored) +{ + struct nlm4_lockargs_wrapper *argp = rqstp->rq_argp; + unsigned char type = argp->xdrgen.exclusive ? 
F_WRLCK : F_RDLCK; + struct nlm4_res_wrapper *resp = rqstp->rq_resp; + struct nlm_file *file = NULL; + struct nlm_host *host = NULL; + + resp->xdrgen.cookie = argp->xdrgen.cookie; + + resp->xdrgen.stat.stat = nlm4_netobj_to_cookie(&argp->cookie, + &argp->xdrgen.cookie); + if (resp->xdrgen.stat.stat) + goto out; + + resp->xdrgen.stat.stat = nlm_lck_denied_nolocks; + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, + monitored); + if (!host) + goto out; + + resp->xdrgen.stat.stat = nlm4svc_lookup_file(rqstp, host, &argp->lock, + &file, &argp->xdrgen.alock, + type); + if (resp->xdrgen.stat.stat) + goto out; + + resp->xdrgen.stat.stat = nlmsvc_lock(rqstp, file, host, &argp->lock, + argp->xdrgen.block, &argp->cookie, + argp->xdrgen.reclaim); + if (resp->xdrgen.stat.stat == nlm__int__deadlock) + resp->xdrgen.stat.stat = nlm4_deadlock; + + nlmsvc_release_lockowner(&argp->lock); + +out: + if (file) + nlm_release_file(file); + nlmsvc_release_host(host); + return resp->xdrgen.stat.stat == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; +} + +/** + * nlm4svc_proc_lock - LOCK: Establish a monitored lock + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * %rpc_drop_reply: Do not send an RPC reply. + * + * RPC synopsis: + * nlm4_res NLMPROC4_LOCK(nlm4_lockargs) = 2; + * + * Permissible procedure status codes: + * %NLM4_GRANTED: The requested lock was granted. + * %NLM4_DENIED: The requested lock conflicted with existing + * lock reservations for the file. + * %NLM4_DENIED_NOLOCKS: The server could not allocate the resources + * needed to process the request. + * %NLM4_BLOCKED: The blocking request cannot be granted + * immediately. The server will send an + * NLMPROC4_GRANTED callback to the client when + * the lock can be granted. + * %NLM4_DENIED_GRACE_PERIOD: The server has recently restarted and is + * re-establishing existing locks, and is not + * yet ready to accept normal service requests. 
+ * + * The Linux NLM server implementation also returns: + * %NLM4_DEADLCK: The request could not be granted and + * blocking would cause a deadlock. + * %NLM4_STALE_FH: The request specified an invalid file handle. + * %NLM4_FBIG: The request specified a length or offset + * that exceeds the range supported by the + * server. + * %NLM4_FAILED: The request failed for an unspecified reason. + */ static __be32 nlm4svc_proc_lock(struct svc_rqst *rqstp) { - return __nlm4svc_proc_lock(rqstp, rqstp->rq_resp); + return nlm4svc_do_lock(rqstp, true); } static __be32 @@ -629,7 +731,7 @@ nlm4svc_proc_nm_lock(struct svc_rqst *rqstp) dprintk("lockd: NM_LOCK called\n"); argp->monitor = 0; /* just clean the monitor flag */ - return nlm4svc_proc_lock(rqstp); + return __nlm4svc_proc_lock(rqstp, rqstp->rq_resp); } /* @@ -727,15 +829,15 @@ static const struct svc_procedure nlm4svc_procedures[24] = { .pc_xdrressize = NLM4_nlm4_testres_sz, .pc_name = "TEST", }, - [NLMPROC_LOCK] = { - .pc_func = nlm4svc_proc_lock, - .pc_decode = nlm4svc_decode_lockargs, - .pc_encode = nlm4svc_encode_res, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_res), - .pc_xdrressize = Ck+St, - .pc_name = "LOCK", + [NLMPROC4_LOCK] = { + .pc_func = nlm4svc_proc_lock, + .pc_decode = nlm4_svc_decode_nlm4_lockargs, + .pc_encode = nlm4_svc_encode_nlm4_res, + .pc_argsize = sizeof(struct nlm4_lockargs_wrapper), + .pc_argzero = 0, + .pc_ressize = sizeof(struct nlm4_res_wrapper), + .pc_xdrressize = NLM4_nlm4_res_sz, + .pc_name = "LOCK", }, [NLMPROC_CANCEL] = { .pc_func = nlm4svc_proc_cancel, @@ -954,9 +1056,10 @@ static const struct svc_procedure nlm4svc_procedures[24] = { */ union nlm4svc_xdrstore { struct nlm4_testargs_wrapper testargs; + struct nlm4_lockargs_wrapper lockargs; struct nlm4_testres_wrapper testres; + struct nlm4_res_wrapper res; struct nlm_args args; - struct nlm_res res; struct nlm_reboot reboot; }; From 
496a0e971ace1fa68b0fd5d00b2706ac61e7bc6c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:06:57 -0500 Subject: [PATCH 31/83] lockd: Use xdrgen XDR functions for the NLMv4 CANCEL procedure The NLM CANCEL procedure allows clients to cancel outstanding blocked lock requests, completing the set of lock-related operations that share common lookup patterns. This patch continues the xdrgen migration by converting the CANCEL procedure, leveraging the same nlm4svc_lookup_host() and nlm4svc_lookup_file() helpers established in the TEST procedure conversion to maintain consistency across the series. This patch converts the CANCEL procedure to use xdrgen functions nlm4_svc_decode_nlm4_cancargs and nlm4_svc_encode_nlm4_res generated from the NLM version 4 protocol specification. The procedure handler uses xdrgen types through a wrapper structure that bridges between generated code and the legacy nlm_lock representation still used by the core lockd logic. The pc_argzero field is set to zero because xdrgen decoders reliably initialize all arguments in the argp->xdrgen field, making the early defensive memset unnecessary. Remaining argp fields are cleared as needed. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 86 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 76 insertions(+), 10 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 2cad72562ef2..4a3815599a65 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -48,6 +48,13 @@ struct nlm4_lockargs_wrapper { static_assert(offsetof(struct nlm4_lockargs_wrapper, xdrgen) == 0); +struct nlm4_cancargs_wrapper { + struct nlm4_cancargs xdrgen; + struct nlm_lock lock; +}; + +static_assert(offsetof(struct nlm4_cancargs_wrapper, xdrgen) == 0); + struct nlm4_testres_wrapper { struct nlm4_testres xdrgen; struct nlm_lock lock; @@ -495,10 +502,68 @@ __nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_res *resp) return rpc_success; } +/** + * nlm4svc_proc_cancel - CANCEL: Cancel an outstanding blocked lock request + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully + * %rpc_drop_reply: Do not send an RPC reply + * + * RPC synopsis: + * nlm4_res NLMPROC4_CANCEL(nlm4_cancargs) = 3; + * + * Permissible procedure status codes: + * %NLM4_GRANTED: The requested lock was canceled. + * %NLM4_DENIED: There was no lock to cancel. + * %NLM4_DENIED_GRACE_PERIOD: The server has recently restarted and is + * re-establishing existing locks, and is not + * yet ready to accept normal service requests. + * + * The Linux NLM server implementation also returns: + * %NLM4_DENIED_NOLOCKS: A needed resource could not be allocated. + * %NLM4_STALE_FH: The request specified an invalid file handle. + * %NLM4_FBIG: The request specified a length or offset + * that exceeds the range supported by the + * server. + * %NLM4_FAILED: The request failed for an unspecified reason.
+ */ static __be32 nlm4svc_proc_cancel(struct svc_rqst *rqstp) { - return __nlm4svc_proc_cancel(rqstp, rqstp->rq_resp); + struct nlm4_cancargs_wrapper *argp = rqstp->rq_argp; + unsigned char type = argp->xdrgen.exclusive ? F_WRLCK : F_RDLCK; + struct nlm4_res_wrapper *resp = rqstp->rq_resp; + struct net *net = SVC_NET(rqstp); + struct nlm_host *host = NULL; + struct nlm_file *file = NULL; + + resp->xdrgen.cookie = argp->xdrgen.cookie; + + resp->xdrgen.stat.stat = nlm_lck_denied_grace_period; + if (locks_in_grace(net)) + goto out; + + resp->xdrgen.stat.stat = nlm_lck_denied_nolocks; + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, false); + if (!host) + goto out; + + resp->xdrgen.stat.stat = nlm4svc_lookup_file(rqstp, host, &argp->lock, + &file, &argp->xdrgen.alock, + type); + if (resp->xdrgen.stat.stat) + goto out; + + resp->xdrgen.stat.stat = nlmsvc_cancel_blocked(net, file, &argp->lock); + nlmsvc_release_lockowner(&argp->lock); + +out: + if (file) + nlm_release_file(file); + nlmsvc_release_host(host); + return resp->xdrgen.stat.stat == nlm__int__drop_reply ? 
+ rpc_drop_reply : rpc_success; } /* @@ -839,15 +904,15 @@ static const struct svc_procedure nlm4svc_procedures[24] = { .pc_xdrressize = NLM4_nlm4_res_sz, .pc_name = "LOCK", }, - [NLMPROC_CANCEL] = { - .pc_func = nlm4svc_proc_cancel, - .pc_decode = nlm4svc_decode_cancargs, - .pc_encode = nlm4svc_encode_res, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_res), - .pc_xdrressize = Ck+St, - .pc_name = "CANCEL", + [NLMPROC4_CANCEL] = { + .pc_func = nlm4svc_proc_cancel, + .pc_decode = nlm4_svc_decode_nlm4_cancargs, + .pc_encode = nlm4_svc_encode_nlm4_res, + .pc_argsize = sizeof(struct nlm4_cancargs_wrapper), + .pc_argzero = 0, + .pc_ressize = sizeof(struct nlm4_res_wrapper), + .pc_xdrressize = NLM4_nlm4_res_sz, + .pc_name = "CANCEL", }, [NLMPROC_UNLOCK] = { .pc_func = nlm4svc_proc_unlock, @@ -1057,6 +1122,7 @@ static const struct svc_procedure nlm4svc_procedures[24] = { union nlm4svc_xdrstore { struct nlm4_testargs_wrapper testargs; struct nlm4_lockargs_wrapper lockargs; + struct nlm4_cancargs_wrapper cancargs; struct nlm4_testres_wrapper testres; struct nlm4_res_wrapper res; struct nlm_args args; From a2ac36e79b5db35367b1b269bf0a2aed9ba27be3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:06:58 -0500 Subject: [PATCH 32/83] lockd: Use xdrgen XDR functions for the NLMv4 UNLOCK procedure UNLOCK releases locks acquired via the LOCK procedure. Conversion of TEST, LOCK, CANCEL, and UNLOCK provides the complete set of lock lifecycle operations required by the NLM protocol, enabling clients to test for conflicts, acquire locks, abort pending lock requests, and release held locks. The procedure handler converts arguments from the xdrgen-generated nlm4_unlockargs structure to the legacy nlm_lock representation through nlm4_unlockargs_wrapper. This maintains compatibility with core lockd logic while using XDR decoders and encoders generated from the NLMv4 protocol specification. 
The original __nlm4svc_proc_unlock function is retained because the asynchronous callback path invokes it directly, bypassing the RPC dispatch mechanism. The pc_argzero field is zero because nlm4_svc_decode_nlm4_unlockargs initializes all fields in argp->xdrgen, eliminating the need for early memset of the argument buffer. Remaining argp fields outside the xdrgen structure are cleared explicitly where needed. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 84 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 74 insertions(+), 10 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4a3815599a65..de1a9cf416ec 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -55,6 +55,13 @@ struct nlm4_cancargs_wrapper { static_assert(offsetof(struct nlm4_cancargs_wrapper, xdrgen) == 0); +struct nlm4_unlockargs_wrapper { + struct nlm4_unlockargs xdrgen; + struct nlm_lock lock; +}; + +static_assert(offsetof(struct nlm4_unlockargs_wrapper, xdrgen) == 0); + struct nlm4_testres_wrapper { struct nlm4_testres xdrgen; struct nlm_lock lock; @@ -601,10 +608,66 @@ __nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_res *resp) return rpc_success; } +/** + * nlm4svc_proc_unlock - UNLOCK: Remove a lock + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * %rpc_drop_reply: Do not send an RPC reply. + * + * RPC synopsis: + * nlm4_res NLMPROC4_UNLOCK(nlm4_unlockargs) = 4; + * + * Permissible procedure status codes: + * %NLM4_GRANTED: The requested lock was released. + * %NLM4_DENIED_GRACE_PERIOD: The server has recently restarted and is + * re-establishing existing locks, and is not + * yet ready to accept normal service requests. + * + * The Linux NLM server implementation also returns: + * %NLM4_DENIED_NOLOCKS: A needed resource could not be allocated. + * %NLM4_STALE_FH: The request specified an invalid file handle. 
+ * %NLM4_FBIG: The request specified a length or offset + * that exceeds the range supported by the + * server. + * %NLM4_FAILED: The request failed for an unspecified reason. + */ static __be32 nlm4svc_proc_unlock(struct svc_rqst *rqstp) { - return __nlm4svc_proc_unlock(rqstp, rqstp->rq_resp); + struct nlm4_unlockargs_wrapper *argp = rqstp->rq_argp; + struct nlm4_res_wrapper *resp = rqstp->rq_resp; + struct net *net = SVC_NET(rqstp); + struct nlm_host *host = NULL; + struct nlm_file *file = NULL; + + resp->xdrgen.cookie = argp->xdrgen.cookie; + + resp->xdrgen.stat.stat = nlm_lck_denied_grace_period; + if (locks_in_grace(net)) + goto out; + + resp->xdrgen.stat.stat = nlm_lck_denied_nolocks; + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, false); + if (!host) + goto out; + + resp->xdrgen.stat.stat = nlm4svc_lookup_file(rqstp, host, &argp->lock, + &file, &argp->xdrgen.alock, + F_UNLCK); + if (resp->xdrgen.stat.stat) + goto out; + + resp->xdrgen.stat.stat = nlmsvc_unlock(net, file, &argp->lock); + nlmsvc_release_lockowner(&argp->lock); + +out: + if (file) + nlm_release_file(file); + nlmsvc_release_host(host); + return resp->xdrgen.stat.stat == nlm__int__drop_reply ? 
+ rpc_drop_reply : rpc_success; } /* @@ -914,15 +977,15 @@ static const struct svc_procedure nlm4svc_procedures[24] = { .pc_xdrressize = NLM4_nlm4_res_sz, .pc_name = "CANCEL", }, - [NLMPROC_UNLOCK] = { - .pc_func = nlm4svc_proc_unlock, - .pc_decode = nlm4svc_decode_unlockargs, - .pc_encode = nlm4svc_encode_res, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_res), - .pc_xdrressize = Ck+St, - .pc_name = "UNLOCK", + [NLMPROC4_UNLOCK] = { + .pc_func = nlm4svc_proc_unlock, + .pc_decode = nlm4_svc_decode_nlm4_unlockargs, + .pc_encode = nlm4_svc_encode_nlm4_res, + .pc_argsize = sizeof(struct nlm4_unlockargs_wrapper), + .pc_argzero = 0, + .pc_ressize = sizeof(struct nlm4_res_wrapper), + .pc_xdrressize = NLM4_nlm4_res_sz, + .pc_name = "UNLOCK", }, [NLMPROC_GRANTED] = { .pc_func = nlm4svc_proc_granted, @@ -1123,6 +1186,7 @@ union nlm4svc_xdrstore { struct nlm4_testargs_wrapper testargs; struct nlm4_lockargs_wrapper lockargs; struct nlm4_cancargs_wrapper cancargs; + struct nlm4_unlockargs_wrapper unlockargs; struct nlm4_testres_wrapper testres; struct nlm4_res_wrapper res; struct nlm_args args; From 8de56f61e2d2f2534620e2f8ffc32243c13e139e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:06:59 -0500 Subject: [PATCH 33/83] lockd: Use xdrgen XDR functions for the NLMv4 GRANTED procedure The NLM GRANTED procedure provides server-to-client notification when a previously blocked lock request has been granted, completing the asynchronous lock request flow. This patch completes the xdrgen migration for basic NLMv4 procedures by converting the GRANTED procedure, the final one in this conversion series. This patch converts the GRANTED procedure to use xdrgen functions nlm4_svc_decode_nlm4_testargs and nlm4_svc_encode_nlm4_res generated from the NLM version 4 protocol specification. 
The procedure handler uses xdrgen types through a wrapper structure that bridges between generated code and the legacy nlm_lock representation still used by the core lockd logic. A new helper function nlm4_lock_to_nlm_lock() is introduced to convert xdrgen nlm4_lock structures to the legacy nlm_lock format. This helper complements the existing nlm4svc_lookup_host() and nlm4svc_lookup_file() functions used throughout this series. The pc_argzero field is set to zero because xdrgen decoders reliably initialize all arguments in the argp->xdrgen field, making the early defensive memset unnecessary. Remaining argp fields are cleared as needed. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 66 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 56 insertions(+), 10 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index de1a9cf416ec..2e1a0392d68a 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -85,6 +85,21 @@ nlm4_netobj_to_cookie(struct nlm_cookie *cookie, netobj *object) return nlm_granted; } +static __be32 +nlm4_lock_to_nlm_lock(struct nlm_lock *lock, struct nlm4_lock *alock) +{ + if (alock->fh.len > NFS_MAXFHSIZE) + return nlm_lck_denied; + lock->fh.size = alock->fh.len; + memcpy(lock->fh.data, alock->fh.data, alock->fh.len); + lock->oh.len = alock->oh.len; + lock->oh.data = alock->oh.data; + lock->svid = alock->svid; + locks_init_lock(&lock->fl); + lockd_set_file_lock_range4(&lock->fl, alock->l_offset, alock->l_len); + return nlm_granted; +} + static struct nlm_host * nlm4svc_lookup_host(struct svc_rqst *rqstp, string caller, bool monitored) { @@ -687,10 +702,41 @@ __nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_res *resp) return rpc_success; } +/** + * nlm4svc_proc_granted - GRANTED: Server grants a previously blocked lock + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. 
+ * + * RPC synopsis: + * nlm4_res NLMPROC4_GRANTED(nlm4_testargs) = 5; + * + * Permissible procedure status codes: + * %NLM4_GRANTED: The requested lock was granted. + * %NLM4_DENIED: The request could not be processed (for + * example, it specified an invalid file handle). + * %NLM4_DENIED_GRACE_PERIOD: The server has recently restarted and is + * re-establishing existing locks, and is not + * yet ready to accept normal service requests. + */ static __be32 nlm4svc_proc_granted(struct svc_rqst *rqstp) { - return __nlm4svc_proc_granted(rqstp, rqstp->rq_resp); + struct nlm4_testargs_wrapper *argp = rqstp->rq_argp; + struct nlm4_res_wrapper *resp = rqstp->rq_resp; + + resp->xdrgen.cookie = argp->xdrgen.cookie; + + resp->xdrgen.stat.stat = nlm4_lock_to_nlm_lock(&argp->lock, + &argp->xdrgen.alock); + if (resp->xdrgen.stat.stat) + goto out; + + resp->xdrgen.stat.stat = nlmclnt_grant(svc_addr(rqstp), &argp->lock); + +out: + return rpc_success; } /* @@ -987,15 +1033,15 @@ static const struct svc_procedure nlm4svc_procedures[24] = { .pc_xdrressize = NLM4_nlm4_res_sz, .pc_name = "UNLOCK", }, - [NLMPROC_GRANTED] = { - .pc_func = nlm4svc_proc_granted, - .pc_decode = nlm4svc_decode_testargs, - .pc_encode = nlm4svc_encode_res, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_res), - .pc_xdrressize = Ck+St, - .pc_name = "GRANTED", + [NLMPROC4_GRANTED] = { + .pc_func = nlm4svc_proc_granted, + .pc_decode = nlm4_svc_decode_nlm4_testargs, + .pc_encode = nlm4_svc_encode_nlm4_res, + .pc_argsize = sizeof(struct nlm4_testargs_wrapper), + .pc_argzero = 0, + .pc_ressize = sizeof(struct nlm4_res_wrapper), + .pc_xdrressize = NLM4_nlm4_res_sz, + .pc_name = "GRANTED", }, [NLMPROC_TEST_MSG] = { .pc_func = nlm4svc_proc_test_msg, From 3086ad11ab6ca4a95d7d65b87c40b8cbb60921a0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:07:00 -0500 Subject: [PATCH 34/83] lockd: Refactor nlm4svc_callback() The xdrgen-based XDR
conversion requires each RPC procedure to handle its own argument extraction, since xdrgen generates distinct argument structures for each procedure rather than using a single shared type. This patch moves the host lookup logic from nlm4svc_callback() into each of the five MSG procedure handlers (TEST_MSG, LOCK_MSG, CANCEL_MSG, UNLOCK_MSG, and GRANTED_MSG). Each handler now performs its own host lookup from rqstp->rq_argp and passes the resulting host pointer to nlm4svc_callback(), which is reduced to a simpler helper that only dispatches the callback. This refactoring enables the subsequent xdrgen conversion patches by establishing the pattern where each procedure handles its own argument extraction, while preserving existing callback behavior unchanged. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 80 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 62 insertions(+), 18 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 2e1a0392d68a..f1a692f72a39 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -757,24 +757,17 @@ static const struct rpc_call_ops nlm4svc_callback_ops = { }; /* - * `Async' versions of the above service routines. They aren't really, - * because we send the callback before the reply proper. I hope this - * doesn't break any clients. + * Dispatch an async callback RPC to a client with a pre-resolved host. + * Caller provides a reference to @host; this function takes ownership + * and releases it via nlmsvc_release_host() before returning. 
*/ -static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, - __be32 (*func)(struct svc_rqst *, struct nlm_res *)) +static __be32 +nlm4svc_callback(struct svc_rqst *rqstp, struct nlm_host *host, u32 proc, + __be32 (*func)(struct svc_rqst *, struct nlm_res *)) { - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_host *host; struct nlm_rqst *call; __be32 stat; - host = nlmsvc_lookup_host(rqstp, - argp->lock.caller, - argp->lock.len); - if (host == NULL) - return rpc_system_err; - call = nlm_alloc_call(host); nlmsvc_release_host(host); if (call == NULL) @@ -792,34 +785,85 @@ static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, return rpc_success; } +/* + * 'Async' versions of the above service routines. They aren't really, + * because we send the callback before the reply proper. I hope this + * doesn't break any clients. + */ + static __be32 nlm4svc_proc_test_msg(struct svc_rqst *rqstp) { + struct nlm_args *argp = rqstp->rq_argp; + struct nlm_host *host; + dprintk("lockd: TEST_MSG called\n"); - return nlm4svc_callback(rqstp, NLMPROC_TEST_RES, __nlm4svc_proc_test); + + host = nlmsvc_lookup_host(rqstp, argp->lock.caller, argp->lock.len); + if (!host) + return rpc_system_err; + + return nlm4svc_callback(rqstp, host, NLMPROC_TEST_RES, + __nlm4svc_proc_test); } static __be32 nlm4svc_proc_lock_msg(struct svc_rqst *rqstp) { + struct nlm_args *argp = rqstp->rq_argp; + struct nlm_host *host; + dprintk("lockd: LOCK_MSG called\n"); - return nlm4svc_callback(rqstp, NLMPROC_LOCK_RES, __nlm4svc_proc_lock); + + host = nlmsvc_lookup_host(rqstp, argp->lock.caller, argp->lock.len); + if (!host) + return rpc_system_err; + + return nlm4svc_callback(rqstp, host, NLMPROC_LOCK_RES, + __nlm4svc_proc_lock); } static __be32 nlm4svc_proc_cancel_msg(struct svc_rqst *rqstp) { + struct nlm_args *argp = rqstp->rq_argp; + struct nlm_host *host; + dprintk("lockd: CANCEL_MSG called\n"); - return nlm4svc_callback(rqstp, NLMPROC_CANCEL_RES, __nlm4svc_proc_cancel); + + host = 
nlmsvc_lookup_host(rqstp, argp->lock.caller, argp->lock.len); + if (!host) + return rpc_system_err; + + return nlm4svc_callback(rqstp, host, NLMPROC_CANCEL_RES, + __nlm4svc_proc_cancel); } static __be32 nlm4svc_proc_unlock_msg(struct svc_rqst *rqstp) { + struct nlm_args *argp = rqstp->rq_argp; + struct nlm_host *host; + dprintk("lockd: UNLOCK_MSG called\n"); - return nlm4svc_callback(rqstp, NLMPROC_UNLOCK_RES, __nlm4svc_proc_unlock); + + host = nlmsvc_lookup_host(rqstp, argp->lock.caller, argp->lock.len); + if (!host) + return rpc_system_err; + + return nlm4svc_callback(rqstp, host, NLMPROC_UNLOCK_RES, + __nlm4svc_proc_unlock); } static __be32 nlm4svc_proc_granted_msg(struct svc_rqst *rqstp) { + struct nlm_args *argp = rqstp->rq_argp; + struct nlm_host *host; + dprintk("lockd: GRANTED_MSG called\n"); - return nlm4svc_callback(rqstp, NLMPROC_GRANTED_RES, __nlm4svc_proc_granted); + + host = nlmsvc_lookup_host(rqstp, argp->lock.caller, argp->lock.len); + if (!host) + return rpc_system_err; + + return nlm4svc_callback(rqstp, host, NLMPROC_GRANTED_RES, + __nlm4svc_proc_granted); } /* From 331f2b6acb409a87105f4b0247a76e84d9472566 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:07:01 -0500 Subject: [PATCH 35/83] lockd: Use xdrgen XDR functions for the NLMv4 TEST_MSG procedure The TEST_MSG procedure is part of NLM's asynchronous lock request flow, where clients send TEST_MSG to check lock availability without blocking. This patch continues the xdrgen migration by converting TEST_MSG to use generated XDR functions. This patch converts the TEST_MSG procedure to use xdrgen functions nlm4_svc_decode_nlm4_testargs and nlm4_svc_encode_void generated from the NLM version 4 protocol specification. The procedure handler uses xdrgen types through the nlm4_testargs_wrapper structure that bridges between generated code and the legacy nlm_lock representation. 
The pc_argzero field is set to zero because xdrgen decoders reliably initialize all arguments in the argp->xdrgen field, making the early defensive memset unnecessary. Remaining argp fields are cleared as needed. The NLM async callback mechanism uses client-side functions which continue to take legacy results like struct nlm_res, preventing TEST and TEST_MSG from sharing code for now. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 114 +++++++++++++++++++++++--------------------- 1 file changed, 60 insertions(+), 54 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index f1a692f72a39..afce778b62d3 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -260,39 +260,6 @@ nlm4svc_proc_null(struct svc_rqst *rqstp) return rpc_success; } -/* - * TEST: Check for conflicting lock - */ -static __be32 -__nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp) -{ - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_host *host; - struct nlm_file *file; - __be32 rc = rpc_success; - - dprintk("lockd: TEST4 called\n"); - resp->cookie = argp->cookie; - - /* Obtain client and file */ - if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm__int__drop_reply ? - rpc_drop_reply : rpc_success; - - /* Now check for conflicting locks */ - resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, - &resp->lock); - if (resp->status == nlm__int__drop_reply) - rc = rpc_drop_reply; - else - dprintk("lockd: TEST4 status %d\n", ntohl(resp->status)); - - nlmsvc_release_lockowner(&argp->lock); - nlmsvc_release_host(host); - nlm_release_file(file); - return rc; -} - /** * nlm4svc_proc_test - TEST: Check for conflicting lock * @rqstp: RPC transaction context @@ -785,25 +752,64 @@ nlm4svc_callback(struct svc_rqst *rqstp, struct nlm_host *host, u32 proc, return rpc_success; } -/* - * 'Async' versions of the above service routines. 
They aren't really, - * because we send the callback before the reply proper. I hope this - * doesn't break any clients. - */ +static __be32 +__nlm4svc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_res *resp) +{ + struct nlm4_testargs_wrapper *argp = rqstp->rq_argp; + unsigned char type = argp->xdrgen.exclusive ? F_WRLCK : F_RDLCK; + struct nlm_lockowner *owner; + struct nlm_file *file = NULL; + struct nlm_host *host = NULL; + resp->status = nlm_lck_denied_nolocks; + if (nlm4_netobj_to_cookie(&resp->cookie, &argp->xdrgen.cookie)) + goto out; + + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, false); + if (!host) + goto out; + + resp->status = nlm4svc_lookup_file(rqstp, host, &argp->lock, + &file, &argp->xdrgen.alock, type); + if (resp->status) + goto out; + + owner = argp->lock.fl.c.flc_owner; + resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, + &resp->lock); + nlmsvc_put_lockowner(owner); + +out: + if (file) + nlm_release_file(file); + nlmsvc_release_host(host); + return resp->status == nlm__int__drop_reply ? rpc_drop_reply : rpc_success; +} + +/** + * nlm4svc_proc_test_msg - TEST_MSG: Check for conflicting lock + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * %rpc_system_err: RPC execution failed. + * + * RPC synopsis: + * void NLMPROC4_TEST_MSG(nlm4_testargs) = 6; + * + * The response to this request is delivered via the TEST_RES procedure. 
+ */ static __be32 nlm4svc_proc_test_msg(struct svc_rqst *rqstp) { - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_host *host; + struct nlm4_testargs_wrapper *argp = rqstp->rq_argp; + struct nlm_host *host; - dprintk("lockd: TEST_MSG called\n"); - - host = nlmsvc_lookup_host(rqstp, argp->lock.caller, argp->lock.len); + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, false); if (!host) return rpc_system_err; - return nlm4svc_callback(rqstp, host, NLMPROC_TEST_RES, - __nlm4svc_proc_test); + return nlm4svc_callback(rqstp, host, NLMPROC4_TEST_RES, + __nlm4svc_proc_test_msg); } static __be32 nlm4svc_proc_lock_msg(struct svc_rqst *rqstp) @@ -1087,15 +1093,15 @@ static const struct svc_procedure nlm4svc_procedures[24] = { .pc_xdrressize = NLM4_nlm4_res_sz, .pc_name = "GRANTED", }, - [NLMPROC_TEST_MSG] = { - .pc_func = nlm4svc_proc_test_msg, - .pc_decode = nlm4svc_decode_testargs, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "TEST_MSG", + [NLMPROC4_TEST_MSG] = { + .pc_func = nlm4svc_proc_test_msg, + .pc_decode = nlm4_svc_decode_nlm4_testargs, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = sizeof(struct nlm4_testargs_wrapper), + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "TEST_MSG", }, [NLMPROC_LOCK_MSG] = { .pc_func = nlm4svc_proc_lock_msg, From b2be4e28c23a47b3d4bd87ce1caacdbc4606d087 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:07:02 -0500 Subject: [PATCH 36/83] lockd: Use xdrgen XDR functions for the NLMv4 LOCK_MSG procedure The LOCK_MSG procedure is part of NLM's asynchronous lock request flow, where clients send LOCK_MSG to request locks that may block. This patch continues the xdrgen migration by converting LOCK_MSG to use generated XDR functions. 
This patch converts the LOCK_MSG procedure to use xdrgen functions nlm4_svc_decode_nlm4_lockargs and nlm4_svc_encode_void generated from the NLM version 4 protocol specification. The procedure handler uses xdrgen types through the nlm4_lockargs_wrapper structure that bridges between generated code and the legacy nlm_lock representation. The pc_argzero field is set to zero because xdrgen decoders reliably initialize all arguments in the argp->xdrgen field, making the early defensive memset unnecessary. Remaining argp fields are cleared as needed. The NLM async callback mechanism uses client-side functions which continue to take legacy results like struct nlm_res, preventing LOCK and LOCK_MSG from sharing code for now. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 77 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 61 insertions(+), 16 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index afce778b62d3..d9406a4ab176 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -812,19 +812,64 @@ static __be32 nlm4svc_proc_test_msg(struct svc_rqst *rqstp) __nlm4svc_proc_test_msg); } +static __be32 +__nlm4svc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_res *resp) +{ + struct nlm4_lockargs_wrapper *argp = rqstp->rq_argp; + unsigned char type = argp->xdrgen.exclusive ? 
F_WRLCK : F_RDLCK; + struct nlm_file *file = NULL; + struct nlm_host *host = NULL; + + resp->status = nlm_lck_denied_nolocks; + if (nlm4_netobj_to_cookie(&resp->cookie, &argp->xdrgen.cookie)) + goto out; + + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, true); + if (!host) + goto out; + + resp->status = nlm4svc_lookup_file(rqstp, host, &argp->lock, + &file, &argp->xdrgen.alock, type); + if (resp->status) + goto out; + + resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock, + argp->xdrgen.block, &resp->cookie, + argp->xdrgen.reclaim); + nlmsvc_release_lockowner(&argp->lock); + +out: + if (file) + nlm_release_file(file); + nlmsvc_release_host(host); + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; +} + +/** + * nlm4svc_proc_lock_msg - LOCK_MSG: Establish a monitored lock + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * %rpc_system_err: RPC execution failed. + * + * RPC synopsis: + * void NLMPROC4_LOCK_MSG(nlm4_lockargs) = 7; + * + * The response to this request is delivered via the LOCK_RES procedure. 
+ */ static __be32 nlm4svc_proc_lock_msg(struct svc_rqst *rqstp) { - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_host *host; + struct nlm4_lockargs_wrapper *argp = rqstp->rq_argp; + struct nlm_host *host; - dprintk("lockd: LOCK_MSG called\n"); - - host = nlmsvc_lookup_host(rqstp, argp->lock.caller, argp->lock.len); + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, true); if (!host) return rpc_system_err; - return nlm4svc_callback(rqstp, host, NLMPROC_LOCK_RES, - __nlm4svc_proc_lock); + return nlm4svc_callback(rqstp, host, NLMPROC4_LOCK_RES, + __nlm4svc_proc_lock_msg); } static __be32 nlm4svc_proc_cancel_msg(struct svc_rqst *rqstp) @@ -1103,15 +1148,15 @@ static const struct svc_procedure nlm4svc_procedures[24] = { .pc_xdrressize = XDR_void, .pc_name = "TEST_MSG", }, - [NLMPROC_LOCK_MSG] = { - .pc_func = nlm4svc_proc_lock_msg, - .pc_decode = nlm4svc_decode_lockargs, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "LOCK_MSG", + [NLMPROC4_LOCK_MSG] = { + .pc_func = nlm4svc_proc_lock_msg, + .pc_decode = nlm4_svc_decode_nlm4_lockargs, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = sizeof(struct nlm4_lockargs_wrapper), + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "LOCK_MSG", }, [NLMPROC_CANCEL_MSG] = { .pc_func = nlm4svc_proc_cancel_msg, From dea5b7ac0e9beabc5d4f54cec629e1dce9e69c5e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:07:03 -0500 Subject: [PATCH 37/83] lockd: Use xdrgen XDR functions for the NLMv4 CANCEL_MSG procedure The CANCEL_MSG procedure is part of NLM's asynchronous lock request flow, where clients send CANCEL_MSG to cancel pending lock requests. This patch continues the xdrgen migration by converting CANCEL_MSG to use generated XDR functions. 
This patch converts the CANCEL_MSG procedure to use xdrgen functions nlm4_svc_decode_nlm4_cancargs and nlm4_svc_encode_void generated from the NLM version 4 protocol specification. The procedure handler uses xdrgen types through the nlm4_cancargs_wrapper structure that bridges between generated code and the legacy nlm_lock representation. The pc_argzero field is set to zero because xdrgen decoders reliably initialize all arguments in the argp->xdrgen field, making the early defensive memset unnecessary. Remaining argp fields are cleared as needed. The NLM async callback mechanism uses client-side functions which continue to take legacy results like struct nlm_res, preventing CANCEL and CANCEL_MSG from sharing code for now. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 113 +++++++++++++++++++++++++------------------- 1 file changed, 65 insertions(+), 48 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index d9406a4ab176..01e21b0a8956 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -459,38 +459,6 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp) return nlm4svc_do_lock(rqstp, true); } -static __be32 -__nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_res *resp) -{ - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_host *host; - struct nlm_file *file; - - dprintk("lockd: CANCEL called\n"); - - resp->cookie = argp->cookie; - - /* Don't accept requests during grace period */ - if (locks_in_grace(SVC_NET(rqstp))) { - resp->status = nlm_lck_denied_grace_period; - return rpc_success; - } - - /* Obtain client and file */ - if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm__int__drop_reply ? - rpc_drop_reply : rpc_success; - - /* Try to cancel request. 
*/ - resp->status = nlmsvc_cancel_blocked(SVC_NET(rqstp), file, &argp->lock); - - dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); - nlmsvc_release_lockowner(&argp->lock); - nlmsvc_release_host(host); - nlm_release_file(file); - return rpc_success; -} - /** * nlm4svc_proc_cancel - CANCEL: Cancel an outstanding blocked lock request * @rqstp: RPC transaction context @@ -872,19 +840,68 @@ static __be32 nlm4svc_proc_lock_msg(struct svc_rqst *rqstp) __nlm4svc_proc_lock_msg); } +static __be32 +__nlm4svc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_res *resp) +{ + struct nlm4_cancargs_wrapper *argp = rqstp->rq_argp; + unsigned char type = argp->xdrgen.exclusive ? F_WRLCK : F_RDLCK; + struct net *net = SVC_NET(rqstp); + struct nlm_file *file = NULL; + struct nlm_host *host = NULL; + + resp->status = nlm_lck_denied_nolocks; + if (nlm4_netobj_to_cookie(&resp->cookie, &argp->xdrgen.cookie)) + goto out; + + resp->status = nlm_lck_denied_grace_period; + if (locks_in_grace(net)) + goto out; + + resp->status = nlm_lck_denied_nolocks; + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, false); + if (!host) + goto out; + + resp->status = nlm4svc_lookup_file(rqstp, host, &argp->lock, + &file, &argp->xdrgen.alock, type); + if (resp->status) + goto out; + + resp->status = nlmsvc_cancel_blocked(net, file, &argp->lock); + nlmsvc_release_lockowner(&argp->lock); + +out: + if (file) + nlm_release_file(file); + nlmsvc_release_host(host); + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; +} + +/** + * nlm4svc_proc_cancel_msg - CANCEL_MSG: Cancel an outstanding lock request + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * %rpc_system_err: RPC execution failed. + * + * RPC synopsis: + * void NLMPROC4_CANCEL_MSG(nlm4_cancargs) = 8; + * + * The response to this request is delivered via the CANCEL_RES procedure. 
+ */ static __be32 nlm4svc_proc_cancel_msg(struct svc_rqst *rqstp) { - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_host *host; + struct nlm4_cancargs_wrapper *argp = rqstp->rq_argp; + struct nlm_host *host; - dprintk("lockd: CANCEL_MSG called\n"); - - host = nlmsvc_lookup_host(rqstp, argp->lock.caller, argp->lock.len); + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, false); if (!host) return rpc_system_err; - return nlm4svc_callback(rqstp, host, NLMPROC_CANCEL_RES, - __nlm4svc_proc_cancel); + return nlm4svc_callback(rqstp, host, NLMPROC4_CANCEL_RES, + __nlm4svc_proc_cancel_msg); } static __be32 nlm4svc_proc_unlock_msg(struct svc_rqst *rqstp) @@ -1158,15 +1175,15 @@ static const struct svc_procedure nlm4svc_procedures[24] = { .pc_xdrressize = XDR_void, .pc_name = "LOCK_MSG", }, - [NLMPROC_CANCEL_MSG] = { - .pc_func = nlm4svc_proc_cancel_msg, - .pc_decode = nlm4svc_decode_cancargs, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "CANCEL_MSG", + [NLMPROC4_CANCEL_MSG] = { + .pc_func = nlm4svc_proc_cancel_msg, + .pc_decode = nlm4_svc_decode_nlm4_cancargs, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = sizeof(struct nlm4_cancargs_wrapper), + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "CANCEL_MSG", }, [NLMPROC_UNLOCK_MSG] = { .pc_func = nlm4svc_proc_unlock_msg, From eff7d82f89afc8b4fd133055a28ffd7158f8947a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:07:04 -0500 Subject: [PATCH 38/83] lockd: Use xdrgen XDR functions for the NLMv4 UNLOCK_MSG procedure Convert the UNLOCK_MSG procedure to use xdrgen functions nlm4_svc_decode_nlm4_unlockargs and nlm4_svc_encode_void. The procedure handler uses the nlm4_unlockargs_wrapper structure that bridges between xdrgen types and the legacy nlm_lock representation. 
The pc_argzero field is set to zero because xdrgen decoders reliably initialize all arguments, making the early defensive memset unnecessary. The NLM async callback mechanism uses client-side functions which continue to take legacy struct nlm_res, preventing UNLOCK and UNLOCK_MSG from sharing code for now. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 115 ++++++++++++++++++++++++-------------------- 1 file changed, 64 insertions(+), 51 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 01e21b0a8956..c42c641dc5b6 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -523,41 +523,6 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp) rpc_drop_reply : rpc_success; } -/* - * UNLOCK: release a lock - */ -static __be32 -__nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_res *resp) -{ - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_host *host; - struct nlm_file *file; - - dprintk("lockd: UNLOCK called\n"); - - resp->cookie = argp->cookie; - - /* Don't accept new lock requests during grace period */ - if (locks_in_grace(SVC_NET(rqstp))) { - resp->status = nlm_lck_denied_grace_period; - return rpc_success; - } - - /* Obtain client and file */ - if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm__int__drop_reply ? 
- rpc_drop_reply : rpc_success; - - /* Now try to remove the lock */ - resp->status = nlmsvc_unlock(SVC_NET(rqstp), file, &argp->lock); - - dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); - nlmsvc_release_lockowner(&argp->lock); - nlmsvc_release_host(host); - nlm_release_file(file); - return rpc_success; -} - /** * nlm4svc_proc_unlock - UNLOCK: Remove a lock * @rqstp: RPC transaction context @@ -904,19 +869,67 @@ static __be32 nlm4svc_proc_cancel_msg(struct svc_rqst *rqstp) __nlm4svc_proc_cancel_msg); } +static __be32 +__nlm4svc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_res *resp) +{ + struct nlm4_unlockargs_wrapper *argp = rqstp->rq_argp; + struct net *net = SVC_NET(rqstp); + struct nlm_file *file = NULL; + struct nlm_host *host = NULL; + + resp->status = nlm_lck_denied_nolocks; + if (nlm4_netobj_to_cookie(&resp->cookie, &argp->xdrgen.cookie)) + goto out; + + resp->status = nlm_lck_denied_grace_period; + if (locks_in_grace(net)) + goto out; + + resp->status = nlm_lck_denied_nolocks; + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, false); + if (!host) + goto out; + + resp->status = nlm4svc_lookup_file(rqstp, host, &argp->lock, + &file, &argp->xdrgen.alock, F_UNLCK); + if (resp->status) + goto out; + + resp->status = nlmsvc_unlock(net, file, &argp->lock); + nlmsvc_release_lockowner(&argp->lock); + +out: + if (file) + nlm_release_file(file); + nlmsvc_release_host(host); + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; +} + +/** + * nlm4svc_proc_unlock_msg - UNLOCK_MSG: Remove an existing lock + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * %rpc_system_err: RPC execution failed. + * + * RPC synopsis: + * void NLMPROC4_UNLOCK_MSG(nlm4_unlockargs) = 9; + * + * The response to this request is delivered via the UNLOCK_RES procedure. 
+ */ static __be32 nlm4svc_proc_unlock_msg(struct svc_rqst *rqstp) { - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_host *host; + struct nlm4_unlockargs_wrapper *argp = rqstp->rq_argp; + struct nlm_host *host; - dprintk("lockd: UNLOCK_MSG called\n"); - - host = nlmsvc_lookup_host(rqstp, argp->lock.caller, argp->lock.len); + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, false); if (!host) return rpc_system_err; - return nlm4svc_callback(rqstp, host, NLMPROC_UNLOCK_RES, - __nlm4svc_proc_unlock); + return nlm4svc_callback(rqstp, host, NLMPROC4_UNLOCK_RES, + __nlm4svc_proc_unlock_msg); } static __be32 nlm4svc_proc_granted_msg(struct svc_rqst *rqstp) @@ -1185,15 +1198,15 @@ static const struct svc_procedure nlm4svc_procedures[24] = { .pc_xdrressize = XDR_void, .pc_name = "CANCEL_MSG", }, - [NLMPROC_UNLOCK_MSG] = { - .pc_func = nlm4svc_proc_unlock_msg, - .pc_decode = nlm4svc_decode_unlockargs, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "UNLOCK_MSG", + [NLMPROC4_UNLOCK_MSG] = { + .pc_func = nlm4svc_proc_unlock_msg, + .pc_decode = nlm4_svc_decode_nlm4_unlockargs, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = sizeof(struct nlm4_unlockargs_wrapper), + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "UNLOCK_MSG", }, [NLMPROC_GRANTED_MSG] = { .pc_func = nlm4svc_proc_granted_msg, From 62721885e8610425403bc17e358b67db466e3cf8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:07:05 -0500 Subject: [PATCH 39/83] lockd: Use xdrgen XDR functions for the NLMv4 GRANTED_MSG procedure Convert the GRANTED_MSG procedure to use xdrgen functions nlm4_svc_decode_nlm4_testargs and nlm4_svc_encode_void. The procedure handler uses the nlm4_testargs_wrapper structure that bridges between xdrgen types and the legacy nlm_lock representation. 
The pc_argzero field is set to zero because xdrgen decoders reliably initialize all arguments, making the early defensive memset unnecessary. The NLM async callback mechanism uses client-side functions which continue to take legacy struct nlm_res, preventing GRANTED and GRANTED_MSG from sharing code for now. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 78 ++++++++++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 33 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index c42c641dc5b6..306ecc21154e 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -585,23 +585,6 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp) rpc_drop_reply : rpc_success; } -/* - * GRANTED: A server calls us to tell that a process' lock request - * was granted - */ -static __be32 -__nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_res *resp) -{ - struct nlm_args *argp = rqstp->rq_argp; - - resp->cookie = argp->cookie; - - dprintk("lockd: GRANTED called\n"); - resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock); - dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); - return rpc_success; -} - /** * nlm4svc_proc_granted - GRANTED: Server grants a previously blocked lock * @rqstp: RPC transaction context @@ -932,19 +915,48 @@ static __be32 nlm4svc_proc_unlock_msg(struct svc_rqst *rqstp) __nlm4svc_proc_unlock_msg); } +static __be32 +__nlm4svc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_res *resp) +{ + struct nlm4_testargs_wrapper *argp = rqstp->rq_argp; + + resp->status = nlm_lck_denied; + if (nlm4_netobj_to_cookie(&resp->cookie, &argp->xdrgen.cookie)) + goto out; + + if (nlm4_lock_to_nlm_lock(&argp->lock, &argp->xdrgen.alock)) + goto out; + + resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock); + +out: + return rpc_success; +} + +/** + * nlm4svc_proc_granted_msg - GRANTED_MSG: Blocked lock has been granted + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC 
executed successfully. + * %rpc_system_err: RPC execution failed. + * + * RPC synopsis: + * void NLMPROC4_GRANTED_MSG(nlm4_testargs) = 10; + * + * The response to this request is delivered via the GRANTED_RES procedure. + */ static __be32 nlm4svc_proc_granted_msg(struct svc_rqst *rqstp) { - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_host *host; + struct nlm4_testargs_wrapper *argp = rqstp->rq_argp; + struct nlm_host *host; - dprintk("lockd: GRANTED_MSG called\n"); - - host = nlmsvc_lookup_host(rqstp, argp->lock.caller, argp->lock.len); + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, false); if (!host) return rpc_system_err; - return nlm4svc_callback(rqstp, host, NLMPROC_GRANTED_RES, - __nlm4svc_proc_granted); + return nlm4svc_callback(rqstp, host, NLMPROC4_GRANTED_RES, + __nlm4svc_proc_granted_msg); } /* @@ -1208,15 +1220,15 @@ static const struct svc_procedure nlm4svc_procedures[24] = { .pc_xdrressize = XDR_void, .pc_name = "UNLOCK_MSG", }, - [NLMPROC_GRANTED_MSG] = { - .pc_func = nlm4svc_proc_granted_msg, - .pc_decode = nlm4svc_decode_testargs, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "GRANTED_MSG", + [NLMPROC4_GRANTED_MSG] = { + .pc_func = nlm4svc_proc_granted_msg, + .pc_decode = nlm4_svc_decode_nlm4_testargs, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = sizeof(struct nlm4_testargs_wrapper), + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "GRANTED_MSG", }, [NLMPROC_TEST_RES] = { .pc_func = nlm4svc_proc_null, From 4764124811717650ecdef7a121768522683cafd7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:07:06 -0500 Subject: [PATCH 40/83] lockd: Use xdrgen XDR functions for the NLMv4 TEST_RES procedure Convert the TEST_RES procedure to use xdrgen functions nlm4_svc_decode_nlm4_testres and nlm4_svc_encode_void. 
TEST_RES is a callback procedure where the server sends test lock results back to the client after an async TEST_MSG request. The pc_argzero field is set to zero because xdrgen decoders reliably initialize all arguments, making the early defensive memset unnecessary. This change also corrects the pc_xdrressize field, which previously contained a placeholder value. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 306ecc21154e..6b391ec49341 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -1230,15 +1230,15 @@ static const struct svc_procedure nlm4svc_procedures[24] = { .pc_xdrressize = XDR_void, .pc_name = "GRANTED_MSG", }, - [NLMPROC_TEST_RES] = { - .pc_func = nlm4svc_proc_null, - .pc_decode = nlm4svc_decode_void, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_res), - .pc_argzero = sizeof(struct nlm_res), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "TEST_RES", + [NLMPROC4_TEST_RES] = { + .pc_func = nlm4svc_proc_null, + .pc_decode = nlm4_svc_decode_nlm4_testres, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = sizeof(struct nlm4_testres), + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "TEST_RES", }, [NLMPROC_LOCK_RES] = { .pc_func = nlm4svc_proc_null, From 50976ab9792af23f9a8672e415ca0d0bc93bd9d3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:07:07 -0500 Subject: [PATCH 41/83] lockd: Use xdrgen XDR functions for the NLMv4 LOCK_RES procedure Convert the LOCK_RES procedure to use xdrgen functions nlm4_svc_decode_nlm4_res and nlm4_svc_encode_void. LOCK_RES is a callback procedure where the server sends lock results back to the client after an async LOCK_MSG request. 
The pc_argzero field is set to zero because xdrgen decoders reliably initialize all arguments, making the early defensive memset unnecessary. This change also corrects the pc_xdrressize field, which previously contained a placeholder value. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 6b391ec49341..c5f21fc2228c 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -1240,15 +1240,15 @@ static const struct svc_procedure nlm4svc_procedures[24] = { .pc_xdrressize = XDR_void, .pc_name = "TEST_RES", }, - [NLMPROC_LOCK_RES] = { - .pc_func = nlm4svc_proc_null, - .pc_decode = nlm4svc_decode_void, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_res), - .pc_argzero = sizeof(struct nlm_res), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "LOCK_RES", + [NLMPROC4_LOCK_RES] = { + .pc_func = nlm4svc_proc_null, + .pc_decode = nlm4_svc_decode_nlm4_res, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = sizeof(struct nlm4_res), + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "LOCK_RES", }, [NLMPROC_CANCEL_RES] = { .pc_func = nlm4svc_proc_null, From f0eec0eb509a11880ed8b28148734962cf382a93 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:07:08 -0500 Subject: [PATCH 42/83] lockd: Use xdrgen XDR functions for the NLMv4 CANCEL_RES procedure Convert the CANCEL_RES procedure to use xdrgen functions nlm4_svc_decode_nlm4_res and nlm4_svc_encode_void. CANCEL_RES is a callback procedure where the server sends cancel results back to the client after an async CANCEL_MSG request. The pc_argzero field is set to zero because xdrgen decoders reliably initialize all arguments, making the early defensive memset unnecessary. This change also corrects the pc_xdrressize field, which previously contained a placeholder value. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index c5f21fc2228c..e9834b0077a0 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -1250,15 +1250,15 @@ static const struct svc_procedure nlm4svc_procedures[24] = { .pc_xdrressize = XDR_void, .pc_name = "LOCK_RES", }, - [NLMPROC_CANCEL_RES] = { - .pc_func = nlm4svc_proc_null, - .pc_decode = nlm4svc_decode_void, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_res), - .pc_argzero = sizeof(struct nlm_res), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "CANCEL_RES", + [NLMPROC4_CANCEL_RES] = { + .pc_func = nlm4svc_proc_null, + .pc_decode = nlm4_svc_decode_nlm4_res, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = sizeof(struct nlm4_res), + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "CANCEL_RES", }, [NLMPROC_UNLOCK_RES] = { .pc_func = nlm4svc_proc_null, From d4fc8bc100353096f87ad1c052df9e7073696510 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:07:09 -0500 Subject: [PATCH 43/83] lockd: Use xdrgen XDR functions for the NLMv4 UNLOCK_RES procedure Update the NLMPROC4_UNLOCK_RES entry in nlm4svc_procedures to invoke xdrgen-generated XDR functions. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index e9834b0077a0..f730da7d1168 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -1260,15 +1260,15 @@ static const struct svc_procedure nlm4svc_procedures[24] = { .pc_xdrressize = XDR_void, .pc_name = "CANCEL_RES", }, - [NLMPROC_UNLOCK_RES] = { - .pc_func = nlm4svc_proc_null, - .pc_decode = nlm4svc_decode_void, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_res), - .pc_argzero = sizeof(struct nlm_res), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "UNLOCK_RES", + [NLMPROC4_UNLOCK_RES] = { + .pc_func = nlm4svc_proc_null, + .pc_decode = nlm4_svc_decode_nlm4_res, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = sizeof(struct nlm4_res), + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "UNLOCK_RES", }, [NLMPROC_GRANTED_RES] = { .pc_func = nlm4svc_proc_granted_res, From 5076fff93ce68cd13890b8f86099571ac2442ae3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:07:10 -0500 Subject: [PATCH 44/83] lockd: Use xdrgen XDR functions for the NLMv4 GRANTED_RES procedure Convert the GRANTED_RES procedure to use xdrgen functions nlm4_svc_decode_nlm4_res and nlm4_svc_encode_void. GRANTED_RES is a callback procedure where the client sends granted lock results back to the server after an async GRANTED request. The pc_argzero field is set to zero because xdrgen decoders reliably initialize all arguments, making the early defensive memset unnecessary. This change also corrects the pc_xdrressize field, which previously contained a placeholder value. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 60 +++++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index f730da7d1168..f986cdac5d00 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -71,6 +71,7 @@ static_assert(offsetof(struct nlm4_testres_wrapper, xdrgen) == 0); struct nlm4_res_wrapper { struct nlm4_res xdrgen; + struct nlm_cookie cookie; }; static_assert(offsetof(struct nlm4_res_wrapper, xdrgen) == 0); @@ -959,6 +960,30 @@ static __be32 nlm4svc_proc_granted_msg(struct svc_rqst *rqstp) __nlm4svc_proc_granted_msg); } +/** + * nlm4svc_proc_granted_res - GRANTED_RES: Lock Granted result + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * + * RPC synopsis: + * void NLMPROC4_GRANTED_RES(nlm4_res) = 15; + */ +static __be32 nlm4svc_proc_granted_res(struct svc_rqst *rqstp) +{ + struct nlm4_res_wrapper *argp = rqstp->rq_argp; + + if (!nlmsvc_ops) + return rpc_success; + + if (nlm4_netobj_to_cookie(&argp->cookie, &argp->xdrgen.cookie)) + return rpc_success; + nlmsvc_grant_reply(&argp->cookie, argp->xdrgen.stat.stat); + + return rpc_success; +} + /* * SHARE: create a DOS share or alter existing share. 
*/ @@ -1084,23 +1109,6 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp) return rpc_success; } -/* - * client sent a GRANTED_RES, let's remove the associated block - */ -static __be32 -nlm4svc_proc_granted_res(struct svc_rqst *rqstp) -{ - struct nlm_res *argp = rqstp->rq_argp; - - if (!nlmsvc_ops) - return rpc_success; - - dprintk("lockd: GRANTED_RES called\n"); - - nlmsvc_grant_reply(&argp->cookie, argp->status); - return rpc_success; -} - static __be32 nlm4svc_proc_unused(struct svc_rqst *rqstp) { @@ -1270,15 +1278,15 @@ static const struct svc_procedure nlm4svc_procedures[24] = { .pc_xdrressize = XDR_void, .pc_name = "UNLOCK_RES", }, - [NLMPROC_GRANTED_RES] = { - .pc_func = nlm4svc_proc_granted_res, - .pc_decode = nlm4svc_decode_res, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_res), - .pc_argzero = sizeof(struct nlm_res), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "GRANTED_RES", + [NLMPROC4_GRANTED_RES] = { + .pc_func = nlm4svc_proc_granted_res, + .pc_decode = nlm4_svc_decode_nlm4_res, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = sizeof(struct nlm4_res_wrapper), + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "GRANTED_RES", }, [NLMPROC_NSM_NOTIFY] = { .pc_func = nlm4svc_proc_sm_notify, From 16099e1002728558d792eaba8c565e8892b57041 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:07:11 -0500 Subject: [PATCH 45/83] lockd: Use xdrgen XDR functions for the NLMv4 SM_NOTIFY procedure Convert the SM_NOTIFY procedure to use xdrgen functions nlm4_svc_decode_nlm4_notifyargs and nlm4_svc_encode_void. SM_NOTIFY is a private callback from statd to notify lockd when a remote host has rebooted. This patch introduces struct nlm4_notifyargs_wrapper to bridge between the xdrgen-generated nlm4_notifyargs and the nlm_reboot structure expected by nlm_host_rebooted(). The wrapper contains both the xdrgen-decoded arguments and a reboot field for the existing API. 
The pc_argzero field is set to zero because xdrgen decoders reliably initialize all arguments, making the early defensive memset unnecessary. This change also corrects the pc_xdrressize field, which previously contained a placeholder value. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 86 +++++++++++++++++++++++++++++---------------- 1 file changed, 55 insertions(+), 31 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index f986cdac5d00..4f8c41046ed6 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -62,6 +62,13 @@ struct nlm4_unlockargs_wrapper { static_assert(offsetof(struct nlm4_unlockargs_wrapper, xdrgen) == 0); +struct nlm4_notifyargs_wrapper { + struct nlm4_notifyargs xdrgen; + struct nlm_reboot reboot; +}; + +static_assert(offsetof(struct nlm4_notifyargs_wrapper, xdrgen) == 0); + struct nlm4_testres_wrapper { struct nlm4_testres xdrgen; struct nlm_lock lock; @@ -984,6 +991,44 @@ static __be32 nlm4svc_proc_granted_res(struct svc_rqst *rqstp) return rpc_success; } +/** + * nlm4svc_proc_sm_notify - SM_NOTIFY: Peer has rebooted + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * %rpc_system_err: RPC execution failed. + * + * The SM_NOTIFY procedure is a private callback from Linux statd and is + * not part of the official NLM protocol. 
+ * + * RPC synopsis: + * void NLMPROC4_SM_NOTIFY(nlm4_notifyargs) = 16; + */ +static __be32 nlm4svc_proc_sm_notify(struct svc_rqst *rqstp) +{ + struct nlm4_notifyargs_wrapper *argp = rqstp->rq_argp; + struct nlm_reboot *reboot = &argp->reboot; + + if (!nlm_privileged_requester(rqstp)) { + char buf[RPC_MAX_ADDRBUFLEN]; + + pr_warn("lockd: rejected NSM callback from %s\n", + svc_print_addr(rqstp, buf, sizeof(buf))); + return rpc_system_err; + } + + reboot->len = argp->xdrgen.notify.name.len; + reboot->mon = (char *)argp->xdrgen.notify.name.data; + reboot->state = argp->xdrgen.notify.state; + memcpy(&reboot->priv.data, argp->xdrgen.private, + sizeof(reboot->priv.data)); + + nlm_host_rebooted(SVC_NET(rqstp), reboot); + + return rpc_success; +} + /* * SHARE: create a DOS share or alter existing share. */ @@ -1088,27 +1133,6 @@ nlm4svc_proc_free_all(struct svc_rqst *rqstp) return rpc_success; } -/* - * SM_NOTIFY: private callback from statd (not part of official NLM proto) - */ -static __be32 -nlm4svc_proc_sm_notify(struct svc_rqst *rqstp) -{ - struct nlm_reboot *argp = rqstp->rq_argp; - - dprintk("lockd: SM_NOTIFY called\n"); - - if (!nlm_privileged_requester(rqstp)) { - char buf[RPC_MAX_ADDRBUFLEN]; - printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", - svc_print_addr(rqstp, buf, sizeof(buf))); - return rpc_system_err; - } - - nlm_host_rebooted(SVC_NET(rqstp), argp); - return rpc_success; -} - static __be32 nlm4svc_proc_unused(struct svc_rqst *rqstp) { @@ -1288,15 +1312,15 @@ static const struct svc_procedure nlm4svc_procedures[24] = { .pc_xdrressize = XDR_void, .pc_name = "GRANTED_RES", }, - [NLMPROC_NSM_NOTIFY] = { - .pc_func = nlm4svc_proc_sm_notify, - .pc_decode = nlm4svc_decode_reboot, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_reboot), - .pc_argzero = sizeof(struct nlm_reboot), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "SM_NOTIFY", + [NLMPROC4_SM_NOTIFY] = { + .pc_func = 
nlm4svc_proc_sm_notify, + .pc_decode = nlm4_svc_decode_nlm4_notifyargs, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = sizeof(struct nlm4_notifyargs_wrapper), + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "SM_NOTIFY", }, [17] = { .pc_func = nlm4svc_proc_unused, @@ -1378,10 +1402,10 @@ union nlm4svc_xdrstore { struct nlm4_lockargs_wrapper lockargs; struct nlm4_cancargs_wrapper cancargs; struct nlm4_unlockargs_wrapper unlockargs; + struct nlm4_notifyargs_wrapper notifyargs; struct nlm4_testres_wrapper testres; struct nlm4_res_wrapper res; struct nlm_args args; - struct nlm_reboot reboot; }; static DEFINE_PER_CPU_ALIGNED(unsigned long, From 5eae0e00dc4bdc5a56a1e5e405332622d0942e89 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:07:12 -0500 Subject: [PATCH 46/83] lockd: Convert server-side undefined procedures to xdrgen The NLMv4 protocol defines several procedure slots that are not implemented. These undefined procedures need proper handling to return rpc_proc_unavail to clients that mistakenly invoke them. This patch converts the three undefined procedure entries (slots 17, 18, and 19) to use xdrgen functions nlm4_svc_decode_void and nlm4_svc_encode_void. The nlm4svc_proc_unused function is also moved earlier in the file to follow the convention of placing procedure implementations before the procedure table. The pc_argsize, pc_ressize, and pc_argzero fields are now correctly set to zero since no arguments or results are processed. The pc_xdrressize field is updated to XDR_void to accurately reflect the response size. This conversion completes the migration of all NLMv4 server-side procedures to use xdrgen-generated XDR functions, improving type safety and eliminating hand-written XDR code. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 66 ++++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 30 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4f8c41046ed6..b4ed77125f68 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -1029,6 +1029,18 @@ static __be32 nlm4svc_proc_sm_notify(struct svc_rqst *rqstp) return rpc_success; } +/** + * nlm4svc_proc_unused - stub for unused procedures + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_proc_unavail: Program can't support procedure. + */ +static __be32 nlm4svc_proc_unused(struct svc_rqst *rqstp) +{ + return rpc_proc_unavail; +} + /* * SHARE: create a DOS share or alter existing share. */ @@ -1133,12 +1145,6 @@ nlm4svc_proc_free_all(struct svc_rqst *rqstp) return rpc_success; } -static __be32 -nlm4svc_proc_unused(struct svc_rqst *rqstp) -{ - return rpc_proc_unavail; -} - /* * NLM Server procedures. @@ -1323,34 +1329,34 @@ static const struct svc_procedure nlm4svc_procedures[24] = { .pc_name = "SM_NOTIFY", }, [17] = { - .pc_func = nlm4svc_proc_unused, - .pc_decode = nlm4svc_decode_void, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_void), - .pc_argzero = sizeof(struct nlm_void), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = 0, - .pc_name = "UNUSED", + .pc_func = nlm4svc_proc_unused, + .pc_decode = nlm4_svc_decode_void, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = 0, + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "UNUSED", }, [18] = { - .pc_func = nlm4svc_proc_unused, - .pc_decode = nlm4svc_decode_void, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_void), - .pc_argzero = sizeof(struct nlm_void), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = 0, - .pc_name = "UNUSED", + .pc_func = nlm4svc_proc_unused, + .pc_decode = nlm4_svc_decode_void, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = 0, + 
.pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "UNUSED", }, [19] = { - .pc_func = nlm4svc_proc_unused, - .pc_decode = nlm4svc_decode_void, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_void), - .pc_argzero = sizeof(struct nlm_void), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = 0, - .pc_name = "UNUSED", + .pc_func = nlm4svc_proc_unused, + .pc_decode = nlm4_svc_decode_void, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = 0, + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "UNUSED", }, [NLMPROC_SHARE] = { .pc_func = nlm4svc_proc_share, From bb2a70b610810874838f176a0d157d5cd0226c18 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:07:13 -0500 Subject: [PATCH 47/83] lockd: Hoist file_lock init out of nlm4svc_decode_shareargs() The xdrgen-generated XDR decoders cannot initialize the file_lock structure because it is an internal kernel type, not part of the wire protocol. To prepare for converting SHARE and UNSHARE procedures to use xdrgen, the file_lock initialization must be moved from nlm4svc_decode_shareargs() into the procedure handlers themselves. This change removes one more dependency on the "struct nlm_lock::fl" field in fs/lockd/xdr4.c, allowing the XDR decoder to focus solely on unmarshalling wire data. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 16 ++++++++++++---- fs/lockd/xdr4.c | 3 --- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index b4ed77125f68..6dd9afc59551 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -1049,6 +1049,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp) { struct nlm_args *argp = rqstp->rq_argp; struct nlm_res *resp = rqstp->rq_resp; + struct nlm_lock *lock = &argp->lock; struct nlm_host *host; struct nlm_file *file; @@ -1063,7 +1064,10 @@ nlm4svc_proc_share(struct svc_rqst *rqstp) } /* Obtain client and file */ - if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) + locks_init_lock(&lock->fl); + lock->svid = ~(u32)0; + resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file); + if (resp->status) return resp->status == nlm__int__drop_reply ? rpc_drop_reply : rpc_success; @@ -1071,7 +1075,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp) resp->status = nlmsvc_share_file(host, file, argp); dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); - nlmsvc_release_lockowner(&argp->lock); + nlmsvc_release_lockowner(lock); nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; @@ -1085,6 +1089,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp) { struct nlm_args *argp = rqstp->rq_argp; struct nlm_res *resp = rqstp->rq_resp; + struct nlm_lock *lock = &argp->lock; struct nlm_host *host; struct nlm_file *file; @@ -1099,7 +1104,10 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp) } /* Obtain client and file */ - if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) + locks_init_lock(&lock->fl); + lock->svid = ~(u32)0; + resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file); + if (resp->status) return resp->status == nlm__int__drop_reply ? 
rpc_drop_reply : rpc_success; @@ -1107,7 +1115,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp) resp->status = nlmsvc_unshare_file(host, file, argp); dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); - nlmsvc_release_lockowner(&argp->lock); + nlmsvc_release_lockowner(lock); nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index dbbb2dfcb81b..308aac92a94e 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -257,9 +257,6 @@ nlm4svc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) struct nlm_args *argp = rqstp->rq_argp; struct nlm_lock *lock = &argp->lock; - locks_init_lock(&lock->fl); - lock->svid = ~(u32)0; - if (!svcxdr_decode_cookie(xdr, &argp->cookie)) return false; if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len)) From 4e6814b1750770213ab5b81bc04d8b941435a7b2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:07:14 -0500 Subject: [PATCH 48/83] lockd: Prepare share helpers for xdrgen conversion In order to convert the NLMv4 server-side XDR functions to use xdrgen, the internal share helpers need to be decoupled from the NLMv3-specific struct nlm_args. NLMv4 procedures will use different argument structures once they are converted. Refactor nlmsvc_share_file() and nlmsvc_unshare_file() to accept individual arguments (oh, access, mode) instead of the common struct nlm_args. This allows both protocol versions to call these helpers without forcing a common argument structure. While here, add kdoc comments to both functions and fix a comment typo in the unshare path. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/share.h | 8 ++++---- fs/lockd/svc4proc.c | 7 ++++--- fs/lockd/svcproc.c | 7 +++++-- fs/lockd/svcshare.c | 35 +++++++++++++++++++++++------------ 4 files changed, 36 insertions(+), 21 deletions(-) diff --git a/fs/lockd/share.h b/fs/lockd/share.h index d8f4ebd9c278..a2867e30c593 100644 --- a/fs/lockd/share.h +++ b/fs/lockd/share.h @@ -20,10 +20,10 @@ struct nlm_share { u32 s_mode; /* deny mode */ }; -__be32 nlmsvc_share_file(struct nlm_host *, struct nlm_file *, - struct nlm_args *); -__be32 nlmsvc_unshare_file(struct nlm_host *, struct nlm_file *, - struct nlm_args *); +__be32 nlmsvc_share_file(struct nlm_host *host, struct nlm_file *file, + struct xdr_netobj *oh, u32 access, u32 mode); +__be32 nlmsvc_unshare_file(struct nlm_host *host, struct nlm_file *file, + struct xdr_netobj *oh); void nlmsvc_traverse_shares(struct nlm_host *, struct nlm_file *, nlm_host_match_fn_t); diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 6dd9afc59551..d820d6620e06 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -1072,7 +1072,8 @@ nlm4svc_proc_share(struct svc_rqst *rqstp) rpc_drop_reply : rpc_success; /* Now try to create the share */ - resp->status = nlmsvc_share_file(host, file, argp); + resp->status = nlmsvc_share_file(host, file, &lock->oh, + argp->fsm_access, argp->fsm_mode); dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); nlmsvc_release_lockowner(lock); @@ -1111,8 +1112,8 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp) return resp->status == nlm__int__drop_reply ? 
rpc_drop_reply : rpc_success; - /* Now try to lock the file */ - resp->status = nlmsvc_unshare_file(host, file, argp); + /* Now try to unshare the file */ + resp->status = nlmsvc_unshare_file(host, file, &lock->oh); dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); nlmsvc_release_lockowner(lock); diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 75b0dfa1a79a..749abf8886ba 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -423,7 +423,9 @@ nlmsvc_proc_share(struct svc_rqst *rqstp) rpc_drop_reply : rpc_success; /* Now try to create the share */ - resp->status = cast_status(nlmsvc_share_file(host, file, argp)); + resp->status = cast_status(nlmsvc_share_file(host, file, &argp->lock.oh, + argp->fsm_access, + argp->fsm_mode)); dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); nlmsvc_release_lockowner(&argp->lock); @@ -459,7 +461,8 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp) rpc_drop_reply : rpc_success; /* Now try to unshare the file */ - resp->status = cast_status(nlmsvc_unshare_file(host, file, argp)); + resp->status = cast_status(nlmsvc_unshare_file(host, file, + &argp->lock.oh)); dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); nlmsvc_release_lockowner(&argp->lock); diff --git a/fs/lockd/svcshare.c b/fs/lockd/svcshare.c index 8675ac80ab16..53f5655c128c 100644 --- a/fs/lockd/svcshare.c +++ b/fs/lockd/svcshare.c @@ -25,12 +25,21 @@ nlm_cmp_owner(struct nlm_share *share, struct xdr_netobj *oh) && !memcmp(share->s_owner.data, oh->data, oh->len); } +/** + * nlmsvc_share_file - create a share + * @host: Network client peer + * @file: File to be shared + * @oh: Share owner handle + * @access: Requested access mode + * @mode: Requested file sharing mode + * + * Returns an NLM status code. 
+ */ __be32 nlmsvc_share_file(struct nlm_host *host, struct nlm_file *file, - struct nlm_args *argp) + struct xdr_netobj *oh, u32 access, u32 mode) { struct nlm_share *share; - struct xdr_netobj *oh = &argp->lock.oh; u8 *ohdata; if (nlmsvc_file_cannot_lock(file)) @@ -39,13 +48,11 @@ nlmsvc_share_file(struct nlm_host *host, struct nlm_file *file, for (share = file->f_shares; share; share = share->s_next) { if (share->s_host == host && nlm_cmp_owner(share, oh)) goto update; - if ((argp->fsm_access & share->s_mode) - || (argp->fsm_mode & share->s_access )) + if ((access & share->s_mode) || (mode & share->s_access)) return nlm_lck_denied; } - share = kmalloc(sizeof(*share) + oh->len, - GFP_KERNEL); + share = kmalloc(sizeof(*share) + oh->len, GFP_KERNEL); if (share == NULL) return nlm_lck_denied_nolocks; @@ -61,20 +68,24 @@ nlmsvc_share_file(struct nlm_host *host, struct nlm_file *file, file->f_shares = share; update: - share->s_access = argp->fsm_access; - share->s_mode = argp->fsm_mode; + share->s_access = access; + share->s_mode = mode; return nlm_granted; } -/* - * Delete a share. +/** + * nlmsvc_unshare_file - delete a share + * @host: Network client peer + * @file: File to be unshared + * @oh: Share owner handle + * + * Returns an NLM status code. */ __be32 nlmsvc_unshare_file(struct nlm_host *host, struct nlm_file *file, - struct nlm_args *argp) + struct xdr_netobj *oh) { struct nlm_share *share, **shpp; - struct xdr_netobj *oh = &argp->lock.oh; if (nlmsvc_file_cannot_lock(file)) return nlm_lck_denied_nolocks; From 57c7bb3bd22ce14167f8c83441fc68b3bb80133b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:07:15 -0500 Subject: [PATCH 49/83] lockd: Use xdrgen XDR functions for the NLMv4 SHARE procedure Now that the share helpers have been decoupled from the NLMv3-specific struct nlm_args and file_lock initialization has been hoisted into the procedure handler, the NLMv4 SHARE procedure can be converted to use xdrgen-generated XDR functions. 
Replace the NLMPROC4_SHARE entry in the nlm_procedures4 array with an entry that uses xdrgen-built XDR decoders and encoders. The procedure handler is updated to use the new wrapper structures (nlm4_shareargs_wrapper and nlm4_shareres_wrapper) and access arguments through the argp->xdrgen hierarchy. The .pc_argzero field is set to zero because xdrgen decoders fully initialize all fields in argp->xdrgen, making the early defensive memset unnecessary. The remaining argp fields that fall outside the xdrgen structures are cleared explicitly as needed. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 118 ++++++++++++++++++++++++++++++-------------- 1 file changed, 81 insertions(+), 37 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index d820d6620e06..fbbc5db7a4f7 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -74,6 +74,13 @@ struct nlm4_testres_wrapper { struct nlm_lock lock; }; +struct nlm4_shareargs_wrapper { + struct nlm4_shareargs xdrgen; + struct nlm_lock lock; +}; + +static_assert(offsetof(struct nlm4_shareargs_wrapper, xdrgen) == 0); + static_assert(offsetof(struct nlm4_testres_wrapper, xdrgen) == 0); struct nlm4_res_wrapper { @@ -83,6 +90,12 @@ struct nlm4_res_wrapper { static_assert(offsetof(struct nlm4_res_wrapper, xdrgen) == 0); +struct nlm4_shareres_wrapper { + struct nlm4_shareres xdrgen; +}; + +static_assert(offsetof(struct nlm4_shareres_wrapper, xdrgen) == 0); + static __be32 nlm4_netobj_to_cookie(struct nlm_cookie *cookie, netobj *object) { @@ -1041,45 +1054,74 @@ static __be32 nlm4svc_proc_unused(struct svc_rqst *rqstp) return rpc_proc_unavail; } -/* - * SHARE: create a DOS share or alter existing share. +/** + * nlm4svc_proc_share - SHARE: Open a file using DOS file-sharing modes + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * %rpc_drop_reply: Do not send an RPC reply. 
+ * + * RPC synopsis: + * nlm4_shareres NLMPROC4_SHARE(nlm4_shareargs) = 20; + * + * Permissible procedure status codes: + * %NLM4_GRANTED: The requested share lock was granted. + * %NLM4_DENIED: The requested lock conflicted with existing + * lock reservations for the file. + * %NLM4_DENIED_GRACE_PERIOD: The server has recently restarted and is + * re-establishing existing locks, and is not + * yet ready to accept normal service requests. + * + * The Linux NLM server implementation also returns: + * %NLM4_DENIED_NOLOCKS: A needed resource could not be allocated. + * %NLM4_STALE_FH: The request specified an invalid file handle. + * %NLM4_FBIG: The request specified a length or offset + * that exceeds the range supported by the + * server. + * %NLM4_FAILED: The request failed for an unspecified reason. */ -static __be32 -nlm4svc_proc_share(struct svc_rqst *rqstp) +static __be32 nlm4svc_proc_share(struct svc_rqst *rqstp) { - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_res *resp = rqstp->rq_resp; + struct nlm4_shareargs_wrapper *argp = rqstp->rq_argp; + struct nlm4_shareres_wrapper *resp = rqstp->rq_resp; struct nlm_lock *lock = &argp->lock; - struct nlm_host *host; - struct nlm_file *file; + struct nlm_host *host = NULL; + struct nlm_file *file = NULL; + struct nlm4_lock xdr_lock = { + .fh = argp->xdrgen.share.fh, + .oh = argp->xdrgen.share.oh, + .svid = ~(u32)0, + }; - dprintk("lockd: SHARE called\n"); + resp->xdrgen.cookie = argp->xdrgen.cookie; - resp->cookie = argp->cookie; + resp->xdrgen.stat = nlm_lck_denied_grace_period; + if (locks_in_grace(SVC_NET(rqstp)) && !argp->xdrgen.reclaim) + goto out; - /* Don't accept new lock requests during grace period */ - if (locks_in_grace(SVC_NET(rqstp)) && !argp->reclaim) { - resp->status = nlm_lck_denied_grace_period; - return rpc_success; - } + resp->xdrgen.stat = nlm_lck_denied_nolocks; + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.share.caller_name, true); + if (!host) + goto out; - /* Obtain client and file 
*/ - locks_init_lock(&lock->fl); - lock->svid = ~(u32)0; - resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file); - if (resp->status) - return resp->status == nlm__int__drop_reply ? - rpc_drop_reply : rpc_success; + resp->xdrgen.stat = nlm4svc_lookup_file(rqstp, host, lock, &file, + &xdr_lock, F_RDLCK); + if (resp->xdrgen.stat) + goto out; - /* Now try to create the share */ - resp->status = nlmsvc_share_file(host, file, &lock->oh, - argp->fsm_access, argp->fsm_mode); + resp->xdrgen.stat = nlmsvc_share_file(host, file, &lock->oh, + argp->xdrgen.share.access, + argp->xdrgen.share.mode); - dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); nlmsvc_release_lockowner(lock); + +out: + if (file) + nlm_release_file(file); nlmsvc_release_host(host); - nlm_release_file(file); - return rpc_success; + return resp->xdrgen.stat == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; } /* @@ -1367,15 +1409,15 @@ static const struct svc_procedure nlm4svc_procedures[24] = { .pc_xdrressize = XDR_void, .pc_name = "UNUSED", }, - [NLMPROC_SHARE] = { - .pc_func = nlm4svc_proc_share, - .pc_decode = nlm4svc_decode_shareargs, - .pc_encode = nlm4svc_encode_shareres, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_res), - .pc_xdrressize = Ck+St+1, - .pc_name = "SHARE", + [NLMPROC4_SHARE] = { + .pc_func = nlm4svc_proc_share, + .pc_decode = nlm4_svc_decode_nlm4_shareargs, + .pc_encode = nlm4_svc_encode_nlm4_shareres, + .pc_argsize = sizeof(struct nlm4_shareargs_wrapper), + .pc_argzero = 0, + .pc_ressize = sizeof(struct nlm4_shareres_wrapper), + .pc_xdrressize = NLM4_nlm4_shareres_sz, + .pc_name = "SHARE", }, [NLMPROC_UNSHARE] = { .pc_func = nlm4svc_proc_unshare, @@ -1418,8 +1460,10 @@ union nlm4svc_xdrstore { struct nlm4_cancargs_wrapper cancargs; struct nlm4_unlockargs_wrapper unlockargs; struct nlm4_notifyargs_wrapper notifyargs; + struct nlm4_shareargs_wrapper shareargs; struct nlm4_testres_wrapper 
testres; struct nlm4_res_wrapper res; + struct nlm4_shareres_wrapper shareres; struct nlm_args args; }; From 985d022db383a2d750a63a0e828cf41fa649512a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:07:16 -0500 Subject: [PATCH 50/83] lockd: Use xdrgen XDR functions for the NLMv4 UNSHARE procedure Now that the share helpers have been decoupled from the NLMv3-specific struct nlm_args and file_lock initialization has been hoisted into the procedure handler, the NLMv4 UNSHARE procedure can be converted to use xdrgen-generated XDR functions. Replace the NLMPROC4_UNSHARE entry in the nlm_procedures4 array with an entry that uses xdrgen-built XDR decoders and encoders. The procedure handler is updated to use the new wrapper structures (nlm4_shareargs_wrapper and nlm4_shareres_wrapper) and access arguments through the argp->xdrgen hierarchy. The .pc_argzero field is set to zero because xdrgen decoders fully initialize all fields in argp->xdrgen, making the early defensive memset unnecessary. The remaining argp fields that fall outside the xdrgen structures are cleared explicitly as needed. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 98 ++++++++++++++++++++++++++++----------------- 1 file changed, 62 insertions(+), 36 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index fbbc5db7a4f7..5d85f888fdf4 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -1124,44 +1124,70 @@ static __be32 nlm4svc_proc_share(struct svc_rqst *rqstp) rpc_drop_reply : rpc_success; } -/* - * UNSHARE: Release a DOS share. +/** + * nlm4svc_proc_unshare - UNSHARE: Release a share reservation + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * %rpc_drop_reply: Do not send an RPC reply. + * + * RPC synopsis: + * nlm4_shareres NLMPROC4_UNSHARE(nlm4_shareargs) = 21; + * + * Permissible procedure status codes: + * %NLM4_GRANTED: The share reservation was released. 
+ * %NLM4_DENIED_GRACE_PERIOD: The server has recently restarted and is + * re-establishing existing locks, and is not + * yet ready to accept normal service requests. + * + * The Linux NLM server implementation also returns: + * %NLM4_DENIED_NOLOCKS: A needed resource could not be allocated. + * %NLM4_STALE_FH: The request specified an invalid file handle. + * %NLM4_FBIG: The request specified a length or offset + * that exceeds the range supported by the + * server. + * %NLM4_FAILED: The request failed for an unspecified reason. */ -static __be32 -nlm4svc_proc_unshare(struct svc_rqst *rqstp) +static __be32 nlm4svc_proc_unshare(struct svc_rqst *rqstp) { - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_res *resp = rqstp->rq_resp; + struct nlm4_shareargs_wrapper *argp = rqstp->rq_argp; + struct nlm4_shareres_wrapper *resp = rqstp->rq_resp; struct nlm_lock *lock = &argp->lock; - struct nlm_host *host; - struct nlm_file *file; + struct nlm4_lock xdr_lock = { + .fh = argp->xdrgen.share.fh, + .oh = argp->xdrgen.share.oh, + .svid = ~(u32)0, + }; + struct nlm_host *host = NULL; + struct nlm_file *file = NULL; - dprintk("lockd: UNSHARE called\n"); + resp->xdrgen.cookie = argp->xdrgen.cookie; - resp->cookie = argp->cookie; + resp->xdrgen.stat = nlm_lck_denied_grace_period; + if (locks_in_grace(SVC_NET(rqstp))) + goto out; - /* Don't accept requests during grace period */ - if (locks_in_grace(SVC_NET(rqstp))) { - resp->status = nlm_lck_denied_grace_period; - return rpc_success; - } + resp->xdrgen.stat = nlm_lck_denied_nolocks; + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.share.caller_name, true); + if (!host) + goto out; - /* Obtain client and file */ - locks_init_lock(&lock->fl); - lock->svid = ~(u32)0; - resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file); - if (resp->status) - return resp->status == nlm__int__drop_reply ? 
- rpc_drop_reply : rpc_success; + resp->xdrgen.stat = nlm4svc_lookup_file(rqstp, host, lock, &file, + &xdr_lock, F_RDLCK); + if (resp->xdrgen.stat) + goto out; - /* Now try to unshare the file */ - resp->status = nlmsvc_unshare_file(host, file, &lock->oh); + resp->xdrgen.stat = nlmsvc_unshare_file(host, file, &lock->oh); - dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); nlmsvc_release_lockowner(lock); + +out: + if (file) + nlm_release_file(file); nlmsvc_release_host(host); - nlm_release_file(file); - return rpc_success; + return resp->xdrgen.stat == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; } /* @@ -1419,15 +1445,15 @@ static const struct svc_procedure nlm4svc_procedures[24] = { .pc_xdrressize = NLM4_nlm4_shareres_sz, .pc_name = "SHARE", }, - [NLMPROC_UNSHARE] = { - .pc_func = nlm4svc_proc_unshare, - .pc_decode = nlm4svc_decode_shareargs, - .pc_encode = nlm4svc_encode_shareres, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_res), - .pc_xdrressize = Ck+St+1, - .pc_name = "UNSHARE", + [NLMPROC4_UNSHARE] = { + .pc_func = nlm4svc_proc_unshare, + .pc_decode = nlm4_svc_decode_nlm4_shareargs, + .pc_encode = nlm4_svc_encode_nlm4_shareres, + .pc_argsize = sizeof(struct nlm4_shareargs_wrapper), + .pc_argzero = 0, + .pc_ressize = sizeof(struct nlm4_shareres_wrapper), + .pc_xdrressize = NLM4_nlm4_shareres_sz, + .pc_name = "UNSHARE", }, [NLMPROC_NM_LOCK] = { .pc_func = nlm4svc_proc_nm_lock, From eedae4430122c5799f3d33bbd26dcbafc7d02cd1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:07:17 -0500 Subject: [PATCH 51/83] lockd: Use xdrgen XDR functions for the NLMv4 NM_LOCK procedure Now that nlm4svc_do_lock() has been introduced to handle both monitored and non-monitored lock requests, the NLMv4 NM_LOCK procedure can be converted to use xdrgen-generated XDR functions. 
This conversion allows the removal of __nlm4svc_proc_lock(), a helper function that was previously shared between the LOCK and NM_LOCK procedures. Replace the NLMPROC4_NM_LOCK entry in the nlm_procedures4 array with an entry that uses xdrgen-built XDR decoders and encoders. The procedure handler is updated to call nlm4svc_do_lock() directly and access arguments through the argp->xdrgen hierarchy. The .pc_argzero field is set to zero because xdrgen decoders fully initialize all fields in argp->xdrgen, making the early defensive memset unnecessary. The remaining argp fields that fall outside the xdrgen structures are cleared explicitly as needed. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 101 +++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 57 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 5d85f888fdf4..62c90827dfae 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -358,44 +358,6 @@ static __be32 nlm4svc_proc_test(struct svc_rqst *rqstp) rpc_drop_reply : rpc_success; } -static __be32 -__nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_res *resp) -{ - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_host *host; - struct nlm_file *file; - __be32 rc = rpc_success; - - dprintk("lockd: LOCK called\n"); - - resp->cookie = argp->cookie; - - /* Obtain client and file */ - if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm__int__drop_reply ? 
- rpc_drop_reply : rpc_success; - - /* Now try to lock the file */ - resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock, - argp->block, &argp->cookie, - argp->reclaim); - switch (resp->status) { - case nlm__int__drop_reply: - rc = rpc_drop_reply; - break; - case nlm__int__deadlock: - resp->status = nlm4_deadlock; - fallthrough; - default: - dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); - } - - nlmsvc_release_lockowner(&argp->lock); - nlmsvc_release_host(host); - nlm_release_file(file); - return rc; -} - static __be32 nlm4svc_do_lock(struct svc_rqst *rqstp, bool monitored) { @@ -1190,18 +1152,43 @@ static __be32 nlm4svc_proc_unshare(struct svc_rqst *rqstp) rpc_drop_reply : rpc_success; } -/* - * NM_LOCK: Create an unmonitored lock +/** + * nlm4svc_proc_nm_lock - NM_LOCK: Establish a non-monitored lock + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * %rpc_drop_reply: Do not send an RPC reply. + * + * RPC synopsis: + * nlm4_res NLMPROC4_NM_LOCK(nlm4_lockargs) = 22; + * + * Permissible procedure status codes: + * %NLM4_GRANTED: The requested lock was granted. + * %NLM4_DENIED: The requested lock conflicted with existing + * lock reservations for the file. + * %NLM4_DENIED_NOLOCKS: The server could not allocate the resources + * needed to process the request. + * %NLM4_BLOCKED: The blocking request cannot be granted + * immediately. The server will send an + * NLMPROC4_GRANTED callback to the client when + * the lock can be granted. + * %NLM4_DENIED_GRACE_PERIOD: The server has recently restarted and is + * re-establishing existing locks, and is not + * yet ready to accept normal service requests. + * + * The Linux NLM server implementation also returns: + * %NLM4_DEADLCK: The request could not be granted and + * blocking would cause a deadlock. + * %NLM4_STALE_FH: The request specified an invalid file handle. 
+ * %NLM4_FBIG: The request specified a length or offset + * that exceeds the range supported by the + * server. + * %NLM4_FAILED: The request failed for an unspecified reason. */ -static __be32 -nlm4svc_proc_nm_lock(struct svc_rqst *rqstp) +static __be32 nlm4svc_proc_nm_lock(struct svc_rqst *rqstp) { - struct nlm_args *argp = rqstp->rq_argp; - - dprintk("lockd: NM_LOCK called\n"); - - argp->monitor = 0; /* just clean the monitor flag */ - return __nlm4svc_proc_lock(rqstp, rqstp->rq_resp); + return nlm4svc_do_lock(rqstp, false); } /* @@ -1455,15 +1442,15 @@ static const struct svc_procedure nlm4svc_procedures[24] = { .pc_xdrressize = NLM4_nlm4_shareres_sz, .pc_name = "UNSHARE", }, - [NLMPROC_NM_LOCK] = { - .pc_func = nlm4svc_proc_nm_lock, - .pc_decode = nlm4svc_decode_lockargs, - .pc_encode = nlm4svc_encode_res, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_res), - .pc_xdrressize = Ck+St, - .pc_name = "NM_LOCK", + [NLMPROC4_NM_LOCK] = { + .pc_func = nlm4svc_proc_nm_lock, + .pc_decode = nlm4_svc_decode_nlm4_lockargs, + .pc_encode = nlm4_svc_encode_nlm4_res, + .pc_argsize = sizeof(struct nlm4_lockargs_wrapper), + .pc_argzero = 0, + .pc_ressize = sizeof(struct nlm4_res_wrapper), + .pc_xdrressize = NLM4_nlm4_res_sz, + .pc_name = "NM_LOCK", }, [NLMPROC_FREE_ALL] = { .pc_func = nlm4svc_proc_free_all, From b201ce7af2a28d8d6c43a3b5bd099a44f98c1c3e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:07:18 -0500 Subject: [PATCH 52/83] lockd: Use xdrgen XDR functions for the NLMv4 FREE_ALL procedure With all other NLMv4 procedures now converted to xdrgen-generated XDR functions, the FREE_ALL procedure can be converted as well. This conversion allows the removal of nlm4svc_retrieve_args(), a 79-line helper function that was used only by FREE_ALL to retrieve client information from lockd's internal data structures. 
Replace the NLMPROC4_FREE_ALL entry in the nlm_procedures4 array with an entry that uses xdrgen-built XDR decoders and encoders. The procedure handler is updated to use the new wrapper structure (nlm4_notify_wrapper) and call nlm4svc_lookup_host() directly, eliminating the need for the now-removed helper function. The .pc_argzero field is set to zero because xdrgen decoders fully initialize all fields in argp->xdrgen, making the early defensive memset unnecessary. The remaining argp fields that fall outside the xdrgen structures are cleared explicitly as needed. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 125 ++++++++++++-------------------------------- 1 file changed, 33 insertions(+), 92 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 62c90827dfae..ca0409ea6b2d 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -69,6 +69,12 @@ struct nlm4_notifyargs_wrapper { static_assert(offsetof(struct nlm4_notifyargs_wrapper, xdrgen) == 0); +struct nlm4_notify_wrapper { + struct nlm4_notify xdrgen; +}; + +static_assert(offsetof(struct nlm4_notify_wrapper, xdrgen) == 0); + struct nlm4_testres_wrapper { struct nlm4_testres xdrgen; struct nlm_lock lock; @@ -191,80 +197,6 @@ nlm4svc_lookup_file(struct svc_rqst *rqstp, struct nlm_host *host, return nlm_granted; } -/* - * Obtain client and file from arguments - */ -static __be32 -nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, - struct nlm_host **hostp, struct nlm_file **filp) -{ - struct nlm_host *host = NULL; - struct nlm_file *file = NULL; - struct nlm_lock *lock = &argp->lock; - __be32 error = 0; - - /* nfsd callbacks must have been installed for this procedure */ - if (!nlmsvc_ops) - return nlm_lck_denied_nolocks; - - if (lock->lock_start > OFFSET_MAX || - (lock->lock_len && ((lock->lock_len - 1) > (OFFSET_MAX - lock->lock_start)))) - return nlm4_fbig; - - /* Obtain host handle */ - if (!(host = nlmsvc_lookup_host(rqstp, lock->caller, 
lock->len)) - || (argp->monitor && nsm_monitor(host) < 0)) - goto no_locks; - *hostp = host; - - /* Obtain file pointer. Not used by FREE_ALL call. */ - if (filp != NULL) { - int mode = lock_to_openmode(&lock->fl); - - lock->fl.c.flc_flags = FL_POSIX; - - error = nlm_lookup_file(rqstp, &file, lock); - if (error) - goto no_locks; - *filp = file; - - /* Set up the missing parts of the file_lock structure */ - lock->fl.c.flc_file = file->f_file[mode]; - lock->fl.c.flc_pid = current->tgid; - lock->fl.fl_start = (loff_t)lock->lock_start; - lock->fl.fl_end = lock->lock_len ? - (loff_t)(lock->lock_start + lock->lock_len - 1) : - OFFSET_MAX; - lock->fl.fl_lmops = &nlmsvc_lock_operations; - nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid); - if (!lock->fl.c.flc_owner) { - /* lockowner allocation has failed */ - nlmsvc_release_host(host); - return nlm_lck_denied_nolocks; - } - } - - return 0; - -no_locks: - nlmsvc_release_host(host); - switch (error) { - case nlm_granted: - return nlm_lck_denied_nolocks; - case nlm__int__stale_fh: - return nlm4_stale_fh; - case nlm__int__failed: - return nlm4_failed; - default: - if (be32_to_cpu(error) >= 30000) { - pr_warn_once("lockd: unhandled internal status %u\n", - be32_to_cpu(error)); - return nlm4_failed; - } - return error; - } -} - /** * nlm4svc_proc_null - NULL: Test for presence of service * @rqstp: RPC transaction context @@ -1191,21 +1123,30 @@ static __be32 nlm4svc_proc_nm_lock(struct svc_rqst *rqstp) return nlm4svc_do_lock(rqstp, false); } -/* - * FREE_ALL: Release all locks and shares held by client +/** + * nlm4svc_proc_free_all - FREE_ALL: Discard client's lock and share state + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. 
+ * + * RPC synopsis: + * void NLMPROC4_FREE_ALL(nlm4_notify) = 23; */ -static __be32 -nlm4svc_proc_free_all(struct svc_rqst *rqstp) +static __be32 nlm4svc_proc_free_all(struct svc_rqst *rqstp) { - struct nlm_args *argp = rqstp->rq_argp; + struct nlm4_notify_wrapper *argp = rqstp->rq_argp; struct nlm_host *host; - /* Obtain client */ - if (nlm4svc_retrieve_args(rqstp, argp, &host, NULL)) - return rpc_success; + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.name, false); + if (!host) + goto out; nlmsvc_free_host_resources(host); + nlmsvc_release_host(host); + +out: return rpc_success; } @@ -1452,15 +1393,15 @@ static const struct svc_procedure nlm4svc_procedures[24] = { .pc_xdrressize = NLM4_nlm4_res_sz, .pc_name = "NM_LOCK", }, - [NLMPROC_FREE_ALL] = { - .pc_func = nlm4svc_proc_free_all, - .pc_decode = nlm4svc_decode_notify, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "FREE_ALL", + [NLMPROC4_FREE_ALL] = { + .pc_func = nlm4svc_proc_free_all, + .pc_decode = nlm4_svc_decode_nlm4_notify, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = sizeof(struct nlm4_notify_wrapper), + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "FREE_ALL", }, }; @@ -1474,10 +1415,10 @@ union nlm4svc_xdrstore { struct nlm4_unlockargs_wrapper unlockargs; struct nlm4_notifyargs_wrapper notifyargs; struct nlm4_shareargs_wrapper shareargs; + struct nlm4_notify_wrapper notify; struct nlm4_testres_wrapper testres; struct nlm4_res_wrapper res; struct nlm4_shareres_wrapper shareres; - struct nlm_args args; }; static DEFINE_PER_CPU_ALIGNED(unsigned long, From 515788fa985fee596ddc9f8cd1e295e01883bb9e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:07:19 -0500 Subject: [PATCH 53/83] lockd: Add LOCKD_SHARE_SVID constant for DOS sharing mode Replace the magic value ~(u32)0 with a named constant. 
This value is used as a synthetic svid when looking up lockowners for DOS share operations, which have no real process ID associated with them. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/share.h | 3 +++ fs/lockd/svc4proc.c | 4 ++-- fs/lockd/xdr.c | 3 ++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/lockd/share.h b/fs/lockd/share.h index a2867e30c593..20ea8ee49168 100644 --- a/fs/lockd/share.h +++ b/fs/lockd/share.h @@ -8,6 +8,9 @@ #ifndef _LOCKD_SHARE_H #define _LOCKD_SHARE_H +/* Synthetic svid for lockowner lookup during share operations */ +#define LOCKD_SHARE_SVID (~(u32)0) + /* * DOS share for a specific file */ diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index ca0409ea6b2d..ce340ea0d304 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -985,7 +985,7 @@ static __be32 nlm4svc_proc_share(struct svc_rqst *rqstp) struct nlm4_lock xdr_lock = { .fh = argp->xdrgen.share.fh, .oh = argp->xdrgen.share.oh, - .svid = ~(u32)0, + .svid = LOCKD_SHARE_SVID, }; resp->xdrgen.cookie = argp->xdrgen.cookie; @@ -1051,7 +1051,7 @@ static __be32 nlm4svc_proc_unshare(struct svc_rqst *rqstp) struct nlm4_lock xdr_lock = { .fh = argp->xdrgen.share.fh, .oh = argp->xdrgen.share.oh, - .svid = ~(u32)0, + .svid = LOCKD_SHARE_SVID, }; struct nlm_host *host = NULL; struct nlm_file *file = NULL; diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 5aac49d1875a..dfca8b8dab73 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -19,6 +19,7 @@ #include #include "lockd.h" +#include "share.h" #include "svcxdr.h" static inline loff_t @@ -274,7 +275,7 @@ nlmsvc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) memset(lock, 0, sizeof(*lock)); locks_init_lock(&lock->fl); - lock->svid = ~(u32)0; + lock->svid = LOCKD_SHARE_SVID; if (!svcxdr_decode_cookie(xdr, &argp->cookie)) return false; From b131a424b0860d14b2778a5d3a8295f19e816291 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:07:20 -0500 Subject: [PATCH 
54/83] lockd: Remove C macros that are no longer used The conversion of all NLMv4 procedures to xdrgen-generated XDR functions is complete. The hand-rolled XDR size calculation macros (Ck, No, St, Rg) and the nlm_void structure definition served only the older implementations and are now unused. Also removes NLMDBG_FACILITY, which was set to the client debug flag in server-side code but never referenced, and corrects a comment to specify "NLMv4 Server procedures". Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index ce340ea0d304..4044459b7c49 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -26,8 +26,6 @@ #include "nlm4xdr_gen.h" #include "xdr4.h" -#define NLMDBG_FACILITY NLMDBG_CLIENT - /* * Wrapper structures combine xdrgen types with legacy nlm_lock. * The xdrgen field must be first so the structure can be cast @@ -1152,16 +1150,9 @@ static __be32 nlm4svc_proc_free_all(struct svc_rqst *rqstp) /* - * NLM Server procedures. + * NLMv4 Server procedures. */ -struct nlm_void { int dummy; }; - -#define Ck (1+XDR_QUADLEN(NLM_MAXCOOKIELEN)) /* cookie */ -#define No (1+1024/4) /* netobj */ -#define St 1 /* status */ -#define Rg 4 /* range (offset + length) */ - static const struct svc_procedure nlm4svc_procedures[24] = { [NLMPROC4_NULL] = { .pc_func = nlm4svc_proc_null, From 4f406a2c1e23b03c8f5920318a0effb15ba238da Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:07:21 -0500 Subject: [PATCH 55/83] lockd: Remove dead code from fs/lockd/xdr4.c Now that all NLMv4 server-side procedures use XDR encoder and decoder functions generated by xdrgen, the hand-written code in fs/lockd/xdr4.c is no longer needed. This file contained the original XDR processing logic that has been systematically replaced throughout this series. Remove the file and its Makefile reference to eliminate the dead code. 
The helper function nlm4svc_set_file_lock_range() is still needed by the generated code, so move it to xdr4.h as an inline function where it remains accessible. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/Makefile | 2 +- fs/lockd/clnt4xdr.c | 2 - fs/lockd/lockd.h | 7 + fs/lockd/svc4proc.c | 1 - fs/lockd/xdr4.c | 334 -------------------------------------------- fs/lockd/xdr4.h | 33 ----- 6 files changed, 8 insertions(+), 371 deletions(-) delete mode 100644 fs/lockd/xdr4.c delete mode 100644 fs/lockd/xdr4.h diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile index 8e9d18a4348c..808f0f2a7be1 100644 --- a/fs/lockd/Makefile +++ b/fs/lockd/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_LOCKD) += lockd.o lockd-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \ svcshare.o svcproc.o svcsubs.o mon.o trace.o xdr.o netlink.o -lockd-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o nlm4xdr_gen.o +lockd-$(CONFIG_LOCKD_V4) += clnt4xdr.o svc4proc.o nlm4xdr_gen.o lockd-$(CONFIG_PROC_FS) += procfs.o # diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c index c09e67765cac..2058733eacf8 100644 --- a/fs/lockd/clnt4xdr.c +++ b/fs/lockd/clnt4xdr.c @@ -18,8 +18,6 @@ #include -#include "xdr4.h" - #define NLMDBG_FACILITY NLMDBG_XDR #if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ) diff --git a/fs/lockd/lockd.h b/fs/lockd/lockd.h index ad4c6701b64a..a7c85ab6d4b5 100644 --- a/fs/lockd/lockd.h +++ b/fs/lockd/lockd.h @@ -52,6 +52,13 @@ */ #define LOCKD_DFLT_TIMEO 10 +/* error codes new to NLMv4 */ +#define nlm4_deadlock cpu_to_be32(NLM_DEADLCK) +#define nlm4_rofs cpu_to_be32(NLM_ROFS) +#define nlm4_stale_fh cpu_to_be32(NLM_STALE_FH) +#define nlm4_fbig cpu_to_be32(NLM_FBIG) +#define nlm4_failed cpu_to_be32(NLM_FAILED) + /* * Internal-use status codes, not to be placed on the wire. * Version handlers translate these to appropriate wire values. 
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4044459b7c49..5de41e249534 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -24,7 +24,6 @@ #include "share.h" #include "nlm4xdr_gen.h" -#include "xdr4.h" /* * Wrapper structures combine xdrgen types with legacy nlm_lock. diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c deleted file mode 100644 index 308aac92a94e..000000000000 --- a/fs/lockd/xdr4.c +++ /dev/null @@ -1,334 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/lockd/xdr4.c - * - * XDR support for lockd and the lock client. - * - * Copyright (C) 1995, 1996 Olaf Kirch - * Copyright (C) 1999, Trond Myklebust - */ - -#include -#include -#include - -#include -#include -#include -#include - -#include "lockd.h" -#include "svcxdr.h" -#include "xdr4.h" - -static inline s64 -loff_t_to_s64(loff_t offset) -{ - s64 res; - if (offset > NLM4_OFFSET_MAX) - res = NLM4_OFFSET_MAX; - else if (offset < -NLM4_OFFSET_MAX) - res = -NLM4_OFFSET_MAX; - else - res = offset; - return res; -} - -/* - * NLM file handles are defined by specification to be a variable-length - * XDR opaque no longer than 1024 bytes. However, this implementation - * limits their length to the size of an NFSv3 file handle. 
- */ -static bool -svcxdr_decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh) -{ - __be32 *p; - u32 len; - - if (xdr_stream_decode_u32(xdr, &len) < 0) - return false; - if (len > NFS_MAXFHSIZE) - return false; - - p = xdr_inline_decode(xdr, len); - if (!p) - return false; - fh->size = len; - memcpy(fh->data, p, len); - memset(fh->data + len, 0, sizeof(fh->data) - len); - - return true; -} - -static bool -svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock) -{ - struct file_lock *fl = &lock->fl; - - if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len)) - return false; - if (!svcxdr_decode_fhandle(xdr, &lock->fh)) - return false; - if (!svcxdr_decode_owner(xdr, &lock->oh)) - return false; - if (xdr_stream_decode_u32(xdr, &lock->svid) < 0) - return false; - if (xdr_stream_decode_u64(xdr, &lock->lock_start) < 0) - return false; - if (xdr_stream_decode_u64(xdr, &lock->lock_len) < 0) - return false; - - locks_init_lock(fl); - fl->c.flc_type = F_RDLCK; - lockd_set_file_lock_range4(fl, lock->lock_start, lock->lock_len); - return true; -} - -static bool -svcxdr_encode_holder(struct xdr_stream *xdr, const struct nlm_lock *lock) -{ - const struct file_lock *fl = &lock->fl; - s64 start, len; - - /* exclusive */ - if (xdr_stream_encode_bool(xdr, fl->c.flc_type != F_RDLCK) < 0) - return false; - if (xdr_stream_encode_u32(xdr, lock->svid) < 0) - return false; - if (!svcxdr_encode_owner(xdr, &lock->oh)) - return false; - start = loff_t_to_s64(fl->fl_start); - if (fl->fl_end == OFFSET_MAX) - len = 0; - else - len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1); - if (xdr_stream_encode_u64(xdr, start) < 0) - return false; - if (xdr_stream_encode_u64(xdr, len) < 0) - return false; - - return true; -} - -static bool -svcxdr_encode_testrply(struct xdr_stream *xdr, const struct nlm_res *resp) -{ - if (!svcxdr_encode_stats(xdr, resp->status)) - return false; - switch (resp->status) { - case nlm_lck_denied: - if (!svcxdr_encode_holder(xdr, &resp->lock)) - return 
false; - } - - return true; -} - - -/* - * Decode Call arguments - */ - -bool -nlm4svc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr) -{ - return true; -} - -bool -nlm4svc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) -{ - struct nlm_args *argp = rqstp->rq_argp; - u32 exclusive; - - if (!svcxdr_decode_cookie(xdr, &argp->cookie)) - return false; - if (xdr_stream_decode_bool(xdr, &exclusive) < 0) - return false; - if (!svcxdr_decode_lock(xdr, &argp->lock)) - return false; - if (exclusive) - argp->lock.fl.c.flc_type = F_WRLCK; - - return true; -} - -bool -nlm4svc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) -{ - struct nlm_args *argp = rqstp->rq_argp; - u32 exclusive; - - if (!svcxdr_decode_cookie(xdr, &argp->cookie)) - return false; - if (xdr_stream_decode_bool(xdr, &argp->block) < 0) - return false; - if (xdr_stream_decode_bool(xdr, &exclusive) < 0) - return false; - if (!svcxdr_decode_lock(xdr, &argp->lock)) - return false; - if (exclusive) - argp->lock.fl.c.flc_type = F_WRLCK; - if (xdr_stream_decode_bool(xdr, &argp->reclaim) < 0) - return false; - if (xdr_stream_decode_u32(xdr, &argp->state) < 0) - return false; - argp->monitor = 1; /* monitor client by default */ - - return true; -} - -bool -nlm4svc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) -{ - struct nlm_args *argp = rqstp->rq_argp; - u32 exclusive; - - if (!svcxdr_decode_cookie(xdr, &argp->cookie)) - return false; - if (xdr_stream_decode_bool(xdr, &argp->block) < 0) - return false; - if (xdr_stream_decode_bool(xdr, &exclusive) < 0) - return false; - if (!svcxdr_decode_lock(xdr, &argp->lock)) - return false; - if (exclusive) - argp->lock.fl.c.flc_type = F_WRLCK; - - return true; -} - -bool -nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) -{ - struct nlm_args *argp = rqstp->rq_argp; - - if (!svcxdr_decode_cookie(xdr, &argp->cookie)) - return false; - if (!svcxdr_decode_lock(xdr, &argp->lock)) - return false; - 
argp->lock.fl.c.flc_type = F_UNLCK; - - return true; -} - -bool -nlm4svc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr) -{ - struct nlm_res *resp = rqstp->rq_argp; - - if (!svcxdr_decode_cookie(xdr, &resp->cookie)) - return false; - if (!svcxdr_decode_stats(xdr, &resp->status)) - return false; - - return true; -} - -bool -nlm4svc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr) -{ - struct nlm_reboot *argp = rqstp->rq_argp; - __be32 *p; - u32 len; - - if (xdr_stream_decode_u32(xdr, &len) < 0) - return false; - if (len > SM_MAXSTRLEN) - return false; - p = xdr_inline_decode(xdr, len); - if (!p) - return false; - argp->len = len; - argp->mon = (char *)p; - if (xdr_stream_decode_u32(xdr, &argp->state) < 0) - return false; - p = xdr_inline_decode(xdr, SM_PRIV_SIZE); - if (!p) - return false; - memcpy(&argp->priv.data, p, sizeof(argp->priv.data)); - - return true; -} - -bool -nlm4svc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) -{ - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_lock *lock = &argp->lock; - - if (!svcxdr_decode_cookie(xdr, &argp->cookie)) - return false; - if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len)) - return false; - if (!svcxdr_decode_fhandle(xdr, &lock->fh)) - return false; - if (!svcxdr_decode_owner(xdr, &lock->oh)) - return false; - /* XXX: Range checks are missing in the original code */ - if (xdr_stream_decode_u32(xdr, &argp->fsm_mode) < 0) - return false; - if (xdr_stream_decode_u32(xdr, &argp->fsm_access) < 0) - return false; - - return true; -} - -bool -nlm4svc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr) -{ - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_lock *lock = &argp->lock; - - if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len)) - return false; - if (xdr_stream_decode_u32(xdr, &argp->state) < 0) - return false; - - return true; -} - - -/* - * Encode Reply results - */ - -bool -nlm4svc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr) 
-{ - return true; -} - -bool -nlm4svc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr) -{ - struct nlm_res *resp = rqstp->rq_resp; - - return svcxdr_encode_cookie(xdr, &resp->cookie) && - svcxdr_encode_testrply(xdr, resp); -} - -bool -nlm4svc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr) -{ - struct nlm_res *resp = rqstp->rq_resp; - - return svcxdr_encode_cookie(xdr, &resp->cookie) && - svcxdr_encode_stats(xdr, resp->status); -} - -bool -nlm4svc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr) -{ - struct nlm_res *resp = rqstp->rq_resp; - - if (!svcxdr_encode_cookie(xdr, &resp->cookie)) - return false; - if (!svcxdr_encode_stats(xdr, resp->status)) - return false; - /* sequence */ - if (xdr_stream_encode_u32(xdr, 0) < 0) - return false; - - return true; -} diff --git a/fs/lockd/xdr4.h b/fs/lockd/xdr4.h deleted file mode 100644 index 4ddf51a2e0ea..000000000000 --- a/fs/lockd/xdr4.h +++ /dev/null @@ -1,33 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * XDR types for the NLM protocol - * - * Copyright (C) 1996 Olaf Kirch - */ - -#ifndef _LOCKD_XDR4_H -#define _LOCKD_XDR4_H - -/* error codes new to NLMv4 */ -#define nlm4_deadlock cpu_to_be32(NLM_DEADLCK) -#define nlm4_rofs cpu_to_be32(NLM_ROFS) -#define nlm4_stale_fh cpu_to_be32(NLM_STALE_FH) -#define nlm4_fbig cpu_to_be32(NLM_FBIG) -#define nlm4_failed cpu_to_be32(NLM_FAILED) - -bool nlm4svc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_shareargs(struct svc_rqst *rqstp, 
struct xdr_stream *xdr); -bool nlm4svc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr); - -bool nlm4svc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr); - -#endif /* _LOCKD_XDR4_H */ From 6b4f16a532e794e0df90baf15173e2166f863864 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 21 Feb 2026 13:39:59 -0500 Subject: [PATCH 56/83] sunrpc: Add XPT flags missing from SVC_XPRT_FLAG_LIST Commit eccbbc7c00a5 ("nfsd: don't use sv_nrthreads in connection limiting calculations.") and commit 898374fdd7f0 ("nfsd: unregister with rpcbind when deleting a transport") added new XPT flags but neglected to update the show_svc_xprt_flags() macro. Signed-off-by: Chuck Lever --- include/trace/events/sunrpc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 750ecce56930..ff855197880d 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1933,7 +1933,9 @@ TRACE_EVENT(svc_stats_latency, svc_xprt_flag(CONG_CTRL) \ svc_xprt_flag(HANDSHAKE) \ svc_xprt_flag(TLS_SESSION) \ - svc_xprt_flag_end(PEER_AUTH) + svc_xprt_flag(PEER_AUTH) \ + svc_xprt_flag(PEER_VALID) \ + svc_xprt_flag_end(RPCB_UNREG) #undef svc_xprt_flag #undef svc_xprt_flag_end From 17c1d66579ff27a7a8f2f407d1425272ff6fdd8c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 23 Feb 2026 12:09:59 -0500 Subject: [PATCH 57/83] sunrpc: convert queue_lock from global spinlock to per-cache-detail lock The global queue_lock serializes all upcall queue operations across every cache_detail instance. Convert it to a per-cache-detail spinlock so that different caches (e.g. auth.unix.ip vs nfsd.fh) no longer contend with each other on queue operations. 
Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/cache.h | 1 + net/sunrpc/cache.c | 47 ++++++++++++++++++------------------ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index e783132e481f..3d32dd1f7b05 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -113,6 +113,7 @@ struct cache_detail { /* fields for communication over channel */ struct list_head queue; + spinlock_t queue_lock; atomic_t writers; /* how many time is /channel open */ time64_t last_close; /* if no writers, when did last close */ diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 86b3fd5a429d..1cfaae488c6c 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -400,6 +400,7 @@ void sunrpc_init_cache_detail(struct cache_detail *cd) { spin_lock_init(&cd->hash_lock); INIT_LIST_HEAD(&cd->queue); + spin_lock_init(&cd->queue_lock); spin_lock(&cache_list_lock); cd->nextcheck = 0; cd->entries = 0; @@ -803,8 +804,6 @@ void cache_clean_deferred(void *owner) * */ -static DEFINE_SPINLOCK(queue_lock); - struct cache_queue { struct list_head list; int reader; /* if 0, then request */ @@ -847,7 +846,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, inode_lock(inode); /* protect against multiple concurrent * readers on this file */ again: - spin_lock(&queue_lock); + spin_lock(&cd->queue_lock); /* need to find next request */ while (rp->q.list.next != &cd->queue && list_entry(rp->q.list.next, struct cache_queue, list) @@ -856,7 +855,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, list_move(&rp->q.list, next); } if (rp->q.list.next == &cd->queue) { - spin_unlock(&queue_lock); + spin_unlock(&cd->queue_lock); inode_unlock(inode); WARN_ON_ONCE(rp->offset); return 0; @@ -865,7 +864,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, WARN_ON_ONCE(rq->q.reader); if (rp->offset == 
0) rq->readers++; - spin_unlock(&queue_lock); + spin_unlock(&cd->queue_lock); if (rq->len == 0) { err = cache_request(cd, rq); @@ -876,9 +875,9 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, if (rp->offset == 0 && !test_bit(CACHE_PENDING, &rq->item->flags)) { err = -EAGAIN; - spin_lock(&queue_lock); + spin_lock(&cd->queue_lock); list_move(&rp->q.list, &rq->q.list); - spin_unlock(&queue_lock); + spin_unlock(&cd->queue_lock); } else { if (rp->offset + count > rq->len) count = rq->len - rp->offset; @@ -888,26 +887,26 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, rp->offset += count; if (rp->offset >= rq->len) { rp->offset = 0; - spin_lock(&queue_lock); + spin_lock(&cd->queue_lock); list_move(&rp->q.list, &rq->q.list); - spin_unlock(&queue_lock); + spin_unlock(&cd->queue_lock); } err = 0; } out: if (rp->offset == 0) { /* need to release rq */ - spin_lock(&queue_lock); + spin_lock(&cd->queue_lock); rq->readers--; if (rq->readers == 0 && !test_bit(CACHE_PENDING, &rq->item->flags)) { list_del(&rq->q.list); - spin_unlock(&queue_lock); + spin_unlock(&cd->queue_lock); cache_put(rq->item, cd); kfree(rq->buf); kfree(rq); } else - spin_unlock(&queue_lock); + spin_unlock(&cd->queue_lock); } if (err == -EAGAIN) goto again; @@ -988,7 +987,7 @@ static __poll_t cache_poll(struct file *filp, poll_table *wait, if (!rp) return mask; - spin_lock(&queue_lock); + spin_lock(&cd->queue_lock); for (cq= &rp->q; &cq->list != &cd->queue; cq = list_entry(cq->list.next, struct cache_queue, list)) @@ -996,7 +995,7 @@ static __poll_t cache_poll(struct file *filp, poll_table *wait, mask |= EPOLLIN | EPOLLRDNORM; break; } - spin_unlock(&queue_lock); + spin_unlock(&cd->queue_lock); return mask; } @@ -1011,7 +1010,7 @@ static int cache_ioctl(struct inode *ino, struct file *filp, if (cmd != FIONREAD || !rp) return -EINVAL; - spin_lock(&queue_lock); + spin_lock(&cd->queue_lock); /* only find the length remaining in current request, * or 
the length of the next request @@ -1024,7 +1023,7 @@ static int cache_ioctl(struct inode *ino, struct file *filp, len = cr->len - rp->offset; break; } - spin_unlock(&queue_lock); + spin_unlock(&cd->queue_lock); return put_user(len, (int __user *)arg); } @@ -1046,9 +1045,9 @@ static int cache_open(struct inode *inode, struct file *filp, rp->offset = 0; rp->q.reader = 1; - spin_lock(&queue_lock); + spin_lock(&cd->queue_lock); list_add(&rp->q.list, &cd->queue); - spin_unlock(&queue_lock); + spin_unlock(&cd->queue_lock); } if (filp->f_mode & FMODE_WRITE) atomic_inc(&cd->writers); @@ -1064,7 +1063,7 @@ static int cache_release(struct inode *inode, struct file *filp, if (rp) { struct cache_request *rq = NULL; - spin_lock(&queue_lock); + spin_lock(&cd->queue_lock); if (rp->offset) { struct cache_queue *cq; for (cq = &rp->q; &cq->list != &cd->queue; @@ -1086,7 +1085,7 @@ static int cache_release(struct inode *inode, struct file *filp, rp->offset = 0; } list_del(&rp->q.list); - spin_unlock(&queue_lock); + spin_unlock(&cd->queue_lock); if (rq) { cache_put(rq->item, cd); @@ -1113,7 +1112,7 @@ static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch) struct cache_request *cr; LIST_HEAD(dequeued); - spin_lock(&queue_lock); + spin_lock(&detail->queue_lock); list_for_each_entry_safe(cq, tmp, &detail->queue, list) if (!cq->reader) { cr = container_of(cq, struct cache_request, q); @@ -1126,7 +1125,7 @@ static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch) continue; list_move(&cr->q.list, &dequeued); } - spin_unlock(&queue_lock); + spin_unlock(&detail->queue_lock); while (!list_empty(&dequeued)) { cr = list_entry(dequeued.next, struct cache_request, q.list); list_del(&cr->q.list); @@ -1251,7 +1250,7 @@ static int cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h) crq->buf = buf; crq->len = 0; crq->readers = 0; - spin_lock(&queue_lock); + spin_lock(&detail->queue_lock); if (test_bit(CACHE_PENDING, &h->flags)) { crq->item = 
cache_get(h); list_add_tail(&crq->q.list, &detail->queue); @@ -1259,7 +1258,7 @@ static int cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h) } else /* Lost a race, no longer PENDING, so don't enqueue */ ret = -EAGAIN; - spin_unlock(&queue_lock); + spin_unlock(&detail->queue_lock); wake_up(&queue_wait); if (ret == -EAGAIN) { kfree(buf); From 552d0e17ea042fc4f959c4543cbbd0e54de7a8e9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 23 Feb 2026 12:10:00 -0500 Subject: [PATCH 58/83] sunrpc: convert queue_wait from global to per-cache-detail waitqueue The queue_wait waitqueue is currently a file-scoped global, so a wake_up for one cache_detail wakes pollers on all caches. Convert it to a per-cache-detail field so that only pollers on the relevant cache are woken. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/cache.h | 2 ++ net/sunrpc/cache.c | 7 +++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 3d32dd1f7b05..031379efba24 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -16,6 +16,7 @@ #include #include #include +#include /* * Each cache requires: @@ -114,6 +115,7 @@ struct cache_detail { /* fields for communication over channel */ struct list_head queue; spinlock_t queue_lock; + wait_queue_head_t queue_wait; atomic_t writers; /* how many time is /channel open */ time64_t last_close; /* if no writers, when did last close */ diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 1cfaae488c6c..fd02dca1f07a 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -401,6 +401,7 @@ void sunrpc_init_cache_detail(struct cache_detail *cd) spin_lock_init(&cd->hash_lock); INIT_LIST_HEAD(&cd->queue); spin_lock_init(&cd->queue_lock); + init_waitqueue_head(&cd->queue_wait); spin_lock(&cache_list_lock); cd->nextcheck = 0; cd->entries = 0; @@ -970,8 +971,6 @@ static ssize_t cache_write(struct file *filp, 
const char __user *buf, return ret; } -static DECLARE_WAIT_QUEUE_HEAD(queue_wait); - static __poll_t cache_poll(struct file *filp, poll_table *wait, struct cache_detail *cd) { @@ -979,7 +978,7 @@ static __poll_t cache_poll(struct file *filp, poll_table *wait, struct cache_reader *rp = filp->private_data; struct cache_queue *cq; - poll_wait(filp, &queue_wait, wait); + poll_wait(filp, &cd->queue_wait, wait); /* alway allow write */ mask = EPOLLOUT | EPOLLWRNORM; @@ -1259,7 +1258,7 @@ static int cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h) /* Lost a race, no longer PENDING, so don't enqueue */ ret = -EAGAIN; spin_unlock(&detail->queue_lock); - wake_up(&queue_wait); + wake_up(&detail->queue_wait); if (ret == -EAGAIN) { kfree(buf); kfree(crq); From facc4e3c80420e3466003ce09b576e005b56a015 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 23 Feb 2026 12:10:01 -0500 Subject: [PATCH 59/83] sunrpc: split cache_detail queue into request and reader lists Replace the single interleaved queue (which mixed cache_request and cache_reader entries distinguished by a ->reader flag) with two dedicated lists: cd->requests for upcall requests and cd->readers for open file handles. Readers now track their position via a monotonically increasing sequence number (next_seqno) rather than by their position in the shared list. Each cache_request is assigned a seqno when enqueued, and a new cache_next_request() helper finds the next request at or after a given seqno. This eliminates the cache_queue wrapper struct entirely, simplifies the reader-skipping loops in cache_read/cache_poll/cache_ioctl/ cache_release, and makes the data flow easier to reason about. 
Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/cache.h | 4 +- net/sunrpc/cache.c | 143 +++++++++++++++-------------------- 2 files changed, 62 insertions(+), 85 deletions(-) diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 031379efba24..b1e595c2615b 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -113,9 +113,11 @@ struct cache_detail { int entries; /* fields for communication over channel */ - struct list_head queue; + struct list_head requests; + struct list_head readers; spinlock_t queue_lock; wait_queue_head_t queue_wait; + u64 next_seqno; atomic_t writers; /* how many time is /channel open */ time64_t last_close; /* if no writers, when did last close */ diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index fd02dca1f07a..7081c1214e6c 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -399,9 +399,11 @@ static struct delayed_work cache_cleaner; void sunrpc_init_cache_detail(struct cache_detail *cd) { spin_lock_init(&cd->hash_lock); - INIT_LIST_HEAD(&cd->queue); + INIT_LIST_HEAD(&cd->requests); + INIT_LIST_HEAD(&cd->readers); spin_lock_init(&cd->queue_lock); init_waitqueue_head(&cd->queue_wait); + cd->next_seqno = 0; spin_lock(&cache_list_lock); cd->nextcheck = 0; cd->entries = 0; @@ -796,29 +798,20 @@ void cache_clean_deferred(void *owner) * On read, you get a full request, or block. * On write, an update request is processed. * Poll works if anything to read, and always allows write. - * - * Implemented by linked list of requests. Each open file has - * a ->private that also exists in this list. New requests are added - * to the end and may wakeup and preceding readers. - * New readers are added to the head. If, on read, an item is found with - * CACHE_UPCALLING clear, we free it from the list. 
- * */ -struct cache_queue { - struct list_head list; - int reader; /* if 0, then request */ -}; struct cache_request { - struct cache_queue q; + struct list_head list; struct cache_head *item; - char * buf; + char *buf; int len; int readers; + u64 seqno; }; struct cache_reader { - struct cache_queue q; + struct list_head list; int offset; /* if non-0, we have a refcnt on next request */ + u64 next_seqno; }; static int cache_request(struct cache_detail *detail, @@ -833,6 +826,17 @@ static int cache_request(struct cache_detail *detail, return PAGE_SIZE - len; } +static struct cache_request * +cache_next_request(struct cache_detail *cd, u64 seqno) +{ + struct cache_request *rq; + + list_for_each_entry(rq, &cd->requests, list) + if (rq->seqno >= seqno) + return rq; + return NULL; +} + static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos, struct cache_detail *cd) { @@ -849,20 +853,13 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, again: spin_lock(&cd->queue_lock); /* need to find next request */ - while (rp->q.list.next != &cd->queue && - list_entry(rp->q.list.next, struct cache_queue, list) - ->reader) { - struct list_head *next = rp->q.list.next; - list_move(&rp->q.list, next); - } - if (rp->q.list.next == &cd->queue) { + rq = cache_next_request(cd, rp->next_seqno); + if (!rq) { spin_unlock(&cd->queue_lock); inode_unlock(inode); WARN_ON_ONCE(rp->offset); return 0; } - rq = container_of(rp->q.list.next, struct cache_request, q.list); - WARN_ON_ONCE(rq->q.reader); if (rp->offset == 0) rq->readers++; spin_unlock(&cd->queue_lock); @@ -876,9 +873,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, if (rp->offset == 0 && !test_bit(CACHE_PENDING, &rq->item->flags)) { err = -EAGAIN; - spin_lock(&cd->queue_lock); - list_move(&rp->q.list, &rq->q.list); - spin_unlock(&cd->queue_lock); + rp->next_seqno = rq->seqno + 1; } else { if (rp->offset + count > rq->len) count = rq->len - 
rp->offset; @@ -888,9 +883,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, rp->offset += count; if (rp->offset >= rq->len) { rp->offset = 0; - spin_lock(&cd->queue_lock); - list_move(&rp->q.list, &rq->q.list); - spin_unlock(&cd->queue_lock); + rp->next_seqno = rq->seqno + 1; } err = 0; } @@ -901,7 +894,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, rq->readers--; if (rq->readers == 0 && !test_bit(CACHE_PENDING, &rq->item->flags)) { - list_del(&rq->q.list); + list_del(&rq->list); spin_unlock(&cd->queue_lock); cache_put(rq->item, cd); kfree(rq->buf); @@ -976,7 +969,6 @@ static __poll_t cache_poll(struct file *filp, poll_table *wait, { __poll_t mask; struct cache_reader *rp = filp->private_data; - struct cache_queue *cq; poll_wait(filp, &cd->queue_wait, wait); @@ -988,12 +980,8 @@ static __poll_t cache_poll(struct file *filp, poll_table *wait, spin_lock(&cd->queue_lock); - for (cq= &rp->q; &cq->list != &cd->queue; - cq = list_entry(cq->list.next, struct cache_queue, list)) - if (!cq->reader) { - mask |= EPOLLIN | EPOLLRDNORM; - break; - } + if (cache_next_request(cd, rp->next_seqno)) + mask |= EPOLLIN | EPOLLRDNORM; spin_unlock(&cd->queue_lock); return mask; } @@ -1004,7 +992,7 @@ static int cache_ioctl(struct inode *ino, struct file *filp, { int len = 0; struct cache_reader *rp = filp->private_data; - struct cache_queue *cq; + struct cache_request *rq; if (cmd != FIONREAD || !rp) return -EINVAL; @@ -1014,14 +1002,9 @@ static int cache_ioctl(struct inode *ino, struct file *filp, /* only find the length remaining in current request, * or the length of the next request */ - for (cq= &rp->q; &cq->list != &cd->queue; - cq = list_entry(cq->list.next, struct cache_queue, list)) - if (!cq->reader) { - struct cache_request *cr = - container_of(cq, struct cache_request, q); - len = cr->len - rp->offset; - break; - } + rq = cache_next_request(cd, rp->next_seqno); + if (rq) + len = rq->len - rp->offset; 
spin_unlock(&cd->queue_lock); return put_user(len, (int __user *)arg); @@ -1042,10 +1025,10 @@ static int cache_open(struct inode *inode, struct file *filp, return -ENOMEM; } rp->offset = 0; - rp->q.reader = 1; + rp->next_seqno = 0; spin_lock(&cd->queue_lock); - list_add(&rp->q.list, &cd->queue); + list_add(&rp->list, &cd->readers); spin_unlock(&cd->queue_lock); } if (filp->f_mode & FMODE_WRITE) @@ -1064,26 +1047,21 @@ static int cache_release(struct inode *inode, struct file *filp, spin_lock(&cd->queue_lock); if (rp->offset) { - struct cache_queue *cq; - for (cq = &rp->q; &cq->list != &cd->queue; - cq = list_entry(cq->list.next, - struct cache_queue, list)) - if (!cq->reader) { - struct cache_request *cr = - container_of(cq, - struct cache_request, q); - cr->readers--; - if (cr->readers == 0 && - !test_bit(CACHE_PENDING, - &cr->item->flags)) { - list_del(&cr->q.list); - rq = cr; - } - break; + struct cache_request *cr; + + cr = cache_next_request(cd, rp->next_seqno); + if (cr) { + cr->readers--; + if (cr->readers == 0 && + !test_bit(CACHE_PENDING, + &cr->item->flags)) { + list_del(&cr->list); + rq = cr; } + } rp->offset = 0; } - list_del(&rp->q.list); + list_del(&rp->list); spin_unlock(&cd->queue_lock); if (rq) { @@ -1107,27 +1085,24 @@ static int cache_release(struct inode *inode, struct file *filp, static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch) { - struct cache_queue *cq, *tmp; - struct cache_request *cr; + struct cache_request *cr, *tmp; LIST_HEAD(dequeued); spin_lock(&detail->queue_lock); - list_for_each_entry_safe(cq, tmp, &detail->queue, list) - if (!cq->reader) { - cr = container_of(cq, struct cache_request, q); - if (cr->item != ch) - continue; - if (test_bit(CACHE_PENDING, &ch->flags)) - /* Lost a race and it is pending again */ - break; - if (cr->readers != 0) - continue; - list_move(&cr->q.list, &dequeued); - } + list_for_each_entry_safe(cr, tmp, &detail->requests, list) { + if (cr->item != ch) + continue; + if 
(test_bit(CACHE_PENDING, &ch->flags)) + /* Lost a race and it is pending again */ + break; + if (cr->readers != 0) + continue; + list_move(&cr->list, &dequeued); + } spin_unlock(&detail->queue_lock); while (!list_empty(&dequeued)) { - cr = list_entry(dequeued.next, struct cache_request, q.list); - list_del(&cr->q.list); + cr = list_entry(dequeued.next, struct cache_request, list); + list_del(&cr->list); cache_put(cr->item, detail); kfree(cr->buf); kfree(cr); @@ -1245,14 +1220,14 @@ static int cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h) return -EAGAIN; } - crq->q.reader = 0; crq->buf = buf; crq->len = 0; crq->readers = 0; spin_lock(&detail->queue_lock); if (test_bit(CACHE_PENDING, &h->flags)) { crq->item = cache_get(h); - list_add_tail(&crq->q.list, &detail->queue); + crq->seqno = detail->next_seqno++; + list_add_tail(&crq->list, &detail->requests); trace_cache_entry_upcall(detail, h); } else /* Lost a race, no longer PENDING, so don't enqueue */ From 8be12e0cf21110f1e0b7fd21711ff13fb75bee72 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 24 Feb 2026 08:28:11 -0500 Subject: [PATCH 60/83] nfsd: convert global state_lock to per-net deleg_lock Replace the global state_lock spinlock with a per-nfsd_net deleg_lock. The state_lock was only used to protect delegation lifecycle operations (the del_recall_lru list and delegation hash/unhash), all of which are scoped to a single network namespace. Making the lock per-net removes a source of unnecessary contention between containers. 
Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/netns.h | 3 +++ fs/nfsd/nfs4state.c | 62 +++++++++++++++++++++++---------------------- fs/nfsd/state.h | 2 +- 3 files changed, 36 insertions(+), 31 deletions(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 9fa600602658..3a89d4708e8a 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -99,6 +99,9 @@ struct nfsd_net { */ struct list_head client_lru; struct list_head close_lru; + + /* protects del_recall_lru and delegation hash/unhash */ + spinlock_t deleg_lock ____cacheline_aligned; struct list_head del_recall_lru; /* protected by blocked_locks_lock */ diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1d31f2bb2162..ba49f49bb93b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -93,13 +93,6 @@ static void deleg_reaper(struct nfsd_net *nn); /* Locking: */ -/* - * Currently used for the del_recall_lru and file hash table. In an - * effort to decrease the scope of the client_mutex, this spinlock may - * eventually cover more: - */ -static DEFINE_SPINLOCK(state_lock); - enum nfsd4_st_mutex_lock_subclass { OPEN_STATEID_MUTEX = 0, LOCK_STATEID_MUTEX = 1, @@ -1295,8 +1288,9 @@ nfs4_delegation_exists(struct nfs4_client *clp, struct nfs4_file *fp) { struct nfs4_delegation *searchdp = NULL; struct nfs4_client *searchclp = NULL; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); - lockdep_assert_held(&state_lock); + lockdep_assert_held(&nn->deleg_lock); lockdep_assert_held(&fp->fi_lock); list_for_each_entry(searchdp, &fp->fi_delegations, dl_perfile) { @@ -1325,8 +1319,9 @@ static int hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) { struct nfs4_client *clp = dp->dl_stid.sc_client; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); - lockdep_assert_held(&state_lock); + lockdep_assert_held(&nn->deleg_lock); lockdep_assert_held(&fp->fi_lock); lockdep_assert_held(&clp->cl_lock); @@ -1348,8 +1343,10 @@ static bool 
unhash_delegation_locked(struct nfs4_delegation *dp, unsigned short statusmask) { struct nfs4_file *fp = dp->dl_stid.sc_file; + struct nfsd_net *nn = net_generic(dp->dl_stid.sc_client->net, + nfsd_net_id); - lockdep_assert_held(&state_lock); + lockdep_assert_held(&nn->deleg_lock); if (!delegation_hashed(dp)) return false; @@ -1374,10 +1371,12 @@ unhash_delegation_locked(struct nfs4_delegation *dp, unsigned short statusmask) static void destroy_delegation(struct nfs4_delegation *dp) { bool unhashed; + struct nfsd_net *nn = net_generic(dp->dl_stid.sc_client->net, + nfsd_net_id); - spin_lock(&state_lock); + spin_lock(&nn->deleg_lock); unhashed = unhash_delegation_locked(dp, SC_STATUS_CLOSED); - spin_unlock(&state_lock); + spin_unlock(&nn->deleg_lock); if (unhashed) destroy_unhashed_deleg(dp); } @@ -1840,11 +1839,11 @@ void nfsd4_revoke_states(struct nfsd_net *nn, struct super_block *sb) case SC_TYPE_DELEG: refcount_inc(&stid->sc_count); dp = delegstateid(stid); - spin_lock(&state_lock); + spin_lock(&nn->deleg_lock); if (!unhash_delegation_locked( dp, SC_STATUS_ADMIN_REVOKED)) dp = NULL; - spin_unlock(&state_lock); + spin_unlock(&nn->deleg_lock); if (dp) revoke_delegation(dp); break; @@ -2510,13 +2509,13 @@ __destroy_client(struct nfs4_client *clp) struct nfs4_delegation *dp; LIST_HEAD(reaplist); - spin_lock(&state_lock); + spin_lock(&nn->deleg_lock); while (!list_empty(&clp->cl_delegations)) { dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); unhash_delegation_locked(dp, SC_STATUS_CLOSED); list_add(&dp->dl_recall_lru, &reaplist); } - spin_unlock(&state_lock); + spin_unlock(&nn->deleg_lock); while (!list_empty(&reaplist)) { dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); @@ -5427,12 +5426,12 @@ static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb) * If the dl_time != 0, then we know that it has already been * queued for a lease break. Don't queue it again. 
*/ - spin_lock(&state_lock); + spin_lock(&nn->deleg_lock); if (delegation_hashed(dp) && dp->dl_time == 0) { dp->dl_time = ktime_get_boottime_seconds(); list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru); } - spin_unlock(&state_lock); + spin_unlock(&nn->deleg_lock); } static int nfsd4_cb_recall_done(struct nfsd4_callback *cb, @@ -6064,6 +6063,7 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, { bool deleg_ts = nfsd4_want_deleg_timestamps(open); struct nfs4_client *clp = stp->st_stid.sc_client; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct nfs4_file *fp = stp->st_stid.sc_file; struct nfs4_clnt_odstate *odstate = stp->st_clnt_odstate; struct nfs4_delegation *dp; @@ -6123,7 +6123,7 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, return ERR_PTR(-EOPNOTSUPP); } - spin_lock(&state_lock); + spin_lock(&nn->deleg_lock); spin_lock(&fp->fi_lock); if (nfs4_delegation_exists(clp, fp)) status = -EAGAIN; @@ -6138,7 +6138,7 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, } else fp->fi_delegees++; spin_unlock(&fp->fi_lock); - spin_unlock(&state_lock); + spin_unlock(&nn->deleg_lock); if (nf) nfsd_file_put(nf); if (status) @@ -6182,13 +6182,13 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, if (fp->fi_had_conflict) goto out_unlock; - spin_lock(&state_lock); + spin_lock(&nn->deleg_lock); spin_lock(&clp->cl_lock); spin_lock(&fp->fi_lock); status = hash_delegation_locked(dp, fp); spin_unlock(&fp->fi_lock); spin_unlock(&clp->cl_lock); - spin_unlock(&state_lock); + spin_unlock(&nn->deleg_lock); if (status) goto out_unlock; @@ -6964,7 +6964,7 @@ nfs4_laundromat(struct nfsd_net *nn) nfs40_clean_admin_revoked(nn, <); - spin_lock(&state_lock); + spin_lock(&nn->deleg_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); if (!state_expired(<, dp->dl_time)) @@ -6973,7 +6973,7 @@ 
nfs4_laundromat(struct nfsd_net *nn) unhash_delegation_locked(dp, SC_STATUS_REVOKED); list_add(&dp->dl_recall_lru, &reaplist); } - spin_unlock(&state_lock); + spin_unlock(&nn->deleg_lock); while (!list_empty(&reaplist)) { dp = list_first_entry(&reaplist, struct nfs4_delegation, dl_recall_lru); @@ -8996,6 +8996,7 @@ static int nfs4_state_create_net(struct net *net) INIT_LIST_HEAD(&nn->client_lru); INIT_LIST_HEAD(&nn->close_lru); INIT_LIST_HEAD(&nn->del_recall_lru); + spin_lock_init(&nn->deleg_lock); spin_lock_init(&nn->client_lock); spin_lock_init(&nn->s2s_cp_lock); idr_init(&nn->s2s_cp_stateids); @@ -9127,13 +9128,13 @@ nfs4_state_shutdown_net(struct net *net) locks_end_grace(&nn->nfsd4_manager); INIT_LIST_HEAD(&reaplist); - spin_lock(&state_lock); + spin_lock(&nn->deleg_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); unhash_delegation_locked(dp, SC_STATUS_CLOSED); list_add(&dp->dl_recall_lru, &reaplist); } - spin_unlock(&state_lock); + spin_unlock(&nn->deleg_lock); list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); @@ -9456,6 +9457,7 @@ nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate, struct nfsd_file *nf) { struct nfs4_client *clp = cstate->clp; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct nfs4_delegation *dp; struct file_lease *fl; struct nfs4_file *fp, *rfp; @@ -9479,7 +9481,7 @@ nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate, } /* if this client already has one, return that it's unavailable */ - spin_lock(&state_lock); + spin_lock(&nn->deleg_lock); spin_lock(&fp->fi_lock); /* existing delegation? 
*/ if (nfs4_delegation_exists(clp, fp)) { @@ -9491,7 +9493,7 @@ nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate, ++fp->fi_delegees; } spin_unlock(&fp->fi_lock); - spin_unlock(&state_lock); + spin_unlock(&nn->deleg_lock); if (status) { put_nfs4_file(fp); @@ -9520,13 +9522,13 @@ nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate, * trying to set a delegation on the same file. If that happens, * then just say UNAVAIL. */ - spin_lock(&state_lock); + spin_lock(&nn->deleg_lock); spin_lock(&clp->cl_lock); spin_lock(&fp->fi_lock); status = hash_delegation_locked(dp, fp); spin_unlock(&fp->fi_lock); spin_unlock(&clp->cl_lock); - spin_unlock(&state_lock); + spin_unlock(&nn->deleg_lock); if (!status) { put_nfs4_file(fp); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index ec1c5467012e..3159c7b67f50 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -123,7 +123,7 @@ struct nfs4_stid { #define SC_TYPE_LAYOUT BIT(3) unsigned short sc_type; -/* state_lock protects sc_status for delegation stateids. +/* nn->deleg_lock protects sc_status for delegation stateids. * ->cl_lock protects sc_status for open and lock stateids. * ->st_mutex also protect sc_status for open stateids. * ->ls_lock protects sc_status for layout stateids. From 116b6b7acdd82605ed530232cd7509d1b5282f5c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 24 Feb 2026 12:10:34 -0500 Subject: [PATCH 61/83] nfsd: use dynamic allocation for oversized NFSv4.0 replay cache Commit 1e8e9913672a ("nfsd: fix heap overflow in NFSv4.0 LOCK replay cache") capped the replay cache copy at NFSD4_REPLAY_ISIZE to prevent a heap overflow, but set rp_buflen to zero when the encoded response exceeded the inline buffer. A retransmitted LOCK reaching the replay path then produced only a status code with no operation body, resulting in a malformed XDR response. When the encoded response exceeds the 112-byte inline rp_ibuf, a buffer is kmalloc'd to hold it. 
If the allocation fails, rp_buflen remains zero, preserving the behavior from the capped-copy fix. The buffer is freed when the stateowner is released or when a subsequent operation's response fits in the inline buffer. Fixes: 1e8e9913672a ("nfsd: fix heap overflow in NFSv4.0 LOCK replay cache") Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 16 ++++++++++++++++ fs/nfsd/nfs4xdr.c | 23 ++++++++++++++++------- fs/nfsd/state.h | 12 +++++++----- 3 files changed, 39 insertions(+), 12 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ba49f49bb93b..b4d0e82b2690 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1496,8 +1496,24 @@ release_all_access(struct nfs4_ol_stateid *stp) } } +/** + * nfs4_replay_free_cache - release dynamically allocated replay buffer + * @rp: replay cache to reset + * + * If @rp->rp_buf points to a kmalloc'd buffer, free it and reset + * rp_buf to the inline rp_ibuf. Always zeroes rp_buflen. + */ +void nfs4_replay_free_cache(struct nfs4_replay *rp) +{ + if (rp->rp_buf != rp->rp_ibuf) + kfree(rp->rp_buf); + rp->rp_buf = rp->rp_ibuf; + rp->rp_buflen = 0; +} + static inline void nfs4_free_stateowner(struct nfs4_stateowner *sop) { + nfs4_replay_free_cache(&sop->so_replay); kfree(sop->so_owner.data); sop->so_ops->so_free(sop); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 690f7a3122ec..2a0946c630e1 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -6282,14 +6282,23 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) int len = xdr->buf->len - (op_status_offset + XDR_UNIT); so->so_replay.rp_status = op->status; - if (len <= NFSD4_REPLAY_ISIZE) { - so->so_replay.rp_buflen = len; - read_bytes_from_xdr_buf(xdr->buf, - op_status_offset + XDR_UNIT, - so->so_replay.rp_buf, len); - } else { - so->so_replay.rp_buflen = 0; + if (len > NFSD4_REPLAY_ISIZE) { + char *buf = kmalloc(len, GFP_KERNEL); + + nfs4_replay_free_cache(&so->so_replay); + if (buf) { + so->so_replay.rp_buf = 
buf; + } else { + /* rp_buflen already zeroed; skip caching */ + goto status; + } + } else if (so->so_replay.rp_buf != so->so_replay.rp_ibuf) { + nfs4_replay_free_cache(&so->so_replay); } + so->so_replay.rp_buflen = len; + read_bytes_from_xdr_buf(xdr->buf, + op_status_offset + XDR_UNIT, + so->so_replay.rp_buf, len); } status: op->status = nfsd4_map_status(op->status, diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 3159c7b67f50..9b05462da4cc 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -554,10 +554,10 @@ struct nfs4_client_reclaim { * ~32(deleg. ace) = 112 bytes * * Some responses can exceed this. A LOCK denial includes the conflicting - * lock owner, which can be up to 1024 bytes (NFS4_OPAQUE_LIMIT). Responses - * larger than REPLAY_ISIZE are not cached in rp_ibuf; only rp_status is - * saved. Enlarging this constant increases the size of every - * nfs4_stateowner. + * lock owner, which can be up to 1024 bytes (NFS4_OPAQUE_LIMIT). When a + * response exceeds REPLAY_ISIZE, a buffer is dynamically allocated. If + * that allocation fails, only rp_status is saved. Enlarging this constant + * increases the size of every nfs4_stateowner. */ #define NFSD4_REPLAY_ISIZE 112 @@ -569,12 +569,14 @@ struct nfs4_client_reclaim { struct nfs4_replay { __be32 rp_status; unsigned int rp_buflen; - char *rp_buf; + char *rp_buf; /* rp_ibuf or kmalloc'd */ struct knfsd_fh rp_openfh; int rp_locked; char rp_ibuf[NFSD4_REPLAY_ISIZE]; }; +extern void nfs4_replay_free_cache(struct nfs4_replay *rp); + struct nfs4_stateowner; struct nfs4_stateowner_operations { From 62346217fd722510c3551858ad7d0fcfab8cce7e Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 25 Feb 2026 07:51:36 -0500 Subject: [PATCH 62/83] NFSD: Add a key for signing filehandles A future patch will enable NFSD to sign filehandles by appending a Message Authentication Code(MAC). To do this, NFSD requires a secret 128-bit key that can persist across reboots. 
A persisted key allows the server to accept filehandles after a restart. Enable NFSD to be configured with this key via the netlink interface. Link: https://lore.kernel.org/linux-nfs/cover.1772022373.git.bcodding@hammerspace.com Signed-off-by: Benjamin Coddington Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- Documentation/netlink/specs/nfsd.yaml | 6 +++++ fs/nfsd/netlink.c | 5 ++-- fs/nfsd/netns.h | 1 + fs/nfsd/nfsctl.c | 38 ++++++++++++++++++++++++++- fs/nfsd/trace.h | 22 ++++++++++++++++ include/uapi/linux/nfsd_netlink.h | 1 + 6 files changed, 70 insertions(+), 3 deletions(-) diff --git a/Documentation/netlink/specs/nfsd.yaml b/Documentation/netlink/specs/nfsd.yaml index f87b5a05e5e9..8ab43c8253b2 100644 --- a/Documentation/netlink/specs/nfsd.yaml +++ b/Documentation/netlink/specs/nfsd.yaml @@ -81,6 +81,11 @@ attribute-sets: - name: min-threads type: u32 + - + name: fh-key + type: binary + checks: + exact-len: 16 - name: version attributes: @@ -163,6 +168,7 @@ operations: - leasetime - scope - min-threads + - fh-key - name: threads-get doc: get the maximum number of running threads diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c index 887525964451..81c943345d13 100644 --- a/fs/nfsd/netlink.c +++ b/fs/nfsd/netlink.c @@ -24,12 +24,13 @@ const struct nla_policy nfsd_version_nl_policy[NFSD_A_VERSION_ENABLED + 1] = { }; /* NFSD_CMD_THREADS_SET - do */ -static const struct nla_policy nfsd_threads_set_nl_policy[NFSD_A_SERVER_MIN_THREADS + 1] = { +static const struct nla_policy nfsd_threads_set_nl_policy[NFSD_A_SERVER_FH_KEY + 1] = { [NFSD_A_SERVER_THREADS] = { .type = NLA_U32, }, [NFSD_A_SERVER_GRACETIME] = { .type = NLA_U32, }, [NFSD_A_SERVER_LEASETIME] = { .type = NLA_U32, }, [NFSD_A_SERVER_SCOPE] = { .type = NLA_NUL_STRING, }, [NFSD_A_SERVER_MIN_THREADS] = { .type = NLA_U32, }, + [NFSD_A_SERVER_FH_KEY] = NLA_POLICY_EXACT_LEN(16), }; /* NFSD_CMD_VERSION_SET - do */ @@ -58,7 +59,7 @@ static const struct genl_split_ops nfsd_nl_ops[] = { .cmd = 
NFSD_CMD_THREADS_SET, .doit = nfsd_nl_threads_set_doit, .policy = nfsd_threads_set_nl_policy, - .maxattr = NFSD_A_SERVER_MIN_THREADS, + .maxattr = NFSD_A_SERVER_FH_KEY, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 3a89d4708e8a..6ad3fe5d7e12 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -227,6 +227,7 @@ struct nfsd_net { spinlock_t local_clients_lock; struct list_head local_clients; #endif + siphash_key_t *fh_key; }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 0bf01ae411c5..20ec00f323b4 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1581,6 +1581,32 @@ int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb, return ret; } +/** + * nfsd_nl_fh_key_set - helper to copy fh_key from userspace + * @attr: nlattr NFSD_A_SERVER_FH_KEY + * @nn: nfsd_net + * + * Callers should hold nfsd_mutex, returns 0 on success or negative errno. + * Callers must ensure the server is shut down (sv_nrthreads == 0), + * userspace documentation asserts the key may only be set when the server + * is not running. 
+ */ +static int nfsd_nl_fh_key_set(const struct nlattr *attr, struct nfsd_net *nn) +{ + siphash_key_t *fh_key = nn->fh_key; + + if (!fh_key) { + fh_key = kmalloc(sizeof(siphash_key_t), GFP_KERNEL); + if (!fh_key) + return -ENOMEM; + nn->fh_key = fh_key; + } + + fh_key->key[0] = get_unaligned_le64(nla_data(attr)); + fh_key->key[1] = get_unaligned_le64(nla_data(attr) + 8); + return 0; +} + /** * nfsd_nl_threads_set_doit - set the number of running threads * @skb: reply buffer @@ -1622,7 +1648,8 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NFSD_A_SERVER_GRACETIME] || info->attrs[NFSD_A_SERVER_LEASETIME] || - info->attrs[NFSD_A_SERVER_SCOPE]) { + info->attrs[NFSD_A_SERVER_SCOPE] || + info->attrs[NFSD_A_SERVER_FH_KEY]) { ret = -EBUSY; if (nn->nfsd_serv && nn->nfsd_serv->sv_nrthreads) goto out_unlock; @@ -1651,6 +1678,14 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info) attr = info->attrs[NFSD_A_SERVER_SCOPE]; if (attr) scope = nla_data(attr); + + attr = info->attrs[NFSD_A_SERVER_FH_KEY]; + if (attr) { + ret = nfsd_nl_fh_key_set(attr, nn); + trace_nfsd_ctl_fh_key_set((const char *)nn->fh_key, ret); + if (ret) + goto out_unlock; + } } attr = info->attrs[NFSD_A_SERVER_MIN_THREADS]; @@ -2237,6 +2272,7 @@ static __net_exit void nfsd_net_exit(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); + kfree_sensitive(nn->fh_key); nfsd_proc_stat_shutdown(net); percpu_counter_destroy_many(nn->counter, NFSD_STATS_COUNTERS_NUM); nfsd_idmap_shutdown(net); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index d1d0b0dd0545..185a998996a0 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -2240,6 +2240,28 @@ TRACE_EVENT(nfsd_end_grace, ) ); +TRACE_EVENT(nfsd_ctl_fh_key_set, + TP_PROTO( + const char *key, + int result + ), + TP_ARGS(key, result), + TP_STRUCT__entry( + __field(u32, key_hash) + __field(int, result) + ), + TP_fast_assign( + if (key) + __entry->key_hash = ~crc32_le(0xFFFFFFFF, key, 
16); + else + __entry->key_hash = 0; + __entry->result = result; + ), + TP_printk("key=0x%08x result=%d", + __entry->key_hash, __entry->result + ) +); + DECLARE_EVENT_CLASS(nfsd_copy_class, TP_PROTO( const struct nfsd4_copy *copy diff --git a/include/uapi/linux/nfsd_netlink.h b/include/uapi/linux/nfsd_netlink.h index e9efbc9e63d8..97c7447f4d14 100644 --- a/include/uapi/linux/nfsd_netlink.h +++ b/include/uapi/linux/nfsd_netlink.h @@ -36,6 +36,7 @@ enum { NFSD_A_SERVER_LEASETIME, NFSD_A_SERVER_SCOPE, NFSD_A_SERVER_MIN_THREADS, + NFSD_A_SERVER_FH_KEY, __NFSD_A_SERVER_MAX, NFSD_A_SERVER_MAX = (__NFSD_A_SERVER_MAX - 1) From a002ad8a9bc89c084bc40933065c88336700837e Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 25 Feb 2026 07:51:37 -0500 Subject: [PATCH 63/83] NFSD/export: Add sign_fh export option In order to signal that filehandles on this export should be signed, add a "sign_fh" export option. Filehandle signing can help the server defend against certain filehandle guessing attacks. Setting the "sign_fh" export option sets NFSEXP_SIGN_FH. In a future patch NFSD uses this signal to append a MAC onto filehandles for that export. While we're in here, tidy a few stray expflags to more closely align to the export flag order. 
Link: https://lore.kernel.org/linux-nfs/cover.1772022373.git.bcodding@hammerspace.com Signed-off-by: Benjamin Coddington Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/export.c | 5 +++-- include/uapi/linux/nfsd/export.h | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 8e8a76a44ff0..7f4a51b832ef 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1362,13 +1362,14 @@ static struct flags { { NFSEXP_ASYNC, {"async", "sync"}}, { NFSEXP_GATHERED_WRITES, {"wdelay", "no_wdelay"}}, { NFSEXP_NOREADDIRPLUS, {"nordirplus", ""}}, + { NFSEXP_SECURITY_LABEL, {"security_label", ""}}, + { NFSEXP_SIGN_FH, {"sign_fh", ""}}, { NFSEXP_NOHIDE, {"nohide", ""}}, - { NFSEXP_CROSSMOUNT, {"crossmnt", ""}}, { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, + { NFSEXP_CROSSMOUNT, {"crossmnt", ""}}, { NFSEXP_V4ROOT, {"v4root", ""}}, { NFSEXP_PNFS, {"pnfs", ""}}, - { NFSEXP_SECURITY_LABEL, {"security_label", ""}}, { 0, {"", ""}} }; diff --git a/include/uapi/linux/nfsd/export.h b/include/uapi/linux/nfsd/export.h index a73ca3703abb..de647cf166c3 100644 --- a/include/uapi/linux/nfsd/export.h +++ b/include/uapi/linux/nfsd/export.h @@ -34,7 +34,7 @@ #define NFSEXP_GATHERED_WRITES 0x0020 #define NFSEXP_NOREADDIRPLUS 0x0040 #define NFSEXP_SECURITY_LABEL 0x0080 -/* 0x100 currently unused */ +#define NFSEXP_SIGN_FH 0x0100 #define NFSEXP_NOHIDE 0x0200 #define NFSEXP_NOSUBTREECHECK 0x0400 #define NFSEXP_NOAUTHNLM 0x0800 /* Don't authenticate NLM requests - just trust */ @@ -55,7 +55,7 @@ #define NFSEXP_PNFS 0x20000 /* All flags that we claim to support. (Note we don't support NOACL.) 
*/ -#define NFSEXP_ALLFLAGS 0x3FEFF +#define NFSEXP_ALLFLAGS 0x3FFFF /* The flags that may vary depending on security flavor: */ #define NFSEXP_SECINFO_FLAGS (NFSEXP_READONLY | NFSEXP_ROOTSQUASH \ From 2a83ffc5575013784ea41739daf9e10200e44e7c Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 25 Feb 2026 07:51:38 -0500 Subject: [PATCH 64/83] NFSD: Sign filehandles NFS clients may bypass restrictive directory permissions by using open_by_handle() (or other available OS system call) to guess the filehandles for files below that directory. In order to harden knfsd servers against this attack, create a method to sign and verify filehandles using SipHash-2-4 as a MAC (Message Authentication Code). According to https://cr.yp.to/siphash/siphash-20120918.pdf, SipHash can be used as a MAC, and our use of SipHash-2-4 provides a low 1 in 2^64 chance of forgery. Filehandles that have been signed cannot be tampered with, nor can clients reasonably guess correct filehandles and hashes that may exist in parts of the filesystem they cannot access due to directory permissions. Append the 8 byte SipHash to encoded filehandles for exports that have set the "sign_fh" export option. Filehandles received from clients are verified by comparing the appended hash to the expected hash. If the MAC does not match the server responds with NFS error _STALE. If unsigned filehandles are received for an export with "sign_fh" they are rejected with NFS error _STALE. 
Signed-off-by: Benjamin Coddington Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- Documentation/filesystems/nfs/exporting.rst | 85 +++++++++++++++++++++ fs/nfsd/Kconfig | 2 +- fs/nfsd/nfsfh.c | 74 +++++++++++++++++- fs/nfsd/trace.h | 1 + 4 files changed, 157 insertions(+), 5 deletions(-) diff --git a/Documentation/filesystems/nfs/exporting.rst b/Documentation/filesystems/nfs/exporting.rst index a01d9b9b5bc3..4aa59b0bf253 100644 --- a/Documentation/filesystems/nfs/exporting.rst +++ b/Documentation/filesystems/nfs/exporting.rst @@ -206,3 +206,88 @@ following flags are defined: all of an inode's dirty data on last close. Exports that behave this way should set EXPORT_OP_FLUSH_ON_CLOSE so that NFSD knows to skip waiting for writeback when closing such files. + +Signed Filehandles +------------------ + +To protect against filehandle guessing attacks, the Linux NFS server can be +configured to sign filehandles with a Message Authentication Code (MAC). + +Standard NFS filehandles are often predictable. If an attacker can guess +a valid filehandle for a file they do not have permission to access via +directory traversal, they may be able to bypass path-based permissions +(though they still remain subject to inode-level permissions). + +Signed filehandles prevent this by appending a MAC to the filehandle +before it is sent to the client. Upon receiving a filehandle back from a +client, the server re-calculates the MAC using its internal key and +verifies it against the one provided. If the signatures do not match, +the server treats the filehandle as invalid (returning NFS[34]ERR_STALE). + +Note that signing filehandles provides integrity and authenticity but +not confidentiality. The contents of the filehandle remain visible to +the client; they simply cannot be forged or modified. + +Configuration +~~~~~~~~~~~~~ + +To enable signed filehandles, the administrator must provide a signing +key to the kernel and enable the "sign_fh" export option. + +1. 
Providing a Key + The signing key is managed via the nfsd netlink interface. This key + is per-network-namespace and must be set before any exports using + "sign_fh" become active. + +2. Export Options + The feature is controlled on a per-export basis in /etc/exports: + + sign_fh + Enables signing for all filehandles generated under this export. + + no_sign_fh + (Default) Disables signing. + +Key Management and Rotation +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The security of this mechanism relies entirely on the secrecy of the +signing key. + +Initial Setup: + The key should be generated using a high-quality random source and + loaded early in the boot process or during the nfs-server startup + sequence. + +Changing Keys: + If a key is changed while clients have active mounts, existing + filehandles held by those clients will become invalid, resulting in + "Stale file handle" errors on the client side. + +Safe Rotation: + Currently, there is no mechanism for "graceful" key rotation + (maintaining multiple valid keys). Changing the key is an atomic + operation that immediately invalidates all previous signatures. + +Transitioning Exports +~~~~~~~~~~~~~~~~~~~~~ + +When adding or removing the "sign_fh" flag from an active export, the +following behaviors should be expected: + ++-------------------+---------------------------------------------------+ +| Change | Result for Existing Clients | ++===================+===================================================+ +| Adding sign_fh | Clients holding unsigned filehandles will find | +| | them rejected, as the server now expects a | +| | signature. | ++-------------------+---------------------------------------------------+ +| Removing sign_fh | Clients holding signed filehandles will find them | +| | rejected, as the server now expects the | +| | filehandle to end at its traditional boundary | +| | without a MAC. 
| ++-------------------+---------------------------------------------------+ + +Because filehandles are often cached persistently by clients, adding or +removing this option should generally be done during a scheduled maintenance +window involving an NFS client unmount/remount. diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index fc0e87eaa257..ffb76761d6a8 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -7,6 +7,7 @@ config NFSD select CRC32 select CRYPTO_LIB_MD5 if NFSD_LEGACY_CLIENT_TRACKING select CRYPTO_LIB_SHA256 if NFSD_V4 + select CRYPTO # required by RPCSEC_GSS_KRB5 and signed filehandles select LOCKD select SUNRPC select EXPORTFS @@ -78,7 +79,6 @@ config NFSD_V4 depends on NFSD && PROC_FS select FS_POSIX_ACL select RPCSEC_GSS_KRB5 - select CRYPTO # required by RPCSEC_GSS_KRB5 select GRACE_PERIOD select NFS_V4_2_SSC_HELPER if NFS_V4_2 help diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 68b629fbaaeb..bce8784aa92e 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -11,6 +11,7 @@ #include #include +#include #include "nfsd.h" #include "vfs.h" #include "auth.h" @@ -140,6 +141,57 @@ static inline __be32 check_pseudo_root(struct dentry *dentry, return nfs_ok; } +/* Size of a file handle MAC, in 4-octet words */ +#define FH_MAC_WORDS (sizeof(__le64) / 4) + +static bool fh_append_mac(struct svc_fh *fhp, struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct knfsd_fh *fh = &fhp->fh_handle; + siphash_key_t *fh_key = nn->fh_key; + __le64 hash; + + if (!fh_key) + goto out_no_key; + if (fh->fh_size + sizeof(hash) > fhp->fh_maxsize) + goto out_no_space; + + hash = cpu_to_le64(siphash(&fh->fh_raw, fh->fh_size, fh_key)); + memcpy(&fh->fh_raw[fh->fh_size], &hash, sizeof(hash)); + fh->fh_size += sizeof(hash); + return true; + +out_no_key: + pr_warn_ratelimited("NFSD: unable to sign filehandles, fh_key not set.\n"); + return false; + +out_no_space: + pr_warn_ratelimited("NFSD: unable to sign filehandles, fh_size %zu would be greater 
than fh_maxsize %d.\n", + fh->fh_size + sizeof(hash), fhp->fh_maxsize); + return false; +} + +/* + * Verify that the filehandle's MAC was hashed from this filehandle + * given the server's fh_key: + */ +static bool fh_verify_mac(struct svc_fh *fhp, struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct knfsd_fh *fh = &fhp->fh_handle; + siphash_key_t *fh_key = nn->fh_key; + __le64 hash; + + if (!fh_key) { + pr_warn_ratelimited("NFSD: unable to verify signed filehandles, fh_key not set.\n"); + return false; + } + + hash = cpu_to_le64(siphash(&fh->fh_raw, fh->fh_size - sizeof(hash), fh_key)); + return crypto_memneq(&fh->fh_raw[fh->fh_size - sizeof(hash)], + &hash, sizeof(hash)) == 0; +} + /* * Use the given filehandle to look up the corresponding export and * dentry. On success, the results are used to set fh_export and @@ -236,13 +288,21 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net, /* * Look up the dentry using the NFS file handle. */ - error = nfserr_badhandle; - fileid_type = fh->fh_fileid_type; + error = nfserr_stale; - if (fileid_type == FILEID_ROOT) + if (fileid_type == FILEID_ROOT) { + /* We don't sign or verify the root, no per-file identity */ dentry = dget(exp->ex_path.dentry); - else { + } else { + if (exp->ex_flags & NFSEXP_SIGN_FH) { + if (!fh_verify_mac(fhp, net)) { + trace_nfsd_set_fh_dentry_badmac(rqstp, fhp, -ESTALE); + goto out; + } + data_left -= FH_MAC_WORDS; + } + dentry = exportfs_decode_fh_raw(exp->ex_path.mnt, fid, data_left, fileid_type, 0, nfsd_acceptable, exp); @@ -258,6 +318,8 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net, } } } + + error = nfserr_badhandle; if (dentry == NULL) goto out; if (IS_ERR(dentry)) { @@ -498,6 +560,10 @@ static void _fh_update(struct svc_fh *fhp, struct svc_export *exp, fhp->fh_handle.fh_fileid_type = fileid_type > 0 ? 
fileid_type : FILEID_INVALID; fhp->fh_handle.fh_size += maxsize * 4; + + if (exp->ex_flags & NFSEXP_SIGN_FH) + if (!fh_append_mac(fhp, exp->cd->net)) + fhp->fh_handle.fh_fileid_type = FILEID_INVALID; } else { fhp->fh_handle.fh_fileid_type = FILEID_ROOT; } diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 185a998996a0..5ad38f50836d 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -373,6 +373,7 @@ DEFINE_EVENT_CONDITION(nfsd_fh_err_class, nfsd_##name, \ DEFINE_NFSD_FH_ERR_EVENT(set_fh_dentry_badexport); DEFINE_NFSD_FH_ERR_EVENT(set_fh_dentry_badhandle); +DEFINE_NFSD_FH_ERR_EVENT(set_fh_dentry_badmac); TRACE_EVENT(nfsd_exp_find_key, TP_PROTO(const struct svc_expkey *key, From 46ca8dd2441ffa49a7f31b9070f972f52c5779c3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 26 Feb 2026 09:47:34 -0500 Subject: [PATCH 65/83] SUNRPC: Tighten bounds checking in svc_rqst_replace_page svc_rqst_replace_page() builds the Reply buffer by advancing rq_next_page through the response page range. The bounds check validates rq_next_page against the full rq_pages array, but the valid range for rq_next_page is [rq_respages, rq_page_end]. Use those bounds instead. This is correct today because rq_respages and rq_page_end both point into rq_pages, and it prepares for a subsequent change that separates the Reply page array from rq_pages. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/svc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index d8ccb8e4b5c2..f7ec02457328 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -934,11 +934,11 @@ svc_set_num_threads(struct svc_serv *serv, unsigned int min_threads, EXPORT_SYMBOL_GPL(svc_set_num_threads); /** - * svc_rqst_replace_page - Replace one page in rq_pages[] + * svc_rqst_replace_page - Replace one page in rq_respages[] * @rqstp: svc_rqst with pages to replace * @page: replacement page * - * When replacing a page in rq_pages, batch the release of the + * When replacing a page in rq_respages, batch the release of the * replaced pages to avoid hammering the page allocator. * * Return values: @@ -947,8 +947,8 @@ EXPORT_SYMBOL_GPL(svc_set_num_threads); */ bool svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page) { - struct page **begin = rqstp->rq_pages; - struct page **end = &rqstp->rq_pages[rqstp->rq_maxpages]; + struct page **begin = rqstp->rq_respages; + struct page **end = rqstp->rq_page_end; if (unlikely(rqstp->rq_next_page < begin || rqstp->rq_next_page > end)) { trace_svc_replace_page_err(rqstp); From ee66b9e3e1c69efc986f3932555f07121c3460a7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 26 Feb 2026 09:47:35 -0500 Subject: [PATCH 66/83] SUNRPC: Allocate a separate Reply page array struct svc_rqst uses a single dynamically-allocated page array (rq_pages) for both the incoming RPC Call message and the outgoing RPC Reply message. rq_respages is a sliding pointer into rq_pages that each transport receive path must compute based on how many pages the Call consumed. This boundary tracking is a source of confusion and bugs, and prevents an RPC transaction from having both a large Call and a large Reply simultaneously. Allocate rq_respages as its own page array, eliminating the boundary arithmetic. 
This decouples Call and Reply buffer lifetimes, following the precedent set by rq_bvec (a separate dynamically- allocated array for I/O vectors). Each svc_rqst now pins twice as many pages as before. For a server running 16 threads with a 1MB maximum payload, the additional cost is roughly 16MB of pinned memory. The new dynamic svc thread count facility keeps this overhead minimal on an idle server. A subsequent patch in this series limits per-request repopulation to only the pages released during the previous RPC, avoiding a full-array scan on each call to svc_alloc_arg(). Note: We've considered several alternatives to maintaining a full second array. Each alternative reintroduces either boundary logic complexity or I/O-path allocation pressure. rq_next_page is initialized in svc_alloc_arg() and svc_process() during Reply construction, and in svc_rdma_recvfrom() as a precaution on error paths. Transport receive paths no longer compute it from the Call size. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 47 ++++++++++++------------- net/sunrpc/svc.c | 29 ++++++++++++--- net/sunrpc/svc_xprt.c | 36 +++++++++++++------ net/sunrpc/svcsock.c | 6 ---- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 15 +++----- 5 files changed, 77 insertions(+), 56 deletions(-) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 62152e4f3bcc..3b1a98ab5cba 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -134,25 +134,24 @@ enum { extern u32 svc_max_payload(const struct svc_rqst *rqstp); /* - * RPC Requests and replies are stored in one or more pages. - * We maintain an array of pages for each server thread. - * Requests are copied into these pages as they arrive. Remaining - * pages are available to write the reply into. + * RPC Call and Reply messages each have their own page array. + * rq_pages holds the incoming Call message; rq_respages holds + * the outgoing Reply message. 
Both arrays are sized to + * svc_serv_maxpages() entries and are allocated dynamically. * - * Pages are sent using ->sendmsg with MSG_SPLICE_PAGES so each server thread - * needs to allocate more to replace those used in sending. To help keep track - * of these pages we have a receive list where all pages initialy live, and a - * send list where pages are moved to when there are to be part of a reply. + * Pages are sent using ->sendmsg with MSG_SPLICE_PAGES so each + * server thread needs to allocate more to replace those used in + * sending. * - * We use xdr_buf for holding responses as it fits well with NFS - * read responses (that have a header, and some data pages, and possibly - * a tail) and means we can share some client side routines. + * xdr_buf holds responses; the structure fits NFS read responses + * (header, data pages, optional tail) and enables sharing of + * client-side routines. * - * The xdr_buf.head kvec always points to the first page in the rq_*pages - * list. The xdr_buf.pages pointer points to the second page on that - * list. xdr_buf.tail points to the end of the first page. - * This assumes that the non-page part of an rpc reply will fit - * in a page - NFSd ensures this. lockd also has no trouble. + * The xdr_buf.head kvec always points to the first page in the + * rq_*pages list. The xdr_buf.pages pointer points to the second + * page on that list. xdr_buf.tail points to the end of the first + * page. This assumes that the non-page part of an rpc reply will + * fit in a page - NFSd ensures this. lockd also has no trouble. */ /** @@ -162,10 +161,10 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp); * Returns a count of pages or vectors that can hold the maximum * size RPC message for @serv. * - * Each request/reply pair can have at most one "payload", plus two - * pages, one for the request, and one for the reply. - * nfsd_splice_actor() might need an extra page when a READ payload - * is not page-aligned. 
+ * Each page array can hold at most one payload plus two + * overhead pages (one for the RPC header, one for tail data). + * nfsd_splice_actor() might need an extra page when a READ + * payload is not page-aligned. */ static inline unsigned long svc_serv_maxpages(const struct svc_serv *serv) { @@ -204,11 +203,11 @@ struct svc_rqst { struct xdr_stream rq_res_stream; struct folio *rq_scratch_folio; struct xdr_buf rq_res; - unsigned long rq_maxpages; /* num of entries in rq_pages */ - struct page * *rq_pages; - struct page * *rq_respages; /* points into rq_pages */ + unsigned long rq_maxpages; /* entries per page array */ + struct page * *rq_pages; /* Call buffer pages */ + struct page * *rq_respages; /* Reply buffer pages */ struct page * *rq_next_page; /* next reply page to use */ - struct page * *rq_page_end; /* one past the last page */ + struct page * *rq_page_end; /* one past the last reply page */ struct folio_batch rq_fbatch; struct bio_vec *rq_bvec; diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index f7ec02457328..9abef638b1e0 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -638,13 +638,23 @@ svc_init_buffer(struct svc_rqst *rqstp, const struct svc_serv *serv, int node) { rqstp->rq_maxpages = svc_serv_maxpages(serv); - /* rq_pages' last entry is NULL for historical reasons. 
*/ + /* +1 for a NULL sentinel readable by nfsd_splice_actor() */ rqstp->rq_pages = kcalloc_node(rqstp->rq_maxpages + 1, sizeof(struct page *), GFP_KERNEL, node); if (!rqstp->rq_pages) return false; + /* +1 for a NULL sentinel at rq_page_end (see svc_rqst_replace_page) */ + rqstp->rq_respages = kcalloc_node(rqstp->rq_maxpages + 1, + sizeof(struct page *), + GFP_KERNEL, node); + if (!rqstp->rq_respages) { + kfree(rqstp->rq_pages); + rqstp->rq_pages = NULL; + return false; + } + return true; } @@ -656,10 +666,19 @@ svc_release_buffer(struct svc_rqst *rqstp) { unsigned long i; - for (i = 0; i < rqstp->rq_maxpages; i++) - if (rqstp->rq_pages[i]) - put_page(rqstp->rq_pages[i]); - kfree(rqstp->rq_pages); + if (rqstp->rq_pages) { + for (i = 0; i < rqstp->rq_maxpages; i++) + if (rqstp->rq_pages[i]) + put_page(rqstp->rq_pages[i]); + kfree(rqstp->rq_pages); + } + + if (rqstp->rq_respages) { + for (i = 0; i < rqstp->rq_maxpages; i++) + if (rqstp->rq_respages[i]) + put_page(rqstp->rq_respages[i]); + kfree(rqstp->rq_respages); + } } static void diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 56a663b8939f..e027765f4307 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -650,14 +650,13 @@ static void svc_check_conn_limits(struct svc_serv *serv) } } -static bool svc_alloc_arg(struct svc_rqst *rqstp) +static bool svc_fill_pages(struct svc_rqst *rqstp, struct page **pages, + unsigned long npages) { - struct xdr_buf *arg = &rqstp->rq_arg; - unsigned long pages, filled, ret; + unsigned long filled, ret; - pages = rqstp->rq_maxpages; - for (filled = 0; filled < pages; filled = ret) { - ret = alloc_pages_bulk(GFP_KERNEL, pages, rqstp->rq_pages); + for (filled = 0; filled < npages; filled = ret) { + ret = alloc_pages_bulk(GFP_KERNEL, npages, pages); if (ret > filled) /* Made progress, don't sleep yet */ continue; @@ -667,11 +666,29 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp) set_current_state(TASK_RUNNING); return false; } - 
trace_svc_alloc_arg_err(pages, ret); + trace_svc_alloc_arg_err(npages, ret); memalloc_retry_wait(GFP_KERNEL); } - rqstp->rq_page_end = &rqstp->rq_pages[pages]; - rqstp->rq_pages[pages] = NULL; /* this might be seen in nfsd_splice_actor() */ + return true; +} + +static bool svc_alloc_arg(struct svc_rqst *rqstp) +{ + struct xdr_buf *arg = &rqstp->rq_arg; + unsigned long pages; + + pages = rqstp->rq_maxpages; + + if (!svc_fill_pages(rqstp, rqstp->rq_pages, pages)) + return false; + if (!svc_fill_pages(rqstp, rqstp->rq_respages, pages)) + return false; + rqstp->rq_next_page = rqstp->rq_respages; + rqstp->rq_page_end = &rqstp->rq_respages[pages]; + /* svc_rqst_replace_page() dereferences *rq_next_page even + * at rq_page_end; NULL prevents releasing a garbage page. + */ + rqstp->rq_page_end[0] = NULL; /* Make arg->head point to first page and arg->pages point to rest */ arg->head[0].iov_base = page_address(rqstp->rq_pages[0]); @@ -1277,7 +1294,6 @@ static noinline int svc_deferred_recv(struct svc_rqst *rqstp) rqstp->rq_addrlen = dr->addrlen; /* Save off transport header len in case we get deferred again */ rqstp->rq_daddr = dr->daddr; - rqstp->rq_respages = rqstp->rq_pages; rqstp->rq_xprt_ctxt = dr->xprt_ctxt; dr->xprt_ctxt = NULL; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index f28c6076f7e8..c86f28f720f7 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -351,8 +351,6 @@ static ssize_t svc_tcp_read_msg(struct svc_rqst *rqstp, size_t buflen, for (i = 0, t = 0; t < buflen; i++, t += PAGE_SIZE) bvec_set_page(&bvec[i], rqstp->rq_pages[i], PAGE_SIZE, 0); - rqstp->rq_respages = &rqstp->rq_pages[i]; - rqstp->rq_next_page = rqstp->rq_respages + 1; iov_iter_bvec(&msg.msg_iter, ITER_DEST, bvec, i, buflen); if (seek) { @@ -677,13 +675,9 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) if (len <= rqstp->rq_arg.head[0].iov_len) { rqstp->rq_arg.head[0].iov_len = len; rqstp->rq_arg.page_len = 0; - rqstp->rq_respages = rqstp->rq_pages+1; } else { 
rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; - rqstp->rq_respages = rqstp->rq_pages + 1 + - DIV_ROUND_UP(rqstp->rq_arg.page_len, PAGE_SIZE); } - rqstp->rq_next_page = rqstp->rq_respages+1; if (serv->sv_stats) serv->sv_stats->netudpcnt++; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index e7e4a39ca6c6..3081a37a5896 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -861,18 +861,12 @@ static noinline void svc_rdma_read_complete(struct svc_rqst *rqstp, unsigned int i; /* Transfer the Read chunk pages into @rqstp.rq_pages, replacing - * the rq_pages that were already allocated for this rqstp. + * the receive buffer pages already allocated for this rqstp. */ - release_pages(rqstp->rq_respages, ctxt->rc_page_count); + release_pages(rqstp->rq_pages, ctxt->rc_page_count); for (i = 0; i < ctxt->rc_page_count; i++) rqstp->rq_pages[i] = ctxt->rc_pages[i]; - /* Update @rqstp's result send buffer to start after the - * last page in the RDMA Read payload. - */ - rqstp->rq_respages = &rqstp->rq_pages[ctxt->rc_page_count]; - rqstp->rq_next_page = rqstp->rq_respages + 1; - /* Prevent svc_rdma_recv_ctxt_put() from releasing the * pages in ctxt::rc_pages a second time. */ @@ -931,10 +925,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) struct svc_rdma_recv_ctxt *ctxt; int ret; - /* Prevent svc_xprt_release() from releasing pages in rq_pages - * when returning 0 or an error. + /* Precaution: a zero page count on error return causes + * svc_rqst_release_pages() to release nothing. 
*/ - rqstp->rq_respages = rqstp->rq_pages; rqstp->rq_next_page = rqstp->rq_respages; rqstp->rq_xprt_ctxt = NULL; From 22cc2ba5c27a500040d13cecb1dbfc3e4bccab81 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 26 Feb 2026 09:47:36 -0500 Subject: [PATCH 67/83] SUNRPC: Handle NULL entries in svc_rqst_release_pages svc_rqst_release_pages() releases response pages between rq_respages and rq_next_page. It currently passes the entire range to release_pages(), which does not expect NULL entries. A subsequent patch preserves the rq_next_page pointer in svc_rdma_save_io_pages() so that it accurately records how many response pages were consumed. After that change, the range [rq_respages, rq_next_page) can contain NULL entries where pages have already been transferred to a send context. Iterate through the range entry by entry, skipping NULLs, to handle this case correctly. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/svc.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 9abef638b1e0..0ce16e9abdf6 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -990,18 +990,24 @@ EXPORT_SYMBOL_GPL(svc_rqst_replace_page); * svc_rqst_release_pages - Release Reply buffer pages * @rqstp: RPC transaction context * - * Release response pages that might still be in flight after - * svc_send, and any spliced filesystem-owned pages. + * Release response pages in the range [rq_respages, rq_next_page). + * NULL entries in this range are skipped, allowing transports to + * transfer pages to a send context before this function runs. 
*/ void svc_rqst_release_pages(struct svc_rqst *rqstp) { - int i, count = rqstp->rq_next_page - rqstp->rq_respages; + struct page **pp; - if (count) { - release_pages(rqstp->rq_respages, count); - for (i = 0; i < count; i++) - rqstp->rq_respages[i] = NULL; + for (pp = rqstp->rq_respages; pp < rqstp->rq_next_page; pp++) { + if (*pp) { + if (!folio_batch_add(&rqstp->rq_fbatch, + page_folio(*pp))) + __folio_batch_release(&rqstp->rq_fbatch); + *pp = NULL; + } } + if (rqstp->rq_fbatch.nr) + __folio_batch_release(&rqstp->rq_fbatch); } /** From 26c8e6eb759e736e254a99f727aeda7a514eaa5c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 26 Feb 2026 09:47:37 -0500 Subject: [PATCH 68/83] svcrdma: preserve rq_next_page in svc_rdma_save_io_pages svc_rdma_save_io_pages() transfers response pages to the send context and sets those slots to NULL. It then resets rq_next_page to equal rq_respages, hiding the NULL region from svc_rqst_release_pages(). Now that svc_rqst_release_pages() handles NULL entries, this reset is no longer necessary. Removing it preserves the invariant that the range [rq_respages, rq_next_page) accurately describes how many response pages were consumed, enabling a subsequent optimization in svc_alloc_arg() that refills only the consumed range. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 914cd263c2f1..17c8429da9d5 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -858,7 +858,8 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, /* The svc_rqst and all resources it owns are released as soon as * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt - * so they are released by the Send completion handler. + * so they are released only after Send completion, and not by + * svc_rqst_release_pages(). 
*/ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, struct svc_rdma_send_ctxt *ctxt) @@ -870,9 +871,6 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, ctxt->sc_pages[i] = rqstp->rq_respages[i]; rqstp->rq_respages[i] = NULL; } - - /* Prevent svc_xprt_release from releasing pages in rq_pages */ - rqstp->rq_next_page = rqstp->rq_respages; } /* Prepare the portion of the RPC Reply that will be transmitted From 7ed7504287a627834f2a35ef04e5dfd26d1c8986 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 26 Feb 2026 09:47:38 -0500 Subject: [PATCH 69/83] SUNRPC: Track consumed rq_pages entries The rq_pages array holds pages allocated for incoming RPC requests. Two transport receive paths NULL entries in rq_pages to prevent svc_rqst_release_pages() from freeing pages that the transport has taken ownership of: - svc_tcp_save_pages() moves partial request data pages to svsk->sk_pages during multi-fragment TCP reassembly. - svc_rdma_clear_rqst_pages() moves request data pages to head->rc_pages because they are targets of active RDMA Read WRs. A new rq_pages_nfree field in struct svc_rqst records how many entries were NULLed. svc_alloc_arg() uses it to refill only those entries rather than scanning the full rq_pages array. In steady state, the transport NULLs a handful of entries per RPC, so the allocator visits only those entries instead of the full ~259 slots (for 1MB messages). 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 10 ++++++++++ net/sunrpc/svc.c | 1 + net/sunrpc/svc_xprt.c | 11 ++++++++--- net/sunrpc/svcsock.c | 1 + net/sunrpc/xprtrdma/svc_rdma_rw.c | 1 + 5 files changed, 21 insertions(+), 3 deletions(-) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 3b1a98ab5cba..c3399cf64524 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -143,6 +143,15 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp); * server thread needs to allocate more to replace those used in * sending. * + * rq_pages request page contract: + * + * Transport receive paths that move request data pages out of + * rq_pages -- TCP multi-fragment reassembly (svc_tcp_save_pages) + * and RDMA Read I/O (svc_rdma_clear_rqst_pages) -- NULL those + * entries to prevent svc_rqst_release_pages() from freeing pages + * still in transport use, and set rq_pages_nfree to the count. + * svc_alloc_arg() refills only that many rq_pages entries. + * * xdr_buf holds responses; the structure fits NFS read responses * (header, data pages, optional tail) and enables sharing of * client-side routines. 
@@ -204,6 +213,7 @@ struct svc_rqst { struct folio *rq_scratch_folio; struct xdr_buf rq_res; unsigned long rq_maxpages; /* entries per page array */ + unsigned long rq_pages_nfree; /* rq_pages entries NULLed by transport */ struct page * *rq_pages; /* Call buffer pages */ struct page * *rq_respages; /* Reply buffer pages */ struct page * *rq_next_page; /* next reply page to use */ diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 0ce16e9abdf6..6e57e35fa6d6 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -655,6 +655,7 @@ svc_init_buffer(struct svc_rqst *rqstp, const struct svc_serv *serv, int node) return false; } + rqstp->rq_pages_nfree = rqstp->rq_maxpages; return true; } diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index e027765f4307..795b5729525f 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -675,12 +675,17 @@ static bool svc_fill_pages(struct svc_rqst *rqstp, struct page **pages, static bool svc_alloc_arg(struct svc_rqst *rqstp) { struct xdr_buf *arg = &rqstp->rq_arg; - unsigned long pages; + unsigned long pages, nfree; pages = rqstp->rq_maxpages; - if (!svc_fill_pages(rqstp, rqstp->rq_pages, pages)) - return false; + nfree = rqstp->rq_pages_nfree; + if (nfree) { + if (!svc_fill_pages(rqstp, rqstp->rq_pages, nfree)) + return false; + rqstp->rq_pages_nfree = 0; + } + if (!svc_fill_pages(rqstp, rqstp->rq_respages, pages)) return false; rqstp->rq_next_page = rqstp->rq_respages; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index c86f28f720f7..2ce43f9995f1 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1009,6 +1009,7 @@ static void svc_tcp_save_pages(struct svc_sock *svsk, struct svc_rqst *rqstp) svsk->sk_pages[i] = rqstp->rq_pages[i]; rqstp->rq_pages[i] = NULL; } + rqstp->rq_pages_nfree = npages; } static void svc_tcp_clear_pages(struct svc_sock *svsk) diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 4ec2f9ae06aa..cf4a1762b629 100644 --- 
a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -1107,6 +1107,7 @@ static void svc_rdma_clear_rqst_pages(struct svc_rqst *rqstp, head->rc_pages[i] = rqstp->rq_pages[i]; rqstp->rq_pages[i] = NULL; } + rqstp->rq_pages_nfree = head->rc_page_count; } /** From d7f3efd9ff474867b04e1ea784690f02450a245b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 26 Feb 2026 09:47:39 -0500 Subject: [PATCH 70/83] SUNRPC: Optimize rq_respages allocation in svc_alloc_arg svc_alloc_arg() invokes alloc_pages_bulk() with the full rq_maxpages count (~259 for 1MB messages) for the rq_respages array, causing a full-array scan despite most slots holding valid pages. svc_rqst_release_pages() NULLs only the range [rq_respages, rq_next_page) after each RPC, so only that range contains NULL entries. Limit the rq_respages fill in svc_alloc_arg() to that range instead of scanning the full array. svc_init_buffer() initializes rq_next_page to span the entire rq_respages array, so the first svc_alloc_arg() call fills all slots. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 4 ++++ net/sunrpc/svc.c | 1 + net/sunrpc/svc_xprt.c | 8 +++++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index c3399cf64524..669c944eaf7f 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -152,6 +152,10 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp); * still in transport use, and set rq_pages_nfree to the count. * svc_alloc_arg() refills only that many rq_pages entries. * + * For rq_respages, svc_rqst_release_pages() NULLs entries in + * [rq_respages, rq_next_page) after each RPC. svc_alloc_arg() + * refills only that range. + * * xdr_buf holds responses; the structure fits NFS read responses * (header, data pages, optional tail) and enables sharing of * client-side routines. 
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 6e57e35fa6d6..5e0b5ec2fd52 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -656,6 +656,7 @@ svc_init_buffer(struct svc_rqst *rqstp, const struct svc_serv *serv, int node) } rqstp->rq_pages_nfree = rqstp->rq_maxpages; + rqstp->rq_next_page = rqstp->rq_respages + rqstp->rq_maxpages; return true; } diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 795b5729525f..b16e710926c1 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -686,8 +686,14 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp) rqstp->rq_pages_nfree = 0; } - if (!svc_fill_pages(rqstp, rqstp->rq_respages, pages)) + if (WARN_ON_ONCE(rqstp->rq_next_page < rqstp->rq_respages)) return false; + nfree = rqstp->rq_next_page - rqstp->rq_respages; + if (nfree) { + if (!svc_fill_pages(rqstp, rqstp->rq_respages, nfree)) + return false; + } + rqstp->rq_next_page = rqstp->rq_respages; rqstp->rq_page_end = &rqstp->rq_respages[pages]; /* svc_rqst_replace_page() dereferences *rq_next_page even From ccc89b9d1ed233349cfe8d87b842e7351b74d8de Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 27 Feb 2026 09:03:28 -0500 Subject: [PATCH 71/83] svcrdma: Add fair queuing for Send Queue access When the Send Queue fills, multiple threads may wait for SQ slots. The previous implementation had no ordering guarantee, allowing starvation when one thread repeatedly acquires slots while others wait indefinitely. Introduce a ticket-based fair queuing system. Each waiter takes a ticket number and is served in FIFO order. This ensures forward progress for all waiters when SQ capacity is constrained. The implementation has two phases: 1. Fast path: attempt to reserve SQ slots without waiting 2. 
Slow path: take a ticket, wait for turn, then wait for slots The ticket system adds two atomic counters to the transport: - sc_sq_ticket_head: next ticket to issue - sc_sq_ticket_tail: ticket currently being served A dedicated wait queue (sc_sq_ticket_wait) handles ticket ordering, separate from sc_send_wait which handles SQ capacity. This separation ensures that send completions (the high-frequency wake source) wake only the current ticket holder rather than all queued waiters. Ticket handoff wakes only the ticket wait queue, and each ticket holder that exits via connection close propagates the wake to the next waiter in line. When a waiter successfully reserves slots, it advances the tail counter and wakes the next waiter. This creates an orderly handoff that prevents starvation while maintaining good throughput on the fast path when contention is low. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 10 ++ net/sunrpc/xprtrdma/svc_rdma_rw.c | 37 ++---- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 160 +++++++++++++++++------ net/sunrpc/xprtrdma/svc_rdma_transport.c | 6 +- 4 files changed, 145 insertions(+), 68 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 57f4fd94166a..658b8498177e 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -84,6 +84,9 @@ struct svcxprt_rdma { atomic_t sc_sq_avail; /* SQEs ready to be consumed */ unsigned int sc_sq_depth; /* Depth of SQ */ + atomic_t sc_sq_ticket_head; /* Next ticket to issue */ + atomic_t sc_sq_ticket_tail; /* Ticket currently serving */ + wait_queue_head_t sc_sq_ticket_wait; /* Ticket ordering waitlist */ __be32 sc_fc_credits; /* Forward credits */ u32 sc_max_requests; /* Max requests */ u32 sc_max_bc_requests;/* Backward credits */ @@ -306,6 +309,13 @@ extern void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, struct svc_rdma_recv_ctxt *rctxt, int status); extern void svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, 
int avail); +extern int svc_rdma_sq_wait(struct svcxprt_rdma *rdma, + const struct rpc_rdma_cid *cid, int sqecount); +extern int svc_rdma_post_send_err(struct svcxprt_rdma *rdma, + const struct rpc_rdma_cid *cid, + const struct ib_send_wr *bad_wr, + const struct ib_send_wr *first_wr, + int sqecount, int ret); extern int svc_rdma_sendto(struct svc_rqst *); extern int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length); diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index cf4a1762b629..97bce806974b 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -405,34 +405,17 @@ static int svc_rdma_post_chunk_ctxt(struct svcxprt_rdma *rdma, cqe = NULL; } - do { - if (atomic_sub_return(cc->cc_sqecount, - &rdma->sc_sq_avail) > 0) { - cc->cc_posttime = ktime_get(); - ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); - if (ret) - break; - return 0; - } + ret = svc_rdma_sq_wait(rdma, &cc->cc_cid, cc->cc_sqecount); + if (ret < 0) + return ret; - percpu_counter_inc(&svcrdma_stat_sq_starve); - trace_svcrdma_sq_full(rdma, &cc->cc_cid); - atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); - wait_event(rdma->sc_send_wait, - atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount); - trace_svcrdma_sq_retry(rdma, &cc->cc_cid); - } while (1); - - trace_svcrdma_sq_post_err(rdma, &cc->cc_cid, ret); - svc_xprt_deferred_close(&rdma->sc_xprt); - - /* If even one was posted, there will be a completion. */ - if (bad_wr != first_wr) - return 0; - - atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); - wake_up(&rdma->sc_send_wait); - return -ENOTCONN; + cc->cc_posttime = ktime_get(); + ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); + if (ret) + return svc_rdma_post_send_err(rdma, &cc->cc_cid, bad_wr, + first_wr, cc->cc_sqecount, + ret); + return 0; } /* Build a bvec that covers one kvec in an xdr_buf. 
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 17c8429da9d5..02559947272a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -294,6 +294,117 @@ void svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, int avail) wake_up(&rdma->sc_send_wait); } +/** + * svc_rdma_sq_wait - Wait for SQ slots using fair queuing + * @rdma: controlling transport + * @cid: completion ID for tracing + * @sqecount: number of SQ entries needed + * + * A ticket-based system ensures fair ordering when multiple threads + * wait for Send Queue capacity. Each waiter takes a ticket and is + * served in order, preventing starvation. + * + * Protocol invariant: every ticket holder must increment + * sc_sq_ticket_tail exactly once, whether the reservation + * succeeds or the connection closes. Failing to advance the + * tail stalls all subsequent waiters. + * + * The ticket counters are signed 32-bit atomics. After + * wrapping through INT_MAX, the equality check + * (tail == ticket) remains correct because both counters + * advance monotonically and the comparison uses exact + * equality rather than relational operators. + * + * Return values: + * %0: SQ slots were reserved successfully + * %-ENOTCONN: The connection was lost + */ +int svc_rdma_sq_wait(struct svcxprt_rdma *rdma, + const struct rpc_rdma_cid *cid, int sqecount) +{ + int ticket; + + /* Fast path: try to reserve SQ slots without waiting. + * + * A failed reservation temporarily understates sc_sq_avail + * until the compensating atomic_add restores it. A Send + * completion arriving in that window sees a lower count + * than reality, but the value self-corrects once the add + * completes. No ordering guarantee is needed here because + * the slow path serializes all contended waiters. 
+ */ + if (likely(atomic_sub_return(sqecount, &rdma->sc_sq_avail) >= 0)) + return 0; + atomic_add(sqecount, &rdma->sc_sq_avail); + + /* Slow path: take a ticket and wait in line */ + ticket = atomic_fetch_inc(&rdma->sc_sq_ticket_head); + + percpu_counter_inc(&svcrdma_stat_sq_starve); + trace_svcrdma_sq_full(rdma, cid); + + /* Wait until all earlier tickets have been served */ + wait_event(rdma->sc_sq_ticket_wait, + test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags) || + atomic_read(&rdma->sc_sq_ticket_tail) == ticket); + if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) + goto out_close; + + /* It's our turn. Wait for enough SQ slots to be available. */ + while (atomic_sub_return(sqecount, &rdma->sc_sq_avail) < 0) { + atomic_add(sqecount, &rdma->sc_sq_avail); + + wait_event(rdma->sc_send_wait, + test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags) || + atomic_read(&rdma->sc_sq_avail) >= sqecount); + if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) + goto out_close; + } + + /* Slots reserved successfully. Let the next waiter proceed. 
*/ + atomic_inc(&rdma->sc_sq_ticket_tail); + wake_up(&rdma->sc_sq_ticket_wait); + trace_svcrdma_sq_retry(rdma, cid); + return 0; + +out_close: + atomic_inc(&rdma->sc_sq_ticket_tail); + wake_up(&rdma->sc_sq_ticket_wait); + return -ENOTCONN; +} + +/** + * svc_rdma_post_send_err - Handle ib_post_send failure + * @rdma: controlling transport + * @cid: completion ID for tracing + * @bad_wr: first WR that was not posted + * @first_wr: first WR in the chain + * @sqecount: number of SQ entries that were reserved + * @ret: error code from ib_post_send + * + * Return values: + * %0: At least one WR was posted; a completion handles cleanup + * %-ENOTCONN: No WRs were posted; SQ slots are released + */ +int svc_rdma_post_send_err(struct svcxprt_rdma *rdma, + const struct rpc_rdma_cid *cid, + const struct ib_send_wr *bad_wr, + const struct ib_send_wr *first_wr, + int sqecount, int ret) +{ + trace_svcrdma_sq_post_err(rdma, cid, ret); + svc_xprt_deferred_close(&rdma->sc_xprt); + + /* If even one WR was posted, a Send completion will + * return the reserved SQ slots. + */ + if (bad_wr != first_wr) + return 0; + + svc_rdma_wake_send_waiters(rdma, sqecount); + return -ENOTCONN; +} + /** * svc_rdma_wc_send - Invoked by RDMA provider for each polled Send WC * @cq: Completion Queue context @@ -336,11 +447,6 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) * that these values remain available after the ib_post_send() call. * In some error flow cases, svc_rdma_wc_send() releases @ctxt. * - * Note there is potential for starvation when the Send Queue is - * full because there is no order to when waiting threads are - * awoken. The transport is typically provisioned with a deep - * enough Send Queue that SQ exhaustion should be a rare event. 
- * * Return values: * %0: @ctxt's WR chain was posted successfully * %-ENOTCONN: The connection was lost @@ -362,42 +468,16 @@ int svc_rdma_post_send(struct svcxprt_rdma *rdma, send_wr->sg_list[0].length, DMA_TO_DEVICE); - /* If the SQ is full, wait until an SQ entry is available */ - while (!test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) { - if (atomic_sub_return(sqecount, &rdma->sc_sq_avail) < 0) { - svc_rdma_wake_send_waiters(rdma, sqecount); + ret = svc_rdma_sq_wait(rdma, &cid, sqecount); + if (ret < 0) + return ret; - /* When the transport is torn down, assume - * ib_drain_sq() will trigger enough Send - * completions to wake us. The XPT_CLOSE test - * above should then cause the while loop to - * exit. - */ - percpu_counter_inc(&svcrdma_stat_sq_starve); - trace_svcrdma_sq_full(rdma, &cid); - wait_event(rdma->sc_send_wait, - atomic_read(&rdma->sc_sq_avail) > 0); - trace_svcrdma_sq_retry(rdma, &cid); - continue; - } - - trace_svcrdma_post_send(ctxt); - ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); - if (ret) { - trace_svcrdma_sq_post_err(rdma, &cid, ret); - svc_xprt_deferred_close(&rdma->sc_xprt); - - /* If even one WR was posted, there will be a - * Send completion that bumps sc_sq_avail. 
- */ - if (bad_wr == first_wr) { - svc_rdma_wake_send_waiters(rdma, sqecount); - break; - } - } - return 0; - } - return -ENOTCONN; + trace_svcrdma_post_send(ctxt); + ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); + if (ret) + return svc_rdma_post_send_err(rdma, &cid, bad_wr, + first_wr, sqecount, ret); + return 0; } /** diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index f2d72181a6fe..f18bc60d9f4f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -179,6 +179,7 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, init_llist_head(&cma_xprt->sc_recv_ctxts); init_llist_head(&cma_xprt->sc_rw_ctxts); init_waitqueue_head(&cma_xprt->sc_send_wait); + init_waitqueue_head(&cma_xprt->sc_sq_ticket_wait); spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); @@ -477,6 +478,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) newxprt->sc_sq_depth = dev->attrs.max_qp_wr; atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth); + atomic_set(&newxprt->sc_sq_ticket_head, 0); + atomic_set(&newxprt->sc_sq_ticket_tail, 0); newxprt->sc_pd = ib_alloc_pd(dev, 0); if (IS_ERR(newxprt->sc_pd)) { @@ -649,7 +652,8 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt) * If there are already waiters on the SQ, * return false. */ - if (waitqueue_active(&rdma->sc_send_wait)) + if (waitqueue_active(&rdma->sc_send_wait) || + waitqueue_active(&rdma->sc_sq_ticket_wait)) return 0; /* Otherwise return true. */ From a5f2087f3762bbf0c54f0a7796dc95bd39863c5f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 27 Feb 2026 09:03:29 -0500 Subject: [PATCH 72/83] svcrdma: Clean up use of rdma->sc_pd->device in Receive paths I can't think of a reason why svcrdma is using the PD's device. Most other consumers of the IB DMA API use the ib_device pointer from the connection's rdma_cm_id. 
I don't believe there's any functional difference between the two, but it is a little confusing to see some uses of rdma_cm_id->device and some of ib_pd->device. Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 3081a37a5896..f8a0638eb095 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -118,7 +118,8 @@ svc_rdma_next_recv_ctxt(struct list_head *list) static struct svc_rdma_recv_ctxt * svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) { - int node = ibdev_to_node(rdma->sc_cm_id->device); + struct ib_device *device = rdma->sc_cm_id->device; + int node = ibdev_to_node(device); struct svc_rdma_recv_ctxt *ctxt; unsigned long pages; dma_addr_t addr; @@ -133,9 +134,9 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node); if (!buffer) goto fail1; - addr = ib_dma_map_single(rdma->sc_pd->device, buffer, - rdma->sc_max_req_size, DMA_FROM_DEVICE); - if (ib_dma_mapping_error(rdma->sc_pd->device, addr)) + addr = ib_dma_map_single(device, buffer, rdma->sc_max_req_size, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(device, addr)) goto fail2; svc_rdma_recv_cid_init(rdma, &ctxt->rc_cid); @@ -167,7 +168,7 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) static void svc_rdma_recv_ctxt_destroy(struct svcxprt_rdma *rdma, struct svc_rdma_recv_ctxt *ctxt) { - ib_dma_unmap_single(rdma->sc_pd->device, ctxt->rc_recv_sge.addr, + ib_dma_unmap_single(rdma->sc_cm_id->device, ctxt->rc_recv_sge.addr, ctxt->rc_recv_sge.length, DMA_FROM_DEVICE); kfree(ctxt->rc_recv_buf); kfree(ctxt); @@ -955,7 +956,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) return 0; percpu_counter_inc(&svcrdma_stat_recv); - ib_dma_sync_single_for_cpu(rdma_xprt->sc_pd->device, + 
ib_dma_sync_single_for_cpu(rdma_xprt->sc_cm_id->device, ctxt->rc_recv_sge.addr, ctxt->rc_byte_len, DMA_FROM_DEVICE); svc_rdma_build_arg_xdr(rqstp, ctxt); From c553983efad2ef0f1a8728a7a9104136297d8a0d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 27 Feb 2026 09:03:30 -0500 Subject: [PATCH 73/83] svcrdma: Clean up use of rdma->sc_pd->device I can't think of a reason why svcrdma is using the PD's device. Most other consumers of the IB DMA API use the ib_device pointer from the connection's rdma_cm_id. I don't think there's any functional difference between the two, but it is a little confusing to see some uses of rdma_cm_id and some of ib_pd. Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 02559947272a..bef68efa7034 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -116,7 +116,8 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc); static struct svc_rdma_send_ctxt * svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) { - int node = ibdev_to_node(rdma->sc_cm_id->device); + struct ib_device *device = rdma->sc_cm_id->device; + int node = ibdev_to_node(device); struct svc_rdma_send_ctxt *ctxt; unsigned long pages; dma_addr_t addr; @@ -136,9 +137,9 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node); if (!buffer) goto fail2; - addr = ib_dma_map_single(rdma->sc_pd->device, buffer, - rdma->sc_max_req_size, DMA_TO_DEVICE); - if (ib_dma_mapping_error(rdma->sc_pd->device, addr)) + addr = ib_dma_map_single(device, buffer, rdma->sc_max_req_size, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(device, addr)) goto fail3; svc_rdma_send_cid_init(rdma, &ctxt->sc_cid); @@ -175,15 +176,14 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) */ void 
svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma) { + struct ib_device *device = rdma->sc_cm_id->device; struct svc_rdma_send_ctxt *ctxt; struct llist_node *node; while ((node = llist_del_first(&rdma->sc_send_ctxts)) != NULL) { ctxt = llist_entry(node, struct svc_rdma_send_ctxt, sc_node); - ib_dma_unmap_single(rdma->sc_pd->device, - ctxt->sc_sges[0].addr, - rdma->sc_max_req_size, - DMA_TO_DEVICE); + ib_dma_unmap_single(device, ctxt->sc_sges[0].addr, + rdma->sc_max_req_size, DMA_TO_DEVICE); kfree(ctxt->sc_xprt_buf); kfree(ctxt->sc_pages); kfree(ctxt); @@ -463,7 +463,7 @@ int svc_rdma_post_send(struct svcxprt_rdma *rdma, might_sleep(); /* Sync the transport header buffer */ - ib_dma_sync_single_for_device(rdma->sc_pd->device, + ib_dma_sync_single_for_device(rdma->sc_cm_id->device, send_wr->sg_list[0].addr, send_wr->sg_list[0].length, DMA_TO_DEVICE); From d16f060f3ee297424c0aba047b1d49208adb9318 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 27 Feb 2026 09:03:31 -0500 Subject: [PATCH 74/83] svcrdma: Add Write chunk WRs to the RPC's Send WR chain Previously, Write chunk RDMA Writes were posted via a separate ib_post_send() call with their own completion handler. Each Write chunk incurred a doorbell and generated a completion event. Link Write chunk WRs onto the RPC Reply's Send WR chain so that a single ib_post_send() call posts both the RDMA Writes and the Send WR. A single completion event signals that all operations have finished. This reduces both doorbell rate and completion rate, as well as eliminating the latency of a round-trip between the Write chunk completion and the subsequent Send WR posting. The lifecycle of Write chunk resources changes: previously, the svc_rdma_write_done() completion handler released Write chunk resources when RDMA Writes completed. With WR chaining, resources remain live until the Send completion. 
A new sc_write_info_list tracks Write chunk metadata attached to each Send context, and svc_rdma_write_chunk_release() frees these resources when the Send context is released. The svc_rdma_write_done() handler now handles only error cases. On success it returns immediately since the Send completion handles resource release. On failure (WR flush), it closes the connection to signal to the client that the RPC Reply is incomplete. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 13 +++- net/sunrpc/xprtrdma/svc_rdma_rw.c | 94 ++++++++++++++++++++------- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 10 ++- 3 files changed, 91 insertions(+), 26 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 658b8498177e..df6e08aaad57 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -216,6 +216,7 @@ struct svc_rdma_recv_ctxt { */ struct svc_rdma_write_info { struct svcxprt_rdma *wi_rdma; + struct list_head wi_list; const struct svc_rdma_chunk *wi_chunk; @@ -244,7 +245,10 @@ struct svc_rdma_send_ctxt { struct ib_cqe sc_cqe; struct xdr_buf sc_hdrbuf; struct xdr_stream sc_stream; + + struct list_head sc_write_info_list; struct svc_rdma_write_info sc_reply_info; + void *sc_xprt_buf; int sc_page_count; int sc_cur_sge_no; @@ -277,11 +281,14 @@ extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma, extern void svc_rdma_cc_release(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt *cc, enum dma_data_direction dir); +extern void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt); extern void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt); -extern int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, - const struct svc_rdma_recv_ctxt *rctxt, - const struct xdr_buf *xdr); +extern int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma, + const struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_send_ctxt *sctxt, + 
const struct xdr_buf *xdr); extern int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_pcl *write_pcl, const struct svc_rdma_pcl *reply_pcl, diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 97bce806974b..ebc90c12c835 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -251,6 +251,28 @@ static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) queue_work(svcrdma_wq, &info->wi_work); } +/** + * svc_rdma_write_chunk_release - Release Write chunk I/O resources + * @rdma: controlling transport + * @ctxt: Send context that is being released + * + * Write chunk resources remain live until Send completion because + * Write WRs are chained to the Send WR. This function releases all + * write_info structures accumulated on @ctxt->sc_write_info_list. + */ +void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt) +{ + struct svc_rdma_write_info *info; + + while (!list_empty(&ctxt->sc_write_info_list)) { + info = list_first_entry(&ctxt->sc_write_info_list, + struct svc_rdma_write_info, wi_list); + list_del(&info->wi_list); + svc_rdma_write_info_free(info); + } +} + /** * svc_rdma_reply_chunk_release - Release Reply chunk I/O resources * @rdma: controlling transport @@ -307,13 +329,11 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_chunk_ctxt *cc = container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); - struct svc_rdma_write_info *info = - container_of(cc, struct svc_rdma_write_info, wi_cc); switch (wc->status) { case IB_WC_SUCCESS: trace_svcrdma_wc_write(&cc->cc_cid); - break; + return; case IB_WC_WR_FLUSH_ERR: trace_svcrdma_wc_write_flush(wc, &cc->cc_cid); break; @@ -321,12 +341,11 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) trace_svcrdma_wc_write_err(wc, &cc->cc_cid); } - svc_rdma_wake_send_waiters(rdma, 
cc->cc_sqecount); - - if (unlikely(wc->status != IB_WC_SUCCESS)) - svc_xprt_deferred_close(&rdma->sc_xprt); - - svc_rdma_write_info_free(info); + /* The RDMA Write has flushed, so the client won't get + * some of the outgoing RPC message. Signal the loss + * to the client by closing the connection. + */ + svc_xprt_deferred_close(&rdma->sc_xprt); } /** @@ -600,13 +619,27 @@ static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data) return xdr->len; } -static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, - const struct svc_rdma_chunk *chunk, - const struct xdr_buf *xdr) +/* + * svc_rdma_prepare_write_chunk - Link Write WRs for @chunk onto @sctxt's chain + * + * Write WRs are prepended to the Send WR chain so that a single + * ib_post_send() posts both RDMA Writes and the final Send. Only + * the first WR in each chunk gets a CQE for error detection; + * subsequent WRs complete without individual completion events. + * The Send WR's signaled completion indicates all chained + * operations have finished. 
+ */ +static int svc_rdma_prepare_write_chunk(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *sctxt, + const struct svc_rdma_chunk *chunk, + const struct xdr_buf *xdr) { struct svc_rdma_write_info *info; struct svc_rdma_chunk_ctxt *cc; + struct ib_send_wr *first_wr; struct xdr_buf payload; + struct list_head *pos; + struct ib_cqe *cqe; int ret; if (xdr_buf_subsegment(xdr, &payload, chunk->ch_position, @@ -622,10 +655,25 @@ static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, if (ret != payload.len) goto out_err; - trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); - ret = svc_rdma_post_chunk_ctxt(rdma, cc); - if (ret < 0) + ret = -EINVAL; + if (unlikely(sctxt->sc_sqecount + cc->cc_sqecount > rdma->sc_sq_depth)) goto out_err; + + first_wr = sctxt->sc_wr_chain; + cqe = &cc->cc_cqe; + list_for_each(pos, &cc->cc_rwctxts) { + struct svc_rdma_rw_ctxt *rwc; + + rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list); + first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, cqe, first_wr); + cqe = NULL; + } + sctxt->sc_wr_chain = first_wr; + sctxt->sc_sqecount += cc->cc_sqecount; + list_add(&info->wi_list, &sctxt->sc_write_info_list); + + trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); return 0; out_err: @@ -634,17 +682,19 @@ static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, } /** - * svc_rdma_send_write_list - Send all chunks on the Write list + * svc_rdma_prepare_write_list - Construct WR chain for sending Write list * @rdma: controlling RDMA transport * @rctxt: Write list provisioned by the client + * @sctxt: Send WR resources * @xdr: xdr_buf containing an RPC Reply message * - * Returns zero on success, or a negative errno if one or more - * Write chunks could not be sent. + * Returns zero on success, or a negative errno if WR chain + * construction fails for one or more Write chunks. 
*/ -int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, - const struct svc_rdma_recv_ctxt *rctxt, - const struct xdr_buf *xdr) +int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma, + const struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_send_ctxt *sctxt, + const struct xdr_buf *xdr) { struct svc_rdma_chunk *chunk; int ret; @@ -652,7 +702,7 @@ int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, pcl_for_each_chunk(chunk, &rctxt->rc_write_pcl) { if (!chunk->ch_payload_length) break; - ret = svc_rdma_send_write_chunk(rdma, chunk, xdr); + ret = svc_rdma_prepare_write_chunk(rdma, sctxt, chunk, xdr); if (ret < 0) return ret; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index bef68efa7034..8b3f0c8c14b2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -150,6 +150,7 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) ctxt->sc_send_wr.sg_list = ctxt->sc_sges; ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED; ctxt->sc_cqe.done = svc_rdma_wc_send; + INIT_LIST_HEAD(&ctxt->sc_write_info_list); ctxt->sc_xprt_buf = buffer; xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf, rdma->sc_max_req_size); @@ -237,6 +238,7 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma, struct ib_device *device = rdma->sc_cm_id->device; unsigned int i; + svc_rdma_write_chunk_release(rdma, ctxt); svc_rdma_reply_chunk_release(rdma, ctxt); if (ctxt->sc_page_count) @@ -1054,6 +1056,12 @@ void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, sctxt->sc_send_wr.num_sge = 1; sctxt->sc_send_wr.opcode = IB_WR_SEND; sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; + + /* Ensure only the error message is posted, not any previously + * prepared Write chunk WRs. 
+ */ + sctxt->sc_wr_chain = &sctxt->sc_send_wr; + sctxt->sc_sqecount = 1; if (svc_rdma_post_send(rdma, sctxt)) goto put_ctxt; return; @@ -1101,7 +1109,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (!p) goto put_ctxt; - ret = svc_rdma_send_write_list(rdma, rctxt, &rqstp->rq_res); + ret = svc_rdma_prepare_write_list(rdma, rctxt, sctxt, &rqstp->rq_res); if (ret < 0) goto put_ctxt; From 2239535fb062b404871556b3bbbe4e27579f5edb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 27 Feb 2026 09:03:32 -0500 Subject: [PATCH 75/83] svcrdma: Factor out WR chain linking into helper svc_rdma_prepare_write_chunk() and svc_rdma_prepare_reply_chunk() contain identical code for linking RDMA R/W work requests onto a Send context's WR chain. This duplication increases maintenance burden and risks divergent bug fixes. Introduce svc_rdma_cc_link_wrs() to consolidate the WR chain linking logic. The helper walks the chunk context's rwctxts list, chains each WR via rdma_rw_ctx_wrs(), and updates the Send context's chain head and SQE count. Completion signaling is requested only for the tail WR (posted first). No functional change. Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_rw.c | 67 +++++++++++++------------------ 1 file changed, 28 insertions(+), 39 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index ebc90c12c835..9e17700fae2a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -619,15 +619,32 @@ static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data) return xdr->len; } -/* - * svc_rdma_prepare_write_chunk - Link Write WRs for @chunk onto @sctxt's chain - * - * Write WRs are prepended to the Send WR chain so that a single - * ib_post_send() posts both RDMA Writes and the final Send. Only - * the first WR in each chunk gets a CQE for error detection; - * subsequent WRs complete without individual completion events. 
- * The Send WR's signaled completion indicates all chained - * operations have finished. +/* Link chunk WRs onto @sctxt's WR chain. Completion is requested + * for the tail WR, which is posted first. + */ +static void svc_rdma_cc_link_wrs(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *sctxt, + struct svc_rdma_chunk_ctxt *cc) +{ + struct ib_send_wr *first_wr; + struct list_head *pos; + struct ib_cqe *cqe; + + first_wr = sctxt->sc_wr_chain; + cqe = &cc->cc_cqe; + list_for_each(pos, &cc->cc_rwctxts) { + struct svc_rdma_rw_ctxt *rwc; + + rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list); + first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, cqe, first_wr); + cqe = NULL; + } + sctxt->sc_wr_chain = first_wr; + sctxt->sc_sqecount += cc->cc_sqecount; +} + +/* Link Write WRs for @chunk onto @sctxt's WR chain. */ static int svc_rdma_prepare_write_chunk(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, @@ -636,10 +653,7 @@ static int svc_rdma_prepare_write_chunk(struct svcxprt_rdma *rdma, { struct svc_rdma_write_info *info; struct svc_rdma_chunk_ctxt *cc; - struct ib_send_wr *first_wr; struct xdr_buf payload; - struct list_head *pos; - struct ib_cqe *cqe; int ret; if (xdr_buf_subsegment(xdr, &payload, chunk->ch_position, @@ -659,18 +673,7 @@ static int svc_rdma_prepare_write_chunk(struct svcxprt_rdma *rdma, if (unlikely(sctxt->sc_sqecount + cc->cc_sqecount > rdma->sc_sq_depth)) goto out_err; - first_wr = sctxt->sc_wr_chain; - cqe = &cc->cc_cqe; - list_for_each(pos, &cc->cc_rwctxts) { - struct svc_rdma_rw_ctxt *rwc; - - rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list); - first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp, - rdma->sc_port_num, cqe, first_wr); - cqe = NULL; - } - sctxt->sc_wr_chain = first_wr; - sctxt->sc_sqecount += cc->cc_sqecount; + svc_rdma_cc_link_wrs(rdma, sctxt, cc); list_add(&info->wi_list, &sctxt->sc_write_info_list); trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); @@ -732,9 +735,6 
@@ int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, { struct svc_rdma_write_info *info = &sctxt->sc_reply_info; struct svc_rdma_chunk_ctxt *cc = &info->wi_cc; - struct ib_send_wr *first_wr; - struct list_head *pos; - struct ib_cqe *cqe; int ret; info->wi_rdma = rdma; @@ -748,18 +748,7 @@ int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, if (ret < 0) return ret; - first_wr = sctxt->sc_wr_chain; - cqe = &cc->cc_cqe; - list_for_each(pos, &cc->cc_rwctxts) { - struct svc_rdma_rw_ctxt *rwc; - - rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list); - first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp, - rdma->sc_port_num, cqe, first_wr); - cqe = NULL; - } - sctxt->sc_wr_chain = first_wr; - sctxt->sc_sqecount += cc->cc_sqecount; + svc_rdma_cc_link_wrs(rdma, sctxt, cc); trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount); return xdr->len; From 3603bf99062c6d563df4fba3848f829d5401d959 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 28 Feb 2026 14:09:22 -0800 Subject: [PATCH 76/83] SUNRPC: xdr.h: fix all kernel-doc warnings Correct a function parameter name (s/page/folio/) and add function return value sections for multiple functions to eliminate kernel-doc warnings: Warning: include/linux/sunrpc/xdr.h:298 function parameter 'folio' not described in 'xdr_set_scratch_folio' Warning: include/linux/sunrpc/xdr.h:337 No description found for return value of 'xdr_stream_remaining' Warning: include/linux/sunrpc/xdr.h:357 No description found for return value of 'xdr_align_size' Warning: include/linux/sunrpc/xdr.h:374 No description found for return value of 'xdr_pad_size' Warning: include/linux/sunrpc/xdr.h:387 No description found for return value of 'xdr_stream_encode_item_present' Signed-off-by: Randy Dunlap Signed-off-by: Chuck Lever --- include/linux/sunrpc/xdr.h | 48 +++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 
152597750f55..b639a6fafcbc 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -290,7 +290,7 @@ xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen) /** * xdr_set_scratch_folio - Attach a scratch buffer for decoding data * @xdr: pointer to xdr_stream struct - * @page: an anonymous folio + * @folio: an anonymous folio * * See xdr_set_scratch_buffer(). */ @@ -330,7 +330,7 @@ static inline void xdr_commit_encode(struct xdr_stream *xdr) * xdr_stream_remaining - Return the number of bytes remaining in the stream * @xdr: pointer to struct xdr_stream * - * Return value: + * Returns: * Number of bytes remaining in @xdr before xdr->end */ static inline size_t @@ -350,7 +350,7 @@ ssize_t xdr_stream_encode_opaque_auth(struct xdr_stream *xdr, u32 flavor, * xdr_align_size - Calculate padded size of an object * @n: Size of an object being XDR encoded (in bytes) * - * Return value: + * Returns: * Size (in bytes) of the object including xdr padding */ static inline size_t @@ -368,7 +368,7 @@ xdr_align_size(size_t n) * This implementation avoids the need for conditional * branches or modulo division. 
* - * Return value: + * Returns: * Size (in bytes) of the needed XDR pad */ static inline size_t xdr_pad_size(size_t n) @@ -380,7 +380,7 @@ static inline size_t xdr_pad_size(size_t n) * xdr_stream_encode_item_present - Encode a "present" list item * @xdr: pointer to xdr_stream * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -399,7 +399,7 @@ static inline ssize_t xdr_stream_encode_item_present(struct xdr_stream *xdr) * xdr_stream_encode_item_absent - Encode a "not present" list item * @xdr: pointer to xdr_stream * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -419,7 +419,7 @@ static inline int xdr_stream_encode_item_absent(struct xdr_stream *xdr) * @p: address in a buffer into which to encode * @n: boolean value to encode * - * Return value: + * Returns: * Address of item following the encoded boolean */ static inline __be32 *xdr_encode_bool(__be32 *p, u32 n) @@ -433,7 +433,7 @@ static inline __be32 *xdr_encode_bool(__be32 *p, u32 n) * @xdr: pointer to xdr_stream * @n: boolean value to encode * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -453,7 +453,7 @@ static inline int xdr_stream_encode_bool(struct xdr_stream *xdr, __u32 n) * @xdr: pointer to xdr_stream * @n: integer to encode * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -474,7 +474,7 @@ xdr_stream_encode_u32(struct xdr_stream *xdr, __u32 n) * @xdr: pointer to xdr_stream * @n: integer to encode * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -495,7 +495,7 @@ xdr_stream_encode_be32(struct xdr_stream *xdr, __be32 n) * @xdr: pointer to xdr_stream * @n: 64-bit integer to 
encode * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -517,7 +517,7 @@ xdr_stream_encode_u64(struct xdr_stream *xdr, __u64 n) * @ptr: pointer to void pointer * @len: size of object * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -542,7 +542,7 @@ xdr_stream_encode_opaque_inline(struct xdr_stream *xdr, void **ptr, size_t len) * @ptr: pointer to opaque data object * @len: size of object pointed to by @ptr * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -563,7 +563,7 @@ xdr_stream_encode_opaque_fixed(struct xdr_stream *xdr, const void *ptr, size_t l * @ptr: pointer to opaque data object * @len: size of object pointed to by @ptr * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -585,7 +585,7 @@ xdr_stream_encode_opaque(struct xdr_stream *xdr, const void *ptr, size_t len) * @array: array of integers * @array_size: number of elements in @array * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -608,7 +608,7 @@ xdr_stream_encode_uint32_array(struct xdr_stream *xdr, * xdr_item_is_absent - symbolically handle XDR discriminators * @p: pointer to undecoded discriminator * - * Return values: + * Returns: * %true if the following XDR item is absent * %false if the following XDR item is present */ @@ -621,7 +621,7 @@ static inline bool xdr_item_is_absent(const __be32 *p) * xdr_item_is_present - symbolically handle XDR discriminators * @p: pointer to undecoded discriminator * - * Return values: + * Returns: * %true if the following XDR item is present * %false if the following XDR item is absent */ @@ -635,7 +635,7 @@ static inline bool 
xdr_item_is_present(const __be32 *p) * @xdr: pointer to xdr_stream * @ptr: pointer to a u32 in which to store the result * - * Return values: + * Returns: * %0 on success * %-EBADMSG on XDR buffer overflow */ @@ -656,7 +656,7 @@ xdr_stream_decode_bool(struct xdr_stream *xdr, __u32 *ptr) * @xdr: pointer to xdr_stream * @ptr: location to store integer * - * Return values: + * Returns: * %0 on success * %-EBADMSG on XDR buffer overflow */ @@ -677,7 +677,7 @@ xdr_stream_decode_u32(struct xdr_stream *xdr, __u32 *ptr) * @xdr: pointer to xdr_stream * @ptr: location to store integer * - * Return values: + * Returns: * %0 on success * %-EBADMSG on XDR buffer overflow */ @@ -698,7 +698,7 @@ xdr_stream_decode_be32(struct xdr_stream *xdr, __be32 *ptr) * @xdr: pointer to xdr_stream * @ptr: location to store 64-bit integer * - * Return values: + * Returns: * %0 on success * %-EBADMSG on XDR buffer overflow */ @@ -720,7 +720,7 @@ xdr_stream_decode_u64(struct xdr_stream *xdr, __u64 *ptr) * @ptr: location to store data * @len: size of buffer pointed to by @ptr * - * Return values: + * Returns: * %0 on success * %-EBADMSG on XDR buffer overflow */ @@ -746,7 +746,7 @@ xdr_stream_decode_opaque_fixed(struct xdr_stream *xdr, void *ptr, size_t len) * on @xdr. It is therefore expected that the object it points to should * be processed immediately. 
* - * Return values: + * Returns: * On success, returns size of object stored in *@ptr * %-EBADMSG on XDR buffer overflow * %-EMSGSIZE if the size of the object would exceed @maxlen @@ -777,7 +777,7 @@ xdr_stream_decode_opaque_inline(struct xdr_stream *xdr, void **ptr, size_t maxle * @array: location to store the integer array or NULL * @array_size: number of elements to store * - * Return values: + * Returns: * On success, returns number of elements stored in @array * %-EBADMSG on XDR buffer overflow * %-EMSGSIZE if the size of the array exceeds @array_size From 4e2866b2baaddfff6069a2f18fc134c1d5a08f2b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 11 Mar 2026 12:18:54 -0400 Subject: [PATCH 77/83] SUNRPC: Add svc_rqst_page_release() helper svc_rqst_replace_page() releases displaced pages through a per-rqst folio batch, but exposes the add-or-flush sequence directly. svc_tcp_restore_pages() releases displaced pages individually with put_page(). Introduce svc_rqst_page_release() to encapsulate the batched release mechanism. Convert svc_rqst_replace_page() and svc_tcp_restore_pages() to use it. The latter now benefits from the same batched release that svc_rqst_replace_page() already uses. Reviewed-by: Christoph Hellwig Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 15 +++++++++++++++ net/sunrpc/svc.c | 7 ++----- net/sunrpc/svcsock.c | 2 +- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 669c944eaf7f..1ebd9c7efa70 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -498,6 +498,21 @@ int svc_generic_rpcbind_set(struct net *net, #define RPC_MAX_ADDRBUFLEN (63U) +/** + * svc_rqst_page_release - release a page associated with an RPC transaction + * @rqstp: RPC transaction context + * @page: page to release + * + * Released pages are batched and freed together, reducing + * allocator pressure under heavy RPC workloads. 
+ */ +static inline void svc_rqst_page_release(struct svc_rqst *rqstp, + struct page *page) +{ + if (!folio_batch_add(&rqstp->rq_fbatch, page_folio(page))) + __folio_batch_release(&rqstp->rq_fbatch); +} + /* * When we want to reduce the size of the reserved space in the response * buffer, we need to take into account the size of any checksum data that diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 5e0b5ec2fd52..576fa42e7abf 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -976,11 +976,8 @@ bool svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page) return false; } - if (*rqstp->rq_next_page) { - if (!folio_batch_add(&rqstp->rq_fbatch, - page_folio(*rqstp->rq_next_page))) - __folio_batch_release(&rqstp->rq_fbatch); - } + if (*rqstp->rq_next_page) + svc_rqst_page_release(rqstp, *rqstp->rq_next_page); get_page(page); *(rqstp->rq_next_page++) = page; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 2ce43f9995f1..7be3de1a1aed 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -988,7 +988,7 @@ static size_t svc_tcp_restore_pages(struct svc_sock *svsk, npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; for (i = 0; i < npages; i++) { if (rqstp->rq_pages[i] != NULL) - put_page(rqstp->rq_pages[i]); + svc_rqst_page_release(rqstp, rqstp->rq_pages[i]); BUG_ON(svsk->sk_pages[i] == NULL); rqstp->rq_pages[i] = svsk->sk_pages[i]; svsk->sk_pages[i] = NULL; From 18755b8c2f241648b951d3772e0742cc59834d5a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 10 Mar 2026 15:39:25 -0400 Subject: [PATCH 78/83] svcrdma: Use contiguous pages for RDMA Read sink buffers svc_rdma_build_read_segment() constructs RDMA Read sink buffers by consuming pages one-at-a-time from rq_pages[] and building one bvec per page. A 64KB NFS READ payload produces 16 separate bvecs, 16 DMA mappings, and potentially multiple RDMA Read WRs (on platforms with 4KB pages). 
A single higher-order allocation followed by split_page() yields physically contiguous memory while preserving per-page refcounts. A single bvec spanning the contiguous range causes rdma_rw_ctx_init_bvec() to take the rdma_rw_init_single_wr_bvec() fast path: one DMA mapping, one SGE, one WR. The split sub-pages replace the original rq_pages[] entries, so all downstream page tracking, completion handling, and xdr_buf assembly remain unchanged. Allocation uses __GFP_NORETRY | __GFP_NOWARN and falls back through decreasing orders. If even order-1 fails, the existing per-page path handles the segment. When nr_pages is not a power of two, get_order() rounds up and the allocation yields more pages than needed. The extra split pages replace existing rq_pages[] entries (freed via put_page() first), so there is no net increase in per- request page consumption. Successive segments reuse the same padding slots, preventing accumulation. The rq_maxpages guard rejects any allocation that would overrun the array, falling back to the per-page path. Under memory pressure, __GFP_NORETRY causes the higher- order allocation to fail without stalling. The contiguous path is attempted when the segment starts page-aligned (rc_pageoff == 0) and spans at least two pages. NFS WRITE segments carry application-modified byte ranges of arbitrary length, so the optimization is not restricted to power-of-two page counts. Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_rw.c | 223 ++++++++++++++++++++++++++++++ 1 file changed, 223 insertions(+) diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 9e17700fae2a..402e2ceca4ff 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -754,6 +754,216 @@ int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, return xdr->len; } +/* + * Cap contiguous RDMA Read sink allocations at order-4. 
+ * Higher orders risk allocation failure under + * __GFP_NORETRY, which would negate the benefit of the + * contiguous fast path. + */ +#define SVC_RDMA_CONTIG_MAX_ORDER 4 + +/** + * svc_rdma_alloc_read_pages - Allocate physically contiguous pages + * @nr_pages: number of pages needed + * @order: on success, set to the allocation order + * + * Attempts a higher-order allocation, falling back to smaller orders. + * The returned pages are split immediately so each sub-page has its + * own refcount and can be freed independently. + * + * Returns a pointer to the first page on success, or NULL if even + * order-1 allocation fails. + */ +static struct page * +svc_rdma_alloc_read_pages(unsigned int nr_pages, unsigned int *order) +{ + unsigned int o; + struct page *page; + + o = min(get_order(nr_pages << PAGE_SHIFT), + SVC_RDMA_CONTIG_MAX_ORDER); + + while (o >= 1) { + page = alloc_pages(GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN, + o); + if (page) { + split_page(page, o); + *order = o; + return page; + } + o--; + } + return NULL; +} + +/* + * svc_rdma_fill_contig_bvec - Replace rq_pages with a contiguous allocation + * @rqstp: RPC transaction context + * @head: context for ongoing I/O + * @bv: bvec entry to fill + * @pages_left: number of data pages remaining in the segment + * @len_left: bytes remaining in the segment + * + * On success, fills @bv with a bvec spanning the contiguous range and + * advances rc_curpage/rc_page_count. Returns the byte length covered, + * or zero if the allocation failed or would overrun rq_maxpages. 
+ */ +static unsigned int +svc_rdma_fill_contig_bvec(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head, + struct bio_vec *bv, unsigned int pages_left, + unsigned int len_left) +{ + unsigned int order, npages, chunk_pages, chunk_len, i; + struct page *page; + + page = svc_rdma_alloc_read_pages(pages_left, &order); + if (!page) + return 0; + npages = 1 << order; + + if (head->rc_curpage + npages > rqstp->rq_maxpages) { + for (i = 0; i < npages; i++) + __free_page(page + i); + return 0; + } + + /* + * Replace rq_pages[] entries with pages from the contiguous + * allocation. If npages exceeds chunk_pages, the extra pages + * stay in rq_pages[] for later reuse or normal rqst teardown. + */ + for (i = 0; i < npages; i++) { + svc_rqst_page_release(rqstp, + rqstp->rq_pages[head->rc_curpage + i]); + rqstp->rq_pages[head->rc_curpage + i] = page + i; + } + + chunk_pages = min(npages, pages_left); + chunk_len = min_t(unsigned int, chunk_pages << PAGE_SHIFT, len_left); + bvec_set_page(bv, page, chunk_len, 0); + head->rc_page_count += chunk_pages; + head->rc_curpage += chunk_pages; + return chunk_len; +} + +/* + * svc_rdma_fill_page_bvec - Add a single rq_page to the bvec array + * @head: context for ongoing I/O + * @ctxt: R/W context whose bvec array is being filled + * @cur: page to add + * @bvec_idx: pointer to current bvec index, not advanced on merge + * @len_left: bytes remaining in the segment + * + * If @cur is physically contiguous with the preceding bvec, it is + * merged by extending that bvec's length. Otherwise a new bvec + * entry is created. Returns the byte length covered. 
+ */ +static unsigned int +svc_rdma_fill_page_bvec(struct svc_rdma_recv_ctxt *head, + struct svc_rdma_rw_ctxt *ctxt, struct page *cur, + unsigned int *bvec_idx, unsigned int len_left) +{ + unsigned int chunk_len = min_t(unsigned int, PAGE_SIZE, len_left); + + head->rc_page_count++; + head->rc_curpage++; + + if (*bvec_idx > 0) { + struct bio_vec *prev = &ctxt->rw_bvec[*bvec_idx - 1]; + + if (page_to_phys(prev->bv_page) + prev->bv_offset + + prev->bv_len == page_to_phys(cur)) { + prev->bv_len += chunk_len; + return chunk_len; + } + } + + bvec_set_page(&ctxt->rw_bvec[*bvec_idx], cur, chunk_len, 0); + (*bvec_idx)++; + return chunk_len; +} + +/** + * svc_rdma_build_read_segment_contig - Build RDMA Read WR with contiguous pages + * @rqstp: RPC transaction context + * @head: context for ongoing I/O + * @segment: co-ordinates of remote memory to be read + * + * Greedily allocates higher-order pages to cover the segment, + * building one bvec per contiguous chunk. Each allocation is + * split so sub-pages have independent refcounts. When a + * higher-order allocation fails, remaining pages are covered + * individually, merging adjacent pages into the preceding bvec + * when they are physically contiguous. The split sub-pages + * replace entries in rq_pages[] so downstream cleanup is + * unchanged. 
+ * + * Returns: + * %0: the Read WR was constructed successfully + * %-ENOMEM: allocation failed + * %-EIO: a DMA mapping error occurred + */ +static int svc_rdma_build_read_segment_contig(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head, + const struct svc_rdma_segment *segment) +{ + struct svcxprt_rdma *rdma = svc_rdma_rqst_rdma(rqstp); + struct svc_rdma_chunk_ctxt *cc = &head->rc_cc; + unsigned int nr_data_pages, bvec_idx; + struct svc_rdma_rw_ctxt *ctxt; + unsigned int len_left; + int ret; + + nr_data_pages = PAGE_ALIGN(segment->rs_length) >> PAGE_SHIFT; + if (head->rc_curpage + nr_data_pages > rqstp->rq_maxpages) + return -ENOMEM; + + ctxt = svc_rdma_get_rw_ctxt(rdma, nr_data_pages); + if (!ctxt) + return -ENOMEM; + + bvec_idx = 0; + len_left = segment->rs_length; + while (len_left) { + unsigned int pages_left = PAGE_ALIGN(len_left) >> PAGE_SHIFT; + unsigned int chunk_len = 0; + + if (pages_left >= 2) + chunk_len = svc_rdma_fill_contig_bvec(rqstp, head, + &ctxt->rw_bvec[bvec_idx], + pages_left, len_left); + if (chunk_len) { + bvec_idx++; + } else { + struct page *cur = + rqstp->rq_pages[head->rc_curpage]; + chunk_len = svc_rdma_fill_page_bvec(head, ctxt, cur, + &bvec_idx, + len_left); + } + + len_left -= chunk_len; + } + + ctxt->rw_nents = bvec_idx; + + head->rc_pageoff = offset_in_page(segment->rs_length); + if (head->rc_pageoff) + head->rc_curpage--; + + ret = svc_rdma_rw_ctx_init(rdma, ctxt, segment->rs_offset, + segment->rs_handle, segment->rs_length, + DMA_FROM_DEVICE); + if (ret < 0) + return -EIO; + percpu_counter_inc(&svcrdma_stat_read); + + list_add(&ctxt->rw_list, &cc->cc_rwctxts); + cc->cc_sqecount += ret; + return 0; +} + /** * svc_rdma_build_read_segment - Build RDMA Read WQEs to pull one RDMA segment * @rqstp: RPC transaction context @@ -780,6 +990,14 @@ static int svc_rdma_build_read_segment(struct svc_rqst *rqstp, if (check_add_overflow(head->rc_pageoff, len, &total)) return -EINVAL; nr_bvec = PAGE_ALIGN(total) >> PAGE_SHIFT; + + if 
(head->rc_pageoff == 0 && nr_bvec >= 2) { + ret = svc_rdma_build_read_segment_contig(rqstp, head, + segment); + if (ret != -ENOMEM) + return ret; + } + ctxt = svc_rdma_get_rw_ctxt(rdma, nr_bvec); if (!ctxt) return -ENOMEM; @@ -1125,6 +1343,11 @@ static void svc_rdma_clear_rqst_pages(struct svc_rqst *rqstp, { unsigned int i; + /* + * Move only pages containing RPC data into rc_pages[]. Pages + * from a contiguous allocation that were not used for the + * payload remain in rq_pages[] for subsequent reuse. + */ for (i = 0; i < head->rc_page_count; i++) { head->rc_pages[i] = rqstp->rq_pages[i]; rqstp->rq_pages[i] = NULL; From 39bd1bfe92a1a9450e1d6397f845020581090836 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 13 Mar 2026 12:31:47 -0400 Subject: [PATCH 79/83] NFSD: use per-operation statidx for callback procedures The callback RPC procedure table uses NFSPROC4_CB_##call for p_statidx, which maps CB_NULL to index 0 and every compound-based callback (CB_RECALL, CB_LAYOUT, CB_OFFLOAD, etc.) to index 1. All compound callback operations therefore share a single statistics counter, making per-operation accounting impossible. Assign p_statidx from the NFSPROC4_CLNT_##proc enum instead, giving each callback operation its own counter slot. The counts array is already sized by ARRAY_SIZE(nfs4_cb_procedures), so no allocation change is needed. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index aea8bdd2fdc4..74effafdd0dc 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1016,7 +1016,7 @@ static int nfs4_xdr_dec_cb_offload(struct rpc_rqst *rqstp, .p_decode = nfs4_xdr_dec_##restype, \ .p_arglen = NFS4_enc_##argtype##_sz, \ .p_replen = NFS4_dec_##restype##_sz, \ - .p_statidx = NFSPROC4_CB_##call, \ + .p_statidx = NFSPROC4_CLNT_##proc, \ .p_name = #proc, \ } From 42cc13995967c1f3790cb106916eed8fab2a37b1 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Fri, 13 Mar 2026 12:31:48 -0400 Subject: [PATCH 80/83] NFSD: convert callback RPC program to per-net namespace The callback channel's rpc_program, rpc_version, rpc_stat, and per-procedure counts are declared as file-scope statics in nfs4callback.c, shared across all network namespaces. Forechannel RPC statistics are already maintained per-netns (via nfsd_svcstats in struct nfsd_net); the backchannel has no such separation. When backchannel statistics are eventually surfaced to userspace, the global counters would expose cross-namespace data. Allocate per-netns copies of these structures through a new opaque struct nfsd_net_cb, managed by nfsd_net_cb_init() and nfsd_net_cb_shutdown(). The struct definition is private to nfs4callback.c; struct nfsd_net holds only a pointer. 
Signed-off-by: Dai Ngo Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/netns.h | 3 ++ fs/nfsd/nfs4callback.c | 111 ++++++++++++++++++++++++++++------------- fs/nfsd/nfsctl.c | 5 ++ fs/nfsd/state.h | 9 ++++ 4 files changed, 94 insertions(+), 34 deletions(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 6ad3fe5d7e12..27da1a3edacb 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -25,6 +25,7 @@ #define SESSION_HASH_SIZE 512 struct cld_net; +struct nfsd_net_cb; struct nfsd4_client_tracking_ops; enum { @@ -228,6 +229,8 @@ struct nfsd_net { struct list_head local_clients; #endif siphash_key_t *fh_key; + + struct nfsd_net_cb *nfsd_cb; }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 74effafdd0dc..50827405468d 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1032,39 +1032,14 @@ static const struct rpc_procinfo nfs4_cb_procedures[] = { PROC(CB_GETATTR, COMPOUND, cb_getattr, cb_getattr), }; -static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)]; -static const struct rpc_version nfs_cb_version4 = { -/* - * Note on the callback rpc program version number: despite language in rfc - * 5661 section 18.36.3 requiring servers to use 4 in this field, the - * official xdr descriptions for both 4.0 and 4.1 specify version 1, and - * in practice that appears to be what implementations use. The section - * 18.36.3 language is expected to be fixed in an erratum. 
- */ - .number = 1, - .nrprocs = ARRAY_SIZE(nfs4_cb_procedures), - .procs = nfs4_cb_procedures, - .counts = nfs4_cb_counts, -}; +#define NFS4_CB_PROGRAM 0x40000000 +#define NFS4_CB_VERSION 1 -static const struct rpc_version *nfs_cb_version[2] = { - [1] = &nfs_cb_version4, -}; - -static const struct rpc_program cb_program; - -static struct rpc_stat cb_stats = { - .program = &cb_program -}; - -#define NFS4_CALLBACK 0x40000000 -static const struct rpc_program cb_program = { - .name = "nfs4_cb", - .number = NFS4_CALLBACK, - .nrvers = ARRAY_SIZE(nfs_cb_version), - .version = nfs_cb_version, - .stats = &cb_stats, - .pipe_dir_name = "nfsd4_cb", +struct nfsd_net_cb { + struct rpc_version version4; + const struct rpc_version *versions[NFS4_CB_VERSION + 1]; + struct rpc_program program; + struct rpc_stat stat; }; static int max_cb_time(struct net *net) @@ -1140,6 +1115,7 @@ static const struct cred *get_backchannel_cred(struct nfs4_client *clp, struct r static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) { + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); int maxtime = max_cb_time(clp->net); struct rpc_timeout timeparms = { .to_initval = maxtime, @@ -1152,14 +1128,14 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c .addrsize = conn->cb_addrlen, .saddress = (struct sockaddr *) &conn->cb_saddr, .timeout = &timeparms, - .program = &cb_program, - .version = 1, + .version = NFS4_CB_VERSION, .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), .cred = current_cred(), }; struct rpc_clnt *client; const struct cred *cred; + args.program = &nn->nfsd_cb->program; if (clp->cl_minorversion == 0) { if (!clp->cl_cred.cr_principal && (clp->cl_cred.cr_flavor >= RPC_AUTH_GSS_KRB5)) { @@ -1786,3 +1762,70 @@ bool nfsd4_run_cb(struct nfsd4_callback *cb) nfsd41_cb_inflight_end(clp); return queued; } + +/** + * nfsd_net_cb_shutdown - release per-netns callback RPC program resources + * 
@nn: NFS server network namespace + * + * Frees resources allocated by nfsd_net_cb_init(). + */ +void nfsd_net_cb_shutdown(struct nfsd_net *nn) +{ + struct nfsd_net_cb *cb = nn->nfsd_cb; + + if (cb) { + kfree(cb->version4.counts); + kfree(cb); + nn->nfsd_cb = NULL; + } +} + +/** + * nfsd_net_cb_init - initialize per-netns callback RPC program + * @nn: NFS server network namespace + * + * Sets up the callback RPC program, version table, procedure + * counts, and statistics structure for @nn. Caller must release + * these resources using nfsd_net_cb_shutdown(). + * + * Return: 0 on success, or -ENOMEM if allocation fails. + */ +int nfsd_net_cb_init(struct nfsd_net *nn) +{ + struct nfsd_net_cb *cb; + + cb = kzalloc(sizeof(*cb), GFP_KERNEL); + if (!cb) + return -ENOMEM; + + cb->version4.counts = kzalloc_objs(unsigned int, + ARRAY_SIZE(nfs4_cb_procedures), GFP_KERNEL); + if (!cb->version4.counts) { + kfree(cb); + return -ENOMEM; + } + /* + * Note on the callback rpc program version number: despite language + * in rfc 5661 section 18.36.3 requiring servers to use 4 in this + * field, the official xdr descriptions for both 4.0 and 4.1 specify + * version 1, and in practice that appears to be what implementations + * use. The section 18.36.3 language is expected to be fixed in an + * erratum. 
+ */ + cb->version4.number = NFS4_CB_VERSION; + cb->version4.nrprocs = ARRAY_SIZE(nfs4_cb_procedures); + cb->version4.procs = nfs4_cb_procedures; + cb->versions[NFS4_CB_VERSION] = &cb->version4; + + cb->program.name = "nfs4_cb"; + cb->program.number = NFS4_CB_PROGRAM; + cb->program.nrvers = ARRAY_SIZE(cb->versions); + cb->program.version = &cb->versions[0]; + cb->program.pipe_dir_name = "nfsd4_cb"; + cb->program.stats = &cb->stat; + cb->stat.program = &cb->program; + + nn->nfsd_cb = cb; + + return 0; +} diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 20ec00f323b4..39e7012a60d8 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -2203,6 +2203,9 @@ static __net_init int nfsd_net_init(struct net *net) int retval; int i; + retval = nfsd_net_cb_init(nn); + if (retval) + return retval; retval = nfsd_export_init(net); if (retval) goto out_export_error; @@ -2243,6 +2246,7 @@ static __net_init int nfsd_net_init(struct net *net) out_idmap_error: nfsd_export_shutdown(net); out_export_error: + nfsd_net_cb_shutdown(nn); return retval; } @@ -2273,6 +2277,7 @@ static __net_exit void nfsd_net_exit(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); kfree_sensitive(nn->fh_key); + nfsd_net_cb_shutdown(nn); nfsd_proc_stat_shutdown(net); percpu_counter_destroy_many(nn->counter, NFSD_STATS_COUNTERS_NUM); nfsd_idmap_shutdown(net); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 9b05462da4cc..953675eba5c3 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -862,6 +862,8 @@ struct nfsd_file *find_any_file(struct nfs4_file *f); #ifdef CONFIG_NFSD_V4 void nfsd4_revoke_states(struct nfsd_net *nn, struct super_block *sb); void nfsd4_cancel_copy_by_sb(struct net *net, struct super_block *sb); +int nfsd_net_cb_init(struct nfsd_net *nn); +void nfsd_net_cb_shutdown(struct nfsd_net *nn); #else static inline void nfsd4_revoke_states(struct nfsd_net *nn, struct super_block *sb) { @@ -869,6 +871,13 @@ static inline void nfsd4_revoke_states(struct nfsd_net *nn, 
struct super_block * static inline void nfsd4_cancel_copy_by_sb(struct net *net, struct super_block *sb) { } +static inline int nfsd_net_cb_init(struct nfsd_net *nn) +{ + return 0; +} +static inline void nfsd_net_cb_shutdown(struct nfsd_net *nn) +{ +} #endif /* grace period management */ From fa6966fd05a122b413823c579a1f898427e2cdd4 Mon Sep 17 00:00:00 2001 From: Joseph Salisbury Date: Mon, 16 Mar 2026 14:25:16 -0400 Subject: [PATCH 81/83] nfsd: fix comment typo in nfs3xdr The file contains a spelling error in a source comment (occured). Typos in comments reduce readability and make text searches less reliable for developers and maintainers. Replace 'occured' with 'occurred' in the affected comment. This is a comment-only cleanup and does not change behavior. Signed-off-by: Joseph Salisbury Signed-off-by: Chuck Lever --- fs/nfsd/nfs3xdr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index ef4971d71ac4..2ff9a991a8fb 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -1069,7 +1069,7 @@ svcxdr_encode_entry3_common(struct nfsd3_readdirres *resp, const char *name, * * Return values: * %0: Entry was successfully encoded. - * %-EINVAL: An encoding problem occured, secondary status code in resp->common.err + * %-EINVAL: An encoding problem occurred, secondary status code in resp->common.err * * On exit, the following fields are updated: * - resp->xdr @@ -1144,7 +1144,7 @@ svcxdr_encode_entry3_plus(struct nfsd3_readdirres *resp, const char *name, * * Return values: * %0: Entry was successfully encoded. 
- * %-EINVAL: An encoding problem occured, secondary status code in resp->common.err + * %-EINVAL: An encoding problem occurred, secondary status code in resp->common.err * * On exit, the following fields are updated: * - resp->xdr From 124f9af22ce27d146f11e37f826671a0a1953ad5 Mon Sep 17 00:00:00 2001 From: Joseph Salisbury Date: Mon, 16 Mar 2026 14:28:45 -0400 Subject: [PATCH 82/83] nfsd: fix comment typo in nfsxdr The file contains a spelling error in a source comment (occured). Typos in comments reduce readability and make text searches less reliable for developers and maintainers. Replace 'occured' with 'occurred' in the affected comment. This is a comment-only cleanup and does not change behavior. Signed-off-by: Joseph Salisbury Signed-off-by: Chuck Lever --- fs/nfsd/nfsxdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index fc262ceafca9..ae71e0621317 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -605,7 +605,7 @@ svcxdr_encode_entry_common(struct nfsd_readdirres *resp, const char *name, * * Return values: * %0: Entry was successfully encoded. - * %-EINVAL: An encoding problem occured, secondary status code in resp->common.err + * %-EINVAL: An encoding problem occurred, secondary status code in resp->common.err * * On exit, the following fields are updated: * - resp->xdr From d644a698de12e996778657f65a4608299368e138 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 18 Mar 2026 15:21:05 -0700 Subject: [PATCH 83/83] NFSD: Docs: clean up pnfs server timeout docs Make various changes to the documentation formatting to avoid docs build errors and otherwise improve the produced output format: - use bullets for lists - don't use a '.' at the end of echo commands - fix indentation Documentation/admin-guide/nfs/pnfs-block-server.rst:55: ERROR: Unexpected indentation. [docutils] Documentation/admin-guide/nfs/pnfs-scsi-server.rst:37: ERROR: Unexpected indentation. 
[docutils] Fixes: 6a97f70b45e7 ("NFSD: Enforce timeout on layout recall and integrate lease manager fencing") Signed-off-by: Randy Dunlap Signed-off-by: Chuck Lever --- .../admin-guide/nfs/pnfs-block-server.rst | 20 +++++++++---------- .../admin-guide/nfs/pnfs-scsi-server.rst | 20 +++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/Documentation/admin-guide/nfs/pnfs-block-server.rst b/Documentation/admin-guide/nfs/pnfs-block-server.rst index b4f5997009af..7667dd2e17f1 100644 --- a/Documentation/admin-guide/nfs/pnfs-block-server.rst +++ b/Documentation/admin-guide/nfs/pnfs-block-server.rst @@ -47,12 +47,12 @@ system log with the following format: FENCE failed client[IP_address] clid[#n] device[dev_name] - Where: + where: - IP_address: refers to the IP address of the affected client. - #n: indicates the unique client identifier. - dev_name: specifies the name of the block device related - to the fencing attempt. + - IP_address: refers to the IP address of the affected client. + - #n: indicates the unique client identifier. + - dev_name: specifies the name of the block device related + to the fencing attempt. The server will repeatedly retry the operation indefinitely. During this time, access to the affected file is restricted for all other @@ -62,11 +62,11 @@ clients access the same file simultaneously. To restore access to the affected file for other clients, the admin needs to take the following actions: - . shutdown or power off the client being fenced. - . manually expire the client to release all its state on the server: + - shutdown or power off the client being fenced. + - manually expire the client to release all its state on the server:: - echo 'expire' > /proc/fs/nfsd/clients/clid/ctl'. + echo 'expire' > /proc/fs/nfsd/clients/clid/ctl - Where: + where: - clid: is the unique client identifier displayed in the system log. + - clid: is the unique client identifier displayed in the system log. 
diff --git a/Documentation/admin-guide/nfs/pnfs-scsi-server.rst b/Documentation/admin-guide/nfs/pnfs-scsi-server.rst index db34afbf67a9..b202508d281d 100644 --- a/Documentation/admin-guide/nfs/pnfs-scsi-server.rst +++ b/Documentation/admin-guide/nfs/pnfs-scsi-server.rst @@ -29,12 +29,12 @@ system log with the following format: FENCE failed client[IP_address] clid[#n] device[dev_name] - Where: + where: - IP_address: refers to the IP address of the affected client. - #n: indicates the unique client identifier. - dev_name: specifies the name of the block device related - to the fencing attempt. + - IP_address: refers to the IP address of the affected client. + - #n: indicates the unique client identifier. + - dev_name: specifies the name of the block device related + to the fencing attempt. The server will repeatedly retry the operation indefinitely. During this time, access to the affected file is restricted for all other @@ -44,12 +44,12 @@ clients access the same file simultaneously. To restore access to the affected file for other clients, the admin needs to take the following actions: - . shutdown or power off the client being fenced. - . manually expire the client to release all its state on the server: + - shutdown or power off the client being fenced. + - manually expire the client to release all its state on the server:: - echo 'expire' > /proc/fs/nfsd/clients/clid/ctl'. + echo 'expire' > /proc/fs/nfsd/clients/clid/ctl - Where: + where: - clid: is the unique client identifier displayed in the system log. + - clid: is the unique client identifier displayed in the system log.