From 2289e87b5951f97783f07fc895e6c5e804b53668 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Sep 2020 17:22:49 -0400 Subject: [PATCH 01/99] SUNRPC: Make trace_svc_process() display the RPC procedure symbolically The next few patches will employ these strings to help make server- side trace logs more human-readable. A similar technique is already in use in kernel RPC client code. Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 24 ++++++++++++++++++++++++ fs/lockd/svcproc.c | 24 ++++++++++++++++++++++++ fs/nfs/callback_xdr.c | 2 ++ fs/nfsd/nfs2acl.c | 5 +++++ fs/nfsd/nfs3acl.c | 3 +++ fs/nfsd/nfs3proc.c | 22 ++++++++++++++++++++++ fs/nfsd/nfs4proc.c | 2 ++ fs/nfsd/nfsproc.c | 18 ++++++++++++++++++ include/linux/sunrpc/svc.h | 1 + 9 files changed, 101 insertions(+) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index fa41dda39925..4c10fb5138f1 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -512,6 +512,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "NULL", }, [NLMPROC_TEST] = { .pc_func = nlm4svc_proc_test, @@ -520,6 +521,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+2+No+Rg, + .pc_name = "TEST", }, [NLMPROC_LOCK] = { .pc_func = nlm4svc_proc_lock, @@ -528,6 +530,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "LOCK", }, [NLMPROC_CANCEL] = { .pc_func = nlm4svc_proc_cancel, @@ -536,6 +539,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "CANCEL", }, [NLMPROC_UNLOCK] = { .pc_func = nlm4svc_proc_unlock, @@ -544,6 +548,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "UNLOCK", }, [NLMPROC_GRANTED] = { .pc_func = nlm4svc_proc_granted, @@ -552,6 +557,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "GRANTED", }, [NLMPROC_TEST_MSG] = { .pc_func = nlm4svc_proc_test_msg, @@ -560,6 +566,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "TEST_MSG", }, [NLMPROC_LOCK_MSG] = { .pc_func = nlm4svc_proc_lock_msg, @@ -568,6 +575,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "LOCK_MSG", }, [NLMPROC_CANCEL_MSG] = { .pc_func = nlm4svc_proc_cancel_msg, @@ -576,6 +584,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "CANCEL_MSG", }, [NLMPROC_UNLOCK_MSG] = { .pc_func = nlm4svc_proc_unlock_msg, @@ -584,6 +593,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "UNLOCK_MSG", }, [NLMPROC_GRANTED_MSG] = { .pc_func = nlm4svc_proc_granted_msg, @@ -592,6 +602,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "GRANTED_MSG", }, [NLMPROC_TEST_RES] = { .pc_func = nlm4svc_proc_null, @@ -600,6 +611,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "TEST_RES", }, [NLMPROC_LOCK_RES] = { .pc_func = nlm4svc_proc_null, @@ -608,6 +620,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "LOCK_RES", }, [NLMPROC_CANCEL_RES] = { .pc_func = nlm4svc_proc_null, @@ -616,6 +629,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "CANCEL_RES", }, [NLMPROC_UNLOCK_RES] = { .pc_func = nlm4svc_proc_null, @@ -624,6 +638,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "UNLOCK_RES", }, [NLMPROC_GRANTED_RES] = { .pc_func = nlm4svc_proc_granted_res, @@ -632,6 +647,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "GRANTED_RES", }, [NLMPROC_NSM_NOTIFY] = { .pc_func = nlm4svc_proc_sm_notify, @@ -640,6 +656,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_reboot), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "SM_NOTIFY", }, [17] = { .pc_func = nlm4svc_proc_unused, @@ -648,6 +665,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = 0, + .pc_name = "UNUSED", }, [18] = { .pc_func = nlm4svc_proc_unused, @@ -656,6 +674,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = 0, + .pc_name = "UNUSED", }, [19] = { .pc_func = nlm4svc_proc_unused, @@ -664,6 +683,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = 0, + .pc_name = "UNUSED", }, [NLMPROC_SHARE] = { .pc_func = nlm4svc_proc_share, @@ -672,6 +692,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+1, + .pc_name = "SHARE", }, [NLMPROC_UNSHARE] = { .pc_func = nlm4svc_proc_unshare, @@ -680,6 +701,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+1, + .pc_name = "UNSHARE", }, [NLMPROC_NM_LOCK] = { .pc_func = nlm4svc_proc_nm_lock, @@ -688,6 +710,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "NM_LOCK", }, [NLMPROC_FREE_ALL] = { .pc_func = nlm4svc_proc_free_all, @@ -696,5 +719,6 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "FREE_ALL", }, }; diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 50855f2c1f4b..4ae4b63b5392 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -554,6 +554,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "NULL", }, [NLMPROC_TEST] = { .pc_func = nlmsvc_proc_test, @@ -562,6 +563,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+2+No+Rg, + .pc_name = "TEST", }, [NLMPROC_LOCK] = { .pc_func = nlmsvc_proc_lock, @@ -570,6 +572,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "LOCK", }, [NLMPROC_CANCEL] = { .pc_func = nlmsvc_proc_cancel, @@ -578,6 +581,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "CANCEL", }, [NLMPROC_UNLOCK] = { .pc_func = nlmsvc_proc_unlock, @@ -586,6 +590,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "UNLOCK", }, [NLMPROC_GRANTED] = { .pc_func = nlmsvc_proc_granted, @@ -594,6 +599,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "GRANTED", }, [NLMPROC_TEST_MSG] = { .pc_func = nlmsvc_proc_test_msg, @@ -602,6 +608,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "TEST_MSG", }, [NLMPROC_LOCK_MSG] = { .pc_func = nlmsvc_proc_lock_msg, @@ -610,6 +617,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "LOCK_MSG", }, [NLMPROC_CANCEL_MSG] = { .pc_func = nlmsvc_proc_cancel_msg, @@ -618,6 +626,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "CANCEL_MSG", }, [NLMPROC_UNLOCK_MSG] = { .pc_func = nlmsvc_proc_unlock_msg, @@ -626,6 +635,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "UNLOCK_MSG", }, [NLMPROC_GRANTED_MSG] = { .pc_func = nlmsvc_proc_granted_msg, @@ -634,6 +644,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "GRANTED_MSG", }, [NLMPROC_TEST_RES] = { .pc_func = nlmsvc_proc_null, @@ -642,6 +653,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "TEST_RES", }, [NLMPROC_LOCK_RES] = { .pc_func = nlmsvc_proc_null, @@ -650,6 +662,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "LOCK_RES", }, [NLMPROC_CANCEL_RES] = { .pc_func = nlmsvc_proc_null, @@ -658,6 +671,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "CANCEL_RES", }, [NLMPROC_UNLOCK_RES] = { .pc_func = nlmsvc_proc_null, @@ -666,6 +680,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "UNLOCK_RES", }, [NLMPROC_GRANTED_RES] = { .pc_func = nlmsvc_proc_granted_res, @@ -674,6 +689,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "GRANTED_RES", }, [NLMPROC_NSM_NOTIFY] = { .pc_func = nlmsvc_proc_sm_notify, @@ -682,6 +698,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_reboot), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "SM_NOTIFY", }, [17] = { .pc_func = nlmsvc_proc_unused, @@ -690,6 +707,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "UNUSED", }, [18] = { .pc_func = nlmsvc_proc_unused, @@ -698,6 +716,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "UNUSED", }, [19] = { .pc_func = nlmsvc_proc_unused, @@ -706,6 +725,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "UNUSED", }, [NLMPROC_SHARE] = { .pc_func = nlmsvc_proc_share, @@ -714,6 +734,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+1, + .pc_name = "SHARE", }, [NLMPROC_UNSHARE] = { .pc_func = nlmsvc_proc_unshare, @@ -722,6 +743,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+1, + .pc_name = "UNSHARE", }, [NLMPROC_NM_LOCK] = { .pc_func = nlmsvc_proc_nm_lock, @@ -730,6 +752,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "NM_LOCK", }, [NLMPROC_FREE_ALL] = { .pc_func = nlmsvc_proc_free_all, @@ -738,5 +761,6 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = 0, + .pc_name = "FREE_ALL", }, }; diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 79ff172eb1c8..c5348ba81129 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -1060,6 +1060,7 @@ static const struct svc_procedure nfs4_callback_procedures1[] = { .pc_decode = nfs4_decode_void, .pc_encode = nfs4_encode_void, .pc_xdrressize = 1, + .pc_name = "NULL", }, [CB_COMPOUND] = { .pc_func = nfs4_callback_compound, @@ -1067,6 +1068,7 @@ static const struct svc_procedure nfs4_callback_procedures1[] = { .pc_argsize = 256, .pc_ressize = 256, .pc_xdrressize = NFS4_CALLBACK_BUFSIZE, + .pc_name = "COMPOUND", } }; diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index b0f66604532a..899762da23c9 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -371,6 +371,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST, + .pc_name = "NULL", }, [ACLPROC2_GETACL] = { .pc_func = nfsacld_proc_getacl, @@ -381,6 +382,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { .pc_ressize = sizeof(struct nfsd3_getaclres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+1+2*(1+ACL), + .pc_name = "GETACL", }, [ACLPROC2_SETACL] = { .pc_func = nfsacld_proc_setacl, @@ -391,6 +393,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT, + .pc_name = "SETACL", }, [ACLPROC2_GETATTR] = { .pc_func = nfsacld_proc_getattr, @@ -401,6 +404,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT, + .pc_name = "GETATTR", }, [ACLPROC2_ACCESS] = { .pc_func = nfsacld_proc_access, @@ -411,6 +415,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { .pc_ressize = sizeof(struct nfsd3_accessres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT+1, + .pc_name = "SETATTR", }, }; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 7c30876a31a1..9e1a92fb9771 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -251,6 +251,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = { .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST, + .pc_name = "NULL", }, [ACLPROC3_GETACL] = { .pc_func = nfsd3_proc_getacl, @@ -261,6 +262,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = { .pc_ressize = sizeof(struct nfsd3_getaclres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+1+2*(1+ACL), + .pc_name = "GETACL", }, [ACLPROC3_SETACL] = { .pc_func = nfsd3_proc_setacl, @@ -271,6 +273,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = { .pc_ressize = sizeof(struct nfsd3_attrstat), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT, + .pc_name = "SETACL", }, }; diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 76931f4f57c3..c9c64471c568 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -708,6 +708,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST, + .pc_name = "NULL", }, [NFS3PROC_GETATTR] = { .pc_func = nfsd3_proc_getattr, @@ -718,6 +719,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_attrstatres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT, + .pc_name = "GETATTR", }, [NFS3PROC_SETATTR] = { .pc_func = nfsd3_proc_setattr, @@ -728,6 +730,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_wccstatres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC, + .pc_name = "SETATTR", }, [NFS3PROC_LOOKUP] = { .pc_func = nfsd3_proc_lookup, @@ -738,6 +741,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_diropres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+FH+pAT+pAT, + .pc_name = "LOOKUP", }, [NFS3PROC_ACCESS] = { .pc_func = nfsd3_proc_access, @@ -748,6 +752,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_accessres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+1, + .pc_name = "ACCESS", }, [NFS3PROC_READLINK] = { .pc_func = nfsd3_proc_readlink, @@ -758,6 +763,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_readlinkres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+1+NFS3_MAXPATHLEN/4, + .pc_name = "READLINK", }, [NFS3PROC_READ] = { .pc_func = nfsd3_proc_read, @@ -768,6 +774,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_readres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+4+NFSSVC_MAXBLKSIZE/4, + .pc_name = "READ", }, [NFS3PROC_WRITE] = { .pc_func = nfsd3_proc_write, @@ -778,6 +785,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_writeres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC+4, + .pc_name = "WRITE", }, [NFS3PROC_CREATE] = { .pc_func = nfsd3_proc_create, @@ -788,6 +796,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_createres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+(1+FH+pAT)+WC, + .pc_name = "CREATE", }, [NFS3PROC_MKDIR] = { .pc_func = nfsd3_proc_mkdir, @@ -798,6 +807,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_createres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+(1+FH+pAT)+WC, + .pc_name = "MKDIR", }, [NFS3PROC_SYMLINK] = { .pc_func = nfsd3_proc_symlink, @@ -808,6 +818,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_createres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+(1+FH+pAT)+WC, + .pc_name = "SYMLINK", }, [NFS3PROC_MKNOD] = { .pc_func = nfsd3_proc_mknod, @@ -818,6 +829,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_createres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+(1+FH+pAT)+WC, + .pc_name = "MKNOD", }, [NFS3PROC_REMOVE] = { .pc_func = nfsd3_proc_remove, @@ -828,6 +840,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_wccstatres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC, + .pc_name = "REMOVE", }, [NFS3PROC_RMDIR] = { .pc_func = nfsd3_proc_rmdir, @@ -838,6 +851,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_wccstatres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC, + .pc_name = "RMDIR", }, [NFS3PROC_RENAME] = { .pc_func = nfsd3_proc_rename, @@ -848,6 +862,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_renameres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC+WC, + .pc_name = "RENAME", }, [NFS3PROC_LINK] = { .pc_func = nfsd3_proc_link, @@ -858,6 +873,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_linkres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+pAT+WC, + .pc_name = "LINK", }, [NFS3PROC_READDIR] = { .pc_func = nfsd3_proc_readdir, @@ -867,6 +883,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_argsize = sizeof(struct nfsd3_readdirargs), .pc_ressize = sizeof(struct nfsd3_readdirres), .pc_cachetype = RC_NOCACHE, + .pc_name = "READDIR", }, [NFS3PROC_READDIRPLUS] = { .pc_func = nfsd3_proc_readdirplus, @@ -876,6 +893,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_argsize = sizeof(struct nfsd3_readdirplusargs), .pc_ressize = sizeof(struct nfsd3_readdirres), .pc_cachetype = RC_NOCACHE, + .pc_name = "READDIRPLUS", }, [NFS3PROC_FSSTAT] = { .pc_func = nfsd3_proc_fsstat, @@ -885,6 +903,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_fsstatres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+2*6+1, + .pc_name = "FSSTAT", }, [NFS3PROC_FSINFO] = { .pc_func = nfsd3_proc_fsinfo, @@ -894,6 +913,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_fsinfores), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+12, + .pc_name = "FSINFO", }, [NFS3PROC_PATHCONF] = { .pc_func = nfsd3_proc_pathconf, @@ -903,6 +923,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_pathconfres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+6, + .pc_name = "PATHCONF", }, [NFS3PROC_COMMIT] = { .pc_func = nfsd3_proc_commit, @@ -913,6 +934,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_commitres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+WC+2, + .pc_name = "COMMIT", }, }; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 8d6d2678abad..f567592692ee 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -3305,6 +3305,7 @@ static const struct svc_procedure nfsd_procedures4[2] = { .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 1, + .pc_name = "NULL", }, [NFSPROC4_COMPOUND] = { .pc_func = nfsd4_proc_compound, @@ -3315,6 +3316,7 @@ static const struct svc_procedure nfsd_procedures4[2] = { .pc_release = nfsd4_release_compoundargs, .pc_cachetype = RC_NOCACHE, .pc_xdrressize = NFSD_BUFSIZE/4, + .pc_name = "COMPOUND", }, }; diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 9473d048efec..1f85a4dc9d1b 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -623,6 +623,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 0, + .pc_name = "NULL", }, [NFSPROC_GETATTR] = { .pc_func = nfsd_proc_getattr, @@ -633,6 +634,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT, + .pc_name = "GETATTR", }, [NFSPROC_SETATTR] = { .pc_func = nfsd_proc_setattr, @@ -643,6 +645,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+AT, + .pc_name = "SETATTR", }, [NFSPROC_ROOT] = { .pc_func = nfsd_proc_root, @@ -652,6 +655,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 0, + .pc_name = "ROOT", }, [NFSPROC_LOOKUP] = { .pc_func = nfsd_proc_lookup, @@ -662,6 +666,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_diropres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+FH+AT, + .pc_name = "LOOKUP", }, [NFSPROC_READLINK] = { .pc_func = nfsd_proc_readlink, @@ -671,6 +676,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_readlinkres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+1+NFS_MAXPATHLEN/4, + .pc_name = "READLINK", }, [NFSPROC_READ] = { .pc_func = nfsd_proc_read, @@ -681,6 +687,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_readres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4, + .pc_name = "READ", }, [NFSPROC_WRITECACHE] = { .pc_func = nfsd_proc_writecache, @@ -690,6 +697,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 0, + .pc_name = "WRITECACHE", }, [NFSPROC_WRITE] = { .pc_func = nfsd_proc_write, @@ -700,6 +708,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+AT, + .pc_name = "WRITE", }, [NFSPROC_CREATE] = { .pc_func = nfsd_proc_create, @@ -710,6 +719,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_diropres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+FH+AT, + .pc_name = "CREATE", }, [NFSPROC_REMOVE] = { .pc_func = nfsd_proc_remove, @@ -719,6 +729,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, + .pc_name = "REMOVE", }, [NFSPROC_RENAME] = { .pc_func = nfsd_proc_rename, @@ -728,6 +739,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, + .pc_name = "RENAME", }, [NFSPROC_LINK] = { .pc_func = nfsd_proc_link, @@ -737,6 +749,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, + .pc_name = "LINK", }, [NFSPROC_SYMLINK] = { .pc_func = nfsd_proc_symlink, @@ -746,6 +759,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, + .pc_name = "SYMLINK", }, [NFSPROC_MKDIR] = { .pc_func = nfsd_proc_mkdir, @@ -756,6 +770,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_diropres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+FH+AT, + .pc_name = "MKDIR", }, [NFSPROC_RMDIR] = { .pc_func = nfsd_proc_rmdir, @@ -765,6 +780,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, + .pc_name = "RMDIR", }, [NFSPROC_READDIR] = { .pc_func = nfsd_proc_readdir, @@ -773,6 +789,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_argsize = sizeof(struct nfsd_readdirargs), .pc_ressize = sizeof(struct nfsd_readdirres), .pc_cachetype = RC_NOCACHE, + .pc_name = "READDIR", }, [NFSPROC_STATFS] = { .pc_func = nfsd_proc_statfs, @@ -782,6 +799,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_statfsres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+5, + .pc_name = "STATFS", }, }; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 34c2a69820e9..31ee3b6047c3 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -463,6 +463,7 @@ struct svc_procedure { unsigned int pc_ressize; /* result struct size */ unsigned int pc_cachetype; /* cache info (NFS) */ unsigned int pc_xdrressize; /* maximum size of XDR reply */ + const char * pc_name; /* for display */ }; /* From 89ff87494c6e4b32ea7960d0c644efdbb2fe6ef5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 3 Dec 2020 10:22:09 -0500 Subject: [PATCH 02/99] SUNRPC: Display RPC procedure names instead of proc numbers Make the sunrpc trace subsystem trace events easier to use. Signed-off-by: Chuck Lever --- include/trace/events/sunrpc.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 6f89c27265f5..036eb1f5c133 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1603,6 +1603,7 @@ TRACE_EVENT(svc_process, __field(u32, vers) __field(u32, proc) __string(service, name) + __string(procedure, rqst->rq_procinfo->pc_name) __string(addr, rqst->rq_xprt ? rqst->rq_xprt->xpt_remotebuf : "(null)") ), @@ -1612,13 +1613,16 @@ TRACE_EVENT(svc_process, __entry->vers = rqst->rq_vers; __entry->proc = rqst->rq_proc; __assign_str(service, name); + __assign_str(procedure, rqst->rq_procinfo->pc_name); __assign_str(addr, rqst->rq_xprt ? rqst->rq_xprt->xpt_remotebuf : "(null)"); ), - TP_printk("addr=%s xid=0x%08x service=%s vers=%u proc=%u", + TP_printk("addr=%s xid=0x%08x service=%s vers=%u proc=%s", __get_str(addr), __entry->xid, - __get_str(service), __entry->vers, __entry->proc) + __get_str(service), __entry->vers, + __get_str(procedure) + ) ); DECLARE_EVENT_CLASS(svc_rqst_event, @@ -1874,6 +1878,7 @@ TRACE_EVENT(svc_stats_latency, TP_STRUCT__entry( __field(u32, xid) __field(unsigned long, execute) + __string(procedure, rqst->rq_procinfo->pc_name) __string(addr, rqst->rq_xprt->xpt_remotebuf) ), @@ -1881,11 +1886,13 @@ TRACE_EVENT(svc_stats_latency, __entry->xid = be32_to_cpu(rqst->rq_xid); __entry->execute = ktime_to_us(ktime_sub(ktime_get(), rqst->rq_stime)); + __assign_str(procedure, rqst->rq_procinfo->pc_name); __assign_str(addr, rqst->rq_xprt->xpt_remotebuf); ), - TP_printk("addr=%s xid=0x%08x execute-us=%lu", - __get_str(addr), __entry->xid, __entry->execute) + TP_printk("addr=%s xid=0x%08x proc=%s execute-us=%lu", + __get_str(addr), __entry->xid, __get_str(procedure), + __entry->execute) ); DECLARE_EVENT_CLASS(svc_deferred_event, From 81d217474326b25d7f14274b02fe3da1e85ad934 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 27 Nov 2020 17:37:02 -0500 Subject: [PATCH 03/99] SUNRPC: Move definition of XDR_UNIT Clean up: The unit of XDR alignment is defined by RFC 4506, not as part of the RPC message header. Thus it belongs in include/linux/sunrpc/xdr.h. Signed-off-by: Chuck Lever --- include/linux/sunrpc/msg_prot.h | 3 --- include/linux/sunrpc/xdr.h | 13 ++++++++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h index 43f854487539..938c2bf29db8 100644 --- a/include/linux/sunrpc/msg_prot.h +++ b/include/linux/sunrpc/msg_prot.h @@ -10,9 +10,6 @@ #define RPC_VERSION 2 -/* size of an XDR encoding unit in bytes, i.e. 32bit */ -#define XDR_UNIT (4) - /* spec defines authentication flavor as an unsigned 32 bit integer */ typedef u32 rpc_authflavor_t; diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 19b6dea27367..dbba537caab6 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -19,6 +19,13 @@ struct bio_vec; struct rpc_rqst; +/* + * Size of an XDR encoding unit in bytes, i.e. 32 bits, + * as defined in Section 3 of RFC 4506. All encoded + * XDR data items are aligned on a boundary of 32 bits. + */ +#define XDR_UNIT sizeof(__be32) + /* * Buffer adjustment */ @@ -330,7 +337,7 @@ ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str, static inline size_t xdr_align_size(size_t n) { - const size_t mask = sizeof(__u32) - 1; + const size_t mask = XDR_UNIT - 1; return (n + mask) & ~mask; } @@ -360,7 +367,7 @@ static inline size_t xdr_pad_size(size_t n) */ static inline ssize_t xdr_stream_encode_item_present(struct xdr_stream *xdr) { - const size_t len = sizeof(__be32); + const size_t len = XDR_UNIT; __be32 *p = xdr_reserve_space(xdr, len); if (unlikely(!p)) @@ -379,7 +386,7 @@ static inline ssize_t xdr_stream_encode_item_present(struct xdr_stream *xdr) */ static inline int xdr_stream_encode_item_absent(struct xdr_stream *xdr) { - const size_t len = sizeof(__be32); + const size_t len = XDR_UNIT; __be32 *p = xdr_reserve_space(xdr, len); if (unlikely(!p)) From 9575363a9e4c8d7e2f9ba5e79884d623fff0be6f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 20 Oct 2020 14:30:02 -0400 Subject: [PATCH 04/99] NFSD: Update GETATTR3args decoder to use struct xdr_stream Signed-off-by: Chuck Lever --- fs/nfsd/nfs3proc.c | 3 +-- fs/nfsd/nfs3xdr.c | 31 +++++++++++++++++++++++++------ fs/nfsd/xdr3.h | 2 +- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index c9c64471c568..4b66f055141b 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -683,7 +683,6 @@ nfsd3_proc_commit(struct svc_rqst *rqstp) * NFSv3 Server procedures. * Only the results of non-idempotent operations are cached. */ -#define nfs3svc_decode_fhandleargs nfs3svc_decode_fhandle #define nfs3svc_encode_attrstatres nfs3svc_encode_attrstat #define nfs3svc_encode_wccstatres nfs3svc_encode_wccstat #define nfsd3_mkdirargs nfsd3_createargs @@ -715,7 +714,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_decode = nfs3svc_decode_fhandleargs, .pc_encode = nfs3svc_encode_attrstatres, .pc_release = nfs3svc_release_fhandle, - .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_argsize = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd3_attrstatres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT, diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 34b880211e5e..3a2b4abea1a4 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -29,8 +29,9 @@ static u32 nfs3_ftypes[] = { /* - * XDR functions for basic NFS types + * Basic NFSv3 data types (RFC 1813 Sections 2.5 and 2.6) */ + static __be32 * encode_time3(__be32 *p, struct timespec64 *time) { @@ -46,6 +47,26 @@ decode_time3(__be32 *p, struct timespec64 *time) return p; } +static bool +svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp) +{ + __be32 *p; + u32 size; + + if (xdr_stream_decode_u32(xdr, &size) < 0) + return false; + if (size == 0 || size > NFS3_FHSIZE) + return false; + p = xdr_inline_decode(xdr, size); + if (!p) + return false; + fh_init(fhp, NFS3_FHSIZE); + fhp->fh_handle.fh_size = size; + memcpy(&fhp->fh_handle.fh_base, p, size); + + return true; +} + static __be32 * decode_fh(__be32 *p, struct svc_fh *fhp) { @@ -312,14 +333,12 @@ void fill_post_wcc(struct svc_fh *fhp) */ int -nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p) +nfs3svc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_fhandle *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) - return 0; - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_nfs_fh3(xdr, &args->fh); } int diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 456fcd7a1038..62ea669768cf 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -273,7 +273,7 @@ union nfsd3_xdrstore { #define NFS3_SVC_XDRSIZE sizeof(union nfsd3_xdrstore) -int nfs3svc_decode_fhandle(struct svc_rqst *, __be32 *); +int nfs3svc_decode_fhandleargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_accessargs(struct svc_rqst *, __be32 *); From 3b921a2b14251e9e203f1e8af76e8ade79f50e50 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 20 Oct 2020 14:32:04 -0400 Subject: [PATCH 05/99] NFSD: Update ACCESS3arg decoder to use struct xdr_stream Signed-off-by: Chuck Lever --- fs/nfsd/nfs3xdr.c | 9 +++++---- fs/nfsd/xdr3.h | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 3a2b4abea1a4..e07cebd80ef7 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -375,14 +375,15 @@ nfs3svc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p) int nfs3svc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_accessargs *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u32(xdr, &args->access) < 0) return 0; - args->access = ntohl(*p++); - return xdr_argsize_check(rqstp, p); + return 1; } int diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 62ea669768cf..a4dce4baec7c 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -25,7 +25,7 @@ struct nfsd3_diropargs { struct nfsd3_accessargs { struct svc_fh fh; - unsigned int access; + __u32 access; }; struct nfsd3_readargs { From be63bd2ac6bbf8c065a0ef6dfbea76934326c352 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 20 Oct 2020 14:34:40 -0400 Subject: [PATCH 06/99] NFSD: Update READ3arg decoder to use struct xdr_stream The code that sets up rq_vec is refactored so that it is now adjacent to the nfsd_read() call site where it is used. Signed-off-by: Chuck Lever --- fs/nfsd/nfs3proc.c | 23 ++++++++++++++++++----- fs/nfsd/nfs3xdr.c | 28 +++++++--------------------- fs/nfsd/xdr3.h | 1 - 3 files changed, 25 insertions(+), 27 deletions(-) diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 4b66f055141b..acdf47179a38 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -144,25 +144,38 @@ nfsd3_proc_read(struct svc_rqst *rqstp) { struct nfsd3_readargs *argp = rqstp->rq_argp; struct nfsd3_readres *resp = rqstp->rq_resp; - u32 max_blocksize = svc_max_payload(rqstp); - unsigned long cnt = min(argp->count, max_blocksize); + u32 max_blocksize = svc_max_payload(rqstp); + unsigned int len; + int v; + + argp->count = min_t(u32, argp->count, max_blocksize); dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n", SVCFH_fmt(&argp->fh), (unsigned long) argp->count, (unsigned long long) argp->offset); + v = 0; + len = argp->count; + while (len > 0) { + struct page *page = *(rqstp->rq_next_page++); + + rqstp->rq_vec[v].iov_base = page_address(page); + rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); + len -= rqstp->rq_vec[v].iov_len; + v++; + } + /* Obtain buffer pointer for payload. * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) * + 1 (xdr opaque byte count) = 26 */ - resp->count = cnt; + resp->count = argp->count; svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); fh_copy(&resp->fh, &argp->fh); resp->status = nfsd_read(rqstp, &resp->fh, argp->offset, - rqstp->rq_vec, argp->vlen, &resp->count, - &resp->eof); + rqstp->rq_vec, v, &resp->count, &resp->eof); return rpc_success; } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index e07cebd80ef7..2f32df15a7e8 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -389,31 +389,17 @@ nfs3svc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p) int nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_readargs *args = rqstp->rq_argp; - unsigned int len; - int v; - u32 max_blocksize = svc_max_payload(rqstp); - p = decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u64(xdr, &args->offset) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) return 0; - p = xdr_decode_hyper(p, &args->offset); - args->count = ntohl(*p++); - len = min(args->count, max_blocksize); - - /* set up the kvec */ - v=0; - while (len > 0) { - struct page *p = *(rqstp->rq_next_page++); - - rqstp->rq_vec[v].iov_base = page_address(p); - rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); - len -= rqstp->rq_vec[v].iov_len; - v++; - } - args->vlen = v; - return xdr_argsize_check(rqstp, p); + return 1; } int diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index a4dce4baec7c..7dfeeaa4e1df 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -32,7 +32,6 @@ struct nfsd3_readargs { struct svc_fh fh; __u64 offset; __u32 count; - int vlen; }; struct nfsd3_writeargs { From c43b2f229a01969a7ccf94b033c5085e0ec2040c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 22 Oct 2020 11:14:55 -0400 Subject: [PATCH 07/99] NFSD: Update WRITE3arg decoder to use struct xdr_stream As part of the update, open code that sanity-checks the size of the data payload against the length of the RPC Call message has to be re-implemented to use xdr_stream infrastructure. Signed-off-by: Chuck Lever --- fs/nfsd/nfs3xdr.c | 51 +++++++++++++++++++---------------------------- 1 file changed, 20 insertions(+), 31 deletions(-) diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 2f32df15a7e8..c06467e8ac82 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -405,52 +405,41 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p) int nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_writeargs *args = rqstp->rq_argp; - unsigned int len, hdr, dlen; u32 max_blocksize = svc_max_payload(rqstp); struct kvec *head = rqstp->rq_arg.head; struct kvec *tail = rqstp->rq_arg.tail; + size_t remaining; - p = decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u64(xdr, &args->offset) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &args->stable) < 0) return 0; - p = xdr_decode_hyper(p, &args->offset); - args->count = ntohl(*p++); - args->stable = ntohl(*p++); - len = args->len = ntohl(*p++); - if ((void *)p > head->iov_base + head->iov_len) + /* opaque data */ + if (xdr_stream_decode_u32(xdr, &args->len) < 0) return 0; - /* - * The count must equal the amount of data passed. - */ + + /* request sanity */ if (args->count != args->len) return 0; - - /* - * Check to make sure that we got the right number of - * bytes. - */ - hdr = (void*)p - head->iov_base; - dlen = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len - hdr; - /* - * Round the length of the data which was specified up to - * the next multiple of XDR units and then compare that - * against the length which was actually received. - * Note that when RPCSEC/GSS (for example) is used, the - * data buffer can be padded so dlen might be larger - * than required. It must never be smaller. - */ - if (dlen < XDR_QUADLEN(len)*4) + remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len; + remaining -= xdr_stream_pos(xdr); + if (remaining < xdr_align_size(args->len)) return 0; - if (args->count > max_blocksize) { args->count = max_blocksize; - len = args->len = max_blocksize; + args->len = max_blocksize; } - args->first.iov_base = (void *)p; - args->first.iov_len = head->iov_len - hdr; + args->first.iov_base = xdr->p; + args->first.iov_len = head->iov_len - xdr_stream_pos(xdr); + return 1; } From 224c1c894e48cd72e4dd9fb6311be80cbe1369b0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 24 Oct 2020 12:51:18 -0400 Subject: [PATCH 08/99] NFSD: Update READLINK3arg decoder to use struct xdr_stream The NFSv3 READLINK request takes a single filehandle, so it can re-use GETATTR's decoder. Signed-off-by: Chuck Lever --- fs/nfsd/nfs3proc.c | 9 +++++---- fs/nfsd/nfs3xdr.c | 13 ------------- fs/nfsd/xdr3.h | 6 ------ 3 files changed, 5 insertions(+), 23 deletions(-) diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index acdf47179a38..9e289e0f439b 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -124,15 +124,16 @@ nfsd3_proc_access(struct svc_rqst *rqstp) static __be32 nfsd3_proc_readlink(struct svc_rqst *rqstp) { - struct nfsd3_readlinkargs *argp = rqstp->rq_argp; + struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd3_readlinkres *resp = rqstp->rq_resp; + char *buffer = page_address(*(rqstp->rq_next_page++)); dprintk("nfsd: READLINK(3) %s\n", SVCFH_fmt(&argp->fh)); /* Read the symlink. */ fh_copy(&resp->fh, &argp->fh); resp->len = NFS3_MAXPATHLEN; - resp->status = nfsd_readlink(rqstp, &resp->fh, argp->buffer, &resp->len); + resp->status = nfsd_readlink(rqstp, &resp->fh, buffer, &resp->len); return rpc_success; } @@ -768,10 +769,10 @@ static const struct svc_procedure nfsd_procedures3[22] = { }, [NFS3PROC_READLINK] = { .pc_func = nfsd3_proc_readlink, - .pc_decode = nfs3svc_decode_readlinkargs, + .pc_decode = nfs3svc_decode_fhandleargs, .pc_encode = nfs3svc_encode_readlinkres, .pc_release = nfs3svc_release_fhandle, - .pc_argsize = sizeof(struct nfsd3_readlinkargs), + .pc_argsize = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd3_readlinkres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+1+NFS3_MAXPATHLEN/4, diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index c06467e8ac82..6b6a839c1fc8 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -543,19 +543,6 @@ nfs3svc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p) return xdr_argsize_check(rqstp, p); } -int -nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p) -{ - struct nfsd3_readlinkargs *args = rqstp->rq_argp; - - p = decode_fh(p, &args->fh); - if (!p) - return 0; - args->buffer = page_address(*(rqstp->rq_next_page++)); - - return xdr_argsize_check(rqstp, p); -} - int nfs3svc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p) { diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 7dfeeaa4e1df..08f909142ddf 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -70,11 +70,6 @@ struct nfsd3_renameargs { unsigned int tlen; }; -struct nfsd3_readlinkargs { - struct svc_fh fh; - char * buffer; -}; - struct nfsd3_linkargs { struct svc_fh ffh; struct svc_fh tfh; @@ -282,7 +277,6 @@ int nfs3svc_decode_createargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_mkdirargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_mknodargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_renameargs(struct svc_rqst *, __be32 *); -int nfs3svc_decode_readlinkargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_linkargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *); From 0a8f37fb34a96267c656f7254e69bb9a2fc89fe4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 10 Nov 2020 10:24:39 -0500 Subject: [PATCH 09/99] NFSD: Fix returned READDIR offset cookie Code inspection shows that the server's NFSv3 READDIR implementation handles offset cookies slightly differently than the NFSv2 READDIR, NFSv3 READDIRPLUS, and NFSv4 READDIR implementations, and there doesn't seem to be any need for this difference. As a clean up, I copied the logic from nfsd3_proc_readdirplus(). Signed-off-by: Chuck Lever --- fs/nfsd/nfs3proc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 9e289e0f439b..7ea2fb127f6f 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -444,6 +444,7 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) struct nfsd3_readdirargs *argp = rqstp->rq_argp; struct nfsd3_readdirres *resp = rqstp->rq_resp; int count = 0; + loff_t offset; struct page **p; caddr_t page_addr = NULL; @@ -462,7 +463,9 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) resp->common.err = nfs_ok; resp->buffer = argp->buffer; resp->rqstp = rqstp; - resp->status = nfsd_readdir(rqstp, &resp->fh, (loff_t *)&argp->cookie, + offset = argp->cookie; + + resp->status = nfsd_readdir(rqstp, &resp->fh, &offset, &resp->common, nfs3svc_encode_entry); memcpy(resp->verf, argp->verf, 8); count = 0; @@ -478,8 +481,6 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) } resp->count = count >> 2; if (resp->offset) { - loff_t offset = argp->cookie; - if (unlikely(resp->offset1)) { /* we ended up with offset on a page boundary */ *resp->offset = htonl(offset >> 32); From 40116ebd0934cca7e46423bdb3397d3d27eb9fb9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Nov 2020 09:50:23 -0500 Subject: [PATCH 10/99] NFSD: Add helper to set up the pages where the dirlist is encoded De-duplicate some code that is used by both READDIR and READDIRPLUS to build the dirlist in the Reply. Because this code is not related to decoding READ arguments, it is moved to a more appropriate spot. Signed-off-by: Chuck Lever --- fs/nfsd/nfs3proc.c | 29 +++++++++++++++++++---------- fs/nfsd/nfs3xdr.c | 20 -------------------- fs/nfsd/xdr3.h | 1 - 3 files changed, 19 insertions(+), 31 deletions(-) diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 7ea2fb127f6f..8675851199f8 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -435,6 +435,23 @@ nfsd3_proc_link(struct svc_rqst *rqstp) return rpc_success; } +static void nfsd3_init_dirlist_pages(struct svc_rqst *rqstp, + struct nfsd3_readdirres *resp, + int count) +{ + count = min_t(u32, count, svc_max_payload(rqstp)); + + /* Convert byte count to number of words (i.e. >> 2), + * and reserve room for the NULL ptr & eof flag (-2 words) */ + resp->buflen = (count >> 2) - 2; + + resp->buffer = page_address(*rqstp->rq_next_page); + while (count > 0) { + rqstp->rq_next_page++; + count -= PAGE_SIZE; + } +} + /* * Read a portion of a directory. */ @@ -452,16 +469,12 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) SVCFH_fmt(&argp->fh), argp->count, (u32) argp->cookie); - /* Make sure we've room for the NULL ptr & eof flag, and shrink to - * client read size */ - count = (argp->count >> 2) - 2; + nfsd3_init_dirlist_pages(rqstp, resp, argp->count); /* Read directory and encode entries on the fly */ fh_copy(&resp->fh, &argp->fh); - resp->buflen = count; resp->common.err = nfs_ok; - resp->buffer = argp->buffer; resp->rqstp = rqstp; offset = argp->cookie; @@ -513,16 +526,12 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp) SVCFH_fmt(&argp->fh), argp->count, (u32) argp->cookie); - /* Convert byte count to number of words (i.e. >> 2), - * and reserve room for the NULL ptr & eof flag (-2 words) */ - resp->count = (argp->count >> 2) - 2; + nfsd3_init_dirlist_pages(rqstp, resp, argp->count); /* Read directory and encode entries on the fly */ fh_copy(&resp->fh, &argp->fh); resp->common.err = nfs_ok; - resp->buffer = argp->buffer; - resp->buflen = resp->count; resp->rqstp = rqstp; offset = argp->cookie; diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 6b6a839c1fc8..8394aeb8381e 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -560,8 +560,6 @@ int nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_readdirargs *args = rqstp->rq_argp; - int len; - u32 max_blocksize = svc_max_payload(rqstp); p = decode_fh(p, &args->fh); if (!p) @@ -570,14 +568,6 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) args->verf = p; p += 2; args->dircount = ~0; args->count = ntohl(*p++); - len = args->count = min_t(u32, args->count, max_blocksize); - - while (len > 0) { - struct page *p = *(rqstp->rq_next_page++); - if (!args->buffer) - args->buffer = page_address(p); - len -= PAGE_SIZE; - } return xdr_argsize_check(rqstp, p); } @@ -586,8 +576,6 @@ int nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_readdirargs *args = rqstp->rq_argp; - int len; - u32 max_blocksize = svc_max_payload(rqstp); p = decode_fh(p, &args->fh); if (!p) @@ -597,14 +585,6 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p) args->dircount = ntohl(*p++); args->count = ntohl(*p++); - len = args->count = min(args->count, max_blocksize); - while (len > 0) { - struct page *p = *(rqstp->rq_next_page++); - if (!args->buffer) - args->buffer = page_address(p); - len -= PAGE_SIZE; - } - return xdr_argsize_check(rqstp, p); } diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 08f909142ddf..789a364d5e69 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -93,7 +93,6 @@ struct nfsd3_readdirargs { __u32 dircount; __u32 count; __be32 * verf; - __be32 * buffer; }; struct nfsd3_commitargs { From 9cedc2e64c296efb3bebe93a0ceeb5e71e8d722d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Oct 2020 13:23:52 -0400 Subject: [PATCH 11/99] NFSD: Update READDIR3args decoders to use struct xdr_stream As an additional clean up, neither nfsd3_proc_readdir() nor nfsd3_proc_readdirplus() make use of the dircount argument, so remove it from struct nfsd3_readdirargs. Signed-off-by: Chuck Lever --- fs/nfsd/nfs3xdr.c | 38 ++++++++++++++++++++++++-------------- fs/nfsd/xdr3.h | 1 - 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 8394aeb8381e..eb55be106a04 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -559,33 +559,43 @@ nfs3svc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p) int nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_readdirargs *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u64(xdr, &args->cookie) < 0) + return 0; + args->verf = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE); + if (!args->verf) + return 0; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) return 0; - p = xdr_decode_hyper(p, &args->cookie); - args->verf = p; p += 2; - args->dircount = ~0; - args->count = ntohl(*p++); - return xdr_argsize_check(rqstp, p); + return 1; } int nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_readdirargs *args = rqstp->rq_argp; + u32 dircount; - p = decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u64(xdr, &args->cookie) < 0) + return 0; + args->verf = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE); + if (!args->verf) + return 0; + /* dircount is ignored */ + if (xdr_stream_decode_u32(xdr, &dircount) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) return 0; - p = xdr_decode_hyper(p, &args->cookie); - args->verf = p; p += 2; - args->dircount = ntohl(*p++); - args->count = ntohl(*p++); - return xdr_argsize_check(rqstp, p); + return 1; } int diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 789a364d5e69..64af5b01c5d7 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -90,7 +90,6 @@ struct nfsd3_symlinkargs { struct nfsd3_readdirargs { struct svc_fh fh; __u64 cookie; - __u32 dircount; __u32 count; __be32 * verf; }; From c8d26a0acfe77f0880e0acfe77e4209cf8f3a38b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 20 Oct 2020 14:41:56 -0400 Subject: [PATCH 12/99] NFSD: Update COMMIT3arg decoder to use struct xdr_stream Signed-off-by: Chuck Lever --- fs/nfsd/nfs3xdr.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index eb55be106a04..bafb84c97861 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -601,14 +601,17 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p) int nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_commitargs *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) - return 0; - p = xdr_decode_hyper(p, &args->offset); - args->count = ntohl(*p++); - return xdr_argsize_check(rqstp, p); + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u64(xdr, &args->offset) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) + return 0; + + return 1; } /* From 54d1d43dc709f58be38d278bfc38e9bfb38d35fc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 20 Oct 2020 15:42:33 -0400 Subject: [PATCH 13/99] NFSD: Update the NFSv3 DIROPargs decoder to use struct xdr_stream Signed-off-by: Chuck Lever --- fs/nfsd/nfs3xdr.c | 40 +++++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index bafb84c97861..299ea8bbd685 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -117,6 +117,39 @@ decode_filename(__be32 *p, char **namp, unsigned int *lenp) return p; } +static bool +svcxdr_decode_filename3(struct xdr_stream *xdr, char **name, unsigned int *len) +{ + u32 size, i; + __be32 *p; + char *c; + + if (xdr_stream_decode_u32(xdr, &size) < 0) + return false; + if (size == 0 || size > NFS3_MAXNAMLEN) + return false; + p = xdr_inline_decode(xdr, size); + if (!p) + return false; + + *len = size; + *name = (char *)p; + for (i = 0, c = *name; i < size; i++, c++) { + if (*c == '\0' || *c == '/') + return false; + } + + return true; +} + +static bool +svcxdr_decode_diropargs3(struct xdr_stream *xdr, struct svc_fh *fhp, + char **name, unsigned int *len) +{ + return svcxdr_decode_nfs_fh3(xdr, fhp) && + svcxdr_decode_filename3(xdr, name, len); +} + static __be32 * decode_sattr3(__be32 *p, struct iattr *iap, struct user_namespace *userns) { @@ -363,13 +396,10 @@ nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p) int nfs3svc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_diropargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->fh)) - || !(p = decode_filename(p, &args->name, &args->len))) - return 0; - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_diropargs3(xdr, &args->fh, &args->name, &args->len); } int From d181e0a4bef36ee74d1338e5b5c2561d7463a5d0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 20 Oct 2020 15:44:12 -0400 Subject: [PATCH 14/99] NFSD: Update the RENAME3args decoder to use struct xdr_stream Signed-off-by: Chuck Lever --- fs/nfsd/nfs3xdr.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 299ea8bbd685..f870a068aad8 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -562,15 +562,13 @@ nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, __be32 *p) int nfs3svc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_renameargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->ffh)) - || !(p = decode_filename(p, &args->fname, &args->flen)) - || !(p = decode_fh(p, &args->tfh)) - || !(p = decode_filename(p, &args->tname, &args->tlen))) - return 0; - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_diropargs3(xdr, &args->ffh, + &args->fname, &args->flen) && + svcxdr_decode_diropargs3(xdr, &args->tfh, + &args->tname, &args->tlen); } int From efaa1e7c2c7475f0a9bbeb904d9aba09b73dd52a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Oct 2020 13:26:32 -0400 Subject: [PATCH 15/99] NFSD: Update the LINK3args decoder to use struct xdr_stream Signed-off-by: Chuck Lever --- fs/nfsd/nfs3xdr.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index f870a068aad8..9437dda2646f 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -574,14 +574,12 @@ nfs3svc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p) int nfs3svc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_linkargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->ffh)) - || !(p = decode_fh(p, &args->tfh)) - || !(p = decode_filename(p, &args->tname, &args->tlen))) - return 0; - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_nfs_fh3(xdr, &args->ffh) && + svcxdr_decode_diropargs3(xdr, &args->tfh, + &args->tname, &args->tlen); } int From 9cde9360d18d8b352b737d10f90f2aecccf93dbe Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 20 Oct 2020 15:48:22 -0400 Subject: [PATCH 16/99] NFSD: Update the SETATTR3args decoder to use struct xdr_stream Signed-off-by: Chuck Lever --- fs/nfsd/nfs3xdr.c | 138 +++++++++++++++++++++++++++++++++----- include/uapi/linux/nfs3.h | 6 ++ 2 files changed, 127 insertions(+), 17 deletions(-) diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 9437dda2646f..6a6bf8e34d82 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -39,12 +39,18 @@ encode_time3(__be32 *p, struct timespec64 *time) return p; } -static __be32 * -decode_time3(__be32 *p, struct timespec64 *time) +static bool +svcxdr_decode_nfstime3(struct xdr_stream *xdr, struct timespec64 *timep) { - time->tv_sec = ntohl(*p++); - time->tv_nsec = ntohl(*p++); - return p; + __be32 *p; + + p = xdr_inline_decode(xdr, XDR_UNIT * 2); + if (!p) + return false; + timep->tv_sec = be32_to_cpup(p++); + timep->tv_nsec = be32_to_cpup(p); + + return true; } static bool @@ -150,6 +156,112 @@ svcxdr_decode_diropargs3(struct xdr_stream *xdr, struct svc_fh *fhp, svcxdr_decode_filename3(xdr, name, len); } +static bool +svcxdr_decode_sattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr, + struct iattr *iap) +{ + u32 set_it; + + iap->ia_valid = 0; + + if (xdr_stream_decode_bool(xdr, &set_it) < 0) + return false; + if (set_it) { + u32 mode; + + if (xdr_stream_decode_u32(xdr, &mode) < 0) + return false; + iap->ia_valid |= ATTR_MODE; + iap->ia_mode = mode; + } + if (xdr_stream_decode_bool(xdr, &set_it) < 0) + return false; + if (set_it) { + u32 uid; + + if (xdr_stream_decode_u32(xdr, &uid) < 0) + return false; + iap->ia_uid = make_kuid(nfsd_user_namespace(rqstp), uid); + if (uid_valid(iap->ia_uid)) + iap->ia_valid |= ATTR_UID; + } + if (xdr_stream_decode_bool(xdr, &set_it) < 0) + return false; + if (set_it) { + u32 gid; + + if (xdr_stream_decode_u32(xdr, &gid) < 0) + return false; + iap->ia_gid = make_kgid(nfsd_user_namespace(rqstp), gid); + if (gid_valid(iap->ia_gid)) + iap->ia_valid |= ATTR_GID; + } + if (xdr_stream_decode_bool(xdr, &set_it) < 0) + return false; + if (set_it) { + u64 newsize; + + if (xdr_stream_decode_u64(xdr, &newsize) < 0) + return false; + iap->ia_valid |= ATTR_SIZE; + iap->ia_size = min_t(u64, newsize, NFS_OFFSET_MAX); + } + if (xdr_stream_decode_u32(xdr, &set_it) < 0) + return false; + switch (set_it) { + case DONT_CHANGE: + break; + case SET_TO_SERVER_TIME: + iap->ia_valid |= ATTR_ATIME; + break; + case SET_TO_CLIENT_TIME: + if (!svcxdr_decode_nfstime3(xdr, &iap->ia_atime)) + return false; + iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; + break; + default: + return false; + } + if (xdr_stream_decode_u32(xdr, &set_it) < 0) + return false; + switch (set_it) { + case DONT_CHANGE: + break; + case SET_TO_SERVER_TIME: + iap->ia_valid |= ATTR_MTIME; + break; + case SET_TO_CLIENT_TIME: + if (!svcxdr_decode_nfstime3(xdr, &iap->ia_mtime)) + return false; + iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET; + break; + default: + return false; + } + + return true; +} + +static bool +svcxdr_decode_sattrguard3(struct xdr_stream *xdr, struct nfsd3_sattrargs *args) +{ + __be32 *p; + u32 check; + + if (xdr_stream_decode_bool(xdr, &check) < 0) + return false; + if (check) { + p = xdr_inline_decode(xdr, XDR_UNIT * 2); + if (!p) + return false; + args->check_guard = 1; + args->guardtime = be32_to_cpup(p); + } else + args->check_guard = 0; + + return true; +} + static __be32 * decode_sattr3(__be32 *p, struct iattr *iap, struct user_namespace *userns) { @@ -377,20 +489,12 @@ nfs3svc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p) int nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_sattrargs *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) - return 0; - p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp)); - - if ((args->check_guard = ntohl(*p++)) != 0) { - struct timespec64 time; - p = decode_time3(p, &time); - args->guardtime = time.tv_sec; - } - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_nfs_fh3(xdr, &args->fh) && + svcxdr_decode_sattr3(rqstp, xdr, &args->attrs) && + svcxdr_decode_sattrguard3(xdr, args); } int diff --git a/include/uapi/linux/nfs3.h b/include/uapi/linux/nfs3.h index 37e4b34e6b43..c22ab77713bd 100644 --- a/include/uapi/linux/nfs3.h +++ b/include/uapi/linux/nfs3.h @@ -63,6 +63,12 @@ enum nfs3_ftype { NF3BAD = 8 }; +enum nfs3_time_how { + DONT_CHANGE = 0, + SET_TO_SERVER_TIME = 1, + SET_TO_CLIENT_TIME = 2, +}; + struct nfs3_fh { unsigned short size; unsigned char data[NFS3_FHSIZE]; From 6b3a11960d898b25a30103cc6a2ff0b24b90a83b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 20 Oct 2020 15:56:11 -0400 Subject: [PATCH 17/99] NFSD: Update the CREATE3args decoder to use struct xdr_stream Signed-off-by: Chuck Lever --- fs/nfsd/nfs3xdr.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 6a6bf8e34d82..24db3725a070 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -580,26 +580,26 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) int nfs3svc_decode_createargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_createargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->fh)) - || !(p = decode_filename(p, &args->name, &args->len))) + if (!svcxdr_decode_diropargs3(xdr, &args->fh, &args->name, &args->len)) return 0; - - switch (args->createmode = ntohl(*p++)) { + if (xdr_stream_decode_u32(xdr, &args->createmode) < 0) + return 0; + switch (args->createmode) { case NFS3_CREATE_UNCHECKED: case NFS3_CREATE_GUARDED: - p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp)); - break; + return svcxdr_decode_sattr3(rqstp, xdr, &args->attrs); case NFS3_CREATE_EXCLUSIVE: - args->verf = p; - p += 2; + args->verf = xdr_inline_decode(xdr, NFS3_CREATEVERFSIZE); + if (!args->verf) + return 0; break; default: return 0; } - - return xdr_argsize_check(rqstp, p); + return 1; } int From 83374c278db193f3e8b2608b45da1132b867a760 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 20 Oct 2020 17:02:16 -0400 Subject: [PATCH 18/99] NFSD: Update the MKDIR3args decoder to use struct xdr_stream Signed-off-by: Chuck Lever --- fs/nfsd/nfs3xdr.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 24db3725a070..b4071cda1d65 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -605,14 +605,12 @@ nfs3svc_decode_createargs(struct svc_rqst *rqstp, __be32 *p) int nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_createargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->fh)) || - !(p = decode_filename(p, &args->name, &args->len))) - return 0; - p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp)); - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_diropargs3(xdr, &args->fh, + &args->name, &args->len) && + svcxdr_decode_sattr3(rqstp, xdr, &args->attrs); } int From da39201637297460c13134c29286a00f3a1c92fe Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 20 Oct 2020 16:01:16 -0400 Subject: [PATCH 19/99] NFSD: Update the SYMLINK3args decoder to use struct xdr_stream Similar to the WRITE decoder, code that checks the sanity of the payload size is re-wired to work with xdr_stream infrastructure. Signed-off-by: Chuck Lever --- fs/nfsd/nfs3xdr.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index b4071cda1d65..eb17231ab166 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -616,25 +616,28 @@ nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, __be32 *p) int nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_symlinkargs *args = rqstp->rq_argp; - char *base = (char *)p; - size_t dlen; + struct kvec *head = rqstp->rq_arg.head; + struct kvec *tail = rqstp->rq_arg.tail; + size_t remaining; - if (!(p = decode_fh(p, &args->ffh)) || - !(p = decode_filename(p, &args->fname, &args->flen))) + if (!svcxdr_decode_diropargs3(xdr, &args->ffh, &args->fname, &args->flen)) return 0; - p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp)); - - args->tlen = ntohl(*p++); - - args->first.iov_base = p; - args->first.iov_len = rqstp->rq_arg.head[0].iov_len; - args->first.iov_len -= (char *)p - base; - - dlen = args->first.iov_len + rqstp->rq_arg.page_len + - rqstp->rq_arg.tail[0].iov_len; - if (dlen < XDR_QUADLEN(args->tlen) << 2) + if (!svcxdr_decode_sattr3(rqstp, xdr, &args->attrs)) return 0; + if (xdr_stream_decode_u32(xdr, &args->tlen) < 0) + return 0; + + /* request sanity */ + remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len; + remaining -= xdr_stream_pos(xdr); + if (remaining < xdr_align_size(args->tlen)) + return 0; + + args->first.iov_base = xdr->p; + args->first.iov_len = head->iov_len - xdr_stream_pos(xdr); + return 1; } From f8a38e2d6c885f9d7cd03febc515d36293de4a5b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 20 Oct 2020 17:04:03 -0400 Subject: [PATCH 20/99] NFSD: Update the MKNOD3args decoder to use struct xdr_stream This commit removes the last usage of the original decode_sattr3(), so it is removed as a clean-up. Signed-off-by: Chuck Lever --- fs/nfsd/nfs3xdr.c | 107 +++++++++++++++------------------------------- 1 file changed, 35 insertions(+), 72 deletions(-) diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index eb17231ab166..a30b418a5116 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -103,26 +103,6 @@ encode_fh(__be32 *p, struct svc_fh *fhp) return p + XDR_QUADLEN(size); } -/* - * Decode a file name and make sure that the path contains - * no slashes or null bytes. - */ -static __be32 * -decode_filename(__be32 *p, char **namp, unsigned int *lenp) -{ - char *name; - unsigned int i; - - if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS3_MAXNAMLEN)) != NULL) { - for (i = 0, name = *namp; i < *lenp; i++, name++) { - if (*name == '\0' || *name == '/') - return NULL; - } - } - - return p; -} - static bool svcxdr_decode_filename3(struct xdr_stream *xdr, char **name, unsigned int *len) { @@ -262,49 +242,26 @@ svcxdr_decode_sattrguard3(struct xdr_stream *xdr, struct nfsd3_sattrargs *args) return true; } -static __be32 * -decode_sattr3(__be32 *p, struct iattr *iap, struct user_namespace *userns) +static bool +svcxdr_decode_specdata3(struct xdr_stream *xdr, struct nfsd3_mknodargs *args) { - u32 tmp; + __be32 *p; - iap->ia_valid = 0; + p = xdr_inline_decode(xdr, XDR_UNIT * 2); + if (!p) + return false; + args->major = be32_to_cpup(p++); + args->minor = be32_to_cpup(p); - if (*p++) { - iap->ia_valid |= ATTR_MODE; - iap->ia_mode = ntohl(*p++); - } - if (*p++) { - iap->ia_uid = make_kuid(userns, ntohl(*p++)); - if (uid_valid(iap->ia_uid)) - iap->ia_valid |= ATTR_UID; - } - if (*p++) { - iap->ia_gid = make_kgid(userns, ntohl(*p++)); - if (gid_valid(iap->ia_gid)) - iap->ia_valid |= ATTR_GID; - } - if (*p++) { - u64 newsize; + return true; +} - iap->ia_valid |= ATTR_SIZE; - p = xdr_decode_hyper(p, &newsize); - iap->ia_size = min_t(u64, newsize, NFS_OFFSET_MAX); - } - if ((tmp = ntohl(*p++)) == 1) { /* set to server time */ - iap->ia_valid |= ATTR_ATIME; - } else if (tmp == 2) { /* set to client time */ - iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; - iap->ia_atime.tv_sec = ntohl(*p++); - iap->ia_atime.tv_nsec = ntohl(*p++); - } - if ((tmp = ntohl(*p++)) == 1) { /* set to server time */ - iap->ia_valid |= ATTR_MTIME; - } else if (tmp == 2) { /* set to client time */ - iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET; - iap->ia_mtime.tv_sec = ntohl(*p++); - iap->ia_mtime.tv_nsec = ntohl(*p++); - } - return p; +static bool +svcxdr_decode_devicedata3(struct svc_rqst *rqstp, struct xdr_stream *xdr, + struct nfsd3_mknodargs *args) +{ + return svcxdr_decode_sattr3(rqstp, xdr, &args->attrs) && + svcxdr_decode_specdata3(xdr, args); } static __be32 *encode_fsid(__be32 *p, struct svc_fh *fhp) @@ -644,24 +601,30 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p) int nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_mknodargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->fh)) - || !(p = decode_filename(p, &args->name, &args->len))) + if (!svcxdr_decode_diropargs3(xdr, &args->fh, &args->name, &args->len)) + return 0; + if (xdr_stream_decode_u32(xdr, &args->ftype) < 0) + return 0; + switch (args->ftype) { + case NF3CHR: + case NF3BLK: + return svcxdr_decode_devicedata3(rqstp, xdr, args); + case NF3SOCK: + case NF3FIFO: + return svcxdr_decode_sattr3(rqstp, xdr, &args->attrs); + case NF3REG: + case NF3DIR: + case NF3LNK: + /* Valid XDR but illegal file types */ + break; + default: return 0; - - args->ftype = ntohl(*p++); - - if (args->ftype == NF3BLK || args->ftype == NF3CHR - || args->ftype == NF3SOCK || args->ftype == NF3FIFO) - p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp)); - - if (args->ftype == NF3BLK || args->ftype == NF3CHR) { - args->major = ntohl(*p++); - args->minor = ntohl(*p++); } - return xdr_argsize_check(rqstp, p); + return 1; } int From ebcd8e8b28535b643a4c06685bd363b3b73a96af Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Oct 2020 12:14:23 -0400 Subject: [PATCH 21/99] NFSD: Update the NFSv2 GETATTR argument decoder to use struct xdr_stream Signed-off-by: Chuck Lever --- fs/nfsd/nfsproc.c | 4 ++-- fs/nfsd/nfsxdr.c | 26 ++++++++++++++++++++------ fs/nfsd/xdr.h | 2 +- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 1f85a4dc9d1b..b9bc162a5c77 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -627,7 +627,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { }, [NFSPROC_GETATTR] = { .pc_func = nfsd_proc_getattr, - .pc_decode = nfssvc_decode_fhandle, + .pc_decode = nfssvc_decode_fhandleargs, .pc_encode = nfssvc_encode_attrstat, .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_fhandle), @@ -793,7 +793,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { }, [NFSPROC_STATFS] = { .pc_func = nfsd_proc_statfs, - .pc_decode = nfssvc_decode_fhandle, + .pc_decode = nfssvc_decode_fhandleargs, .pc_encode = nfssvc_encode_statfsres, .pc_argsize = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_statfsres), diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 7aa6e8aca2c1..f3189e1be20f 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -23,8 +23,9 @@ static u32 nfs_ftypes[] = { /* - * XDR functions for basic NFS types + * Basic NFSv2 data types (RFC 1094 Section 2.3) */ + static __be32 * decode_fh(__be32 *p, struct svc_fh *fhp) { @@ -37,6 +38,21 @@ decode_fh(__be32 *p, struct svc_fh *fhp) return p + (NFS_FHSIZE >> 2); } +static bool +svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS_FHSIZE); + if (!p) + return false; + fh_init(fhp, NFS_FHSIZE); + memcpy(&fhp->fh_handle.fh_base, p, NFS_FHSIZE); + fhp->fh_handle.fh_size = NFS_FHSIZE; + + return true; +} + /* Helper function for NFSv2 ACL code */ __be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp) { @@ -194,14 +210,12 @@ __be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *f */ int -nfssvc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p) +nfssvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_fhandle *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) - return 0; - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_fhandle(xdr, &args->fh); } int diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index ad77387734cc..84256a6a1ba1 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -144,7 +144,7 @@ union nfsd_xdrstore { #define NFS2_SVC_XDRSIZE sizeof(union nfsd_xdrstore) -int nfssvc_decode_fhandle(struct svc_rqst *, __be32 *); +int nfssvc_decode_fhandleargs(struct svc_rqst *, __be32 *); int nfssvc_decode_sattrargs(struct svc_rqst *, __be32 *); int nfssvc_decode_diropargs(struct svc_rqst *, __be32 *); int nfssvc_decode_readargs(struct svc_rqst *, __be32 *); From 8c293ef993c8df0b1bea9ecb0de6eb96dec3ac9d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Oct 2020 12:15:51 -0400 Subject: [PATCH 22/99] NFSD: Update the NFSv2 READ argument decoder to use struct xdr_stream The code that sets up rq_vec is refactored so that it is now adjacent to the nfsd_read() call site where it is used. Signed-off-by: Chuck Lever --- fs/nfsd/nfsproc.c | 32 ++++++++++++++++++-------------- fs/nfsd/nfsxdr.c | 36 ++++++++++++------------------------ fs/nfsd/xdr.h | 1 - 3 files changed, 30 insertions(+), 39 deletions(-) diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index b9bc162a5c77..814762793f9c 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -171,32 +171,36 @@ nfsd_proc_read(struct svc_rqst *rqstp) { struct nfsd_readargs *argp = rqstp->rq_argp; struct nfsd_readres *resp = rqstp->rq_resp; + unsigned int len; u32 eof; + int v; dprintk("nfsd: READ %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), argp->count, argp->offset); + argp->count = min_t(u32, argp->count, NFSSVC_MAXBLKSIZE_V2); + + v = 0; + len = argp->count; + while (len > 0) { + struct page *page = *(rqstp->rq_next_page++); + + rqstp->rq_vec[v].iov_base = page_address(page); + rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); + len -= rqstp->rq_vec[v].iov_len; + v++; + } + /* Obtain buffer pointer for payload. 19 is 1 word for * status, 17 words for fattr, and 1 word for the byte count. */ - - if (NFSSVC_MAXBLKSIZE_V2 < argp->count) { - char buf[RPC_MAX_ADDRBUFLEN]; - printk(KERN_NOTICE - "oversized read request from %s (%d bytes)\n", - svc_print_addr(rqstp, buf, sizeof(buf)), - argp->count); - argp->count = NFSSVC_MAXBLKSIZE_V2; - } svc_reserve_auth(rqstp, (19<<2) + argp->count + 4); resp->count = argp->count; - resp->status = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), - argp->offset, - rqstp->rq_vec, argp->vlen, - &resp->count, - &eof); + fh_copy(&resp->fh, &argp->fh); + resp->status = nfsd_read(rqstp, &resp->fh, argp->offset, + rqstp->rq_vec, v, &resp->count, &eof); if (resp->status == nfs_ok) resp->status = fh_getattr(&resp->fh, &resp->stat); else if (resp->status == nfserr_jukebox) diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index f3189e1be20f..1eacaa2c13a9 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -246,33 +246,21 @@ nfssvc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p) int nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_readargs *args = rqstp->rq_argp; - unsigned int len; - int v; - p = decode_fh(p, &args->fh); - if (!p) + u32 totalcount; + + if (!svcxdr_decode_fhandle(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u32(xdr, &args->offset) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) + return 0; + /* totalcount is ignored */ + if (xdr_stream_decode_u32(xdr, &totalcount) < 0) return 0; - args->offset = ntohl(*p++); - len = args->count = ntohl(*p++); - p++; /* totalcount - unused */ - - len = min_t(unsigned int, len, NFSSVC_MAXBLKSIZE_V2); - - /* set up somewhere to store response. - * We take pages, put them on reslist and include in iovec - */ - v=0; - while (len > 0) { - struct page *p = *(rqstp->rq_next_page++); - - rqstp->rq_vec[v].iov_base = page_address(p); - rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); - len -= rqstp->rq_vec[v].iov_len; - v++; - } - args->vlen = v; - return xdr_argsize_check(rqstp, p); + return 1; } int diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index 84256a6a1ba1..d2ffda96975d 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -27,7 +27,6 @@ struct nfsd_readargs { struct svc_fh fh; __u32 offset; __u32 count; - int vlen; }; struct nfsd_writeargs { From a51b5b737a0be93fae6ea2a18df03ab2359a3f4b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Oct 2020 12:18:36 -0400 Subject: [PATCH 23/99] NFSD: Update the NFSv2 WRITE argument decoder to use struct xdr_stream Signed-off-by: Chuck Lever --- fs/nfsd/nfsxdr.c | 52 +++++++++++++++++++----------------------------- 1 file changed, 21 insertions(+), 31 deletions(-) diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 1eacaa2c13a9..11d27b219cff 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -266,46 +266,36 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p) int nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_writeargs *args = rqstp->rq_argp; - unsigned int len, hdr, dlen; struct kvec *head = rqstp->rq_arg.head; + struct kvec *tail = rqstp->rq_arg.tail; + u32 beginoffset, totalcount; + size_t remaining; - p = decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_fhandle(xdr, &args->fh)) + return 0; + /* beginoffset is ignored */ + if (xdr_stream_decode_u32(xdr, &beginoffset) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &args->offset) < 0) + return 0; + /* totalcount is ignored */ + if (xdr_stream_decode_u32(xdr, &totalcount) < 0) return 0; - p++; /* beginoffset */ - args->offset = ntohl(*p++); /* offset */ - p++; /* totalcount */ - len = args->len = ntohl(*p++); - /* - * The protocol specifies a maximum of 8192 bytes. - */ - if (len > NFSSVC_MAXBLKSIZE_V2) + /* opaque data */ + if (xdr_stream_decode_u32(xdr, &args->len) < 0) return 0; - - /* - * Check to make sure that we got the right number of - * bytes. - */ - hdr = (void*)p - head->iov_base; - if (hdr > head->iov_len) + if (args->len > NFSSVC_MAXBLKSIZE_V2) return 0; - dlen = head->iov_len + rqstp->rq_arg.page_len - hdr; - - /* - * Round the length of the data which was specified up to - * the next multiple of XDR units and then compare that - * against the length which was actually received. - * Note that when RPCSEC/GSS (for example) is used, the - * data buffer can be padded so dlen might be larger - * than required. It must never be smaller. - */ - if (dlen < XDR_QUADLEN(len)*4) + remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len; + remaining -= xdr_stream_pos(xdr); + if (remaining < xdr_align_size(args->len)) return 0; + args->first.iov_base = xdr->p; + args->first.iov_len = head->iov_len - xdr_stream_pos(xdr); - args->first.iov_base = (void *)p; - args->first.iov_len = head->iov_len - hdr; return 1; } From 1fcbd1c9456ba129d38420e345e91c4b6363db47 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Oct 2020 12:21:25 -0400 Subject: [PATCH 24/99] NFSD: Update the NFSv2 READLINK argument decoder to use struct xdr_stream If the code that sets up the sink buffer for nfsd_readlink() is moved adjacent to the nfsd_readlink() call site that uses it, then the only argument is a file handle, and the fhandle decoder can be used instead. Signed-off-by: Chuck Lever --- fs/nfsd/nfsproc.c | 9 +++++---- fs/nfsd/nfsxdr.c | 13 ------------- fs/nfsd/xdr.h | 6 ------ 3 files changed, 5 insertions(+), 23 deletions(-) diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 814762793f9c..bdb47848f7fd 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -149,14 +149,15 @@ nfsd_proc_lookup(struct svc_rqst *rqstp) static __be32 nfsd_proc_readlink(struct svc_rqst *rqstp) { - struct nfsd_readlinkargs *argp = rqstp->rq_argp; + struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd_readlinkres *resp = rqstp->rq_resp; + char *buffer = page_address(*(rqstp->rq_next_page++)); dprintk("nfsd: READLINK %s\n", SVCFH_fmt(&argp->fh)); /* Read the symlink. */ resp->len = NFS_MAXPATHLEN; - resp->status = nfsd_readlink(rqstp, &argp->fh, argp->buffer, &resp->len); + resp->status = nfsd_readlink(rqstp, &argp->fh, buffer, &resp->len); fh_put(&argp->fh); return rpc_success; @@ -674,9 +675,9 @@ static const struct svc_procedure nfsd_procedures2[18] = { }, [NFSPROC_READLINK] = { .pc_func = nfsd_proc_readlink, - .pc_decode = nfssvc_decode_readlinkargs, + .pc_decode = nfssvc_decode_fhandleargs, .pc_encode = nfssvc_encode_readlinkres, - .pc_argsize = sizeof(struct nfsd_readlinkargs), + .pc_argsize = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_readlinkres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+1+NFS_MAXPATHLEN/4, diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 11d27b219cff..02dd9888d93b 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -326,19 +326,6 @@ nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p) return xdr_argsize_check(rqstp, p); } -int -nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p) -{ - struct nfsd_readlinkargs *args = rqstp->rq_argp; - - p = decode_fh(p, &args->fh); - if (!p) - return 0; - args->buffer = page_address(*(rqstp->rq_next_page++)); - - return xdr_argsize_check(rqstp, p); -} - int nfssvc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p) { diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index d2ffda96975d..288c29a999db 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -52,11 +52,6 @@ struct nfsd_renameargs { unsigned int tlen; }; -struct nfsd_readlinkargs { - struct svc_fh fh; - char * buffer; -}; - struct nfsd_linkargs { struct svc_fh ffh; struct svc_fh tfh; @@ -150,7 +145,6 @@ int nfssvc_decode_readargs(struct svc_rqst *, __be32 *); int nfssvc_decode_writeargs(struct svc_rqst *, __be32 *); int nfssvc_decode_createargs(struct svc_rqst *, __be32 *); int nfssvc_decode_renameargs(struct svc_rqst *, __be32 *); -int nfssvc_decode_readlinkargs(struct svc_rqst *, __be32 *); int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *); int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *); int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *); From 788cd46ecf83ee2d561cb4e754e276dc8089b787 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 13 Nov 2020 17:03:49 -0500 Subject: [PATCH 25/99] NFSD: Add helper to set up the pages where the dirlist is encoded Add a helper similar to nfsd3_init_dirlist_pages(). Signed-off-by: Chuck Lever --- fs/nfsd/nfsproc.c | 29 ++++++++++++++++++----------- fs/nfsd/nfsxdr.c | 2 -- fs/nfsd/xdr.h | 1 - 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index bdb47848f7fd..b2f8035f166b 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -553,6 +553,20 @@ nfsd_proc_rmdir(struct svc_rqst *rqstp) return rpc_success; } +static void nfsd_init_dirlist_pages(struct svc_rqst *rqstp, + struct nfsd_readdirres *resp, + int count) +{ + count = min_t(u32, count, PAGE_SIZE); + + /* Convert byte count to number of words (i.e. >> 2), + * and reserve room for the NULL ptr & eof flag (-2 words) */ + resp->buflen = (count >> 2) - 2; + + resp->buffer = page_address(*rqstp->rq_next_page); + rqstp->rq_next_page++; +} + /* * Read a portion of a directory. */ @@ -561,31 +575,24 @@ nfsd_proc_readdir(struct svc_rqst *rqstp) { struct nfsd_readdirargs *argp = rqstp->rq_argp; struct nfsd_readdirres *resp = rqstp->rq_resp; - int count; loff_t offset; + __be32 *buffer; dprintk("nfsd: READDIR %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), argp->count, argp->cookie); - /* Shrink to the client read size */ - count = (argp->count >> 2) - 2; + nfsd_init_dirlist_pages(rqstp, resp, argp->count); + buffer = resp->buffer; - /* Make sure we've room for the NULL ptr & eof flag */ - count -= 2; - if (count < 0) - count = 0; - - resp->buffer = argp->buffer; resp->offset = NULL; - resp->buflen = count; resp->common.err = nfs_ok; /* Read directory and encode entries on the fly */ offset = argp->cookie; resp->status = nfsd_readdir(rqstp, &argp->fh, &offset, &resp->common, nfssvc_encode_entry); - resp->count = resp->buffer - argp->buffer; + resp->count = resp->buffer - buffer; if (resp->offset) *resp->offset = htonl(offset); diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 02dd9888d93b..3d72334e1673 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -388,8 +388,6 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) return 0; args->cookie = ntohl(*p++); args->count = ntohl(*p++); - args->count = min_t(u32, args->count, PAGE_SIZE); - args->buffer = page_address(*(rqstp->rq_next_page++)); return xdr_argsize_check(rqstp, p); } diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index 288c29a999db..d700838f6512 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -73,7 +73,6 @@ struct nfsd_readdirargs { struct svc_fh fh; __u32 cookie; __u32 count; - __be32 * buffer; }; struct nfsd_stat { From 8688361ae2edb8f7e61d926dc5000c9a44f29370 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Oct 2020 14:15:51 -0400 Subject: [PATCH 26/99] NFSD: Update the NFSv2 READDIR argument decoder to use struct xdr_stream As an additional clean up, move code not related to XDR decoding into readdir's .pc_func call out. Signed-off-by: Chuck Lever --- fs/nfsd/nfsxdr.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 3d72334e1673..7b33093f8d8b 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -381,15 +381,17 @@ nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p) int nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_readdirargs *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_fhandle(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u32(xdr, &args->cookie) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) return 0; - args->cookie = ntohl(*p++); - args->count = ntohl(*p++); - return xdr_argsize_check(rqstp, p); + return 1; } /* From 6d742c1864c18f143ea2031f1ed66bcd8f4812de Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Oct 2020 14:33:24 -0400 Subject: [PATCH 27/99] NFSD: Update NFSv2 diropargs decoding to use struct xdr_stream Signed-off-by: Chuck Lever --- fs/nfsd/nfsxdr.c | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 7b33093f8d8b..00a7db8548eb 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -86,6 +86,38 @@ decode_filename(__be32 *p, char **namp, unsigned int *lenp) return p; } +static bool +svcxdr_decode_filename(struct xdr_stream *xdr, char **name, unsigned int *len) +{ + u32 size, i; + __be32 *p; + char *c; + + if (xdr_stream_decode_u32(xdr, &size) < 0) + return false; + if (size == 0 || size > NFS_MAXNAMLEN) + return false; + p = xdr_inline_decode(xdr, size); + if (!p) + return false; + + *len = size; + *name = (char *)p; + for (i = 0, c = *name; i < size; i++, c++) + if (*c == '\0' || *c == '/') + return false; + + return true; +} + +static bool +svcxdr_decode_diropargs(struct xdr_stream *xdr, struct svc_fh *fhp, + char **name, unsigned int *len) +{ + return svcxdr_decode_fhandle(xdr, fhp) && + svcxdr_decode_filename(xdr, name, len); +} + static __be32 * decode_sattr(__be32 *p, struct iattr *iap, struct user_namespace *userns) { @@ -234,13 +266,10 @@ nfssvc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p) int nfssvc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_diropargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->fh)) - || !(p = decode_filename(p, &args->name, &args->len))) - return 0; - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_diropargs(xdr, &args->fh, &args->name, &args->len); } int From 62aa557efb81ea3339fabe7f5b1a343e742bbbdf Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Oct 2020 12:35:41 -0400 Subject: [PATCH 28/99] NFSD: Update the NFSv2 RENAME argument decoder to use struct xdr_stream Signed-off-by: Chuck Lever --- fs/nfsd/nfsxdr.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 00a7db8548eb..d4f4729c7b1c 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -344,15 +344,13 @@ nfssvc_decode_createargs(struct svc_rqst *rqstp, __be32 *p) int nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_renameargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->ffh)) - || !(p = decode_filename(p, &args->fname, &args->flen)) - || !(p = decode_fh(p, &args->tfh)) - || !(p = decode_filename(p, &args->tname, &args->tlen))) - return 0; - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_diropargs(xdr, &args->ffh, + &args->fname, &args->flen) && + svcxdr_decode_diropargs(xdr, &args->tfh, + &args->tname, &args->tlen); } int From 77edcdf91f6245a9881b84e4e101738148bd039a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Oct 2020 12:34:24 -0400 Subject: [PATCH 29/99] NFSD: Update the NFSv2 LINK argument decoder to use struct xdr_stream Signed-off-by: Chuck Lever --- fs/nfsd/nfsxdr.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index d4f4729c7b1c..3d0fe03a3fb9 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -356,14 +356,12 @@ nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p) int nfssvc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_linkargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->ffh)) - || !(p = decode_fh(p, &args->tfh)) - || !(p = decode_filename(p, &args->tname, &args->tlen))) - return 0; - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_fhandle(xdr, &args->ffh) && + svcxdr_decode_diropargs(xdr, &args->tfh, + &args->tname, &args->tlen); } int From 2fdd6bd293b9e7dda61220538b2759fbf06f5af0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Oct 2020 12:39:06 -0400 Subject: [PATCH 30/99] NFSD: Update the NFSv2 SETATTR argument decoder to use struct xdr_stream Signed-off-by: Chuck Lever --- fs/nfsd/nfsxdr.c | 82 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 76 insertions(+), 6 deletions(-) diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 3d0fe03a3fb9..6c87ea8f3876 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -173,6 +173,79 @@ decode_sattr(__be32 *p, struct iattr *iap, struct user_namespace *userns) return p; } +static bool +svcxdr_decode_sattr(struct svc_rqst *rqstp, struct xdr_stream *xdr, + struct iattr *iap) +{ + u32 tmp1, tmp2; + __be32 *p; + + p = xdr_inline_decode(xdr, XDR_UNIT * 8); + if (!p) + return false; + + iap->ia_valid = 0; + + /* + * Some Sun clients put 0xffff in the mode field when they + * mean 0xffffffff. + */ + tmp1 = be32_to_cpup(p++); + if (tmp1 != (u32)-1 && tmp1 != 0xffff) { + iap->ia_valid |= ATTR_MODE; + iap->ia_mode = tmp1; + } + + tmp1 = be32_to_cpup(p++); + if (tmp1 != (u32)-1) { + iap->ia_uid = make_kuid(nfsd_user_namespace(rqstp), tmp1); + if (uid_valid(iap->ia_uid)) + iap->ia_valid |= ATTR_UID; + } + + tmp1 = be32_to_cpup(p++); + if (tmp1 != (u32)-1) { + iap->ia_gid = make_kgid(nfsd_user_namespace(rqstp), tmp1); + if (gid_valid(iap->ia_gid)) + iap->ia_valid |= ATTR_GID; + } + + tmp1 = be32_to_cpup(p++); + if (tmp1 != (u32)-1) { + iap->ia_valid |= ATTR_SIZE; + iap->ia_size = tmp1; + } + + tmp1 = be32_to_cpup(p++); + tmp2 = be32_to_cpup(p++); + if (tmp1 != (u32)-1 && tmp2 != (u32)-1) { + iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; + iap->ia_atime.tv_sec = tmp1; + iap->ia_atime.tv_nsec = tmp2 * NSEC_PER_USEC; + } + + tmp1 = be32_to_cpup(p++); + tmp2 = be32_to_cpup(p++); + if (tmp1 != (u32)-1 && tmp2 != (u32)-1) { + iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET; + iap->ia_mtime.tv_sec = tmp1; + iap->ia_mtime.tv_nsec = tmp2 * NSEC_PER_USEC; + /* + * Passing the invalid value useconds=1000000 for mtime + * is a Sun convention for "set both mtime and atime to + * current server time". It's needed to make permissions + * checks for the "touch" program across v2 mounts to + * Solaris and Irix boxes work correctly. See description of + * sattr in section 6.1 of "NFS Illustrated" by + * Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5 + */ + if (tmp2 == 1000000) + iap->ia_valid &= ~(ATTR_ATIME_SET|ATTR_MTIME_SET); + } + + return true; +} + static __be32 * encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat) @@ -253,14 +326,11 @@ nfssvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p) int nfssvc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_sattrargs *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) - return 0; - p = decode_sattr(p, &args->attrs, nfsd_user_namespace(rqstp)); - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_fhandle(xdr, &args->fh) && + svcxdr_decode_sattr(rqstp, xdr, &args->attrs); } int From 7dcf65b91ecaf60ce593e7859ae2b29b7c46ccbd Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Oct 2020 12:43:58 -0400 Subject: [PATCH 31/99] NFSD: Update the NFSv2 CREATE argument decoder to use struct xdr_stream Signed-off-by: Chuck Lever --- fs/nfsd/nfsxdr.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 6c87ea8f3876..2e2806cbe7b8 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -401,14 +401,12 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) int nfssvc_decode_createargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_createargs *args = rqstp->rq_argp; - if ( !(p = decode_fh(p, &args->fh)) - || !(p = decode_filename(p, &args->name, &args->len))) - return 0; - p = decode_sattr(p, &args->attrs, nfsd_user_namespace(rqstp)); - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_diropargs(xdr, &args->fh, + &args->name, &args->len) && + svcxdr_decode_sattr(rqstp, xdr, &args->attrs); } int From 09f75a5375ac61f4adb94da0accc1cfc60eb4f2b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Oct 2020 12:46:03 -0400 Subject: [PATCH 32/99] NFSD: Update the NFSv2 SYMLINK argument decoder to use struct xdr_stream Signed-off-by: Chuck Lever --- fs/nfsd/nfsxdr.c | 113 +++++------------------------------------------ 1 file changed, 10 insertions(+), 103 deletions(-) diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 2e2806cbe7b8..f2cb4794aeaf 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -66,26 +66,6 @@ encode_fh(__be32 *p, struct svc_fh *fhp) return p + (NFS_FHSIZE>> 2); } -/* - * Decode a file name and make sure that the path contains - * no slashes or null bytes. - */ -static __be32 * -decode_filename(__be32 *p, char **namp, unsigned int *lenp) -{ - char *name; - unsigned int i; - - if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXNAMLEN)) != NULL) { - for (i = 0, name = *namp; i < *lenp; i++, name++) { - if (*name == '\0' || *name == '/') - return NULL; - } - } - - return p; -} - static bool svcxdr_decode_filename(struct xdr_stream *xdr, char **name, unsigned int *len) { @@ -118,61 +98,6 @@ svcxdr_decode_diropargs(struct xdr_stream *xdr, struct svc_fh *fhp, svcxdr_decode_filename(xdr, name, len); } -static __be32 * -decode_sattr(__be32 *p, struct iattr *iap, struct user_namespace *userns) -{ - u32 tmp, tmp1; - - iap->ia_valid = 0; - - /* Sun client bug compatibility check: some sun clients seem to - * put 0xffff in the mode field when they mean 0xffffffff. - * Quoting the 4.4BSD nfs server code: Nah nah nah nah na nah. - */ - if ((tmp = ntohl(*p++)) != (u32)-1 && tmp != 0xffff) { - iap->ia_valid |= ATTR_MODE; - iap->ia_mode = tmp; - } - if ((tmp = ntohl(*p++)) != (u32)-1) { - iap->ia_uid = make_kuid(userns, tmp); - if (uid_valid(iap->ia_uid)) - iap->ia_valid |= ATTR_UID; - } - if ((tmp = ntohl(*p++)) != (u32)-1) { - iap->ia_gid = make_kgid(userns, tmp); - if (gid_valid(iap->ia_gid)) - iap->ia_valid |= ATTR_GID; - } - if ((tmp = ntohl(*p++)) != (u32)-1) { - iap->ia_valid |= ATTR_SIZE; - iap->ia_size = tmp; - } - tmp = ntohl(*p++); tmp1 = ntohl(*p++); - if (tmp != (u32)-1 && tmp1 != (u32)-1) { - iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; - iap->ia_atime.tv_sec = tmp; - iap->ia_atime.tv_nsec = tmp1 * 1000; - } - tmp = ntohl(*p++); tmp1 = ntohl(*p++); - if (tmp != (u32)-1 && tmp1 != (u32)-1) { - iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET; - iap->ia_mtime.tv_sec = tmp; - iap->ia_mtime.tv_nsec = tmp1 * 1000; - /* - * Passing the invalid value useconds=1000000 for mtime - * is a Sun convention for "set both mtime and atime to - * current server time". It's needed to make permissions - * checks for the "touch" program across v2 mounts to - * Solaris and Irix boxes work correctly. See description of - * sattr in section 6.1 of "NFS Illustrated" by - * Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5 - */ - if (tmp1 == 1000000) - iap->ia_valid &= ~(ATTR_ATIME_SET|ATTR_MTIME_SET); - } - return p; -} - static bool svcxdr_decode_sattr(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct iattr *iap) @@ -435,40 +360,22 @@ nfssvc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p) int nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_symlinkargs *args = rqstp->rq_argp; - char *base = (char *)p; - size_t xdrlen; + struct kvec *head = rqstp->rq_arg.head; - if ( !(p = decode_fh(p, &args->ffh)) - || !(p = decode_filename(p, &args->fname, &args->flen))) + if (!svcxdr_decode_diropargs(xdr, &args->ffh, &args->fname, &args->flen)) + return 0; + if (xdr_stream_decode_u32(xdr, &args->tlen) < 0) return 0; - - args->tlen = ntohl(*p++); if (args->tlen == 0) return 0; - args->first.iov_base = p; - args->first.iov_len = rqstp->rq_arg.head[0].iov_len; - args->first.iov_len -= (char *)p - base; - - /* This request is never larger than a page. Therefore, - * transport will deliver either: - * 1. pathname in the pagelist -> sattr is in the tail. - * 2. everything in the head buffer -> sattr is in the head. - */ - if (rqstp->rq_arg.page_len) { - if (args->tlen != rqstp->rq_arg.page_len) - return 0; - p = rqstp->rq_arg.tail[0].iov_base; - } else { - xdrlen = XDR_QUADLEN(args->tlen); - if (xdrlen > args->first.iov_len - (8 * sizeof(__be32))) - return 0; - p += xdrlen; - } - decode_sattr(p, &args->attrs, nfsd_user_namespace(rqstp)); - - return 1; + args->first.iov_len = head->iov_len - xdr_stream_pos(xdr); + args->first.iov_base = xdr_inline_decode(xdr, args->tlen); + if (!args->first.iov_base) + return 0; + return svcxdr_decode_sattr(rqstp, xdr, &args->attrs); } int From 5650682e16f41722f735b7beeb2dbc3411dfbeb6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 20 Oct 2020 10:08:19 -0400 Subject: [PATCH 33/99] NFSD: Remove argument length checking in nfsd_dispatch() Now that the argument decoders for NFSv2 and NFSv3 use the xdr_stream mechanism, the version-specific length checking logic in nfsd_dispatch() is no longer necessary. Signed-off-by: Chuck Lever --- fs/nfsd/nfssvc.c | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index f9c9f4c63cc7..6de406322106 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -955,37 +955,6 @@ nfsd(void *vrqstp) return 0; } -/* - * A write procedure can have a large argument, and a read procedure can - * have a large reply, but no NFSv2 or NFSv3 procedure has argument and - * reply that can both be larger than a page. The xdr code has taken - * advantage of this assumption to be a sloppy about bounds checking in - * some cases. Pending a rewrite of the NFSv2/v3 xdr code to fix that - * problem, we enforce these assumptions here: - */ -static bool nfs_request_too_big(struct svc_rqst *rqstp, - const struct svc_procedure *proc) -{ - /* - * The ACL code has more careful bounds-checking and is not - * susceptible to this problem: - */ - if (rqstp->rq_prog != NFS_PROGRAM) - return false; - /* - * Ditto NFSv4 (which can in theory have argument and reply both - * more than a page): - */ - if (rqstp->rq_vers >= 4) - return false; - /* The reply will be small, we're OK: */ - if (proc->pc_xdrressize > 0 && - proc->pc_xdrressize < XDR_QUADLEN(PAGE_SIZE)) - return false; - - return rqstp->rq_arg.len > PAGE_SIZE; -} - /** * nfsd_dispatch - Process an NFS or NFSACL Request * @rqstp: incoming request @@ -1004,9 +973,6 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) struct kvec *resv = &rqstp->rq_res.head[0]; __be32 *p; - if (nfs_request_too_big(rqstp, proc)) - goto out_decode_err; - /* * Give the xdr decoder a chance to change this if it wants * (necessary in the NFSv4.0 compound case) From 635a45d34706400c59c3b18ca9fccba195147bda Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Nov 2020 11:32:04 -0500 Subject: [PATCH 34/99] NFSD: Update the NFSv2 GETACL argument decoder to use struct xdr_stream Signed-off-by: Chuck Lever --- fs/nfsd/nfs2acl.c | 10 +++++----- fs/nfsd/nfsxdr.c | 11 ++++++++++- fs/nfsd/xdr.h | 1 + fs/nfsd/xdr3.h | 2 +- 4 files changed, 17 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 899762da23c9..df2e145cfab0 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -188,17 +188,17 @@ static __be32 nfsacld_proc_access(struct svc_rqst *rqstp) static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_getaclargs *argp = rqstp->rq_argp; - p = nfs2svc_decode_fh(p, &argp->fh); - if (!p) + if (!svcxdr_decode_fhandle(xdr, &argp->fh)) + return 0; + if (xdr_stream_decode_u32(xdr, &argp->mask) < 0) return 0; - argp->mask = ntohl(*p); p++; - return xdr_argsize_check(rqstp, p); + return 1; } - static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_setaclargs *argp = rqstp->rq_argp; diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index f2cb4794aeaf..5ab9fc14816c 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -38,7 +38,16 @@ decode_fh(__be32 *p, struct svc_fh *fhp) return p + (NFS_FHSIZE >> 2); } -static bool +/** + * svcxdr_decode_fhandle - Decode an NFSv2 file handle + * @xdr: XDR stream positioned at an encoded NFSv2 FH + * @fhp: OUT: filled-in server file handle + * + * Return values: + * %false: The encoded file handle was not valid + * %true: @fhp has been initialized + */ +bool svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp) { __be32 *p; diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index d700838f6512..035c99c7b384 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -165,5 +165,6 @@ void nfssvc_release_readres(struct svc_rqst *rqstp); /* Helper functions for NFSv2 ACL code */ __be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat); __be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp); +bool svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp); #endif /* LINUX_NFSD_H */ diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 64af5b01c5d7..43db4206cd25 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -102,7 +102,7 @@ struct nfsd3_commitargs { struct nfsd3_getaclargs { struct svc_fh fh; - int mask; + __u32 mask; }; struct posix_acl; From 6bb844b4eb6e3b109a2fdaffb60e6da722dc4356 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Nov 2020 10:38:46 -0500 Subject: [PATCH 35/99] NFSD: Add an xdr_stream-based decoder for NFSv2/3 ACLs Signed-off-by: Chuck Lever --- fs/nfs_common/nfsacl.c | 52 ++++++++++++++++++++++++++++++++++++++++++ include/linux/nfsacl.h | 3 +++ 2 files changed, 55 insertions(+) diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c index d056ad2fdefd..79c563c1a5e8 100644 --- a/fs/nfs_common/nfsacl.c +++ b/fs/nfs_common/nfsacl.c @@ -295,3 +295,55 @@ int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt, nfsacl_desc.desc.array_len; } EXPORT_SYMBOL_GPL(nfsacl_decode); + +/** + * nfs_stream_decode_acl - Decode an NFSv3 ACL + * + * @xdr: an xdr_stream positioned at an encoded ACL + * @aclcnt: OUT: count of ACEs in decoded posix_acl + * @pacl: OUT: a dynamically-allocated buffer containing the decoded posix_acl + * + * Return values: + * %false: The encoded ACL is not valid + * %true: @pacl contains a decoded ACL, and @xdr is advanced + * + * On a successful return, caller must release *pacl using posix_acl_release(). + */ +bool nfs_stream_decode_acl(struct xdr_stream *xdr, unsigned int *aclcnt, + struct posix_acl **pacl) +{ + const size_t elem_size = XDR_UNIT * 3; + struct nfsacl_decode_desc nfsacl_desc = { + .desc = { + .elem_size = elem_size, + .xcode = pacl ? xdr_nfsace_decode : NULL, + }, + }; + unsigned int base; + u32 entries; + + if (xdr_stream_decode_u32(xdr, &entries) < 0) + return false; + if (entries > NFS_ACL_MAX_ENTRIES) + return false; + + base = xdr_stream_pos(xdr); + if (!xdr_inline_decode(xdr, XDR_UNIT + elem_size * entries)) + return false; + nfsacl_desc.desc.array_maxlen = entries; + if (xdr_decode_array2(xdr->buf, base, &nfsacl_desc.desc)) + return false; + + if (pacl) { + if (entries != nfsacl_desc.desc.array_len || + posix_acl_from_nfsacl(nfsacl_desc.acl) != 0) { + posix_acl_release(nfsacl_desc.acl); + return false; + } + *pacl = nfsacl_desc.acl; + } + if (aclcnt) + *aclcnt = entries; + return true; +} +EXPORT_SYMBOL_GPL(nfs_stream_decode_acl); diff --git a/include/linux/nfsacl.h b/include/linux/nfsacl.h index 103d44695323..0ba99c513649 100644 --- a/include/linux/nfsacl.h +++ b/include/linux/nfsacl.h @@ -38,5 +38,8 @@ nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode, extern int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt, struct posix_acl **pacl); +extern bool +nfs_stream_decode_acl(struct xdr_stream *xdr, unsigned int *aclcnt, + struct posix_acl **pacl); #endif /* __LINUX_NFSACL_H */ From 427eab3ba22891845265f9a3846de6ac152ec836 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Nov 2020 11:37:35 -0500 Subject: [PATCH 36/99] NFSD: Update the NFSv2 SETACL argument decoder to use struct xdr_stream Signed-off-by: Chuck Lever --- fs/nfsd/nfs2acl.c | 29 ++++++++++++----------------- fs/nfsd/xdr3.h | 2 +- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index df2e145cfab0..123820ec79d3 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -201,28 +201,23 @@ static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p) static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_setaclargs *argp = rqstp->rq_argp; - struct kvec *head = rqstp->rq_arg.head; - unsigned int base; - int n; - p = nfs2svc_decode_fh(p, &argp->fh); - if (!p) + if (!svcxdr_decode_fhandle(xdr, &argp->fh)) return 0; - argp->mask = ntohl(*p++); - if (argp->mask & ~NFS_ACL_MASK || - !xdr_argsize_check(rqstp, p)) + if (xdr_stream_decode_u32(xdr, &argp->mask) < 0) + return 0; + if (argp->mask & ~NFS_ACL_MASK) + return 0; + if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_ACL) ? + &argp->acl_access : NULL)) + return 0; + if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_DFACL) ? + &argp->acl_default : NULL)) return 0; - base = (char *)p - (char *)head->iov_base; - n = nfsacl_decode(&rqstp->rq_arg, base, NULL, - (argp->mask & NFS_ACL) ? - &argp->acl_access : NULL); - if (n > 0) - n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL, - (argp->mask & NFS_DFACL) ? - &argp->acl_default : NULL); - return (n > 0); + return 1; } static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p) diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 43db4206cd25..5afb3ce4f062 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -108,7 +108,7 @@ struct nfsd3_getaclargs { struct posix_acl; struct nfsd3_setaclargs { struct svc_fh fh; - int mask; + __u32 mask; struct posix_acl *acl_access; struct posix_acl *acl_default; }; From 571d31f37a57729c9d3463b5a692a84e619b408a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Nov 2020 11:46:50 -0500 Subject: [PATCH 37/99] NFSD: Update the NFSv2 ACL GETATTR argument decoder to use struct xdr_stream Since the ACL GETATTR procedure is the same as the normal GETATTR procedure, simply re-use nfssvc_decode_fhandleargs. Signed-off-by: Chuck Lever --- fs/nfsd/nfs2acl.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 123820ec79d3..0274348f6679 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -220,16 +220,6 @@ static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p) return 1; } -static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p) -{ - struct nfsd_fhandle *argp = rqstp->rq_argp; - - p = nfs2svc_decode_fh(p, &argp->fh); - if (!p) - return 0; - return xdr_argsize_check(rqstp, p); -} - static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_accessargs *argp = rqstp->rq_argp; @@ -392,7 +382,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { }, [ACLPROC2_GETATTR] = { .pc_func = nfsacld_proc_getattr, - .pc_decode = nfsaclsvc_decode_fhandleargs, + .pc_decode = nfssvc_decode_fhandleargs, .pc_encode = nfsaclsvc_encode_attrstatres, .pc_release = nfsaclsvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_fhandle), From 64063892efc1daa3a48882673811ff327ba75ed5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Nov 2020 11:49:29 -0500 Subject: [PATCH 38/99] NFSD: Update the NFSv2 ACL ACCESS argument decoder to use struct xdr_stream Signed-off-by: Chuck Lever --- fs/nfsd/nfs2acl.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 0274348f6679..7eeac5b81c20 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -222,14 +222,15 @@ static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p) static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p) { - struct nfsd3_accessargs *argp = rqstp->rq_argp; + struct xdr_stream *xdr = &rqstp->rq_arg_stream; + struct nfsd3_accessargs *args = rqstp->rq_argp; - p = nfs2svc_decode_fh(p, &argp->fh); - if (!p) + if (!svcxdr_decode_fhandle(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u32(xdr, &args->access) < 0) return 0; - argp->access = ntohl(*p++); - return xdr_argsize_check(rqstp, p); + return 1; } /* From baadce65d6ee3032b921d9c043ba808bc69d6b13 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Oct 2020 17:49:16 -0400 Subject: [PATCH 39/99] NFSD: Clean up after updating NFSv2 ACL decoders Signed-off-by: Chuck Lever --- fs/nfsd/nfsxdr.c | 18 ------------------ fs/nfsd/xdr.h | 1 - 2 files changed, 19 deletions(-) diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 5ab9fc14816c..5d79ef6a0c7f 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -26,18 +26,6 @@ static u32 nfs_ftypes[] = { * Basic NFSv2 data types (RFC 1094 Section 2.3) */ -static __be32 * -decode_fh(__be32 *p, struct svc_fh *fhp) -{ - fh_init(fhp, NFS_FHSIZE); - memcpy(&fhp->fh_handle.fh_base, p, NFS_FHSIZE); - fhp->fh_handle.fh_size = NFS_FHSIZE; - - /* FIXME: Look up export pointer here and verify - * Sun Secure RPC if requested */ - return p + (NFS_FHSIZE >> 2); -} - /** * svcxdr_decode_fhandle - Decode an NFSv2 file handle * @xdr: XDR stream positioned at an encoded NFSv2 FH @@ -62,12 +50,6 @@ svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp) return true; } -/* Helper function for NFSv2 ACL code */ -__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp) -{ - return decode_fh(p, fhp); -} - static __be32 * encode_fh(__be32 *p, struct svc_fh *fhp) { diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index 035c99c7b384..3018b52b6d5e 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -164,7 +164,6 @@ void nfssvc_release_readres(struct svc_rqst *rqstp); /* Helper functions for NFSv2 ACL code */ __be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat); -__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp); bool svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp); #endif /* LINUX_NFSD_H */ From 05027eafc266487c6e056d10ab352861df95b5d4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Nov 2020 11:52:04 -0500 Subject: [PATCH 40/99] NFSD: Update the NFSv3 GETACL argument decoder to use struct xdr_stream Signed-off-by: Chuck Lever --- fs/nfsd/nfs3acl.c | 11 ++++++----- fs/nfsd/nfs3xdr.c | 11 ++++++++++- fs/nfsd/xdr3.h | 1 + 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 9e1a92fb9771..addb0d7d5500 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -124,19 +124,20 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp) /* * XDR decode functions */ + static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_getaclargs *args = rqstp->rq_argp; - p = nfs3svc_decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u32(xdr, &args->mask) < 0) return 0; - args->mask = ntohl(*p); p++; - return xdr_argsize_check(rqstp, p); + return 1; } - static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_setaclargs *args = rqstp->rq_argp; diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index a30b418a5116..aa55d0ba2a54 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -53,7 +53,16 @@ svcxdr_decode_nfstime3(struct xdr_stream *xdr, struct timespec64 *timep) return true; } -static bool +/** + * svcxdr_decode_nfs_fh3 - Decode an NFSv3 file handle + * @xdr: XDR stream positioned at an undecoded NFSv3 FH + * @fhp: OUT: filled-in server file handle + * + * Return values: + * %false: The encoded file handle was not valid + * %true: @fhp has been initialized + */ +bool svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp) { __be32 *p; diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 5afb3ce4f062..7456aee74f3d 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -308,6 +308,7 @@ int nfs3svc_encode_entry_plus(void *, const char *name, __be32 *nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp); __be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp); +bool svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp); #endif /* _LINUX_NFSD_XDR3_H */ From 68519ff2a1c72c67fcdc4b81671acda59f420af9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Nov 2020 11:56:26 -0500 Subject: [PATCH 41/99] NFSD: Update the NFSv2 SETACL argument decoder to use struct xdr_stream Signed-off-by: Chuck Lever --- fs/nfsd/nfs3acl.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index addb0d7d5500..a568b842e9eb 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -140,28 +140,23 @@ static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p) static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p) { - struct nfsd3_setaclargs *args = rqstp->rq_argp; - struct kvec *head = rqstp->rq_arg.head; - unsigned int base; - int n; + struct xdr_stream *xdr = &rqstp->rq_arg_stream; + struct nfsd3_setaclargs *argp = rqstp->rq_argp; - p = nfs3svc_decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_nfs_fh3(xdr, &argp->fh)) return 0; - args->mask = ntohl(*p++); - if (args->mask & ~NFS_ACL_MASK || - !xdr_argsize_check(rqstp, p)) + if (xdr_stream_decode_u32(xdr, &argp->mask) < 0) + return 0; + if (argp->mask & ~NFS_ACL_MASK) + return 0; + if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_ACL) ? + &argp->acl_access : NULL)) + return 0; + if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_DFACL) ? + &argp->acl_default : NULL)) return 0; - base = (char *)p - (char *)head->iov_base; - n = nfsacl_decode(&rqstp->rq_arg, base, NULL, - (args->mask & NFS_ACL) ? - &args->acl_access : NULL); - if (n > 0) - n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL, - (args->mask & NFS_DFACL) ? - &args->acl_default : NULL); - return (n > 0); + return 1; } /* From 9cee763ee654ce8622d673b8e32687d738e24ace Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 20 Oct 2020 09:56:52 -0400 Subject: [PATCH 42/99] NFSD: Clean up after updating NFSv3 ACL decoders Signed-off-by: Chuck Lever --- fs/nfsd/nfs3xdr.c | 20 -------------------- fs/nfsd/xdr3.h | 2 -- 2 files changed, 22 deletions(-) diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index aa55d0ba2a54..00a96054280a 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -82,26 +82,6 @@ svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp) return true; } -static __be32 * -decode_fh(__be32 *p, struct svc_fh *fhp) -{ - unsigned int size; - fh_init(fhp, NFS3_FHSIZE); - size = ntohl(*p++); - if (size > NFS3_FHSIZE) - return NULL; - - memcpy(&fhp->fh_handle.fh_base, p, size); - fhp->fh_handle.fh_size = size; - return p + XDR_QUADLEN(size); -} - -/* Helper function for NFSv3 ACL code */ -__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp) -{ - return decode_fh(p, fhp); -} - static __be32 * encode_fh(__be32 *p, struct svc_fh *fhp) { diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 7456aee74f3d..3e1578953f54 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -307,8 +307,6 @@ int nfs3svc_encode_entry_plus(void *, const char *name, /* Helper functions for NFSv3 ACL code */ __be32 *nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp); -__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp); bool svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp); - #endif /* _LINUX_NFSD_XDR3_H */ From 1b76d1df1a3683b6b23cd1c813d13c5e6a9d35e5 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 6 Jan 2021 09:52:34 +0200 Subject: [PATCH 43/99] nfsd: remove unused stats counters Commit 501cb1849f86 ("nfsd: rip out the raparms cache") removed the code that updates read-ahead cache stats counters, commit 8bbfa9f3889b ("knfsd: remove the nfsd thread busy histogram") removed code that updates the thread busy stats counters back in 2009 and code that updated filehandle cache stats was removed back in 2002. Remove the unused stats counters from nfsd_stats struct and print hardcoded zeros in /proc/net/rpc/nfsd. Signed-off-by: Amir Goldstein Signed-off-by: Chuck Lever --- fs/nfsd/stats.c | 41 ++++++++++++++++------------------------- fs/nfsd/stats.h | 10 ---------- 2 files changed, 16 insertions(+), 35 deletions(-) diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index b1bc582b0493..e928e224205a 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -7,16 +7,14 @@ * Format: * rc * Statistsics for the reply cache - * fh + * fh * statistics for filehandle lookup * io * statistics for IO throughput - * th <10%-20%> <20%-30%> ... <90%-100%> <100%> - * time (seconds) when nfsd thread usage above thresholds - * and number of times that all threads were in use - * ra cache-size <10% <20% <30% ... <100% not-found - * number of times that read-ahead entry was found that deep in - * the cache. + * th + * number of threads + * ra + * * plus generic RPC stats (see net/sunrpc/stats.c) * * Copyright (C) 1995, 1996, 1997 Olaf Kirch @@ -38,31 +36,24 @@ static int nfsd_proc_show(struct seq_file *seq, void *v) { int i; - seq_printf(seq, "rc %u %u %u\nfh %u %u %u %u %u\nio %u %u\n", + seq_printf(seq, "rc %u %u %u\nfh %u 0 0 0 0\nio %u %u\n", nfsdstats.rchits, nfsdstats.rcmisses, nfsdstats.rcnocache, nfsdstats.fh_stale, - nfsdstats.fh_lookup, - nfsdstats.fh_anon, - nfsdstats.fh_nocache_dir, - nfsdstats.fh_nocache_nondir, nfsdstats.io_read, nfsdstats.io_write); - /* thread usage: */ - seq_printf(seq, "th %u %u", nfsdstats.th_cnt, nfsdstats.th_fullcnt); - for (i=0; i<10; i++) { - unsigned int jifs = nfsdstats.th_usage[i]; - unsigned int sec = jifs / HZ, msec = (jifs % HZ)*1000/HZ; - seq_printf(seq, " %u.%03u", sec, msec); - } - /* newline and ra-cache */ - seq_printf(seq, "\nra %u", nfsdstats.ra_size); - for (i=0; i<11; i++) - seq_printf(seq, " %u", nfsdstats.ra_depth[i]); - seq_putc(seq, '\n'); - + /* thread usage: */ + seq_printf(seq, "th %u 0", nfsdstats.th_cnt); + + /* deprecated thread usage histogram stats */ + for (i = 0; i < 10; i++) + seq_puts(seq, " 0.000"); + + /* deprecated ra-cache stats */ + seq_puts(seq, "\nra 0 0 0 0 0 0 0 0 0 0 0 0\n"); + /* show my rpc info */ svc_seq_show(seq, &nfsd_svcstats); diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index b23fdac69820..5e3cdf21556a 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -15,19 +15,9 @@ struct nfsd_stats { unsigned int rcmisses; /* repcache hits */ unsigned int rcnocache; /* uncached reqs */ unsigned int fh_stale; /* FH stale error */ - unsigned int fh_lookup; /* dentry cached */ - unsigned int fh_anon; /* anon file dentry returned */ - unsigned int fh_nocache_dir; /* filehandle not found in dcache */ - unsigned int fh_nocache_nondir; /* filehandle not found in dcache */ unsigned int io_read; /* bytes returned to read requests */ unsigned int io_write; /* bytes passed in write requests */ unsigned int th_cnt; /* number of available threads */ - unsigned int th_usage[10]; /* number of ticks during which n perdeciles - * of available threads were in use */ - unsigned int th_fullcnt; /* number of times last free thread was used */ - unsigned int ra_size; /* size of ra cache */ - unsigned int ra_depth[11]; /* number of times ra entry was found that deep - * in the cache (10percentiles). [10] = not found */ #ifdef CONFIG_NFSD_V4 unsigned int nfs4_opcount[LAST_NFS4_OP + 1]; /* count of individual nfsv4 operations */ #endif From e567b98ce9a4b35b63c364d24828a9e5cd7a8179 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 6 Jan 2021 09:52:35 +0200 Subject: [PATCH 44/99] nfsd: protect concurrent access to nfsd stats counters nfsd stats counters can be updated by concurrent nfsd threads without any protection. Convert some nfsd_stats and nfsd_net struct members to use percpu counters. The longest_chain* members of struct nfsd_net remain unprotected. Signed-off-by: Amir Goldstein Signed-off-by: Chuck Lever --- fs/nfsd/netns.h | 23 ++++++++----- fs/nfsd/nfs4proc.c | 2 +- fs/nfsd/nfscache.c | 52 ++++++++++++++++++++--------- fs/nfsd/nfsctl.c | 5 ++- fs/nfsd/nfsfh.c | 2 +- fs/nfsd/stats.c | 79 ++++++++++++++++++++++++++++++++++++-------- fs/nfsd/stats.h | 82 +++++++++++++++++++++++++++++++++++++++------- fs/nfsd/vfs.c | 4 +-- 8 files changed, 194 insertions(+), 55 deletions(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 7346acda9d76..c330f5bd0cf3 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -10,6 +10,7 @@ #include #include +#include /* Hash tables for nfs4_clientid state */ #define CLIENT_HASH_BITS 4 @@ -21,6 +22,14 @@ struct cld_net; struct nfsd4_client_tracking_ops; +enum { + /* cache misses due only to checksum comparison failures */ + NFSD_NET_PAYLOAD_MISSES, + /* amount of memory (in bytes) currently consumed by the DRC */ + NFSD_NET_DRC_MEM_USAGE, + NFSD_NET_COUNTERS_NUM +}; + /* * Represents a nfsd "container". With respect to nfsv4 state tracking, the * fields of interest are the *_id_hashtbls and the *_name_tree. These track @@ -149,20 +158,16 @@ struct nfsd_net { /* * Stats and other tracking of on the duplicate reply cache. - * These fields and the "rc" fields in nfsdstats are modified - * with only the per-bucket cache lock, which isn't really safe - * and should be fixed if we want the statistics to be - * completely accurate. + * The longest_chain* fields are modified with only the per-bucket + * cache lock, which isn't really safe and should be fixed if we want + * these statistics to be completely accurate. */ /* total number of entries */ atomic_t num_drc_entries; - /* cache misses due only to checksum comparison failures */ - unsigned int payload_misses; - - /* amount of memory (in bytes) currently consumed by the DRC */ - unsigned int drc_mem_usage; + /* Per-netns stats counters */ + struct percpu_counter counter[NFSD_NET_COUNTERS_NUM]; /* longest hash chain seen */ unsigned int longest_chain; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index f567592692ee..8273a99359d1 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -2174,7 +2174,7 @@ nfsd4_proc_null(struct svc_rqst *rqstp) static inline void nfsd4_increment_op_stats(u32 opnum) { if (opnum >= FIRST_NFS4_OP && opnum <= LAST_NFS4_OP) - nfsdstats.nfs4_opcount[opnum]++; + percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_NFS4_OP(opnum)]); } static const struct nfsd4_operation nfsd4_ops[]; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 80c90fc231a5..96cdf77925f3 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -121,14 +121,14 @@ nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp, struct nfsd_net *nn) { if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) { - nn->drc_mem_usage -= rp->c_replvec.iov_len; + nfsd_stats_drc_mem_usage_sub(nn, rp->c_replvec.iov_len); kfree(rp->c_replvec.iov_base); } if (rp->c_state != RC_UNUSED) { rb_erase(&rp->c_node, &b->rb_head); list_del(&rp->c_lru); atomic_dec(&nn->num_drc_entries); - nn->drc_mem_usage -= sizeof(*rp); + nfsd_stats_drc_mem_usage_sub(nn, sizeof(*rp)); } kmem_cache_free(drc_slab, rp); } @@ -154,6 +154,16 @@ void nfsd_drc_slab_free(void) kmem_cache_destroy(drc_slab); } +static int nfsd_reply_cache_stats_init(struct nfsd_net *nn) +{ + return nfsd_percpu_counters_init(nn->counter, NFSD_NET_COUNTERS_NUM); +} + +static void nfsd_reply_cache_stats_destroy(struct nfsd_net *nn) +{ + nfsd_percpu_counters_destroy(nn->counter, NFSD_NET_COUNTERS_NUM); +} + int nfsd_reply_cache_init(struct nfsd_net *nn) { unsigned int hashsize; @@ -165,12 +175,16 @@ int nfsd_reply_cache_init(struct nfsd_net *nn) hashsize = nfsd_hashsize(nn->max_drc_entries); nn->maskbits = ilog2(hashsize); + status = nfsd_reply_cache_stats_init(nn); + if (status) + goto out_nomem; + nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan; nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count; nn->nfsd_reply_cache_shrinker.seeks = 1; status = register_shrinker(&nn->nfsd_reply_cache_shrinker); if (status) - goto out_nomem; + goto out_stats_destroy; nn->drc_hashtbl = kvzalloc(array_size(hashsize, sizeof(*nn->drc_hashtbl)), GFP_KERNEL); @@ -186,6 +200,8 @@ int nfsd_reply_cache_init(struct nfsd_net *nn) return 0; out_shrinker: unregister_shrinker(&nn->nfsd_reply_cache_shrinker); +out_stats_destroy: + nfsd_reply_cache_stats_destroy(nn); out_nomem: printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); return -ENOMEM; @@ -196,6 +212,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn) struct svc_cacherep *rp; unsigned int i; + nfsd_reply_cache_stats_destroy(nn); unregister_shrinker(&nn->nfsd_reply_cache_shrinker); for (i = 0; i < nn->drc_hashsize; i++) { @@ -324,7 +341,7 @@ nfsd_cache_key_cmp(const struct svc_cacherep *key, { if (key->c_key.k_xid == rp->c_key.k_xid && key->c_key.k_csum != rp->c_key.k_csum) { - ++nn->payload_misses; + nfsd_stats_payload_misses_inc(nn); trace_nfsd_drc_mismatch(nn, key, rp); } @@ -407,7 +424,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp) rqstp->rq_cacherep = NULL; if (type == RC_NOCACHE) { - nfsdstats.rcnocache++; + nfsd_stats_rc_nocache_inc(); goto out; } @@ -429,12 +446,12 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp) goto found_entry; } - nfsdstats.rcmisses++; + nfsd_stats_rc_misses_inc(); rqstp->rq_cacherep = rp; rp->c_state = RC_INPROG; atomic_inc(&nn->num_drc_entries); - nn->drc_mem_usage += sizeof(*rp); + nfsd_stats_drc_mem_usage_add(nn, sizeof(*rp)); /* go ahead and prune the cache */ prune_bucket(b, nn); @@ -446,7 +463,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp) found_entry: /* We found a matching entry which is either in progress or done. */ - nfsdstats.rchits++; + nfsd_stats_rc_hits_inc(); rtn = RC_DROPIT; /* Request being processed */ @@ -548,7 +565,7 @@ void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) return; } spin_lock(&b->cache_lock); - nn->drc_mem_usage += bufsize; + nfsd_stats_drc_mem_usage_add(nn, bufsize); lru_put_end(b, rp); rp->c_secure = test_bit(RQ_SECURE, &rqstp->rq_flags); rp->c_type = cachetype; @@ -588,13 +605,18 @@ static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) seq_printf(m, "max entries: %u\n", nn->max_drc_entries); seq_printf(m, "num entries: %u\n", - atomic_read(&nn->num_drc_entries)); + atomic_read(&nn->num_drc_entries)); seq_printf(m, "hash buckets: %u\n", 1 << nn->maskbits); - seq_printf(m, "mem usage: %u\n", nn->drc_mem_usage); - seq_printf(m, "cache hits: %u\n", nfsdstats.rchits); - seq_printf(m, "cache misses: %u\n", nfsdstats.rcmisses); - seq_printf(m, "not cached: %u\n", nfsdstats.rcnocache); - seq_printf(m, "payload misses: %u\n", nn->payload_misses); + seq_printf(m, "mem usage: %lld\n", + percpu_counter_sum_positive(&nn->counter[NFSD_NET_DRC_MEM_USAGE])); + seq_printf(m, "cache hits: %lld\n", + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS])); + seq_printf(m, "cache misses: %lld\n", + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES])); + seq_printf(m, "not cached: %lld\n", + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE])); + seq_printf(m, "payload misses: %lld\n", + percpu_counter_sum_positive(&nn->counter[NFSD_NET_PAYLOAD_MISSES])); seq_printf(m, "longest chain len: %u\n", nn->longest_chain); seq_printf(m, "cachesize at longest: %u\n", nn->longest_chain_cachesize); return 0; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index f6d5d783f4a4..258605ee49b8 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1534,7 +1534,9 @@ static int __init init_nfsd(void) retval = nfsd4_init_pnfs(); if (retval) goto out_free_slabs; - nfsd_stat_init(); /* Statistics */ + retval = nfsd_stat_init(); /* Statistics */ + if (retval) + goto out_free_pnfs; retval = nfsd_drc_slab_create(); if (retval) goto out_free_stat; @@ -1554,6 +1556,7 @@ static int __init init_nfsd(void) nfsd_drc_slab_free(); out_free_stat: nfsd_stat_shutdown(); +out_free_pnfs: nfsd4_exit_pnfs(); out_free_slabs: nfsd4_free_slabs(); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 66f2ef67792a..9e31b2b5c6d2 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -422,7 +422,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) } out: if (error == nfserr_stale) - nfsdstats.fh_stale++; + nfsd_stats_fh_stale_inc(); return error; } diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index e928e224205a..1d3b881e7382 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -36,13 +36,13 @@ static int nfsd_proc_show(struct seq_file *seq, void *v) { int i; - seq_printf(seq, "rc %u %u %u\nfh %u 0 0 0 0\nio %u %u\n", - nfsdstats.rchits, - nfsdstats.rcmisses, - nfsdstats.rcnocache, - nfsdstats.fh_stale, - nfsdstats.io_read, - nfsdstats.io_write); + seq_printf(seq, "rc %lld %lld %lld\nfh %lld 0 0 0 0\nio %lld %lld\n", + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS]), + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES]), + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]), + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_FH_STALE]), + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_READ]), + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_WRITE])); /* thread usage: */ seq_printf(seq, "th %u 0", nfsdstats.th_cnt); @@ -61,8 +61,10 @@ static int nfsd_proc_show(struct seq_file *seq, void *v) /* Show count for individual nfsv4 operations */ /* Writing operation numbers 0 1 2 also for maintaining uniformity */ seq_printf(seq,"proc4ops %u", LAST_NFS4_OP + 1); - for (i = 0; i <= LAST_NFS4_OP; i++) - seq_printf(seq, " %u", nfsdstats.nfs4_opcount[i]); + for (i = 0; i <= LAST_NFS4_OP; i++) { + seq_printf(seq, " %lld", + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_NFS4_OP(i)])); + } seq_putc(seq, '\n'); #endif @@ -82,14 +84,63 @@ static const struct proc_ops nfsd_proc_ops = { .proc_release = single_release, }; -void -nfsd_stat_init(void) +int nfsd_percpu_counters_init(struct percpu_counter counters[], int num) { - svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_ops); + int i, err = 0; + + for (i = 0; !err && i < num; i++) + err = percpu_counter_init(&counters[i], 0, GFP_KERNEL); + + if (!err) + return 0; + + for (; i > 0; i--) + percpu_counter_destroy(&counters[i-1]); + + return err; } -void -nfsd_stat_shutdown(void) +void nfsd_percpu_counters_reset(struct percpu_counter counters[], int num) { + int i; + + for (i = 0; i < num; i++) + percpu_counter_set(&counters[i], 0); +} + +void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num) +{ + int i; + + for (i = 0; i < num; i++) + percpu_counter_destroy(&counters[i]); +} + +static int nfsd_stat_counters_init(void) +{ + return nfsd_percpu_counters_init(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM); +} + +static void nfsd_stat_counters_destroy(void) +{ + nfsd_percpu_counters_destroy(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM); +} + +int nfsd_stat_init(void) +{ + int err; + + err = nfsd_stat_counters_init(); + if (err) + return err; + + svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_ops); + + return 0; +} + +void nfsd_stat_shutdown(void) +{ + nfsd_stat_counters_destroy(); svc_proc_unregister(&init_net, "nfsd"); } diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index 5e3cdf21556a..87c3150c200f 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -8,27 +8,85 @@ #define _NFSD_STATS_H #include +#include +enum { + NFSD_STATS_RC_HITS, /* repcache hits */ + NFSD_STATS_RC_MISSES, /* repcache misses */ + NFSD_STATS_RC_NOCACHE, /* uncached reqs */ + NFSD_STATS_FH_STALE, /* FH stale error */ + NFSD_STATS_IO_READ, /* bytes returned to read requests */ + NFSD_STATS_IO_WRITE, /* bytes passed in write requests */ +#ifdef CONFIG_NFSD_V4 + NFSD_STATS_FIRST_NFS4_OP, /* count of individual nfsv4 operations */ + NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP, +#define NFSD_STATS_NFS4_OP(op) (NFSD_STATS_FIRST_NFS4_OP + (op)) +#endif + NFSD_STATS_COUNTERS_NUM +}; + struct nfsd_stats { - unsigned int rchits; /* repcache hits */ - unsigned int rcmisses; /* repcache hits */ - unsigned int rcnocache; /* uncached reqs */ - unsigned int fh_stale; /* FH stale error */ - unsigned int io_read; /* bytes returned to read requests */ - unsigned int io_write; /* bytes passed in write requests */ - unsigned int th_cnt; /* number of available threads */ -#ifdef CONFIG_NFSD_V4 - unsigned int nfs4_opcount[LAST_NFS4_OP + 1]; /* count of individual nfsv4 operations */ -#endif + struct percpu_counter counter[NFSD_STATS_COUNTERS_NUM]; + /* Protected by nfsd_mutex */ + unsigned int th_cnt; /* number of available threads */ }; extern struct nfsd_stats nfsdstats; + extern struct svc_stat nfsd_svcstats; -void nfsd_stat_init(void); -void nfsd_stat_shutdown(void); +int nfsd_percpu_counters_init(struct percpu_counter counters[], int num); +void nfsd_percpu_counters_reset(struct percpu_counter counters[], int num); +void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num); +int nfsd_stat_init(void); +void nfsd_stat_shutdown(void); + +static inline void nfsd_stats_rc_hits_inc(void) +{ + percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_HITS]); +} + +static inline void nfsd_stats_rc_misses_inc(void) +{ + percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_MISSES]); +} + +static inline void nfsd_stats_rc_nocache_inc(void) +{ + percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]); +} + +static inline void nfsd_stats_fh_stale_inc(void) +{ + percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_FH_STALE]); +} + +static inline void nfsd_stats_io_read_add(s64 amount) +{ + percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_READ], amount); +} + +static inline void nfsd_stats_io_write_add(s64 amount) +{ + percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_WRITE], amount); +} + +static inline void nfsd_stats_payload_misses_inc(struct nfsd_net *nn) +{ + percpu_counter_inc(&nn->counter[NFSD_NET_PAYLOAD_MISSES]); +} + +static inline void nfsd_stats_drc_mem_usage_add(struct nfsd_net *nn, s64 amount) +{ + percpu_counter_add(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount); +} + +static inline void nfsd_stats_drc_mem_usage_sub(struct nfsd_net *nn, s64 amount) +{ + percpu_counter_sub(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount); +} #endif /* _NFSD_STATS_H */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 04937e51de56..d560c1bb2ec2 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -889,7 +889,7 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned long *count, u32 *eof, ssize_t host_err) { if (host_err >= 0) { - nfsdstats.io_read += host_err; + nfsd_stats_io_read_add(host_err); *eof = nfsd_eof_on_read(file, offset, host_err, *count); *count = host_err; fsnotify_access(file); @@ -1040,7 +1040,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, goto out_nfserr; } *cnt = host_err; - nfsdstats.io_write += *cnt; + nfsd_stats_io_write_add(*cnt); fsnotify_modify(file); if (stable && use_wgather) { From 20ad856e47323e208ae8d6a9ecfe5bf0be6f505e Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 6 Jan 2021 09:52:36 +0200 Subject: [PATCH 45/99] nfsd: report per-export stats Collect some nfsd stats per export in addition to the global stats. A new nfsdfs export_stats file is created. It uses the same ops as the exports file to iterate the export entries and we use the file's name to determine the reported info per export. For example: $ cat /proc/fs/nfsd/export_stats # Version 1.1 # Path Client Start-time # Stats /test localhost 92 fh_stale: 0 io_read: 9 io_write: 1 Every export entry reports the start time when stats collection started, so stats collecting scripts can know if stats where reset between samples. Signed-off-by: Amir Goldstein Signed-off-by: Chuck Lever --- fs/nfsd/export.c | 68 ++++++++++++++++++++++++++++++++++++++++++------ fs/nfsd/export.h | 15 +++++++++++ fs/nfsd/nfsctl.c | 3 +++ fs/nfsd/nfsd.h | 2 +- fs/nfsd/nfsfh.c | 4 +-- fs/nfsd/stats.h | 12 ++++++--- fs/nfsd/vfs.c | 4 +-- 7 files changed, 92 insertions(+), 16 deletions(-) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 81e7bb12aca6..7c863f2c21e0 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -331,12 +331,29 @@ static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc) fsloc->locations = NULL; } +static int export_stats_init(struct export_stats *stats) +{ + stats->start_time = ktime_get_seconds(); + return nfsd_percpu_counters_init(stats->counter, EXP_STATS_COUNTERS_NUM); +} + +static void export_stats_reset(struct export_stats *stats) +{ + nfsd_percpu_counters_reset(stats->counter, EXP_STATS_COUNTERS_NUM); +} + +static void export_stats_destroy(struct export_stats *stats) +{ + nfsd_percpu_counters_destroy(stats->counter, EXP_STATS_COUNTERS_NUM); +} + static void svc_export_put(struct kref *ref) { struct svc_export *exp = container_of(ref, struct svc_export, h.ref); path_put(&exp->ex_path); auth_domain_put(exp->ex_client); nfsd4_fslocs_free(&exp->ex_fslocs); + export_stats_destroy(&exp->ex_stats); kfree(exp->ex_uuid); kfree_rcu(exp, ex_rcu); } @@ -692,22 +709,47 @@ static void exp_flags(struct seq_file *m, int flag, int fsid, kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fslocs); static void show_secinfo(struct seq_file *m, struct svc_export *exp); +static int is_export_stats_file(struct seq_file *m) +{ + /* + * The export_stats file uses the same ops as the exports file. + * We use the file's name to determine the reported info per export. + * There is no rename in nsfdfs, so d_name.name is stable. + */ + return !strcmp(m->file->f_path.dentry->d_name.name, "export_stats"); +} + static int svc_export_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) { - struct svc_export *exp ; + struct svc_export *exp; + bool export_stats = is_export_stats_file(m); - if (h ==NULL) { - seq_puts(m, "#path domain(flags)\n"); + if (h == NULL) { + if (export_stats) + seq_puts(m, "#path domain start-time\n#\tstats\n"); + else + seq_puts(m, "#path domain(flags)\n"); return 0; } exp = container_of(h, struct svc_export, h); seq_path(m, &exp->ex_path, " \t\n\\"); seq_putc(m, '\t'); seq_escape(m, exp->ex_client->name, " \t\n\\"); + if (export_stats) { + seq_printf(m, "\t%lld\n", exp->ex_stats.start_time); + seq_printf(m, "\tfh_stale: %lld\n", + percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_FH_STALE])); + seq_printf(m, "\tio_read: %lld\n", + percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_READ])); + seq_printf(m, "\tio_write: %lld\n", + percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_WRITE])); + seq_putc(m, '\n'); + return 0; + } seq_putc(m, '('); - if (test_bit(CACHE_VALID, &h->flags) && + if (test_bit(CACHE_VALID, &h->flags) && !test_bit(CACHE_NEGATIVE, &h->flags)) { exp_flags(m, exp->ex_flags, exp->ex_fsid, exp->ex_anon_uid, exp->ex_anon_gid, &exp->ex_fslocs); @@ -748,6 +790,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) new->ex_layout_types = 0; new->ex_uuid = NULL; new->cd = item->cd; + export_stats_reset(&new->ex_stats); } static void export_update(struct cache_head *cnew, struct cache_head *citem) @@ -780,10 +823,15 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) static struct cache_head *svc_export_alloc(void) { struct svc_export *i = kmalloc(sizeof(*i), GFP_KERNEL); - if (i) - return &i->h; - else + if (!i) return NULL; + + if (export_stats_init(&i->ex_stats)) { + kfree(i); + return NULL; + } + + return &i->h; } static const struct cache_detail svc_export_cache_template = { @@ -1245,10 +1293,14 @@ static int e_show(struct seq_file *m, void *p) struct cache_head *cp = p; struct svc_export *exp = container_of(cp, struct svc_export, h); struct cache_detail *cd = m->private; + bool export_stats = is_export_stats_file(m); if (p == SEQ_START_TOKEN) { seq_puts(m, "# Version 1.1\n"); - seq_puts(m, "# Path Client(Flags) # IPs\n"); + if (export_stats) + seq_puts(m, "# Path Client Start-time\n#\tStats\n"); + else + seq_puts(m, "# Path Client(Flags) # IPs\n"); return 0; } diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index e7daa1f246f0..ee0e3aba4a6e 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -6,6 +6,7 @@ #define NFSD_EXPORT_H #include +#include #include #include @@ -46,6 +47,19 @@ struct exp_flavor_info { u32 flags; }; +/* Per-export stats */ +enum { + EXP_STATS_FH_STALE, + EXP_STATS_IO_READ, + EXP_STATS_IO_WRITE, + EXP_STATS_COUNTERS_NUM +}; + +struct export_stats { + time64_t start_time; + struct percpu_counter counter[EXP_STATS_COUNTERS_NUM]; +}; + struct svc_export { struct cache_head h; struct auth_domain * ex_client; @@ -62,6 +76,7 @@ struct svc_export { struct nfsd4_deviceid_map *ex_devid_map; struct cache_detail *cd; struct rcu_head ex_rcu; + struct export_stats ex_stats; }; /* an "export key" (expkey) maps a filehandlefragement to an diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 258605ee49b8..4f6e514192bd 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -32,6 +32,7 @@ enum { NFSD_Root = 1, NFSD_List, + NFSD_Export_Stats, NFSD_Export_features, NFSD_Fh, NFSD_FO_UnlockIP, @@ -1348,6 +1349,8 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) static const struct tree_descr nfsd_files[] = { [NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO}, + /* Per-export io stats use same ops as exports file */ + [NFSD_Export_Stats] = {"export_stats", &exports_nfsd_operations, S_IRUGO}, [NFSD_Export_features] = {"export_features", &export_features_operations, S_IRUGO}, [NFSD_FO_UnlockIP] = {"unlock_ip", diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index d63cf8196fed..8bdc37aa2c2e 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -24,8 +24,8 @@ #include #include "netns.h" -#include "stats.h" #include "export.h" +#include "stats.h" #undef ifdebug #ifdef CONFIG_SUNRPC_DEBUG diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 9e31b2b5c6d2..4744a276058d 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -349,7 +349,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) __be32 fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) { - struct svc_export *exp; + struct svc_export *exp = NULL; struct dentry *dentry; __be32 error; @@ -422,7 +422,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) } out: if (error == nfserr_stale) - nfsd_stats_fh_stale_inc(); + nfsd_stats_fh_stale_inc(exp); return error; } diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index 87c3150c200f..51ecda852e23 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -59,19 +59,25 @@ static inline void nfsd_stats_rc_nocache_inc(void) percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]); } -static inline void nfsd_stats_fh_stale_inc(void) +static inline void nfsd_stats_fh_stale_inc(struct svc_export *exp) { percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_FH_STALE]); + if (exp) + percpu_counter_inc(&exp->ex_stats.counter[EXP_STATS_FH_STALE]); } -static inline void nfsd_stats_io_read_add(s64 amount) +static inline void nfsd_stats_io_read_add(struct svc_export *exp, s64 amount) { percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_READ], amount); + if (exp) + percpu_counter_add(&exp->ex_stats.counter[EXP_STATS_IO_READ], amount); } -static inline void nfsd_stats_io_write_add(s64 amount) +static inline void nfsd_stats_io_write_add(struct svc_export *exp, s64 amount) { percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_WRITE], amount); + if (exp) + percpu_counter_add(&exp->ex_stats.counter[EXP_STATS_IO_WRITE], amount); } static inline void nfsd_stats_payload_misses_inc(struct nfsd_net *nn) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index d560c1bb2ec2..d316e11923c5 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -889,7 +889,7 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned long *count, u32 *eof, ssize_t host_err) { if (host_err >= 0) { - nfsd_stats_io_read_add(host_err); + nfsd_stats_io_read_add(fhp->fh_export, host_err); *eof = nfsd_eof_on_read(file, offset, host_err, *count); *count = host_err; fsnotify_access(file); @@ -1040,7 +1040,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, goto out_nfserr; } *cnt = host_err; - nfsd_stats_io_write_add(*cnt); + nfsd_stats_io_write_add(exp, *cnt); fsnotify_modify(file); if (stable && use_wgather) { From 59a00257c66c2d7b3db21245287711ea6c745e7c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Dec 2020 15:56:20 -0500 Subject: [PATCH 46/99] svcrdma: Refactor svc_rdma_init() and svc_rdma_clean_up() Setting up the proc variables is about to get more complicated. Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 526da5d4710b..1fc1d5cbeb9b 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -224,27 +224,43 @@ static struct ctl_table svcrdma_root_table[] = { { }, }; +static void svc_rdma_proc_cleanup(void) +{ + if (!svcrdma_table_header) + return; + unregister_sysctl_table(svcrdma_table_header); + svcrdma_table_header = NULL; +} + +static int svc_rdma_proc_init(void) +{ + if (svcrdma_table_header) + return 0; + + svcrdma_table_header = register_sysctl_table(svcrdma_root_table); + return 0; +} + void svc_rdma_cleanup(void) { dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); - if (svcrdma_table_header) { - unregister_sysctl_table(svcrdma_table_header); - svcrdma_table_header = NULL; - } svc_unreg_xprt_class(&svc_rdma_class); + svc_rdma_proc_cleanup(); } int svc_rdma_init(void) { + int rc; + dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); dprintk("\tmax_requests : %u\n", svcrdma_max_requests); dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); - if (!svcrdma_table_header) - svcrdma_table_header = - register_sysctl_table(svcrdma_root_table); + rc = svc_rdma_proc_init(); + if (rc) + return rc; /* Register RDMA with the SVC transport switch */ svc_reg_xprt_class(&svc_rdma_class); From df971cd853c05778ae1175e8aeb80a04bb9d4be5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Dec 2020 15:47:44 -0500 Subject: [PATCH 47/99] svcrdma: Convert rdma_stat_recv to a per-CPU counter Receives are frequent events. Avoid the overhead of a memory bus lock cycle for counting a value that is hardly every used. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 3 +- net/sunrpc/xprtrdma/svc_rdma.c | 55 +++++++++++++++++++++++-- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 3 +- 3 files changed, 54 insertions(+), 7 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 294b56e61522..ff32c59a27e7 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -49,6 +49,7 @@ #include #include +#include #include #include @@ -65,7 +66,7 @@ extern unsigned int svcrdma_max_requests; extern unsigned int svcrdma_max_bc_requests; extern unsigned int svcrdma_max_req_size; -extern atomic_t rdma_stat_recv; +extern struct percpu_counter svcrdma_stat_recv; extern atomic_t rdma_stat_read; extern atomic_t rdma_stat_write; extern atomic_t rdma_stat_sq_starve; diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 1fc1d5cbeb9b..3e5e622bad81 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -63,7 +63,7 @@ unsigned int svcrdma_max_req_size = RPCRDMA_DEF_INLINE_THRESH; static unsigned int min_max_inline = RPCRDMA_DEF_INLINE_THRESH; static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH; -atomic_t rdma_stat_recv; +struct percpu_counter svcrdma_stat_recv; atomic_t rdma_stat_read; atomic_t rdma_stat_write; atomic_t rdma_stat_sq_starve; @@ -110,6 +110,42 @@ static int read_reset_stat(struct ctl_table *table, int write, return 0; } +enum { + SVCRDMA_COUNTER_BUFSIZ = sizeof(unsigned long long), +}; + +static int svcrdma_counter_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct percpu_counter *stat = (struct percpu_counter *)table->data; + char tmp[SVCRDMA_COUNTER_BUFSIZ + 1]; + int len; + + if (write) { + percpu_counter_set(stat, 0); + return 0; + } + + len = snprintf(tmp, SVCRDMA_COUNTER_BUFSIZ, "%lld\n", + percpu_counter_sum_positive(stat)); + if (len >= SVCRDMA_COUNTER_BUFSIZ) + return -EFAULT; + len = strlen(tmp); + if (*ppos > len) { + *lenp = 0; + return 0; + } + len -= *ppos; + if (len > *lenp) + len = *lenp; + if (len) + memcpy(buffer, tmp, len); + *lenp = len; + *ppos += len; + + return 0; +} + static struct ctl_table_header *svcrdma_table_header; static struct ctl_table svcrdma_parm_table[] = { { @@ -149,10 +185,10 @@ static struct ctl_table svcrdma_parm_table[] = { }, { .procname = "rdma_stat_recv", - .data = &rdma_stat_recv, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_recv, + .maxlen = SVCRDMA_COUNTER_BUFSIZ, .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = svcrdma_counter_handler, }, { .procname = "rdma_stat_write", @@ -230,15 +266,26 @@ static void svc_rdma_proc_cleanup(void) return; unregister_sysctl_table(svcrdma_table_header); svcrdma_table_header = NULL; + + percpu_counter_destroy(&svcrdma_stat_recv); } static int svc_rdma_proc_init(void) { + int rc; + if (svcrdma_table_header) return 0; + rc = percpu_counter_init(&svcrdma_stat_recv, 0, GFP_KERNEL); + if (rc) + goto out_err; + svcrdma_table_header = register_sysctl_table(svcrdma_root_table); return 0; + +out_err: + return rc; } void svc_rdma_cleanup(void) diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index cbdb71247755..7d14a74df716 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -845,8 +845,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) } list_del(&ctxt->rc_list); spin_unlock(&rdma_xprt->sc_rq_dto_lock); - - atomic_inc(&rdma_stat_recv); + percpu_counter_inc(&svcrdma_stat_recv); svc_rdma_build_arg_xdr(rqstp, ctxt); From 22df5a22462e836ccb30634c3a52602091179a73 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Dec 2020 15:55:17 -0500 Subject: [PATCH 48/99] svcrdma: Convert rdma_stat_sq_starve to a per-CPU counter Avoid the overhead of a memory bus lock cycle for counting a value that is hardly every used. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 2 +- net/sunrpc/xprtrdma/svc_rdma.c | 13 +++++++++---- net/sunrpc/xprtrdma/svc_rdma_rw.c | 1 + net/sunrpc/xprtrdma/svc_rdma_sendto.c | 2 +- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index ff32c59a27e7..c06b16ccf83e 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -69,7 +69,7 @@ extern unsigned int svcrdma_max_req_size; extern struct percpu_counter svcrdma_stat_recv; extern atomic_t rdma_stat_read; extern atomic_t rdma_stat_write; -extern atomic_t rdma_stat_sq_starve; +extern struct percpu_counter svcrdma_stat_sq_starve; extern atomic_t rdma_stat_rq_starve; extern atomic_t rdma_stat_rq_poll; extern atomic_t rdma_stat_rq_prod; diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 3e5e622bad81..ee768d4c3c90 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -66,7 +66,7 @@ static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH; struct percpu_counter svcrdma_stat_recv; atomic_t rdma_stat_read; atomic_t rdma_stat_write; -atomic_t rdma_stat_sq_starve; +struct percpu_counter svcrdma_stat_sq_starve; atomic_t rdma_stat_rq_starve; atomic_t rdma_stat_rq_poll; atomic_t rdma_stat_rq_prod; @@ -199,10 +199,10 @@ static struct ctl_table svcrdma_parm_table[] = { }, { .procname = "rdma_stat_sq_starve", - .data = &rdma_stat_sq_starve, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_sq_starve, + .maxlen = SVCRDMA_COUNTER_BUFSIZ, .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = svcrdma_counter_handler, }, { .procname = "rdma_stat_rq_starve", @@ -267,6 +267,7 @@ static void svc_rdma_proc_cleanup(void) unregister_sysctl_table(svcrdma_table_header); svcrdma_table_header = NULL; + percpu_counter_destroy(&svcrdma_stat_sq_starve); percpu_counter_destroy(&svcrdma_stat_recv); } @@ -278,6 +279,9 @@ static int svc_rdma_proc_init(void) return 0; rc = percpu_counter_init(&svcrdma_stat_recv, 0, GFP_KERNEL); + if (rc) + goto out_err; + rc = percpu_counter_init(&svcrdma_stat_sq_starve, 0, GFP_KERNEL); if (rc) goto out_err; @@ -285,6 +289,7 @@ static int svc_rdma_proc_init(void) return 0; out_err: + percpu_counter_destroy(&svcrdma_stat_recv); return rc; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 0b63e1321d74..d7d98b2df00b 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -364,6 +364,7 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) return 0; } + percpu_counter_inc(&svcrdma_stat_sq_starve); trace_svcrdma_sq_full(rdma); atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); wait_event(rdma->sc_send_wait, diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 68af79d4f04f..52c759a8543e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -317,7 +317,7 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) /* If the SQ is full, wait until an SQ entry is available */ while (1) { if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) { - atomic_inc(&rdma_stat_sq_starve); + percpu_counter_inc(&svcrdma_stat_sq_starve); trace_svcrdma_sq_full(rdma); atomic_inc(&rdma->sc_sq_avail); wait_event(rdma->sc_send_wait, From 1e7e55731628c90d8c701c45f9c3a3b8718840d6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Dec 2020 16:09:36 -0500 Subject: [PATCH 49/99] svcrdma: Restore read and write stats Now that we have an efficient mechanism to update these two stats, let's start maintaining them again. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 4 ++-- net/sunrpc/xprtrdma/svc_rdma.c | 26 ++++++++++++++++++-------- net/sunrpc/xprtrdma/svc_rdma_rw.c | 2 ++ 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index c06b16ccf83e..c882816cba8e 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -66,10 +66,10 @@ extern unsigned int svcrdma_max_requests; extern unsigned int svcrdma_max_bc_requests; extern unsigned int svcrdma_max_req_size; +extern struct percpu_counter svcrdma_stat_read; extern struct percpu_counter svcrdma_stat_recv; -extern atomic_t rdma_stat_read; -extern atomic_t rdma_stat_write; extern struct percpu_counter svcrdma_stat_sq_starve; +extern struct percpu_counter svcrdma_stat_write; extern atomic_t rdma_stat_rq_starve; extern atomic_t rdma_stat_rq_poll; extern atomic_t rdma_stat_rq_prod; diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index ee768d4c3c90..c8336df7a142 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -63,10 +63,10 @@ unsigned int svcrdma_max_req_size = RPCRDMA_DEF_INLINE_THRESH; static unsigned int min_max_inline = RPCRDMA_DEF_INLINE_THRESH; static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH; +struct percpu_counter svcrdma_stat_read; struct percpu_counter svcrdma_stat_recv; -atomic_t rdma_stat_read; -atomic_t rdma_stat_write; struct percpu_counter svcrdma_stat_sq_starve; +struct percpu_counter svcrdma_stat_write; atomic_t rdma_stat_rq_starve; atomic_t rdma_stat_rq_poll; atomic_t rdma_stat_rq_prod; @@ -178,10 +178,10 @@ static struct ctl_table svcrdma_parm_table[] = { { .procname = "rdma_stat_read", - .data = &rdma_stat_read, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_read, + .maxlen = SVCRDMA_COUNTER_BUFSIZ, .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = svcrdma_counter_handler, }, { .procname = "rdma_stat_recv", @@ -192,10 +192,10 @@ static struct ctl_table svcrdma_parm_table[] = { }, { .procname = "rdma_stat_write", - .data = &rdma_stat_write, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_write, + .maxlen = SVCRDMA_COUNTER_BUFSIZ, .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = svcrdma_counter_handler, }, { .procname = "rdma_stat_sq_starve", @@ -267,8 +267,10 @@ static void svc_rdma_proc_cleanup(void) unregister_sysctl_table(svcrdma_table_header); svcrdma_table_header = NULL; + percpu_counter_destroy(&svcrdma_stat_write); percpu_counter_destroy(&svcrdma_stat_sq_starve); percpu_counter_destroy(&svcrdma_stat_recv); + percpu_counter_destroy(&svcrdma_stat_read); } static int svc_rdma_proc_init(void) @@ -278,10 +280,16 @@ static int svc_rdma_proc_init(void) if (svcrdma_table_header) return 0; + rc = percpu_counter_init(&svcrdma_stat_read, 0, GFP_KERNEL); + if (rc) + goto out_err; rc = percpu_counter_init(&svcrdma_stat_recv, 0, GFP_KERNEL); if (rc) goto out_err; rc = percpu_counter_init(&svcrdma_stat_sq_starve, 0, GFP_KERNEL); + if (rc) + goto out_err; + rc = percpu_counter_init(&svcrdma_stat_write, 0, GFP_KERNEL); if (rc) goto out_err; @@ -289,7 +297,9 @@ static int svc_rdma_proc_init(void) return 0; out_err: + percpu_counter_destroy(&svcrdma_stat_sq_starve); percpu_counter_destroy(&svcrdma_stat_recv); + percpu_counter_destroy(&svcrdma_stat_read); return rc; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index d7d98b2df00b..693d139a8633 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -469,6 +469,7 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, DMA_TO_DEVICE); if (ret < 0) return -EIO; + percpu_counter_inc(&svcrdma_stat_write); list_add(&ctxt->rw_list, &cc->cc_rwctxts); cc->cc_sqecount += ret; @@ -719,6 +720,7 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, segment->rs_handle, DMA_FROM_DEVICE); if (ret < 0) return -EIO; + percpu_counter_inc(&svcrdma_stat_read); list_add(&ctxt->rw_list, &cc->cc_rwctxts); cc->cc_sqecount += ret; From c6226ff9a62a17182b8092883ca201df5cd47f59 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Dec 2020 14:53:09 -0500 Subject: [PATCH 50/99] svcrdma: Deprecate stat variables that are no longer used Clean up. We are not permitted to remove old proc files. Instead, convert these variables to stubs that are only ever allowed to display a value of zero. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 5 -- net/sunrpc/xprtrdma/svc_rdma.c | 84 +++++++++++---------------------- 2 files changed, 27 insertions(+), 62 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index c882816cba8e..1e76ed688044 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -70,11 +70,6 @@ extern struct percpu_counter svcrdma_stat_read; extern struct percpu_counter svcrdma_stat_recv; extern struct percpu_counter svcrdma_stat_sq_starve; extern struct percpu_counter svcrdma_stat_write; -extern atomic_t rdma_stat_rq_starve; -extern atomic_t rdma_stat_rq_poll; -extern atomic_t rdma_stat_rq_prod; -extern atomic_t rdma_stat_sq_poll; -extern atomic_t rdma_stat_sq_prod; struct svcxprt_rdma { struct svc_xprt sc_xprt; /* SVC transport structure */ diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index c8336df7a142..5bc20e9d09cd 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -62,53 +62,13 @@ static unsigned int max_max_requests = 16384; unsigned int svcrdma_max_req_size = RPCRDMA_DEF_INLINE_THRESH; static unsigned int min_max_inline = RPCRDMA_DEF_INLINE_THRESH; static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH; +static unsigned int svcrdma_stat_unused; +static unsigned int zero; struct percpu_counter svcrdma_stat_read; struct percpu_counter svcrdma_stat_recv; struct percpu_counter svcrdma_stat_sq_starve; struct percpu_counter svcrdma_stat_write; -atomic_t rdma_stat_rq_starve; -atomic_t rdma_stat_rq_poll; -atomic_t rdma_stat_rq_prod; -atomic_t rdma_stat_sq_poll; -atomic_t rdma_stat_sq_prod; - -/* - * This function implements reading and resetting an atomic_t stat - * variable through read/write to a proc file. Any write to the file - * resets the associated statistic to zero. Any read returns it's - * current value. - */ -static int read_reset_stat(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - atomic_t *stat = (atomic_t *)table->data; - - if (!stat) - return -EINVAL; - - if (write) - atomic_set(stat, 0); - else { - char str_buf[32]; - int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat)); - if (len >= 32) - return -EFAULT; - len = strlen(str_buf); - if (*ppos > len) { - *lenp = 0; - return 0; - } - len -= *ppos; - if (len > *lenp) - len = *lenp; - if (len) - memcpy(buffer, str_buf, len); - *lenp = len; - *ppos += len; - } - return 0; -} enum { SVCRDMA_COUNTER_BUFSIZ = sizeof(unsigned long long), @@ -206,38 +166,48 @@ static struct ctl_table svcrdma_parm_table[] = { }, { .procname = "rdma_stat_rq_starve", - .data = &rdma_stat_rq_starve, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_unused, + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &zero, }, { .procname = "rdma_stat_rq_poll", - .data = &rdma_stat_rq_poll, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_unused, + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &zero, }, { .procname = "rdma_stat_rq_prod", - .data = &rdma_stat_rq_prod, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_unused, + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &zero, }, { .procname = "rdma_stat_sq_poll", - .data = &rdma_stat_sq_poll, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_unused, + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &zero, }, { .procname = "rdma_stat_sq_prod", - .data = &rdma_stat_sq_prod, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_unused, + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &zero, }, { }, }; From 43042b90cae11cc2d9827c91df6d6b5fe498d5ce Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 8 Dec 2020 13:14:15 -0500 Subject: [PATCH 51/99] svcrdma: Reduce Receive doorbell rate This is similar to commit e340c2d6ef2a ("xprtrdma: Reduce the doorbell rate (Receive)") which added Receive batching to the client. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 1 + net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 82 +++++++++++++------------ 2 files changed, 44 insertions(+), 39 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 1e76ed688044..7c693b31965e 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -104,6 +104,7 @@ struct svcxprt_rdma { wait_queue_head_t sc_send_wait; /* SQ exhaustion waitlist */ unsigned long sc_flags; + u32 sc_pending_recvs; struct list_head sc_read_complete_q; struct work_struct sc_work; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 7d14a74df716..ab0b7e9777bc 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -266,33 +266,46 @@ void svc_rdma_release_rqst(struct svc_rqst *rqstp) svc_rdma_recv_ctxt_put(rdma, ctxt); } -static int __svc_rdma_post_recv(struct svcxprt_rdma *rdma, - struct svc_rdma_recv_ctxt *ctxt) +static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma, + unsigned int wanted, bool temp) { + const struct ib_recv_wr *bad_wr = NULL; + struct svc_rdma_recv_ctxt *ctxt; + struct ib_recv_wr *recv_chain; int ret; - trace_svcrdma_post_recv(ctxt); - ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, NULL); + recv_chain = NULL; + while (wanted--) { + ctxt = svc_rdma_recv_ctxt_get(rdma); + if (!ctxt) + break; + + trace_svcrdma_post_recv(ctxt); + ctxt->rc_temp = temp; + ctxt->rc_recv_wr.next = recv_chain; + recv_chain = &ctxt->rc_recv_wr; + rdma->sc_pending_recvs++; + } + if (!recv_chain) + return false; + + ret = ib_post_recv(rdma->sc_qp, recv_chain, &bad_wr); if (ret) goto err_post; - return 0; + return true; err_post: + while (bad_wr) { + ctxt = container_of(bad_wr, struct svc_rdma_recv_ctxt, + rc_recv_wr); + bad_wr = bad_wr->next; + svc_rdma_recv_ctxt_put(rdma, ctxt); + } + trace_svcrdma_rq_post_err(rdma, ret); - svc_rdma_recv_ctxt_put(rdma, ctxt); - return ret; -} - -static int svc_rdma_post_recv(struct svcxprt_rdma *rdma) -{ - struct svc_rdma_recv_ctxt *ctxt; - - if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) - return 0; - ctxt = svc_rdma_recv_ctxt_get(rdma); - if (!ctxt) - return -ENOMEM; - return __svc_rdma_post_recv(rdma, ctxt); + /* Since we're destroying the xprt, no need to reset + * sc_pending_recvs. */ + return false; } /** @@ -303,20 +316,7 @@ static int svc_rdma_post_recv(struct svcxprt_rdma *rdma) */ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma) { - struct svc_rdma_recv_ctxt *ctxt; - unsigned int i; - int ret; - - for (i = 0; i < rdma->sc_max_requests; i++) { - ctxt = svc_rdma_recv_ctxt_get(rdma); - if (!ctxt) - return false; - ctxt->rc_temp = true; - ret = __svc_rdma_post_recv(rdma, ctxt); - if (ret) - return false; - } - return true; + return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests, true); } /** @@ -324,8 +324,6 @@ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma) * @cq: Completion Queue context * @wc: Work Completion object * - * NB: The svc_xprt/svcxprt_rdma is pinned whenever it's possible that - * the Receive completion handler could be running. */ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) { @@ -333,6 +331,8 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_recv_ctxt *ctxt; + rdma->sc_pending_recvs--; + /* WARNING: Only wc->wr_cqe and wc->status are reliable */ ctxt = container_of(cqe, struct svc_rdma_recv_ctxt, rc_cqe); @@ -340,9 +340,6 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) if (wc->status != IB_WC_SUCCESS) goto flushed; - if (svc_rdma_post_recv(rdma)) - goto post_err; - /* All wc fields are now known to be valid */ ctxt->rc_byte_len = wc->byte_len; ib_dma_sync_single_for_cpu(rdma->sc_pd->device, @@ -356,11 +353,18 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) spin_unlock(&rdma->sc_rq_dto_lock); if (!test_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags)) svc_xprt_enqueue(&rdma->sc_xprt); + + if (!test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags) && + rdma->sc_pending_recvs < rdma->sc_max_requests) + if (!svc_rdma_refresh_recvs(rdma, RPCRDMA_MAX_RECV_BATCH, + false)) + goto post_err; + return; flushed: -post_err: svc_rdma_recv_ctxt_put(rdma, ctxt); +post_err: set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); svc_xprt_enqueue(&rdma->sc_xprt); } From dd2d055b278b20920ab454b233ec76038966788a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 30 Dec 2020 14:08:53 -0500 Subject: [PATCH 52/99] svcrdma: DMA-sync the receive buffer in svc_rdma_recvfrom() The Receive completion handler doesn't look at the contents of the Receive buffer. The DMA sync isn't terribly expensive but it's one less thing that needs to be done by the Receive completion handler, which is single-threaded (per svc_xprt). This helps scalability. Signed-off-by: Chuck Lever Reviewed-by: Christoph Hellwig --- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index ab0b7e9777bc..6d28f23ceb35 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -342,9 +342,6 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) /* All wc fields are now known to be valid */ ctxt->rc_byte_len = wc->byte_len; - ib_dma_sync_single_for_cpu(rdma->sc_pd->device, - ctxt->rc_recv_sge.addr, - wc->byte_len, DMA_FROM_DEVICE); spin_lock(&rdma->sc_rq_dto_lock); list_add_tail(&ctxt->rc_list, &rdma->sc_rq_dto_q); @@ -851,6 +848,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) spin_unlock(&rdma_xprt->sc_rq_dto_lock); percpu_counter_inc(&svcrdma_stat_recv); + ib_dma_sync_single_for_cpu(rdma_xprt->sc_pd->device, + ctxt->rc_recv_sge.addr, ctxt->rc_byte_len, + DMA_FROM_DEVICE); svc_rdma_build_arg_xdr(rqstp, ctxt); /* Prevent svc_xprt_release from releasing pages in rq_pages From 4ff923ce1e104c27b55f123ca9dbaa31fdb468ad Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 3 Jan 2021 14:37:23 -0500 Subject: [PATCH 53/99] SUNRPC: Correct a comment Clean up: The rq_argpages field was removed from struct svc_rqst in the pre-git era. Signed-off-by: Chuck Lever --- net/sunrpc/svc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 4187745887f0..61fb8a18552c 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -559,7 +559,7 @@ EXPORT_SYMBOL_GPL(svc_destroy); /* * Allocate an RPC server's buffer space. - * We allocate pages and place them in rq_argpages. + * We allocate pages and place them in rq_pages. */ static int svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node) From 33311873adb0d55c287b164117b5b4bb7b1bdc40 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 21 Jan 2021 17:57:37 -0500 Subject: [PATCH 54/99] nfsd4: simplify process_lookup1 This STALE_CLIENTID check is redundant with the one in lookup_clientid(). There's a difference in behavior is in case of memory allocation failure, which I think isn't a big deal. Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1d2cd6a88f61..f9f89229dba6 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4680,8 +4680,6 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, struct nfs4_openowner *oo = NULL; __be32 status; - if (STALE_CLIENTID(&open->op_clientid, nn)) - return nfserr_stale_clientid; /* * In case we need it later, after we've already created the * file and don't want to risk a further failure: From a9d53a75cf574d6aa41f3cb4968fffe4f64e0fad Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 21 Jan 2021 17:57:38 -0500 Subject: [PATCH 55/99] nfsd: simplify process_lock Similarly, this STALE_CLIENTID check is already handled by: nfs4_preprocess_confirmed_seqid_op()-> nfs4_preprocess_seqid_op()-> nfsd4_lookup_stateid()-> set_client()-> STALE_CLIENTID() (This may cause it to return a different error in some cases where there are multiple things wrong; pynfs test SEQ10 regressed on this commit because of that, but I think that's the test's fault, and I've fixed it separately.) Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index f9f89229dba6..7ea63d7cec4d 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6697,10 +6697,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &cstate->session->se_client->cl_clientid, sizeof(clientid_t)); - status = nfserr_stale_clientid; - if (STALE_CLIENTID(&lock->lk_new_clientid, nn)) - goto out; - /* validate and update open stateid and open seqid */ status = nfs4_preprocess_confirmed_seqid_op(cstate, lock->lk_new_open_seqid, From b4587eb2cf4b6271f67fb93b75f7de2a2026e853 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 21 Jan 2021 17:57:39 -0500 Subject: [PATCH 56/99] nfsd: simplify nfsd_renew You can take the single-exit thing too far, I think. Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 7ea63d7cec4d..ba955bbf21df 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5300,15 +5300,12 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, trace_nfsd_clid_renew(clid); status = lookup_clientid(clid, cstate, nn, false); if (status) - goto out; + return status; clp = cstate->clp; - status = nfserr_cb_path_down; if (!list_empty(&clp->cl_delegations) && clp->cl_cb_state != NFSD4_CB_UP) - goto out; - status = nfs_ok; -out: - return status; + return nfserr_cb_path_down; + return nfs_ok; } void From 460d27091ae2c23e7ac959a61cd481c58832db58 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 21 Jan 2021 17:57:40 -0500 Subject: [PATCH 57/99] nfsd: rename lookup_clientid->set_client I think this is a better name, and I'm going to reuse elsewhere the code that does the lookup itself. Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ba955bbf21df..4bdd90074e24 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4633,7 +4633,7 @@ static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4 return nfserr_bad_seqid; } -static __be32 lookup_clientid(clientid_t *clid, +static __be32 set_client(clientid_t *clid, struct nfsd4_compound_state *cstate, struct nfsd_net *nn, bool sessions) @@ -4688,7 +4688,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, if (open->op_file == NULL) return nfserr_jukebox; - status = lookup_clientid(clientid, cstate, nn, false); + status = set_client(clientid, cstate, nn, false); if (status) return status; clp = cstate->clp; @@ -5298,7 +5298,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); trace_nfsd_clid_renew(clid); - status = lookup_clientid(clid, cstate, nn, false); + status = set_client(clid, cstate, nn, false); if (status) return status; clp = cstate->clp; @@ -5681,8 +5681,7 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || CLOSE_STATEID(stateid)) return nfserr_bad_stateid; - status = lookup_clientid(&stateid->si_opaque.so_clid, cstate, nn, - false); + status = set_client(&stateid->si_opaque.so_clid, cstate, nn, false); if (status == nfserr_stale_clientid) { if (cstate->session) return nfserr_bad_stateid; @@ -5821,7 +5820,7 @@ static __be32 find_cpntf_state(struct nfsd_net *nn, stateid_t *st, cps->cpntf_time = ktime_get_boottime_seconds(); memset(&cstate, 0, sizeof(cstate)); - status = lookup_clientid(&cps->cp_p_clid, &cstate, nn, true); + status = set_client(&cps->cp_p_clid, &cstate, nn, true); if (status) goto out; status = nfsd4_lookup_stateid(&cstate, &cps->cp_p_stateid, @@ -6900,8 +6899,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_inval; if (!nfsd4_has_session(cstate)) { - status = lookup_clientid(&lockt->lt_clientid, cstate, nn, - false); + status = set_client(&lockt->lt_clientid, cstate, nn, false); if (status) goto out; } @@ -7085,7 +7083,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", clid->cl_boot, clid->cl_id); - status = lookup_clientid(clid, cstate, nn, false); + status = set_client(clid, cstate, nn, false); if (status) return status; @@ -7232,7 +7230,7 @@ nfs4_check_open_reclaim(clientid_t *clid, __be32 status; /* find clientid in conf_id_hashtbl */ - status = lookup_clientid(clid, cstate, nn, false); + status = set_client(clid, cstate, nn, false); if (status) return nfserr_reclaim_bad; From 7950b5316e40d99dcb85ab81a2d1dbb913d7c1c8 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 21 Jan 2021 17:57:41 -0500 Subject: [PATCH 58/99] nfsd: refactor set_client This'll be useful elsewhere. Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 4bdd90074e24..c74bf3b5b0de 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4633,40 +4633,40 @@ static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4 return nfserr_bad_seqid; } +static struct nfs4_client *lookup_clientid(clientid_t *clid, bool sessions, + struct nfsd_net *nn) +{ + struct nfs4_client *found; + + spin_lock(&nn->client_lock); + found = find_confirmed_client(clid, sessions, nn); + if (found) + atomic_inc(&found->cl_rpc_users); + spin_unlock(&nn->client_lock); + return found; +} + static __be32 set_client(clientid_t *clid, struct nfsd4_compound_state *cstate, struct nfsd_net *nn, bool sessions) { - struct nfs4_client *found; - if (cstate->clp) { - found = cstate->clp; - if (!same_clid(&found->cl_clientid, clid)) + if (!same_clid(&cstate->clp->cl_clientid, clid)) return nfserr_stale_clientid; return nfs_ok; } - if (STALE_CLIENTID(clid, nn)) return nfserr_stale_clientid; - /* * For v4.1+ we get the client in the SEQUENCE op. If we don't have one * cached already then we know this is for is for v4.0 and "sessions" * will be false. */ WARN_ON_ONCE(cstate->session); - spin_lock(&nn->client_lock); - found = find_confirmed_client(clid, sessions, nn); - if (!found) { - spin_unlock(&nn->client_lock); + cstate->clp = lookup_clientid(clid, sessions, nn); + if (!cstate->clp) return nfserr_expired; - } - atomic_inc(&found->cl_rpc_users); - spin_unlock(&nn->client_lock); - - /* Cache the nfs4_client in cstate! */ - cstate->clp = found; return nfs_ok; } From 47fdb22dacae78f37701d82a94c16a014186d34e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 21 Jan 2021 17:57:42 -0500 Subject: [PATCH 59/99] nfsd: find_cpntf_state cleanup I think this unusual use of struct compound_state could cause confusion. It's not that much more complicated just to open-code this stateid lookup. The only change in behavior should be a different error return in the case the copy is using a source stateid that is a revoked delegation, but I doubt that matters. Signed-off-by: J. Bruce Fields [ cel: squashed in fix reported by Coverity ] Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c74bf3b5b0de..b7a5ac5a81ac 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5812,21 +5812,27 @@ static __be32 find_cpntf_state(struct nfsd_net *nn, stateid_t *st, { __be32 status; struct nfs4_cpntf_state *cps = NULL; - struct nfsd4_compound_state cstate; + struct nfs4_client *found; status = manage_cpntf_state(nn, st, NULL, &cps); if (status) return status; cps->cpntf_time = ktime_get_boottime_seconds(); - memset(&cstate, 0, sizeof(cstate)); - status = set_client(&cps->cp_p_clid, &cstate, nn, true); - if (status) + + status = nfserr_expired; + found = lookup_clientid(&cps->cp_p_clid, true, nn); + if (!found) goto out; - status = nfsd4_lookup_stateid(&cstate, &cps->cp_p_stateid, - NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, - stid, nn); - put_client_renew(cstate.clp); + + *stid = find_stateid_by_type(found, &cps->cp_p_stateid, + NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID); + if (*stid) + status = nfs_ok; + else + status = nfserr_bad_stateid; + + put_client_renew(found); out: nfs4_put_cpntf_state(nn, cps); return status; From 632faca72938f9f63049e48a8c438913828ac7a9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 23 Dec 2020 11:44:25 -0800 Subject: [PATCH 60/99] f2fs: handle unallocated section and zone on pinned/atgc If we have large section/zone, unallocated segment makes them corrupted. E.g., - Pinned file: -1 119304647 119304647 - ATGC data: -1 119304647 119304647 Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index e81eb0748e2a..229814b4f4a6 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -101,11 +101,11 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, #define BLKS_PER_SEC(sbi) \ ((sbi)->segs_per_sec * (sbi)->blocks_per_seg) #define GET_SEC_FROM_SEG(sbi, segno) \ - ((segno) / (sbi)->segs_per_sec) + (((segno) == -1) ? -1: (segno) / (sbi)->segs_per_sec) #define GET_SEG_FROM_SEC(sbi, secno) \ ((secno) * (sbi)->segs_per_sec) #define GET_ZONE_FROM_SEC(sbi, secno) \ - ((secno) / (sbi)->secs_per_zone) + (((secno) == -1) ? -1: (secno) / (sbi)->secs_per_zone) #define GET_ZONE_FROM_SEG(sbi, segno) \ GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) From 36218b81f094648d929994399eb6eb5c97b991e5 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Tue, 22 Dec 2020 21:34:15 +0800 Subject: [PATCH 61/99] f2fs: Replace expression with offsetof() Use the existing offsetof() macro instead of duplicating code. Signed-off-by: Zheng Yongjun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3a24423ac65f..5e3fabacefb5 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2696,7 +2696,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) src = F2FS_INODE(page); dst = F2FS_INODE(ipage); - memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src); + memcpy(dst, src, offsetof(struct f2fs_inode, i_ext)); dst->i_size = 0; dst->i_blocks = cpu_to_le64(1); dst->i_links = cpu_to_le32(1); From a28d9aa1a2c7c774c38f2da1a662434bc29cb98e Mon Sep 17 00:00:00 2001 From: Weichao Guo Date: Mon, 14 Dec 2020 11:54:53 +0800 Subject: [PATCH 62/99] f2fs: fix to set inode->i_mode correctly for posix_acl_update_mode We should update the ~S_IRWXUGO part of inode->i_mode in __setattr_copy, because posix_acl_update_mode updates mode based on inode->i_mode, which finally overwrites the ~S_IRWXUGO part of i_acl_mode with old i_mode. Testcase to reproduce this bug: 0. adduser abc 1. mkfs.f2fs /dev/sdd 2. mount -t f2fs /dev/sdd /mnt/f2fs 3. mkdir /mnt/f2fs/test 4. setfacl -m u:abc:r /mnt/f2fs/test 5. chmod +s /mnt/f2fs/test Signed-off-by: Weichao Guo Signed-off-by: Bin Shu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f585545277d7..eced14882fc1 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -850,6 +850,7 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) mode &= ~S_ISGID; + inode->i_mode = (inode->i_mode & S_IRWXUGO) | (mode & ~S_IRWXUGO); set_acl_inode(inode, mode); } } From 17232e830afb800acdcc22ae8980bf9d330393ef Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 25 Dec 2020 16:52:27 +0800 Subject: [PATCH 63/99] f2fs: enhance to update i_mode and acl atomically in f2fs_setattr() Previously, in f2fs_setattr(), we don't update S_ISUID|S_ISGID|S_ISVTX bits with S_IRWXUGO bits and acl entries atomically, so in error path, chmod() may partially success, this patch enhances to make chmod() flow being atomical. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 23 ++++++++++++++++++++++- fs/f2fs/file.c | 7 ++++--- fs/f2fs/xattr.c | 15 +++++++++------ 3 files changed, 35 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 1e5e9b1136ee..732ec10e7890 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -200,6 +200,27 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) return __f2fs_get_acl(inode, type, NULL); } +static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p, + struct posix_acl **acl) +{ + umode_t mode = inode->i_mode; + int error; + + if (is_inode_flag_set(inode, FI_ACL_MODE)) + mode = F2FS_I(inode)->i_acl_mode; + + error = posix_acl_equiv_mode(*acl, &mode); + if (error < 0) + return error; + if (error == 0) + *acl = NULL; + if (!in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + mode &= ~S_ISGID; + *mode_p = mode; + return 0; +} + static int __f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl, struct page *ipage) { @@ -213,7 +234,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl && !ipage) { - error = posix_acl_update_mode(inode, &mode, &acl); + error = f2fs_acl_update_mode(inode, &mode, &acl); if (error) return error; set_acl_inode(inode, mode); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index eced14882fc1..2ddc4baaf173 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -850,7 +850,6 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) mode &= ~S_ISGID; - inode->i_mode = (inode->i_mode & S_IRWXUGO) | (mode & ~S_IRWXUGO); set_acl_inode(inode, mode); } } @@ -950,8 +949,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_MODE) { err = posix_acl_chmod(inode, f2fs_get_inode_mode(inode)); - if (err || is_inode_flag_set(inode, FI_ACL_MODE)) { - inode->i_mode = F2FS_I(inode)->i_acl_mode; + + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + if (!err) + inode->i_mode = F2FS_I(inode)->i_acl_mode; clear_inode_flag(inode, FI_ACL_MODE); } } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 65afcc3cc68a..2086bef6c154 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -673,7 +673,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, } if (value && f2fs_xattr_value_same(here, value, size)) - goto exit; + goto same; } else if ((flags & XATTR_REPLACE)) { error = -ENODATA; goto exit; @@ -738,17 +738,20 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (error) goto exit; - if (is_inode_flag_set(inode, FI_ACL_MODE)) { - inode->i_mode = F2FS_I(inode)->i_acl_mode; - inode->i_ctime = current_time(inode); - clear_inode_flag(inode, FI_ACL_MODE); - } if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); f2fs_mark_inode_dirty_sync(inode, true); if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); + +same: + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + inode->i_mode = F2FS_I(inode)->i_acl_mode; + inode->i_ctime = current_time(inode); + clear_inode_flag(inode, FI_ACL_MODE); + } + exit: kfree(base_addr); return error; From e0fcd01510ad025c9bbce704c5c2579294056141 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 26 Dec 2020 18:07:01 +0800 Subject: [PATCH 64/99] f2fs: enforce the immutable flag on open files This patch ports commit 02b016ca7f99 ("ext4: enforce the immutable flag on open files") to f2fs. According to the chattr man page, "a file with the 'i' attribute cannot be modified..." Historically, this was only enforced when the file was opened, per the rest of the description, "... and the file can not be opened in write mode". There is general agreement that we should standardize all file systems to prevent modifications even for files that were opened at the time the immutable flag is set. Eventually, a change to enforce this at the VFS layer should be landing in mainline. Cc: stable@kernel.org Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 2ddc4baaf173..d57b54643918 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -60,6 +60,9 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) bool need_alloc = true; int err = 0; + if (unlikely(IS_IMMUTABLE(inode))) + return VM_FAULT_SIGBUS; + if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; goto err; @@ -865,6 +868,14 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + if (unlikely(IS_APPEND(inode) && + (attr->ia_valid & (ATTR_MODE | ATTR_UID | + ATTR_GID | ATTR_TIMES_SET)))) + return -EPERM; + if ((attr->ia_valid & ATTR_SIZE) && !f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; @@ -4351,6 +4362,11 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); } + if (unlikely(IS_IMMUTABLE(inode))) { + ret = -EPERM; + goto unlock; + } + ret = generic_write_checks(iocb, from); if (ret > 0) { bool preallocated = false; @@ -4415,6 +4431,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret > 0) f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); } +unlock: inode_unlock(inode); out: trace_f2fs_file_write_iter(inode, iocb->ki_pos, From 0b979f1bded3e6808184842133e6afeba312a4ff Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 26 Dec 2020 18:07:41 +0800 Subject: [PATCH 65/99] f2fs: relocate f2fs_precache_extents() Relocate f2fs_precache_extents() in prior to check_swap_activate(), then extent cache can be enabled before its use. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index aa34d620bec9..57b9aab2b142 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -4108,12 +4108,13 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, if (!f2fs_disable_compressed_file(inode)) return -EINVAL; + f2fs_precache_extents(inode); + ret = check_swap_activate(sis, file, span); if (ret < 0) return ret; set_inode_flag(inode, FI_PIN_FILE); - f2fs_precache_extents(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; } From 32be0e97c71366a19d11d1965e3f0957ea0be609 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 22 Jan 2021 17:40:13 +0800 Subject: [PATCH 66/99] f2fs: compress: deny setting unsupported compress algorithm If kernel doesn't support certain kinds of compress algorithm, deny to set them as compress algorithm of f2fs via 'compress_algorithm=%s' mount option. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b4a07fe62d1a..a275bd312ae5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -882,17 +882,33 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) if (!name) return -ENOMEM; if (!strcmp(name, "lzo")) { +#ifdef CONFIG_F2FS_FS_LZO F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZO; +#else + f2fs_info(sbi, "kernel doesn't support lzo compression"); +#endif } else if (!strcmp(name, "lz4")) { +#ifdef CONFIG_F2FS_FS_LZ4 F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; +#else + f2fs_info(sbi, "kernel doesn't support lz4 compression"); +#endif } else if (!strcmp(name, "zstd")) { +#ifdef CONFIG_F2FS_FS_ZSTD F2FS_OPTION(sbi).compress_algorithm = COMPRESS_ZSTD; +#else + f2fs_info(sbi, "kernel doesn't support zstd compression"); +#endif } else if (!strcmp(name, "lzo-rle")) { +#ifdef CONFIG_F2FS_FS_LZORLE F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZORLE; +#else + f2fs_info(sbi, "kernel doesn't support lzorle compression"); +#endif } else { kfree(name); return -EINVAL; From 3fde13f817e23f05ce407d136325df4cbc913e67 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 22 Jan 2021 17:46:43 +0800 Subject: [PATCH 67/99] f2fs: compress: support compress level Expand 'compress_algorithm' mount option to accept parameter as format of :, by this way, it gives a way to allow user to do more specified config on lz4 and zstd compression level, then f2fs compression can provide higher compress ratio. In order to set compress level for lz4 algorithm, it needs to set CONFIG_LZ4HC_COMPRESS and CONFIG_F2FS_FS_LZ4HC config to enable lz4hc compress algorithm. CR and performance number on lz4/lz4hc algorithm: dd if=enwik9 of=compressed_file conv=fsync Original blocks: 244382 lz4 lz4hc-9 compressed blocks 170647 163270 compress ratio 69.8% 66.8% speed 16.4207 s, 60.9 MB/s 26.7299 s, 37.4 MB/s compress ratio = after / before Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 5 ++ fs/f2fs/Kconfig | 10 ++++ fs/f2fs/compress.c | 41 +++++++++++++- fs/f2fs/f2fs.h | 9 +++ fs/f2fs/super.c | 89 +++++++++++++++++++++++++++++- include/linux/f2fs_fs.h | 3 + 6 files changed, 152 insertions(+), 5 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index dae15c96e659..5eff4009e77e 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -249,6 +249,11 @@ checkpoint=%s[:%u[%]] Set to "disable" to turn off checkpointing. Set to "enabl This space is reclaimed once checkpoint=enable. compress_algorithm=%s Control compress algorithm, currently f2fs supports "lzo", "lz4", "zstd" and "lzo-rle" algorithm. +compress_algorithm=%s:%d Control compress algorithm and its compress level, now, only + "lz4" and "zstd" support compress level config. + algorithm level range + lz4 3 - 16 + zstd 1 - 22 compress_log_size=%u Support configuring compress cluster size, the size will be 4KB * (1 << %u), 16KB is minimum size, also it's default size. diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index d13c5c6a9787..63c1fc1a0e3b 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -119,6 +119,16 @@ config F2FS_FS_LZ4 help Support LZ4 compress algorithm, if unsure, say Y. +config F2FS_FS_LZ4HC + bool "LZ4HC compression support" + depends on F2FS_FS_COMPRESSION + depends on F2FS_FS_LZ4 + select LZ4HC_COMPRESS + default y + help + Support LZ4HC compress algorithm, LZ4HC has compatible on-disk + layout with LZ4, if unsure, say Y. + config F2FS_FS_ZSTD bool "ZSTD compression support" depends on F2FS_FS_COMPRESSION diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 4bcbacfe3325..a345a41e2119 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -252,8 +252,14 @@ static const struct f2fs_compress_ops f2fs_lzo_ops = { #ifdef CONFIG_F2FS_FS_LZ4 static int lz4_init_compress_ctx(struct compress_ctx *cc) { - cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), - LZ4_MEM_COMPRESS, GFP_NOFS); + unsigned int size = LZ4_MEM_COMPRESS; + +#ifdef CONFIG_F2FS_FS_LZ4HC + if (F2FS_I(cc->inode)->i_compress_flag >> COMPRESS_LEVEL_OFFSET) + size = LZ4HC_MEM_COMPRESS; +#endif + + cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), size, GFP_NOFS); if (!cc->private) return -ENOMEM; @@ -272,10 +278,34 @@ static void lz4_destroy_compress_ctx(struct compress_ctx *cc) cc->private = NULL; } +#ifdef CONFIG_F2FS_FS_LZ4HC +static int lz4hc_compress_pages(struct compress_ctx *cc) +{ + unsigned char level = F2FS_I(cc->inode)->i_compress_flag >> + COMPRESS_LEVEL_OFFSET; + int len; + + if (level) + len = LZ4_compress_HC(cc->rbuf, cc->cbuf->cdata, cc->rlen, + cc->clen, level, cc->private); + else + len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, + cc->clen, cc->private); + if (!len) + return -EAGAIN; + + cc->clen = len; + return 0; +} +#endif + static int lz4_compress_pages(struct compress_ctx *cc) { int len; +#ifdef CONFIG_F2FS_FS_LZ4HC + return lz4hc_compress_pages(cc); +#endif len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, cc->clen, cc->private); if (!len) @@ -325,8 +355,13 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc) ZSTD_CStream *stream; void *workspace; unsigned int workspace_size; + unsigned char level = F2FS_I(cc->inode)->i_compress_flag >> + COMPRESS_LEVEL_OFFSET; - params = ZSTD_getParams(F2FS_ZSTD_DEFAULT_CLEVEL, cc->rlen, 0); + if (!level) + level = F2FS_ZSTD_DEFAULT_CLEVEL; + + params = ZSTD_getParams(level, cc->rlen, 0); workspace_size = ZSTD_CStreamWorkspaceBound(params.cParams); workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode), diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index bb11759191dc..36012181c17f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -146,6 +146,7 @@ struct f2fs_mount_info { /* For compression */ unsigned char compress_algorithm; /* algorithm type */ unsigned char compress_log_size; /* cluster log size */ + unsigned char compress_level; /* compress level */ bool compress_chksum; /* compressed data chksum */ unsigned char compress_ext_cnt; /* extension count */ int compress_mode; /* compression mode */ @@ -735,6 +736,7 @@ struct f2fs_inode_info { atomic_t i_compr_blocks; /* # of compressed blocks */ unsigned char i_compress_algorithm; /* algorithm type */ unsigned char i_log_cluster_size; /* log of cluster size */ + unsigned char i_compress_level; /* compress level (lz4hc,zstd) */ unsigned short i_compress_flag; /* compress flag */ unsigned int i_cluster_size; /* cluster size */ }; @@ -1310,6 +1312,8 @@ struct compress_data { #define F2FS_COMPRESSED_PAGE_MAGIC 0xF5F2C000 +#define COMPRESS_LEVEL_OFFSET 8 + /* compress context */ struct compress_ctx { struct inode *inode; /* inode the context belong to */ @@ -3934,6 +3938,11 @@ static inline void set_compress_context(struct inode *inode) 1 << COMPRESS_CHKSUM : 0; F2FS_I(inode)->i_cluster_size = 1 << F2FS_I(inode)->i_log_cluster_size; + if (F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 && + F2FS_OPTION(sbi).compress_level) + F2FS_I(inode)->i_compress_flag |= + F2FS_OPTION(sbi).compress_level << + COMPRESS_LEVEL_OFFSET; F2FS_I(inode)->i_flags |= F2FS_COMPR_FL; set_inode_flag(inode, FI_COMPRESSED_FILE); stat_inc_compr_inode(inode); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a275bd312ae5..c8be27a9eed6 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include "f2fs.h" #include "node.h" @@ -464,6 +466,74 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, return 0; } +#ifdef CONFIG_F2FS_FS_COMPRESSION +#ifdef CONFIG_F2FS_FS_LZ4 +static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) +{ +#ifdef CONFIG_F2FS_FS_LZ4HC + unsigned int level; +#endif + + if (strlen(str) == 3) { + F2FS_OPTION(sbi).compress_level = 0; + return 0; + } + +#ifdef CONFIG_F2FS_FS_LZ4HC + str += 3; + + if (str[0] != ':') { + f2fs_info(sbi, "wrong format, e.g. :"); + return -EINVAL; + } + if (kstrtouint(str + 1, 10, &level)) + return -EINVAL; + + if (level < LZ4HC_MIN_CLEVEL || level > LZ4HC_MAX_CLEVEL) { + f2fs_info(sbi, "invalid lz4hc compress level: %d", level); + return -EINVAL; + } + + F2FS_OPTION(sbi).compress_level = level; + return 0; +#else + f2fs_info(sbi, "kernel doesn't support lz4hc compression"); + return -EINVAL; +#endif +} +#endif + +#ifdef CONFIG_F2FS_FS_ZSTD +static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) +{ + unsigned int level; + int len = 4; + + if (strlen(str) == len) { + F2FS_OPTION(sbi).compress_level = 0; + return 0; + } + + str += len; + + if (str[0] != ':') { + f2fs_info(sbi, "wrong format, e.g. :"); + return -EINVAL; + } + if (kstrtouint(str + 1, 10, &level)) + return -EINVAL; + + if (!level || level > ZSTD_maxCLevel()) { + f2fs_info(sbi, "invalid zstd compress level: %d", level); + return -EINVAL; + } + + F2FS_OPTION(sbi).compress_level = level; + return 0; +} +#endif +#endif + static int parse_options(struct super_block *sb, char *options, bool is_remount) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -883,20 +953,31 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) return -ENOMEM; if (!strcmp(name, "lzo")) { #ifdef CONFIG_F2FS_FS_LZO + F2FS_OPTION(sbi).compress_level = 0; F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZO; #else f2fs_info(sbi, "kernel doesn't support lzo compression"); #endif - } else if (!strcmp(name, "lz4")) { + } else if (!strncmp(name, "lz4", 3)) { #ifdef CONFIG_F2FS_FS_LZ4 + ret = f2fs_set_lz4hc_level(sbi, name); + if (ret) { + kfree(name); + return -EINVAL; + } F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; #else f2fs_info(sbi, "kernel doesn't support lz4 compression"); #endif - } else if (!strcmp(name, "zstd")) { + } else if (!strncmp(name, "zstd", 4)) { #ifdef CONFIG_F2FS_FS_ZSTD + ret = f2fs_set_zstd_level(sbi, name); + if (ret) { + kfree(name); + return -EINVAL; + } F2FS_OPTION(sbi).compress_algorithm = COMPRESS_ZSTD; #else @@ -904,6 +985,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) #endif } else if (!strcmp(name, "lzo-rle")) { #ifdef CONFIG_F2FS_FS_LZORLE + F2FS_OPTION(sbi).compress_level = 0; F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZORLE; #else @@ -1555,6 +1637,9 @@ static inline void f2fs_show_compress_options(struct seq_file *seq, } seq_printf(seq, ",compress_algorithm=%s", algtype); + if (F2FS_OPTION(sbi).compress_level) + seq_printf(seq, ":%d", F2FS_OPTION(sbi).compress_level); + seq_printf(seq, ",compress_log_size=%u", F2FS_OPTION(sbi).compress_log_size); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 7dc2a06cf19a..c6cc0a566ef5 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -274,6 +274,9 @@ struct f2fs_inode { __u8 i_compress_algorithm; /* compress algorithm */ __u8 i_log_cluster_size; /* log of cluster size */ __le16 i_compress_flag; /* compress flag */ + /* 0 bit: chksum flag + * [10,15] bits: compress level + */ __le32 i_extra_end[0]; /* for attribute size calculation */ } __packed; __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ From 5d4daa579e56adc97fb77c7dfda6c1f747c9ef25 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 9 Dec 2020 16:43:27 +0800 Subject: [PATCH 68/99] f2fs: introduce a new per-sb directory in sysfs Add a new directory 'stat' in path of /sys/fs/f2fs//, later we can add new readonly stat sysfs file into this directory, it will make directory less mess. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 +++- fs/f2fs/sysfs.c | 69 +++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 68 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 36012181c17f..ccbbf86d14e5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1545,9 +1545,12 @@ struct f2fs_sb_info { unsigned int node_io_flag; /* For sysfs suppport */ - struct kobject s_kobj; + struct kobject s_kobj; /* /sys/fs/f2fs/ */ struct completion s_kobj_unregister; + struct kobject s_stat_kobj; /* /sys/fs/f2fs//stat */ + struct completion s_stat_kobj_unregister; + /* For shrinker support */ struct list_head s_list; int s_ndevs; /* number of devices */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 30bae57428d1..bd1174ed2e6f 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -702,6 +702,11 @@ static struct attribute *f2fs_feat_attrs[] = { }; ATTRIBUTE_GROUPS(f2fs_feat); +static struct attribute *f2fs_stat_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(f2fs_stat); + static const struct sysfs_ops f2fs_attr_ops = { .show = f2fs_attr_show, .store = f2fs_attr_store, @@ -730,6 +735,44 @@ static struct kobject f2fs_feat = { .kset = &f2fs_kset, }; +static ssize_t f2fs_stat_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t f2fs_stat_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->store ? a->store(a, sbi, buf, len) : 0; +} + +static void f2fs_stat_kobj_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + complete(&sbi->s_stat_kobj_unregister); +} + +static const struct sysfs_ops f2fs_stat_attr_ops = { + .show = f2fs_stat_attr_show, + .store = f2fs_stat_attr_store, +}; + +static struct kobj_type f2fs_stat_ktype = { + .default_groups = f2fs_stat_groups, + .sysfs_ops = &f2fs_stat_attr_ops, + .release = f2fs_stat_kobj_release, +}; + static int __maybe_unused segment_info_seq_show(struct seq_file *seq, void *offset) { @@ -936,11 +979,15 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) init_completion(&sbi->s_kobj_unregister); err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL, "%s", sb->s_id); - if (err) { - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); - return err; - } + if (err) + goto put_sb_kobj; + + sbi->s_stat_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_stat_kobj_unregister); + err = kobject_init_and_add(&sbi->s_stat_kobj, &f2fs_stat_ktype, + &sbi->s_kobj, "stat"); + if (err) + goto put_stat_kobj; if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -956,6 +1003,13 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) victim_bits_seq_show, sb); } return 0; +put_stat_kobj: + kobject_put(&sbi->s_stat_kobj); + wait_for_completion(&sbi->s_stat_kobj_unregister); +put_sb_kobj: + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + return err; } void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) @@ -967,6 +1021,11 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) remove_proc_entry("victim_bits", sbi->s_proc); remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } + + kobject_del(&sbi->s_stat_kobj); + kobject_put(&sbi->s_stat_kobj); + wait_for_completion(&sbi->s_stat_kobj_unregister); + kobject_del(&sbi->s_kobj); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); From 0953fe864c4d05f5a5cde626a630a76918cf4f9c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 14 Dec 2020 17:20:57 +0800 Subject: [PATCH 69/99] f2fs: fix to tag FIEMAP_EXTENT_MERGED in f2fs_fiemap() f2fs does not natively support extents in metadata, 'extent' in f2fs is used as a virtual concept, so in f2fs_fiemap() interface, it needs to tag FIEMAP_EXTENT_MERGED flag to indicated the extent status is a result of merging. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 57b9aab2b142..547c9d4b430b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1964,6 +1964,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } if (size) { + flags |= FIEMAP_EXTENT_MERGED; if (IS_ENCRYPTED(inode)) flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; From 2562515f0ad7342bde6456602c491b64c63fe950 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 16 Dec 2020 17:15:23 +0800 Subject: [PATCH 70/99] f2fs: fix out-of-repair __setattr_copy() __setattr_copy() was copied from setattr_copy() in fs/attr.c, there is two missing patches doesn't cover this inner function, fix it. Commit 7fa294c8991c ("userns: Allow chown and setgid preservation") Commit 23adbe12ef7d ("fs,userns: Change inode_capable to capable_wrt_inode_uidgid") Fixes: fbfa2cc58d53 ("f2fs: add file operations") Cc: stable@vger.kernel.org Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d57b54643918..4e6d4b9120a8 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -851,7 +851,8 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + if (!in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) mode &= ~S_ISGID; set_acl_inode(inode, mode); } From cf7404036019fada99d99ea01f49cb5c3142099d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 30 Dec 2020 16:38:35 +0800 Subject: [PATCH 71/99] f2fs: trival cleanup in move_data_block() Trival cleanups: - relocate set_summary() before its use - relocate "allocate block address" to correct place - remove unneeded f2fs_wait_on_page_writeback() Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3ef84e6ded41..39330ad3c44e 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1169,8 +1169,6 @@ static int move_data_block(struct inode *inode, block_t bidx, if (err) goto put_out; - set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); - /* read page */ fio.page = page; fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; @@ -1207,6 +1205,9 @@ static int move_data_block(struct inode *inode, block_t bidx, } } + set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); + + /* allocate block address */ f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, &sum, type, NULL); @@ -1233,9 +1234,6 @@ static int move_data_block(struct inode *inode, block_t bidx, set_page_writeback(fio.encrypted_page); ClearPageError(page); - /* allocate block address */ - f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true); - fio.op = REQ_OP_WRITE; fio.op_flags = REQ_SYNC; fio.new_blkaddr = newaddr; From 7f59b277f79e8aacaa2ec7e549be6c27985c27f2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 4 Jan 2021 22:33:02 -0800 Subject: [PATCH 72/99] f2fs: clean up post-read processing Rework the post-read processing logic to be much easier to understand. At least one bug is fixed by this: if an I/O error occurred when reading from disk, decryption and verity would be performed on the uninitialized data, causing misleading messages in the kernel log. Signed-off-by: Eric Biggers Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 149 +++++++++++++----- fs/f2fs/data.c | 375 ++++++++++++++++++--------------------------- fs/f2fs/f2fs.h | 55 ++++++- 3 files changed, 306 insertions(+), 273 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index a345a41e2119..1696f9183ff5 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -756,38 +756,27 @@ static int f2fs_compress_pages(struct compress_ctx *cc) return ret; } -void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) +static void f2fs_decompress_cluster(struct decompress_io_ctx *dic) { - struct decompress_io_ctx *dic = - (struct decompress_io_ctx *)page_private(page); struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); - struct f2fs_inode_info *fi= F2FS_I(dic->inode); + struct f2fs_inode_info *fi = F2FS_I(dic->inode); const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; int ret; int i; - dec_page_count(sbi, F2FS_RD_DATA); - - if (bio->bi_status || PageError(page)) - dic->failed = true; - - if (atomic_dec_return(&dic->pending_pages)) - return; - trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx, dic->cluster_size, fi->i_compress_algorithm); - /* submit partial compressed pages */ if (dic->failed) { ret = -EIO; - goto out_free_dic; + goto out_end_io; } dic->tpages = page_array_alloc(dic->inode, dic->cluster_size); if (!dic->tpages) { ret = -ENOMEM; - goto out_free_dic; + goto out_end_io; } for (i = 0; i < dic->cluster_size; i++) { @@ -799,20 +788,20 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) dic->tpages[i] = f2fs_compress_alloc_page(); if (!dic->tpages[i]) { ret = -ENOMEM; - goto out_free_dic; + goto out_end_io; } } if (cops->init_decompress_ctx) { ret = cops->init_decompress_ctx(dic); if (ret) - goto out_free_dic; + goto out_end_io; } dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); if (!dic->rbuf) { ret = -ENOMEM; - goto destroy_decompress_ctx; + goto out_destroy_decompress_ctx; } dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages); @@ -851,18 +840,34 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) vm_unmap_ram(dic->cbuf, dic->nr_cpages); out_vunmap_rbuf: vm_unmap_ram(dic->rbuf, dic->cluster_size); -destroy_decompress_ctx: +out_destroy_decompress_ctx: if (cops->destroy_decompress_ctx) cops->destroy_decompress_ctx(dic); -out_free_dic: - if (!verity) - f2fs_decompress_end_io(dic->rpages, dic->cluster_size, - ret, false); - +out_end_io: trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx, dic->clen, ret); - if (!verity) - f2fs_free_dic(dic); + f2fs_decompress_end_io(dic, ret); +} + +/* + * This is called when a page of a compressed cluster has been read from disk + * (or failed to be read from disk). It checks whether this page was the last + * page being waited on in the cluster, and if so, it decompresses the cluster + * (or in the case of a failure, cleans up without actually decompressing). + */ +void f2fs_end_read_compressed_page(struct page *page, bool failed) +{ + struct decompress_io_ctx *dic = + (struct decompress_io_ctx *)page_private(page); + struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); + + dec_page_count(sbi, F2FS_RD_DATA); + + if (failed) + WRITE_ONCE(dic->failed, true); + + if (atomic_dec_and_test(&dic->remaining_pages)) + f2fs_decompress_cluster(dic); } static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index) @@ -1529,6 +1534,8 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, return err; } +static void f2fs_free_dic(struct decompress_io_ctx *dic); + struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) { struct decompress_io_ctx *dic; @@ -1547,12 +1554,14 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) dic->magic = F2FS_COMPRESSED_PAGE_MAGIC; dic->inode = cc->inode; - atomic_set(&dic->pending_pages, cc->nr_cpages); + atomic_set(&dic->remaining_pages, cc->nr_cpages); dic->cluster_idx = cc->cluster_idx; dic->cluster_size = cc->cluster_size; dic->log_cluster_size = cc->log_cluster_size; dic->nr_cpages = cc->nr_cpages; + refcount_set(&dic->refcnt, 1); dic->failed = false; + dic->need_verity = f2fs_need_verity(cc->inode, start_idx); for (i = 0; i < dic->cluster_size; i++) dic->rpages[i] = cc->rpages[i]; @@ -1581,7 +1590,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) return ERR_PTR(-ENOMEM); } -void f2fs_free_dic(struct decompress_io_ctx *dic) +static void f2fs_free_dic(struct decompress_io_ctx *dic) { int i; @@ -1609,30 +1618,88 @@ void f2fs_free_dic(struct decompress_io_ctx *dic) kmem_cache_free(dic_entry_slab, dic); } -void f2fs_decompress_end_io(struct page **rpages, - unsigned int cluster_size, bool err, bool verity) +static void f2fs_put_dic(struct decompress_io_ctx *dic) +{ + if (refcount_dec_and_test(&dic->refcnt)) + f2fs_free_dic(dic); +} + +/* + * Update and unlock the cluster's pagecache pages, and release the reference to + * the decompress_io_ctx that was being held for I/O completion. + */ +static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) { int i; - for (i = 0; i < cluster_size; i++) { - struct page *rpage = rpages[i]; + for (i = 0; i < dic->cluster_size; i++) { + struct page *rpage = dic->rpages[i]; if (!rpage) continue; - if (err || PageError(rpage)) - goto clear_uptodate; - - if (!verity || fsverity_verify_page(rpage)) { + /* PG_error was set if verity failed. */ + if (failed || PageError(rpage)) { + ClearPageUptodate(rpage); + /* will re-read again later */ + ClearPageError(rpage); + } else { SetPageUptodate(rpage); - goto unlock; } -clear_uptodate: - ClearPageUptodate(rpage); - ClearPageError(rpage); -unlock: unlock_page(rpage); } + + f2fs_put_dic(dic); +} + +static void f2fs_verify_cluster(struct work_struct *work) +{ + struct decompress_io_ctx *dic = + container_of(work, struct decompress_io_ctx, verity_work); + int i; + + /* Verify the cluster's decompressed pages with fs-verity. */ + for (i = 0; i < dic->cluster_size; i++) { + struct page *rpage = dic->rpages[i]; + + if (rpage && !fsverity_verify_page(rpage)) + SetPageError(rpage); + } + + __f2fs_decompress_end_io(dic, false); +} + +/* + * This is called when a compressed cluster has been decompressed + * (or failed to be read and/or decompressed). + */ +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) +{ + if (!failed && dic->need_verity) { + /* + * Note that to avoid deadlocks, the verity work can't be done + * on the decompression workqueue. This is because verifying + * the data pages can involve reading metadata pages from the + * file, and these metadata pages may be compressed. + */ + INIT_WORK(&dic->verity_work, f2fs_verify_cluster); + fsverity_enqueue_verify_work(&dic->verity_work); + } else { + __f2fs_decompress_end_io(dic, failed); + } +} + +/* + * Put a reference to a compressed page's decompress_io_ctx. + * + * This is called when the page is no longer needed and can be freed. + */ +void f2fs_put_page_dic(struct page *page) +{ + struct decompress_io_ctx *dic = + (struct decompress_io_ctx *)page_private(page); + + f2fs_put_dic(dic); } int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 547c9d4b430b..4d80f00e5e40 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -115,10 +115,21 @@ static enum count_type __read_io_type(struct page *page) /* postprocessing steps for read bios */ enum bio_post_read_step { - STEP_DECRYPT, - STEP_DECOMPRESS_NOWQ, /* handle normal cluster data inplace */ - STEP_DECOMPRESS, /* handle compressed cluster data in workqueue */ - STEP_VERITY, +#ifdef CONFIG_FS_ENCRYPTION + STEP_DECRYPT = 1 << 0, +#else + STEP_DECRYPT = 0, /* compile out the decryption-related code */ +#endif +#ifdef CONFIG_F2FS_FS_COMPRESSION + STEP_DECOMPRESS = 1 << 1, +#else + STEP_DECOMPRESS = 0, /* compile out the decompression-related code */ +#endif +#ifdef CONFIG_FS_VERITY + STEP_VERITY = 1 << 2, +#else + STEP_VERITY = 0, /* compile out the verity-related code */ +#endif }; struct bio_post_read_ctx { @@ -128,25 +139,26 @@ struct bio_post_read_ctx { unsigned int enabled_steps; }; -static void __read_end_io(struct bio *bio, bool compr, bool verity) +static void f2fs_finish_read_bio(struct bio *bio) { - struct page *page; struct bio_vec *bv; struct bvec_iter_all iter_all; + /* + * Update and unlock the bio's pagecache pages, and put the + * decompression context for any compressed pages. + */ bio_for_each_segment_all(bv, bio, iter_all) { - page = bv->bv_page; + struct page *page = bv->bv_page; -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (compr && f2fs_is_compressed_page(page)) { - f2fs_decompress_pages(bio, page, verity); + if (f2fs_is_compressed_page(page)) { + if (bio->bi_status) + f2fs_end_read_compressed_page(page, true); + f2fs_put_page_dic(page); continue; } - if (verity) - continue; -#endif - /* PG_error was set if any post_read step failed */ + /* PG_error was set if decryption or verity failed. */ if (bio->bi_status || PageError(page)) { ClearPageUptodate(page); /* will re-read again later */ @@ -157,106 +169,104 @@ static void __read_end_io(struct bio *bio, bool compr, bool verity) dec_page_count(F2FS_P_SB(page), __read_io_type(page)); unlock_page(page); } + + if (bio->bi_private) + mempool_free(bio->bi_private, bio_post_read_ctx_pool); + bio_put(bio); } -static void f2fs_release_read_bio(struct bio *bio); -static void __f2fs_read_end_io(struct bio *bio, bool compr, bool verity) -{ - if (!compr) - __read_end_io(bio, false, verity); - f2fs_release_read_bio(bio); -} - -static void f2fs_decompress_bio(struct bio *bio, bool verity) -{ - __read_end_io(bio, true, verity); -} - -static void bio_post_read_processing(struct bio_post_read_ctx *ctx); - -static void f2fs_decrypt_work(struct bio_post_read_ctx *ctx) -{ - fscrypt_decrypt_bio(ctx->bio); -} - -static void f2fs_decompress_work(struct bio_post_read_ctx *ctx) -{ - f2fs_decompress_bio(ctx->bio, ctx->enabled_steps & (1 << STEP_VERITY)); -} - -#ifdef CONFIG_F2FS_FS_COMPRESSION -static void f2fs_verify_pages(struct page **rpages, unsigned int cluster_size) -{ - f2fs_decompress_end_io(rpages, cluster_size, false, true); -} - -static void f2fs_verify_bio(struct bio *bio) -{ - struct bio_vec *bv; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bv, bio, iter_all) { - struct page *page = bv->bv_page; - struct decompress_io_ctx *dic; - - dic = (struct decompress_io_ctx *)page_private(page); - - if (dic) { - if (atomic_dec_return(&dic->verity_pages)) - continue; - f2fs_verify_pages(dic->rpages, - dic->cluster_size); - f2fs_free_dic(dic); - continue; - } - - if (bio->bi_status || PageError(page)) - goto clear_uptodate; - - if (fsverity_verify_page(page)) { - SetPageUptodate(page); - goto unlock; - } -clear_uptodate: - ClearPageUptodate(page); - ClearPageError(page); -unlock: - dec_page_count(F2FS_P_SB(page), __read_io_type(page)); - unlock_page(page); - } -} -#endif - -static void f2fs_verity_work(struct work_struct *work) +static void f2fs_verify_bio(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; -#ifdef CONFIG_F2FS_FS_COMPRESSION - unsigned int enabled_steps = ctx->enabled_steps; -#endif + bool may_have_compressed_pages = (ctx->enabled_steps & STEP_DECOMPRESS); /* * fsverity_verify_bio() may call readpages() again, and while verity - * will be disabled for this, decryption may still be needed, resulting - * in another bio_post_read_ctx being allocated. So to prevent - * deadlocks we need to release the current ctx to the mempool first. - * This assumes that verity is the last post-read step. + * will be disabled for this, decryption and/or decompression may still + * be needed, resulting in another bio_post_read_ctx being allocated. + * So to prevent deadlocks we need to release the current ctx to the + * mempool first. This assumes that verity is the last post-read step. */ mempool_free(ctx, bio_post_read_ctx_pool); bio->bi_private = NULL; -#ifdef CONFIG_F2FS_FS_COMPRESSION - /* previous step is decompression */ - if (enabled_steps & (1 << STEP_DECOMPRESS)) { - f2fs_verify_bio(bio); - f2fs_release_read_bio(bio); - return; - } -#endif + /* + * Verify the bio's pages with fs-verity. Exclude compressed pages, + * as those were handled separately by f2fs_end_read_compressed_page(). + */ + if (may_have_compressed_pages) { + struct bio_vec *bv; + struct bvec_iter_all iter_all; - fsverity_verify_bio(bio); - __f2fs_read_end_io(bio, false, false); + bio_for_each_segment_all(bv, bio, iter_all) { + struct page *page = bv->bv_page; + + if (!f2fs_is_compressed_page(page) && + !PageError(page) && !fsverity_verify_page(page)) + SetPageError(page); + } + } else { + fsverity_verify_bio(bio); + } + + f2fs_finish_read_bio(bio); +} + +/* + * If the bio's data needs to be verified with fs-verity, then enqueue the + * verity work for the bio. Otherwise finish the bio now. + * + * Note that to avoid deadlocks, the verity work can't be done on the + * decryption/decompression workqueue. This is because verifying the data pages + * can involve reading verity metadata pages from the file, and these verity + * metadata pages may be encrypted and/or compressed. + */ +static void f2fs_verify_and_finish_bio(struct bio *bio) +{ + struct bio_post_read_ctx *ctx = bio->bi_private; + + if (ctx && (ctx->enabled_steps & STEP_VERITY)) { + INIT_WORK(&ctx->work, f2fs_verify_bio); + fsverity_enqueue_verify_work(&ctx->work); + } else { + f2fs_finish_read_bio(bio); + } +} + +/* + * Handle STEP_DECOMPRESS by decompressing any compressed clusters whose last + * remaining page was read by @ctx->bio. + * + * Note that a bio may span clusters (even a mix of compressed and uncompressed + * clusters) or be for just part of a cluster. STEP_DECOMPRESS just indicates + * that the bio includes at least one compressed page. The actual decompression + * is done on a per-cluster basis, not a per-bio basis. + */ +static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx) +{ + struct bio_vec *bv; + struct bvec_iter_all iter_all; + bool all_compressed = true; + + bio_for_each_segment_all(bv, ctx->bio, iter_all) { + struct page *page = bv->bv_page; + + /* PG_error was set if decryption failed. */ + if (f2fs_is_compressed_page(page)) + f2fs_end_read_compressed_page(page, PageError(page)); + else + all_compressed = false; + } + + /* + * Optimization: if all the bio's pages are compressed, then scheduling + * the per-bio verity work is unnecessary, as verity will be fully + * handled at the compression cluster level. + */ + if (all_compressed) + ctx->enabled_steps &= ~STEP_VERITY; } static void f2fs_post_read_work(struct work_struct *work) @@ -264,74 +274,36 @@ static void f2fs_post_read_work(struct work_struct *work) struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); - if (ctx->enabled_steps & (1 << STEP_DECRYPT)) - f2fs_decrypt_work(ctx); + if (ctx->enabled_steps & STEP_DECRYPT) + fscrypt_decrypt_bio(ctx->bio); - if (ctx->enabled_steps & (1 << STEP_DECOMPRESS)) - f2fs_decompress_work(ctx); + if (ctx->enabled_steps & STEP_DECOMPRESS) + f2fs_handle_step_decompress(ctx); - if (ctx->enabled_steps & (1 << STEP_VERITY)) { - INIT_WORK(&ctx->work, f2fs_verity_work); - fsverity_enqueue_verify_work(&ctx->work); - return; - } - - __f2fs_read_end_io(ctx->bio, - ctx->enabled_steps & (1 << STEP_DECOMPRESS), false); -} - -static void f2fs_enqueue_post_read_work(struct f2fs_sb_info *sbi, - struct work_struct *work) -{ - queue_work(sbi->post_read_wq, work); -} - -static void bio_post_read_processing(struct bio_post_read_ctx *ctx) -{ - /* - * We use different work queues for decryption and for verity because - * verity may require reading metadata pages that need decryption, and - * we shouldn't recurse to the same workqueue. - */ - - if (ctx->enabled_steps & (1 << STEP_DECRYPT) || - ctx->enabled_steps & (1 << STEP_DECOMPRESS)) { - INIT_WORK(&ctx->work, f2fs_post_read_work); - f2fs_enqueue_post_read_work(ctx->sbi, &ctx->work); - return; - } - - if (ctx->enabled_steps & (1 << STEP_VERITY)) { - INIT_WORK(&ctx->work, f2fs_verity_work); - fsverity_enqueue_verify_work(&ctx->work); - return; - } - - __f2fs_read_end_io(ctx->bio, false, false); -} - -static bool f2fs_bio_post_read_required(struct bio *bio) -{ - return bio->bi_private; + f2fs_verify_and_finish_bio(ctx->bio); } static void f2fs_read_end_io(struct bio *bio) { struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); + struct bio_post_read_ctx *ctx = bio->bi_private; if (time_to_inject(sbi, FAULT_READ_IO)) { f2fs_show_injection_info(sbi, FAULT_READ_IO); bio->bi_status = BLK_STS_IOERR; } - if (f2fs_bio_post_read_required(bio)) { - struct bio_post_read_ctx *ctx = bio->bi_private; - - bio_post_read_processing(ctx); + if (bio->bi_status) { + f2fs_finish_read_bio(bio); return; } - __f2fs_read_end_io(bio, false, false); + if (ctx && (ctx->enabled_steps & (STEP_DECRYPT | STEP_DECOMPRESS))) { + INIT_WORK(&ctx->work, f2fs_post_read_work); + queue_work(ctx->sbi->post_read_wq, &ctx->work); + } else { + f2fs_verify_and_finish_bio(bio); + } } static void f2fs_write_end_io(struct bio *bio) @@ -1022,16 +994,9 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) up_write(&io->io_rwsem); } -static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) -{ - return fsverity_active(inode) && - idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); -} - static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, unsigned nr_pages, unsigned op_flag, - pgoff_t first_idx, bool for_write, - bool for_verity) + pgoff_t first_idx, bool for_write) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; @@ -1050,13 +1015,19 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, bio_set_op_attrs(bio, REQ_OP_READ, op_flag); if (fscrypt_inode_uses_fs_layer_crypto(inode)) - post_read_steps |= 1 << STEP_DECRYPT; - if (f2fs_compressed_file(inode)) - post_read_steps |= 1 << STEP_DECOMPRESS_NOWQ; - if (for_verity && f2fs_need_verity(inode, first_idx)) - post_read_steps |= 1 << STEP_VERITY; + post_read_steps |= STEP_DECRYPT; - if (post_read_steps) { + if (f2fs_need_verity(inode, first_idx)) + post_read_steps |= STEP_VERITY; + + /* + * STEP_DECOMPRESS is handled specially, since a compressed file might + * contain both compressed and uncompressed clusters. We'll allocate a + * bio_post_read_ctx if the file is compressed, but the caller is + * responsible for enabling STEP_DECOMPRESS if it's actually needed. + */ + + if (post_read_steps || f2fs_compressed_file(inode)) { /* Due to the mempool, this never fails. */ ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); ctx->bio = bio; @@ -1068,13 +1039,6 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, return bio; } -static void f2fs_release_read_bio(struct bio *bio) -{ - if (bio->bi_private) - mempool_free(bio->bi_private, bio_post_read_ctx_pool); - bio_put(bio); -} - /* This can handle encryption stuffs */ static int f2fs_submit_page_read(struct inode *inode, struct page *page, block_t blkaddr, int op_flags, bool for_write) @@ -1083,7 +1047,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, struct bio *bio; bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags, - page->index, for_write, true); + page->index, for_write); if (IS_ERR(bio)) return PTR_ERR(bio); @@ -2122,7 +2086,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page, if (bio == NULL) { bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, is_readahead ? REQ_RAHEAD : 0, page->index, - false, true); + false); if (IS_ERR(bio)) { ret = PTR_ERR(bio); bio = NULL; @@ -2168,8 +2132,6 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, sector_t last_block_in_file; const unsigned blocksize = blks_to_bytes(inode, 1); struct decompress_io_ctx *dic = NULL; - struct bio_post_read_ctx *ctx; - bool for_verity = false; int i; int ret = 0; @@ -2235,29 +2197,10 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, goto out_put_dnode; } - /* - * It's possible to enable fsverity on the fly when handling a cluster, - * which requires complicated error handling. Instead of adding more - * complexity, let's give a rule where end_io post-processes fsverity - * per cluster. In order to do that, we need to submit bio, if previous - * bio sets a different post-process policy. - */ - if (fsverity_active(cc->inode)) { - atomic_set(&dic->verity_pages, cc->nr_cpages); - for_verity = true; - - if (bio) { - ctx = bio->bi_private; - if (!(ctx->enabled_steps & (1 << STEP_VERITY))) { - __submit_bio(sbi, bio, DATA); - bio = NULL; - } - } - } - for (i = 0; i < dic->nr_cpages; i++) { struct page *page = dic->cpages[i]; block_t blkaddr; + struct bio_post_read_ctx *ctx; blkaddr = data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i + 1); @@ -2273,31 +2216,10 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (!bio) { bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages, is_readahead ? REQ_RAHEAD : 0, - page->index, for_write, for_verity); + page->index, for_write); if (IS_ERR(bio)) { - unsigned int remained = dic->nr_cpages - i; - bool release = false; - ret = PTR_ERR(bio); - dic->failed = true; - - if (for_verity) { - if (!atomic_sub_return(remained, - &dic->verity_pages)) - release = true; - } else { - if (!atomic_sub_return(remained, - &dic->pending_pages)) - release = true; - } - - if (release) { - f2fs_decompress_end_io(dic->rpages, - cc->cluster_size, true, - false); - f2fs_free_dic(dic); - } - + f2fs_decompress_end_io(dic, ret); f2fs_put_dnode(&dn); *bio_ret = NULL; return ret; @@ -2309,10 +2231,9 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (bio_add_page(bio, page, blocksize, 0) < blocksize) goto submit_and_realloc; - /* tag STEP_DECOMPRESS to handle IO in wq */ ctx = bio->bi_private; - if (!(ctx->enabled_steps & (1 << STEP_DECOMPRESS))) - ctx->enabled_steps |= 1 << STEP_DECOMPRESS; + ctx->enabled_steps |= STEP_DECOMPRESS; + refcount_inc(&dic->refcnt); inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); @@ -2329,7 +2250,13 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, out_put_dnode: f2fs_put_dnode(&dn); out: - f2fs_decompress_end_io(cc->rpages, cc->cluster_size, true, false); + for (i = 0; i < cc->cluster_size; i++) { + if (cc->rpages[i]) { + ClearPageUptodate(cc->rpages[i]); + ClearPageError(cc->rpages[i]); + unlock_page(cc->rpages[i]); + } + } *bio_ret = bio; return ret; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ccbbf86d14e5..980e061f7968 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1341,7 +1341,7 @@ struct compress_io_ctx { atomic_t pending_pages; /* in-flight compressed page count */ }; -/* decompress io context for read IO path */ +/* Context for decompressing one cluster on the read IO path */ struct decompress_io_ctx { u32 magic; /* magic number to indicate page is compressed */ struct inode *inode; /* inode the context belong to */ @@ -1357,11 +1357,37 @@ struct decompress_io_ctx { struct compress_data *cbuf; /* virtual mapped address on cpages */ size_t rlen; /* valid data length in rbuf */ size_t clen; /* valid data length in cbuf */ - atomic_t pending_pages; /* in-flight compressed page count */ - atomic_t verity_pages; /* in-flight page count for verity */ - bool failed; /* indicate IO error during decompression */ + + /* + * The number of compressed pages remaining to be read in this cluster. + * This is initially nr_cpages. It is decremented by 1 each time a page + * has been read (or failed to be read). When it reaches 0, the cluster + * is decompressed (or an error is reported). + * + * If an error occurs before all the pages have been submitted for I/O, + * then this will never reach 0. In this case the I/O submitter is + * responsible for calling f2fs_decompress_end_io() instead. + */ + atomic_t remaining_pages; + + /* + * Number of references to this decompress_io_ctx. + * + * One reference is held for I/O completion. This reference is dropped + * after the pagecache pages are updated and unlocked -- either after + * decompression (and verity if enabled), or after an error. + * + * In addition, each compressed page holds a reference while it is in a + * bio. These references are necessary prevent compressed pages from + * being freed while they are still in a bio. + */ + refcount_t refcnt; + + bool failed; /* IO error occurred before decompression? */ + bool need_verity; /* need fs-verity verification after decompression? */ void *private; /* payload buffer for specified decompression algorithm */ void *private2; /* extra payload buffer */ + struct work_struct verity_work; /* work to verify the decompressed pages */ }; #define NULL_CLUSTER ((unsigned int)(~0)) @@ -3883,7 +3909,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page); bool f2fs_is_compress_backend_ready(struct inode *inode); int f2fs_init_compress_mempool(void); void f2fs_destroy_compress_mempool(void); -void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity); +void f2fs_end_read_compressed_page(struct page *page, bool failed); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page); @@ -3896,9 +3922,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, bool is_readahead, bool for_write); struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); -void f2fs_free_dic(struct decompress_io_ctx *dic); -void f2fs_decompress_end_io(struct page **rpages, - unsigned int cluster_size, bool err, bool verity); +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed); +void f2fs_put_page_dic(struct page *page); int f2fs_init_compress_ctx(struct compress_ctx *cc); void f2fs_destroy_compress_ctx(struct compress_ctx *cc); void f2fs_init_compress_info(struct f2fs_sb_info *sbi); @@ -3922,6 +3947,14 @@ static inline struct page *f2fs_compress_control_page(struct page *page) } static inline int f2fs_init_compress_mempool(void) { return 0; } static inline void f2fs_destroy_compress_mempool(void) { } +static inline void f2fs_end_read_compressed_page(struct page *page, bool failed) +{ + WARN_ON_ONCE(1); +} +static inline void f2fs_put_page_dic(struct page *page) +{ + WARN_ON_ONCE(1); +} static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { } static inline int __init f2fs_init_compress_cache(void) { return 0; } @@ -4126,6 +4159,12 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, return false; } +static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) +{ + return fsverity_active(inode) && + idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); +} + #ifdef CONFIG_F2FS_FAULT_INJECTION extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, unsigned int type); From df0736d70c4fa6ed711ba103b61880fe72bb4777 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Wed, 6 Jan 2021 08:49:28 +0900 Subject: [PATCH 73/99] f2fs: fix null page reference in redirty_blocks By Colin's static analysis, we found out there is a null page reference under low memory situation in redirty_blocks. I've made the page finding loop stop immediately and return an error not to cause further memory pressure when we run into a failure to find a page under low memory condition. Signed-off-by: Daeho Jeong Reported-by: Colin Ian King Fixes: 5fdb322ff2c2 ("f2fs: add F2FS_IOC_DECOMPRESS_FILE and F2FS_IOC_COMPRESS_FILE") Reviewed-by: Colin Ian King Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 4e6d4b9120a8..e3a5b620b50a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4057,8 +4057,10 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len) for (i = 0; i < page_len; i++, redirty_idx++) { page = find_lock_page(mapping, redirty_idx); - if (!page) - ret = -ENOENT; + if (!page) { + ret = -ENOMEM; + break; + } set_page_dirty(page); f2fs_put_page(page, 1); f2fs_put_page(page, 0); From 46085f37fc9e12d5c3539fb768b5ad7951e72acf Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 12 Jan 2021 09:55:09 +0800 Subject: [PATCH 74/99] f2fs: fix to set/clear I_LINKABLE under i_lock fsstress + fault injection test case reports a warning message as below: WARNING: CPU: 13 PID: 6226 at fs/inode.c:361 inc_nlink+0x32/0x40 Call Trace: f2fs_init_inode_metadata+0x25c/0x4a0 [f2fs] f2fs_add_inline_entry+0x153/0x3b0 [f2fs] f2fs_add_dentry+0x75/0x80 [f2fs] f2fs_do_add_link+0x108/0x160 [f2fs] f2fs_rename2+0x6ab/0x14f0 [f2fs] vfs_rename+0x70c/0x940 do_renameat2+0x4d8/0x4f0 __x64_sys_renameat2+0x4b/0x60 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Following race case can cause this: Thread A Kworker - f2fs_rename - f2fs_create_whiteout - __f2fs_tmpfile - f2fs_i_links_write - f2fs_mark_inode_dirty_sync - mark_inode_dirty_sync - writeback_single_inode - __writeback_single_inode - spin_lock(&inode->i_lock) - inode->i_state |= I_LINKABLE - inode->i_state &= ~dirty - spin_unlock(&inode->i_lock) - f2fs_add_link - f2fs_do_add_link - f2fs_add_dentry - f2fs_add_inline_entry - f2fs_init_inode_metadata - f2fs_i_links_write - inc_nlink - WARN_ON(!(inode->i_state & I_LINKABLE)) Fix to add i_lock to avoid i_state update race condition. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 6edb1ab579a1..887804968576 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -855,7 +855,11 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, if (whiteout) { f2fs_i_links_write(inode, false); + + spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; + spin_unlock(&inode->i_lock); + *whiteout = inode; } else { d_tmpfile(dentry, inode); @@ -1041,7 +1045,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, err = f2fs_add_link(old_dentry, whiteout); if (err) goto put_out_dir; + + spin_lock(&whiteout->i_lock); whiteout->i_state &= ~I_LINKABLE; + spin_unlock(&whiteout->i_lock); + iput(whiteout); } From 794c43f716845e2d48ce195ed5c4179a4e05ce5f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 28 Dec 2020 15:25:29 -0800 Subject: [PATCH 75/99] libfs: unexport generic_ci_d_compare() and generic_ci_d_hash() Now that generic_set_encrypted_ci_d_ops() has been added and ext4 and f2fs are using it, it's no longer necessary to export generic_ci_d_compare() and generic_ci_d_hash() to filesystems. Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/libfs.c | 8 +++----- include/linux/fs.h | 5 ----- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index d1c3bade9f30..79721571e014 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1388,8 +1388,8 @@ static bool needs_casefold(const struct inode *dir) * * Return: 0 if names match, 1 if mismatch, or -ERRNO */ -int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, - const char *str, const struct qstr *name) +static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name) { const struct dentry *parent = READ_ONCE(dentry->d_parent); const struct inode *dir = READ_ONCE(parent->d_inode); @@ -1426,7 +1426,6 @@ int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, return 1; return !!memcmp(str, name->name, len); } -EXPORT_SYMBOL(generic_ci_d_compare); /** * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems @@ -1435,7 +1434,7 @@ EXPORT_SYMBOL(generic_ci_d_compare); * * Return: 0 if hash was successful or unchanged, and -EINVAL on error */ -int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) +static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) { const struct inode *dir = READ_ONCE(dentry->d_inode); struct super_block *sb = dentry->d_sb; @@ -1450,7 +1449,6 @@ int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) return -EINVAL; return 0; } -EXPORT_SYMBOL(generic_ci_d_hash); static const struct dentry_operations generic_ci_dentry_ops = { .d_hash = generic_ci_d_hash, diff --git a/include/linux/fs.h b/include/linux/fs.h index fd47deea7c17..6d8b1e7337e4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3192,11 +3192,6 @@ extern int generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); -#ifdef CONFIG_UNICODE -extern int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str); -extern int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, - const char *str, const struct qstr *name); -#endif extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry); #ifdef CONFIG_MIGRATION From 3afae09ffea5e08f523823be99a784675995d6bb Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 11 Jan 2021 17:42:53 +0800 Subject: [PATCH 76/99] f2fs: compress: fix potential deadlock generic/269 reports a hangtask issue, the root cause is ABBA deadlock described as below: Thread A Thread B - down_write(&sbi->gc_lock) -- A - f2fs_write_data_pages - lock all pages in cluster -- B - f2fs_write_multi_pages - f2fs_write_raw_pages - f2fs_write_single_data_page - f2fs_balance_fs - down_write(&sbi->gc_lock) -- A - f2fs_gc - do_garbage_collect - ra_data_block - pagecache_get_page -- B To fix this, it needs to avoid calling f2fs_balance_fs() if there is still cluster pages been locked in context of cluster writeback, so instead, let's call f2fs_balance_fs() in the end of f2fs_write_raw_pages() when all cluster pages were unlocked. Fixes: 4c8ff7095bef ("f2fs: support data compression") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 5 ++++- fs/f2fs/data.c | 10 ++++++---- fs/f2fs/f2fs.h | 2 +- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 1696f9183ff5..77fa342de38f 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1455,7 +1455,7 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted, NULL, NULL, wbc, io_type, - compr_blocks); + compr_blocks, false); if (ret) { if (ret == AOP_WRITEPAGE_ACTIVATE) { unlock_page(cc->rpages[i]); @@ -1490,6 +1490,9 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, *submitted += _submitted; } + + f2fs_balance_fs(F2FS_M_SB(mapping), true); + return 0; out_err: for (++i; i < cc->cluster_size; i++) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4d80f00e5e40..c7bb07dd9a20 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2671,7 +2671,8 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, sector_t *last_block, struct writeback_control *wbc, enum iostat_type io_type, - int compr_blocks) + int compr_blocks, + bool allow_balance) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -2809,7 +2810,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, } unlock_page(page); if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && - !F2FS_I(inode)->cp_task) + !F2FS_I(inode)->cp_task && allow_balance) f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { @@ -2856,7 +2857,7 @@ static int f2fs_write_data_page(struct page *page, #endif return f2fs_write_single_data_page(page, NULL, NULL, NULL, - wbc, FS_DATA_IO, 0); + wbc, FS_DATA_IO, 0, true); } /* @@ -3024,7 +3025,8 @@ static int f2fs_write_cache_pages(struct address_space *mapping, } #endif ret = f2fs_write_single_data_page(page, &submitted, - &bio, &last_block, wbc, io_type, 0); + &bio, &last_block, wbc, io_type, + 0, true); if (ret == AOP_WRITEPAGE_ACTIVATE) unlock_page(page); #ifdef CONFIG_F2FS_FS_COMPRESSION diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 980e061f7968..63852404151e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3502,7 +3502,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, struct bio **bio, sector_t *last_block, struct writeback_control *wbc, enum iostat_type io_type, - int compr_blocks); + int compr_blocks, bool allow_balance); void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length); int f2fs_release_page(struct page *page, gfp_t wait); From 6d1451bf7f84ea45035553ae566b3c91661d902b Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Wed, 13 Jan 2021 13:21:54 +0800 Subject: [PATCH 77/99] f2fs: fix to use per-inode maxbytes F2FS inode may have different max size, e.g. compressed file have less blkaddr entries in all its direct-node blocks, result in being with less max filesize. So change to use per-inode maxbytes. Suggested-by: Chao Yu Signed-off-by: Chengguang Xu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 9 ++++++--- fs/f2fs/super.c | 12 ++++++++---- 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c7bb07dd9a20..9aa458c01101 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3761,7 +3761,7 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) filemap_write_and_wait(mapping); /* Block number less than F2FS MAX BLOCKS */ - if (unlikely(block >= F2FS_I_SB(inode)->max_file_blocks)) + if (unlikely(block >= max_file_blocks(inode))) goto out; if (f2fs_compressed_file(inode)) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 63852404151e..ca5f1ff14dab 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1474,7 +1474,6 @@ struct f2fs_sb_info { unsigned int total_sections; /* total section count */ unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ - loff_t max_file_blocks; /* max block index of file */ int dir_level; /* directory level */ int readdir_ra; /* readahead inode in readdir */ u64 max_io_bytes; /* max io bytes to merge IOs */ @@ -3265,6 +3264,7 @@ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); int f2fs_quota_sync(struct super_block *sb, int type); +loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e3a5b620b50a..e768371c6575 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -486,6 +486,9 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; loff_t maxbytes = inode->i_sb->s_maxbytes; + if (f2fs_compressed_file(inode)) + maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS; + switch (whence) { case SEEK_SET: case SEEK_CUR: @@ -670,7 +673,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) free_from = (pgoff_t)F2FS_BLK_ALIGN(from); - if (free_from >= sbi->max_file_blocks) + if (free_from >= max_file_blocks(inode)) goto free_partial; if (lock) @@ -2744,7 +2747,7 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) return -EINVAL; if (unlikely((range.start + range.len) >> PAGE_SHIFT > - sbi->max_file_blocks)) + max_file_blocks(inode))) return -EINVAL; err = mnt_want_write_file(filp); @@ -3307,7 +3310,7 @@ int f2fs_precache_extents(struct inode *inode) map.m_next_extent = &m_next_extent; map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; - end = F2FS_I_SB(inode)->max_file_blocks; + end = max_file_blocks(inode); while (map.m_lblk < end) { map.m_len = end - map.m_lblk; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c8be27a9eed6..9749c9ad374f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2739,10 +2739,10 @@ static const struct export_operations f2fs_export_ops = { .get_parent = f2fs_get_parent, }; -static loff_t max_file_blocks(void) +loff_t max_file_blocks(struct inode *inode) { loff_t result = 0; - loff_t leaf_count = DEF_ADDRS_PER_BLOCK; + loff_t leaf_count; /* * note: previously, result is equal to (DEF_ADDRS_PER_INODE - @@ -2751,6 +2751,11 @@ static loff_t max_file_blocks(void) * result as zero. */ + if (inode && f2fs_compressed_file(inode)) + leaf_count = ADDRS_PER_BLOCK(inode); + else + leaf_count = DEF_ADDRS_PER_BLOCK; + /* two direct node blocks */ result += (leaf_count * 2); @@ -3634,8 +3639,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (err) goto free_options; - sbi->max_file_blocks = max_file_blocks(); - sb->s_maxbytes = sbi->max_file_blocks << + sb->s_maxbytes = max_file_blocks(NULL) << le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; From 0bfe9f790448012ef38abf4e78feb2e691e2d366 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 14 Jan 2021 09:41:27 +0800 Subject: [PATCH 78/99] f2fs: introduce sb_status sysfs node Introduce /sys/fs/f2fs//stat/sb_status to show superblock status in real time as a hexadecimal value. value sb status macro description 0x1 SBI_IS_DIRTY, /* dirty flag for checkpoint */ 0x2 SBI_IS_CLOSE, /* specify unmounting */ 0x4 SBI_NEED_FSCK, /* need fsck.f2fs to fix */ 0x8 SBI_POR_DOING, /* recovery is doing or not */ 0x10 SBI_NEED_SB_WRITE, /* need to recover superblock */ 0x20 SBI_NEED_CP, /* need to checkpoint */ 0x40 SBI_IS_SHUTDOWN, /* shutdown by ioctl */ 0x80 SBI_IS_RECOVERED, /* recovered orphan/data */ 0x100 SBI_CP_DISABLED, /* CP was disabled last mount */ 0x200 SBI_CP_DISABLED_QUICK, /* CP was disabled quickly */ 0x400 SBI_QUOTA_NEED_FLUSH, /* need to flush quota info in CP */ 0x800 SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */ 0x1000 SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */ 0x2000 SBI_IS_RESIZEFS, /* resizefs is in process */ Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 23 +++++++++++++++++++++++ fs/f2fs/sysfs.c | 8 ++++++++ 2 files changed, 31 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 3dfee94e0618..362803901614 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -377,3 +377,26 @@ Description: This gives a control to limit the bio size in f2fs. Default is zero, which will follow underlying block layer limit, whereas, if it has a certain bytes value, f2fs won't submit a bio larger than that size. + +What: /sys/fs/f2fs//stat/sb_status +Date: December 2020 +Contact: "Chao Yu" +Description: Show status of f2fs superblock in real time. + + ====== ===================== ================================= + value sb status macro description + 0x1 SBI_IS_DIRTY dirty flag for checkpoint + 0x2 SBI_IS_CLOSE specify unmounting + 0x4 SBI_NEED_FSCK need fsck.f2fs to fix + 0x8 SBI_POR_DOING recovery is doing or not + 0x10 SBI_NEED_SB_WRITE need to recover superblock + 0x20 SBI_NEED_CP need to checkpoint + 0x40 SBI_IS_SHUTDOWN shutdown by ioctl + 0x80 SBI_IS_RECOVERED recovered orphan/data + 0x100 SBI_CP_DISABLED CP was disabled last mount + 0x200 SBI_CP_DISABLED_QUICK CP was disabled quickly + 0x400 SBI_QUOTA_NEED_FLUSH need to flush quota info in CP + 0x800 SBI_QUOTA_SKIP_FLUSH skip flushing quota in current CP + 0x1000 SBI_QUOTA_NEED_REPAIR quota file may be corrupted + 0x2000 SBI_IS_RESIZEFS resizefs is in process + ====== ===================== ================================= diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index bd1174ed2e6f..f39874d512ea 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -96,6 +96,12 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, sbi->sectors_written_start) >> 1))); } +static ssize_t sb_status_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%lx\n", sbi->s_flag); +} + static ssize_t features_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -702,7 +708,9 @@ static struct attribute *f2fs_feat_attrs[] = { }; ATTRIBUTE_GROUPS(f2fs_feat); +F2FS_GENERAL_RO_ATTR(sb_status); static struct attribute *f2fs_stat_attrs[] = { + ATTR_LIST(sb_status), NULL, }; ATTRIBUTE_GROUPS(f2fs_stat); From deaa965fb01173478a1234f4305c71fffa4b5dc4 Mon Sep 17 00:00:00 2001 From: Jack Qiu Date: Wed, 13 Jan 2021 17:58:53 +0800 Subject: [PATCH 79/99] f2fs: remove unused stat_{inc, dec}_atomic_write Just clean code, no logical change. Signed-off-by: Jack Qiu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ca5f1ff14dab..d37883ce9002 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3748,8 +3748,6 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_dec_compr_inode(inode) do { } while (0) #define stat_add_compr_blocks(inode, blocks) do { } while (0) #define stat_sub_compr_blocks(inode, blocks) do { } while (0) -#define stat_inc_atomic_write(inode) do { } while (0) -#define stat_dec_atomic_write(inode) do { } while (0) #define stat_update_max_atomic_write(inode) do { } while (0) #define stat_inc_volatile_write(inode) do { } while (0) #define stat_dec_volatile_write(inode) do { } while (0) From 12699fb781574d50871ec6a4d96ac5e0f0ede03e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 14 Jan 2021 19:00:51 +0000 Subject: [PATCH 80/99] f2fs: Remove readahead collision detection With the new ->readahead operation, locked pages are added to the page cache, preventing two threads from racing with each other to read the same chunk of file, so this is dead code. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 25 ------------------------- fs/f2fs/f2fs.h | 1 - fs/f2fs/super.c | 2 -- 3 files changed, 28 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9aa458c01101..d9a063d8a63d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2265,11 +2265,6 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. - * - * Note that the aops->readpages() function is ONLY used for read-ahead. If - * this function ever deviates from doing just read-ahead, it should either - * use ->readpage() or do the necessary surgery to decouple ->readpages() - * from read-ahead. */ static int f2fs_mpage_readpages(struct inode *inode, struct readahead_control *rac, struct page *page) @@ -2292,7 +2287,6 @@ static int f2fs_mpage_readpages(struct inode *inode, unsigned nr_pages = rac ? readahead_count(rac) : 1; unsigned max_nr_pages = nr_pages; int ret = 0; - bool drop_ra = false; map.m_pblk = 0; map.m_lblk = 0; @@ -2303,26 +2297,10 @@ static int f2fs_mpage_readpages(struct inode *inode, map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; - /* - * Two readahead threads for same address range can cause race condition - * which fragments sequential read IOs. So let's avoid each other. - */ - if (rac && readahead_count(rac)) { - if (READ_ONCE(F2FS_I(inode)->ra_offset) == readahead_index(rac)) - drop_ra = true; - else - WRITE_ONCE(F2FS_I(inode)->ra_offset, - readahead_index(rac)); - } - for (; nr_pages; nr_pages--) { if (rac) { page = readahead_page(rac); prefetchw(&page->flags); - if (drop_ra) { - f2fs_put_page(page, 1); - continue; - } } #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -2385,9 +2363,6 @@ static int f2fs_mpage_readpages(struct inode *inode, } if (bio) __submit_bio(F2FS_I_SB(inode), bio, DATA); - - if (rac && readahead_count(rac) && !drop_ra) - WRITE_ONCE(F2FS_I(inode)->ra_offset, -1); return ret; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d37883ce9002..a2e520a13630 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -718,7 +718,6 @@ struct f2fs_inode_info { struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct task_struct *inmem_task; /* store inmemory task */ struct mutex inmem_lock; /* lock for inmemory pages */ - pgoff_t ra_offset; /* ongoing readahead offset */ struct extent_tree *extent_tree; /* cached extent_tree entry */ /* avoid racing between foreground op and gc */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9749c9ad374f..6a30876ff374 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1174,8 +1174,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; - fi->ra_offset = -1; - return &fi->vfs_inode; } From d5f7bc0064e0541164bd3deeafad16bbb5992433 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 14 Jan 2021 13:59:09 -0800 Subject: [PATCH 81/99] f2fs: deprecate f2fs_trace_io This patch deprecates f2fs_trace_io, since f2fs uses page->private more broadly, resulting in more buggy cases. Acked-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/Kconfig | 10 --- fs/f2fs/Makefile | 1 - fs/f2fs/checkpoint.c | 3 - fs/f2fs/data.c | 4 -- fs/f2fs/file.c | 2 - fs/f2fs/node.c | 2 - fs/f2fs/segment.c | 3 - fs/f2fs/super.c | 6 -- fs/f2fs/trace.c | 165 ------------------------------------------- fs/f2fs/trace.h | 43 ----------- 10 files changed, 239 deletions(-) delete mode 100644 fs/f2fs/trace.c delete mode 100644 fs/f2fs/trace.h diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 63c1fc1a0e3b..62e638a49bbf 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -76,16 +76,6 @@ config F2FS_CHECK_FS If you want to improve the performance, say N. -config F2FS_IO_TRACE - bool "F2FS IO tracer" - depends on F2FS_FS - depends on FUNCTION_TRACER - help - F2FS IO trace is based on a function trace, which gathers process - information and block IO patterns in the filesystem level. - - If unsure, say N. - config F2FS_FAULT_INJECTION bool "F2FS fault injection facility" depends on F2FS_FS diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index ee7316b42f69..e5295746208b 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -7,6 +7,5 @@ f2fs-y += shrinker.o extent_cache.o sysfs.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o -f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o f2fs-$(CONFIG_FS_VERITY) += verity.o f2fs-$(CONFIG_F2FS_FS_COMPRESSION) += compress.o diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 897edb7c951a..8c79ba0566b1 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -17,7 +17,6 @@ #include "f2fs.h" #include "node.h" #include "segment.h" -#include "trace.h" #include static struct kmem_cache *ino_entry_slab; @@ -443,7 +442,6 @@ static int f2fs_set_meta_page_dirty(struct page *page) __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); f2fs_set_page_private(page, 0); - f2fs_trace_pid(page); return 1; } return 0; @@ -1017,7 +1015,6 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page) spin_unlock(&sbi->inode_lock[type]); f2fs_set_page_private(page, 0); - f2fs_trace_pid(page); } void f2fs_remove_dirty_inode(struct inode *inode) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d9a063d8a63d..38476d0d3916 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -25,7 +25,6 @@ #include "f2fs.h" #include "node.h" #include "segment.h" -#include "trace.h" #include #define NUM_PREALLOC_POST_READ_CTXS 128 @@ -679,7 +678,6 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) return -EFSCORRUPTED; trace_f2fs_submit_page_bio(page, fio); - f2fs_trace_ios(fio, 0); /* Allocate a new bio */ bio = __bio_alloc(fio, 1); @@ -884,7 +882,6 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) return -EFSCORRUPTED; trace_f2fs_submit_page_bio(page, fio); - f2fs_trace_ios(fio, 0); if (bio && !page_is_mergeable(fio->sbi, bio, *fio->last_block, fio->new_blkaddr)) @@ -981,7 +978,6 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) wbc_account_cgroup_owner(fio->io_wbc, bio_page, PAGE_SIZE); io->last_block_in_bio = fio->new_blkaddr; - f2fs_trace_ios(fio, 0); trace_f2fs_submit_page_write(fio->page, fio); skip: diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e768371c6575..7db27c81d034 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -29,7 +29,6 @@ #include "xattr.h" #include "acl.h" #include "gc.h" -#include "trace.h" #include #include @@ -369,7 +368,6 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, f2fs_update_time(sbi, REQ_TIME); out: trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); - f2fs_trace_ios(NULL, 1); return ret; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5e3fabacefb5..a8a0fb890e8d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -17,7 +17,6 @@ #include "node.h" #include "segment.h" #include "xattr.h" -#include "trace.h" #include #define on_f2fs_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) @@ -2089,7 +2088,6 @@ static int f2fs_set_node_page_dirty(struct page *page) __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); f2fs_set_page_private(page, 0); - f2fs_trace_pid(page); return 1; } return 0; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index deca74cb17df..7d34f1cacdee 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -20,7 +20,6 @@ #include "segment.h" #include "node.h" #include "gc.h" -#include "trace.h" #include #define __reverse_ffz(x) __reverse_ffs(~(x)) @@ -187,8 +186,6 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) { struct inmem_pages *new; - f2fs_trace_pid(page); - f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE); new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6a30876ff374..fc343243799c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -33,7 +33,6 @@ #include "segment.h" #include "xattr.h" #include "gc.h" -#include "trace.h" #define CREATE_TRACE_POINTS #include @@ -1448,8 +1447,6 @@ int f2fs_sync_fs(struct super_block *sb, int sync) err = f2fs_write_checkpoint(sbi, &cpc); up_write(&sbi->gc_lock); } - f2fs_trace_ios(NULL, 1); - return err; } @@ -4127,8 +4124,6 @@ static int __init init_f2fs_fs(void) return -EINVAL; } - f2fs_build_trace_ios(); - err = init_inodecache(); if (err) goto fail; @@ -4221,7 +4216,6 @@ static void __exit exit_f2fs_fs(void) f2fs_destroy_segment_manager_caches(); f2fs_destroy_node_manager_caches(); destroy_inodecache(); - f2fs_destroy_trace_ios(); } module_init(init_f2fs_fs) diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c deleted file mode 100644 index d0ab533a9ce8..000000000000 --- a/fs/f2fs/trace.c +++ /dev/null @@ -1,165 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * f2fs IO tracer - * - * Copyright (c) 2014 Motorola Mobility - * Copyright (c) 2014 Jaegeuk Kim - */ -#include -#include -#include -#include - -#include "f2fs.h" -#include "trace.h" - -static RADIX_TREE(pids, GFP_ATOMIC); -static spinlock_t pids_lock; -static struct last_io_info last_io; - -static inline void __print_last_io(void) -{ - if (!last_io.len) - return; - - trace_printk("%3x:%3x %4x %-16s %2x %5x %5x %12x %4x\n", - last_io.major, last_io.minor, - last_io.pid, "----------------", - last_io.type, - last_io.fio.op, last_io.fio.op_flags, - last_io.fio.new_blkaddr, - last_io.len); - memset(&last_io, 0, sizeof(last_io)); -} - -static int __file_type(struct inode *inode, pid_t pid) -{ - if (f2fs_is_atomic_file(inode)) - return __ATOMIC_FILE; - else if (f2fs_is_volatile_file(inode)) - return __VOLATILE_FILE; - else if (S_ISDIR(inode->i_mode)) - return __DIR_FILE; - else if (inode->i_ino == F2FS_NODE_INO(F2FS_I_SB(inode))) - return __NODE_FILE; - else if (inode->i_ino == F2FS_META_INO(F2FS_I_SB(inode))) - return __META_FILE; - else if (pid) - return __NORMAL_FILE; - else - return __MISC_FILE; -} - -void f2fs_trace_pid(struct page *page) -{ - struct inode *inode = page->mapping->host; - pid_t pid = task_pid_nr(current); - void *p; - - set_page_private(page, (unsigned long)pid); - -retry: - if (radix_tree_preload(GFP_NOFS)) - return; - - spin_lock(&pids_lock); - p = radix_tree_lookup(&pids, pid); - if (p == current) - goto out; - if (p) - radix_tree_delete(&pids, pid); - - if (radix_tree_insert(&pids, pid, current)) { - spin_unlock(&pids_lock); - radix_tree_preload_end(); - cond_resched(); - goto retry; - } - - trace_printk("%3x:%3x %4x %-16s\n", - MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), - pid, current->comm); -out: - spin_unlock(&pids_lock); - radix_tree_preload_end(); -} - -void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) -{ - struct inode *inode; - pid_t pid; - int major, minor; - - if (flush) { - __print_last_io(); - return; - } - - inode = fio->page->mapping->host; - pid = page_private(fio->page); - - major = MAJOR(inode->i_sb->s_dev); - minor = MINOR(inode->i_sb->s_dev); - - if (last_io.major == major && last_io.minor == minor && - last_io.pid == pid && - last_io.type == __file_type(inode, pid) && - last_io.fio.op == fio->op && - last_io.fio.op_flags == fio->op_flags && - last_io.fio.new_blkaddr + last_io.len == - fio->new_blkaddr) { - last_io.len++; - return; - } - - __print_last_io(); - - last_io.major = major; - last_io.minor = minor; - last_io.pid = pid; - last_io.type = __file_type(inode, pid); - last_io.fio = *fio; - last_io.len = 1; - return; -} - -void f2fs_build_trace_ios(void) -{ - spin_lock_init(&pids_lock); -} - -#define PIDVEC_SIZE 128 -static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index, - unsigned int max_items) -{ - struct radix_tree_iter iter; - void **slot; - unsigned int ret = 0; - - if (unlikely(!max_items)) - return 0; - - radix_tree_for_each_slot(slot, &pids, &iter, first_index) { - results[ret] = iter.index; - if (++ret == max_items) - break; - } - return ret; -} - -void f2fs_destroy_trace_ios(void) -{ - pid_t pid[PIDVEC_SIZE]; - pid_t next_pid = 0; - unsigned int found; - - spin_lock(&pids_lock); - while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) { - unsigned idx; - - next_pid = pid[found - 1] + 1; - for (idx = 0; idx < found; idx++) - radix_tree_delete(&pids, pid[idx]); - } - spin_unlock(&pids_lock); -} diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h deleted file mode 100644 index 789f6aa727fc..000000000000 --- a/fs/f2fs/trace.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * f2fs IO tracer - * - * Copyright (c) 2014 Motorola Mobility - * Copyright (c) 2014 Jaegeuk Kim - */ -#ifndef __F2FS_TRACE_H__ -#define __F2FS_TRACE_H__ - -#ifdef CONFIG_F2FS_IO_TRACE -#include - -enum file_type { - __NORMAL_FILE, - __DIR_FILE, - __NODE_FILE, - __META_FILE, - __ATOMIC_FILE, - __VOLATILE_FILE, - __MISC_FILE, -}; - -struct last_io_info { - int major, minor; - pid_t pid; - enum file_type type; - struct f2fs_io_info fio; - block_t len; -}; - -extern void f2fs_trace_pid(struct page *); -extern void f2fs_trace_ios(struct f2fs_io_info *, int); -extern void f2fs_build_trace_ios(void); -extern void f2fs_destroy_trace_ios(void); -#else -#define f2fs_trace_pid(p) -#define f2fs_trace_ios(i, n) -#define f2fs_build_trace_ios() -#define f2fs_destroy_trace_ios() - -#endif -#endif /* __F2FS_TRACE_H__ */ From f71475ba8c2a77fff8051903cf4b7d826c3d1693 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 21 Jan 2021 17:57:43 -0500 Subject: [PATCH 82/99] nfsd: remove unused set_client argument Every caller is setting this argument to false, so we don't need it. Also cut this comment a bit and remove an unnecessary warning. Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b7a5ac5a81ac..677586d9169a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4648,8 +4648,7 @@ static struct nfs4_client *lookup_clientid(clientid_t *clid, bool sessions, static __be32 set_client(clientid_t *clid, struct nfsd4_compound_state *cstate, - struct nfsd_net *nn, - bool sessions) + struct nfsd_net *nn) { if (cstate->clp) { if (!same_clid(&cstate->clp->cl_clientid, clid)) @@ -4659,12 +4658,10 @@ static __be32 set_client(clientid_t *clid, if (STALE_CLIENTID(clid, nn)) return nfserr_stale_clientid; /* - * For v4.1+ we get the client in the SEQUENCE op. If we don't have one - * cached already then we know this is for is for v4.0 and "sessions" - * will be false. + * We're in the 4.0 case (otherwise the SEQUENCE op would have + * set cstate->clp), so session = false: */ - WARN_ON_ONCE(cstate->session); - cstate->clp = lookup_clientid(clid, sessions, nn); + cstate->clp = lookup_clientid(clid, false, nn); if (!cstate->clp) return nfserr_expired; return nfs_ok; @@ -4688,7 +4685,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, if (open->op_file == NULL) return nfserr_jukebox; - status = set_client(clientid, cstate, nn, false); + status = set_client(clientid, cstate, nn); if (status) return status; clp = cstate->clp; @@ -5298,7 +5295,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); trace_nfsd_clid_renew(clid); - status = set_client(clid, cstate, nn, false); + status = set_client(clid, cstate, nn); if (status) return status; clp = cstate->clp; @@ -5681,7 +5678,7 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || CLOSE_STATEID(stateid)) return nfserr_bad_stateid; - status = set_client(&stateid->si_opaque.so_clid, cstate, nn, false); + status = set_client(&stateid->si_opaque.so_clid, cstate, nn); if (status == nfserr_stale_clientid) { if (cstate->session) return nfserr_bad_stateid; @@ -6905,7 +6902,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_inval; if (!nfsd4_has_session(cstate)) { - status = set_client(&lockt->lt_clientid, cstate, nn, false); + status = set_client(&lockt->lt_clientid, cstate, nn); if (status) goto out; } @@ -7089,7 +7086,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", clid->cl_boot, clid->cl_id); - status = set_client(clid, cstate, nn, false); + status = set_client(clid, cstate, nn); if (status) return status; @@ -7236,7 +7233,7 @@ nfs4_check_open_reclaim(clientid_t *clid, __be32 status; /* find clientid in conf_id_hashtbl */ - status = set_client(clid, cstate, nn, false); + status = set_client(clid, cstate, nn); if (status) return nfserr_reclaim_bad; From 1722b04624806ced51693f546edb83e8b2297a77 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 21 Jan 2021 17:57:44 -0500 Subject: [PATCH 83/99] nfsd: simplify nfsd4_check_open_reclaim The set_client() was already taken care of by process_open1(). The comments here are mostly redundant with the code. Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 3 +-- fs/nfsd/nfs4state.c | 18 +++--------------- fs/nfsd/state.h | 3 +-- 3 files changed, 5 insertions(+), 19 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 8273a99359d1..21d35b0339e6 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -428,8 +428,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; break; case NFS4_OPEN_CLAIM_PREVIOUS: - status = nfs4_check_open_reclaim(&open->op_clientid, - cstate, nn); + status = nfs4_check_open_reclaim(cstate->clp); if (status) goto out; open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 677586d9169a..e55c779dee5f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -7222,25 +7222,13 @@ nfsd4_find_reclaim_client(struct xdr_netobj name, struct nfsd_net *nn) return NULL; } -/* -* Called from OPEN. Look for clientid in reclaim list. -*/ __be32 -nfs4_check_open_reclaim(clientid_t *clid, - struct nfsd4_compound_state *cstate, - struct nfsd_net *nn) +nfs4_check_open_reclaim(struct nfs4_client *clp) { - __be32 status; - - /* find clientid in conf_id_hashtbl */ - status = set_client(clid, cstate, nn); - if (status) - return nfserr_reclaim_bad; - - if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags)) + if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags)) return nfserr_no_grace; - if (nfsd4_client_record_check(cstate->clp)) + if (nfsd4_client_record_check(clp)) return nfserr_reclaim_bad; return nfs_ok; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 9eae11a9d21c..73deea353169 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -649,8 +649,7 @@ void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *) extern void nfs4_release_reclaim(struct nfsd_net *); extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(struct xdr_netobj name, struct nfsd_net *nn); -extern __be32 nfs4_check_open_reclaim(clientid_t *clid, - struct nfsd4_compound_state *cstate, struct nfsd_net *nn); +extern __be32 nfs4_check_open_reclaim(struct nfs4_client *); extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); From ec59659b4972ec25851aa03b4b5baba6764a62e4 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 21 Jan 2021 17:57:45 -0500 Subject: [PATCH 84/99] nfsd: cstate->session->se_client -> cstate->clp I'm not sure why we're writing this out the hard way in so many places. Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 5 ++--- fs/nfsd/nfs4state.c | 16 ++++++++-------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 21d35b0339e6..acdb3cd806a1 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -378,8 +378,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * Before RECLAIM_COMPLETE done, server should deny new lock */ if (nfsd4_has_session(cstate) && - !test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, - &cstate->session->se_client->cl_flags) && + !test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) return nfserr_grace; @@ -1887,7 +1886,7 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp, nfserr = nfs_ok; if (gdp->gd_maxcount != 0) { nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, - rqstp, cstate->session->se_client, gdp); + rqstp, cstate->clp, gdp); } gdp->gd_notify_types &= ops->notify_types; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e55c779dee5f..423fd6683f3a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3891,6 +3891,7 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_reclaim_complete *rc = &u->reclaim_complete; + struct nfs4_client *clp = cstate->clp; __be32 status = 0; if (rc->rca_one_fs) { @@ -3904,12 +3905,11 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, } status = nfserr_complete_already; - if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, - &cstate->session->se_client->cl_flags)) + if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags)) goto out; status = nfserr_stale_clientid; - if (is_client_expired(cstate->session->se_client)) + if (is_client_expired(clp)) /* * The following error isn't really legal. * But we only get here if the client just explicitly @@ -3920,8 +3920,8 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, goto out; status = nfs_ok; - nfsd4_client_record_create(cstate->session->se_client); - inc_reclaim_complete(cstate->session->se_client); + nfsd4_client_record_create(clp); + inc_reclaim_complete(clp); out: return status; } @@ -5918,7 +5918,7 @@ nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfsd4_test_stateid *test_stateid = &u->test_stateid; struct nfsd4_test_stateid_id *stateid; - struct nfs4_client *cl = cstate->session->se_client; + struct nfs4_client *cl = cstate->clp; list_for_each_entry(stateid, &test_stateid->ts_stateid_list, ts_id_list) stateid->ts_id_status = @@ -5964,7 +5964,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stateid_t *stateid = &free_stateid->fr_stateid; struct nfs4_stid *s; struct nfs4_delegation *dp; - struct nfs4_client *cl = cstate->session->se_client; + struct nfs4_client *cl = cstate->clp; __be32 ret = nfserr_bad_stateid; spin_lock(&cl->cl_lock); @@ -6693,7 +6693,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (nfsd4_has_session(cstate)) /* See rfc 5661 18.10.3: given clientid is ignored: */ memcpy(&lock->lk_new_clientid, - &cstate->session->se_client->cl_clientid, + &cstate->clp->cl_clientid, sizeof(clientid_t)); /* validate and update open stateid and open seqid */ From 02591f9febd5f69bb4c266a4abf899c4cf21964f Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Thu, 28 Jan 2021 01:42:26 -0500 Subject: [PATCH 85/99] NFSv4_2: SSC helper should use its own config. Currently NFSv4_2 SSC helper, nfs_ssc, incorrectly uses GRACE_PERIOD as its config. Fix by adding new config NFS_V4_2_SSC_HELPER which depends on NFS_V4_2 and is automatically selected when NFSD_V4 is enabled. Also removed the file name from a comment in nfs_ssc.c. Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever --- fs/Kconfig | 4 ++++ fs/nfs/nfs4file.c | 4 ++++ fs/nfs/super.c | 12 ++++++++++++ fs/nfs_common/Makefile | 2 +- fs/nfs_common/nfs_ssc.c | 2 -- fs/nfsd/Kconfig | 1 + 6 files changed, 22 insertions(+), 3 deletions(-) diff --git a/fs/Kconfig b/fs/Kconfig index aa4c12282301..a55bda4233bb 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -333,6 +333,10 @@ config NFS_COMMON depends on NFSD || NFS_FS || LOCKD default y +config NFS_V4_2_SSC_HELPER + tristate + default y if NFS_V4=y || NFS_FS=y + source "net/sunrpc/Kconfig" source "fs/ceph/Kconfig" source "fs/cifs/Kconfig" diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 57b3821d975a..441a2fa073c8 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -420,7 +420,9 @@ static const struct nfs4_ssc_client_ops nfs4_ssc_clnt_ops_tbl = { */ void nfs42_ssc_register_ops(void) { +#ifdef CONFIG_NFSD_V4 nfs42_ssc_register(&nfs4_ssc_clnt_ops_tbl); +#endif } /** @@ -431,7 +433,9 @@ void nfs42_ssc_register_ops(void) */ void nfs42_ssc_unregister_ops(void) { +#ifdef CONFIG_NFSD_V4 nfs42_ssc_unregister(&nfs4_ssc_clnt_ops_tbl); +#endif } #endif /* CONFIG_NFS_V4_2 */ diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 4034102010f0..c7a924580eec 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -86,9 +86,11 @@ const struct super_operations nfs_sops = { }; EXPORT_SYMBOL_GPL(nfs_sops); +#ifdef CONFIG_NFS_V4_2 static const struct nfs_ssc_client_ops nfs_ssc_clnt_ops_tbl = { .sco_sb_deactive = nfs_sb_deactive, }; +#endif #if IS_ENABLED(CONFIG_NFS_V4) static int __init register_nfs4_fs(void) @@ -111,15 +113,21 @@ static void unregister_nfs4_fs(void) } #endif +#ifdef CONFIG_NFS_V4_2 static void nfs_ssc_register_ops(void) { +#ifdef CONFIG_NFSD_V4 nfs_ssc_register(&nfs_ssc_clnt_ops_tbl); +#endif } static void nfs_ssc_unregister_ops(void) { +#ifdef CONFIG_NFSD_V4 nfs_ssc_unregister(&nfs_ssc_clnt_ops_tbl); +#endif } +#endif /* CONFIG_NFS_V4_2 */ static struct shrinker acl_shrinker = { .count_objects = nfs_access_cache_count, @@ -148,7 +156,9 @@ int __init register_nfs_fs(void) ret = register_shrinker(&acl_shrinker); if (ret < 0) goto error_3; +#ifdef CONFIG_NFS_V4_2 nfs_ssc_register_ops(); +#endif return 0; error_3: nfs_unregister_sysctl(); @@ -168,7 +178,9 @@ void __exit unregister_nfs_fs(void) unregister_shrinker(&acl_shrinker); nfs_unregister_sysctl(); unregister_nfs4_fs(); +#ifdef CONFIG_NFS_V4_2 nfs_ssc_unregister_ops(); +#endif unregister_filesystem(&nfs_fs_type); } diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile index fa82f5aaa6d9..119c75ab9fd0 100644 --- a/fs/nfs_common/Makefile +++ b/fs/nfs_common/Makefile @@ -7,4 +7,4 @@ obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o nfs_acl-objs := nfsacl.o obj-$(CONFIG_GRACE_PERIOD) += grace.o -obj-$(CONFIG_GRACE_PERIOD) += nfs_ssc.o +obj-$(CONFIG_NFS_V4_2_SSC_HELPER) += nfs_ssc.o diff --git a/fs/nfs_common/nfs_ssc.c b/fs/nfs_common/nfs_ssc.c index f43bbb373913..7c1509e968c8 100644 --- a/fs/nfs_common/nfs_ssc.c +++ b/fs/nfs_common/nfs_ssc.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * fs/nfs_common/nfs_ssc_comm.c - * * Helper for knfsd's SSC to access ops in NFS client modules * * Author: Dai Ngo diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index dbbc583d6273..821e5913faee 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -76,6 +76,7 @@ config NFSD_V4 select CRYPTO_MD5 select CRYPTO_SHA256 select GRACE_PERIOD + select NFS_V4_2_SSC_HELPER if NFS_V4_2 help This option enables support in your system's NFS server for version 4 of the NFS protocol (RFC 3530). From 3cc55f4434b421d37300aa9a167ace7d60b45ccf Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 29 Jan 2021 14:26:29 -0500 Subject: [PATCH 86/99] nfs: use change attribute for NFS re-exports When exporting NFS, we may as well use the real change attribute returned by the original server instead of faking up a change attribute from the ctime. Note we can't do that by setting I_VERSION--that would also turn on the logic in iversion.h which treats the lower bit specially, and that doesn't make sense for NFS. So instead we define a new export operation for filesystems like NFS that want to manage the change attribute themselves. Signed-off-by: J. Bruce Fields Reviewed-by: Christoph Hellwig Signed-off-by: Chuck Lever --- fs/nfs/export.c | 18 ++++++++++++++++++ fs/nfsd/nfsfh.h | 5 ++++- include/linux/exportfs.h | 1 + 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 7412bb164fa7..f2b34cfe286c 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -167,10 +167,28 @@ nfs_get_parent(struct dentry *dentry) return parent; } +static u64 nfs_fetch_iversion(struct inode *inode) +{ + struct nfs_server *server = NFS_SERVER(inode); + + /* Is this the right call?: */ + nfs_revalidate_inode(server, inode); + /* + * Also, note we're ignoring any returned error. That seems to be + * the practice for cache consistency information elsewhere in + * the server, but I'm not sure why. + */ + if (server->nfs_client->rpc_ops->version >= 4) + return inode_peek_iversion_raw(inode); + else + return time_to_chattr(&inode->i_ctime); +} + const struct export_operations nfs_export_ops = { .encode_fh = nfs_encode_fh, .fh_to_dentry = nfs_fh_to_dentry, .get_parent = nfs_get_parent, + .fetch_iversion = nfs_fetch_iversion, .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK| EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS| EXPORT_OP_NOATOMIC_ATTR, diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index cb20c2cd3469..f58933519f38 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -12,6 +12,7 @@ #include #include #include +#include static inline __u32 ino_t_to_u32(ino_t ino) { @@ -264,7 +265,9 @@ fh_clear_wcc(struct svc_fh *fhp) static inline u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode) { - if (IS_I_VERSION(inode)) { + if (inode->i_sb->s_export_op->fetch_iversion) + return inode->i_sb->s_export_op->fetch_iversion(inode); + else if (IS_I_VERSION(inode)) { u64 chattr; chattr = stat->ctime.tv_sec; diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 9f4d4bcbf251..fe848901fcc3 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -213,6 +213,7 @@ struct export_operations { bool write, u32 *device_generation); int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, int nr_iomaps, struct iattr *iattr); + u64 (*fetch_iversion)(struct inode *); #define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ #define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ #define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */ From 428a23d2bf0ca8fd4d364a464c3e468f0e81671e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 29 Jan 2021 14:27:01 -0500 Subject: [PATCH 87/99] nfsd: skip some unnecessary stats in the v4 case In the typical case of v4 and an i_version-supporting filesystem, we can skip a stat which is only required to fake up a change attribute from ctime. Signed-off-by: J. Bruce Fields Reviewed-by: Christoph Hellwig Signed-off-by: Chuck Lever --- fs/nfsd/nfs3xdr.c | 44 +++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 00a96054280a..9d9a01ce0b27 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -364,6 +364,11 @@ encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) return encode_post_op_attr(rqstp, p, fhp); } +static bool fs_supports_change_attribute(struct super_block *sb) +{ + return sb->s_flags & SB_I_VERSION || sb->s_export_op->fetch_iversion; +} + /* * Fill in the pre_op attr for the wcc data */ @@ -372,24 +377,26 @@ void fill_pre_wcc(struct svc_fh *fhp) struct inode *inode; struct kstat stat; bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); - __be32 err; if (fhp->fh_no_wcc || fhp->fh_pre_saved) return; inode = d_inode(fhp->fh_dentry); - err = fh_getattr(fhp, &stat); - if (err) { - /* Grab the times from inode anyway */ - stat.mtime = inode->i_mtime; - stat.ctime = inode->i_ctime; - stat.size = inode->i_size; + if (fs_supports_change_attribute(inode->i_sb) || !v4) { + __be32 err = fh_getattr(fhp, &stat); + + if (err) { + /* Grab the times from inode anyway */ + stat.mtime = inode->i_mtime; + stat.ctime = inode->i_ctime; + stat.size = inode->i_size; + } + fhp->fh_pre_mtime = stat.mtime; + fhp->fh_pre_ctime = stat.ctime; + fhp->fh_pre_size = stat.size; } if (v4) fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); - fhp->fh_pre_mtime = stat.mtime; - fhp->fh_pre_ctime = stat.ctime; - fhp->fh_pre_size = stat.size; fhp->fh_pre_saved = true; } @@ -400,7 +407,6 @@ void fill_post_wcc(struct svc_fh *fhp) { bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); struct inode *inode = d_inode(fhp->fh_dentry); - __be32 err; if (fhp->fh_no_wcc) return; @@ -408,12 +414,16 @@ void fill_post_wcc(struct svc_fh *fhp) if (fhp->fh_post_saved) printk("nfsd: inode locked twice during operation.\n"); - err = fh_getattr(fhp, &fhp->fh_post_attr); - if (err) { - fhp->fh_post_saved = false; - fhp->fh_post_attr.ctime = inode->i_ctime; - } else - fhp->fh_post_saved = true; + fhp->fh_post_saved = true; + + if (fs_supports_change_attribute(inode->i_sb) || !v4) { + __be32 err = fh_getattr(fhp, &fhp->fh_post_attr); + + if (err) { + fhp->fh_post_saved = false; + fhp->fh_post_attr.ctime = inode->i_ctime; + } + } if (v4) fhp->fh_post_change = nfsd4_change_attribute(&fhp->fh_post_attr, inode); From b0ff4fe746fd028eef920ddc8c7b0361c1ede6ec Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 26 Jan 2021 17:00:42 -0800 Subject: [PATCH 88/99] f2fs: flush data when enabling checkpoint back During checkpoint=disable period, f2fs bypasses all the synchronous IOs such as sync and fsync. So, when enabling it back, we must flush all of them in order to keep the data persistent. Otherwise, suddern power-cut right after enabling checkpoint will cause data loss. Fixes: 4354994f097d ("f2fs: checkpoint disabling") Cc: stable@vger.kernel.org Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fc343243799c..429bc00af440 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1892,6 +1892,9 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) { + /* we should flush all the data to keep data consistency */ + sync_inodes_sb(sbi->sb); + down_write(&sbi->gc_lock); f2fs_dirty_to_prefree(sbi); From 25fb04dbce6a0e165d28fd1fa8a1d7018c637fe8 Mon Sep 17 00:00:00 2001 From: Yi Chen Date: Thu, 28 Jan 2021 17:02:56 +0800 Subject: [PATCH 89/99] f2fs: fix to avoid inconsistent quota data Occasionally, quota data may be corrupted detected by fsck: Info: checkpoint state = 45 : crc compacted_summary unmount [QUOTA WARNING] Usage inconsistent for ID 0:actual (1543036928, 762) != expected (1543032832, 762) [ASSERT] (fsck_chk_quota_files:1986) --> Quota file is missing or invalid quota file content found. [QUOTA WARNING] Usage inconsistent for ID 0:actual (1352478720, 344) != expected (1352474624, 344) [ASSERT] (fsck_chk_quota_files:1986) --> Quota file is missing or invalid quota file content found. [FSCK] Unreachable nat entries [Ok..] [0x0] [FSCK] SIT valid block bitmap checking [Ok..] [FSCK] Hard link checking for regular file [Ok..] [0x0] [FSCK] valid_block_count matching with CP [Ok..] [0xdf299] [FSCK] valid_node_count matcing with CP (de lookup) [Ok..] [0x2b01] [FSCK] valid_node_count matcing with CP (nat lookup) [Ok..] [0x2b01] [FSCK] valid_inode_count matched with CP [Ok..] [0x2665] [FSCK] free segment_count matched with CP [Ok..] [0xcb04] [FSCK] next block offset is free [Ok..] [FSCK] fixing SIT types [FSCK] other corrupted bugs [Fail] The root cause is: If we open file w/ readonly flag, disk quota info won't be initialized for this file, however, following mmap() will force to convert inline inode via f2fs_convert_inline_inode(), which may increase block usage for this inode w/o updating quota data, it causes inconsistent disk quota info. The issue will happen in following stack: open(file, O_RDONLY) mmap(file) - f2fs_convert_inline_inode - f2fs_convert_inline_page - f2fs_reserve_block - f2fs_reserve_new_block - f2fs_reserve_new_blocks - f2fs_i_blocks_write - dquot_claim_block inode->i_blocks increase, but the dqb_curspace keep the size for the dquots is NULL. To fix this issue, let's call dquot_initialize() anyway in both f2fs_truncate() and f2fs_convert_inline_inode() functions to avoid potential inconsistent quota data issue. Fixes: 0abd675e97e6 ("f2fs: support plain user/group quota") Signed-off-by: Daiyue Zhang Signed-off-by: Dehe Gu Signed-off-by: Junchao Jiang Signed-off-by: Ge Qiu Signed-off-by: Yi Chen Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++++ fs/f2fs/inline.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7db27c81d034..00b2ce47fa37 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -771,6 +771,10 @@ int f2fs_truncate(struct inode *inode) return -EIO; } + err = dquot_initialize(inode); + if (err) + return err; + /* we should check inline_data size */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 806ebabf5870..993caefcd2bb 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -192,6 +192,10 @@ int f2fs_convert_inline_inode(struct inode *inode) f2fs_hw_is_readonly(sbi) || f2fs_readonly(sbi->sb)) return 0; + err = dquot_initialize(inode); + if (err) + return err; + page = f2fs_grab_cache_page(inode->i_mapping, 0, false); if (!page) return -ENOMEM; From 2e0cd472a0dd9b9a35699502570015af15d7c70f Mon Sep 17 00:00:00 2001 From: Liu Song Date: Sun, 31 Jan 2021 20:26:05 +0800 Subject: [PATCH 90/99] f2fs: remove unnecessary initialization in xattr.c These variables will be explicitly assigned before use, so there is no need to initialize. Signed-off-by: Liu Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 2086bef6c154..8159fae74b9a 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -327,7 +327,7 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, void *last_addr = NULL; nid_t xnid = F2FS_I(inode)->i_xattr_nid; unsigned int inline_size = inline_xattr_size(inode); - int err = 0; + int err; if (!xnid && !inline_size) return -ENODATA; @@ -515,7 +515,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size, struct page *ipage) { struct f2fs_xattr_entry *entry = NULL; - int error = 0; + int error; unsigned int size, len; void *base_addr = NULL; int base_size; @@ -562,7 +562,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) struct inode *inode = d_inode(dentry); struct f2fs_xattr_entry *entry; void *base_addr, *last_base_addr; - int error = 0; + int error; size_t rest = buffer_size; down_read(&F2FS_I(inode)->i_xattr_sem); @@ -632,7 +632,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, int found, newsize; size_t len; __u32 new_hsize; - int error = 0; + int error; if (name == NULL) return -EINVAL; From 39f71b7e40e21805d6b15fc7750bdd9cab6a5010 Mon Sep 17 00:00:00 2001 From: Dehe Gu Date: Tue, 2 Feb 2021 17:39:22 +0800 Subject: [PATCH 91/99] f2fs: fix a wrong condition in __submit_bio We should use !F2FS_IO_ALIGNED() to check and submit_io directly. Fixes: 8223ecc456d0 ("f2fs: fix to add missing F2FS_IO_ALIGNED() condition") Reviewed-by: Chao Yu Signed-off-by: Dehe Gu Signed-off-by: Ge Qiu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 38476d0d3916..1ee966a63df9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -470,7 +470,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, if (f2fs_lfs_mode(sbi) && current->plug) blk_finish_plug(current->plug); - if (F2FS_IO_ALIGNED(sbi)) + if (!F2FS_IO_ALIGNED(sbi)) goto submit_io; start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; From c8e43d55b1aa05d175daac25d228c7c1c71c7b11 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 2 Feb 2021 16:03:11 +0800 Subject: [PATCH 92/99] f2fs: relocate inline conversion from mmap() to mkwrite() If there is page fault only for read case on inline inode, we don't need to convert inline inode, instead, let's do conversion for write case. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 00b2ce47fa37..8e53f8898688 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -72,6 +72,10 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) goto err; } + err = f2fs_convert_inline_inode(inode); + if (err) + goto err; + #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { int ret = f2fs_is_compressed_cluster(inode, page->index); @@ -506,7 +510,6 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); - int err; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; @@ -514,11 +517,6 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) if (!f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; - /* we don't need to use inline_data strictly */ - err = f2fs_convert_inline_inode(inode); - if (err) - return err; - file_accessed(file); vma->vm_ops = &f2fs_file_vm_ops; set_inode_flag(inode, FI_MMAP_FILE); From 261eeb9c1585de4515a770b48a3c89672c08ae7f Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Tue, 19 Jan 2021 09:00:42 +0900 Subject: [PATCH 93/99] f2fs: introduce checkpoint_merge mount option We've added a new mount options, "checkpoint_merge" and "nocheckpoint_merge", which creates a kernel daemon and makes it to merge concurrent checkpoint requests as much as possible to eliminate redundant checkpoint issues. Plus, we can eliminate the sluggish issue caused by slow checkpoint operation when the checkpoint is done in a process context in a cgroup having low i/o budget and cpu shares. To make this do better, we set the default i/o priority of the kernel daemon to "3", to give one higher priority than other kernel threads. The below verification result explains this. The basic idea has come from https://opensource.samsung.com. [Verification] Android Pixel Device(ARM64, 7GB RAM, 256GB UFS) Create two I/O cgroups (fg w/ weight 100, bg w/ wight 20) Set "strict_guarantees" to "1" in BFQ tunables In "fg" cgroup, - thread A => trigger 1000 checkpoint operations "for i in `seq 1 1000`; do touch test_dir1/file; fsync test_dir1; done" - thread B => gererating async. I/O "fio --rw=write --numjobs=1 --bs=128k --runtime=3600 --time_based=1 --filename=test_img --name=test" In "bg" cgroup, - thread C => trigger repeated checkpoint operations "echo $$ > /dev/blkio/bg/tasks; while true; do touch test_dir2/file; fsync test_dir2; done" We've measured thread A's execution time. [ w/o patch ] Elapsed Time: Avg. 68 seconds [ w/ patch ] Elapsed Time: Avg. 48 seconds Reported-by: kernel test robot Reported-by: Dan Carpenter [Jaegeuk Kim: fix the return value in f2fs_start_ckpt_thread, reported by Dan] Signed-off-by: Daeho Jeong Signed-off-by: Sungjong Seo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 11 ++ fs/f2fs/checkpoint.c | 177 +++++++++++++++++++++++++++++ fs/f2fs/debug.c | 12 ++ fs/f2fs/f2fs.h | 27 +++++ fs/f2fs/super.c | 58 ++++++++-- 5 files changed, 277 insertions(+), 8 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 5eff4009e77e..f75ec244762f 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -247,6 +247,17 @@ checkpoint=%s[:%u[%]] Set to "disable" to turn off checkpointing. Set to "enabl hide up to all remaining free space. The actual space that would be unusable can be viewed at /sys/fs/f2fs//unusable This space is reclaimed once checkpoint=enable. +checkpoint_merge When checkpoint is enabled, this can be used to create a kernel + daemon and make it to merge concurrent checkpoint requests as + much as possible to eliminate redundant checkpoint issues. Plus, + we can eliminate the sluggish issue caused by slow checkpoint + operation when the checkpoint is done in a process context in + a cgroup having low i/o budget and cpu shares. To make this + do better, we set the default i/o priority of the kernel daemon + to "3", to give one higher priority than other kernel threads. + This is the same way to give a I/O priority to the jbd2 + journaling thread of ext4 filesystem. +nocheckpoint_merge Disable checkpoint merge feature. compress_algorithm=%s Control compress algorithm, currently f2fs supports "lzo", "lz4", "zstd" and "lzo-rle" algorithm. compress_algorithm=%s:%d Control compress algorithm and its compress level, now, only diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 8c79ba0566b1..6f6ecba8f920 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -13,12 +13,15 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" #include "segment.h" #include +#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) + static struct kmem_cache *ino_entry_slab; struct kmem_cache *f2fs_inode_entry_slab; @@ -1704,3 +1707,177 @@ void f2fs_destroy_checkpoint_caches(void) kmem_cache_destroy(ino_entry_slab); kmem_cache_destroy(f2fs_inode_entry_slab); } + +static int __write_checkpoint_sync(struct f2fs_sb_info *sbi) +{ + struct cp_control cpc = { .reason = CP_SYNC, }; + int err; + + down_write(&sbi->gc_lock); + err = f2fs_write_checkpoint(sbi, &cpc); + up_write(&sbi->gc_lock); + + return err; +} + +static void __checkpoint_and_complete_reqs(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + struct ckpt_req *req, *next; + struct llist_node *dispatch_list; + u64 sum_diff = 0, diff, count = 0; + int ret; + + dispatch_list = llist_del_all(&cprc->issue_list); + if (!dispatch_list) + return; + dispatch_list = llist_reverse_order(dispatch_list); + + ret = __write_checkpoint_sync(sbi); + atomic_inc(&cprc->issued_ckpt); + + llist_for_each_entry_safe(req, next, dispatch_list, llnode) { + diff = (u64)ktime_ms_delta(ktime_get(), req->queue_time); + req->ret = ret; + complete(&req->wait); + + sum_diff += diff; + count++; + } + atomic_sub(count, &cprc->queued_ckpt); + atomic_add(count, &cprc->total_ckpt); + + spin_lock(&cprc->stat_lock); + cprc->cur_time = (unsigned int)div64_u64(sum_diff, count); + if (cprc->peak_time < cprc->cur_time) + cprc->peak_time = cprc->cur_time; + spin_unlock(&cprc->stat_lock); +} + +static int issue_checkpoint_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct ckpt_req_control *cprc = &sbi->cprc_info; + wait_queue_head_t *q = &cprc->ckpt_wait_queue; +repeat: + if (kthread_should_stop()) + return 0; + + sb_start_intwrite(sbi->sb); + + if (!llist_empty(&cprc->issue_list)) + __checkpoint_and_complete_reqs(sbi); + + sb_end_intwrite(sbi->sb); + + wait_event_interruptible(*q, + kthread_should_stop() || !llist_empty(&cprc->issue_list)); + goto repeat; +} + +static void flush_remained_ckpt_reqs(struct f2fs_sb_info *sbi, + struct ckpt_req *wait_req) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + if (!llist_empty(&cprc->issue_list)) { + __checkpoint_and_complete_reqs(sbi); + } else { + /* already dispatched by issue_checkpoint_thread */ + if (wait_req) + wait_for_completion(&wait_req->wait); + } +} + +static void init_ckpt_req(struct ckpt_req *req) +{ + memset(req, 0, sizeof(struct ckpt_req)); + + init_completion(&req->wait); + req->queue_time = ktime_get(); +} + +int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + struct ckpt_req req; + struct cp_control cpc; + + cpc.reason = __get_cp_reason(sbi); + if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) { + int ret; + + down_write(&sbi->gc_lock); + ret = f2fs_write_checkpoint(sbi, &cpc); + up_write(&sbi->gc_lock); + + return ret; + } + + if (!cprc->f2fs_issue_ckpt) + return __write_checkpoint_sync(sbi); + + init_ckpt_req(&req); + + llist_add(&req.llnode, &cprc->issue_list); + atomic_inc(&cprc->queued_ckpt); + + /* update issue_list before we wake up issue_checkpoint thread */ + smp_mb(); + + if (waitqueue_active(&cprc->ckpt_wait_queue)) + wake_up(&cprc->ckpt_wait_queue); + + if (cprc->f2fs_issue_ckpt) + wait_for_completion(&req.wait); + else + flush_remained_ckpt_reqs(sbi, &req); + + return req.ret; +} + +int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + struct ckpt_req_control *cprc = &sbi->cprc_info; + + if (cprc->f2fs_issue_ckpt) + return 0; + + cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi, + "f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(cprc->f2fs_issue_ckpt)) { + cprc->f2fs_issue_ckpt = NULL; + return -ENOMEM; + } + + set_task_ioprio(cprc->f2fs_issue_ckpt, DEFAULT_CHECKPOINT_IOPRIO); + + return 0; +} + +void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + if (cprc->f2fs_issue_ckpt) { + struct task_struct *ckpt_task = cprc->f2fs_issue_ckpt; + + cprc->f2fs_issue_ckpt = NULL; + kthread_stop(ckpt_task); + + flush_remained_ckpt_reqs(sbi, NULL); + } +} + +void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + atomic_set(&cprc->issued_ckpt, 0); + atomic_set(&cprc->total_ckpt, 0); + atomic_set(&cprc->queued_ckpt, 0); + init_waitqueue_head(&cprc->ckpt_wait_queue); + init_llist_head(&cprc->issue_list); + spin_lock_init(&cprc->stat_lock); +} diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 197c914119da..91855d5721cd 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -120,6 +120,13 @@ static void update_general_status(struct f2fs_sb_info *sbi) atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks; } + si->nr_issued_ckpt = atomic_read(&sbi->cprc_info.issued_ckpt); + si->nr_total_ckpt = atomic_read(&sbi->cprc_info.total_ckpt); + si->nr_queued_ckpt = atomic_read(&sbi->cprc_info.queued_ckpt); + spin_lock(&sbi->cprc_info.stat_lock); + si->cur_ckpt_time = sbi->cprc_info.cur_time; + si->peak_ckpt_time = sbi->cprc_info.peak_time; + spin_unlock(&sbi->cprc_info.stat_lock); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -417,6 +424,11 @@ static int stat_show(struct seq_file *s, void *v) si->meta_count[META_NAT]); seq_printf(s, " - ssa blocks : %u\n", si->meta_count[META_SSA]); + seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, " + "Cur time: %4d(ms), Peak time: %4d(ms))\n", + si->nr_queued_ckpt, si->nr_issued_ckpt, + si->nr_total_ckpt, si->cur_ckpt_time, + si->peak_ckpt_time); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d (%d)\n", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a2e520a13630..f7536aca8a31 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -97,6 +97,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000 #define F2FS_MOUNT_NORECOVERY 0x04000000 #define F2FS_MOUNT_ATGC 0x08000000 +#define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -267,6 +268,25 @@ struct fsync_node_entry { unsigned int seq_id; /* sequence id */ }; +struct ckpt_req { + struct completion wait; /* completion for checkpoint done */ + struct llist_node llnode; /* llist_node to be linked in wait queue */ + int ret; /* return code of checkpoint */ + ktime_t queue_time; /* request queued time */ +}; + +struct ckpt_req_control { + struct task_struct *f2fs_issue_ckpt; /* checkpoint task */ + wait_queue_head_t ckpt_wait_queue; /* waiting queue for wake-up */ + atomic_t issued_ckpt; /* # of actually issued ckpts */ + atomic_t total_ckpt; /* # of total ckpts */ + atomic_t queued_ckpt; /* # of queued ckpts */ + struct llist_head issue_list; /* list for command issue */ + spinlock_t stat_lock; /* lock for below checkpoint time stats */ + unsigned int cur_time; /* cur wait time in msec for currently issued checkpoint */ + unsigned int peak_time; /* peak wait time in msec until now */ +}; + /* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ @@ -1433,6 +1453,7 @@ struct f2fs_sb_info { wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ + struct ckpt_req_control cprc_info; /* for checkpoint request control */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ @@ -3450,6 +3471,10 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi); int __init f2fs_create_checkpoint_caches(void); void f2fs_destroy_checkpoint_caches(void); +int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi); +int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi); +void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi); +void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi); /* * data.c @@ -3562,6 +3587,8 @@ struct f2fs_stat_info { int nr_discarding, nr_discarded; int nr_discard_cmd; unsigned int undiscard_blks; + int nr_issued_ckpt, nr_total_ckpt, nr_queued_ckpt; + unsigned int cur_ckpt_time, peak_ckpt_time; int inline_xattr, inline_inode, inline_dir, append, update, orphans; int compr_inode; unsigned long long compr_blocks; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 429bc00af440..1000d21120ca 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -144,6 +144,8 @@ enum { Opt_checkpoint_disable_cap, Opt_checkpoint_disable_cap_perc, Opt_checkpoint_enable, + Opt_checkpoint_merge, + Opt_nocheckpoint_merge, Opt_compress_algorithm, Opt_compress_log_size, Opt_compress_extension, @@ -214,6 +216,8 @@ static match_table_t f2fs_tokens = { {Opt_checkpoint_disable_cap, "checkpoint=disable:%u"}, {Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"}, {Opt_checkpoint_enable, "checkpoint=enable"}, + {Opt_checkpoint_merge, "checkpoint_merge"}, + {Opt_nocheckpoint_merge, "nocheckpoint_merge"}, {Opt_compress_algorithm, "compress_algorithm=%s"}, {Opt_compress_log_size, "compress_log_size=%u"}, {Opt_compress_extension, "compress_extension=%s"}, @@ -941,6 +945,12 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_checkpoint_enable: clear_opt(sbi, DISABLE_CHECKPOINT); break; + case Opt_checkpoint_merge: + set_opt(sbi, MERGE_CHECKPOINT); + break; + case Opt_nocheckpoint_merge: + clear_opt(sbi, MERGE_CHECKPOINT); + break; #ifdef CONFIG_F2FS_FS_COMPRESSION case Opt_compress_algorithm: if (!f2fs_sb_has_compression(sbi)) { @@ -1340,6 +1350,12 @@ static void f2fs_put_super(struct super_block *sb) /* prevent remaining shrinker jobs */ mutex_lock(&sbi->umount_mutex); + /* + * flush all issued checkpoints and stop checkpoint issue thread. + * after then, all checkpoints should be done by each process context. + */ + f2fs_stop_ckpt_thread(sbi); + /* * We don't need to do checkpoint when superblock is clean. * But, the previous checkpoint was not done by umount, it needs to do @@ -1438,15 +1454,9 @@ int f2fs_sync_fs(struct super_block *sb, int sync) if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return -EAGAIN; - if (sync) { - struct cp_control cpc; + if (sync) + err = f2fs_issue_checkpoint(sbi); - cpc.reason = __get_cp_reason(sbi); - - down_write(&sbi->gc_lock); - err = f2fs_write_checkpoint(sbi, &cpc); - up_write(&sbi->gc_lock); - } return err; } @@ -1770,6 +1780,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, DISABLE_CHECKPOINT)) seq_printf(seq, ",checkpoint=disable:%u", F2FS_OPTION(sbi).unusable_cap); + if (test_opt(sbi, MERGE_CHECKPOINT)) + seq_puts(seq, ",checkpoint_merge"); + else + seq_puts(seq, ",nocheckpoint_merge"); if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_POSIX) seq_printf(seq, ",fsync_mode=%s", "posix"); else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) @@ -2053,6 +2067,19 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } } + if (!test_opt(sbi, DISABLE_CHECKPOINT) && + test_opt(sbi, MERGE_CHECKPOINT)) { + err = f2fs_start_ckpt_thread(sbi); + if (err) { + f2fs_err(sbi, + "Failed to start F2FS issue_checkpoint_thread (%d)", + err); + goto restore_gc; + } + } else { + f2fs_stop_ckpt_thread(sbi); + } + /* * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. @@ -3804,6 +3831,19 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) f2fs_init_fsync_node_info(sbi); + /* setup checkpoint request control and start checkpoint issue thread */ + f2fs_init_ckpt_req_control(sbi); + if (!test_opt(sbi, DISABLE_CHECKPOINT) && + test_opt(sbi, MERGE_CHECKPOINT)) { + err = f2fs_start_ckpt_thread(sbi); + if (err) { + f2fs_err(sbi, + "Failed to start F2FS issue_checkpoint_thread (%d)", + err); + goto stop_ckpt_thread; + } + } + /* setup f2fs internal modules */ err = f2fs_build_segment_manager(sbi); if (err) { @@ -4013,6 +4053,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) free_sm: f2fs_destroy_segment_manager(sbi); f2fs_destroy_post_read_wq(sbi); +stop_ckpt_thread: + f2fs_stop_ckpt_thread(sbi); free_devices: destroy_device_list(sbi); kvfree(sbi->ckpt); From e65920661708b7c0f3db45c9cd5d0095034ee37f Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Thu, 21 Jan 2021 22:45:29 +0900 Subject: [PATCH 94/99] f2fs: add ckpt_thread_ioprio sysfs node Added "ckpt_thread_ioprio" sysfs node to give a way to change checkpoint merge daemon's io priority. Its default value is "be,3", which means "BE" I/O class and I/O priority "3". We can select the class between "rt" and "be", and set the I/O priority within valid range of it. "," delimiter is necessary in between I/O class and priority number. Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 9 ++++ fs/f2fs/checkpoint.c | 3 +- fs/f2fs/f2fs.h | 1 + fs/f2fs/sysfs.c | 55 +++++++++++++++++++++++++ 4 files changed, 67 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 362803901614..cbeac1bebe2f 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -400,3 +400,12 @@ Description: Show status of f2fs superblock in real time. 0x1000 SBI_QUOTA_NEED_REPAIR quota file may be corrupted 0x2000 SBI_IS_RESIZEFS resizefs is in process ====== ===================== ================================= + +What: /sys/fs/f2fs//ckpt_thread_ioprio +Date: January 2021 +Contact: "Daeho Jeong" +Description: Give a way to change checkpoint merge daemon's io priority. + Its default value is "be,3", which means "BE" I/O class and + I/O priority "3". We can select the class between "rt" and "be", + and set the I/O priority within valid range of it. "," delimiter + is necessary in between I/O class and priority number. diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6f6ecba8f920..579b9c3603cc 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1851,7 +1851,7 @@ int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) return -ENOMEM; } - set_task_ioprio(cprc->f2fs_issue_ckpt, DEFAULT_CHECKPOINT_IOPRIO); + set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio); return 0; } @@ -1877,6 +1877,7 @@ void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi) atomic_set(&cprc->issued_ckpt, 0); atomic_set(&cprc->total_ckpt, 0); atomic_set(&cprc->queued_ckpt, 0); + cprc->ckpt_thread_ioprio = DEFAULT_CHECKPOINT_IOPRIO; init_waitqueue_head(&cprc->ckpt_wait_queue); init_llist_head(&cprc->issue_list); spin_lock_init(&cprc->stat_lock); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f7536aca8a31..2860003a09ed 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -277,6 +277,7 @@ struct ckpt_req { struct ckpt_req_control { struct task_struct *f2fs_issue_ckpt; /* checkpoint task */ + int ckpt_thread_ioprio; /* checkpoint merge thread ioprio */ wait_queue_head_t ckpt_wait_queue; /* waiting queue for wake-up */ atomic_t issued_ckpt; /* # of actually issued ckpts */ atomic_t total_ckpt; /* # of total ckpts */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index f39874d512ea..e38a7f6921dd 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -34,6 +35,7 @@ enum { FAULT_INFO_TYPE, /* struct f2fs_fault_info */ #endif RESERVED_BLOCKS, /* struct f2fs_sb_info */ + CPRC_INFO, /* struct ckpt_req_control */ }; struct f2fs_attr { @@ -70,6 +72,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) else if (struct_type == STAT_INFO) return (unsigned char *)F2FS_STAT(sbi); #endif + else if (struct_type == CPRC_INFO) + return (unsigned char *)&sbi->cprc_info; return NULL; } @@ -261,6 +265,23 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, return len; } + if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) { + struct ckpt_req_control *cprc = &sbi->cprc_info; + int len = 0; + int class = IOPRIO_PRIO_CLASS(cprc->ckpt_thread_ioprio); + int data = IOPRIO_PRIO_DATA(cprc->ckpt_thread_ioprio); + + if (class == IOPRIO_CLASS_RT) + len += scnprintf(buf + len, PAGE_SIZE - len, "rt,"); + else if (class == IOPRIO_CLASS_BE) + len += scnprintf(buf + len, PAGE_SIZE - len, "be,"); + else + return -EINVAL; + + len += scnprintf(buf + len, PAGE_SIZE - len, "%d\n", data); + return len; + } + ui = (unsigned int *)(ptr + a->offset); return sprintf(buf, "%u\n", *ui); @@ -314,6 +335,38 @@ static ssize_t __sbi_store(struct f2fs_attr *a, return ret ? ret : count; } + if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) { + const char *name = strim((char *)buf); + struct ckpt_req_control *cprc = &sbi->cprc_info; + int class; + long data; + int ret; + + if (!strncmp(name, "rt,", 3)) + class = IOPRIO_CLASS_RT; + else if (!strncmp(name, "be,", 3)) + class = IOPRIO_CLASS_BE; + else + return -EINVAL; + + name += 3; + ret = kstrtol(name, 10, &data); + if (ret) + return ret; + if (data >= IOPRIO_BE_NR || data < 0) + return -EINVAL; + + cprc->ckpt_thread_ioprio = IOPRIO_PRIO_VALUE(class, data); + if (test_opt(sbi, MERGE_CHECKPOINT)) { + ret = set_task_ioprio(cprc->f2fs_issue_ckpt, + cprc->ckpt_thread_ioprio); + if (ret) + return ret; + } + + return count; + } + ui = (unsigned int *)(ptr + a->offset); ret = kstrtoul(skip_spaces(buf), 0, &t); @@ -573,6 +626,7 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); +F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio); F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); @@ -658,6 +712,7 @@ static struct attribute *f2fs_attrs[] = { #endif ATTR_LIST(data_io_flag), ATTR_LIST(node_io_flag), + ATTR_LIST(ckpt_thread_ioprio), ATTR_LIST(dirty_segments), ATTR_LIST(free_segments), ATTR_LIST(unusable), From d50dfc0c7df7bf037442045fbe63952ae0c4ce46 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 8 Feb 2021 13:42:21 -0800 Subject: [PATCH 95/99] f2fs: don't grab superblock freeze for flush/ckpt thread There are controlled by f2fs_freeze(). This fixes xfstests/generic/068 which is stuck at task:f2fs_ckpt-252:3 state:D stack: 0 pid: 5761 ppid: 2 flags:0x00004000 Call Trace: __schedule+0x44c/0x8a0 schedule+0x4f/0xc0 percpu_rwsem_wait+0xd8/0x140 ? percpu_down_write+0xf0/0xf0 __percpu_down_read+0x56/0x70 issue_checkpoint_thread+0x12c/0x160 [f2fs] ? wait_woken+0x80/0x80 kthread+0x114/0x150 ? __checkpoint_and_complete_reqs+0x110/0x110 [f2fs] ? kthread_park+0x90/0x90 ret_from_fork+0x22/0x30 Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ---- fs/f2fs/segment.c | 4 ---- fs/f2fs/super.c | 4 ++++ 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 579b9c3603cc..174a0819ad96 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1763,13 +1763,9 @@ static int issue_checkpoint_thread(void *data) if (kthread_should_stop()) return 0; - sb_start_intwrite(sbi->sb); - if (!llist_empty(&cprc->issue_list)) __checkpoint_and_complete_reqs(sbi); - sb_end_intwrite(sbi->sb); - wait_event_interruptible(*q, kthread_should_stop() || !llist_empty(&cprc->issue_list)); goto repeat; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7d34f1cacdee..440634dfaa56 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -607,8 +607,6 @@ static int issue_flush_thread(void *data) if (kthread_should_stop()) return 0; - sb_start_intwrite(sbi->sb); - if (!llist_empty(&fcc->issue_list)) { struct flush_cmd *cmd, *next; int ret; @@ -629,8 +627,6 @@ static int issue_flush_thread(void *data) fcc->dispatch_list = NULL; } - sb_end_intwrite(sbi->sb); - wait_event_interruptible(*q, kthread_should_stop() || !llist_empty(&fcc->issue_list)); goto repeat; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1000d21120ca..4aa533cb4340 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1472,6 +1472,10 @@ static int f2fs_freeze(struct super_block *sb) /* must be clean, since sync_filesystem() was already called */ if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) return -EINVAL; + + /* ensure no checkpoint required */ + if (!llist_empty(&F2FS_SB(sb)->cprc_info.issue_list)) + return -EINVAL; return 0; } From bde545295b710bdd13a0fcd4b9fddd2383eeeb3a Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 20 Jan 2021 09:30:16 +0800 Subject: [PATCH 96/99] erofs: fix shift-out-of-bounds of blkszbits syzbot generated a crafted bitszbits which can be shifted out-of-bounds[1]. So directly print unsupported blkszbits instead of blksize. [1] https://lore.kernel.org/r/000000000000c72ddd05b9444d2f@google.com Link: https://lore.kernel.org/r/20210120013016.14071-1-hsiangkao@aol.com Reported-by: syzbot+c68f467cd7c45860e8d4@syzkaller.appspotmail.com Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index be10b16ea66e..d5a6b9b888a5 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -158,8 +158,8 @@ static int erofs_read_superblock(struct super_block *sb) blkszbits = dsb->blkszbits; /* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */ if (blkszbits != LOG_BLOCK_SIZE) { - erofs_err(sb, "blksize %u isn't supported on this platform", - 1 << blkszbits); + erofs_err(sb, "blkszbits %u isn't supported on this platform", + blkszbits); goto out; } From ce063129181312f8781a047a50be439c5859747b Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 9 Feb 2021 21:06:18 +0800 Subject: [PATCH 97/99] erofs: initialized fields can only be observed after bit is set Currently, although set_bit() & test_bit() pairs are used as a fast- path for initialized configurations. However, these atomic ops are actually relaxed forms. Instead, load-acquire & store-release form is needed to make sure uninitialized fields won't be observed in advance here (yet no such corresponding bitops so use full barriers instead.) Link: https://lore.kernel.org/r/20210209130618.15838-1-hsiangkao@aol.com Fixes: 62dc45979f3f ("staging: erofs: fix race of initializing xattrs of a inode at the same time") Fixes: 152a333a5895 ("staging: erofs: add compacted compression indexes support") Cc: # 5.3+ Reported-by: Huang Jianan Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/xattr.c | 10 +++++++++- fs/erofs/zmap.c | 10 +++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 5bde77d70852..47314a26767a 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -48,8 +48,14 @@ static int init_inode_xattrs(struct inode *inode) int ret = 0; /* the most case is that xattrs of this inode are initialized. */ - if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags)) + if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags)) { + /* + * paired with smp_mb() at the end of the function to ensure + * fields will only be observed after the bit is set. + */ + smp_mb(); return 0; + } if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_XATTR_BIT, TASK_KILLABLE)) return -ERESTARTSYS; @@ -137,6 +143,8 @@ static int init_inode_xattrs(struct inode *inode) } xattr_iter_end(&it, atomic_map); + /* paired with smp_mb() at the beginning of the function. */ + smp_mb(); set_bit(EROFS_I_EA_INITED_BIT, &vi->flags); out_unlock: diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index ae325541884e..14d2de35110c 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -36,8 +36,14 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) void *kaddr; struct z_erofs_map_header *h; - if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) + if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) { + /* + * paired with smp_mb() at the end of the function to ensure + * fields will only be observed after the bit is set. + */ + smp_mb(); return 0; + } if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE)) return -ERESTARTSYS; @@ -83,6 +89,8 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits + ((h->h_clusterbits >> 5) & 7); + /* paired with smp_mb() at the beginning of the function */ + smp_mb(); set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); unmap_done: kunmap_atomic(kaddr); From 938a184265d75ea474f1c6fe1da96a5196163789 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 12 Feb 2021 14:09:54 -0800 Subject: [PATCH 98/99] f2fs: give a warning only for readonly partition Let's allow mounting readonly partition. We're able to recovery later once we have it as read-write back. Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4aa533cb4340..30d5abef4361 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3933,12 +3933,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) * previous checkpoint was not done by clean system shutdown. */ if (f2fs_hw_is_readonly(sbi)) { - if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { - err = -EROFS; + if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) f2fs_err(sbi, "Need to recover fsync data, but write access unavailable"); - goto free_meta; - } - f2fs_info(sbi, "write access unavailable, skipping recovery"); + else + f2fs_info(sbi, "write access unavailable, skipping recovery"); goto reset_checkpoint; } From 092af2eb180062f5bafe02a75da9856676eb4f89 Mon Sep 17 00:00:00 2001 From: Ed Tsai Date: Thu, 4 Feb 2021 21:25:56 +0800 Subject: [PATCH 99/99] Documentation: f2fs: fix typo s/automaic/automatic Fix typo in f2fs documentation. Signed-off-by: Ed Tsai Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index f75ec244762f..81c05baa8312 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -847,7 +847,7 @@ This is the default option. f2fs does automatic compression in the writeback of compression enabled files. 2) compress_mode=user -This disables the automaic compression and gives the user discretion of choosing the +This disables the automatic compression and gives the user discretion of choosing the target file and the timing. The user can do manual compression/decompression on the compression enabled files using F2FS_IOC_DECOMPRESS_FILE and F2FS_IOC_COMPRESS_FILE ioctls like the below.