From e83588458f656417d9b7ac45baf1c7b45790059b Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 5 Mar 2025 13:36:41 +0100 Subject: [PATCH 1/4] file: add fput and file_ref_put routines optimized for use when closing a fd Vast majority of the time closing a file descriptor also operates on the last reference, where a regular fput usage will result in 2 atomics. This can be changed to only suffer 1. See commentary above file_ref_put_close() for more information. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250305123644.554845-2-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/file.c | 41 ++++++++++++---------- fs/file_table.c | 74 +++++++++++++++++++++++++++------------- fs/internal.h | 3 ++ include/linux/file_ref.h | 34 ++++++++++++++++++ 4 files changed, 111 insertions(+), 41 deletions(-) diff --git a/fs/file.c b/fs/file.c index d868cdb95d1e..3fef798b96e5 100644 --- a/fs/file.c +++ b/fs/file.c @@ -26,6 +26,28 @@ #include "internal.h" +bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt) +{ + /* + * If the reference count was already in the dead zone, then this + * put() operation is imbalanced. Warn, put the reference count back to + * DEAD and tell the caller to not deconstruct the object. + */ + if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) { + atomic_long_set(&ref->refcnt, FILE_REF_DEAD); + return false; + } + + /* + * This is a put() operation on a saturated refcount. Restore the + * mean saturation value and tell the caller to not deconstruct the + * object. + */ + if (cnt > FILE_REF_MAXREF) + atomic_long_set(&ref->refcnt, FILE_REF_SATURATED); + return false; +} + /** * __file_ref_put - Slowpath of file_ref_put() * @ref: Pointer to the reference count @@ -67,24 +89,7 @@ bool __file_ref_put(file_ref_t *ref, unsigned long cnt) return true; } - /* - * If the reference count was already in the dead zone, then this - * put() operation is imbalanced. Warn, put the reference count back to - * DEAD and tell the caller to not deconstruct the object. - */ - if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) { - atomic_long_set(&ref->refcnt, FILE_REF_DEAD); - return false; - } - - /* - * This is a put() operation on a saturated refcount. Restore the - * mean saturation value and tell the caller to not deconstruct the - * object. - */ - if (cnt > FILE_REF_MAXREF) - atomic_long_set(&ref->refcnt, FILE_REF_SATURATED); - return false; + return __file_ref_put_badval(ref, cnt); } EXPORT_SYMBOL_GPL(__file_ref_put); diff --git a/fs/file_table.c b/fs/file_table.c index f0291a66f9db..18debb7bd285 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -495,30 +495,36 @@ void flush_delayed_fput(void) } EXPORT_SYMBOL_GPL(flush_delayed_fput); +static void __fput_deferred(struct file *file) +{ + struct task_struct *task = current; + + if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) { + file_free(file); + return; + } + + if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { + init_task_work(&file->f_task_work, ____fput); + if (!task_work_add(task, &file->f_task_work, TWA_RESUME)) + return; + /* + * After this task has run exit_task_work(), + * task_work_add() will fail. Fall through to delayed + * fput to avoid leaking *file. + */ + } + + if (llist_add(&file->f_llist, &delayed_fput_list)) + schedule_delayed_work(&delayed_fput_work, 1); +} + void fput(struct file *file) { - if (file_ref_put(&file->f_ref)) { - struct task_struct *task = current; - - if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) { - file_free(file); - return; - } - if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { - init_task_work(&file->f_task_work, ____fput); - if (!task_work_add(task, &file->f_task_work, TWA_RESUME)) - return; - /* - * After this task has run exit_task_work(), - * task_work_add() will fail. Fall through to delayed - * fput to avoid leaking *file. - */ - } - - if (llist_add(&file->f_llist, &delayed_fput_list)) - schedule_delayed_work(&delayed_fput_work, 1); - } + if (unlikely(file_ref_put(&file->f_ref))) + __fput_deferred(file); } +EXPORT_SYMBOL(fput); /* * synchronous analog of fput(); for kernel threads that might be needed @@ -533,10 +539,32 @@ void __fput_sync(struct file *file) if (file_ref_put(&file->f_ref)) __fput(file); } - -EXPORT_SYMBOL(fput); EXPORT_SYMBOL(__fput_sync); +/* + * Equivalent to __fput_sync(), but optimized for being called with the last + * reference. + * + * See file_ref_put_close() for details. + */ +void fput_close_sync(struct file *file) +{ + if (likely(file_ref_put_close(&file->f_ref))) + __fput(file); +} + +/* + * Equivalent to fput(), but optimized for being called with the last + * reference. + * + * See file_ref_put_close() for details. + */ +void fput_close(struct file *file) +{ + if (file_ref_put_close(&file->f_ref)) + __fput_deferred(file); +} + void __init files_init(void) { struct kmem_cache_args args = { diff --git a/fs/internal.h b/fs/internal.h index e7f02ae1e098..05c817f39a28 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -118,6 +118,9 @@ static inline void put_file_access(struct file *file) } } +void fput_close_sync(struct file *); +void fput_close(struct file *); + /* * super.c */ diff --git a/include/linux/file_ref.h b/include/linux/file_ref.h index 9b3a8d9b17ab..6ef92d765a66 100644 --- a/include/linux/file_ref.h +++ b/include/linux/file_ref.h @@ -61,6 +61,7 @@ static inline void file_ref_init(file_ref_t *ref, unsigned long cnt) atomic_long_set(&ref->refcnt, cnt - 1); } +bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt); bool __file_ref_put(file_ref_t *ref, unsigned long cnt); /** @@ -160,6 +161,39 @@ static __always_inline __must_check bool file_ref_put(file_ref_t *ref) return __file_ref_put(ref, cnt); } +/** + * file_ref_put_close - drop a reference expecting it would transition to FILE_REF_NOREF + * @ref: Pointer to the reference count + * + * Semantically it is equivalent to calling file_ref_put(), but it trades lower + * performance in face of other CPUs also modifying the refcount for higher + * performance when this happens to be the last reference. + * + * For the last reference file_ref_put() issues 2 atomics. One to drop the + * reference and another to transition it to FILE_REF_DEAD. This routine does + * the work in one step, but in order to do it has to pre-read the variable which + * decreases scalability. + * + * Use with close() et al, stick to file_ref_put() by default. + */ +static __always_inline __must_check bool file_ref_put_close(file_ref_t *ref) +{ + long old, new; + + old = atomic_long_read(&ref->refcnt); + do { + if (unlikely(old < 0)) + return __file_ref_put_badval(ref, old); + + if (old == FILE_REF_ONEREF) + new = FILE_REF_DEAD; + else + new = old - 1; + } while (!atomic_long_try_cmpxchg(&ref->refcnt, &old, new)); + + return new == FILE_REF_DEAD; +} + /** * file_ref_read - Read the number of file references * @ref: Pointer to the reference count From 3e46a92a27c2927fcef996ba06cbe299da629c28 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 5 Mar 2025 13:36:42 +0100 Subject: [PATCH 2/4] fs: use fput_close_sync() in close() This bumps open+close rate by 1% on Sapphire Rapids by eliding one atomic. It would be higher if it was not for several other slowdowns of the same nature. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250305123644.554845-3-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/open.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/open.c b/fs/open.c index a8a5f843e3cf..47615d0e1d9c 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1576,7 +1576,7 @@ SYSCALL_DEFINE1(close, unsigned int, fd) * We're returning to user space. Don't bother * with any delayed fput() cases. */ - __fput_sync(file); + fput_close_sync(file); if (likely(retval == 0)) return 0; From a914bd93f3edfedcdd59deb615e8dd1b3643cac5 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 5 Mar 2025 13:36:43 +0100 Subject: [PATCH 3/4] fs: use fput_close() in filp_close() When tracing a kernel build over refcounts seen this is a wash: @[kprobe:filp_close]: [0] 32195 |@@@@@@@@@@ | [1] 164567 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| I verified vast majority of the skew comes from do_close_on_exec() which could be changed to use a different variant instead. Even without changing that, the 19.5% of calls which got here still can save the extra atomic. Calls here are borderline non-existent compared to fput (over 3.2 mln!), so they should not negatively affect scalability. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250305123644.554845-4-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/open.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/open.c b/fs/open.c index 47615d0e1d9c..39a89df7c37e 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1550,7 +1550,7 @@ int filp_close(struct file *filp, fl_owner_t id) int retval; retval = filp_flush(filp, id); - fput(filp); + fput_close(filp); return retval; } From 606623de503f4eaa4ca505650bbd2da1f5fb5e6a Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 5 Mar 2025 13:36:44 +0100 Subject: [PATCH 4/4] fs: use fput_close() in path_openat() This bumps failing open rate by 1.7% on Sapphire Rapids by avoiding one atomic. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250305123644.554845-5-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namei.c b/fs/namei.c index 3ab9440c5b93..355a37097769 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3995,7 +3995,7 @@ static struct file *path_openat(struct nameidata *nd, WARN_ON(1); error = -EINVAL; } - fput(file); + fput_close(file); if (error == -EOPENSTALE) { if (flags & LOOKUP_RCU) error = -ECHILD;