diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 9a8ec2290f68..ea6474db8a31 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -963,3 +963,27 @@ Description: This sysfs entry can be used to change type of injected timeout: 0x00000003 Simulate Non-IO type sleep time 0x00000004 Simulate runnable time ========== =============================== + +What: /sys/fs/f2fs/<disk>/adjust_lock_priority +Date: January 2026 +Contact: "Chao Yu" <chao@kernel.org> +Description: This sysfs entry is a bitmask that can be used to enable/disable priority adjustment + for a task which is in a critical region covered by the selected lock(s). + ========== ================== + Flag_Value Flag_Description + ========== ================== + 0x00000000 Disabled (default) + 0x00000001 cp_rwsem + 0x00000002 node_change + 0x00000004 node_write + 0x00000008 gc_lock + 0x00000010 cp_global + 0x00000020 io_rwsem + ========== ================== + +What: /sys/fs/f2fs/<disk>/lock_duration_priority +Date: January 2026 +Contact: "Chao Yu" <chao@kernel.org> +Description: f2fs can tune the priority of a thread which has entered a critical region covered by + an f2fs rw semaphore lock. This sysfs entry can be used to control the priority value; the + range is [100,139], and the default value is 120. 
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 5172396c0b01..2f5a03e29d0b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -90,16 +90,72 @@ static inline void trace_lock_elapsed_time_end(struct f2fs_rwsem *sem, runnable_time, io_sleep_time, other_time); } +static bool need_uplift_priority(struct f2fs_rwsem *sem, bool is_write) +{ + if (!(sem->sbi->adjust_lock_priority & BIT(sem->name - 1))) + return false; + + switch (sem->name) { + /* + * The writer of these locks is checkpoint, which already has high + * priority, so only uplift the priority of readers. + */ + case LOCK_NAME_CP_RWSEM: + case LOCK_NAME_NODE_CHANGE: + case LOCK_NAME_NODE_WRITE: + return !is_write; + case LOCK_NAME_GC_LOCK: + case LOCK_NAME_CP_GLOBAL: + case LOCK_NAME_IO_RWSEM: + return true; + default: + f2fs_bug_on(sem->sbi, 1); + } + return false; +} + +static void uplift_priority(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc, + bool is_write) +{ + lc->need_restore = false; + if (!sem->sbi->adjust_lock_priority) + return; + if (rt_task(current)) + return; + if (!need_uplift_priority(sem, is_write)) + return; + lc->orig_nice = task_nice(current); + lc->new_nice = PRIO_TO_NICE(sem->sbi->lock_duration_priority); + if (lc->orig_nice <= lc->new_nice) + return; + set_user_nice(current, lc->new_nice); + lc->need_restore = true; +} + +static void restore_priority(struct f2fs_lock_context *lc) +{ + if (!lc->need_restore) + return; + /* nice value is no longer the one we set, do not overwrite it */ + if (task_nice(current) != lc->new_nice) + return; + set_user_nice(current, lc->orig_nice); +} + void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) { + uplift_priority(sem, lc, false); f2fs_down_read(sem); trace_lock_elapsed_time_start(sem, lc); } int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) { - if (!f2fs_down_read_trylock(sem)) + uplift_priority(sem, lc, false); + if (!f2fs_down_read_trylock(sem)) { + restore_priority(lc); return 0; + } 
trace_lock_elapsed_time_start(sem, lc); return 1; } @@ -107,19 +163,24 @@ int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_contex void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) { f2fs_up_read(sem); + restore_priority(lc); trace_lock_elapsed_time_end(sem, lc, false); } void f2fs_down_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) { + uplift_priority(sem, lc, true); f2fs_down_write(sem); trace_lock_elapsed_time_start(sem, lc); } int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) { - if (!f2fs_down_write_trylock(sem)) + uplift_priority(sem, lc, true); + if (!f2fs_down_write_trylock(sem)) { + restore_priority(lc); return 0; + } trace_lock_elapsed_time_start(sem, lc); return 1; } @@ -127,6 +188,7 @@ int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_conte void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) { f2fs_up_write(sem); + restore_priority(lc); trace_lock_elapsed_time_end(sem, lc, true); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 29f81a496b72..a6e7368fc40a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -185,6 +185,7 @@ enum f2fs_lock_name { LOCK_NAME_GC_LOCK, LOCK_NAME_CP_GLOBAL, LOCK_NAME_IO_RWSEM, + LOCK_NAME_MAX, }; enum f2fs_timeout_type { @@ -1447,7 +1448,10 @@ struct f2fs_time_stat { struct f2fs_lock_context { struct f2fs_time_stat ts; + int orig_nice; + int new_nice; bool lock_trace; + bool need_restore; }; struct f2fs_gc_control { @@ -1588,6 +1592,8 @@ enum node_type { /* a threshold of maximum elapsed time in critical region to print tracepoint */ #define MAX_LOCK_ELAPSED_TIME 500 +#define F2FS_DEFAULT_TASK_PRIORITY (DEFAULT_PRIO) + static inline int f2fs_test_bit(unsigned int nr, char *addr); static inline void f2fs_set_bit(unsigned int nr, char *addr); static inline void f2fs_clear_bit(unsigned int nr, char *addr); @@ -1998,6 +2004,12 @@ struct f2fs_sb_info { /* max elapsed 
time threshold in critical region that lock covered */ unsigned long long max_lock_elapsed_time; + /* bitmask of locks for which task priority is adjusted in critical region */ + unsigned int adjust_lock_priority; + + /* priority applied to task which is in critical region covered by lock */ + unsigned int lock_duration_priority; + #ifdef CONFIG_F2FS_FS_COMPRESSION struct kmem_cache *page_array_slab; /* page array entry */ unsigned int page_array_slab_size; /* default page array slab size */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9d421a07d2d5..d5cf7265e5d3 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4338,6 +4338,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) spin_lock_init(&sbi->gc_remaining_trials_lock); atomic64_set(&sbi->current_atomic_write, 0); sbi->max_lock_elapsed_time = MAX_LOCK_ELAPSED_TIME; + sbi->adjust_lock_priority = 0; + sbi->lock_duration_priority = F2FS_DEFAULT_TASK_PRIORITY; sbi->sum_blocksize = f2fs_sb_has_packed_ssa(sbi) ? 4096 : sbi->blocksize; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index d01a2664a250..3a272e7edf23 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -955,6 +955,20 @@ static ssize_t __sbi_store(struct f2fs_attr *a, return count; } + if (!strcmp(a->attr.name, "adjust_lock_priority")) { + if (t >= BIT(LOCK_NAME_MAX - 1)) + return -EINVAL; + sbi->adjust_lock_priority = t; + return count; + } + + if (!strcmp(a->attr.name, "lock_duration_priority")) { + if (t < NICE_TO_PRIO(MIN_NICE) || t > NICE_TO_PRIO(MAX_NICE)) + return -EINVAL; + sbi->lock_duration_priority = t; + return count; + } + __sbi_store_value(a, sbi, ptr + a->offset, t); return count; @@ -1272,6 +1286,8 @@ F2FS_SBI_GENERAL_RW_ATTR(carve_out); F2FS_SBI_GENERAL_RW_ATTR(reserved_pin_section); F2FS_SBI_GENERAL_RW_ATTR(bggc_io_aware); F2FS_SBI_GENERAL_RW_ATTR(max_lock_elapsed_time); +F2FS_SBI_GENERAL_RW_ATTR(lock_duration_priority); +F2FS_SBI_GENERAL_RW_ATTR(adjust_lock_priority); /* STAT_INFO ATTR */ #ifdef CONFIG_F2FS_STAT_FS @@ 
-1478,6 +1494,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(allocate_section_hint), ATTR_LIST(allocate_section_policy), ATTR_LIST(max_lock_elapsed_time), + ATTR_LIST(lock_duration_priority), + ATTR_LIST(adjust_lock_priority), NULL, }; ATTRIBUTE_GROUPS(f2fs);