rseq: Expose lightweight statistics in debugfs

Analyzing the call frequency without resorting to tracing is helpful for
understanding this infrastructure. The overhead is minimal as it just
increments a per-CPU counter associated with each operation.

The debugfs readout provides a racy sum of all counters.
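For illustration, reading the file might look roughly like this (a sketch:
the path assumes debugfs is mounted at /sys/kernel/debug, and the numbers
are made up):

    # cat /sys/kernel/debug/rseq/stats
    exit:            4711042
    signal:            23917
    slowp:             88431
    ids:               91205
    cs:                91205
    clear:             90117
    fixup:              1088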

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.027916598@linutronix.de
commit 5412910487 (parent dab344753e)
Author:    Thomas Gleixner, 2025-10-27 09:44:52 +01:00
Committer: Ingo Molnar
4 changed files with 135 additions and 25 deletions

include/linux/rseq.h

@@ -29,21 +29,6 @@ static inline void rseq_sched_switch_event(struct task_struct *t)
}
}
static __always_inline void rseq_exit_to_user_mode(void)
{
struct rseq_event *ev = &current->rseq.event;
if (IS_ENABLED(CONFIG_DEBUG_RSEQ))
WARN_ON_ONCE(ev->sched_switch);
/*
* Ensure that event (especially user_irq) is cleared when the
* interrupt did not result in a schedule and therefore the
* rseq processing did not clear it.
*/
ev->events = 0;
}
/*
* KVM/HYPERV invoke resume_user_mode_work() before entering guest mode,
* which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in
@@ -92,7 +77,6 @@ static inline void rseq_sched_switch_event(struct task_struct *t) { }
static inline void rseq_virt_userspace_exit(void) { }
static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
static inline void rseq_execve(struct task_struct *t) { }
static inline void rseq_exit_to_user_mode(void) { }
#endif /* !CONFIG_RSEQ */
#ifdef CONFIG_DEBUG_RSEQ

include/linux/rseq_entry.h

@@ -2,6 +2,37 @@
#ifndef _LINUX_RSEQ_ENTRY_H
#define _LINUX_RSEQ_ENTRY_H
/* Must be outside the CONFIG_RSEQ guard to resolve the stubs */
#ifdef CONFIG_RSEQ_STATS
#include <linux/percpu.h>
struct rseq_stats {
unsigned long exit;
unsigned long signal;
unsigned long slowpath;
unsigned long ids;
unsigned long cs;
unsigned long clear;
unsigned long fixup;
};
DECLARE_PER_CPU(struct rseq_stats, rseq_stats);
/*
* Slow path has interrupts and preemption enabled, but the fast path
* runs with interrupts disabled so there is no point in having the
* preemption checks implied in __this_cpu_inc() for every operation.
*/
#ifdef RSEQ_BUILD_SLOW_PATH
#define rseq_stat_inc(which) this_cpu_inc((which))
#else
#define rseq_stat_inc(which) raw_cpu_inc((which))
#endif
#else /* CONFIG_RSEQ_STATS */
#define rseq_stat_inc(x) do { } while (0)
#endif /* !CONFIG_RSEQ_STATS */
#ifdef CONFIG_RSEQ
#include <linux/rseq.h>
@@ -39,8 +70,26 @@ static __always_inline void rseq_note_user_irq_entry(void)
current->rseq.event.user_irq = true;
}
static __always_inline void rseq_exit_to_user_mode(void)
{
struct rseq_event *ev = &current->rseq.event;
rseq_stat_inc(rseq_stats.exit);
if (IS_ENABLED(CONFIG_DEBUG_RSEQ))
WARN_ON_ONCE(ev->sched_switch);
/*
* Ensure that event (especially user_irq) is cleared when the
* interrupt did not result in a schedule and therefore the
* rseq processing did not clear it.
*/
ev->events = 0;
}
#else /* CONFIG_RSEQ */
static inline void rseq_note_user_irq_entry(void) { }
static inline void rseq_exit_to_user_mode(void) { }
#endif /* !CONFIG_RSEQ */
#endif /* _LINUX_RSEQ_ENTRY_H */
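
A minimal sketch (not part of this patch) of how a compilation unit selects
the slow-path counter variant: defining RSEQ_BUILD_SLOW_PATH before including
the header makes rseq_stat_inc() expand to the preemption-safe this_cpu_inc(),
while fast-path users get raw_cpu_inc(). The file and function names below are
hypothetical:

    /* hypothetical-slowpath.c - compiled with interrupts/preemption enabled */
    #define RSEQ_BUILD_SLOW_PATH
    #include <linux/rseq_entry.h>

    static void hypothetical_slowpath_op(void)
    {
            /* Expands to this_cpu_inc(rseq_stats.slowpath) here */
            rseq_stat_inc(rseq_stats.slowpath);
    }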

init/Kconfig

@@ -1913,6 +1913,18 @@ config RSEQ
If unsure, say Y.
config RSEQ_STATS
default n
bool "Enable lightweight statistics of restartable sequences" if EXPERT
depends on RSEQ && DEBUG_FS
help
Enable lightweight counters which expose information about the
frequency of RSEQ operations via debugfs. Mostly interesting for
kernel debugging or performance analysis. While lightweight it's
still adding code into the user/kernel mode transitions.
If unsure, say N.
config DEBUG_RSEQ
default n
bool "Enable debugging of rseq() system call" if EXPERT

kernel/rseq.c

@@ -67,12 +67,16 @@
* F1. <failure>
*/
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/rseq_entry.h>
#include <linux/types.h>
/* Required to select the proper per_cpu ops for rseq_stat_inc() */
#define RSEQ_BUILD_SLOW_PATH
#include <linux/debugfs.h>
#include <linux/ratelimit.h>
#include <linux/rseq_entry.h>
#include <linux/sched.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <linux/types.h>
#include <asm/ptrace.h>
#define CREATE_TRACE_POINTS
@@ -108,6 +112,56 @@ void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
}
#endif /* CONFIG_TRACEPOINTS */
#ifdef CONFIG_RSEQ_STATS
DEFINE_PER_CPU(struct rseq_stats, rseq_stats);
static int rseq_debug_show(struct seq_file *m, void *p)
{
struct rseq_stats stats = { };
unsigned int cpu;
for_each_possible_cpu(cpu) {
stats.exit += data_race(per_cpu(rseq_stats.exit, cpu));
stats.signal += data_race(per_cpu(rseq_stats.signal, cpu));
stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu));
stats.ids += data_race(per_cpu(rseq_stats.ids, cpu));
stats.cs += data_race(per_cpu(rseq_stats.cs, cpu));
stats.clear += data_race(per_cpu(rseq_stats.clear, cpu));
stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu));
}
seq_printf(m, "exit: %16lu\n", stats.exit);
seq_printf(m, "signal: %16lu\n", stats.signal);
seq_printf(m, "slowp: %16lu\n", stats.slowpath);
seq_printf(m, "ids: %16lu\n", stats.ids);
seq_printf(m, "cs: %16lu\n", stats.cs);
seq_printf(m, "clear: %16lu\n", stats.clear);
seq_printf(m, "fixup: %16lu\n", stats.fixup);
return 0;
}
static int rseq_debug_open(struct inode *inode, struct file *file)
{
return single_open(file, rseq_debug_show, inode->i_private);
}
static const struct file_operations dfs_ops = {
.open = rseq_debug_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static int __init rseq_debugfs_init(void)
{
struct dentry *root_dir = debugfs_create_dir("rseq", NULL);
debugfs_create_file("stats", 0444, root_dir, NULL, &dfs_ops);
return 0;
}
__initcall(rseq_debugfs_init);
#endif /* CONFIG_RSEQ_STATS */
#ifdef CONFIG_DEBUG_RSEQ
static struct rseq *rseq_kernel_fields(struct task_struct *t)
{
@@ -187,12 +241,13 @@ static int rseq_update_cpu_node_id(struct task_struct *t)
u32 node_id = cpu_to_node(cpu_id);
u32 mm_cid = task_mm_cid(t);
/*
* Validate read-only rseq fields.
*/
rseq_stat_inc(rseq_stats.ids);
/* Validate read-only rseq fields on debug kernels */
if (rseq_validate_ro_fields(t))
goto efault;
WARN_ON_ONCE((int) mm_cid < 0);
if (!user_write_access_begin(rseq, t->rseq.len))
goto efault;
@@ -403,6 +458,8 @@ static int rseq_ip_fixup(struct pt_regs *regs, bool abort)
struct rseq_cs rseq_cs;
int ret;
rseq_stat_inc(rseq_stats.cs);
ret = rseq_get_rseq_cs(t, &rseq_cs);
if (ret)
return ret;
@@ -412,8 +469,10 @@ static int rseq_ip_fixup(struct pt_regs *regs, bool abort)
* If not nested over a rseq critical section, restart is useless.
* Clear the rseq_cs pointer and return.
*/
if (!in_rseq_cs(ip, &rseq_cs))
if (!in_rseq_cs(ip, &rseq_cs)) {
rseq_stat_inc(rseq_stats.clear);
return clear_rseq_cs(t->rseq.usrptr);
}
ret = rseq_check_flags(t, rseq_cs.flags);
if (ret < 0)
return ret;
@@ -422,6 +481,7 @@ static int rseq_ip_fixup(struct pt_regs *regs, bool abort)
ret = clear_rseq_cs(t->rseq.usrptr);
if (ret)
return ret;
rseq_stat_inc(rseq_stats.fixup);
trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
rseq_cs.abort_ip);
instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
@@ -462,6 +522,11 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
if (unlikely(t->flags & PF_EXITING))
return;
if (ksig)
rseq_stat_inc(rseq_stats.signal);
else
rseq_stat_inc(rseq_stats.slowpath);
/*
* Read and clear the event pending bit first. If the task
* was not preempted or migrated or a signal is on the way,