From a485ea9e3ef31ac4e3a2245cdb11fa73352b950f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 13 Jan 2025 18:31:24 -0500 Subject: [PATCH 1/3] tracing: Fix irqsoff and wakeup latency tracers when using function graph The function graph tracer has become generic so that kretprobes and BPF can use it along with function graph tracing itself. Some of the infrastructure was specific for function graph tracing such as recording the calltime and return time of the functions. Calling the clock code on a high volume function does add overhead. The calculation of the calltime was removed from the generic code and placed into the function graph tracer itself so that the other users did not incur this overhead as they did not need that timestamp. The calltime field was still kept in the generic return entry structure and the function graph return entry callback filled it as that structure was passed to other code. But this broke both irqsoff and wakeup latency tracer as they still depended on the trace structure containing the calltime when the option display-graph is set as it used some of those same functions that the function graph tracer used. But now the calltime was not set and was just zero. This caused the calculation of the function time to be the absolute value of the return timestamp and not the length of the function. # cd /sys/kernel/tracing # echo 1 > options/display-graph # echo irqsoff > current_tracer The tracers went from: # REL TIME CPU TASK/PID |||| DURATION FUNCTION CALLS # | | | | |||| | | | | | | 0 us | 4) -0 | d..1. | 0.000 us | irqentry_enter(); 3 us | 4) -0 | d..2. | | irq_enter_rcu() { 4 us | 4) -0 | d..2. | 0.431 us | preempt_count_add(); 5 us | 4) -0 | d.h2. | | tick_irq_enter() { 5 us | 4) -0 | d.h2. | 0.433 us | tick_check_oneshot_broadcast_this_cpu(); 6 us | 4) -0 | d.h2. | 2.426 us | ktime_get(); 9 us | 4) -0 | d.h2. | | tick_nohz_stop_idle() { 10 us | 4) -0 | d.h2. | 0.398 us | nr_iowait_cpu(); 11 us | 4) -0 | d.h1. | 1.903 us | } 11 us | 4) -0 | d.h2. | | tick_do_update_jiffies64() { 12 us | 4) -0 | d.h2. | | _raw_spin_lock() { 12 us | 4) -0 | d.h2. | 0.360 us | preempt_count_add(); 13 us | 4) -0 | d.h3. | 0.354 us | do_raw_spin_lock(); 14 us | 4) -0 | d.h2. | 2.207 us | } 15 us | 4) -0 | d.h3. | 0.428 us | calc_global_load(); 16 us | 4) -0 | d.h3. | | _raw_spin_unlock() { 16 us | 4) -0 | d.h3. | 0.380 us | do_raw_spin_unlock(); 17 us | 4) -0 | d.h3. | 0.334 us | preempt_count_sub(); 18 us | 4) -0 | d.h1. | 1.768 us | } 18 us | 4) -0 | d.h2. | | update_wall_time() { [..] To: # REL TIME CPU TASK/PID |||| DURATION FUNCTION CALLS # | | | | |||| | | | | | | 0 us | 5) -0 | d.s2. | 0.000 us | _raw_spin_lock_irqsave(); 0 us | 5) -0 | d.s3. | 312159583 us | preempt_count_add(); 2 us | 5) -0 | d.s4. | 312159585 us | do_raw_spin_lock(); 3 us | 5) -0 | d.s4. | | _raw_spin_unlock() { 3 us | 5) -0 | d.s4. | 312159586 us | do_raw_spin_unlock(); 4 us | 5) -0 | d.s4. | 312159587 us | preempt_count_sub(); 4 us | 5) -0 | d.s2. | 312159587 us | } 5 us | 5) -0 | d.s3. | | _raw_spin_lock() { 5 us | 5) -0 | d.s3. | 312159588 us | preempt_count_add(); 6 us | 5) -0 | d.s4. | 312159589 us | do_raw_spin_lock(); 7 us | 5) -0 | d.s3. | 312159590 us | } 8 us | 5) -0 | d.s4. | 312159591 us | calc_wheel_index(); 9 us | 5) -0 | d.s4. | | enqueue_timer() { 9 us | 5) -0 | d.s4. | | wake_up_nohz_cpu() { 11 us | 5) -0 | d.s4. | | native_smp_send_reschedule() { 11 us | 5) -0 | d.s4. | 312171987 us | default_send_IPI_single_phys(); 12408 us | 5) -0 | d.s3. | 312171990 us | } 12408 us | 5) -0 | d.s3. | 312171991 us | } 12409 us | 5) -0 | d.s3. | 312171991 us | } Where the calculation of the time for each function was the return time minus zero and not the time of when the function returned. Have these tracers also save the calltime in the fgraph data section and retrieve it again on the return to get the correct timings again. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mark Rutland Link: https://lore.kernel.org/20250113183124.61767419@gandalf.local.home Fixes: f1f36e22bee9 ("ftrace: Have calltime be saved in the fgraph storage") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_irqsoff.c | 14 ++++++++++++++ kernel/trace/trace_sched_wakeup.c | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index fce064e20570..a4e799c1e767 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -182,6 +182,7 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace, struct trace_array_cpu *data; unsigned long flags; unsigned int trace_ctx; + u64 *calltime; int ret; if (ftrace_graph_ignore_func(gops, trace)) @@ -199,6 +200,12 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace, if (!func_prolog_dec(tr, &data, &flags)) return 0; + calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime)); + if (!calltime) + return 0; + + *calltime = trace_clock_local(); + trace_ctx = tracing_gen_ctx_flags(flags); ret = __trace_graph_entry(tr, trace, trace_ctx); atomic_dec(&data->disabled); @@ -213,12 +220,19 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace, struct trace_array_cpu *data; unsigned long flags; unsigned int trace_ctx; + u64 *calltime; + int size; ftrace_graph_addr_finish(gops, trace); if (!func_prolog_dec(tr, &data, &flags)) return; + calltime = fgraph_retrieve_data(gops->idx, &size); + if (!calltime) + return; + trace->calltime = *calltime; + trace_ctx = tracing_gen_ctx_flags(flags); __trace_graph_return(tr, trace, trace_ctx); atomic_dec(&data->disabled); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index d6c7f18daa15..c58292e424d5 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -118,6 +118,7 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace, struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; unsigned int trace_ctx; + u64 *calltime; int ret = 0; if (ftrace_graph_ignore_func(gops, trace)) @@ -135,6 +136,12 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace, if (!func_prolog_preempt_disable(tr, &data, &trace_ctx)) return 0; + calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime)); + if (!calltime) + return 0; + + *calltime = trace_clock_local(); + ret = __trace_graph_entry(tr, trace, trace_ctx); atomic_dec(&data->disabled); preempt_enable_notrace(); @@ -148,12 +155,19 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace, struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; unsigned int trace_ctx; + u64 *calltime; + int size; ftrace_graph_addr_finish(gops, trace); if (!func_prolog_preempt_disable(tr, &data, &trace_ctx)) return; + calltime = fgraph_retrieve_data(gops->idx, &size); + if (!calltime) + return; + trace->calltime = *calltime; + __trace_graph_return(tr, trace, trace_ctx); atomic_dec(&data->disabled); From 24e0e610400aa2c765ee87b1b3c1401732b20fc5 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Fri, 3 Jan 2025 15:06:47 +0530 Subject: [PATCH 2/3] tracing: Print lazy preemption model Print lazy preemption model in ftrace header when latency-format=1. # cat /sys/kernel/debug/sched/preempt none voluntary full (lazy) Without patch: latency: 0 us, #232946/232946, CPU#40 | (M:unknown VP:0, KP:0, SP:0 HP:0 #P:80) ^^^^^^^ With Patch: latency: 0 us, #1897938/25566788, CPU#16 | (M:lazy VP:0, KP:0, SP:0 HP:0 #P:80) ^^^^ Now that lazy preemption is part of the kernel, make sure the tracing infrastructure reflects that. Link: https://lore.kernel.org/20250103093647.575919-1-sshegde@linux.ibm.com Signed-off-by: Shrikanth Hegde Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f8aebcb01e62..b6e40e8791fa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4122,6 +4122,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) preempt_model_none() ? "server" : preempt_model_voluntary() ? "desktop" : preempt_model_full() ? "preempt" : + preempt_model_lazy() ? "lazy" : preempt_model_rt() ? "preempt_rt" : "unknown", /* These are reserved for later use */ From 94d529a3255ce65496e932173d7f13e801170597 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Jan 2025 10:12:02 -0500 Subject: [PATCH 3/3] ftrace: Document that multiple function_graph tracing may have different times The function graph tracer now calculates the calltime internally and for each instance. If there are two instances that are running function graph tracer and are tracing the same functions, the timings of the length of those functions may be slightly different: # trace-cmd record -B foo -p function_graph -B bar -p function_graph sleep 5 # trace-cmd report [..] bar: sleep-981 [000] ...1. 1101.109027: funcgraph_entry: 0.764 us | mutex_unlock(); (ret=0xffff8abcc256c300) foo: sleep-981 [000] ...1. 1101.109028: funcgraph_entry: 0.748 us | mutex_unlock(); (ret=0xffff8abcc256c300) bar: sleep-981 [000] ..... 1101.109029: funcgraph_exit: 2.456 us | } (ret=0xffff8abcc256c300) foo: sleep-981 [000] ..... 1101.109029: funcgraph_exit: 2.403 us | } (ret=0xffff8abcc256c300) bar: sleep-981 [000] d..1. 1101.109031: funcgraph_entry: 0.844 us | fpregs_assert_state_consistent(); (ret=0x0) foo: sleep-981 [000] d..1. 1101.109032: funcgraph_entry: 0.803 us | fpregs_assert_state_consistent(); (ret=0x0) Link: https://lore.kernel.org/all/20250114101806.b2778cb01f34f5be9d23ad98@kernel.org/ Cc: Mark Rutland Cc: Mathieu Desnoyers Suggested-by: Masami Hiramatsu Link: https://lore.kernel.org/20250114101202.02e7bc68@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/ftrace.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index 272464bb7c60..2b74f96d09d5 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -810,6 +810,12 @@ Here is the list of current tracers that may be configured. to draw a graph of function calls similar to C code source. + Note that the function graph calculates the timings of when the + function starts and returns internally and for each instance. If + there are two instances that run function graph tracer and traces + the same functions, the length of the timings may be slightly off as + each read the timestamp separately and not at the same time. + "blk" The block tracer. The tracer used by the blktrace user