From 6f86bdeab633a56d5c6dccf1a2c5989b6a5e323e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Feb 2025 16:39:44 -0500 Subject: [PATCH 1/3] tracing: Fix bad hist from corrupting named_triggers list The following commands causes a crash: ~# cd /sys/kernel/tracing/events/rcu/rcu_callback ~# echo 'hist:name=bad:keys=common_pid:onmax(bogus).save(common_pid)' > trigger bash: echo: write error: Invalid argument ~# echo 'hist:name=bad:keys=common_pid' > trigger Because the following occurs: event_trigger_write() { trigger_process_regex() { event_hist_trigger_parse() { data = event_trigger_alloc(..); event_trigger_register(.., data) { cmd_ops->reg(.., data, ..) [hist_register_trigger()] { data->ops->init() [event_hist_trigger_init()] { save_named_trigger(name, data) { list_add(&data->named_list, &named_triggers); } } } } ret = create_actions(); (return -EINVAL) if (ret) goto out_unreg; [..] ret = hist_trigger_enable(data, ...) { list_add_tail_rcu(&data->list, &file->triggers); <<<---- SKIPPED!!! (this is important!) [..] out_unreg: event_hist_unregister(.., data) { cmd_ops->unreg(.., data, ..) [hist_unregister_trigger()] { list_for_each_entry(iter, &file->triggers, list) { if (!hist_trigger_match(data, iter, named_data, false)) <- never matches continue; [..] test = iter; } if (test && test->ops->free) <<<-- test is NULL test->ops->free(test) [event_hist_trigger_free()] { [..] if (data->name) del_named_trigger(data) { list_del(&data->named_list); <<<<-- NEVER gets removed! } } } } [..] kfree(data); <<<-- frees item but it is still on list The next time a hist with name is registered, it causes an u-a-f bug and the kernel can crash. Move the code around such that if event_trigger_register() succeeds, the next thing called is hist_trigger_enable() which adds it to the list. A bunch of actions is called if get_named_trigger_data() returns false. But that doesn't need to be called after event_trigger_register(), so it can be moved up, allowing event_trigger_register() to be called just before hist_trigger_enable() keeping them together and allowing the file->triggers to be properly populated. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250227163944.1c37f85f@gandalf.local.home Fixes: 067fe038e70f6 ("tracing: Add variable reference handling to hist triggers") Reported-by: Tomas Glozar Tested-by: Tomas Glozar Reviewed-by: Tom Zanussi Closes: https://lore.kernel.org/all/CAP4=nvTsxjckSBTz=Oe_UYh8keD9_sZC4i++4h72mJLic4_W4A@mail.gmail.com/ Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 34 ++++++++++++++++---------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 261163b00137..ad7419e24055 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6724,27 +6724,27 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, if (existing_hist_update_only(glob, trigger_data, file)) goto out_free; + if (!get_named_trigger_data(trigger_data)) { + + ret = create_actions(hist_data); + if (ret) + goto out_free; + + if (has_hist_vars(hist_data) || hist_data->n_var_refs) { + ret = save_hist_vars(hist_data); + if (ret) + goto out_free; + } + + ret = tracing_map_init(hist_data->map); + if (ret) + goto out_free; + } + ret = event_trigger_register(cmd_ops, file, glob, trigger_data); if (ret < 0) goto out_free; - if (get_named_trigger_data(trigger_data)) - goto enable; - - ret = create_actions(hist_data); - if (ret) - goto out_unreg; - - if (has_hist_vars(hist_data) || hist_data->n_var_refs) { - ret = save_hist_vars(hist_data); - if (ret) - goto out_unreg; - } - - ret = tracing_map_init(hist_data->map); - if (ret) - goto out_unreg; -enable: ret = hist_trigger_enable(trigger_data, file); if (ret) goto out_unreg; From 3908b6baf2ac20138915b5ca98338b4f063954d8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 26 Feb 2025 15:27:03 +0100 Subject: [PATCH 2/3] selftests/ftrace: Let fprobe test consider already enabled functions The fprobe test fails on Fedora 41 since the fprobe test assumption that the number of enabled_functions is zero before the test starts is not necessarily true. Some user space tools, like systemd, add BPF programs that attach to functions. Those will show up in the enabled_functions table and must be taken into account by the fprobe test. Therefore count the number of lines of enabled_functions before tests start, and use that as base when comparing expected results. Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Alexander Gordeev Link: https://lore.kernel.org/20250226142703.910860-1-hca@linux.ibm.com Fixes: e85c5e9792b9 ("selftests/ftrace: Update fprobe test to check enabled_functions file") Signed-off-by: Heiko Carstens Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- .../test.d/dynevent/add_remove_fprobe.tc | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc index 449f9d8be746..73f6c6fcecab 100644 --- a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc +++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc @@ -10,12 +10,16 @@ PLACE=$FUNCTION_FORK PLACE2="kmem_cache_free" PLACE3="schedule_timeout" +# Some functions may have BPF programs attached, therefore +# count already enabled_functions before tests start +ocnt=`cat enabled_functions | wc -l` + echo "f:myevent1 $PLACE" >> dynamic_events # Make sure the event is attached and is the only one grep -q $PLACE enabled_functions cnt=`cat enabled_functions | wc -l` -if [ $cnt -ne 1 ]; then +if [ $cnt -ne $((ocnt + 1)) ]; then exit_fail fi @@ -23,7 +27,7 @@ echo "f:myevent2 $PLACE%return" >> dynamic_events # It should till be the only attached function cnt=`cat enabled_functions | wc -l` -if [ $cnt -ne 1 ]; then +if [ $cnt -ne $((ocnt + 1)) ]; then exit_fail fi @@ -32,7 +36,7 @@ echo "f:myevent3 $PLACE2" >> dynamic_events grep -q $PLACE2 enabled_functions cnt=`cat enabled_functions | wc -l` -if [ $cnt -ne 2 ]; then +if [ $cnt -ne $((ocnt + 2)) ]; then exit_fail fi @@ -49,7 +53,7 @@ grep -q myevent1 dynamic_events # should still have 2 left cnt=`cat enabled_functions | wc -l` -if [ $cnt -ne 2 ]; then +if [ $cnt -ne $((ocnt + 2)) ]; then exit_fail fi @@ -57,7 +61,7 @@ echo > dynamic_events # Should have none left cnt=`cat enabled_functions | wc -l` -if [ $cnt -ne 0 ]; then +if [ $cnt -ne $ocnt ]; then exit_fail fi @@ -65,7 +69,7 @@ echo "f:myevent4 $PLACE" >> dynamic_events # Should only have one enabled cnt=`cat enabled_functions | wc -l` -if [ $cnt -ne 1 ]; then +if [ $cnt -ne $((ocnt + 1)) ]; then exit_fail fi @@ -73,7 +77,7 @@ echo > dynamic_events # Should have none left cnt=`cat enabled_functions | wc -l` -if [ $cnt -ne 0 ]; then +if [ $cnt -ne $ocnt ]; then exit_fail fi From a1a7eb89ca0b89dc1c326eeee2596f263291aca3 Mon Sep 17 00:00:00 2001 From: Nikolay Kuratov Date: Thu, 6 Feb 2025 12:01:56 +0300 Subject: [PATCH 3/3] ftrace: Avoid potential division by zero in function_stat_show() Check whether denominator expression x * (x - 1) * 1000 mod {2^32, 2^64} produce zero and skip stddev computation in that case. For now don't care about rec->counter * rec->counter overflow because rec->time * rec->time overflow will likely happen earlier. Cc: stable@vger.kernel.org Cc: Wen Yang Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250206090156.1561783-1-kniv@yandex-team.ru Fixes: e31f7939c1c27 ("ftrace: Avoid potential division by zero in function profiler") Signed-off-by: Nikolay Kuratov Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6b0c25761ccb..fc88e0688daf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -540,6 +540,7 @@ static int function_stat_show(struct seq_file *m, void *v) static struct trace_seq s; unsigned long long avg; unsigned long long stddev; + unsigned long long stddev_denom; #endif guard(mutex)(&ftrace_profile_lock); @@ -559,23 +560,19 @@ static int function_stat_show(struct seq_file *m, void *v) #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_puts(m, " "); - /* Sample standard deviation (s^2) */ - if (rec->counter <= 1) - stddev = 0; - else { - /* - * Apply Welford's method: - * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) - */ + /* + * Variance formula: + * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) + * Maybe Welford's method is better here? + * Divide only by 1000 for ns^2 -> us^2 conversion. + * trace_print_graph_duration will divide by 1000 again. + */ + stddev = 0; + stddev_denom = rec->counter * (rec->counter - 1) * 1000; + if (stddev_denom) { stddev = rec->counter * rec->time_squared - rec->time * rec->time; - - /* - * Divide only 1000 for ns^2 -> us^2 conversion. - * trace_print_graph_duration will divide 1000 again. - */ - stddev = div64_ul(stddev, - rec->counter * (rec->counter - 1) * 1000); + stddev = div64_ul(stddev, stddev_denom); } trace_seq_init(&s);