perf report: Add --latency flag

Add record/report --latency flag that allows to capture and show
latency-centric profiles rather than the default CPU-consumption-centric
profiles. For latency profiles record captures context switch events,
and report shows Latency as the first column.

Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Link: https://lore.kernel.org/r/e9640464bcbc47dde2cb557003f421052ebc9eec.1739437531.git.dvyukov@google.com
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
This commit is contained in:
Dmitry Vyukov 2025-02-13 10:08:19 +01:00 committed by Namhyung Kim
parent ee1cffbe24
commit 2570c02c3a
9 changed files with 135 additions and 20 deletions

View File

@ -227,6 +227,10 @@ OPTIONS
'--filter' exists, the new filter expression will be combined with
them by '&&'.
--latency::
Enable data collection for latency profiling.
Use perf report --latency for latency-centric profile.
-a::
--all-cpus::
System-wide collection from all CPUs (default if no target is specified).

View File

@ -68,6 +68,11 @@ OPTIONS
--hide-unresolved::
Only display entries resolved to a symbol.
--latency::
Show latency-centric profile rather than the default
CPU-consumption-centric profile
(requires perf record --latency flag).
-s::
--sort=::
Sort histogram entries by given key(s) - multiple keys can be specified

View File

@ -161,6 +161,7 @@ struct record {
struct evlist *sb_evlist;
pthread_t thread_id;
int realtime_prio;
bool latency;
bool switch_output_event_set;
bool no_buildid;
bool no_buildid_set;
@ -3373,6 +3374,9 @@ static struct option __record_options[] = {
parse_events_option),
OPT_CALLBACK(0, "filter", &record.evlist, "filter",
"event filter", parse_filter),
OPT_BOOLEAN(0, "latency", &record.latency,
"Enable data collection for latency profiling.\n"
"\t\t\t Use perf report --latency for latency-centric profile."),
OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
NULL, "don't record events from perf itself",
exclude_perf),
@ -4019,6 +4023,22 @@ int cmd_record(int argc, const char **argv)
}
if (record.latency) {
/*
* There is no fundamental reason why latency profiling
* can't work for system-wide mode, but exact semantics
* and details are to be defined.
* See the following thread for details:
* https://lore.kernel.org/all/Z4XDJyvjiie3howF@google.com/
*/
if (record.opts.target.system_wide) {
pr_err("Failed: latency profiling is not supported with system-wide collection.\n");
err = -EINVAL;
goto out_opts;
}
record.opts.record_switch_events = true;
}
if (rec->buildid_mmap) {
if (!perf_can_record_build_id()) {
pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");

View File

@ -112,6 +112,8 @@ struct report {
u64 nr_entries;
u64 queue_size;
u64 total_cycles;
u64 total_samples;
u64 singlethreaded_samples;
int socket_filter;
DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
struct branch_type_stat brtype_stat;
@ -331,6 +333,10 @@ static int process_sample_event(const struct perf_tool *tool,
&rep->total_cycles, evsel);
}
rep->total_samples++;
if (al.parallelism == 1)
rep->singlethreaded_samples++;
ret = hist_entry_iter__add(&iter, &al, rep->max_stack, rep);
if (ret < 0)
pr_debug("problem adding hist entry, skipping event\n");
@ -1079,6 +1085,11 @@ static int __cmd_report(struct report *rep)
return ret;
}
/* Don't show Latency column for non-parallel profiles by default. */
if (!symbol_conf.prefer_latency && rep->total_samples &&
rep->singlethreaded_samples * 100 / rep->total_samples >= 99)
perf_hpp__cancel_latency();
evlist__check_mem_load_aux(session->evlist);
if (rep->stats_mode)
@ -1468,6 +1479,10 @@ int cmd_report(int argc, const char **argv)
"Disable raw trace ordering"),
OPT_BOOLEAN(0, "skip-empty", &report.skip_empty,
"Do not display empty (or dummy) events in the output"),
OPT_BOOLEAN(0, "latency", &symbol_conf.prefer_latency,
"Show latency-centric profile rather than the default\n"
"\t\t\t CPU-consumption-centric profile\n"
"\t\t\t (requires perf record --latency flag)."),
OPT_END()
};
struct perf_data data = {
@ -1722,16 +1737,25 @@ int cmd_report(int argc, const char **argv)
symbol_conf.annotate_data_sample = true;
}
symbol_conf.enable_latency = true;
if (report.disable_order || !perf_session__has_switch_events(session)) {
if (symbol_conf.parallelism_list_str ||
(sort_order && strstr(sort_order, "parallelism")) ||
(field_order && strstr(field_order, "parallelism"))) {
symbol_conf.prefer_latency ||
(sort_order && (strstr(sort_order, "latency") ||
strstr(sort_order, "parallelism"))) ||
(field_order && (strstr(field_order, "latency") ||
strstr(field_order, "parallelism")))) {
if (report.disable_order)
ui__error("Use of parallelism is incompatible with --disable-order.\n");
ui__error("Use of latency profile or parallelism is incompatible with --disable-order.\n");
else
ui__error("Use of parallelism requires --switch-events during record.\n");
ui__error("Use of latency profile or parallelism requires --latency flag during record.\n");
return -1;
}
/*
* If user did not ask for anything related to
* latency/parallelism explicitly, just don't show it.
*/
symbol_conf.enable_latency = false;
}
if (sort_order && strstr(sort_order, "ipc")) {

View File

@ -631,28 +631,48 @@ void perf_hpp__init(void)
if (is_strict_order(field_order))
return;
/*
* Overhead and latency columns are added in setup_overhead(),
* so they are added implicitly here only if they were added
* by setup_overhead() before (have was_taken flag set).
* This is required because setup_overhead() has more complex
* logic, in particular it does not add "overhead" if user
* specified "latency" in sort order, and vise versa.
*/
if (symbol_conf.cumulate_callchain) {
hpp_dimension__add_output(PERF_HPP__OVERHEAD_ACC);
/*
* Addition of fields is idempotent, so we add latency
* column twice to get desired order with simpler logic.
*/
if (symbol_conf.prefer_latency)
hpp_dimension__add_output(PERF_HPP__LATENCY_ACC, true);
hpp_dimension__add_output(PERF_HPP__OVERHEAD_ACC, true);
if (symbol_conf.enable_latency)
hpp_dimension__add_output(PERF_HPP__LATENCY_ACC, true);
perf_hpp__format[PERF_HPP__OVERHEAD].name = "Self";
}
hpp_dimension__add_output(PERF_HPP__OVERHEAD);
if (symbol_conf.prefer_latency)
hpp_dimension__add_output(PERF_HPP__LATENCY, true);
hpp_dimension__add_output(PERF_HPP__OVERHEAD, true);
if (symbol_conf.enable_latency)
hpp_dimension__add_output(PERF_HPP__LATENCY, true);
if (symbol_conf.show_cpu_utilization) {
hpp_dimension__add_output(PERF_HPP__OVERHEAD_SYS);
hpp_dimension__add_output(PERF_HPP__OVERHEAD_US);
hpp_dimension__add_output(PERF_HPP__OVERHEAD_SYS, false);
hpp_dimension__add_output(PERF_HPP__OVERHEAD_US, false);
if (perf_guest) {
hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_SYS);
hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_US);
hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_SYS, false);
hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_US, false);
}
}
if (symbol_conf.show_nr_samples)
hpp_dimension__add_output(PERF_HPP__SAMPLES);
hpp_dimension__add_output(PERF_HPP__SAMPLES, false);
if (symbol_conf.show_total_period)
hpp_dimension__add_output(PERF_HPP__PERIOD);
hpp_dimension__add_output(PERF_HPP__PERIOD, false);
}
void perf_hpp_list__column_register(struct perf_hpp_list *list,
@ -701,6 +721,24 @@ void perf_hpp__cancel_cumulate(void)
}
}
void perf_hpp__cancel_latency(void)
{
struct perf_hpp_fmt *fmt, *lat, *acc, *tmp;
if (is_strict_order(field_order))
return;
if (sort_order && strstr(sort_order, "latency"))
return;
lat = &perf_hpp__format[PERF_HPP__LATENCY];
acc = &perf_hpp__format[PERF_HPP__LATENCY_ACC];
perf_hpp_list__for_each_format_safe(&perf_hpp_list, fmt, tmp) {
if (fmt_equal(lat, fmt) || fmt_equal(acc, fmt))
perf_hpp__column_unregister(fmt);
}
}
void perf_hpp__setup_output_field(struct perf_hpp_list *list)
{
struct perf_hpp_fmt *fmt;

View File

@ -582,6 +582,7 @@ enum {
void perf_hpp__init(void);
void perf_hpp__cancel_cumulate(void);
void perf_hpp__cancel_latency(void);
void perf_hpp__setup_output_field(struct perf_hpp_list *list);
void perf_hpp__reset_output_field(struct perf_hpp_list *list);
void perf_hpp__append_sort_keys(struct perf_hpp_list *list);

View File

@ -2622,6 +2622,7 @@ struct hpp_dimension {
const char *name;
struct perf_hpp_fmt *fmt;
int taken;
int was_taken;
};
#define DIM(d, n) { .name = n, .fmt = &perf_hpp__format[d], }
@ -3513,6 +3514,7 @@ static int __hpp_dimension__add(struct hpp_dimension *hd,
return -1;
hd->taken = 1;
hd->was_taken = 1;
perf_hpp_list__register_sort_field(list, fmt);
return 0;
}
@ -3547,10 +3549,15 @@ static int __hpp_dimension__add_output(struct perf_hpp_list *list,
return 0;
}
int hpp_dimension__add_output(unsigned col)
int hpp_dimension__add_output(unsigned col, bool implicit)
{
struct hpp_dimension *hd;
BUG_ON(col >= PERF_HPP__MAX_INDEX);
return __hpp_dimension__add_output(&perf_hpp_list, &hpp_sort_dimensions[col]);
hd = &hpp_sort_dimensions[col];
if (implicit && !hd->was_taken)
return 0;
return __hpp_dimension__add_output(&perf_hpp_list, hd);
}
int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
@ -3809,10 +3816,24 @@ static char *setup_overhead(char *keys)
if (sort__mode == SORT_MODE__DIFF)
return keys;
keys = prefix_if_not_in("overhead", keys);
if (symbol_conf.cumulate_callchain)
keys = prefix_if_not_in("overhead_children", keys);
if (symbol_conf.prefer_latency) {
keys = prefix_if_not_in("overhead", keys);
keys = prefix_if_not_in("latency", keys);
if (symbol_conf.cumulate_callchain) {
keys = prefix_if_not_in("overhead_children", keys);
keys = prefix_if_not_in("latency_children", keys);
}
} else if (!keys || (!strstr(keys, "overhead") &&
!strstr(keys, "latency"))) {
if (symbol_conf.enable_latency)
keys = prefix_if_not_in("latency", keys);
keys = prefix_if_not_in("overhead", keys);
if (symbol_conf.cumulate_callchain) {
if (symbol_conf.enable_latency)
keys = prefix_if_not_in("latency_children", keys);
keys = prefix_if_not_in("overhead_children", keys);
}
}
return keys;
}

View File

@ -141,7 +141,7 @@ int report_parse_ignore_callees_opt(const struct option *opt, const char *arg, i
bool is_strict_order(const char *order);
int hpp_dimension__add_output(unsigned col);
int hpp_dimension__add_output(unsigned col, bool implicit);
void reset_dimensions(void);
int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
struct evlist *evlist,

View File

@ -49,7 +49,9 @@ struct symbol_conf {
keep_exited_threads,
annotate_data_member,
annotate_data_sample,
skip_empty;
skip_empty,
enable_latency,
prefer_latency;
const char *vmlinux_name,
*kallsyms_name,
*source_prefix,