mirror of
https://github.com/torvalds/linux.git
synced 2026-06-03 20:14:06 +02:00
perf report: Add --latency flag
Add record/report --latency flag that allows capturing and showing latency-centric profiles rather than the default CPU-consumption-centric profiles. For latency profiles, record captures context switch events, and report shows Latency as the first column. Signed-off-by: Dmitry Vyukov <dvyukov@google.com> Reviewed-by: Andi Kleen <ak@linux.intel.com> Link: https://lore.kernel.org/r/e9640464bcbc47dde2cb557003f421052ebc9eec.1739437531.git.dvyukov@google.com Signed-off-by: Namhyung Kim <namhyung@kernel.org>
This commit is contained in:
parent
ee1cffbe24
commit
2570c02c3a
|
|
@ -227,6 +227,10 @@ OPTIONS
|
|||
'--filter' exists, the new filter expression will be combined with
|
||||
them by '&&'.
|
||||
|
||||
--latency::
|
||||
Enable data collection for latency profiling.
|
||||
Use perf report --latency for latency-centric profile.
|
||||
|
||||
-a::
|
||||
--all-cpus::
|
||||
System-wide collection from all CPUs (default if no target is specified).
|
||||
|
|
|
|||
|
|
@ -68,6 +68,11 @@ OPTIONS
|
|||
--hide-unresolved::
|
||||
Only display entries resolved to a symbol.
|
||||
|
||||
--latency::
|
||||
Show latency-centric profile rather than the default
|
||||
CPU-consumption-centric profile
|
||||
(requires perf record --latency flag).
|
||||
|
||||
-s::
|
||||
--sort=::
|
||||
Sort histogram entries by given key(s) - multiple keys can be specified
|
||||
|
|
|
|||
|
|
@ -161,6 +161,7 @@ struct record {
|
|||
struct evlist *sb_evlist;
|
||||
pthread_t thread_id;
|
||||
int realtime_prio;
|
||||
bool latency;
|
||||
bool switch_output_event_set;
|
||||
bool no_buildid;
|
||||
bool no_buildid_set;
|
||||
|
|
@ -3373,6 +3374,9 @@ static struct option __record_options[] = {
|
|||
parse_events_option),
|
||||
OPT_CALLBACK(0, "filter", &record.evlist, "filter",
|
||||
"event filter", parse_filter),
|
||||
OPT_BOOLEAN(0, "latency", &record.latency,
|
||||
"Enable data collection for latency profiling.\n"
|
||||
"\t\t\t Use perf report --latency for latency-centric profile."),
|
||||
OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
|
||||
NULL, "don't record events from perf itself",
|
||||
exclude_perf),
|
||||
|
|
@ -4019,6 +4023,22 @@ int cmd_record(int argc, const char **argv)
|
|||
|
||||
}
|
||||
|
||||
if (record.latency) {
|
||||
/*
|
||||
* There is no fundamental reason why latency profiling
|
||||
* can't work for system-wide mode, but exact semantics
|
||||
* and details are to be defined.
|
||||
* See the following thread for details:
|
||||
* https://lore.kernel.org/all/Z4XDJyvjiie3howF@google.com/
|
||||
*/
|
||||
if (record.opts.target.system_wide) {
|
||||
pr_err("Failed: latency profiling is not supported with system-wide collection.\n");
|
||||
err = -EINVAL;
|
||||
goto out_opts;
|
||||
}
|
||||
record.opts.record_switch_events = true;
|
||||
}
|
||||
|
||||
if (rec->buildid_mmap) {
|
||||
if (!perf_can_record_build_id()) {
|
||||
pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
|
||||
|
|
|
|||
|
|
@ -112,6 +112,8 @@ struct report {
|
|||
u64 nr_entries;
|
||||
u64 queue_size;
|
||||
u64 total_cycles;
|
||||
u64 total_samples;
|
||||
u64 singlethreaded_samples;
|
||||
int socket_filter;
|
||||
DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
|
||||
struct branch_type_stat brtype_stat;
|
||||
|
|
@ -331,6 +333,10 @@ static int process_sample_event(const struct perf_tool *tool,
|
|||
&rep->total_cycles, evsel);
|
||||
}
|
||||
|
||||
rep->total_samples++;
|
||||
if (al.parallelism == 1)
|
||||
rep->singlethreaded_samples++;
|
||||
|
||||
ret = hist_entry_iter__add(&iter, &al, rep->max_stack, rep);
|
||||
if (ret < 0)
|
||||
pr_debug("problem adding hist entry, skipping event\n");
|
||||
|
|
@ -1079,6 +1085,11 @@ static int __cmd_report(struct report *rep)
|
|||
return ret;
|
||||
}
|
||||
|
||||
/* Don't show Latency column for non-parallel profiles by default. */
|
||||
if (!symbol_conf.prefer_latency && rep->total_samples &&
|
||||
rep->singlethreaded_samples * 100 / rep->total_samples >= 99)
|
||||
perf_hpp__cancel_latency();
|
||||
|
||||
evlist__check_mem_load_aux(session->evlist);
|
||||
|
||||
if (rep->stats_mode)
|
||||
|
|
@ -1468,6 +1479,10 @@ int cmd_report(int argc, const char **argv)
|
|||
"Disable raw trace ordering"),
|
||||
OPT_BOOLEAN(0, "skip-empty", &report.skip_empty,
|
||||
"Do not display empty (or dummy) events in the output"),
|
||||
OPT_BOOLEAN(0, "latency", &symbol_conf.prefer_latency,
|
||||
"Show latency-centric profile rather than the default\n"
|
||||
"\t\t\t CPU-consumption-centric profile\n"
|
||||
"\t\t\t (requires perf record --latency flag)."),
|
||||
OPT_END()
|
||||
};
|
||||
struct perf_data data = {
|
||||
|
|
@ -1722,16 +1737,25 @@ int cmd_report(int argc, const char **argv)
|
|||
symbol_conf.annotate_data_sample = true;
|
||||
}
|
||||
|
||||
symbol_conf.enable_latency = true;
|
||||
if (report.disable_order || !perf_session__has_switch_events(session)) {
|
||||
if (symbol_conf.parallelism_list_str ||
|
||||
(sort_order && strstr(sort_order, "parallelism")) ||
|
||||
(field_order && strstr(field_order, "parallelism"))) {
|
||||
symbol_conf.prefer_latency ||
|
||||
(sort_order && (strstr(sort_order, "latency") ||
|
||||
strstr(sort_order, "parallelism"))) ||
|
||||
(field_order && (strstr(field_order, "latency") ||
|
||||
strstr(field_order, "parallelism")))) {
|
||||
if (report.disable_order)
|
||||
ui__error("Use of parallelism is incompatible with --disable-order.\n");
|
||||
ui__error("Use of latency profile or parallelism is incompatible with --disable-order.\n");
|
||||
else
|
||||
ui__error("Use of parallelism requires --switch-events during record.\n");
|
||||
ui__error("Use of latency profile or parallelism requires --latency flag during record.\n");
|
||||
return -1;
|
||||
}
|
||||
/*
|
||||
* If user did not ask for anything related to
|
||||
* latency/parallelism explicitly, just don't show it.
|
||||
*/
|
||||
symbol_conf.enable_latency = false;
|
||||
}
|
||||
|
||||
if (sort_order && strstr(sort_order, "ipc")) {
|
||||
|
|
|
|||
|
|
@ -631,28 +631,48 @@ void perf_hpp__init(void)
|
|||
if (is_strict_order(field_order))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Overhead and latency columns are added in setup_overhead(),
|
||||
* so they are added implicitly here only if they were added
|
||||
* by setup_overhead() before (have was_taken flag set).
|
||||
* This is required because setup_overhead() has more complex
|
||||
* logic, in particular it does not add "overhead" if user
|
||||
* specified "latency" in sort order, and vice versa.
|
||||
*/
|
||||
if (symbol_conf.cumulate_callchain) {
|
||||
hpp_dimension__add_output(PERF_HPP__OVERHEAD_ACC);
|
||||
/*
|
||||
* Addition of fields is idempotent, so we add latency
|
||||
* column twice to get desired order with simpler logic.
|
||||
*/
|
||||
if (symbol_conf.prefer_latency)
|
||||
hpp_dimension__add_output(PERF_HPP__LATENCY_ACC, true);
|
||||
hpp_dimension__add_output(PERF_HPP__OVERHEAD_ACC, true);
|
||||
if (symbol_conf.enable_latency)
|
||||
hpp_dimension__add_output(PERF_HPP__LATENCY_ACC, true);
|
||||
perf_hpp__format[PERF_HPP__OVERHEAD].name = "Self";
|
||||
}
|
||||
|
||||
hpp_dimension__add_output(PERF_HPP__OVERHEAD);
|
||||
if (symbol_conf.prefer_latency)
|
||||
hpp_dimension__add_output(PERF_HPP__LATENCY, true);
|
||||
hpp_dimension__add_output(PERF_HPP__OVERHEAD, true);
|
||||
if (symbol_conf.enable_latency)
|
||||
hpp_dimension__add_output(PERF_HPP__LATENCY, true);
|
||||
|
||||
if (symbol_conf.show_cpu_utilization) {
|
||||
hpp_dimension__add_output(PERF_HPP__OVERHEAD_SYS);
|
||||
hpp_dimension__add_output(PERF_HPP__OVERHEAD_US);
|
||||
hpp_dimension__add_output(PERF_HPP__OVERHEAD_SYS, false);
|
||||
hpp_dimension__add_output(PERF_HPP__OVERHEAD_US, false);
|
||||
|
||||
if (perf_guest) {
|
||||
hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_SYS);
|
||||
hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_US);
|
||||
hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_SYS, false);
|
||||
hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_US, false);
|
||||
}
|
||||
}
|
||||
|
||||
if (symbol_conf.show_nr_samples)
|
||||
hpp_dimension__add_output(PERF_HPP__SAMPLES);
|
||||
hpp_dimension__add_output(PERF_HPP__SAMPLES, false);
|
||||
|
||||
if (symbol_conf.show_total_period)
|
||||
hpp_dimension__add_output(PERF_HPP__PERIOD);
|
||||
hpp_dimension__add_output(PERF_HPP__PERIOD, false);
|
||||
}
|
||||
|
||||
void perf_hpp_list__column_register(struct perf_hpp_list *list,
|
||||
|
|
@ -701,6 +721,24 @@ void perf_hpp__cancel_cumulate(void)
|
|||
}
|
||||
}
|
||||
|
||||
void perf_hpp__cancel_latency(void)
|
||||
{
|
||||
struct perf_hpp_fmt *fmt, *lat, *acc, *tmp;
|
||||
|
||||
if (is_strict_order(field_order))
|
||||
return;
|
||||
if (sort_order && strstr(sort_order, "latency"))
|
||||
return;
|
||||
|
||||
lat = &perf_hpp__format[PERF_HPP__LATENCY];
|
||||
acc = &perf_hpp__format[PERF_HPP__LATENCY_ACC];
|
||||
|
||||
perf_hpp_list__for_each_format_safe(&perf_hpp_list, fmt, tmp) {
|
||||
if (fmt_equal(lat, fmt) || fmt_equal(acc, fmt))
|
||||
perf_hpp__column_unregister(fmt);
|
||||
}
|
||||
}
|
||||
|
||||
void perf_hpp__setup_output_field(struct perf_hpp_list *list)
|
||||
{
|
||||
struct perf_hpp_fmt *fmt;
|
||||
|
|
|
|||
|
|
@ -582,6 +582,7 @@ enum {
|
|||
|
||||
void perf_hpp__init(void);
|
||||
void perf_hpp__cancel_cumulate(void);
|
||||
void perf_hpp__cancel_latency(void);
|
||||
void perf_hpp__setup_output_field(struct perf_hpp_list *list);
|
||||
void perf_hpp__reset_output_field(struct perf_hpp_list *list);
|
||||
void perf_hpp__append_sort_keys(struct perf_hpp_list *list);
|
||||
|
|
|
|||
|
|
@ -2622,6 +2622,7 @@ struct hpp_dimension {
|
|||
const char *name;
|
||||
struct perf_hpp_fmt *fmt;
|
||||
int taken;
|
||||
int was_taken;
|
||||
};
|
||||
|
||||
#define DIM(d, n) { .name = n, .fmt = &perf_hpp__format[d], }
|
||||
|
|
@ -3513,6 +3514,7 @@ static int __hpp_dimension__add(struct hpp_dimension *hd,
|
|||
return -1;
|
||||
|
||||
hd->taken = 1;
|
||||
hd->was_taken = 1;
|
||||
perf_hpp_list__register_sort_field(list, fmt);
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -3547,10 +3549,15 @@ static int __hpp_dimension__add_output(struct perf_hpp_list *list,
|
|||
return 0;
|
||||
}
|
||||
|
||||
int hpp_dimension__add_output(unsigned col)
|
||||
int hpp_dimension__add_output(unsigned col, bool implicit)
|
||||
{
|
||||
struct hpp_dimension *hd;
|
||||
|
||||
BUG_ON(col >= PERF_HPP__MAX_INDEX);
|
||||
return __hpp_dimension__add_output(&perf_hpp_list, &hpp_sort_dimensions[col]);
|
||||
hd = &hpp_sort_dimensions[col];
|
||||
if (implicit && !hd->was_taken)
|
||||
return 0;
|
||||
return __hpp_dimension__add_output(&perf_hpp_list, hd);
|
||||
}
|
||||
|
||||
int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
|
||||
|
|
@ -3809,10 +3816,24 @@ static char *setup_overhead(char *keys)
|
|||
if (sort__mode == SORT_MODE__DIFF)
|
||||
return keys;
|
||||
|
||||
keys = prefix_if_not_in("overhead", keys);
|
||||
|
||||
if (symbol_conf.cumulate_callchain)
|
||||
keys = prefix_if_not_in("overhead_children", keys);
|
||||
if (symbol_conf.prefer_latency) {
|
||||
keys = prefix_if_not_in("overhead", keys);
|
||||
keys = prefix_if_not_in("latency", keys);
|
||||
if (symbol_conf.cumulate_callchain) {
|
||||
keys = prefix_if_not_in("overhead_children", keys);
|
||||
keys = prefix_if_not_in("latency_children", keys);
|
||||
}
|
||||
} else if (!keys || (!strstr(keys, "overhead") &&
|
||||
!strstr(keys, "latency"))) {
|
||||
if (symbol_conf.enable_latency)
|
||||
keys = prefix_if_not_in("latency", keys);
|
||||
keys = prefix_if_not_in("overhead", keys);
|
||||
if (symbol_conf.cumulate_callchain) {
|
||||
if (symbol_conf.enable_latency)
|
||||
keys = prefix_if_not_in("latency_children", keys);
|
||||
keys = prefix_if_not_in("overhead_children", keys);
|
||||
}
|
||||
}
|
||||
|
||||
return keys;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -141,7 +141,7 @@ int report_parse_ignore_callees_opt(const struct option *opt, const char *arg, i
|
|||
|
||||
bool is_strict_order(const char *order);
|
||||
|
||||
int hpp_dimension__add_output(unsigned col);
|
||||
int hpp_dimension__add_output(unsigned col, bool implicit);
|
||||
void reset_dimensions(void);
|
||||
int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
|
||||
struct evlist *evlist,
|
||||
|
|
|
|||
|
|
@ -49,7 +49,9 @@ struct symbol_conf {
|
|||
keep_exited_threads,
|
||||
annotate_data_member,
|
||||
annotate_data_sample,
|
||||
skip_empty;
|
||||
skip_empty,
|
||||
enable_latency,
|
||||
prefer_latency;
|
||||
const char *vmlinux_name,
|
||||
*kallsyms_name,
|
||||
*source_prefix,
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user