mirror of
https://github.com/torvalds/linux.git
synced 2026-05-23 22:52:19 +02:00
perf tools: Construct LBR call chain
LBR call stack only has user-space callchains. It is output in the
PERF_SAMPLE_BRANCH_STACK data format. For kernel callchains, it's
still in the form of PERF_SAMPLE_CALLCHAIN.
The perf tool has to handle both data sources to construct a
complete callstack.
For the "perf report -D" option, both lbr and fp information will be
displayed.
A new call chain recording option "lbr" is introduced into the perf
tool for LBR call stack. The user can use --call-graph lbr to get
the call stack information from hardware.
Here are some examples.
When profiling bc(1) on Fedora 19:
echo 'scale=2000; 4*a(1)' > cmd; perf record --call-graph lbr bc -l < cmd
If enabling LBR, perf report output looks like:
50.36% bc bc [.] bc_divide
|
--- bc_divide
execute
run_code
yyparse
main
__libc_start_main
_start
33.66% bc bc [.] _one_mult
|
--- _one_mult
bc_divide
execute
run_code
yyparse
main
__libc_start_main
_start
7.62% bc bc [.] _bc_do_add
|
--- _bc_do_add
|
|--99.89%-- 0x2000186a8
--0.11%-- [...]
6.83% bc bc [.] _bc_do_sub
|
--- _bc_do_sub
|
|--99.94%-- bc_add
| execute
| run_code
| yyparse
| main
| __libc_start_main
| _start
--0.06%-- [...]
0.46% bc libc-2.17.so [.] __memset_sse2
|
--- __memset_sse2
|
|--54.13%-- bc_new_num
| |
| |--51.00%-- bc_divide
| | execute
| | run_code
| | yyparse
| | main
| | __libc_start_main
| | _start
| |
| |--30.46%-- _bc_do_sub
| | bc_add
| | execute
| | run_code
| | yyparse
| | main
| | __libc_start_main
| | _start
| |
| --18.55%-- _bc_do_add
| bc_add
| execute
| run_code
| yyparse
| main
| __libc_start_main
| _start
|
--45.87%-- bc_divide
execute
run_code
yyparse
main
__libc_start_main
_start
If using FP, perf report output looks like:
echo 'scale=2000; 4*a(1)' > cmd; perf record --call-graph fp bc -l < cmd
50.49% bc bc [.] bc_divide
|
--- bc_divide
33.57% bc bc [.] _one_mult
|
--- _one_mult
7.61% bc bc [.] _bc_do_add
|
--- _bc_do_add
0x2000186a8
6.88% bc bc [.] _bc_do_sub
|
--- _bc_do_sub
0.42% bc libc-2.17.so [.] __memcpy_ssse3_back
|
--- __memcpy_ssse3_back
If using LBR, perf report -D output looks like:
3458145275743 0x2fd750 [0xd8]: PERF_RECORD_SAMPLE(IP, 0x2): 9748/9748: 0x408ea8 period: 609644 addr: 0
... LBR call chain: nr:8
..... 0: fffffffffffffe00
..... 1: 0000000000408e50
..... 2: 000000000040a458
..... 3: 000000000040562e
..... 4: 0000000000408590
..... 5: 00000000004022c0
..... 6: 00000000004015dd
..... 7: 0000003d1cc21b43
... FP chain: nr:2
..... 0: fffffffffffffe00
..... 1: 0000000000408ea8
... thread: bc:9748
...... dso: /usr/bin/bc
The LBR call stack has the following known limitations:
- Zero length calls are not filtered out by the hardware
- Exception handling such as setjmp/longjmp will cause calls/returns not
to match
- Pushing a different return address onto the stack will cause
calls/returns not to match
- If callstack is deeper than the LBR, only the last entries are
captured
Tested-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: David Ahern <dsahern@gmail.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Simon Que <sque@chromium.org>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lkml.kernel.org/r/1420482185-29830-3-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
parent
aad2b21c15
commit
384b60557b
|
|
@ -355,4 +355,8 @@ for ((_evsel) = list_entry((_leader)->node.next, struct perf_evsel, node); \
|
|||
(_evsel) && (_evsel)->leader == (_leader); \
|
||||
(_evsel) = list_entry((_evsel)->node.next, struct perf_evsel, node))
|
||||
|
||||
static inline bool has_branch_callstack(struct perf_evsel *evsel)
|
||||
{
|
||||
return evsel->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK;
|
||||
}
|
||||
#endif /* __PERF_EVSEL_H */
|
||||
|
|
|
|||
|
|
@ -1502,18 +1502,100 @@ static int remove_loops(struct branch_entry *l, int nr)
|
|||
return nr;
|
||||
}
|
||||
|
||||
static int thread__resolve_callchain_sample(struct thread *thread,
|
||||
struct ip_callchain *chain,
|
||||
struct branch_stack *branch,
|
||||
struct symbol **parent,
|
||||
struct addr_location *root_al,
|
||||
int max_stack)
|
||||
/*
|
||||
* Recolve LBR callstack chain sample
|
||||
* Return:
|
||||
* 1 on success get LBR callchain information
|
||||
* 0 no available LBR callchain information, should try fp
|
||||
* negative error code on other errors.
|
||||
*/
|
||||
static int resolve_lbr_callchain_sample(struct thread *thread,
|
||||
struct perf_sample *sample,
|
||||
struct symbol **parent,
|
||||
struct addr_location *root_al,
|
||||
int max_stack)
|
||||
{
|
||||
struct ip_callchain *chain = sample->callchain;
|
||||
int chain_nr = min(max_stack, (int)chain->nr);
|
||||
int i, j, err;
|
||||
u64 ip;
|
||||
|
||||
for (i = 0; i < chain_nr; i++) {
|
||||
if (chain->ips[i] == PERF_CONTEXT_USER)
|
||||
break;
|
||||
}
|
||||
|
||||
/* LBR only affects the user callchain */
|
||||
if (i != chain_nr) {
|
||||
struct branch_stack *lbr_stack = sample->branch_stack;
|
||||
int lbr_nr = lbr_stack->nr;
|
||||
/*
|
||||
* LBR callstack can only get user call chain.
|
||||
* The mix_chain_nr is kernel call chain
|
||||
* number plus LBR user call chain number.
|
||||
* i is kernel call chain number,
|
||||
* 1 is PERF_CONTEXT_USER,
|
||||
* lbr_nr + 1 is the user call chain number.
|
||||
* For details, please refer to the comments
|
||||
* in callchain__printf
|
||||
*/
|
||||
int mix_chain_nr = i + 1 + lbr_nr + 1;
|
||||
|
||||
if (mix_chain_nr > PERF_MAX_STACK_DEPTH + PERF_MAX_BRANCH_DEPTH) {
|
||||
pr_warning("corrupted callchain. skipping...\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
for (j = 0; j < mix_chain_nr; j++) {
|
||||
if (callchain_param.order == ORDER_CALLEE) {
|
||||
if (j < i + 1)
|
||||
ip = chain->ips[j];
|
||||
else if (j > i + 1)
|
||||
ip = lbr_stack->entries[j - i - 2].from;
|
||||
else
|
||||
ip = lbr_stack->entries[0].to;
|
||||
} else {
|
||||
if (j < lbr_nr)
|
||||
ip = lbr_stack->entries[lbr_nr - j - 1].from;
|
||||
else if (j > lbr_nr)
|
||||
ip = chain->ips[i + 1 - (j - lbr_nr)];
|
||||
else
|
||||
ip = lbr_stack->entries[0].to;
|
||||
}
|
||||
|
||||
err = add_callchain_ip(thread, parent, root_al, false, ip);
|
||||
if (err)
|
||||
return (err < 0) ? err : 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int thread__resolve_callchain_sample(struct thread *thread,
|
||||
struct perf_evsel *evsel,
|
||||
struct perf_sample *sample,
|
||||
struct symbol **parent,
|
||||
struct addr_location *root_al,
|
||||
int max_stack)
|
||||
{
|
||||
struct branch_stack *branch = sample->branch_stack;
|
||||
struct ip_callchain *chain = sample->callchain;
|
||||
int chain_nr = min(max_stack, (int)chain->nr);
|
||||
int i, j, err;
|
||||
int skip_idx = -1;
|
||||
int first_call = 0;
|
||||
|
||||
callchain_cursor_reset(&callchain_cursor);
|
||||
|
||||
if (has_branch_callstack(evsel)) {
|
||||
err = resolve_lbr_callchain_sample(thread, sample, parent,
|
||||
root_al, max_stack);
|
||||
if (err)
|
||||
return (err < 0) ? err : 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Based on DWARF debug information, some architectures skip
|
||||
* a callchain entry saved by the kernel.
|
||||
|
|
@ -1521,8 +1603,6 @@ static int thread__resolve_callchain_sample(struct thread *thread,
|
|||
if (chain->nr < PERF_MAX_STACK_DEPTH)
|
||||
skip_idx = arch_skip_callchain_idx(thread, chain);
|
||||
|
||||
callchain_cursor_reset(&callchain_cursor);
|
||||
|
||||
/*
|
||||
* Add branches to call stack for easier browsing. This gives
|
||||
* more context for a sample than just the callers.
|
||||
|
|
@ -1623,9 +1703,9 @@ int thread__resolve_callchain(struct thread *thread,
|
|||
struct addr_location *root_al,
|
||||
int max_stack)
|
||||
{
|
||||
int ret = thread__resolve_callchain_sample(thread, sample->callchain,
|
||||
sample->branch_stack,
|
||||
parent, root_al, max_stack);
|
||||
int ret = thread__resolve_callchain_sample(thread, evsel,
|
||||
sample, parent,
|
||||
root_al, max_stack);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
|
|
|
|||
|
|
@ -553,15 +553,67 @@ int perf_session_queue_event(struct perf_session *s, union perf_event *event,
|
|||
return 0;
|
||||
}
|
||||
|
||||
static void callchain__printf(struct perf_sample *sample)
|
||||
static void callchain__lbr_callstack_printf(struct perf_sample *sample)
|
||||
{
|
||||
struct ip_callchain *callchain = sample->callchain;
|
||||
struct branch_stack *lbr_stack = sample->branch_stack;
|
||||
u64 kernel_callchain_nr = callchain->nr;
|
||||
unsigned int i;
|
||||
|
||||
printf("... chain: nr:%" PRIu64 "\n", sample->callchain->nr);
|
||||
for (i = 0; i < kernel_callchain_nr; i++) {
|
||||
if (callchain->ips[i] == PERF_CONTEXT_USER)
|
||||
break;
|
||||
}
|
||||
|
||||
if ((i != kernel_callchain_nr) && lbr_stack->nr) {
|
||||
u64 total_nr;
|
||||
/*
|
||||
* LBR callstack can only get user call chain,
|
||||
* i is kernel call chain number,
|
||||
* 1 is PERF_CONTEXT_USER.
|
||||
*
|
||||
* The user call chain is stored in LBR registers.
|
||||
* LBR are pair registers. The caller is stored
|
||||
* in "from" register, while the callee is stored
|
||||
* in "to" register.
|
||||
* For example, there is a call stack
|
||||
* "A"->"B"->"C"->"D".
|
||||
* The LBR registers will record it like
|
||||
* "C"->"D", "B"->"C", "A"->"B".
|
||||
* So only the first "to" register and all "from"
|
||||
* registers are needed to construct the whole stack.
|
||||
*/
|
||||
total_nr = i + 1 + lbr_stack->nr + 1;
|
||||
kernel_callchain_nr = i + 1;
|
||||
|
||||
printf("... LBR call chain: nr:%" PRIu64 "\n", total_nr);
|
||||
|
||||
for (i = 0; i < kernel_callchain_nr; i++)
|
||||
printf("..... %2d: %016" PRIx64 "\n",
|
||||
i, callchain->ips[i]);
|
||||
|
||||
for (i = 0; i < sample->callchain->nr; i++)
|
||||
printf("..... %2d: %016" PRIx64 "\n",
|
||||
i, sample->callchain->ips[i]);
|
||||
(int)(kernel_callchain_nr), lbr_stack->entries[0].to);
|
||||
for (i = 0; i < lbr_stack->nr; i++)
|
||||
printf("..... %2d: %016" PRIx64 "\n",
|
||||
(int)(i + kernel_callchain_nr + 1), lbr_stack->entries[i].from);
|
||||
}
|
||||
}
|
||||
|
||||
static void callchain__printf(struct perf_evsel *evsel,
|
||||
struct perf_sample *sample)
|
||||
{
|
||||
unsigned int i;
|
||||
struct ip_callchain *callchain = sample->callchain;
|
||||
|
||||
if (has_branch_callstack(evsel))
|
||||
callchain__lbr_callstack_printf(sample);
|
||||
|
||||
printf("... FP chain: nr:%" PRIu64 "\n", callchain->nr);
|
||||
|
||||
for (i = 0; i < callchain->nr; i++)
|
||||
printf("..... %2d: %016" PRIx64 "\n",
|
||||
i, callchain->ips[i]);
|
||||
}
|
||||
|
||||
static void branch_stack__printf(struct perf_sample *sample)
|
||||
|
|
@ -718,9 +770,9 @@ static void dump_sample(struct perf_evsel *evsel, union perf_event *event,
|
|||
sample_type = evsel->attr.sample_type;
|
||||
|
||||
if (sample_type & PERF_SAMPLE_CALLCHAIN)
|
||||
callchain__printf(sample);
|
||||
callchain__printf(evsel, sample);
|
||||
|
||||
if (sample_type & PERF_SAMPLE_BRANCH_STACK)
|
||||
if ((sample_type & PERF_SAMPLE_BRANCH_STACK) && !has_branch_callstack(evsel))
|
||||
branch_stack__printf(sample);
|
||||
|
||||
if (sample_type & PERF_SAMPLE_REGS_USER)
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user