kernel_optimize_test/tools/perf/builtin-annotate.c
Peter Zijlstra 70fbe05745 perf annotate: Add branch stack / basic block
I wanted to know the hottest path through a function and figured the
branch-stack (LBR) information should be able to help out with that.

The below uses the branch-stack to create basic blocks and generate
statistics from them.

        from    to              branch_i
        * ----> *
                |
                | block
                v
                * ----> *
                from    to      branch_i+1

The blocks are broken down into non-overlapping ranges, while tracking
if the start of each range is an entry point and/or the end of a range
is a branch.

Each block iterates all ranges it covers (while splitting where required
to exactly match the block) and increments the 'coverage' count.

For the range including the branch we increment the taken counter, as
well as the pred counter if flags.predicted.

Using these number we can find if an instruction:

 - had coverage; given by:

        br->coverage / br->sym->max_coverage

   This metric ensures each symbol has a 100% spot, which reflects the
   observation that each symbol must have a most covered/hottest
   block.

 - is a branch target: br->is_target && br->start == add

 - for targets, how much of a branch's coverages comes from it:

	target->entry / branch->coverage

 - is a branch: br->is_branch && br->end == addr

 - for branches, how often it was taken:

        br->taken / br->coverage

   after all, all execution that didn't take the branch would have
   incremented the coverage and continued onward to a later branch.

 - for branches, how often it was predicted:

        br->pred / br->taken

The coverage percentage is used to color the address and asm sections;
for low (<1%) coverage we use NORMAL (uncolored), indicating that these
instructions are not 'important'. For high coverage (>75%) we color the
address RED.

For each branch, we add an asm comment after the instruction with
information on how often it was taken and predicted.

Output looks like (sans color, which does loose a lot of the
information :/)

$ perf record --branch-filter u,any -e cycles:p ./branches 27
$ perf annotate branches

 Percent |	Source code & Disassembly of branches for cycles:pu (217 samples)
---------------------------------------------------------------------------------
         :	branches():
    0.00 :	  40057a:       push   %rbp
    0.00 :	  40057b:       mov    %rsp,%rbp
    0.00 :	  40057e:       sub    $0x20,%rsp
    0.00 :	  400582:       mov    %rdi,-0x18(%rbp)
    0.00 :	  400586:       mov    %rsi,-0x20(%rbp)
    0.00 :	  40058a:       mov    -0x18(%rbp),%rax
    0.00 :	  40058e:       mov    %rax,-0x10(%rbp)
    0.00 :	  400592:       movq   $0x0,-0x8(%rbp)
    0.00 :	  40059a:       jmpq   400656 <branches+0xdc>
    1.84 :	  40059f:       mov    -0x10(%rbp),%rax	# +100.00%
    3.23 :	  4005a3:       and    $0x1,%eax
    1.84 :	  4005a6:       test   %rax,%rax
    0.00 :	  4005a9:       je     4005bf <branches+0x45>	# -54.50% (p:42.00%)
    0.46 :	  4005ab:       mov    0x200bbe(%rip),%rax        # 601170 <acc>
   12.90 :	  4005b2:       add    $0x1,%rax
    2.30 :	  4005b6:       mov    %rax,0x200bb3(%rip)        # 601170 <acc>
    0.46 :	  4005bd:       jmp    4005d1 <branches+0x57>	# -100.00% (p:100.00%)
    0.92 :	  4005bf:       mov    0x200baa(%rip),%rax        # 601170 <acc>	# +49.54%
   13.82 :	  4005c6:       sub    $0x1,%rax
    0.46 :	  4005ca:       mov    %rax,0x200b9f(%rip)        # 601170 <acc>
    2.30 :	  4005d1:       mov    -0x10(%rbp),%rax	# +50.46%
    0.46 :	  4005d5:       mov    %rax,%rdi
    0.46 :	  4005d8:       callq  400526 <lfsr>	# -100.00% (p:100.00%)
    0.00 :	  4005dd:       mov    %rax,-0x10(%rbp)	# +100.00%
    0.92 :	  4005e1:       mov    -0x18(%rbp),%rax
    0.00 :	  4005e5:       and    $0x1,%eax
    0.00 :	  4005e8:       test   %rax,%rax
    0.00 :	  4005eb:       je     4005ff <branches+0x85>	# -100.00% (p:100.00%)
    0.00 :	  4005ed:       mov    0x200b7c(%rip),%rax        # 601170 <acc>
    0.00 :	  4005f4:       shr    $0x2,%rax
    0.00 :	  4005f8:       mov    %rax,0x200b71(%rip)        # 601170 <acc>
    0.00 :	  4005ff:       mov    -0x10(%rbp),%rax	# +100.00%
    7.37 :	  400603:       and    $0x1,%eax
    3.69 :	  400606:       test   %rax,%rax
    0.00 :	  400609:       jne    400612 <branches+0x98>	# -59.25% (p:42.99%)
    1.84 :	  40060b:       mov    $0x1,%eax
   14.29 :	  400610:       jmp    400617 <branches+0x9d>	# -100.00% (p:100.00%)
    1.38 :	  400612:       mov    $0x0,%eax	# +57.65%
   10.14 :	  400617:       test   %al,%al	# +42.35%
    0.00 :	  400619:       je     40062f <branches+0xb5>	# -57.65% (p:100.00%)
    0.46 :	  40061b:       mov    0x200b4e(%rip),%rax        # 601170 <acc>
    2.76 :	  400622:       sub    $0x1,%rax
    0.00 :	  400626:       mov    %rax,0x200b43(%rip)        # 601170 <acc>
    0.46 :	  40062d:       jmp    400641 <branches+0xc7>	# -100.00% (p:100.00%)
    0.92 :	  40062f:       mov    0x200b3a(%rip),%rax        # 601170 <acc>	# +56.13%
    2.30 :	  400636:       add    $0x1,%rax
    0.92 :	  40063a:       mov    %rax,0x200b2f(%rip)        # 601170 <acc>
    0.92 :	  400641:       mov    -0x10(%rbp),%rax	# +43.87%
    2.30 :	  400645:       mov    %rax,%rdi
    0.00 :	  400648:       callq  400526 <lfsr>	# -100.00% (p:100.00%)
    0.00 :	  40064d:       mov    %rax,-0x10(%rbp)	# +100.00%
    1.84 :	  400651:       addq   $0x1,-0x8(%rbp)
    0.92 :	  400656:       mov    -0x8(%rbp),%rax
    5.07 :	  40065a:       cmp    -0x20(%rbp),%rax
    0.00 :	  40065e:       jb     40059f <branches+0x25>	# -100.00% (p:100.00%)
    0.00 :	  400664:       nop
    0.00 :	  400665:       leaveq
    0.00 :	  400666:       retq

(Note: the --branch-filter u,any was used to avoid spurious target and
branch points due to interrupts/faults, they show up as very small -/+
annotations on 'weird' locations)

Committer note:

Please take a look at:

  http://vger.kernel.org/~acme/perf/annotate_basic_blocks.png

To see the colors.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Cc: David Carrillo-Cisneros <davidcc@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
[ Moved sym->max_coverage to 'struct annotate', aka symbol__annotate(sym) ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-09-08 13:44:03 -03:00

511 lines
13 KiB
C

/*
* builtin-annotate.c
*
* Builtin annotate command: Analyze the perf.data input file,
* look up and read DSOs and symbol information and display
* a histogram of results, along various sorting keys.
*/
#include "builtin.h"
#include "util/util.h"
#include "util/color.h"
#include <linux/list.h>
#include "util/cache.h"
#include <linux/rbtree.h>
#include "util/symbol.h"
#include "perf.h"
#include "util/debug.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/annotate.h"
#include "util/event.h"
#include <subcmd/parse-options.h>
#include "util/parse-events.h"
#include "util/thread.h"
#include "util/sort.h"
#include "util/hist.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/data.h"
#include "arch/common.h"
#include "util/block-range.h"
#include <dlfcn.h>
#include <linux/bitmap.h>
struct perf_annotate {
struct perf_tool tool;
struct perf_session *session;
bool use_tui, use_stdio, use_gtk;
bool full_paths;
bool print_line;
bool skip_missing;
const char *sym_hist_filter;
const char *cpu_list;
DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
};
/*
* Given one basic block:
*
* from to branch_i
* * ----> *
* |
* | block
* v
* * ----> *
* from to branch_i+1
*
* where the horizontal are the branches and the vertical is the executed
* block of instructions.
*
* We count, for each 'instruction', the number of blocks that covered it as
* well as count the ratio each branch is taken.
*
* We can do this without knowing the actual instruction stream by keeping
* track of the address ranges. We break down ranges such that there is no
* overlap and iterate from the start until the end.
*
* @acme: once we parse the objdump output _before_ processing the samples,
* we can easily fold the branch.cycles IPC bits in.
*/
static void process_basic_block(struct addr_map_symbol *start,
struct addr_map_symbol *end,
struct branch_flags *flags)
{
struct symbol *sym = start->sym;
struct annotation *notes = sym ? symbol__annotation(sym) : NULL;
struct block_range_iter iter;
struct block_range *entry;
/*
* Sanity; NULL isn't executable and the CPU cannot execute backwards
*/
if (!start->addr || start->addr > end->addr)
return;
iter = block_range__create(start->addr, end->addr);
if (!block_range_iter__valid(&iter))
return;
/*
* First block in range is a branch target.
*/
entry = block_range_iter(&iter);
assert(entry->is_target);
entry->entry++;
do {
entry = block_range_iter(&iter);
entry->coverage++;
entry->sym = sym;
if (notes)
notes->max_coverage = max(notes->max_coverage, entry->coverage);
} while (block_range_iter__next(&iter));
/*
* Last block in rage is a branch.
*/
entry = block_range_iter(&iter);
assert(entry->is_branch);
entry->taken++;
if (flags->predicted)
entry->pred++;
}
static void process_branch_stack(struct branch_stack *bs, struct addr_location *al,
struct perf_sample *sample)
{
struct addr_map_symbol *prev = NULL;
struct branch_info *bi;
int i;
if (!bs || !bs->nr)
return;
bi = sample__resolve_bstack(sample, al);
if (!bi)
return;
for (i = bs->nr - 1; i >= 0; i--) {
/*
* XXX filter against symbol
*/
if (prev)
process_basic_block(prev, &bi[i].from, &bi[i].flags);
prev = &bi[i].to;
}
free(bi);
}
static int perf_evsel__add_sample(struct perf_evsel *evsel,
struct perf_sample *sample,
struct addr_location *al,
struct perf_annotate *ann)
{
struct hists *hists = evsel__hists(evsel);
struct hist_entry *he;
int ret;
if (ann->sym_hist_filter != NULL &&
(al->sym == NULL ||
strcmp(ann->sym_hist_filter, al->sym->name) != 0)) {
/* We're only interested in a symbol named sym_hist_filter */
/*
* FIXME: why isn't this done in the symbol_filter when loading
* the DSO?
*/
if (al->sym != NULL) {
rb_erase(&al->sym->rb_node,
&al->map->dso->symbols[al->map->type]);
symbol__delete(al->sym);
dso__reset_find_symbol_cache(al->map->dso);
}
return 0;
}
/*
* XXX filtered samples can still have branch entires pointing into our
* symbol and are missed.
*/
process_branch_stack(sample->branch_stack, al, sample);
sample->period = 1;
sample->weight = 1;
he = hists__add_entry(hists, al, NULL, NULL, NULL, sample, true);
if (he == NULL)
return -ENOMEM;
ret = hist_entry__inc_addr_samples(he, evsel->idx, al->addr);
hists__inc_nr_samples(hists, true);
return ret;
}
static int process_sample_event(struct perf_tool *tool,
union perf_event *event,
struct perf_sample *sample,
struct perf_evsel *evsel,
struct machine *machine)
{
struct perf_annotate *ann = container_of(tool, struct perf_annotate, tool);
struct addr_location al;
int ret = 0;
if (machine__resolve(machine, &al, sample) < 0) {
pr_warning("problem processing %d event, skipping it.\n",
event->header.type);
return -1;
}
if (ann->cpu_list && !test_bit(sample->cpu, ann->cpu_bitmap))
goto out_put;
if (!al.filtered && perf_evsel__add_sample(evsel, sample, &al, ann)) {
pr_warning("problem incrementing symbol count, "
"skipping event\n");
ret = -1;
}
out_put:
addr_location__put(&al);
return ret;
}
static int hist_entry__tty_annotate(struct hist_entry *he,
struct perf_evsel *evsel,
struct perf_annotate *ann)
{
return symbol__tty_annotate(he->ms.sym, he->ms.map, evsel,
ann->print_line, ann->full_paths, 0, 0);
}
static void hists__find_annotations(struct hists *hists,
struct perf_evsel *evsel,
struct perf_annotate *ann)
{
struct rb_node *nd = rb_first(&hists->entries), *next;
int key = K_RIGHT;
while (nd) {
struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node);
struct annotation *notes;
if (he->ms.sym == NULL || he->ms.map->dso->annotate_warned)
goto find_next;
notes = symbol__annotation(he->ms.sym);
if (notes->src == NULL) {
find_next:
if (key == K_LEFT)
nd = rb_prev(nd);
else
nd = rb_next(nd);
continue;
}
if (use_browser == 2) {
int ret;
int (*annotate)(struct hist_entry *he,
struct perf_evsel *evsel,
struct hist_browser_timer *hbt);
annotate = dlsym(perf_gtk_handle,
"hist_entry__gtk_annotate");
if (annotate == NULL) {
ui__error("GTK browser not found!\n");
return;
}
ret = annotate(he, evsel, NULL);
if (!ret || !ann->skip_missing)
return;
/* skip missing symbols */
nd = rb_next(nd);
} else if (use_browser == 1) {
key = hist_entry__tui_annotate(he, evsel, NULL);
switch (key) {
case -1:
if (!ann->skip_missing)
return;
/* fall through */
case K_RIGHT:
next = rb_next(nd);
break;
case K_LEFT:
next = rb_prev(nd);
break;
default:
return;
}
if (next != NULL)
nd = next;
} else {
hist_entry__tty_annotate(he, evsel, ann);
nd = rb_next(nd);
/*
* Since we have a hist_entry per IP for the same
* symbol, free he->ms.sym->src to signal we already
* processed this symbol.
*/
zfree(&notes->src->cycles_hist);
zfree(&notes->src);
}
}
}
static int __cmd_annotate(struct perf_annotate *ann)
{
int ret;
struct perf_session *session = ann->session;
struct perf_evsel *pos;
u64 total_nr_samples;
if (ann->cpu_list) {
ret = perf_session__cpu_bitmap(session, ann->cpu_list,
ann->cpu_bitmap);
if (ret)
goto out;
}
if (!objdump_path) {
ret = perf_env__lookup_objdump(&session->header.env);
if (ret)
goto out;
}
ret = perf_session__process_events(session);
if (ret)
goto out;
if (dump_trace) {
perf_session__fprintf_nr_events(session, stdout);
perf_evlist__fprintf_nr_events(session->evlist, stdout);
goto out;
}
if (verbose > 3)
perf_session__fprintf(session, stdout);
if (verbose > 2)
perf_session__fprintf_dsos(session, stdout);
total_nr_samples = 0;
evlist__for_each_entry(session->evlist, pos) {
struct hists *hists = evsel__hists(pos);
u32 nr_samples = hists->stats.nr_events[PERF_RECORD_SAMPLE];
if (nr_samples > 0) {
total_nr_samples += nr_samples;
hists__collapse_resort(hists, NULL);
/* Don't sort callchain */
perf_evsel__reset_sample_bit(pos, CALLCHAIN);
perf_evsel__output_resort(pos, NULL);
if (symbol_conf.event_group &&
!perf_evsel__is_group_leader(pos))
continue;
hists__find_annotations(hists, pos, ann);
}
}
if (total_nr_samples == 0) {
ui__error("The %s file has no samples!\n", session->file->path);
goto out;
}
if (use_browser == 2) {
void (*show_annotations)(void);
show_annotations = dlsym(perf_gtk_handle,
"perf_gtk__show_annotations");
if (show_annotations == NULL) {
ui__error("GTK browser not found!\n");
goto out;
}
show_annotations();
}
out:
return ret;
}
static const char * const annotate_usage[] = {
"perf annotate [<options>]",
NULL
};
int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused)
{
struct perf_annotate annotate = {
.tool = {
.sample = process_sample_event,
.mmap = perf_event__process_mmap,
.mmap2 = perf_event__process_mmap2,
.comm = perf_event__process_comm,
.exit = perf_event__process_exit,
.fork = perf_event__process_fork,
.ordered_events = true,
.ordering_requires_timestamps = true,
},
};
struct perf_data_file file = {
.mode = PERF_DATA_MODE_READ,
};
const struct option options[] = {
OPT_STRING('i', "input", &input_name, "file",
"input file name"),
OPT_STRING('d', "dsos", &symbol_conf.dso_list_str, "dso[,dso...]",
"only consider symbols in these dsos"),
OPT_STRING('s', "symbol", &annotate.sym_hist_filter, "symbol",
"symbol to annotate"),
OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
OPT_INCR('v', "verbose", &verbose,
"be more verbose (show symbol address, etc)"),
OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
"dump raw trace in ASCII"),
OPT_BOOLEAN(0, "gtk", &annotate.use_gtk, "Use the GTK interface"),
OPT_BOOLEAN(0, "tui", &annotate.use_tui, "Use the TUI interface"),
OPT_BOOLEAN(0, "stdio", &annotate.use_stdio, "Use the stdio interface"),
OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
"file", "vmlinux pathname"),
OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules,
"load module symbols - WARNING: use only with -k and LIVE kernel"),
OPT_BOOLEAN('l', "print-line", &annotate.print_line,
"print matching source lines (may be slow)"),
OPT_BOOLEAN('P', "full-paths", &annotate.full_paths,
"Don't shorten the displayed pathnames"),
OPT_BOOLEAN(0, "skip-missing", &annotate.skip_missing,
"Skip symbols that cannot be annotated"),
OPT_STRING('C', "cpu", &annotate.cpu_list, "cpu", "list of cpus to profile"),
OPT_CALLBACK(0, "symfs", NULL, "directory",
"Look for files with symbols relative to this directory",
symbol__config_symfs),
OPT_BOOLEAN(0, "source", &symbol_conf.annotate_src,
"Interleave source code with assembly code (default)"),
OPT_BOOLEAN(0, "asm-raw", &symbol_conf.annotate_asm_raw,
"Display raw encoding of assembly instructions (default)"),
OPT_STRING('M', "disassembler-style", &disassembler_style, "disassembler style",
"Specify disassembler style (e.g. -M intel for intel syntax)"),
OPT_STRING(0, "objdump", &objdump_path, "path",
"objdump binary to use for disassembly and annotations"),
OPT_BOOLEAN(0, "group", &symbol_conf.event_group,
"Show event group information together"),
OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period,
"Show a column with the sum of periods"),
OPT_CALLBACK_DEFAULT(0, "stdio-color", NULL, "mode",
"'always' (default), 'never' or 'auto' only applicable to --stdio mode",
stdio__config_color, "always"),
OPT_END()
};
int ret = hists__init();
if (ret < 0)
return ret;
argc = parse_options(argc, argv, options, annotate_usage, 0);
if (argc) {
/*
* Special case: if there's an argument left then assume that
* it's a symbol filter:
*/
if (argc > 1)
usage_with_options(annotate_usage, options);
annotate.sym_hist_filter = argv[0];
}
file.path = input_name;
annotate.session = perf_session__new(&file, false, &annotate.tool);
if (annotate.session == NULL)
return -1;
ret = symbol__annotation_init();
if (ret < 0)
goto out_delete;
symbol_conf.try_vmlinux_path = true;
ret = symbol__init(&annotate.session->header.env);
if (ret < 0)
goto out_delete;
if (setup_sorting(NULL) < 0)
usage_with_options(annotate_usage, options);
if (annotate.use_stdio)
use_browser = 0;
else if (annotate.use_tui)
use_browser = 1;
else if (annotate.use_gtk)
use_browser = 2;
setup_browser(true);
ret = __cmd_annotate(&annotate);
out_delete:
/*
* Speed up the exit process, for large files this can
* take quite a while.
*
* XXX Enable this when using valgrind or if we ever
* librarize this command.
*
* Also experiment with obstacks to see how much speed
* up we'll get here.
*
* perf_session__delete(session);
*/
return ret;
}