From 984028075794c00cbf4fb1e94bb6233e8be08875 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 2 May 2010 22:05:29 +0200 Subject: [PATCH 01/12] perf: Introduce a new "round of buffers read" pseudo event In order to provide a more robust and deterministic reordering algorithm, we need to know when we reach a point where we have just done a pass over every counter buffer to read everything it had. This patch introduces a new PERF_RECORD_FINISHED_ROUND pseudo event that consists only of an event header and doesn't need to contain anything. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Tom Zanussi Cc: Masami Hiramatsu --- tools/perf/builtin-record.c | 34 ++++++++++++++++++++++++---------- tools/perf/util/event.h | 3 ++- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index d3981ac50e1d..6b77b285fe10 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -494,6 +494,29 @@ static void event__synthesize_guest_os(struct machine *machine, void *data) " relocation symbol.\n", machine->pid); } +static struct perf_event_header finished_round_event = { + .size = sizeof(struct perf_event_header), + .type = PERF_RECORD_FINISHED_ROUND, +}; + +static void mmap_read_all(void) +{ + int i, counter, thread; + + for (i = 0; i < nr_cpu; i++) { + for (counter = 0; counter < nr_counters; counter++) { + for (thread = 0; thread < thread_num; thread++) { + if (mmap_array[i][counter][thread].base) + mmap_read(&mmap_array[i][counter][thread]); + } + + } + } + + if (perf_header__has_feat(&session->header, HEADER_TRACE_INFO)) + write_output(&finished_round_event, sizeof(finished_round_event)); +} + static int __cmd_record(int argc, const char **argv) { int i, counter; @@ -726,16 +749,7 @@ static int __cmd_record(int argc, const char **argv) int hits = samples; int thread; - for (i = 0; i < nr_cpu; i++) { - for (counter = 0; counter < nr_counters; counter++) { - for (thread = 0; - thread < thread_num; thread++) { - if (mmap_array[i][counter][thread].base) - mmap_read(&mmap_array[i][counter][thread]); - } - - } - } + mmap_read_all(); if (hits == samples) { if (done) diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index b364da5b0cbf..6cc1b1dced55 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -84,11 +84,12 @@ struct build_id_event { char filename[]; }; -enum perf_header_event_type { /* above any possible kernel type */ +enum perf_user_event_type { /* above any possible kernel type */ PERF_RECORD_HEADER_ATTR = 64, PERF_RECORD_HEADER_EVENT_TYPE = 65, PERF_RECORD_HEADER_TRACING_DATA = 66, PERF_RECORD_HEADER_BUILD_ID = 67, + PERF_RECORD_FINISHED_ROUND = 68, PERF_RECORD_HEADER_MAX }; From d6b17bebd79dae2e3577f2ea27a832af4991a5e6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 3 May 2010 15:14:33 +0200 Subject: [PATCH 02/12] perf: Provide a new deterministic events reordering algorithm The current events reordering algorithm is based on a heuristic that gets broken once we deal with a very fast flow of events. Indeed the time period based flushing is not suitable anymore in the following case, assuming we have a flush period of two seconds. CPU 0 | CPU 1 | cnt1 timestamps | cnt1 timestamps | 0 | 0 1 | 1 2 | 2 3 | 3 [...] | [...]
4 seconds later If we spend too much time reading the buffers (because there are a lot of events to record in each buffer or a lot of CPU buffers to read), in the next pass the CPU 0 buffer could contain a slice of several seconds of events. We'll read them all and notice we've reached the period to flush. In the above example we flush the first half of the CPU 0 buffer, then we read the CPU 1 buffer where we have events that fall inside the slice we already flushed, and the reordering fails. It's simple to reproduce with: perf lock record perf bench sched messaging To solve this, we use a new approach that doesn't rely on a heuristic time slice period anymore but on a deterministic basis, based on how perf record does its job. perf record saves the buffers through passes. A pass is a tour over every buffer from every CPU. This is done in order: for each CPU we read the buffers of every counter. So the more buffers we visit, the later the timestamps of their events will be. When perf record finishes a pass it records a PERF_RECORD_FINISHED_ROUND pseudo event. We record the max timestamp t found in pass n. Assuming these timestamps are monotonic across cpus, we know that if a buffer still has events with timestamps below t, they will all be available and read in pass n + 1. Hence when we start to read pass n + 2, we can safely flush every event with a timestamp below t. ============ PASS n ================= CPU 0 | CPU 1 | cnt1 timestamps | cnt2 timestamps 1 | 2 2 | 3 - | 4 <--- max recorded ============ PASS n + 1 ============== CPU 0 | CPU 1 | cnt1 timestamps | cnt2 timestamps 3 | 5 4 | 6 5 | 7 <---- max recorded Flush every event below timestamp 4 ============ PASS n + 2 ============== CPU 0 | CPU 1 | cnt1 timestamps | cnt2 timestamps 6 | 8 7 | 9 - | 10 Flush every event below timestamp 7 etc... It also works on perf.data versions that don't have the PERF_RECORD_FINISHED_ROUND pseudo event. The difference is that the events will only be flushed at the end of the perf.data processing. It will then consume more memory and scale less well with large perf.data files.
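For illustration only (editor's sketch, not part of this patch set): the pass-based flushing can be modeled by the small self-contained C program below. The names queue_sample() and finished_round() and the sorted linked list are invented for the example; the real tools/perf code keeps its own ordered sample queue (see the session.c changes that follow).

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy model of the two-round delayed flush: samples sit in a time-sorted
 * pending list; at each FINISHED_ROUND we flush everything below the limit
 * recorded at the end of the previous round, then arm the next limit. */

struct sample {
	uint64_t ts;
	struct sample *next;
};

static struct sample *pending;	/* time-ordered queued samples */
static uint64_t next_flush;	/* limit armed by the previous round */
static uint64_t max_timestamp;	/* highest timestamp seen so far */

static void queue_sample(uint64_t ts)
{
	struct sample **p = &pending, *new = malloc(sizeof(*new));

	new->ts = ts;
	while (*p && (*p)->ts <= ts)	/* keep the list sorted by time */
		p = &(*p)->next;
	new->next = *p;
	*p = new;

	if (ts > max_timestamp)
		max_timestamp = ts;
}

static void finished_round(void)
{
	/* Samples below next_flush can no longer be reordered: emit them. */
	while (pending && pending->ts < next_flush) {
		struct sample *s = pending;

		printf("deliver %llu\n", (unsigned long long)s->ts);
		pending = s->next;
		free(s);
	}
	/* Everything seen so far is complete once the *next* round ends. */
	next_flush = max_timestamp;
}

int main(void)
{
	/* Pass n from the changelog: CPU 0 {1, 2}, CPU 1 {2, 3, 4} */
	queue_sample(1); queue_sample(2);
	queue_sample(2); queue_sample(3); queue_sample(4);
	finished_round();	/* nothing delivered yet, limit becomes 4 */

	/* Pass n + 1: CPU 0 {3, 4, 5}, CPU 1 {5, 6, 7} */
	queue_sample(3); queue_sample(4); queue_sample(5);
	queue_sample(5); queue_sample(6); queue_sample(7);
	finished_round();	/* delivers 1 2 2 3 3, limit becomes 7 */

	return 0;
}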
Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Tom Zanussi Cc: Masami Hiramatsu --- tools/perf/util/session.c | 106 +++++++++++++++++++++++++++----------- tools/perf/util/session.h | 36 +++++++------ 2 files changed, 97 insertions(+), 45 deletions(-) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 5d353e70fe26..9401909fa283 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -98,7 +98,6 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc self->unknown_events = 0; self->machines = RB_ROOT; self->repipe = repipe; - self->ordered_samples.flush_limit = ULLONG_MAX; INIT_LIST_HEAD(&self->ordered_samples.samples_head); if (mode == O_RDONLY) { @@ -194,6 +193,18 @@ static int process_event_stub(event_t *event __used, return 0; } +static int process_finished_round_stub(event_t *event __used, + struct perf_session *session __used, + struct perf_event_ops *ops __used) +{ + dump_printf(": unhandled!\n"); + return 0; +} + +static int process_finished_round(event_t *event, + struct perf_session *session, + struct perf_event_ops *ops); + static void perf_event_ops__fill_defaults(struct perf_event_ops *handler) { if (handler->sample == NULL) @@ -222,6 +233,12 @@ static void perf_event_ops__fill_defaults(struct perf_event_ops *handler) handler->tracing_data = process_event_stub; if (handler->build_id == NULL) handler->build_id = process_event_stub; + if (handler->finished_round == NULL) { + if (handler->ordered_samples) + handler->finished_round = process_finished_round; + else + handler->finished_round = process_finished_round_stub; + } } static const char *event__name[] = { @@ -359,16 +376,14 @@ struct sample_queue { struct list_head list; }; -#define FLUSH_PERIOD (2 * NSEC_PER_SEC) - static void flush_sample_queue(struct perf_session *s, struct perf_event_ops *ops) { struct list_head *head = &s->ordered_samples.samples_head; - u64 limit = s->ordered_samples.flush_limit; + u64 limit = s->ordered_samples.next_flush; struct sample_queue *tmp, *iter; - if (!ops->ordered_samples) + if (!ops->ordered_samples || !limit) return; list_for_each_entry_safe(iter, tmp, head, list) { @@ -387,6 +402,55 @@ static void flush_sample_queue(struct perf_session *s, } } +/* + * When perf record finishes a pass on every buffers, it records this pseudo + * event. + * We record the max timestamp t found in the pass n. + * Assuming these timestamps are monotonic across cpus, we know that if + * a buffer still has events with timestamps below t, they will be all + * available and then read in the pass n + 1. + * Hence when we start to read the pass n + 2, we can safely flush every + * events with timestamps below t. + * + * ============ PASS n ================= + * CPU 0 | CPU 1 + * | + * cnt1 timestamps | cnt2 timestamps + * 1 | 2 + * 2 | 3 + * - | 4 <--- max recorded + * + * ============ PASS n + 1 ============== + * CPU 0 | CPU 1 + * | + * cnt1 timestamps | cnt2 timestamps + * 3 | 5 + * 4 | 6 + * 5 | 7 <---- max recorded + * + * Flush every events below timestamp 4 + * + * ============ PASS n + 2 ============== + * CPU 0 | CPU 1 + * | + * cnt1 timestamps | cnt2 timestamps + * 6 | 8 + * 7 | 9 + * - | 10 + * + * Flush every events below timestamp 7 + * etc... 
+ */ +static int process_finished_round(event_t *event __used, + struct perf_session *session, + struct perf_event_ops *ops) +{ + flush_sample_queue(session, ops); + session->ordered_samples.next_flush = session->ordered_samples.max_timestamp; + + return 0; +} + static void __queue_sample_end(struct sample_queue *new, struct list_head *head) { struct sample_queue *iter; @@ -455,17 +519,12 @@ static void __queue_sample_event(struct sample_queue *new, } static int queue_sample_event(event_t *event, struct sample_data *data, - struct perf_session *s, - struct perf_event_ops *ops) + struct perf_session *s) { u64 timestamp = data->time; struct sample_queue *new; - u64 flush_limit; - if (s->ordered_samples.flush_limit == ULLONG_MAX) - s->ordered_samples.flush_limit = timestamp + FLUSH_PERIOD; - if (timestamp < s->ordered_samples.last_flush) { printf("Warning: Timestamp below last timeslice flush\n"); return -EINVAL; @@ -488,23 +547,8 @@ static int queue_sample_event(event_t *event, struct sample_data *data, __queue_sample_event(new, s); s->ordered_samples.last_inserted = new; - /* - * We want to have a slice of events covering 2 * FLUSH_PERIOD - * If FLUSH_PERIOD is big enough, it ensures every events that occured - * in the first half of the timeslice have all been buffered and there - * are none remaining (we need that because of the weakly ordered - * event recording we have). Then once we reach the 2 * FLUSH_PERIOD - * timeslice, we flush the first half to be gentle with the memory - * (the second half can still get new events in the middle, so wait - * another period to flush it) - */ - flush_limit = s->ordered_samples.flush_limit; - - if (new->timestamp > flush_limit && - new->timestamp - flush_limit > FLUSH_PERIOD) { - s->ordered_samples.flush_limit += FLUSH_PERIOD; - flush_sample_queue(s, ops); - } + if (new->timestamp > s->ordered_samples.max_timestamp) + s->ordered_samples.max_timestamp = new->timestamp; return 0; } @@ -520,7 +564,7 @@ static int perf_session__process_sample(event_t *event, struct perf_session *s, bzero(&data, sizeof(struct sample_data)); event__parse_sample(event, s->sample_type, &data); - queue_sample_event(event, &data, s, ops); + queue_sample_event(event, &data, s); return 0; } @@ -572,6 +616,8 @@ static int perf_session__process_event(struct perf_session *self, return ops->tracing_data(event, self); case PERF_RECORD_HEADER_BUILD_ID: return ops->build_id(event, self); + case PERF_RECORD_FINISHED_ROUND: + return ops->finished_round(event, self, ops); default: self->unknown_events++; return -1; @@ -786,7 +832,7 @@ int __perf_session__process_events(struct perf_session *self, done: err = 0; /* do the final flush for ordered samples */ - self->ordered_samples.flush_limit = ULLONG_MAX; + self->ordered_samples.next_flush = ULLONG_MAX; flush_sample_queue(self, ops); out_err: ui_progress__delete(progress); diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index f2b2c6a3a49d..568fd08a478d 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -14,7 +14,8 @@ struct thread; struct ordered_samples { u64 last_flush; - u64 flush_limit; + u64 next_flush; + u64 max_timestamp; struct list_head samples_head; struct sample_queue *last_inserted; }; @@ -41,23 +42,28 @@ struct perf_session { char filename[0]; }; +struct perf_event_ops; + typedef int (*event_op)(event_t *self, struct perf_session *session); +typedef int (*event_op2)(event_t *self, struct perf_session *session, + struct perf_event_ops *ops); struct perf_event_ops { - event_op sample, - 
mmap, - comm, - fork, - exit, - lost, - read, - throttle, - unthrottle, - attr, - event_type, - tracing_data, - build_id; - bool ordered_samples; + event_op sample, + mmap, + comm, + fork, + exit, + lost, + read, + throttle, + unthrottle, + attr, + event_type, + tracing_data, + build_id; + event_op2 finished_round; + bool ordered_samples; }; struct perf_session *perf_session__new(const char *filename, int mode, bool force, bool repipe); From 26242d859c9be9eea61f7f19514e9d272ae8ce26 Mon Sep 17 00:00:00 2001 From: Hitoshi Mitake Date: Mon, 3 May 2010 14:12:00 +0900 Subject: [PATCH 03/12] perf lock: Add "info" subcommand for dumping misc information This adds the "info" subcommand to perf lock which can be used to dump metadata like threads or addresses of lock instances. "map" was removed because info should do the work for it. This will be useful not only for debugging but also for ordinary analyzing. v2: adding example of usage % sudo ./perf lock info -t | Thread ID: comm | 0: swapper | 1: init | 18: migration/5 | 29: events/2 | 32: events/5 | 33: events/6 ... % sudo ./perf lock info -m | Address of instance: name of class | 0xffff8800b95adae0: &(&sighand->siglock)->rlock | 0xffff8800bbb41ae0: &(&sighand->siglock)->rlock | 0xffff8800bf165ae0: &(&sighand->siglock)->rlock | 0xffff8800b9576a98: &p->cred_guard_mutex | 0xffff8800bb890a08: &(&p->alloc_lock)->rlock | 0xffff8800b9522a08: &(&p->alloc_lock)->rlock | 0xffff8800bb8aaa08: &(&p->alloc_lock)->rlock | 0xffff8800bba72a08: &(&p->alloc_lock)->rlock | 0xffff8800bf18ea08: &(&p->alloc_lock)->rlock | 0xffff8800b8a0d8a0: &(&ip->i_lock)->mr_lock | 0xffff88009bf818a0: &(&ip->i_lock)->mr_lock | 0xffff88004c66b8a0: &(&ip->i_lock)->mr_lock | 0xffff8800bb6478a0: &(shost->host_lock)->rlock v3: fixed some problems Frederic pointed out * better rbtree tracking in dump_threads() * removed printf() and used pr_info() and pr_debug() Signed-off-by: Hitoshi Mitake Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Jason Baron Cc: Xiao Guangrong LKML-Reference: <1272863520-16179-1-git-send-email-mitake@dcl.info.waseda.ac.jp> Signed-off-by: Frederic Weisbecker --- tools/perf/builtin-lock.c | 96 +++++++++++++++++++++++++++++---------- 1 file changed, 73 insertions(+), 23 deletions(-) diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 6605000ed73d..c4eb854ff7eb 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -720,15 +720,15 @@ static void print_result(void) char cut_name[20]; int bad, total; - printf("%20s ", "Name"); - printf("%10s ", "acquired"); - printf("%10s ", "contended"); + pr_info("%20s ", "Name"); + pr_info("%10s ", "acquired"); + pr_info("%10s ", "contended"); - printf("%15s ", "total wait (ns)"); - printf("%15s ", "max wait (ns)"); - printf("%15s ", "min wait (ns)"); + pr_info("%15s ", "total wait (ns)"); + pr_info("%15s ", "max wait (ns)"); + pr_info("%15s ", "min wait (ns)"); - printf("\n\n"); + pr_info("\n\n"); bad = total = 0; while ((st = pop_from_result())) { @@ -741,7 +741,7 @@ static void print_result(void) if (strlen(st->name) < 16) { /* output raw name */ - printf("%20s ", st->name); + pr_info("%20s ", st->name); } else { strncpy(cut_name, st->name, 16); cut_name[16] = '.'; @@ -749,17 +749,17 @@ static void print_result(void) cut_name[18] = '.'; cut_name[19] = '\0'; /* cut off name for saving output style */ - printf("%20s ", cut_name); + pr_info("%20s ", cut_name); } - printf("%10u ", st->nr_acquired); - printf("%10u ", st->nr_contended); + 
pr_info("%10u ", st->nr_acquired); + pr_info("%10u ", st->nr_contended); - printf("%15llu ", st->wait_time_total); - printf("%15llu ", st->wait_time_max); - printf("%15llu ", st->wait_time_min == ULLONG_MAX ? + pr_info("%15llu ", st->wait_time_total); + pr_info("%15llu ", st->wait_time_max); + pr_info("%15llu ", st->wait_time_min == ULLONG_MAX ? 0 : st->wait_time_min); - printf("\n"); + pr_info("\n"); } { @@ -768,28 +768,59 @@ static void print_result(void) const char *name[4] = { "acquire", "acquired", "contended", "release" }; - printf("\n=== output for debug===\n\n"); - printf("bad:%d, total:%d\n", bad, total); - printf("bad rate:%f\n", (double)(bad / total)); + pr_debug("\n=== output for debug===\n\n"); + pr_debug("bad:%d, total:%d\n", bad, total); + pr_debug("bad rate:%f\n", (double)(bad / total)); - printf("histogram of events caused bad sequence\n"); + pr_debug("histogram of events caused bad sequence\n"); for (i = 0; i < 4; i++) - printf(" %10s: %d\n", name[i], bad_hist[i]); + pr_debug(" %10s: %d\n", name[i], bad_hist[i]); } } +static int info_threads; +static int info_map; + +static void dump_threads(void) +{ + struct thread_stat *st; + struct rb_node *node; + struct thread *t; + + pr_info("%10s: comm\n", "Thread ID"); + + node = rb_first(&thread_stats); + while (node) { + st = container_of(node, struct thread_stat, rb); + t = perf_session__findnew(session, st->tid); + pr_info("%10d: %s\n", st->tid, t->comm); + node = rb_next(node); + }; +} + static void dump_map(void) { unsigned int i; struct lock_stat *st; + pr_info("Address of instance: name of class\n"); for (i = 0; i < LOCKHASH_SIZE; i++) { list_for_each_entry(st, &lockhash_table[i], hash_entry) { - printf("%p: %s\n", st->addr, st->name); + pr_info(" %p: %s\n", st->addr, st->name); } } } +static void dump_info(void) +{ + if (info_threads) + dump_threads(); + else if (info_map) + dump_map(); + else + die("Unknown type of information\n"); +} + static int process_sample_event(event_t *self, struct perf_session *s) { struct sample_data data; @@ -858,6 +889,19 @@ static const struct option report_options[] = { OPT_END() }; +static const char * const info_usage[] = { + "perf lock info []", + NULL +}; + +static const struct option info_options[] = { + OPT_BOOLEAN('t', "threads", &info_threads, + "dump thread list in perf.data"), + OPT_BOOLEAN('m', "map", &info_map, + "map of lock instances (name:address table)"), + OPT_END() +}; + static const char * const lock_usage[] = { "perf lock [] {record|trace|report}", NULL @@ -929,12 +973,18 @@ int cmd_lock(int argc, const char **argv, const char *prefix __used) } else if (!strcmp(argv[0], "trace")) { /* Aliased to 'perf trace' */ return cmd_trace(argc, argv, prefix); - } else if (!strcmp(argv[0], "map")) { + } else if (!strcmp(argv[0], "info")) { + if (argc) { + argc = parse_options(argc, argv, + info_options, info_usage, 0); + if (argc) + usage_with_options(info_usage, info_options); + } /* recycling report_lock_ops */ trace_handler = &report_lock_ops; setup_pager(); read_events(); - dump_map(); + dump_info(); } else { usage_with_options(lock_usage, lock_options); } From 10350ec362b48f79f3df8447c25813790075e27c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 5 May 2010 23:47:28 +0200 Subject: [PATCH 04/12] perf: Cleanup perf lock broken states Use enum to get a human view of bad_hist indexes and put bad histogram output in its own function. 
Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Hitoshi Mitake --- tools/perf/builtin-lock.c | 49 +++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index c4eb854ff7eb..1e93179c2d30 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -387,7 +387,15 @@ static struct lock_seq_stat *get_seq(struct thread_stat *ts, void *addr) return seq; } -static int bad_hist[4]; +enum broken_state { + BROKEN_ACQUIRE, + BROKEN_ACQUIRED, + BROKEN_CONTENDED, + BROKEN_RELEASE, + BROKEN_MAX, +}; + +static int bad_hist[BROKEN_MAX]; static void report_lock_acquire_event(struct trace_acquire_event *acquire_event, @@ -437,7 +445,7 @@ report_lock_acquire_event(struct trace_acquire_event *acquire_event, broken: /* broken lock sequence, discard it */ ls->discard = 1; - bad_hist[0]++; + bad_hist[BROKEN_ACQUIRE]++; list_del(&seq->list); free(seq); goto end; @@ -481,7 +489,6 @@ report_lock_acquired_event(struct trace_acquired_event *acquired_event, case SEQ_STATE_CONTENDED: contended_term = timestamp - seq->prev_event_time; ls->wait_time_total += contended_term; - if (contended_term < ls->wait_time_min) ls->wait_time_min = contended_term; else if (ls->wait_time_max < contended_term) @@ -492,7 +499,7 @@ report_lock_acquired_event(struct trace_acquired_event *acquired_event, case SEQ_STATE_READ_ACQUIRED: /* broken lock sequence, discard it */ ls->discard = 1; - bad_hist[1]++; + bad_hist[BROKEN_ACQUIRED]++; list_del(&seq->list); free(seq); goto end; @@ -540,7 +547,7 @@ report_lock_contended_event(struct trace_contended_event *contended_event, case SEQ_STATE_CONTENDED: /* broken lock sequence, discard it */ ls->discard = 1; - bad_hist[2]++; + bad_hist[BROKEN_CONTENDED]++; list_del(&seq->list); free(seq); goto end; @@ -594,7 +601,7 @@ report_lock_release_event(struct trace_release_event *release_event, case SEQ_STATE_RELEASED: /* broken lock sequence, discard it */ ls->discard = 1; - bad_hist[3]++; + bad_hist[BROKEN_RELEASE]++; goto free_seq; break; default: @@ -713,6 +720,21 @@ process_raw_event(void *data, int cpu, u64 timestamp, struct thread *thread) process_lock_release_event(data, event, cpu, timestamp, thread); } +static void print_bad_events(int bad, int total) +{ + /* Output for debug, this have to be removed */ + int i; + const char *name[4] = + { "acquire", "acquired", "contended", "release" }; + + pr_info("\n=== output for debug===\n\n"); + pr_info("bad:%d, total:%d\n", bad, total); + pr_info("bad rate:%f\n", (double)(bad / total)); + pr_info("histogram of events caused bad sequence\n"); + for (i = 0; i < BROKEN_MAX; i++) + pr_info(" %10s: %d\n", name[i], bad_hist[i]); +} + /* TODO: various way to print, coloring, nano or milli sec */ static void print_result(void) { @@ -762,20 +784,7 @@ static void print_result(void) pr_info("\n"); } - { - /* Output for debug, this have to be removed */ - int i; - const char *name[4] = - { "acquire", "acquired", "contended", "release" }; - - pr_debug("\n=== output for debug===\n\n"); - pr_debug("bad:%d, total:%d\n", bad, total); - pr_debug("bad rate:%f\n", (double)(bad / total)); - - pr_debug("histogram of events caused bad sequence\n"); - for (i = 0; i < 4; i++) - pr_debug(" %10s: %d\n", name[i], bad_hist[i]); - } + print_bad_events(bad, total); } static int info_threads; From 84c7a21791eb2e962a27e19bab5b77d5d9e13a34 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 5 
May 2010 23:57:25 +0200 Subject: [PATCH 05/12] perf: Humanize lock flags in perf lock Use an enum instead of plain constants for lock flags. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Hitoshi Mitake --- tools/perf/builtin-lock.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 1e93179c2d30..3b304ed4d2e9 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -397,6 +397,11 @@ enum broken_state { static int bad_hist[BROKEN_MAX]; +enum acquire_flags { + TRY_LOCK = 1, + READ_LOCK = 2, +}; + static void report_lock_acquire_event(struct trace_acquire_event *acquire_event, struct event *__event __used, @@ -421,9 +426,9 @@ report_lock_acquire_event(struct trace_acquire_event *acquire_event, if (!acquire_event->flag) { seq->state = SEQ_STATE_ACQUIRING; } else { - if (acquire_event->flag & 1) + if (acquire_event->flag & TRY_LOCK) ls->nr_trylock++; - if (acquire_event->flag & 2) + if (acquire_event->flag & READ_LOCK) ls->nr_readlock++; seq->state = SEQ_STATE_READ_ACQUIRED; seq->read_count = 1; @@ -431,7 +436,7 @@ report_lock_acquire_event(struct trace_acquire_event *acquire_event, break; case SEQ_STATE_READ_ACQUIRED: - if (acquire_event->flag & 2) { + if (acquire_event->flag & READ_LOCK) { seq->read_count++; ls->nr_acquired++; goto end; From 5efe08cf685f33f562566dc68b6077b6f6a4f706 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 6 May 2010 04:55:22 +0200 Subject: [PATCH 06/12] perf: Fix perf lock bad rate Fix the cast used to compute the bad rate: it is applied to the result instead of the operands. We need the operands to be cast to double, otherwise the result will always be zero. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Hitoshi Mitake --- tools/perf/builtin-lock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 3b304ed4d2e9..d7dde9cbbdf4 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -733,8 +733,8 @@ static void print_bad_events(int bad, int total) { "acquire", "acquired", "contended", "release" }; pr_info("\n=== output for debug===\n\n"); - pr_info("bad:%d, total:%d\n", bad, total); - pr_info("bad rate:%f\n", (double)(bad / total)); + pr_info("bad: %d, total: %d\n", bad, total); + pr_info("bad rate: %f %%\n", (double)bad / (double)total * 100); pr_info("histogram of events caused bad sequence\n"); for (i = 0; i < BROKEN_MAX; i++) pr_info(" %10s: %d\n", name[i], bad_hist[i]); From 90c0e5fc7b73d2575c7367e1da70ff9521718e5e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 May 2010 02:33:42 +0200 Subject: [PATCH 07/12] perf lock: Always check min AND max wait time When a lock is acquired after being contended, we update the wait time statistics for the given lock. But if the min wait time is updated, we don't check the max wait time. This is wrong because the first time we update the wait time, we want to update both min and max wait time.
Before: Name acquired contended total wait (ns) max wait (ns) min wait (ns) key 8 1 21656 0 21656 After: Name acquired contended total wait (ns) max wait (ns) min wait (ns) key 8 1 21656 21656 21656 Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Hitoshi Mitake --- tools/perf/builtin-lock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index d7dde9cbbdf4..e549f4547b98 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -496,7 +496,7 @@ report_lock_acquired_event(struct trace_acquired_event *acquired_event, ls->wait_time_total += contended_term; if (contended_term < ls->wait_time_min) ls->wait_time_min = contended_term; - else if (ls->wait_time_max < contended_term) + if (ls->wait_time_max < contended_term) ls->wait_time_max = contended_term; break; case SEQ_STATE_RELEASED: From 883a2a3189dae9d2912c417e47152f51cb922a3f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 May 2010 06:16:11 +0200 Subject: [PATCH 08/12] tracing: Drop lock_acquired waittime field Drop the waittime field from the lock_acquired event: we can calculate it by subtracting the matching lock_acquire event timestamp from the lock_acquired one. It is not needed and wastes space in the traces. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Hitoshi Mitake Cc: Steven Rostedt --- include/trace/events/lock.h | 11 ++++------- kernel/lockdep.c | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h index 5c1dcfc16c60..17ca287ae176 100644 --- a/include/trace/events/lock.h +++ b/include/trace/events/lock.h @@ -78,24 +78,21 @@ TRACE_EVENT(lock_contended, ); TRACE_EVENT(lock_acquired, - TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime), + TP_PROTO(struct lockdep_map *lock, unsigned long ip), - TP_ARGS(lock, ip, waittime), + TP_ARGS(lock, ip), TP_STRUCT__entry( __string(name, lock->name) - __field(s64, wait_nsec) __field(void *, lockdep_addr) ), TP_fast_assign( __assign_str(name, lock->name); - __entry->wait_nsec = waittime; __entry->lockdep_addr = lock; ), - TP_printk("%p %s (%llu ns)", __entry->lockdep_addr, - __get_str(name), - __entry->wait_nsec) + TP_printk("%p %s", __entry->lockdep_addr, + __get_str(name)) ); #endif diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 2594e1ce41cb..31e22e742368 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3380,7 +3380,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) hlock->holdtime_stamp = now; } - trace_lock_acquired(lock, ip, waittime); + trace_lock_acquired(lock, ip); stats = get_lock_stats(hlock_class(hlock)); if (waittime) { From 93135439459920c4d856f4ab8f068c030085c8df Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 May 2010 06:24:25 +0200 Subject: [PATCH 09/12] tracing: Drop the nested field from lock_release event Drop the nested field as we don't use it. Every nested state can already be computed by a state machine during post-processing.
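As a rough sketch of that claim (editor's illustration, not code from this series; the struct and helpers are hypothetical), a post-processing tool can rebuild the nesting depth of a lock per thread from the remaining acquire/release events alone:

/* Hypothetical post-processing state: one entry per (thread, lock address). */
struct lock_seq {
	void *addr;	/* lockdep_addr carried by the lock events */
	int depth;	/* current acquire depth for this thread */
};

void on_lock_acquire(struct lock_seq *seq)
{
	seq->depth++;	/* 0 -> 1 is the outermost acquire */
}

/* Returns non-zero when the lock is still held after this release,
 * i.e. the release closed a nested acquire. */
int on_lock_release(struct lock_seq *seq)
{
	if (seq->depth > 0)
		seq->depth--;
	return seq->depth > 0;
}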
Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Hitoshi Mitake Cc: Steven Rostedt --- include/trace/events/lock.h | 4 ++-- kernel/lockdep.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h index 17ca287ae176..fde4c3853391 100644 --- a/include/trace/events/lock.h +++ b/include/trace/events/lock.h @@ -37,9 +37,9 @@ TRACE_EVENT(lock_acquire, TRACE_EVENT(lock_release, - TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip), + TP_PROTO(struct lockdep_map *lock, unsigned long ip), - TP_ARGS(lock, nested, ip), + TP_ARGS(lock, ip), TP_STRUCT__entry( __string(name, lock->name) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 31e22e742368..e9c759f06c1d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3227,7 +3227,7 @@ void lock_release(struct lockdep_map *lock, int nested, raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; - trace_lock_release(lock, nested, ip); + trace_lock_release(lock, ip); __lock_release(lock, nested, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); From 2c193c736803ceb547daec725e5c5d992d039f20 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 May 2010 06:36:02 +0200 Subject: [PATCH 10/12] tracing: Factorize lock events in a lock class lock_acquired, lock_contended and lock_release now share the same prototype and format. Let's factorize them into a lock event class. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Hitoshi Mitake Cc: Steven Rostedt --- include/trace/events/lock.h | 50 ++++++++++++------------------------- 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h index fde4c3853391..2821b86de63b 100644 --- a/include/trace/events/lock.h +++ b/include/trace/events/lock.h @@ -35,15 +35,15 @@ TRACE_EVENT(lock_acquire, __get_str(name)) ); -TRACE_EVENT(lock_release, +DECLARE_EVENT_CLASS(lock, TP_PROTO(struct lockdep_map *lock, unsigned long ip), TP_ARGS(lock, ip), TP_STRUCT__entry( - __string(name, lock->name) - __field(void *, lockdep_addr) + __string( name, lock->name ) + __field( void *, lockdep_addr ) ), TP_fast_assign( @@ -51,48 +51,30 @@ TRACE_EVENT(lock_release, __entry->lockdep_addr = lock; ), - TP_printk("%p %s", - __entry->lockdep_addr, __get_str(name)) + TP_printk("%p %s", __entry->lockdep_addr, __get_str(name)) +); + +DEFINE_EVENT(lock, lock_release, + + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + + TP_ARGS(lock, ip) ); #ifdef CONFIG_LOCK_STAT -TRACE_EVENT(lock_contended, +DEFINE_EVENT(lock, lock_contended, TP_PROTO(struct lockdep_map *lock, unsigned long ip), - TP_ARGS(lock, ip), - - TP_STRUCT__entry( - __string(name, lock->name) - __field(void *, lockdep_addr) - ), - - TP_fast_assign( - __assign_str(name, lock->name); - __entry->lockdep_addr = lock; - ), - - TP_printk("%p %s", - __entry->lockdep_addr, __get_str(name)) + TP_ARGS(lock, ip) ); -TRACE_EVENT(lock_acquired, +DEFINE_EVENT(lock, lock_acquired, + TP_PROTO(struct lockdep_map *lock, unsigned long ip), - TP_ARGS(lock, ip), - - TP_STRUCT__entry( - __string(name, lock->name) - __field(void *, lockdep_addr) - ), - - TP_fast_assign( - __assign_str(name, lock->name); - __entry->lockdep_addr = lock; - ), - TP_printk("%p %s", __entry->lockdep_addr, - __get_str(name)) + TP_ARGS(lock, ip) ); #endif From 794e43b56c18b95fc9776c914a2659e7d558a352 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 5 May 2010 00:27:40 -0500 
Subject: [PATCH 11/12] perf/live-mode: Handle payload-less events Some events, such as the PERF_RECORD_FINISHED_ROUND event consist of only an event header and no data. In this case, a 0-length payload will be read, and the 0 return value will be wrongly interpreted as an 'unexpected end of event stream'. This patch allows for proper handling of data-less events by skipping 0-length reads. Signed-off-by: Tom Zanussi Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Masami Hiramatsu LKML-Reference: <1273038527.6383.51.camel@tropicana> Signed-off-by: Frederic Weisbecker --- tools/perf/util/session.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 9401909fa283..00ab298bbb4b 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -696,15 +696,18 @@ static int __perf_session__process_pipe_events(struct perf_session *self, p = &event; p += sizeof(struct perf_event_header); - err = do_read(self->fd, p, size - sizeof(struct perf_event_header)); - if (err <= 0) { - if (err == 0) { - pr_err("unexpected end of event stream\n"); - goto done; - } + if (size - sizeof(struct perf_event_header)) { + err = do_read(self->fd, p, + size - sizeof(struct perf_event_header)); + if (err <= 0) { + if (err == 0) { + pr_err("unexpected end of event stream\n"); + goto done; + } - pr_err("failed to read event data\n"); - goto out_err; + pr_err("failed to read event data\n"); + goto out_err; + } } if (size == 0 || From 76ba7e846fcc89d9d4b25b89e303c9058de96d60 Mon Sep 17 00:00:00 2001 From: Hitoshi Mitake Date: Sat, 8 May 2010 17:10:29 +0900 Subject: [PATCH 12/12] perf lock: Drop "-a" option from cmd_record() default arguments set This patch drops "-a" from the default arguments passed to perf record by perf lock. If a user wants to do a system wide record of lock events, perf lock record -a ... is enough for this purpose. This can reduce the size of the perf.data file. % sudo ./perf lock record whoami root [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.439 MB perf.data (~19170 samples) ] % sudo ./perf lock record -a whoami # with -a option root [ perf record: Woken up 0 times to write data ] [ perf record: Captured and wrote 48.962 MB perf.data (~2139197 samples) ] Signed-off-by: Hitoshi Mitake Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Jason Baron Cc: Xiao Guangrong LKML-Reference: Message-Id: <1273306229-5216-1-git-send-email-mitake@dcl.info.waseda.ac.jp> Signed-off-by: Frederic Weisbecker --- tools/perf/builtin-lock.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index e549f4547b98..e18dfdc2948a 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -930,7 +930,6 @@ static const struct option lock_options[] = { static const char *record_args[] = { "record", - "-a", "-R", "-f", "-m", "1024",