From 984028075794c00cbf4fb1e94bb6233e8be08875 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 2 May 2010 22:05:29 +0200 Subject: [PATCH 01/12] perf: Introduce a new "round of buffers read" pseudo event In order to provide a more robust and deterministic reordering algorithm, we need to know when we reach a point where we have just done a pass over every counter buffer to read everything it had. This patch introduces a new PERF_RECORD_FINISHED_ROUND pseudo event that consists only of an event header and doesn't need to contain anything. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Tom Zanussi Cc: Masami Hiramatsu --- tools/perf/builtin-record.c | 34 ++++++++++++++++++++++++---------- tools/perf/util/event.h | 3 ++- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index d3981ac50e1d..6b77b285fe10 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -494,6 +494,29 @@ static void event__synthesize_guest_os(struct machine *machine, void *data) " relocation symbol.\n", machine->pid); } +static struct perf_event_header finished_round_event = { + .size = sizeof(struct perf_event_header), + .type = PERF_RECORD_FINISHED_ROUND, +}; + +static void mmap_read_all(void) +{ + int i, counter, thread; + + for (i = 0; i < nr_cpu; i++) { + for (counter = 0; counter < nr_counters; counter++) { + for (thread = 0; thread < thread_num; thread++) { + if (mmap_array[i][counter][thread].base) + mmap_read(&mmap_array[i][counter][thread]); + } + + } + } + + if (perf_header__has_feat(&session->header, HEADER_TRACE_INFO)) + write_output(&finished_round_event, sizeof(finished_round_event)); +} + static int __cmd_record(int argc, const char **argv) { int i, counter; @@ -726,16 +749,7 @@ static int __cmd_record(int argc, const char **argv) int hits = samples; int thread; - for (i = 0; i < nr_cpu; i++) { - for (counter = 0; counter < nr_counters; counter++) { - for (thread = 0; - thread < thread_num; thread++) { - if (mmap_array[i][counter][thread].base) - mmap_read(&mmap_array[i][counter][thread]); - } - - } - } + mmap_read_all(); if (hits == samples) { if (done) diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index b364da5b0cbf..6cc1b1dced55 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -84,11 +84,12 @@ struct build_id_event { char filename[]; }; -enum perf_header_event_type { /* above any possible kernel type */ +enum perf_user_event_type { /* above any possible kernel type */ PERF_RECORD_HEADER_ATTR = 64, PERF_RECORD_HEADER_EVENT_TYPE = 65, PERF_RECORD_HEADER_TRACING_DATA = 66, PERF_RECORD_HEADER_BUILD_ID = 67, + PERF_RECORD_FINISHED_ROUND = 68, PERF_RECORD_HEADER_MAX }; From d6b17bebd79dae2e3577f2ea27a832af4991a5e6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 3 May 2010 15:14:33 +0200 Subject: [PATCH 02/12] perf: Provide a new deterministic events reordering algorithm The current events reordering algorithm is based on a heuristic that gets broken once we deal with a very fast flow of events. Indeed the time period based flushing is not suitable anymore in the following case, assuming we have a flush period of two seconds. CPU 0 | CPU 1 | cnt1 timestamps | cnt1 timestamps | 0 | 0 1 | 1 2 | 2 3 | 3 [...] | [...]
4 seconds later If we spend too much time reading the buffers (because there are a lot of events to record in each buffer or a lot of CPU buffers to read), in the next pass the CPU 0 buffer could contain a slice of several seconds of events. We'll read them all and notice we've reached the period to flush. In the above example we flush the first half of the CPU 0 buffer, then we read the CPU 1 buffer where we have events that fall inside the slice we already flushed, and the reordering fails. It's simple to reproduce with: perf lock record perf bench sched messaging To solve this, we use a new approach that doesn't rely on a heuristic time slice period anymore but on a deterministic basis, based on how perf record does its job. perf record saves the buffers through passes. A pass is a tour over every buffer from every CPU. This is done in order: for each CPU we read the buffers of every counter. So the more buffers we visit, the later the timestamps of their events will be. When perf record finishes a pass it records a PERF_RECORD_FINISHED_ROUND pseudo event. We record the max timestamp t found in pass n. Assuming these timestamps are monotonic across cpus, we know that if a buffer still has events with timestamps below t, they will all be available and read in pass n + 1. Hence when we start to read pass n + 2, we can safely flush every event with a timestamp below t. ============ PASS n ================= CPU 0 | CPU 1 | cnt1 timestamps | cnt2 timestamps 1 | 2 2 | 3 - | 4 <--- max recorded ============ PASS n + 1 ============== CPU 0 | CPU 1 | cnt1 timestamps | cnt2 timestamps 3 | 5 4 | 6 5 | 7 <---- max recorded Flush every event below timestamp 4 ============ PASS n + 2 ============== CPU 0 | CPU 1 | cnt1 timestamps | cnt2 timestamps 6 | 8 7 | 9 - | 10 Flush every event below timestamp 7 etc... It also works on perf.data versions that don't have the PERF_RECORD_FINISHED_ROUND pseudo event. The difference is that the events will only be flushed at the end of the perf.data processing. It will then consume more memory and scale less well with large perf.data files.
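For illustration only (editor's sketch, not part of this patch set): the pass-based flushing can be modeled by the small self-contained C program below. The names queue_sample() and finished_round() and the sorted linked list are invented for the example; the real tools/perf code keeps its own ordered sample queue (see the session.c changes that follow).

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy model of the two-round delayed flush: samples sit in a time-sorted
 * pending list; at each FINISHED_ROUND we flush everything below the limit
 * recorded at the end of the previous round, then arm the next limit. */

struct sample {
	uint64_t ts;
	struct sample *next;
};

static struct sample *pending;	/* time-ordered queued samples */
static uint64_t next_flush;	/* limit armed by the previous round */
static uint64_t max_timestamp;	/* highest timestamp seen so far */

static void queue_sample(uint64_t ts)
{
	struct sample **p = &pending, *new = malloc(sizeof(*new));

	new->ts = ts;
	while (*p && (*p)->ts <= ts)	/* keep the list sorted by time */
		p = &(*p)->next;
	new->next = *p;
	*p = new;

	if (ts > max_timestamp)
		max_timestamp = ts;
}

static void finished_round(void)
{
	/* Samples below next_flush can no longer be reordered: emit them. */
	while (pending && pending->ts < next_flush) {
		struct sample *s = pending;

		printf("deliver %llu\n", (unsigned long long)s->ts);
		pending = s->next;
		free(s);
	}
	/* Everything seen so far is complete once the *next* round ends. */
	next_flush = max_timestamp;
}

int main(void)
{
	/* Pass n from the changelog: CPU 0 {1, 2}, CPU 1 {2, 3, 4} */
	queue_sample(1); queue_sample(2);
	queue_sample(2); queue_sample(3); queue_sample(4);
	finished_round();	/* nothing delivered yet, limit becomes 4 */

	/* Pass n + 1: CPU 0 {3, 4, 5}, CPU 1 {5, 6, 7} */
	queue_sample(3); queue_sample(4); queue_sample(5);
	queue_sample(5); queue_sample(6); queue_sample(7);
	finished_round();	/* delivers 1 2 2 3 3, limit becomes 7 */

	return 0;
}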
Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Tom Zanussi Cc: Masami Hiramatsu --- tools/perf/util/session.c | 106 +++++++++++++++++++++++++++----------- tools/perf/util/session.h | 36 +++++++------ 2 files changed, 97 insertions(+), 45 deletions(-) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 5d353e70fe26..9401909fa283 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -98,7 +98,6 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc self->unknown_events = 0; self->machines = RB_ROOT; self->repipe = repipe; - self->ordered_samples.flush_limit = ULLONG_MAX; INIT_LIST_HEAD(&self->ordered_samples.samples_head); if (mode == O_RDONLY) { @@ -194,6 +193,18 @@ static int process_event_stub(event_t *event __used, return 0; } +static int process_finished_round_stub(event_t *event __used, + struct perf_session *session __used, + struct perf_event_ops *ops __used) +{ + dump_printf(": unhandled!\n"); + return 0; +} + +static int process_finished_round(event_t *event, + struct perf_session *session, + struct perf_event_ops *ops); + static void perf_event_ops__fill_defaults(struct perf_event_ops *handler) { if (handler->sample == NULL) @@ -222,6 +233,12 @@ static void perf_event_ops__fill_defaults(struct perf_event_ops *handler) handler->tracing_data = process_event_stub; if (handler->build_id == NULL) handler->build_id = process_event_stub; + if (handler->finished_round == NULL) { + if (handler->ordered_samples) + handler->finished_round = process_finished_round; + else + handler->finished_round = process_finished_round_stub; + } } static const char *event__name[] = { @@ -359,16 +376,14 @@ struct sample_queue { struct list_head list; }; -#define FLUSH_PERIOD (2 * NSEC_PER_SEC) - static void flush_sample_queue(struct perf_session *s, struct perf_event_ops *ops) { struct list_head *head = &s->ordered_samples.samples_head; - u64 limit = s->ordered_samples.flush_limit; + u64 limit = s->ordered_samples.next_flush; struct sample_queue *tmp, *iter; - if (!ops->ordered_samples) + if (!ops->ordered_samples || !limit) return; list_for_each_entry_safe(iter, tmp, head, list) { @@ -387,6 +402,55 @@ static void flush_sample_queue(struct perf_session *s, } } +/* + * When perf record finishes a pass on every buffers, it records this pseudo + * event. + * We record the max timestamp t found in the pass n. + * Assuming these timestamps are monotonic across cpus, we know that if + * a buffer still has events with timestamps below t, they will be all + * available and then read in the pass n + 1. + * Hence when we start to read the pass n + 2, we can safely flush every + * events with timestamps below t. + * + * ============ PASS n ================= + * CPU 0 | CPU 1 + * | + * cnt1 timestamps | cnt2 timestamps + * 1 | 2 + * 2 | 3 + * - | 4 <--- max recorded + * + * ============ PASS n + 1 ============== + * CPU 0 | CPU 1 + * | + * cnt1 timestamps | cnt2 timestamps + * 3 | 5 + * 4 | 6 + * 5 | 7 <---- max recorded + * + * Flush every events below timestamp 4 + * + * ============ PASS n + 2 ============== + * CPU 0 | CPU 1 + * | + * cnt1 timestamps | cnt2 timestamps + * 6 | 8 + * 7 | 9 + * - | 10 + * + * Flush every events below timestamp 7 + * etc... 
+ */ +static int process_finished_round(event_t *event __used, + struct perf_session *session, + struct perf_event_ops *ops) +{ + flush_sample_queue(session, ops); + session->ordered_samples.next_flush = session->ordered_samples.max_timestamp; + + return 0; +} + static void __queue_sample_end(struct sample_queue *new, struct list_head *head) { struct sample_queue *iter; @@ -455,17 +519,12 @@ static void __queue_sample_event(struct sample_queue *new, } static int queue_sample_event(event_t *event, struct sample_data *data, - struct perf_session *s, - struct perf_event_ops *ops) + struct perf_session *s) { u64 timestamp = data->time; struct sample_queue *new; - u64 flush_limit; - if (s->ordered_samples.flush_limit == ULLONG_MAX) - s->ordered_samples.flush_limit = timestamp + FLUSH_PERIOD; - if (timestamp < s->ordered_samples.last_flush) { printf("Warning: Timestamp below last timeslice flush\n"); return -EINVAL; @@ -488,23 +547,8 @@ static int queue_sample_event(event_t *event, struct sample_data *data, __queue_sample_event(new, s); s->ordered_samples.last_inserted = new; - /* - * We want to have a slice of events covering 2 * FLUSH_PERIOD - * If FLUSH_PERIOD is big enough, it ensures every events that occured - * in the first half of the timeslice have all been buffered and there - * are none remaining (we need that because of the weakly ordered - * event recording we have). Then once we reach the 2 * FLUSH_PERIOD - * timeslice, we flush the first half to be gentle with the memory - * (the second half can still get new events in the middle, so wait - * another period to flush it) - */ - flush_limit = s->ordered_samples.flush_limit; - - if (new->timestamp > flush_limit && - new->timestamp - flush_limit > FLUSH_PERIOD) { - s->ordered_samples.flush_limit += FLUSH_PERIOD; - flush_sample_queue(s, ops); - } + if (new->timestamp > s->ordered_samples.max_timestamp) + s->ordered_samples.max_timestamp = new->timestamp; return 0; } @@ -520,7 +564,7 @@ static int perf_session__process_sample(event_t *event, struct perf_session *s, bzero(&data, sizeof(struct sample_data)); event__parse_sample(event, s->sample_type, &data); - queue_sample_event(event, &data, s, ops); + queue_sample_event(event, &data, s); return 0; } @@ -572,6 +616,8 @@ static int perf_session__process_event(struct perf_session *self, return ops->tracing_data(event, self); case PERF_RECORD_HEADER_BUILD_ID: return ops->build_id(event, self); + case PERF_RECORD_FINISHED_ROUND: + return ops->finished_round(event, self, ops); default: self->unknown_events++; return -1; @@ -786,7 +832,7 @@ int __perf_session__process_events(struct perf_session *self, done: err = 0; /* do the final flush for ordered samples */ - self->ordered_samples.flush_limit = ULLONG_MAX; + self->ordered_samples.next_flush = ULLONG_MAX; flush_sample_queue(self, ops); out_err: ui_progress__delete(progress); diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index f2b2c6a3a49d..568fd08a478d 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -14,7 +14,8 @@ struct thread; struct ordered_samples { u64 last_flush; - u64 flush_limit; + u64 next_flush; + u64 max_timestamp; struct list_head samples_head; struct sample_queue *last_inserted; }; @@ -41,23 +42,28 @@ struct perf_session { char filename[0]; }; +struct perf_event_ops; + typedef int (*event_op)(event_t *self, struct perf_session *session); +typedef int (*event_op2)(event_t *self, struct perf_session *session, + struct perf_event_ops *ops); struct perf_event_ops { - event_op sample, - 
mmap, - comm, - fork, - exit, - lost, - read, - throttle, - unthrottle, - attr, - event_type, - tracing_data, - build_id; - bool ordered_samples; + event_op sample, + mmap, + comm, + fork, + exit, + lost, + read, + throttle, + unthrottle, + attr, + event_type, + tracing_data, + build_id; + event_op2 finished_round; + bool ordered_samples; }; struct perf_session *perf_session__new(const char *filename, int mode, bool force, bool repipe); From 26242d859c9be9eea61f7f19514e9d272ae8ce26 Mon Sep 17 00:00:00 2001 From: Hitoshi Mitake Date: Mon, 3 May 2010 14:12:00 +0900 Subject: [PATCH 03/12] perf lock: Add "info" subcommand for dumping misc information This adds the "info" subcommand to perf lock which can be used to dump metadata like threads or addresses of lock instances. "map" was removed because info should do the work for it. This will be useful not only for debugging but also for ordinary analyzing. v2: adding example of usage % sudo ./perf lock info -t | Thread ID: comm | 0: swapper | 1: init | 18: migration/5 | 29: events/2 | 32: events/5 | 33: events/6 ... % sudo ./perf lock info -m | Address of instance: name of class | 0xffff8800b95adae0: &(&sighand->siglock)->rlock | 0xffff8800bbb41ae0: &(&sighand->siglock)->rlock | 0xffff8800bf165ae0: &(&sighand->siglock)->rlock | 0xffff8800b9576a98: &p->cred_guard_mutex | 0xffff8800bb890a08: &(&p->alloc_lock)->rlock | 0xffff8800b9522a08: &(&p->alloc_lock)->rlock | 0xffff8800bb8aaa08: &(&p->alloc_lock)->rlock | 0xffff8800bba72a08: &(&p->alloc_lock)->rlock | 0xffff8800bf18ea08: &(&p->alloc_lock)->rlock | 0xffff8800b8a0d8a0: &(&ip->i_lock)->mr_lock | 0xffff88009bf818a0: &(&ip->i_lock)->mr_lock | 0xffff88004c66b8a0: &(&ip->i_lock)->mr_lock | 0xffff8800bb6478a0: &(shost->host_lock)->rlock v3: fixed some problems Frederic pointed out * better rbtree tracking in dump_threads() * removed printf() and used pr_info() and pr_debug() Signed-off-by: Hitoshi Mitake Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Jason Baron Cc: Xiao Guangrong LKML-Reference: <1272863520-16179-1-git-send-email-mitake@dcl.info.waseda.ac.jp> Signed-off-by: Frederic Weisbecker --- tools/perf/builtin-lock.c | 96 +++++++++++++++++++++++++++++---------- 1 file changed, 73 insertions(+), 23 deletions(-) diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 6605000ed73d..c4eb854ff7eb 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -720,15 +720,15 @@ static void print_result(void) char cut_name[20]; int bad, total; - printf("%20s ", "Name"); - printf("%10s ", "acquired"); - printf("%10s ", "contended"); + pr_info("%20s ", "Name"); + pr_info("%10s ", "acquired"); + pr_info("%10s ", "contended"); - printf("%15s ", "total wait (ns)"); - printf("%15s ", "max wait (ns)"); - printf("%15s ", "min wait (ns)"); + pr_info("%15s ", "total wait (ns)"); + pr_info("%15s ", "max wait (ns)"); + pr_info("%15s ", "min wait (ns)"); - printf("\n\n"); + pr_info("\n\n"); bad = total = 0; while ((st = pop_from_result())) { @@ -741,7 +741,7 @@ static void print_result(void) if (strlen(st->name) < 16) { /* output raw name */ - printf("%20s ", st->name); + pr_info("%20s ", st->name); } else { strncpy(cut_name, st->name, 16); cut_name[16] = '.'; @@ -749,17 +749,17 @@ static void print_result(void) cut_name[18] = '.'; cut_name[19] = '\0'; /* cut off name for saving output style */ - printf("%20s ", cut_name); + pr_info("%20s ", cut_name); } - printf("%10u ", st->nr_acquired); - printf("%10u ", st->nr_contended); + 
pr_info("%10u ", st->nr_acquired); + pr_info("%10u ", st->nr_contended); - printf("%15llu ", st->wait_time_total); - printf("%15llu ", st->wait_time_max); - printf("%15llu ", st->wait_time_min == ULLONG_MAX ? + pr_info("%15llu ", st->wait_time_total); + pr_info("%15llu ", st->wait_time_max); + pr_info("%15llu ", st->wait_time_min == ULLONG_MAX ? 0 : st->wait_time_min); - printf("\n"); + pr_info("\n"); } { @@ -768,28 +768,59 @@ static void print_result(void) const char *name[4] = { "acquire", "acquired", "contended", "release" }; - printf("\n=== output for debug===\n\n"); - printf("bad:%d, total:%d\n", bad, total); - printf("bad rate:%f\n", (double)(bad / total)); + pr_debug("\n=== output for debug===\n\n"); + pr_debug("bad:%d, total:%d\n", bad, total); + pr_debug("bad rate:%f\n", (double)(bad / total)); - printf("histogram of events caused bad sequence\n"); + pr_debug("histogram of events caused bad sequence\n"); for (i = 0; i < 4; i++) - printf(" %10s: %d\n", name[i], bad_hist[i]); + pr_debug(" %10s: %d\n", name[i], bad_hist[i]); } } +static int info_threads; +static int info_map; + +static void dump_threads(void) +{ + struct thread_stat *st; + struct rb_node *node; + struct thread *t; + + pr_info("%10s: comm\n", "Thread ID"); + + node = rb_first(&thread_stats); + while (node) { + st = container_of(node, struct thread_stat, rb); + t = perf_session__findnew(session, st->tid); + pr_info("%10d: %s\n", st->tid, t->comm); + node = rb_next(node); + }; +} + static void dump_map(void) { unsigned int i; struct lock_stat *st; + pr_info("Address of instance: name of class\n"); for (i = 0; i < LOCKHASH_SIZE; i++) { list_for_each_entry(st, &lockhash_table[i], hash_entry) { - printf("%p: %s\n", st->addr, st->name); + pr_info(" %p: %s\n", st->addr, st->name); } } } +static void dump_info(void) +{ + if (info_threads) + dump_threads(); + else if (info_map) + dump_map(); + else + die("Unknown type of information\n"); +} + static int process_sample_event(event_t *self, struct perf_session *s) { struct sample_data data; @@ -858,6 +889,19 @@ static const struct option report_options[] = { OPT_END() }; +static const char * const info_usage[] = { + "perf lock info []", + NULL +}; + +static const struct option info_options[] = { + OPT_BOOLEAN('t', "threads", &info_threads, + "dump thread list in perf.data"), + OPT_BOOLEAN('m', "map", &info_map, + "map of lock instances (name:address table)"), + OPT_END() +}; + static const char * const lock_usage[] = { "perf lock [] {record|trace|report}", NULL @@ -929,12 +973,18 @@ int cmd_lock(int argc, const char **argv, const char *prefix __used) } else if (!strcmp(argv[0], "trace")) { /* Aliased to 'perf trace' */ return cmd_trace(argc, argv, prefix); - } else if (!strcmp(argv[0], "map")) { + } else if (!strcmp(argv[0], "info")) { + if (argc) { + argc = parse_options(argc, argv, + info_options, info_usage, 0); + if (argc) + usage_with_options(info_usage, info_options); + } /* recycling report_lock_ops */ trace_handler = &report_lock_ops; setup_pager(); read_events(); - dump_map(); + dump_info(); } else { usage_with_options(lock_usage, lock_options); } From 10350ec362b48f79f3df8447c25813790075e27c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 5 May 2010 23:47:28 +0200 Subject: [PATCH 04/12] perf: Cleanup perf lock broken states Use enum to get a human view of bad_hist indexes and put bad histogram output in its own function. 
Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Hitoshi Mitake --- tools/perf/builtin-lock.c | 49 +++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index c4eb854ff7eb..1e93179c2d30 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -387,7 +387,15 @@ static struct lock_seq_stat *get_seq(struct thread_stat *ts, void *addr) return seq; } -static int bad_hist[4]; +enum broken_state { + BROKEN_ACQUIRE, + BROKEN_ACQUIRED, + BROKEN_CONTENDED, + BROKEN_RELEASE, + BROKEN_MAX, +}; + +static int bad_hist[BROKEN_MAX]; static void report_lock_acquire_event(struct trace_acquire_event *acquire_event, @@ -437,7 +445,7 @@ report_lock_acquire_event(struct trace_acquire_event *acquire_event, broken: /* broken lock sequence, discard it */ ls->discard = 1; - bad_hist[0]++; + bad_hist[BROKEN_ACQUIRE]++; list_del(&seq->list); free(seq); goto end; @@ -481,7 +489,6 @@ report_lock_acquired_event(struct trace_acquired_event *acquired_event, case SEQ_STATE_CONTENDED: contended_term = timestamp - seq->prev_event_time; ls->wait_time_total += contended_term; - if (contended_term < ls->wait_time_min) ls->wait_time_min = contended_term; else if (ls->wait_time_max < contended_term) @@ -492,7 +499,7 @@ report_lock_acquired_event(struct trace_acquired_event *acquired_event, case SEQ_STATE_READ_ACQUIRED: /* broken lock sequence, discard it */ ls->discard = 1; - bad_hist[1]++; + bad_hist[BROKEN_ACQUIRED]++; list_del(&seq->list); free(seq); goto end; @@ -540,7 +547,7 @@ report_lock_contended_event(struct trace_contended_event *contended_event, case SEQ_STATE_CONTENDED: /* broken lock sequence, discard it */ ls->discard = 1; - bad_hist[2]++; + bad_hist[BROKEN_CONTENDED]++; list_del(&seq->list); free(seq); goto end; @@ -594,7 +601,7 @@ report_lock_release_event(struct trace_release_event *release_event, case SEQ_STATE_RELEASED: /* broken lock sequence, discard it */ ls->discard = 1; - bad_hist[3]++; + bad_hist[BROKEN_RELEASE]++; goto free_seq; break; default: @@ -713,6 +720,21 @@ process_raw_event(void *data, int cpu, u64 timestamp, struct thread *thread) process_lock_release_event(data, event, cpu, timestamp, thread); } +static void print_bad_events(int bad, int total) +{ + /* Output for debug, this have to be removed */ + int i; + const char *name[4] = + { "acquire", "acquired", "contended", "release" }; + + pr_info("\n=== output for debug===\n\n"); + pr_info("bad:%d, total:%d\n", bad, total); + pr_info("bad rate:%f\n", (double)(bad / total)); + pr_info("histogram of events caused bad sequence\n"); + for (i = 0; i < BROKEN_MAX; i++) + pr_info(" %10s: %d\n", name[i], bad_hist[i]); +} + /* TODO: various way to print, coloring, nano or milli sec */ static void print_result(void) { @@ -762,20 +784,7 @@ static void print_result(void) pr_info("\n"); } - { - /* Output for debug, this have to be removed */ - int i; - const char *name[4] = - { "acquire", "acquired", "contended", "release" }; - - pr_debug("\n=== output for debug===\n\n"); - pr_debug("bad:%d, total:%d\n", bad, total); - pr_debug("bad rate:%f\n", (double)(bad / total)); - - pr_debug("histogram of events caused bad sequence\n"); - for (i = 0; i < 4; i++) - pr_debug(" %10s: %d\n", name[i], bad_hist[i]); - } + print_bad_events(bad, total); } static int info_threads; From 84c7a21791eb2e962a27e19bab5b77d5d9e13a34 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 5 
May 2010 23:57:25 +0200 Subject: [PATCH 05/12] perf: Humanize lock flags in perf lock Use an enum instead of plain constants for lock flags. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Hitoshi Mitake --- tools/perf/builtin-lock.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 1e93179c2d30..3b304ed4d2e9 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -397,6 +397,11 @@ enum broken_state { static int bad_hist[BROKEN_MAX]; +enum acquire_flags { + TRY_LOCK = 1, + READ_LOCK = 2, +}; + static void report_lock_acquire_event(struct trace_acquire_event *acquire_event, struct event *__event __used, @@ -421,9 +426,9 @@ report_lock_acquire_event(struct trace_acquire_event *acquire_event, if (!acquire_event->flag) { seq->state = SEQ_STATE_ACQUIRING; } else { - if (acquire_event->flag & 1) + if (acquire_event->flag & TRY_LOCK) ls->nr_trylock++; - if (acquire_event->flag & 2) + if (acquire_event->flag & READ_LOCK) ls->nr_readlock++; seq->state = SEQ_STATE_READ_ACQUIRED; seq->read_count = 1; @@ -431,7 +436,7 @@ report_lock_acquire_event(struct trace_acquire_event *acquire_event, break; case SEQ_STATE_READ_ACQUIRED: - if (acquire_event->flag & 2) { + if (acquire_event->flag & READ_LOCK) { seq->read_count++; ls->nr_acquired++; goto end; From 5efe08cf685f33f562566dc68b6077b6f6a4f706 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 6 May 2010 04:55:22 +0200 Subject: [PATCH 06/12] perf: Fix perf lock bad rate Fix the cast used to compute the bad rate: it is applied to the result instead of the operands. We need the operands to be cast to double, otherwise the result will always be zero. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Hitoshi Mitake --- tools/perf/builtin-lock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 3b304ed4d2e9..d7dde9cbbdf4 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -733,8 +733,8 @@ static void print_bad_events(int bad, int total) { "acquire", "acquired", "contended", "release" }; pr_info("\n=== output for debug===\n\n"); - pr_info("bad:%d, total:%d\n", bad, total); - pr_info("bad rate:%f\n", (double)(bad / total)); + pr_info("bad: %d, total: %d\n", bad, total); + pr_info("bad rate: %f %%\n", (double)bad / (double)total * 100); pr_info("histogram of events caused bad sequence\n"); for (i = 0; i < BROKEN_MAX; i++) pr_info(" %10s: %d\n", name[i], bad_hist[i]); From 90c0e5fc7b73d2575c7367e1da70ff9521718e5e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 May 2010 02:33:42 +0200 Subject: [PATCH 07/12] perf lock: Always check min AND max wait time When a lock is acquired after being contended, we update the wait time statistics for the given lock. But if the min wait time is updated, we don't check the max wait time. This is wrong because the first time we update the wait time, we want to update both min and max wait time.
Before: Name acquired contended total wait (ns) max wait (ns) min wait (ns) key 8 1 21656 0 21656 After: Name acquired contended total wait (ns) max wait (ns) min wait (ns) key 8 1 21656 21656 21656 Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Hitoshi Mitake --- tools/perf/builtin-lock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index d7dde9cbbdf4..e549f4547b98 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -496,7 +496,7 @@ report_lock_acquired_event(struct trace_acquired_event *acquired_event, ls->wait_time_total += contended_term; if (contended_term < ls->wait_time_min) ls->wait_time_min = contended_term; - else if (ls->wait_time_max < contended_term) + if (ls->wait_time_max < contended_term) ls->wait_time_max = contended_term; break; case SEQ_STATE_RELEASED: From 883a2a3189dae9d2912c417e47152f51cb922a3f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 May 2010 06:16:11 +0200 Subject: [PATCH 08/12] tracing: Drop lock_acquired waittime field Drop the waittime field from the lock_acquired event: we can calculate it by subtracting the matching lock_acquire event timestamp from the lock_acquired one. It is not needed and wastes space in the traces. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Hitoshi Mitake Cc: Steven Rostedt --- include/trace/events/lock.h | 11 ++++------- kernel/lockdep.c | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h index 5c1dcfc16c60..17ca287ae176 100644 --- a/include/trace/events/lock.h +++ b/include/trace/events/lock.h @@ -78,24 +78,21 @@ TRACE_EVENT(lock_contended, ); TRACE_EVENT(lock_acquired, - TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime), + TP_PROTO(struct lockdep_map *lock, unsigned long ip), - TP_ARGS(lock, ip, waittime), + TP_ARGS(lock, ip), TP_STRUCT__entry( __string(name, lock->name) - __field(s64, wait_nsec) __field(void *, lockdep_addr) ), TP_fast_assign( __assign_str(name, lock->name); - __entry->wait_nsec = waittime; __entry->lockdep_addr = lock; ), - TP_printk("%p %s (%llu ns)", __entry->lockdep_addr, - __get_str(name), - __entry->wait_nsec) + TP_printk("%p %s", __entry->lockdep_addr, + __get_str(name)) ); #endif diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 2594e1ce41cb..31e22e742368 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3380,7 +3380,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) hlock->holdtime_stamp = now; } - trace_lock_acquired(lock, ip, waittime); + trace_lock_acquired(lock, ip); stats = get_lock_stats(hlock_class(hlock)); if (waittime) { From 93135439459920c4d856f4ab8f068c030085c8df Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 May 2010 06:24:25 +0200 Subject: [PATCH 09/12] tracing: Drop the nested field from lock_release event Drop the nested field as we don't use it. Every nested state can already be computed by a state machine during post-processing.
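As a rough sketch of that claim (editor's illustration, not code from this series; the struct and helpers are hypothetical), a post-processing tool can rebuild the nesting depth of a lock per thread from the remaining acquire/release events alone:

/* Hypothetical post-processing state: one entry per (thread, lock address). */
struct lock_seq {
	void *addr;	/* lockdep_addr carried by the lock events */
	int depth;	/* current acquire depth for this thread */
};

void on_lock_acquire(struct lock_seq *seq)
{
	seq->depth++;	/* 0 -> 1 is the outermost acquire */
}

/* Returns non-zero when the lock is still held after this release,
 * i.e. the release closed a nested acquire. */
int on_lock_release(struct lock_seq *seq)
{
	if (seq->depth > 0)
		seq->depth--;
	return seq->depth > 0;
}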
Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Hitoshi Mitake Cc: Steven Rostedt --- include/trace/events/lock.h | 4 ++-- kernel/lockdep.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h index 17ca287ae176..fde4c3853391 100644 --- a/include/trace/events/lock.h +++ b/include/trace/events/lock.h @@ -37,9 +37,9 @@ TRACE_EVENT(lock_acquire, TRACE_EVENT(lock_release, - TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip), + TP_PROTO(struct lockdep_map *lock, unsigned long ip), - TP_ARGS(lock, nested, ip), + TP_ARGS(lock, ip), TP_STRUCT__entry( __string(name, lock->name) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 31e22e742368..e9c759f06c1d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3227,7 +3227,7 @@ void lock_release(struct lockdep_map *lock, int nested, raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; - trace_lock_release(lock, nested, ip); + trace_lock_release(lock, ip); __lock_release(lock, nested, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); From 2c193c736803ceb547daec725e5c5d992d039f20 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 May 2010 06:36:02 +0200 Subject: [PATCH 10/12] tracing: Factorize lock events in a lock class lock_acquired, lock_contended and lock_release now share the same prototype and format. Let's factorize them into a lock event class. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Hitoshi Mitake Cc: Steven Rostedt --- include/trace/events/lock.h | 50 ++++++++++++------------------------- 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h index fde4c3853391..2821b86de63b 100644 --- a/include/trace/events/lock.h +++ b/include/trace/events/lock.h @@ -35,15 +35,15 @@ TRACE_EVENT(lock_acquire, __get_str(name)) ); -TRACE_EVENT(lock_release, +DECLARE_EVENT_CLASS(lock, TP_PROTO(struct lockdep_map *lock, unsigned long ip), TP_ARGS(lock, ip), TP_STRUCT__entry( - __string(name, lock->name) - __field(void *, lockdep_addr) + __string( name, lock->name ) + __field( void *, lockdep_addr ) ), TP_fast_assign( @@ -51,48 +51,30 @@ TRACE_EVENT(lock_release, __entry->lockdep_addr = lock; ), - TP_printk("%p %s", - __entry->lockdep_addr, __get_str(name)) + TP_printk("%p %s", __entry->lockdep_addr, __get_str(name)) +); + +DEFINE_EVENT(lock, lock_release, + + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + + TP_ARGS(lock, ip) ); #ifdef CONFIG_LOCK_STAT -TRACE_EVENT(lock_contended, +DEFINE_EVENT(lock, lock_contended, TP_PROTO(struct lockdep_map *lock, unsigned long ip), - TP_ARGS(lock, ip), - - TP_STRUCT__entry( - __string(name, lock->name) - __field(void *, lockdep_addr) - ), - - TP_fast_assign( - __assign_str(name, lock->name); - __entry->lockdep_addr = lock; - ), - - TP_printk("%p %s", - __entry->lockdep_addr, __get_str(name)) + TP_ARGS(lock, ip) ); -TRACE_EVENT(lock_acquired, +DEFINE_EVENT(lock, lock_acquired, + TP_PROTO(struct lockdep_map *lock, unsigned long ip), - TP_ARGS(lock, ip), - - TP_STRUCT__entry( - __string(name, lock->name) - __field(void *, lockdep_addr) - ), - - TP_fast_assign( - __assign_str(name, lock->name); - __entry->lockdep_addr = lock; - ), - TP_printk("%p %s", __entry->lockdep_addr, - __get_str(name)) + TP_ARGS(lock, ip) ); #endif From 794e43b56c18b95fc9776c914a2659e7d558a352 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 5 May 2010 00:27:40 -0500 
Subject: [PATCH 11/12] perf/live-mode: Handle payload-less events Some events, such as the PERF_RECORD_FINISHED_ROUND event consist of only an event header and no data. In this case, a 0-length payload will be read, and the 0 return value will be wrongly interpreted as an 'unexpected end of event stream'. This patch allows for proper handling of data-less events by skipping 0-length reads. Signed-off-by: Tom Zanussi Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Masami Hiramatsu LKML-Reference: <1273038527.6383.51.camel@tropicana> Signed-off-by: Frederic Weisbecker --- tools/perf/util/session.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 9401909fa283..00ab298bbb4b 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -696,15 +696,18 @@ static int __perf_session__process_pipe_events(struct perf_session *self, p = &event; p += sizeof(struct perf_event_header); - err = do_read(self->fd, p, size - sizeof(struct perf_event_header)); - if (err <= 0) { - if (err == 0) { - pr_err("unexpected end of event stream\n"); - goto done; - } + if (size - sizeof(struct perf_event_header)) { + err = do_read(self->fd, p, + size - sizeof(struct perf_event_header)); + if (err <= 0) { + if (err == 0) { + pr_err("unexpected end of event stream\n"); + goto done; + } - pr_err("failed to read event data\n"); - goto out_err; + pr_err("failed to read event data\n"); + goto out_err; + } } if (size == 0 || From 76ba7e846fcc89d9d4b25b89e303c9058de96d60 Mon Sep 17 00:00:00 2001 From: Hitoshi Mitake Date: Sat, 8 May 2010 17:10:29 +0900 Subject: [PATCH 12/12] perf lock: Drop "-a" option from cmd_record() default arguments set This patch drops "-a" from the default arguments passed to perf record by perf lock. If a user wants to do a system wide record of lock events, perf lock record -a ... is enough for this purpose. This can reduce the size of the perf.data file. % sudo ./perf lock record whoami root [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.439 MB perf.data (~19170 samples) ] % sudo ./perf lock record -a whoami # with -a option root [ perf record: Woken up 0 times to write data ] [ perf record: Captured and wrote 48.962 MB perf.data (~2139197 samples) ] Signed-off-by: Hitoshi Mitake Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Jason Baron Cc: Xiao Guangrong LKML-Reference: Message-Id: <1273306229-5216-1-git-send-email-mitake@dcl.info.waseda.ac.jp> Signed-off-by: Frederic Weisbecker --- tools/perf/builtin-lock.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index e549f4547b98..e18dfdc2948a 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -930,7 +930,6 @@ static const struct option lock_options[] = { static const char *record_args[] = { "record", - "-a", "-R", "-f", "-m", "1024",