2019-05-29 22:12:25 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2017-10-06 21:31:47 +08:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2011-2017, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
|
|
|
|
*
|
|
|
|
* Parts came from evlist.c builtin-{top,stat,record}.c, see those files for further
|
|
|
|
* copyright notes.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <sys/mman.h>
|
2017-10-06 21:46:01 +08:00
|
|
|
#include <inttypes.h>
|
|
|
|
#include <asm/bug.h>
|
2019-07-04 22:32:27 +08:00
|
|
|
#include <linux/zalloc.h>
|
2019-01-23 01:48:54 +08:00
|
|
|
#ifdef HAVE_LIBNUMA_SUPPORT
|
|
|
|
#include <numaif.h>
|
|
|
|
#endif
|
2017-10-06 21:46:01 +08:00
|
|
|
#include "debug.h"
|
2017-10-06 21:31:47 +08:00
|
|
|
#include "event.h"
|
|
|
|
#include "mmap.h"
|
|
|
|
#include "util.h" /* page_size */
|
|
|
|
|
|
|
|
size_t perf_mmap__mmap_len(struct perf_mmap *map)
|
|
|
|
{
|
|
|
|
return map->mask + 1 + page_size;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* When check_messup is true, 'end' must points to a good entry */
|
2017-12-03 10:00:41 +08:00
|
|
|
static union perf_event *perf_mmap__read(struct perf_mmap *map,
|
2018-01-19 05:26:21 +08:00
|
|
|
u64 *startp, u64 end)
|
2017-10-06 21:31:47 +08:00
|
|
|
{
|
|
|
|
unsigned char *data = map->base + page_size;
|
|
|
|
union perf_event *event = NULL;
|
2018-01-19 05:26:21 +08:00
|
|
|
int diff = end - *startp;
|
2017-10-06 21:31:47 +08:00
|
|
|
|
|
|
|
if (diff >= (int)sizeof(event->header)) {
|
|
|
|
size_t size;
|
|
|
|
|
2018-01-19 05:26:21 +08:00
|
|
|
event = (union perf_event *)&data[*startp & map->mask];
|
2017-10-06 21:31:47 +08:00
|
|
|
size = event->header.size;
|
|
|
|
|
2018-01-19 05:26:21 +08:00
|
|
|
if (size < sizeof(event->header) || diff < (int)size)
|
|
|
|
return NULL;
|
2017-10-06 21:31:47 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Event straddles the mmap boundary -- header should always
|
|
|
|
* be inside due to u64 alignment of output.
|
|
|
|
*/
|
2018-01-19 05:26:21 +08:00
|
|
|
if ((*startp & map->mask) + size != ((*startp + size) & map->mask)) {
|
|
|
|
unsigned int offset = *startp;
|
2017-10-06 21:31:47 +08:00
|
|
|
unsigned int len = min(sizeof(*event), size), cpy;
|
|
|
|
void *dst = map->event_copy;
|
|
|
|
|
|
|
|
do {
|
|
|
|
cpy = min(map->mask + 1 - (offset & map->mask), len);
|
|
|
|
memcpy(dst, &data[offset & map->mask], cpy);
|
|
|
|
offset += cpy;
|
|
|
|
dst += cpy;
|
|
|
|
len -= cpy;
|
|
|
|
} while (len);
|
|
|
|
|
|
|
|
event = (union perf_event *)map->event_copy;
|
|
|
|
}
|
|
|
|
|
2018-01-19 05:26:21 +08:00
|
|
|
*startp += size;
|
2017-10-06 21:31:47 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return event;
|
|
|
|
}
|
|
|
|
|
2018-01-19 05:26:23 +08:00
|
|
|
/*
|
|
|
|
* Read event from ring buffer one by one.
|
|
|
|
* Return one event for each call.
|
|
|
|
*
|
|
|
|
* Usage:
|
|
|
|
* perf_mmap__read_init()
|
|
|
|
* while(event = perf_mmap__read_event()) {
|
|
|
|
* //process the event
|
|
|
|
* perf_mmap__consume()
|
|
|
|
* }
|
|
|
|
* perf_mmap__read_done()
|
|
|
|
*/
|
2018-03-06 23:36:06 +08:00
|
|
|
union perf_event *perf_mmap__read_event(struct perf_mmap *map)
|
2018-01-19 05:26:23 +08:00
|
|
|
{
|
|
|
|
union perf_event *event;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check if event was unmapped due to a POLLHUP/POLLERR.
|
|
|
|
*/
|
|
|
|
if (!refcount_read(&map->refcnt))
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
/* non-overwirte doesn't pause the ringbuffer */
|
2018-03-06 23:36:03 +08:00
|
|
|
if (!map->overwrite)
|
|
|
|
map->end = perf_mmap__read_head(map);
|
2018-01-19 05:26:23 +08:00
|
|
|
|
2018-03-06 23:36:03 +08:00
|
|
|
event = perf_mmap__read(map, &map->start, map->end);
|
2018-01-19 05:26:23 +08:00
|
|
|
|
2018-03-06 23:36:03 +08:00
|
|
|
if (!map->overwrite)
|
|
|
|
map->prev = map->start;
|
2018-01-19 05:26:23 +08:00
|
|
|
|
|
|
|
return event;
|
|
|
|
}
|
|
|
|
|
2017-10-06 21:31:47 +08:00
|
|
|
static bool perf_mmap__empty(struct perf_mmap *map)
|
|
|
|
{
|
|
|
|
return perf_mmap__read_head(map) == map->prev && !map->auxtrace_mmap.base;
|
|
|
|
}
|
|
|
|
|
|
|
|
void perf_mmap__get(struct perf_mmap *map)
|
|
|
|
{
|
|
|
|
refcount_inc(&map->refcnt);
|
|
|
|
}
|
|
|
|
|
|
|
|
void perf_mmap__put(struct perf_mmap *map)
|
|
|
|
{
|
|
|
|
BUG_ON(map->base && refcount_read(&map->refcnt) == 0);
|
|
|
|
|
|
|
|
if (refcount_dec_and_test(&map->refcnt))
|
|
|
|
perf_mmap__munmap(map);
|
|
|
|
}
|
|
|
|
|
2018-03-06 23:36:05 +08:00
|
|
|
void perf_mmap__consume(struct perf_mmap *map)
|
2017-10-06 21:31:47 +08:00
|
|
|
{
|
2018-03-06 23:36:04 +08:00
|
|
|
if (!map->overwrite) {
|
2017-10-06 21:31:47 +08:00
|
|
|
u64 old = map->prev;
|
|
|
|
|
|
|
|
perf_mmap__write_tail(map, old);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (refcount_read(&map->refcnt) == 1 && perf_mmap__empty(map))
|
|
|
|
perf_mmap__put(map);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Weak default: ignores all of its arguments and reports success (0).
 * Being __weak, it can be replaced by a non-weak definition at link time.
 */
int __weak auxtrace_mmap__mmap(struct auxtrace_mmap *mm __maybe_unused,
			       struct auxtrace_mmap_params *mp __maybe_unused,
			       void *userpg __maybe_unused,
			       int fd __maybe_unused)
{
	return 0;
}
|
|
|
|
|
|
|
|
/* Weak default: nothing to unmap, so this does nothing. */
void __weak auxtrace_mmap__munmap(struct auxtrace_mmap *mm __maybe_unused)
{
}
|
|
|
|
|
|
|
|
/* Weak default: leaves @mp untouched and ignores every argument. */
void __weak auxtrace_mmap_params__init(struct auxtrace_mmap_params *mp __maybe_unused,
				       off_t auxtrace_offset __maybe_unused,
				       unsigned int auxtrace_pages __maybe_unused,
				       bool auxtrace_overwrite __maybe_unused)
{
}
|
|
|
|
|
|
|
|
/* Weak default: leaves @mp untouched and ignores every argument. */
void __weak auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp __maybe_unused,
					  struct perf_evlist *evlist __maybe_unused,
					  int idx __maybe_unused,
					  bool per_cpu __maybe_unused)
{
}
|
|
|
|
|
2018-11-06 17:03:35 +08:00
|
|
|
#ifdef HAVE_AIO_SUPPORT
|
2019-03-19 01:42:19 +08:00
|
|
|
static int perf_mmap__aio_enabled(struct perf_mmap *map)
|
|
|
|
{
|
|
|
|
return map->aio.nr_cblocks > 0;
|
|
|
|
}
|
2019-01-23 01:48:54 +08:00
|
|
|
|
|
|
|
#ifdef HAVE_LIBNUMA_SUPPORT
|
|
|
|
static int perf_mmap__aio_alloc(struct perf_mmap *map, int idx)
|
|
|
|
{
|
|
|
|
map->aio.data[idx] = mmap(NULL, perf_mmap__mmap_len(map), PROT_READ|PROT_WRITE,
|
|
|
|
MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
|
|
|
|
if (map->aio.data[idx] == MAP_FAILED) {
|
|
|
|
map->aio.data[idx] = NULL;
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void perf_mmap__aio_free(struct perf_mmap *map, int idx)
|
|
|
|
{
|
|
|
|
if (map->aio.data[idx]) {
|
|
|
|
munmap(map->aio.data[idx], perf_mmap__mmap_len(map));
|
|
|
|
map->aio.data[idx] = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static int perf_mmap__aio_bind(struct perf_mmap *map, int idx, int cpu, int affinity)
|
|
|
|
{
|
|
|
|
void *data;
|
|
|
|
size_t mmap_len;
|
|
|
|
unsigned long node_mask;
|
|
|
|
|
|
|
|
if (affinity != PERF_AFFINITY_SYS && cpu__max_node() > 1) {
|
|
|
|
data = map->aio.data[idx];
|
|
|
|
mmap_len = perf_mmap__mmap_len(map);
|
|
|
|
node_mask = 1UL << cpu__get_node(cpu);
|
|
|
|
if (mbind(data, mmap_len, MPOL_BIND, &node_mask, 1, 0)) {
|
|
|
|
pr_err("Failed to bind [%p-%p] AIO buffer to node %d: error %m\n",
|
|
|
|
data, data + mmap_len, cpu__get_node(cpu));
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2019-03-19 01:42:19 +08:00
|
|
|
#else /* !HAVE_LIBNUMA_SUPPORT */
|
2019-01-23 01:48:54 +08:00
|
|
|
static int perf_mmap__aio_alloc(struct perf_mmap *map, int idx)
|
|
|
|
{
|
|
|
|
map->aio.data[idx] = malloc(perf_mmap__mmap_len(map));
|
|
|
|
if (map->aio.data[idx] == NULL)
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void perf_mmap__aio_free(struct perf_mmap *map, int idx)
|
|
|
|
{
|
|
|
|
zfree(&(map->aio.data[idx]));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Without HAVE_LIBNUMA_SUPPORT there is no node binding to perform:
 * every argument is ignored and success (0) is reported.
 */
static int perf_mmap__aio_bind(struct perf_mmap *map __maybe_unused,
			       int idx __maybe_unused,
			       int cpu __maybe_unused,
			       int affinity __maybe_unused)
{
	return 0;
}
|
|
|
|
#endif
|
|
|
|
|
2018-11-06 17:03:35 +08:00
|
|
|
static int perf_mmap__aio_mmap(struct perf_mmap *map, struct mmap_params *mp)
|
|
|
|
{
|
2019-01-23 01:48:54 +08:00
|
|
|
int delta_max, i, prio, ret;
|
2018-11-06 17:03:35 +08:00
|
|
|
|
2018-11-06 17:04:58 +08:00
|
|
|
map->aio.nr_cblocks = mp->nr_cblocks;
|
|
|
|
if (map->aio.nr_cblocks) {
|
2018-11-06 17:07:19 +08:00
|
|
|
map->aio.aiocb = calloc(map->aio.nr_cblocks, sizeof(struct aiocb *));
|
|
|
|
if (!map->aio.aiocb) {
|
|
|
|
pr_debug2("failed to allocate aiocb for data buffer, error %m\n");
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
map->aio.cblocks = calloc(map->aio.nr_cblocks, sizeof(struct aiocb));
|
|
|
|
if (!map->aio.cblocks) {
|
|
|
|
pr_debug2("failed to allocate cblocks for data buffer, error %m\n");
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
map->aio.data = calloc(map->aio.nr_cblocks, sizeof(void *));
|
2018-11-06 17:03:35 +08:00
|
|
|
if (!map->aio.data) {
|
|
|
|
pr_debug2("failed to allocate data buffer, error %m\n");
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
delta_max = sysconf(_SC_AIO_PRIO_DELTA_MAX);
|
2018-11-06 17:07:19 +08:00
|
|
|
for (i = 0; i < map->aio.nr_cblocks; ++i) {
|
2019-01-23 01:48:54 +08:00
|
|
|
ret = perf_mmap__aio_alloc(map, i);
|
|
|
|
if (ret == -1) {
|
2018-11-06 17:07:19 +08:00
|
|
|
pr_debug2("failed to allocate data buffer area, error %m");
|
|
|
|
return -1;
|
|
|
|
}
|
2019-01-23 01:48:54 +08:00
|
|
|
ret = perf_mmap__aio_bind(map, i, map->cpu, mp->affinity);
|
|
|
|
if (ret == -1)
|
|
|
|
return -1;
|
2018-11-06 17:07:19 +08:00
|
|
|
/*
|
|
|
|
* Use cblock.aio_fildes value different from -1
|
|
|
|
* to denote started aio write operation on the
|
|
|
|
* cblock so it requires explicit record__aio_sync()
|
|
|
|
* call prior the cblock may be reused again.
|
|
|
|
*/
|
|
|
|
map->aio.cblocks[i].aio_fildes = -1;
|
|
|
|
/*
|
|
|
|
* Allocate cblocks with priority delta to have
|
|
|
|
* faster aio write system calls because queued requests
|
|
|
|
* are kept in separate per-prio queues and adding
|
|
|
|
* a new request will iterate thru shorter per-prio
|
|
|
|
* list. Blocks with numbers higher than
|
|
|
|
* _SC_AIO_PRIO_DELTA_MAX go with priority 0.
|
|
|
|
*/
|
|
|
|
prio = delta_max - i;
|
|
|
|
map->aio.cblocks[i].aio_reqprio = prio >= 0 ? prio : 0;
|
|
|
|
}
|
2018-11-06 17:03:35 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void perf_mmap__aio_munmap(struct perf_mmap *map)
|
|
|
|
{
|
2018-12-06 01:19:41 +08:00
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < map->aio.nr_cblocks; ++i)
|
2019-01-23 01:48:54 +08:00
|
|
|
perf_mmap__aio_free(map, i);
|
2018-11-06 17:03:35 +08:00
|
|
|
if (map->aio.data)
|
|
|
|
zfree(&map->aio.data);
|
2018-12-06 01:19:41 +08:00
|
|
|
zfree(&map->aio.cblocks);
|
|
|
|
zfree(&map->aio.aiocb);
|
2018-11-06 17:03:35 +08:00
|
|
|
}
|
2019-03-19 01:42:19 +08:00
|
|
|
#else /* !HAVE_AIO_SUPPORT */
|
|
|
|
/* Without HAVE_AIO_SUPPORT, AIO is never enabled: always returns 0. */
static int perf_mmap__aio_enabled(struct perf_mmap *map __maybe_unused)
{
	return 0;
}
|
|
|
|
|
2018-11-06 17:03:35 +08:00
|
|
|
/*
 * Without HAVE_AIO_SUPPORT there is no AIO state to set up: both
 * arguments are ignored and success (0) is reported.
 */
static int perf_mmap__aio_mmap(struct perf_mmap *map __maybe_unused,
			       struct mmap_params *mp __maybe_unused)
{
	return 0;
}
|
|
|
|
|
|
|
|
/* Without HAVE_AIO_SUPPORT there is no AIO state to tear down. */
static void perf_mmap__aio_munmap(struct perf_mmap *map __maybe_unused)
{
}
|
|
|
|
#endif
|
|
|
|
|
2017-10-06 21:31:47 +08:00
|
|
|
void perf_mmap__munmap(struct perf_mmap *map)
|
|
|
|
{
|
2018-11-06 17:03:35 +08:00
|
|
|
perf_mmap__aio_munmap(map);
|
2019-03-19 01:42:19 +08:00
|
|
|
if (map->data != NULL) {
|
|
|
|
munmap(map->data, perf_mmap__mmap_len(map));
|
|
|
|
map->data = NULL;
|
|
|
|
}
|
2017-10-06 21:31:47 +08:00
|
|
|
if (map->base != NULL) {
|
|
|
|
munmap(map->base, perf_mmap__mmap_len(map));
|
|
|
|
map->base = NULL;
|
|
|
|
map->fd = -1;
|
|
|
|
refcount_set(&map->refcnt, 0);
|
|
|
|
}
|
|
|
|
auxtrace_mmap__munmap(&map->auxtrace_mmap);
|
|
|
|
}
|
|
|
|
|
2019-01-23 01:50:57 +08:00
|
|
|
static void build_node_mask(int node, cpu_set_t *mask)
|
|
|
|
{
|
|
|
|
int c, cpu, nr_cpus;
|
2019-07-21 19:23:49 +08:00
|
|
|
const struct perf_cpu_map *cpu_map = NULL;
|
2019-01-23 01:50:57 +08:00
|
|
|
|
|
|
|
cpu_map = cpu_map__online();
|
|
|
|
if (!cpu_map)
|
|
|
|
return;
|
|
|
|
|
|
|
|
nr_cpus = cpu_map__nr(cpu_map);
|
|
|
|
for (c = 0; c < nr_cpus; c++) {
|
|
|
|
cpu = cpu_map->map[c]; /* map c index to online cpu index */
|
|
|
|
if (cpu__get_node(cpu) == node)
|
|
|
|
CPU_SET(cpu, mask);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void perf_mmap__setup_affinity_mask(struct perf_mmap *map, struct mmap_params *mp)
|
|
|
|
{
|
|
|
|
CPU_ZERO(&map->affinity_mask);
|
|
|
|
if (mp->affinity == PERF_AFFINITY_NODE && cpu__max_node() > 1)
|
|
|
|
build_node_mask(cpu__get_node(map->cpu), &map->affinity_mask);
|
|
|
|
else if (mp->affinity == PERF_AFFINITY_CPU)
|
|
|
|
CPU_SET(map->cpu, &map->affinity_mask);
|
|
|
|
}
|
|
|
|
|
2018-08-17 19:45:55 +08:00
|
|
|
int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd, int cpu)
|
2017-10-06 21:31:47 +08:00
|
|
|
{
|
|
|
|
/*
|
2018-03-02 07:09:11 +08:00
|
|
|
* The last one will be done at perf_mmap__consume(), so that we
|
2017-10-06 21:31:47 +08:00
|
|
|
* make sure we don't prevent tools from consuming every last event in
|
|
|
|
* the ring buffer.
|
|
|
|
*
|
|
|
|
* I.e. we can get the POLLHUP meaning that the fd doesn't exist
|
|
|
|
* anymore, but the last events for it are still in the ring buffer,
|
|
|
|
* waiting to be consumed.
|
|
|
|
*
|
|
|
|
* Tools can chose to ignore this at their own discretion, but the
|
|
|
|
* evlist layer can't just drop it when filtering events in
|
|
|
|
* perf_evlist__filter_pollfd().
|
|
|
|
*/
|
|
|
|
refcount_set(&map->refcnt, 2);
|
|
|
|
map->prev = 0;
|
|
|
|
map->mask = mp->mask;
|
|
|
|
map->base = mmap(NULL, perf_mmap__mmap_len(map), mp->prot,
|
|
|
|
MAP_SHARED, fd, 0);
|
|
|
|
if (map->base == MAP_FAILED) {
|
|
|
|
pr_debug2("failed to mmap perf event ring buffer, error %d\n",
|
|
|
|
errno);
|
|
|
|
map->base = NULL;
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
map->fd = fd;
|
2018-08-17 19:45:55 +08:00
|
|
|
map->cpu = cpu;
|
2017-10-06 21:31:47 +08:00
|
|
|
|
2019-01-23 01:50:57 +08:00
|
|
|
perf_mmap__setup_affinity_mask(map, mp);
|
2019-01-23 01:47:43 +08:00
|
|
|
|
perf record: Implement --mmap-flush=<number> option
Implement a --mmap-flush option that specifies minimal number of bytes
that is extracted from mmaped kernel buffer to store into a trace. The
default option value is 1 byte what means every time trace writing
thread finds some new data in the mmaped buffer the data is extracted,
possibly compressed and written to a trace.
$ tools/perf/perf record --mmap-flush 1024 -e cycles -- matrix.gcc
$ tools/perf/perf record --aio --mmap-flush 1K -e cycles -- matrix.gcc
The option is independent from -z setting, doesn't vary with compression
level and can serve two purposes.
The first purpose is to increase the compression ratio of a trace data.
Larger data chunks are compressed more effectively so the implemented
option allows specifying data chunk size to compress. Also at some cases
executing more write syscalls with smaller data size can take longer
than executing less write syscalls with bigger data size due to syscall
overhead so extracting bigger data chunks specified by the option value
could additionally decrease runtime overhead.
The second purpose is to avoid self monitoring live-lock issue in system
wide (-a) profiling mode. Profiling in system wide mode with compression
(-a -z) can additionally induce data into the kernel buffers along with
the data from monitored processes. If performance data rate and volume
from the monitored processes is high then trace streaming and
compression activity in the tool is also high. High tool process
activity can lead to subtle live-lock effect when compression of single
new byte from some of mmaped kernel buffer leads to generation of the
next single byte at some mmaped buffer. So perf tool process ends up in
endless self monitoring.
Implemented synch parameter is the mean to force data move independently
from the specified flush threshold value. Despite the provided flush
value the tool needs capability to unconditionally drain memory buffers,
at least in the end of the collection.
Committer testing:
Running with the default value, i.e. as soon as there is something to
read go on consuming, we first write the synthesized events, small
chunks of about 128 bytes:
# perf trace -m 2048 --call-graph dwarf -e write -- perf record
<SNIP>
101.142 ( 0.004 ms): perf/25821 write(fd: 3</root/perf.data>, buf: 0x210db60, count: 120) = 120
__libc_write (/usr/lib64/libpthread-2.28.so)
ion (/home/acme/bin/perf)
record__write (inlined)
process_synthesized_event (/home/acme/bin/perf)
perf_tool__process_synth_event (inlined)
perf_event__synthesize_mmap_events (/home/acme/bin/perf)
Then we move to reading the mmap buffers consuming the events put there
by the kernel perf infrastructure:
107.561 ( 0.005 ms): perf/25821 write(fd: 3</root/perf.data>, buf: 0x7f1befc02000, count: 336) = 336
__libc_write (/usr/lib64/libpthread-2.28.so)
ion (/home/acme/bin/perf)
record__write (inlined)
record__pushfn (/home/acme/bin/perf)
perf_mmap__push (/home/acme/bin/perf)
record__mmap_read_evlist (inlined)
record__mmap_read_all (inlined)
__cmd_record (inlined)
cmd_record (/home/acme/bin/perf)
12919.953 ( 0.136 ms): perf/25821 write(fd: 3</root/perf.data>, buf: 0x7f1befc83150, count: 184984) = 184984
<SNIP same backtrace as in the 107.561 timestamp>
12920.094 ( 0.155 ms): perf/25821 write(fd: 3</root/perf.data>, buf: 0x7f1befc02150, count: 261816) = 261816
<SNIP same backtrace as in the 107.561 timestamp>
12920.253 ( 0.093 ms): perf/25821 write(fd: 3</root/perf.data>, buf: 0x7f1befb81120, count: 170832) = 170832
<SNIP same backtrace as in the 107.561 timestamp>
If we limit it to write only when more than 16MB are available for
reading, it throttles that to a quarter of the --mmap-pages set for
'perf record', which by default get to 528384 bytes, found out using
'record -v':
mmap flush: 132096
mmap size 528384B
With that in place all the writes coming from
record__mmap_read_evlist(), i.e. from the mmap buffers setup by the
kernel perf infrastructure were at least 132096 bytes long.
Trying with a bigger mmap size:
perf trace -e write perf record -v -m 2048 --mmap-flush 16M
74982.928 ( 2.471 ms): perf/26500 write(fd: 3</root/perf.data>, buf: 0x7ff94a6cc000, count: 3580888) = 3580888
74985.406 ( 2.353 ms): perf/26500 write(fd: 3</root/perf.data>, buf: 0x7ff949ecb000, count: 3453256) = 3453256
74987.764 ( 2.629 ms): perf/26500 write(fd: 3</root/perf.data>, buf: 0x7ff9496ca000, count: 3859232) = 3859232
74990.399 ( 2.341 ms): perf/26500 write(fd: 3</root/perf.data>, buf: 0x7ff948ec9000, count: 3769032) = 3769032
74992.744 ( 2.064 ms): perf/26500 write(fd: 3</root/perf.data>, buf: 0x7ff9486c8000, count: 3310520) = 3310520
74994.814 ( 2.619 ms): perf/26500 write(fd: 3</root/perf.data>, buf: 0x7ff947ec7000, count: 4194688) = 4194688
74997.439 ( 2.787 ms): perf/26500 write(fd: 3</root/perf.data>, buf: 0x7ff9476c6000, count: 4029760) = 4029760
Was again limited to a quarter of the mmap size:
mmap flush: 2098176
mmap size 8392704B
A warning about that would be good to have but can be added later,
something like:
"max flush is a quarter of the mmap size, if wanting to bump the mmap
flush further, bump the mmap size as well using -m/--mmap-pages"
Also rename the 'sync' parameters to 'synch' to keep tools/perf building
with older glibcs:
cc1: warnings being treated as errors
builtin-record.c: In function 'record__mmap_read_evlist':
builtin-record.c:775: warning: declaration of 'sync' shadows a global declaration
/usr/include/unistd.h:933: warning: shadowed declaration is here
builtin-record.c: In function 'record__mmap_read_all':
builtin-record.c:856: warning: declaration of 'sync' shadows a global declaration
/usr/include/unistd.h:933: warning: shadowed declaration is here
Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
Reviewed-by: Jiri Olsa <jolsa@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/f6600d72-ecfa-2eb7-7e51-f6954547d500@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2019-03-19 01:40:26 +08:00
|
|
|
map->flush = mp->flush;
|
|
|
|
|
2019-03-19 01:42:19 +08:00
|
|
|
map->comp_level = mp->comp_level;
|
|
|
|
|
|
|
|
if (map->comp_level && !perf_mmap__aio_enabled(map)) {
|
|
|
|
map->data = mmap(NULL, perf_mmap__mmap_len(map), PROT_READ|PROT_WRITE,
|
|
|
|
MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
|
|
|
|
if (map->data == MAP_FAILED) {
|
|
|
|
pr_debug2("failed to mmap data buffer, error %d\n",
|
|
|
|
errno);
|
|
|
|
map->data = NULL;
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-10-06 21:31:47 +08:00
|
|
|
if (auxtrace_mmap__mmap(&map->auxtrace_mmap,
|
|
|
|
&mp->auxtrace_mp, map->base, fd))
|
|
|
|
return -1;
|
|
|
|
|
2018-11-06 17:03:35 +08:00
|
|
|
return perf_mmap__aio_mmap(map, mp);
|
2017-10-06 21:31:47 +08:00
|
|
|
}
|
2017-10-06 21:46:01 +08:00
|
|
|
|
2018-03-13 20:31:13 +08:00
|
|
|
/*
 * Walk the event headers stored in @buf (a ring of @mask + 1 bytes),
 * starting at *start, to find where the readable range ends; the result is
 * stored in *end.
 *
 * The walk stops either when it has covered at least the whole ring -- in
 * which case, if it overshot, the last header's size is backed out again --
 * or when it meets a header whose size is 0.  Both cases return 0.
 */
static int overwrite_rb_find_range(void *buf, int mask, u64 *start, u64 *end)
{
	const u64 begin = *start;
	const unsigned int capacity = (unsigned int)(mask + 1);
	struct perf_event_header *hdr;
	u64 pos = begin;

	pr_debug2("%s: buf=%p, start=%"PRIx64"\n", __func__, buf, *start);
	hdr = (struct perf_event_header *)(buf + (begin & mask));

	for (;;) {
		u64 walked = pos - begin;

		if (walked >= capacity) {
			pr_debug("Finished reading overwrite ring buffer: rewind\n");
			/* Went past one full lap: drop the last event again. */
			if (walked > capacity)
				pos -= hdr->size;
			*end = pos;
			return 0;
		}

		hdr = (struct perf_event_header *)(buf + (pos & mask));

		if (hdr->size == 0) {
			pr_debug("Finished reading overwrite ring buffer: get start\n");
			*end = pos;
			return 0;
		}

		pos += hdr->size;
		pr_debug3("move evt_head: %"PRIx64"\n", pos);
	}

	/* Not reachable: the loop above only ever leaves via return. */
	WARN_ONCE(1, "Shouldn't get here\n");
	return -1;
}
|
|
|
|
|
2018-01-19 05:26:19 +08:00
|
|
|
/*
|
|
|
|
* Report the start and end of the available data in ringbuffer
|
|
|
|
*/
|
2018-03-26 22:42:15 +08:00
|
|
|
static int __perf_mmap__read_init(struct perf_mmap *md)
|
2017-10-06 21:46:01 +08:00
|
|
|
{
|
|
|
|
u64 head = perf_mmap__read_head(md);
|
|
|
|
u64 old = md->prev;
|
|
|
|
unsigned char *data = md->base + page_size;
|
|
|
|
unsigned long size;
|
|
|
|
|
2018-03-06 23:36:01 +08:00
|
|
|
md->start = md->overwrite ? head : old;
|
|
|
|
md->end = md->overwrite ? old : head;
|
2017-10-06 21:46:01 +08:00
|
|
|
|
perf record: Implement --mmap-flush=<number> option
Implement a --mmap-flush option that specifies minimal number of bytes
that is extracted from mmaped kernel buffer to store into a trace. The
default option value is 1 byte what means every time trace writing
thread finds some new data in the mmaped buffer the data is extracted,
possibly compressed and written to a trace.
$ tools/perf/perf record --mmap-flush 1024 -e cycles -- matrix.gcc
$ tools/perf/perf record --aio --mmap-flush 1K -e cycles -- matrix.gcc
The option is independent from -z setting, doesn't vary with compression
level and can serve two purposes.
The first purpose is to increase the compression ratio of a trace data.
Larger data chunks are compressed more effectively so the implemented
option allows specifying data chunk size to compress. Also at some cases
executing more write syscalls with smaller data size can take longer
than executing less write syscalls with bigger data size due to syscall
overhead so extracting bigger data chunks specified by the option value
could additionally decrease runtime overhead.
The second purpose is to avoid self monitoring live-lock issue in system
wide (-a) profiling mode. Profiling in system wide mode with compression
(-a -z) can additionally induce data into the kernel buffers along with
the data from monitored processes. If performance data rate and volume
from the monitored processes is high then trace streaming and
compression activity in the tool is also high. High tool process
activity can lead to subtle live-lock effect when compression of single
new byte from some of mmaped kernel buffer leads to generation of the
next single byte at some mmaped buffer. So perf tool process ends up in
endless self monitoring.
Implemented synch parameter is the mean to force data move independently
from the specified flush threshold value. Despite the provided flush
value the tool needs capability to unconditionally drain memory buffers,
at least in the end of the collection.
Committer testing:
Running with the default value, i.e. as soon as there is something to
read go on consuming, we first write the synthesized events, small
chunks of about 128 bytes:
# perf trace -m 2048 --call-graph dwarf -e write -- perf record
<SNIP>
101.142 ( 0.004 ms): perf/25821 write(fd: 3</root/perf.data>, buf: 0x210db60, count: 120) = 120
__libc_write (/usr/lib64/libpthread-2.28.so)
ion (/home/acme/bin/perf)
record__write (inlined)
process_synthesized_event (/home/acme/bin/perf)
perf_tool__process_synth_event (inlined)
perf_event__synthesize_mmap_events (/home/acme/bin/perf)
Then we move to reading the mmap buffers consuming the events put there
by the kernel perf infrastructure:
107.561 ( 0.005 ms): perf/25821 write(fd: 3</root/perf.data>, buf: 0x7f1befc02000, count: 336) = 336
__libc_write (/usr/lib64/libpthread-2.28.so)
ion (/home/acme/bin/perf)
record__write (inlined)
record__pushfn (/home/acme/bin/perf)
perf_mmap__push (/home/acme/bin/perf)
record__mmap_read_evlist (inlined)
record__mmap_read_all (inlined)
__cmd_record (inlined)
cmd_record (/home/acme/bin/perf)
12919.953 ( 0.136 ms): perf/25821 write(fd: 3</root/perf.data>, buf: 0x7f1befc83150, count: 184984) = 184984
<SNIP same backtrace as in the 107.561 timestamp>
12920.094 ( 0.155 ms): perf/25821 write(fd: 3</root/perf.data>, buf: 0x7f1befc02150, count: 261816) = 261816
<SNIP same backtrace as in the 107.561 timestamp>
12920.253 ( 0.093 ms): perf/25821 write(fd: 3</root/perf.data>, buf: 0x7f1befb81120, count: 170832) = 170832
<SNIP same backtrace as in the 107.561 timestamp>
If we limit it to write only when more than 16MB are available for
reading, it throttles that to a quarter of the --mmap-pages set for
'perf record', which by default get to 528384 bytes, found out using
'record -v':
mmap flush: 132096
mmap size 528384B
With that in place all the writes coming from
record__mmap_read_evlist(), i.e. from the mmap buffers setup by the
kernel perf infrastructure were at least 132096 bytes long.
Trying with a bigger mmap size:
perf trace -e write perf record -v -m 2048 --mmap-flush 16M
74982.928 ( 2.471 ms): perf/26500 write(fd: 3</root/perf.data>, buf: 0x7ff94a6cc000, count: 3580888) = 3580888
74985.406 ( 2.353 ms): perf/26500 write(fd: 3</root/perf.data>, buf: 0x7ff949ecb000, count: 3453256) = 3453256
74987.764 ( 2.629 ms): perf/26500 write(fd: 3</root/perf.data>, buf: 0x7ff9496ca000, count: 3859232) = 3859232
74990.399 ( 2.341 ms): perf/26500 write(fd: 3</root/perf.data>, buf: 0x7ff948ec9000, count: 3769032) = 3769032
74992.744 ( 2.064 ms): perf/26500 write(fd: 3</root/perf.data>, buf: 0x7ff9486c8000, count: 3310520) = 3310520
74994.814 ( 2.619 ms): perf/26500 write(fd: 3</root/perf.data>, buf: 0x7ff947ec7000, count: 4194688) = 4194688
74997.439 ( 2.787 ms): perf/26500 write(fd: 3</root/perf.data>, buf: 0x7ff9476c6000, count: 4029760) = 4029760
Was again limited to a quarter of the mmap size:
mmap flush: 2098176
mmap size 8392704B
A warning about that would be good to have but can be added later,
something like:
"max flush is a quarter of the mmap size, if wanting to bump the mmap
flush further, bump the mmap size as well using -m/--mmap-pages"
Also rename the 'sync' parameters to 'synch' to keep tools/perf building
with older glibcs:
cc1: warnings being treated as errors
builtin-record.c: In function 'record__mmap_read_evlist':
builtin-record.c:775: warning: declaration of 'sync' shadows a global declaration
/usr/include/unistd.h:933: warning: shadowed declaration is here
builtin-record.c: In function 'record__mmap_read_all':
builtin-record.c:856: warning: declaration of 'sync' shadows a global declaration
/usr/include/unistd.h:933: warning: shadowed declaration is here
Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
Reviewed-by: Jiri Olsa <jolsa@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/f6600d72-ecfa-2eb7-7e51-f6954547d500@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2019-03-19 01:40:26 +08:00
|
|
|
if ((md->end - md->start) < md->flush)
|
2018-01-19 05:26:20 +08:00
|
|
|
return -EAGAIN;
|
2017-10-06 21:46:01 +08:00
|
|
|
|
2018-03-06 23:36:01 +08:00
|
|
|
size = md->end - md->start;
|
2017-10-06 21:46:01 +08:00
|
|
|
if (size > (unsigned long)(md->mask) + 1) {
|
2018-03-06 23:36:01 +08:00
|
|
|
if (!md->overwrite) {
|
2017-12-05 00:51:06 +08:00
|
|
|
WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n");
|
2017-10-06 21:46:01 +08:00
|
|
|
|
2017-12-05 00:51:06 +08:00
|
|
|
md->prev = head;
|
2018-03-06 23:36:05 +08:00
|
|
|
perf_mmap__consume(md);
|
2018-01-19 05:26:20 +08:00
|
|
|
return -EAGAIN;
|
2017-12-05 00:51:06 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Backward ring buffer is full. We still have a chance to read
|
|
|
|
* most of data from it.
|
|
|
|
*/
|
2018-03-13 20:31:13 +08:00
|
|
|
if (overwrite_rb_find_range(data, md->mask, &md->start, &md->end))
|
2018-01-19 05:26:20 +08:00
|
|
|
return -EINVAL;
|
2017-10-06 21:46:01 +08:00
|
|
|
}
|
|
|
|
|
2018-01-19 05:26:20 +08:00
|
|
|
return 0;
|
2018-01-19 05:26:19 +08:00
|
|
|
}
|
|
|
|
|
2018-03-26 22:42:15 +08:00
|
|
|
int perf_mmap__read_init(struct perf_mmap *map)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Check if event was unmapped due to a POLLHUP/POLLERR.
|
|
|
|
*/
|
|
|
|
if (!refcount_read(&map->refcnt))
|
|
|
|
return -ENOENT;
|
|
|
|
|
|
|
|
return __perf_mmap__read_init(map);
|
|
|
|
}
|
|
|
|
|
2018-03-06 23:36:02 +08:00
|
|
|
int perf_mmap__push(struct perf_mmap *md, void *to,
|
2018-09-13 20:54:06 +08:00
|
|
|
int push(struct perf_mmap *map, void *to, void *buf, size_t size))
|
2018-01-19 05:26:19 +08:00
|
|
|
{
|
|
|
|
u64 head = perf_mmap__read_head(md);
|
|
|
|
unsigned char *data = md->base + page_size;
|
|
|
|
unsigned long size;
|
|
|
|
void *buf;
|
|
|
|
int rc = 0;
|
|
|
|
|
2018-03-06 23:36:07 +08:00
|
|
|
rc = perf_mmap__read_init(md);
|
2018-01-19 05:26:20 +08:00
|
|
|
if (rc < 0)
|
2019-03-19 01:44:12 +08:00
|
|
|
return (rc == -EAGAIN) ? 1 : -1;
|
2018-01-19 05:26:19 +08:00
|
|
|
|
2018-03-06 23:36:02 +08:00
|
|
|
size = md->end - md->start;
|
2018-01-19 05:26:17 +08:00
|
|
|
|
2018-03-06 23:36:02 +08:00
|
|
|
if ((md->start & md->mask) + size != (md->end & md->mask)) {
|
|
|
|
buf = &data[md->start & md->mask];
|
|
|
|
size = md->mask + 1 - (md->start & md->mask);
|
|
|
|
md->start += size;
|
2017-10-06 21:46:01 +08:00
|
|
|
|
2018-09-13 20:54:06 +08:00
|
|
|
if (push(md, to, buf, size) < 0) {
|
2017-10-06 21:46:01 +08:00
|
|
|
rc = -1;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-03-06 23:36:02 +08:00
|
|
|
buf = &data[md->start & md->mask];
|
|
|
|
size = md->end - md->start;
|
|
|
|
md->start += size;
|
2017-10-06 21:46:01 +08:00
|
|
|
|
2018-09-13 20:54:06 +08:00
|
|
|
if (push(md, to, buf, size) < 0) {
|
2017-10-06 21:46:01 +08:00
|
|
|
rc = -1;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
md->prev = head;
|
2018-03-06 23:36:05 +08:00
|
|
|
perf_mmap__consume(md);
|
2017-10-06 21:46:01 +08:00
|
|
|
out:
|
|
|
|
return rc;
|
|
|
|
}
|
2018-01-19 05:26:22 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Mandatory for overwrite mode
|
|
|
|
* The direction of overwrite mode is backward.
|
|
|
|
* The last perf_mmap__read() will set tail to map->prev.
|
|
|
|
* Need to correct the map->prev to head which is the end of next read.
|
|
|
|
*/
|
|
|
|
void perf_mmap__read_done(struct perf_mmap *map)
|
|
|
|
{
|
2018-03-26 21:42:09 +08:00
|
|
|
/*
|
|
|
|
* Check if event was unmapped due to a POLLHUP/POLLERR.
|
|
|
|
*/
|
|
|
|
if (!refcount_read(&map->refcnt))
|
|
|
|
return;
|
|
|
|
|
2018-01-19 05:26:22 +08:00
|
|
|
map->prev = perf_mmap__read_head(map);
|
|
|
|
}
|