kernel_optimize_test/arch/powerpc/perf/8xx-pmu.c

206 lines
4.9 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-or-later
powerpc/8xx: Perf events on PPC 8xx This patch has been reworked since RFC version. In the RFC, this patch was preceded by a patch clearing MSR RI for all PPC32 at all time at exception prologs. Now MSR RI clearing is done only when this 8xx perf events functionality is compiled in, it is therefore limited to 8xx and merged inside this patch. Other main changes have been to take into account detailed review from Peter Zijlstra. The instructions counter has been reworked to behave as a free running counter like the three other counters. The 8xx has no PMU, however some events can be emulated by other means. This patch implements the following events (as reported by 'perf list'): cpu-cycles OR cycles [Hardware event] instructions [Hardware event] dTLB-load-misses [Hardware cache event] iTLB-load-misses [Hardware cache event] 'cycles' event is implemented using the timebase clock. Timebase clock corresponds to CPU clock divided by 16, so number of cycles is approximatly 16 times the number of TB ticks On the 8xx, TLB misses are handled by software. It is therefore easy to count all TLB misses each time the TLB miss exception is called. 'instructions' is calculated by using instruction watchpoint counter. This patch sets counter A to count instructions at address greater than 0, hence we count all instructions executed while MSR RI bit is set. The counter is set to the maximum which is 0xffff. Every 65535 instructions, debug instruction breakpoint exception fires. The exception handler increments a counter in memory which then represent the upper part of the instruction counter. We therefore end up with a 48 bits counter. In order to avoid unnecessary overhead while no perf event is active, this counter is started when the first event referring to this counter is added, and the counter is stopped when the last event referring to it is deleted. In order to properly support breakpoint exceptions, MSR RI bit has to be unset in exception epilogs in order to avoid breakpoint exceptions during critical sections during changes to SRR0 and SRR1 would be problematic. All counters are handled as free running counters. Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr> Signed-off-by: Scott Wood <oss@buserror.net>
2016-12-15 20:42:18 +08:00
/*
* Performance event support - PPC 8xx
*
* Copyright 2016 Christophe Leroy, CS Systemes d'Information
*/
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/perf_event.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <asm/pmc.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/ptrace.h>
#include <asm/code-patching.h>
#include <asm/inst.h>
powerpc/8xx: Perf events on PPC 8xx This patch has been reworked since RFC version. In the RFC, this patch was preceded by a patch clearing MSR RI for all PPC32 at all time at exception prologs. Now MSR RI clearing is done only when this 8xx perf events functionality is compiled in, it is therefore limited to 8xx and merged inside this patch. Other main changes have been to take into account detailed review from Peter Zijlstra. The instructions counter has been reworked to behave as a free running counter like the three other counters. The 8xx has no PMU, however some events can be emulated by other means. This patch implements the following events (as reported by 'perf list'): cpu-cycles OR cycles [Hardware event] instructions [Hardware event] dTLB-load-misses [Hardware cache event] iTLB-load-misses [Hardware cache event] 'cycles' event is implemented using the timebase clock. Timebase clock corresponds to CPU clock divided by 16, so number of cycles is approximatly 16 times the number of TB ticks On the 8xx, TLB misses are handled by software. It is therefore easy to count all TLB misses each time the TLB miss exception is called. 'instructions' is calculated by using instruction watchpoint counter. This patch sets counter A to count instructions at address greater than 0, hence we count all instructions executed while MSR RI bit is set. The counter is set to the maximum which is 0xffff. Every 65535 instructions, debug instruction breakpoint exception fires. The exception handler increments a counter in memory which then represent the upper part of the instruction counter. We therefore end up with a 48 bits counter. In order to avoid unnecessary overhead while no perf event is active, this counter is started when the first event referring to this counter is added, and the counter is stopped when the last event referring to it is deleted. In order to properly support breakpoint exceptions, MSR RI bit has to be unset in exception epilogs in order to avoid breakpoint exceptions during critical sections during changes to SRR0 and SRR1 would be problematic. All counters are handled as free running counters. Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr> Signed-off-by: Scott Wood <oss@buserror.net>
2016-12-15 20:42:18 +08:00
#define PERF_8xx_ID_CPU_CYCLES 1
#define PERF_8xx_ID_HW_INSTRUCTIONS 2
#define PERF_8xx_ID_ITLB_LOAD_MISS 3
#define PERF_8xx_ID_DTLB_LOAD_MISS 4
#define C(x) PERF_COUNT_HW_CACHE_##x
#define DTLB_LOAD_MISS (C(DTLB) | (C(OP_READ) << 8) | (C(RESULT_MISS) << 16))
#define ITLB_LOAD_MISS (C(ITLB) | (C(OP_READ) << 8) | (C(RESULT_MISS) << 16))
extern unsigned long itlb_miss_counter, dtlb_miss_counter;
extern atomic_t instruction_counter;
static atomic_t insn_ctr_ref;
static atomic_t itlb_miss_ref;
static atomic_t dtlb_miss_ref;
powerpc/8xx: Perf events on PPC 8xx This patch has been reworked since RFC version. In the RFC, this patch was preceded by a patch clearing MSR RI for all PPC32 at all time at exception prologs. Now MSR RI clearing is done only when this 8xx perf events functionality is compiled in, it is therefore limited to 8xx and merged inside this patch. Other main changes have been to take into account detailed review from Peter Zijlstra. The instructions counter has been reworked to behave as a free running counter like the three other counters. The 8xx has no PMU, however some events can be emulated by other means. This patch implements the following events (as reported by 'perf list'): cpu-cycles OR cycles [Hardware event] instructions [Hardware event] dTLB-load-misses [Hardware cache event] iTLB-load-misses [Hardware cache event] 'cycles' event is implemented using the timebase clock. Timebase clock corresponds to CPU clock divided by 16, so number of cycles is approximatly 16 times the number of TB ticks On the 8xx, TLB misses are handled by software. It is therefore easy to count all TLB misses each time the TLB miss exception is called. 'instructions' is calculated by using instruction watchpoint counter. This patch sets counter A to count instructions at address greater than 0, hence we count all instructions executed while MSR RI bit is set. The counter is set to the maximum which is 0xffff. Every 65535 instructions, debug instruction breakpoint exception fires. The exception handler increments a counter in memory which then represent the upper part of the instruction counter. We therefore end up with a 48 bits counter. In order to avoid unnecessary overhead while no perf event is active, this counter is started when the first event referring to this counter is added, and the counter is stopped when the last event referring to it is deleted. In order to properly support breakpoint exceptions, MSR RI bit has to be unset in exception epilogs in order to avoid breakpoint exceptions during critical sections during changes to SRR0 and SRR1 would be problematic. All counters are handled as free running counters. Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr> Signed-off-by: Scott Wood <oss@buserror.net>
2016-12-15 20:42:18 +08:00
static s64 get_insn_ctr(void)
{
int ctr;
unsigned long counta;
do {
ctr = atomic_read(&instruction_counter);
counta = mfspr(SPRN_COUNTA);
} while (ctr != atomic_read(&instruction_counter));
return ((s64)ctr << 16) | (counta >> 16);
}
static int event_type(struct perf_event *event)
{
switch (event->attr.type) {
case PERF_TYPE_HARDWARE:
if (event->attr.config == PERF_COUNT_HW_CPU_CYCLES)
return PERF_8xx_ID_CPU_CYCLES;
if (event->attr.config == PERF_COUNT_HW_INSTRUCTIONS)
return PERF_8xx_ID_HW_INSTRUCTIONS;
break;
case PERF_TYPE_HW_CACHE:
if (event->attr.config == ITLB_LOAD_MISS)
return PERF_8xx_ID_ITLB_LOAD_MISS;
if (event->attr.config == DTLB_LOAD_MISS)
return PERF_8xx_ID_DTLB_LOAD_MISS;
break;
case PERF_TYPE_RAW:
break;
default:
return -ENOENT;
}
return -EOPNOTSUPP;
}
static int mpc8xx_pmu_event_init(struct perf_event *event)
{
int type = event_type(event);
if (type < 0)
return type;
return 0;
}
static int mpc8xx_pmu_add(struct perf_event *event, int flags)
{
int type = event_type(event);
s64 val = 0;
if (type < 0)
return type;
switch (type) {
case PERF_8xx_ID_CPU_CYCLES:
val = get_tb();
break;
case PERF_8xx_ID_HW_INSTRUCTIONS:
if (atomic_inc_return(&insn_ctr_ref) == 1)
mtspr(SPRN_ICTRL, 0xc0080007);
val = get_insn_ctr();
break;
case PERF_8xx_ID_ITLB_LOAD_MISS:
if (atomic_inc_return(&itlb_miss_ref) == 1) {
unsigned long target = patch_site_addr(&patch__itlbmiss_perf);
patch_branch_site(&patch__itlbmiss_exit_1, target, 0);
}
powerpc/8xx: Perf events on PPC 8xx This patch has been reworked since RFC version. In the RFC, this patch was preceded by a patch clearing MSR RI for all PPC32 at all time at exception prologs. Now MSR RI clearing is done only when this 8xx perf events functionality is compiled in, it is therefore limited to 8xx and merged inside this patch. Other main changes have been to take into account detailed review from Peter Zijlstra. The instructions counter has been reworked to behave as a free running counter like the three other counters. The 8xx has no PMU, however some events can be emulated by other means. This patch implements the following events (as reported by 'perf list'): cpu-cycles OR cycles [Hardware event] instructions [Hardware event] dTLB-load-misses [Hardware cache event] iTLB-load-misses [Hardware cache event] 'cycles' event is implemented using the timebase clock. Timebase clock corresponds to CPU clock divided by 16, so number of cycles is approximatly 16 times the number of TB ticks On the 8xx, TLB misses are handled by software. It is therefore easy to count all TLB misses each time the TLB miss exception is called. 'instructions' is calculated by using instruction watchpoint counter. This patch sets counter A to count instructions at address greater than 0, hence we count all instructions executed while MSR RI bit is set. The counter is set to the maximum which is 0xffff. Every 65535 instructions, debug instruction breakpoint exception fires. The exception handler increments a counter in memory which then represent the upper part of the instruction counter. We therefore end up with a 48 bits counter. In order to avoid unnecessary overhead while no perf event is active, this counter is started when the first event referring to this counter is added, and the counter is stopped when the last event referring to it is deleted. In order to properly support breakpoint exceptions, MSR RI bit has to be unset in exception epilogs in order to avoid breakpoint exceptions during critical sections during changes to SRR0 and SRR1 would be problematic. All counters are handled as free running counters. Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr> Signed-off-by: Scott Wood <oss@buserror.net>
2016-12-15 20:42:18 +08:00
val = itlb_miss_counter;
break;
case PERF_8xx_ID_DTLB_LOAD_MISS:
if (atomic_inc_return(&dtlb_miss_ref) == 1) {
unsigned long target = patch_site_addr(&patch__dtlbmiss_perf);
patch_branch_site(&patch__dtlbmiss_exit_1, target, 0);
}
powerpc/8xx: Perf events on PPC 8xx This patch has been reworked since RFC version. In the RFC, this patch was preceded by a patch clearing MSR RI for all PPC32 at all time at exception prologs. Now MSR RI clearing is done only when this 8xx perf events functionality is compiled in, it is therefore limited to 8xx and merged inside this patch. Other main changes have been to take into account detailed review from Peter Zijlstra. The instructions counter has been reworked to behave as a free running counter like the three other counters. The 8xx has no PMU, however some events can be emulated by other means. This patch implements the following events (as reported by 'perf list'): cpu-cycles OR cycles [Hardware event] instructions [Hardware event] dTLB-load-misses [Hardware cache event] iTLB-load-misses [Hardware cache event] 'cycles' event is implemented using the timebase clock. Timebase clock corresponds to CPU clock divided by 16, so number of cycles is approximatly 16 times the number of TB ticks On the 8xx, TLB misses are handled by software. It is therefore easy to count all TLB misses each time the TLB miss exception is called. 'instructions' is calculated by using instruction watchpoint counter. This patch sets counter A to count instructions at address greater than 0, hence we count all instructions executed while MSR RI bit is set. The counter is set to the maximum which is 0xffff. Every 65535 instructions, debug instruction breakpoint exception fires. The exception handler increments a counter in memory which then represent the upper part of the instruction counter. We therefore end up with a 48 bits counter. In order to avoid unnecessary overhead while no perf event is active, this counter is started when the first event referring to this counter is added, and the counter is stopped when the last event referring to it is deleted. In order to properly support breakpoint exceptions, MSR RI bit has to be unset in exception epilogs in order to avoid breakpoint exceptions during critical sections during changes to SRR0 and SRR1 would be problematic. All counters are handled as free running counters. Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr> Signed-off-by: Scott Wood <oss@buserror.net>
2016-12-15 20:42:18 +08:00
val = dtlb_miss_counter;
break;
}
local64_set(&event->hw.prev_count, val);
return 0;
}
static void mpc8xx_pmu_read(struct perf_event *event)
{
int type = event_type(event);
s64 prev, val = 0, delta = 0;
if (type < 0)
return;
do {
prev = local64_read(&event->hw.prev_count);
switch (type) {
case PERF_8xx_ID_CPU_CYCLES:
val = get_tb();
delta = 16 * (val - prev);
break;
case PERF_8xx_ID_HW_INSTRUCTIONS:
val = get_insn_ctr();
delta = prev - val;
if (delta < 0)
delta += 0x1000000000000LL;
break;
case PERF_8xx_ID_ITLB_LOAD_MISS:
val = itlb_miss_counter;
delta = (s64)((s32)val - (s32)prev);
break;
case PERF_8xx_ID_DTLB_LOAD_MISS:
val = dtlb_miss_counter;
delta = (s64)((s32)val - (s32)prev);
break;
}
} while (local64_cmpxchg(&event->hw.prev_count, prev, val) != prev);
local64_add(delta, &event->count);
}
static void mpc8xx_pmu_del(struct perf_event *event, int flags)
{
mpc8xx_pmu_read(event);
/* If it was the last user, stop counting to avoid useles overhead */
switch (event_type(event)) {
case PERF_8xx_ID_CPU_CYCLES:
break;
case PERF_8xx_ID_HW_INSTRUCTIONS:
if (atomic_dec_return(&insn_ctr_ref) == 0)
mtspr(SPRN_ICTRL, 7);
break;
case PERF_8xx_ID_ITLB_LOAD_MISS:
if (atomic_dec_return(&itlb_miss_ref) == 0) {
/* mfspr r10, SPRN_SPRG_SCRATCH0 */
struct ppc_inst insn = ppc_inst(PPC_INST_MFSPR | __PPC_RS(R10) |
__PPC_SPR(SPRN_SPRG_SCRATCH0));
patch_instruction_site(&patch__itlbmiss_exit_1, insn);
}
break;
case PERF_8xx_ID_DTLB_LOAD_MISS:
if (atomic_dec_return(&dtlb_miss_ref) == 0) {
/* mfspr r10, SPRN_DAR */
struct ppc_inst insn = ppc_inst(PPC_INST_MFSPR | __PPC_RS(R10) |
__PPC_SPR(SPRN_DAR));
patch_instruction_site(&patch__dtlbmiss_exit_1, insn);
}
break;
}
powerpc/8xx: Perf events on PPC 8xx This patch has been reworked since RFC version. In the RFC, this patch was preceded by a patch clearing MSR RI for all PPC32 at all time at exception prologs. Now MSR RI clearing is done only when this 8xx perf events functionality is compiled in, it is therefore limited to 8xx and merged inside this patch. Other main changes have been to take into account detailed review from Peter Zijlstra. The instructions counter has been reworked to behave as a free running counter like the three other counters. The 8xx has no PMU, however some events can be emulated by other means. This patch implements the following events (as reported by 'perf list'): cpu-cycles OR cycles [Hardware event] instructions [Hardware event] dTLB-load-misses [Hardware cache event] iTLB-load-misses [Hardware cache event] 'cycles' event is implemented using the timebase clock. Timebase clock corresponds to CPU clock divided by 16, so number of cycles is approximatly 16 times the number of TB ticks On the 8xx, TLB misses are handled by software. It is therefore easy to count all TLB misses each time the TLB miss exception is called. 'instructions' is calculated by using instruction watchpoint counter. This patch sets counter A to count instructions at address greater than 0, hence we count all instructions executed while MSR RI bit is set. The counter is set to the maximum which is 0xffff. Every 65535 instructions, debug instruction breakpoint exception fires. The exception handler increments a counter in memory which then represent the upper part of the instruction counter. We therefore end up with a 48 bits counter. In order to avoid unnecessary overhead while no perf event is active, this counter is started when the first event referring to this counter is added, and the counter is stopped when the last event referring to it is deleted. In order to properly support breakpoint exceptions, MSR RI bit has to be unset in exception epilogs in order to avoid breakpoint exceptions during critical sections during changes to SRR0 and SRR1 would be problematic. All counters are handled as free running counters. Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr> Signed-off-by: Scott Wood <oss@buserror.net>
2016-12-15 20:42:18 +08:00
}
static struct pmu mpc8xx_pmu = {
.event_init = mpc8xx_pmu_event_init,
.add = mpc8xx_pmu_add,
.del = mpc8xx_pmu_del,
.read = mpc8xx_pmu_read,
.capabilities = PERF_PMU_CAP_NO_INTERRUPT |
PERF_PMU_CAP_NO_NMI,
};
static int init_mpc8xx_pmu(void)
{
mtspr(SPRN_ICTRL, 7);
mtspr(SPRN_CMPA, 0);
mtspr(SPRN_COUNTA, 0xffff);
return perf_pmu_register(&mpc8xx_pmu, "cpu", PERF_TYPE_RAW);
}
early_initcall(init_mpc8xx_pmu);