2019-05-27 14:55:01 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
powerpc/8xx: Perf events on PPC 8xx
This patch has been reworked since RFC version. In the RFC, this patch
was preceded by a patch clearing MSR RI for all PPC32 at all time at
exception prologs. Now MSR RI clearing is done only when this 8xx perf
events functionality is compiled in, it is therefore limited to 8xx
and merged inside this patch.
Other main changes have been to take into account detailed review from
Peter Zijlstra. The instructions counter has been reworked to behave
as a free running counter like the three other counters.
The 8xx has no PMU, however some events can be emulated by other means.
This patch implements the following events (as reported by 'perf list'):
cpu-cycles OR cycles [Hardware event]
instructions [Hardware event]
dTLB-load-misses [Hardware cache event]
iTLB-load-misses [Hardware cache event]
'cycles' event is implemented using the timebase clock. Timebase clock
corresponds to CPU clock divided by 16, so number of cycles is
approximately 16 times the number of TB ticks.
On the 8xx, TLB misses are handled by software. It is therefore
easy to count all TLB misses each time the TLB miss exception is
called.
'instructions' is calculated by using instruction watchpoint counter.
This patch sets counter A to count instructions at address greater
than 0, hence we count all instructions executed while MSR RI bit is
set. The counter is set to the maximum which is 0xffff. Every 65535
instructions, debug instruction breakpoint exception fires. The
exception handler increments a counter in memory which then
represents the upper part of the instruction counter. We therefore
end up with a 48-bit counter. In order to avoid unnecessary overhead
while no perf event is active, this counter is started when the first
event referring to this counter is added, and the counter is stopped
when the last event referring to it is deleted. In order to properly
support breakpoint exceptions, MSR RI bit has to be unset in exception
epilogs, because breakpoint exceptions during the critical sections
that modify SRR0 and SRR1 would be problematic.
All counters are handled as free running counters.
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Scott Wood <oss@buserror.net>
2016-12-15 20:42:18 +08:00
|
|
|
/*
|
|
|
|
* Performance event support - PPC 8xx
|
|
|
|
*
|
|
|
|
* Copyright 2016 Christophe Leroy, CS Systemes d'Information
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/sched.h>
|
|
|
|
#include <linux/perf_event.h>
|
|
|
|
#include <linux/percpu.h>
|
|
|
|
#include <linux/hardirq.h>
|
|
|
|
#include <asm/pmc.h>
|
|
|
|
#include <asm/machdep.h>
|
|
|
|
#include <asm/firmware.h>
|
|
|
|
#include <asm/ptrace.h>
|
2018-01-12 20:45:23 +08:00
|
|
|
#include <asm/code-patching.h>
|
2020-05-06 11:40:26 +08:00
|
|
|
#include <asm/inst.h>
|
powerpc/8xx: Perf events on PPC 8xx
This patch has been reworked since RFC version. In the RFC, this patch
was preceded by a patch clearing MSR RI for all PPC32 at all time at
exception prologs. Now MSR RI clearing is done only when this 8xx perf
events functionality is compiled in, it is therefore limited to 8xx
and merged inside this patch.
Other main changes have been to take into account detailed review from
Peter Zijlstra. The instructions counter has been reworked to behave
as a free running counter like the three other counters.
The 8xx has no PMU, however some events can be emulated by other means.
This patch implements the following events (as reported by 'perf list'):
cpu-cycles OR cycles [Hardware event]
instructions [Hardware event]
dTLB-load-misses [Hardware cache event]
iTLB-load-misses [Hardware cache event]
'cycles' event is implemented using the timebase clock. Timebase clock
corresponds to CPU clock divided by 16, so number of cycles is
approximately 16 times the number of TB ticks.
On the 8xx, TLB misses are handled by software. It is therefore
easy to count all TLB misses each time the TLB miss exception is
called.
'instructions' is calculated by using instruction watchpoint counter.
This patch sets counter A to count instructions at address greater
than 0, hence we count all instructions executed while MSR RI bit is
set. The counter is set to the maximum which is 0xffff. Every 65535
instructions, debug instruction breakpoint exception fires. The
exception handler increments a counter in memory which then
represents the upper part of the instruction counter. We therefore
end up with a 48-bit counter. In order to avoid unnecessary overhead
while no perf event is active, this counter is started when the first
event referring to this counter is added, and the counter is stopped
when the last event referring to it is deleted. In order to properly
support breakpoint exceptions, MSR RI bit has to be unset in exception
epilogs, because breakpoint exceptions during the critical sections
that modify SRR0 and SRR1 would be problematic.
All counters are handled as free running counters.
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Scott Wood <oss@buserror.net>
2016-12-15 20:42:18 +08:00
|
|
|
|
|
|
|
#define PERF_8xx_ID_CPU_CYCLES 1
|
|
|
|
#define PERF_8xx_ID_HW_INSTRUCTIONS 2
|
|
|
|
#define PERF_8xx_ID_ITLB_LOAD_MISS 3
|
|
|
|
#define PERF_8xx_ID_DTLB_LOAD_MISS 4
|
|
|
|
|
|
|
|
#define C(x) PERF_COUNT_HW_CACHE_##x
|
|
|
|
#define DTLB_LOAD_MISS (C(DTLB) | (C(OP_READ) << 8) | (C(RESULT_MISS) << 16))
|
|
|
|
#define ITLB_LOAD_MISS (C(ITLB) | (C(OP_READ) << 8) | (C(RESULT_MISS) << 16))
|
|
|
|
|
|
|
|
extern unsigned long itlb_miss_counter, dtlb_miss_counter;
|
|
|
|
extern atomic_t instruction_counter;
|
|
|
|
|
|
|
|
static atomic_t insn_ctr_ref;
|
2018-01-12 20:45:23 +08:00
|
|
|
static atomic_t itlb_miss_ref;
|
|
|
|
static atomic_t dtlb_miss_ref;
|
powerpc/8xx: Perf events on PPC 8xx
This patch has been reworked since RFC version. In the RFC, this patch
was preceded by a patch clearing MSR RI for all PPC32 at all time at
exception prologs. Now MSR RI clearing is done only when this 8xx perf
events functionality is compiled in, it is therefore limited to 8xx
and merged inside this patch.
Other main changes have been to take into account detailed review from
Peter Zijlstra. The instructions counter has been reworked to behave
as a free running counter like the three other counters.
The 8xx has no PMU, however some events can be emulated by other means.
This patch implements the following events (as reported by 'perf list'):
cpu-cycles OR cycles [Hardware event]
instructions [Hardware event]
dTLB-load-misses [Hardware cache event]
iTLB-load-misses [Hardware cache event]
'cycles' event is implemented using the timebase clock. Timebase clock
corresponds to CPU clock divided by 16, so number of cycles is
approximately 16 times the number of TB ticks.
On the 8xx, TLB misses are handled by software. It is therefore
easy to count all TLB misses each time the TLB miss exception is
called.
'instructions' is calculated by using instruction watchpoint counter.
This patch sets counter A to count instructions at address greater
than 0, hence we count all instructions executed while MSR RI bit is
set. The counter is set to the maximum which is 0xffff. Every 65535
instructions, debug instruction breakpoint exception fires. The
exception handler increments a counter in memory which then
represents the upper part of the instruction counter. We therefore
end up with a 48-bit counter. In order to avoid unnecessary overhead
while no perf event is active, this counter is started when the first
event referring to this counter is added, and the counter is stopped
when the last event referring to it is deleted. In order to properly
support breakpoint exceptions, MSR RI bit has to be unset in exception
epilogs, because breakpoint exceptions during the critical sections
that modify SRR0 and SRR1 would be problematic.
All counters are handled as free running counters.
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Scott Wood <oss@buserror.net>
2016-12-15 20:42:18 +08:00
|
|
|
|
|
|
|
static s64 get_insn_ctr(void)
|
|
|
|
{
|
|
|
|
int ctr;
|
|
|
|
unsigned long counta;
|
|
|
|
|
|
|
|
do {
|
|
|
|
ctr = atomic_read(&instruction_counter);
|
|
|
|
counta = mfspr(SPRN_COUNTA);
|
|
|
|
} while (ctr != atomic_read(&instruction_counter));
|
|
|
|
|
|
|
|
return ((s64)ctr << 16) | (counta >> 16);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int event_type(struct perf_event *event)
|
|
|
|
{
|
|
|
|
switch (event->attr.type) {
|
|
|
|
case PERF_TYPE_HARDWARE:
|
|
|
|
if (event->attr.config == PERF_COUNT_HW_CPU_CYCLES)
|
|
|
|
return PERF_8xx_ID_CPU_CYCLES;
|
|
|
|
if (event->attr.config == PERF_COUNT_HW_INSTRUCTIONS)
|
|
|
|
return PERF_8xx_ID_HW_INSTRUCTIONS;
|
|
|
|
break;
|
|
|
|
case PERF_TYPE_HW_CACHE:
|
|
|
|
if (event->attr.config == ITLB_LOAD_MISS)
|
|
|
|
return PERF_8xx_ID_ITLB_LOAD_MISS;
|
|
|
|
if (event->attr.config == DTLB_LOAD_MISS)
|
|
|
|
return PERF_8xx_ID_DTLB_LOAD_MISS;
|
|
|
|
break;
|
|
|
|
case PERF_TYPE_RAW:
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return -ENOENT;
|
|
|
|
}
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * pmu::event_init callback: accept the event only if event_type()
 * recognises it.
 *
 * Return: 0 on success, otherwise the negative error from event_type().
 */
static int mpc8xx_pmu_event_init(struct perf_event *event)
{
	int id = event_type(event);

	return id < 0 ? id : 0;
}
|
|
|
|
|
|
|
|
static int mpc8xx_pmu_add(struct perf_event *event, int flags)
|
|
|
|
{
|
|
|
|
int type = event_type(event);
|
|
|
|
s64 val = 0;
|
|
|
|
|
|
|
|
if (type < 0)
|
|
|
|
return type;
|
|
|
|
|
|
|
|
switch (type) {
|
|
|
|
case PERF_8xx_ID_CPU_CYCLES:
|
|
|
|
val = get_tb();
|
|
|
|
break;
|
|
|
|
case PERF_8xx_ID_HW_INSTRUCTIONS:
|
|
|
|
if (atomic_inc_return(&insn_ctr_ref) == 1)
|
|
|
|
mtspr(SPRN_ICTRL, 0xc0080007);
|
|
|
|
val = get_insn_ctr();
|
|
|
|
break;
|
|
|
|
case PERF_8xx_ID_ITLB_LOAD_MISS:
|
2018-01-12 20:45:23 +08:00
|
|
|
if (atomic_inc_return(&itlb_miss_ref) == 1) {
|
2018-10-19 14:55:08 +08:00
|
|
|
unsigned long target = patch_site_addr(&patch__itlbmiss_perf);
|
2018-01-12 20:45:23 +08:00
|
|
|
|
2018-10-19 14:55:08 +08:00
|
|
|
patch_branch_site(&patch__itlbmiss_exit_1, target, 0);
|
2018-01-12 20:45:23 +08:00
|
|
|
}
|
powerpc/8xx: Perf events on PPC 8xx
This patch has been reworked since RFC version. In the RFC, this patch
was preceded by a patch clearing MSR RI for all PPC32 at all time at
exception prologs. Now MSR RI clearing is done only when this 8xx perf
events functionality is compiled in, it is therefore limited to 8xx
and merged inside this patch.
Other main changes have been to take into account detailed review from
Peter Zijlstra. The instructions counter has been reworked to behave
as a free running counter like the three other counters.
The 8xx has no PMU, however some events can be emulated by other means.
This patch implements the following events (as reported by 'perf list'):
cpu-cycles OR cycles [Hardware event]
instructions [Hardware event]
dTLB-load-misses [Hardware cache event]
iTLB-load-misses [Hardware cache event]
'cycles' event is implemented using the timebase clock. Timebase clock
corresponds to CPU clock divided by 16, so number of cycles is
approximatly 16 times the number of TB ticks
On the 8xx, TLB misses are handled by software. It is therefore
easy to count all TLB misses each time the TLB miss exception is
called.
'instructions' is calculated by using instruction watchpoint counter.
This patch sets counter A to count instructions at address greater
than 0, hence we count all instructions executed while MSR RI bit is
set. The counter is set to the maximum which is 0xffff. Every 65535
instructions, debug instruction breakpoint exception fires. The
exception handler increments a counter in memory which then
represent the upper part of the instruction counter. We therefore
end up with a 48 bits counter. In order to avoid unnecessary overhead
while no perf event is active, this counter is started when the first
event referring to this counter is added, and the counter is stopped
when the last event referring to it is deleted. In order to properly
support breakpoint exceptions, MSR RI bit has to be unset in exception
epilogs in order to avoid breakpoint exceptions during critical
sections during changes to SRR0 and SRR1 would be problematic.
All counters are handled as free running counters.
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Scott Wood <oss@buserror.net>
2016-12-15 20:42:18 +08:00
|
|
|
val = itlb_miss_counter;
|
|
|
|
break;
|
|
|
|
case PERF_8xx_ID_DTLB_LOAD_MISS:
|
2018-01-12 20:45:23 +08:00
|
|
|
if (atomic_inc_return(&dtlb_miss_ref) == 1) {
|
2018-10-19 14:55:08 +08:00
|
|
|
unsigned long target = patch_site_addr(&patch__dtlbmiss_perf);
|
2018-01-12 20:45:23 +08:00
|
|
|
|
2018-10-19 14:55:08 +08:00
|
|
|
patch_branch_site(&patch__dtlbmiss_exit_1, target, 0);
|
2018-01-12 20:45:23 +08:00
|
|
|
}
|
powerpc/8xx: Perf events on PPC 8xx
This patch has been reworked since RFC version. In the RFC, this patch
was preceded by a patch clearing MSR RI for all PPC32 at all time at
exception prologs. Now MSR RI clearing is done only when this 8xx perf
events functionality is compiled in, it is therefore limited to 8xx
and merged inside this patch.
Other main changes have been to take into account detailed review from
Peter Zijlstra. The instructions counter has been reworked to behave
as a free running counter like the three other counters.
The 8xx has no PMU, however some events can be emulated by other means.
This patch implements the following events (as reported by 'perf list'):
cpu-cycles OR cycles [Hardware event]
instructions [Hardware event]
dTLB-load-misses [Hardware cache event]
iTLB-load-misses [Hardware cache event]
'cycles' event is implemented using the timebase clock. Timebase clock
corresponds to CPU clock divided by 16, so number of cycles is
approximatly 16 times the number of TB ticks
On the 8xx, TLB misses are handled by software. It is therefore
easy to count all TLB misses each time the TLB miss exception is
called.
'instructions' is calculated by using instruction watchpoint counter.
This patch sets counter A to count instructions at address greater
than 0, hence we count all instructions executed while MSR RI bit is
set. The counter is set to the maximum which is 0xffff. Every 65535
instructions, debug instruction breakpoint exception fires. The
exception handler increments a counter in memory which then
represent the upper part of the instruction counter. We therefore
end up with a 48 bits counter. In order to avoid unnecessary overhead
while no perf event is active, this counter is started when the first
event referring to this counter is added, and the counter is stopped
when the last event referring to it is deleted. In order to properly
support breakpoint exceptions, MSR RI bit has to be unset in exception
epilogs in order to avoid breakpoint exceptions during critical
sections during changes to SRR0 and SRR1 would be problematic.
All counters are handled as free running counters.
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Scott Wood <oss@buserror.net>
2016-12-15 20:42:18 +08:00
|
|
|
val = dtlb_miss_counter;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
local64_set(&event->hw.prev_count, val);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void mpc8xx_pmu_read(struct perf_event *event)
|
|
|
|
{
|
|
|
|
int type = event_type(event);
|
|
|
|
s64 prev, val = 0, delta = 0;
|
|
|
|
|
|
|
|
if (type < 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
do {
|
|
|
|
prev = local64_read(&event->hw.prev_count);
|
|
|
|
switch (type) {
|
|
|
|
case PERF_8xx_ID_CPU_CYCLES:
|
|
|
|
val = get_tb();
|
|
|
|
delta = 16 * (val - prev);
|
|
|
|
break;
|
|
|
|
case PERF_8xx_ID_HW_INSTRUCTIONS:
|
|
|
|
val = get_insn_ctr();
|
|
|
|
delta = prev - val;
|
|
|
|
if (delta < 0)
|
|
|
|
delta += 0x1000000000000LL;
|
|
|
|
break;
|
|
|
|
case PERF_8xx_ID_ITLB_LOAD_MISS:
|
|
|
|
val = itlb_miss_counter;
|
|
|
|
delta = (s64)((s32)val - (s32)prev);
|
|
|
|
break;
|
|
|
|
case PERF_8xx_ID_DTLB_LOAD_MISS:
|
|
|
|
val = dtlb_miss_counter;
|
|
|
|
delta = (s64)((s32)val - (s32)prev);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} while (local64_cmpxchg(&event->hw.prev_count, prev, val) != prev);
|
|
|
|
|
|
|
|
local64_add(delta, &event->count);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void mpc8xx_pmu_del(struct perf_event *event, int flags)
|
|
|
|
{
|
|
|
|
mpc8xx_pmu_read(event);
|
|
|
|
|
|
|
|
/* If it was the last user, stop counting to avoid useles overhead */
|
2018-01-12 20:45:23 +08:00
|
|
|
switch (event_type(event)) {
|
|
|
|
case PERF_8xx_ID_CPU_CYCLES:
|
|
|
|
break;
|
|
|
|
case PERF_8xx_ID_HW_INSTRUCTIONS:
|
|
|
|
if (atomic_dec_return(&insn_ctr_ref) == 0)
|
|
|
|
mtspr(SPRN_ICTRL, 7);
|
|
|
|
break;
|
|
|
|
case PERF_8xx_ID_ITLB_LOAD_MISS:
|
|
|
|
if (atomic_dec_return(&itlb_miss_ref) == 0) {
|
2019-12-21 16:32:31 +08:00
|
|
|
/* mfspr r10, SPRN_SPRG_SCRATCH0 */
|
2020-05-06 11:40:26 +08:00
|
|
|
struct ppc_inst insn = ppc_inst(PPC_INST_MFSPR | __PPC_RS(R10) |
|
|
|
|
__PPC_SPR(SPRN_SPRG_SCRATCH0));
|
2019-12-21 16:32:31 +08:00
|
|
|
|
2018-10-19 14:55:08 +08:00
|
|
|
patch_instruction_site(&patch__itlbmiss_exit_1, insn);
|
2018-01-12 20:45:23 +08:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
case PERF_8xx_ID_DTLB_LOAD_MISS:
|
|
|
|
if (atomic_dec_return(&dtlb_miss_ref) == 0) {
|
2019-12-21 16:32:31 +08:00
|
|
|
/* mfspr r10, SPRN_DAR */
|
2020-05-06 11:40:26 +08:00
|
|
|
struct ppc_inst insn = ppc_inst(PPC_INST_MFSPR | __PPC_RS(R10) |
|
|
|
|
__PPC_SPR(SPRN_DAR));
|
2019-12-21 16:32:31 +08:00
|
|
|
|
2018-10-19 14:55:08 +08:00
|
|
|
patch_instruction_site(&patch__dtlbmiss_exit_1, insn);
|
2018-01-12 20:45:23 +08:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
powerpc/8xx: Perf events on PPC 8xx
This patch has been reworked since RFC version. In the RFC, this patch
was preceded by a patch clearing MSR RI for all PPC32 at all time at
exception prologs. Now MSR RI clearing is done only when this 8xx perf
events functionality is compiled in, it is therefore limited to 8xx
and merged inside this patch.
Other main changes have been to take into account detailed review from
Peter Zijlstra. The instructions counter has been reworked to behave
as a free running counter like the three other counters.
The 8xx has no PMU, however some events can be emulated by other means.
This patch implements the following events (as reported by 'perf list'):
cpu-cycles OR cycles [Hardware event]
instructions [Hardware event]
dTLB-load-misses [Hardware cache event]
iTLB-load-misses [Hardware cache event]
'cycles' event is implemented using the timebase clock. Timebase clock
corresponds to CPU clock divided by 16, so number of cycles is
approximatly 16 times the number of TB ticks
On the 8xx, TLB misses are handled by software. It is therefore
easy to count all TLB misses each time the TLB miss exception is
called.
'instructions' is calculated by using instruction watchpoint counter.
This patch sets counter A to count instructions at address greater
than 0, hence we count all instructions executed while MSR RI bit is
set. The counter is set to the maximum which is 0xffff. Every 65535
instructions, debug instruction breakpoint exception fires. The
exception handler increments a counter in memory which then
represent the upper part of the instruction counter. We therefore
end up with a 48 bits counter. In order to avoid unnecessary overhead
while no perf event is active, this counter is started when the first
event referring to this counter is added, and the counter is stopped
when the last event referring to it is deleted. In order to properly
support breakpoint exceptions, MSR RI bit has to be unset in exception
epilogs in order to avoid breakpoint exceptions during critical
sections during changes to SRR0 and SRR1 would be problematic.
All counters are handled as free running counters.
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Scott Wood <oss@buserror.net>
2016-12-15 20:42:18 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static struct pmu mpc8xx_pmu = {
|
|
|
|
.event_init = mpc8xx_pmu_event_init,
|
|
|
|
.add = mpc8xx_pmu_add,
|
|
|
|
.del = mpc8xx_pmu_del,
|
|
|
|
.read = mpc8xx_pmu_read,
|
|
|
|
.capabilities = PERF_PMU_CAP_NO_INTERRUPT |
|
|
|
|
PERF_PMU_CAP_NO_NMI,
|
|
|
|
};
|
|
|
|
|
|
|
|
static int init_mpc8xx_pmu(void)
|
|
|
|
{
|
|
|
|
mtspr(SPRN_ICTRL, 7);
|
|
|
|
mtspr(SPRN_CMPA, 0);
|
|
|
|
mtspr(SPRN_COUNTA, 0xffff);
|
|
|
|
|
|
|
|
return perf_pmu_register(&mpc8xx_pmu, "cpu", PERF_TYPE_RAW);
|
|
|
|
}
|
|
|
|
|
|
|
|
early_initcall(init_mpc8xx_pmu);
|