x86-64: Move vread_tsc into a new file with sensible options
vread_tsc is short and hot, and it's userspace code so the usual reasons to enable -pg and turn off sibling calls don't apply. (OK, turning off sibling calls has no effect. But it might someday...)

As an added benefit, tsc.c is profilable now.

Signed-off-by: Andy Lutomirski <luto@mit.edu>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Borislav Petkov <bp@amd64.org>
Link: http://lkml.kernel.org/r/%3C99c6d7f5efa3ccb65b4ac6eb443e1ab7bad47d7b.1306156808.git.luto%40mit.edu%3E
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
This commit is contained in:
parent
0f51f2852c
commit
44259b1abf
@ -51,6 +51,10 @@ extern int unsynchronized_tsc(void);
|
|||||||
extern int check_tsc_unstable(void);
|
extern int check_tsc_unstable(void);
|
||||||
extern unsigned long native_calibrate_tsc(void);
|
extern unsigned long native_calibrate_tsc(void);
|
||||||
|
|
||||||
|
#ifdef CONFIG_X86_64
|
||||||
|
extern cycles_t vread_tsc(void);
|
||||||
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Boot-time check whether the TSCs are synchronized across
|
* Boot-time check whether the TSCs are synchronized across
|
||||||
* all CPUs/cores:
|
* all CPUs/cores:
|
||||||
|
@ -8,7 +8,6 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
|
|||||||
|
|
||||||
ifdef CONFIG_FUNCTION_TRACER
|
ifdef CONFIG_FUNCTION_TRACER
|
||||||
# Do not profile debug and lowlevel utilities
|
# Do not profile debug and lowlevel utilities
|
||||||
CFLAGS_REMOVE_tsc.o = -pg
|
|
||||||
CFLAGS_REMOVE_rtc.o = -pg
|
CFLAGS_REMOVE_rtc.o = -pg
|
||||||
CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
|
CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
|
||||||
CFLAGS_REMOVE_pvclock.o = -pg
|
CFLAGS_REMOVE_pvclock.o = -pg
|
||||||
@ -24,13 +23,16 @@ endif
|
|||||||
nostackp := $(call cc-option, -fno-stack-protector)
|
nostackp := $(call cc-option, -fno-stack-protector)
|
||||||
CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
|
CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
|
||||||
CFLAGS_hpet.o := $(nostackp)
|
CFLAGS_hpet.o := $(nostackp)
|
||||||
CFLAGS_tsc.o := $(nostackp)
|
CFLAGS_vread_tsc_64.o := $(nostackp)
|
||||||
CFLAGS_paravirt.o := $(nostackp)
|
CFLAGS_paravirt.o := $(nostackp)
|
||||||
GCOV_PROFILE_vsyscall_64.o := n
|
GCOV_PROFILE_vsyscall_64.o := n
|
||||||
GCOV_PROFILE_hpet.o := n
|
GCOV_PROFILE_hpet.o := n
|
||||||
GCOV_PROFILE_tsc.o := n
|
GCOV_PROFILE_tsc.o := n
|
||||||
GCOV_PROFILE_paravirt.o := n
|
GCOV_PROFILE_paravirt.o := n
|
||||||
|
|
||||||
|
# vread_tsc_64 is hot and should be fully optimized:
|
||||||
|
CFLAGS_REMOVE_vread_tsc_64.o = -pg -fno-optimize-sibling-calls
|
||||||
|
|
||||||
obj-y := process_$(BITS).o signal.o entry_$(BITS).o
|
obj-y := process_$(BITS).o signal.o entry_$(BITS).o
|
||||||
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
|
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
|
||||||
obj-y += time.o ioport.o ldt.o dumpstack.o
|
obj-y += time.o ioport.o ldt.o dumpstack.o
|
||||||
@ -39,7 +41,7 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o
|
|||||||
obj-y += probe_roms.o
|
obj-y += probe_roms.o
|
||||||
obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
|
obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
|
||||||
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
|
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
|
||||||
obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
|
obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o
|
||||||
obj-y += bootflag.o e820.o
|
obj-y += bootflag.o e820.o
|
||||||
obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
|
obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
|
||||||
obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
|
obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
|
||||||
|
@ -763,40 +763,6 @@ static cycle_t read_tsc(struct clocksource *cs)
|
|||||||
ret : clocksource_tsc.cycle_last;
|
ret : clocksource_tsc.cycle_last;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_X86_64
|
|
||||||
static cycle_t __vsyscall_fn vread_tsc(void)
|
|
||||||
{
|
|
||||||
cycle_t ret;
|
|
||||||
u64 last;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Empirically, a fence (of type that depends on the CPU)
|
|
||||||
* before rdtsc is enough to ensure that rdtsc is ordered
|
|
||||||
* with respect to loads. The various CPU manuals are unclear
|
|
||||||
* as to whether rdtsc can be reordered with later loads,
|
|
||||||
* but no one has ever seen it happen.
|
|
||||||
*/
|
|
||||||
rdtsc_barrier();
|
|
||||||
ret = (cycle_t)vget_cycles();
|
|
||||||
|
|
||||||
last = VVAR(vsyscall_gtod_data).clock.cycle_last;
|
|
||||||
|
|
||||||
if (likely(ret >= last))
|
|
||||||
return ret;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* GCC likes to generate cmov here, but this branch is extremely
|
|
||||||
* predictable (it's just a function of time and the likely is
|
|
||||||
* very likely) and there's a data dependence, so force GCC
|
|
||||||
* to generate a branch instead. I don't barrier() because
|
|
||||||
* we don't actually need a barrier, and if this function
|
|
||||||
* ever gets inlined it will generate worse code.
|
|
||||||
*/
|
|
||||||
asm volatile ("");
|
|
||||||
return last;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static void resume_tsc(struct clocksource *cs)
|
static void resume_tsc(struct clocksource *cs)
|
||||||
{
|
{
|
||||||
clocksource_tsc.cycle_last = 0;
|
clocksource_tsc.cycle_last = 0;
|
||||||
|
36
arch/x86/kernel/vread_tsc_64.c
Normal file
36
arch/x86/kernel/vread_tsc_64.c
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
/* This code runs in userspace. */
|
||||||
|
|
||||||
|
#define DISABLE_BRANCH_PROFILING
|
||||||
|
#include <asm/vgtod.h>
|
||||||
|
|
||||||
|
notrace cycle_t __vsyscall_fn vread_tsc(void)
|
||||||
|
{
|
||||||
|
cycle_t ret;
|
||||||
|
u64 last;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Empirically, a fence (of type that depends on the CPU)
|
||||||
|
* before rdtsc is enough to ensure that rdtsc is ordered
|
||||||
|
* with respect to loads. The various CPU manuals are unclear
|
||||||
|
* as to whether rdtsc can be reordered with later loads,
|
||||||
|
* but no one has ever seen it happen.
|
||||||
|
*/
|
||||||
|
rdtsc_barrier();
|
||||||
|
ret = (cycle_t)vget_cycles();
|
||||||
|
|
||||||
|
last = VVAR(vsyscall_gtod_data).clock.cycle_last;
|
||||||
|
|
||||||
|
if (likely(ret >= last))
|
||||||
|
return ret;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* GCC likes to generate cmov here, but this branch is extremely
|
||||||
|
* predictable (it's just a function of time and the likely is
|
||||||
|
* very likely) and there's a data dependence, so force GCC
|
||||||
|
* to generate a branch instead. I don't barrier() because
|
||||||
|
* we don't actually need a barrier, and if this function
|
||||||
|
* ever gets inlined it will generate worse code.
|
||||||
|
*/
|
||||||
|
asm volatile ("");
|
||||||
|
return last;
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user