kernel_optimize_test/arch/x86/lib/copy_page_64.S
H. Peter Anvin 83a7a2ad2a x86, alternatives: Use 16-bit numbers for cpufeature index
We already have cpufeature indicies above 255, so use a 16-bit number
for the alternatives index.  This consumes a padding field and so
doesn't add any size, but it means that abusing the padding field to
create assembly errors on overflow no longer works.  We can retain the
test simply by redirecting it to the .discard section, however.

[ v3: updated to include open-coded locations ]

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
LKML-Reference: <tip-f88731e3068f9d1392ba71cc9f50f035d26a0d4f@git.kernel.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-07-07 10:36:28 -07:00

120 lines
2.3 KiB
ArmAsm

/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
#include <linux/linkage.h>
#include <asm/dwarf2.h>
ALIGN
copy_page_c:
CFI_STARTPROC
movl $4096/8,%ecx
rep movsq
ret
CFI_ENDPROC
ENDPROC(copy_page_c)
/* Don't use streaming store because it's better when the target
ends up in cache. */
/* Could vary the prefetch distance based on SMP/UP */
ENTRY(copy_page)
CFI_STARTPROC
subq $3*8,%rsp
CFI_ADJUST_CFA_OFFSET 3*8
movq %rbx,(%rsp)
CFI_REL_OFFSET rbx, 0
movq %r12,1*8(%rsp)
CFI_REL_OFFSET r12, 1*8
movq %r13,2*8(%rsp)
CFI_REL_OFFSET r13, 2*8
movl $(4096/64)-5,%ecx
.p2align 4
.Loop64:
dec %rcx
movq (%rsi), %rax
movq 8 (%rsi), %rbx
movq 16 (%rsi), %rdx
movq 24 (%rsi), %r8
movq 32 (%rsi), %r9
movq 40 (%rsi), %r10
movq 48 (%rsi), %r11
movq 56 (%rsi), %r12
prefetcht0 5*64(%rsi)
movq %rax, (%rdi)
movq %rbx, 8 (%rdi)
movq %rdx, 16 (%rdi)
movq %r8, 24 (%rdi)
movq %r9, 32 (%rdi)
movq %r10, 40 (%rdi)
movq %r11, 48 (%rdi)
movq %r12, 56 (%rdi)
leaq 64 (%rsi), %rsi
leaq 64 (%rdi), %rdi
jnz .Loop64
movl $5,%ecx
.p2align 4
.Loop2:
decl %ecx
movq (%rsi), %rax
movq 8 (%rsi), %rbx
movq 16 (%rsi), %rdx
movq 24 (%rsi), %r8
movq 32 (%rsi), %r9
movq 40 (%rsi), %r10
movq 48 (%rsi), %r11
movq 56 (%rsi), %r12
movq %rax, (%rdi)
movq %rbx, 8 (%rdi)
movq %rdx, 16 (%rdi)
movq %r8, 24 (%rdi)
movq %r9, 32 (%rdi)
movq %r10, 40 (%rdi)
movq %r11, 48 (%rdi)
movq %r12, 56 (%rdi)
leaq 64(%rdi),%rdi
leaq 64(%rsi),%rsi
jnz .Loop2
movq (%rsp),%rbx
CFI_RESTORE rbx
movq 1*8(%rsp),%r12
CFI_RESTORE r12
movq 2*8(%rsp),%r13
CFI_RESTORE r13
addq $3*8,%rsp
CFI_ADJUST_CFA_OFFSET -3*8
ret
.Lcopy_page_end:
CFI_ENDPROC
ENDPROC(copy_page)
/* Some CPUs run faster using the string copy instructions.
It is also a lot simpler. Use this when possible */
#include <asm/cpufeature.h>
.section .altinstr_replacement,"ax"
1: .byte 0xeb /* jmp <disp8> */
.byte (copy_page_c - copy_page) - (2f - 1b) /* offset */
2:
.previous
.section .altinstructions,"a"
.align 8
.quad copy_page
.quad 1b
.word X86_FEATURE_REP_GOOD
.byte .Lcopy_page_end - copy_page
.byte 2b - 1b
.previous