x86: memcpy, clean up
Impact: cleanup

Make this file more readable by bringing it more in line with the
usual kernel style.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
parent dd1ef4ec47
commit f3b6eaf014
arch/x86/lib/memcpy_64.S
@@ -1,30 +1,38 @@
 /* Copyright 2002 Andi Kleen */
 
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
+
 #include <asm/cpufeature.h>
+#include <asm/dwarf2.h>
 
 /*
  * memcpy - Copy a memory block.
  *
- * Input:
- * rdi destination
- * rsi source
- * rdx count
- *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
  * Output:
  * rax original destination
- */
+ */
 
+/*
+ * memcpy_c() - fast string ops (REP MOVSQ) based variant.
+ *
+ * Calls to this get patched into the kernel image via the
+ * alternative instructions framework:
+ */
 	ALIGN
 memcpy_c:
 	CFI_STARTPROC
-	movq %rdi,%rax
-	movl %edx,%ecx
-	shrl $3,%ecx
-	andl $7,%edx
+	movq %rdi, %rax
+
+	movl %edx, %ecx
+	shrl $3, %ecx
+	andl $7, %edx
 	rep movsq
-	movl %edx,%ecx
+	movl %edx, %ecx
 	rep movsb
 	ret
 	CFI_ENDPROC
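Note: the REP MOVSQ variant above splits the copy into count/8 quadword moves plus a 0..7 byte tail, and returns the original destination in %rax. A minimal C sketch of the same logic (illustrative only - the function name is invented and alignment/aliasing caveats are ignored; the kernel does this directly in assembly):

#include <stddef.h>
#include <stdint.h>

static void *memcpy_c_sketch(void *dest, const void *src, size_t count)
{
	uint64_t *dq = dest;
	const uint64_t *sq = src;
	size_t quads = count >> 3;	/* shrl $3, %ecx */
	size_t tail  = count & 7;	/* andl $7, %edx */

	while (quads--)			/* rep movsq */
		*dq++ = *sq++;

	unsigned char *db = (unsigned char *)dq;
	const unsigned char *sb = (const unsigned char *)sq;
	while (tail--)			/* rep movsb */
		*db++ = *sb++;

	return dest;			/* movq %rdi, %rax */
}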
@@ -33,92 +41,110 @@ ENDPROC(memcpy_c)
 
 ENTRY(__memcpy)
 ENTRY(memcpy)
 	CFI_STARTPROC
-	movq %rdi,%rax
 
-	movl %edx,%ecx
-	shrl $6,%ecx
+	/*
+	 * Put the number of full 64-byte blocks into %ecx.
+	 * Tail portion is handled at the end:
+	 */
+	movq %rdi, %rax
+	movl %edx, %ecx
+	shrl $6, %ecx
 	jz .Lhandle_tail
 
 	.p2align 4
 .Lloop_64:
+	/*
+	 * We decrement the loop index here - and the zero-flag is
+	 * checked at the end of the loop (instructions inbetween do
+	 * not change the zero flag):
+	 */
 	decl %ecx
 
-	movq (%rsi),%r11
-	movq 8(%rsi),%r8
+	/*
+	 * Move in blocks of 4x16 bytes:
+	 */
+	movq 0*8(%rsi), %r11
+	movq 1*8(%rsi), %r8
+	movq %r11, 0*8(%rdi)
+	movq %r8, 1*8(%rdi)
 
-	movq %r11,(%rdi)
-	movq %r8,1*8(%rdi)
+	movq 2*8(%rsi), %r9
+	movq 3*8(%rsi), %r10
+	movq %r9, 2*8(%rdi)
+	movq %r10, 3*8(%rdi)
 
-	movq 2*8(%rsi),%r9
-	movq 3*8(%rsi),%r10
+	movq 4*8(%rsi), %r11
+	movq 5*8(%rsi), %r8
+	movq %r11, 4*8(%rdi)
+	movq %r8, 5*8(%rdi)
 
-	movq %r9,2*8(%rdi)
-	movq %r10,3*8(%rdi)
+	movq 6*8(%rsi), %r9
+	movq 7*8(%rsi), %r10
+	movq %r9, 6*8(%rdi)
+	movq %r10, 7*8(%rdi)
 
-	movq 4*8(%rsi),%r11
-	movq 5*8(%rsi),%r8
+	leaq 64(%rsi), %rsi
+	leaq 64(%rdi), %rdi
 
-	movq %r11,4*8(%rdi)
-	movq %r8,5*8(%rdi)
-
-	movq 6*8(%rsi),%r9
-	movq 7*8(%rsi),%r10
-
-	movq %r9,6*8(%rdi)
-	movq %r10,7*8(%rdi)
-
-	leaq 64(%rsi),%rsi
-	leaq 64(%rdi),%rdi
 	jnz .Lloop_64
 
 .Lhandle_tail:
-	movl %edx,%ecx
-	andl $63,%ecx
-	shrl $3,%ecx
+	movl %edx, %ecx
+	andl $63, %ecx
+	shrl $3, %ecx
 	jz .Lhandle_7
 
 	.p2align 4
 .Lloop_8:
 	decl %ecx
-	movq (%rsi),%r8
-	movq %r8,(%rdi)
-	leaq 8(%rdi),%rdi
-	leaq 8(%rsi),%rsi
+	movq (%rsi), %r8
+	movq %r8, (%rdi)
+	leaq 8(%rdi), %rdi
+	leaq 8(%rsi), %rsi
 	jnz .Lloop_8
 
 .Lhandle_7:
-	movl %edx,%ecx
-	andl $7,%ecx
-	jz .Lende
+	movl %edx, %ecx
+	andl $7, %ecx
+	jz .Lend
 
 	.p2align 4
 .Lloop_1:
-	movb (%rsi),%r8b
-	movb %r8b,(%rdi)
+	movb (%rsi), %r8b
+	movb %r8b, (%rdi)
 	incq %rdi
 	incq %rsi
 	decl %ecx
 	jnz .Lloop_1
 
-.Lende:
+.Lend:
 	ret
 	CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)
 
-	/* Some CPUs run faster using the string copy instructions.
-	   It is also a lot simpler. Use this when possible */
+	/*
+	 * Some CPUs run faster using the string copy instructions.
+	 * It is also a lot simpler. Use this when possible:
+	 */
 
-	.section .altinstr_replacement,"ax"
+	.section .altinstr_replacement, "ax"
 1:	.byte 0xeb				/* jmp <disp8> */
 	.byte (memcpy_c - memcpy) - (2f - 1b)	/* offset */
 2:
 	.previous
-	.section .altinstructions,"a"
+
+	.section .altinstructions, "a"
 	.align 8
 	.quad memcpy
 	.quad 1b
 	.byte X86_FEATURE_REP_GOOD
-	/* Replace only beginning, memcpy is used to apply alternatives, so it
-	 * is silly to overwrite itself with nops - reboot is only outcome... */
+
+	/*
+	 * Replace only beginning, memcpy is used to apply alternatives,
+	 * so it is silly to overwrite itself with nops - reboot is the
+	 * only outcome...
+	 */
 	.byte 2b - 1b
 	.byte 2b - 1b
 	.previous
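Note: the restructured open-coded path copies 64-byte blocks as four 2x8-byte load/store pairs (.Lloop_64), then 8-byte words (.Lloop_8), then single bytes (.Lloop_1). A rough C rendering of that control flow (illustrative only - names are invented and unaligned-access caveats are ignored):

#include <stddef.h>
#include <stdint.h>

static void *memcpy_open_coded_sketch(void *dest, const void *src, size_t count)
{
	unsigned char *d = dest;
	const unsigned char *s = src;
	size_t blocks = count >> 6;		/* shrl $6, %ecx */

	while (blocks--) {			/* .Lloop_64 */
		const uint64_t *sq = (const uint64_t *)s;
		uint64_t *dq = (uint64_t *)d;

		/* four 2x8-byte load/store pairs == one 64-byte block */
		dq[0] = sq[0]; dq[1] = sq[1];
		dq[2] = sq[2]; dq[3] = sq[3];
		dq[4] = sq[4]; dq[5] = sq[5];
		dq[6] = sq[6]; dq[7] = sq[7];

		s += 64;			/* leaq 64(%rsi), %rsi */
		d += 64;			/* leaq 64(%rdi), %rdi */
	}

	size_t words = (count & 63) >> 3;	/* andl $63 / shrl $3 */
	while (words--) {			/* .Lloop_8 */
		*(uint64_t *)d = *(const uint64_t *)s;
		d += 8;
		s += 8;
	}

	size_t bytes = count & 7;		/* andl $7, %ecx */
	while (bytes--)				/* .Lloop_1 */
		*d++ = *s++;

	return dest;				/* movq %rdi, %rax */
}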
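Note: the .altinstructions record at the end pairs the patch site with its replacement: the 2-byte "jmp memcpy_c" stub in .altinstr_replacement gets copied over the first two bytes of memcpy at boot when the CPU has the REP_GOOD feature - which is why only the beginning is replaced, as the comment in the diff explains. The record's shape roughly corresponds to struct alt_instr of that kernel era (a sketch derived from the directives above; field names are paraphrased, not authoritative):

struct alt_instr_sketch {
	void *instr;			/* .quad memcpy: code to patch          */
	void *replacement;		/* .quad 1b: the jmp <disp8> stub       */
	unsigned char cpuid;		/* .byte X86_FEATURE_REP_GOOD           */
	unsigned char instrlen;		/* .byte 2b - 1b: bytes at patch site   */
	unsigned char replacementlen;	/* .byte 2b - 1b: bytes to copy over it */
};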