/* kernel_optimize_test/arch/x86_64/lib/memcpy.S */

/* Copyright 2002 Andi Kleen */
	
	#include <asm/cpufeature.h>		
/*
 * memcpy - Copy a memory block.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count in bytes (all 64 bits are honoured)
 *
 * Output:
 * rax original destination
 *
 * Clobbers:
 * rcx, rsi, rdi, r8, r9, r10, r11 and the flags.
 * No callee-saved register is used, so nothing is pushed on the stack.
 */

	.globl __memcpy
	.globl memcpy
	.p2align 4
__memcpy:
memcpy:
	movq %rdi,%rax		/* return value: original destination */

	/*
	 * rcx = number of whole 64-byte chunks.  The full 64-bit count is
	 * used here so that copies of 4GB and more are not truncated.
	 */
	movq %rdx,%rcx
	shrq $6,%rcx
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	/*
	 * The decrement sets ZF for the jnz at the bottom of the loop.
	 * Everything in between is mov/lea, which leave the flags alone,
	 * so do not insert flag-modifying instructions into the loop body.
	 */
	decq %rcx

	movq 0*8(%rsi),%r11
	movq 1*8(%rsi),%r8

	movq %r11,0*8(%rdi)
	movq %r8,1*8(%rdi)

	movq 2*8(%rsi),%r9
	movq 3*8(%rsi),%r10

	movq %r9,2*8(%rdi)
	movq %r10,3*8(%rdi)

	movq 4*8(%rsi),%r11
	movq 5*8(%rsi),%r8

	movq %r11,4*8(%rdi)
	movq %r8,5*8(%rdi)

	movq 6*8(%rsi),%r9
	movq 7*8(%rsi),%r10

	movq %r9,6*8(%rdi)
	movq %r10,7*8(%rdi)

	leaq 64(%rsi),%rsi	/* lea advances the pointers without touching ZF */
	leaq 64(%rdi),%rdi
	jnz  .Lloop_64

.Lhandle_tail:
	/*
	 * ecx = number of whole quadwords left in the last partial chunk.
	 * Only bits 3..5 of the count matter, so 32-bit arithmetic is enough.
	 */
	movl %edx,%ecx
	andl $63,%ecx
	shrl $3,%ecx
	jz   .Lhandle_7
	.p2align 4
.Lloop_8:
	decl %ecx		/* sets ZF for the jnz below; mov/lea keep it */
	movq (%rsi),%r8
	movq %r8,(%rdi)
	leaq 8(%rdi),%rdi
	leaq 8(%rsi),%rsi
	jnz  .Lloop_8

.Lhandle_7:
	/* ecx = trailing bytes (count modulo 8), copied one at a time. */
	movl %edx,%ecx
	andl $7,%ecx
	jz .Lende
	.p2align 4
.Lloop_1:
	movb (%rsi),%r8b
	movb %r8b,(%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1

.Lende:
	ret
	/* End marker: .Lfinal-memcpy is the byte length of the routine above. */
.Lfinal:
	
	/* C stepping K8 runs faster using the string copy instructions.
	   It is also a lot simpler. Use this when possible.

	   The record below describes that alternative. Its fields, in order:
	     .quad  address of the original code (memcpy)
	     .quad  address of the replacement code (memcpy_c)
	     .byte  CPU feature that selects the replacement
	     .byte  length in bytes of the original code
	     .byte  length in bytes of the replacement code
	   Both lengths are emitted as single bytes, so each routine must stay
	   shorter than 256 bytes.
	   NOTE(review): presumably this matches the layout the kernel's
	   alternative-patching code expects, and the replacement must not be
	   longer than the original -- confirm against that code. */
	
	.section .altinstructions,"a"
	.align 8
	/* original code to be patched */
	.quad  memcpy
	/* replacement code */
	.quad  memcpy_c
	/* feature bit that enables the replacement */
	.byte  X86_FEATURE_K8_C
	/* size of the original routine, memcpy up to .Lfinal */
	.byte  .Lfinal-memcpy
	/* size of the replacement routine */
	.byte  memcpy_c_end-memcpy_c	
	.previous

	.section .altinstr_replacement,"ax"
/*
 * memcpy_c - memcpy variant built on the string copy instructions.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count in bytes (all 64 bits are honoured)
 *
 * Output:
 * rax original destination
 *
 * Clobbers:
 * rcx, rdx, rsi, rdi and the flags.
 *
 * NOTE(review): the copy runs forward only if the direction flag is
 * clear on entry -- assumed to be guaranteed by the caller, confirm.
 */
memcpy_c:
	movq %rdi,%rax		/* return value: original destination */

	/*
	 * rcx = number of whole quadwords.  Computed from the full 64-bit
	 * count so that copies of 4GB and more are not truncated.
	 */
	movq %rdx,%rcx
	shrq $3,%rcx
	andl $7,%edx		/* edx = trailing bytes (count modulo 8) */
	rep
	movsq

	/* rcx is zero after rep; the 32-bit move zero-extends the remainder. */
	movl %edx,%ecx
	rep
	movsb
	ret
memcpy_c_end:
	.previous
Linux-2.6.12-rc2

Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it.

Let it rip!

2005-04-17 06:20:36 +08:00
			`/* Copyright 2002 Andi Kleen */`

			`#include <asm/cpufeature.h>`
			`/*`
			`* memcpy - Copy a memory block.`
			`*`
			`* Input:`
			`* rdi destination`
			`* rsi source`
			`* rdx count`
			`*`
			`* Output:`
			`* rax original destination`
			`*/`

			`.globl __memcpy`
			`.globl memcpy`
			`.p2align 4`
			`__memcpy:`
			`memcpy:`
			`pushq %rbx`
			`movq %rdi,%rax`

			`movl %edx,%ecx`
			`shrl $6,%ecx`
			`jz .Lhandle_tail`

			`.p2align 4`
			`.Lloop_64:`
			`decl %ecx`

			`movq (%rsi),%r11`
			`movq 8(%rsi),%r8`

			`movq %r11,(%rdi)`
			`movq %r8,1*8(%rdi)`

			`movq 2*8(%rsi),%r9`
			`movq 3*8(%rsi),%r10`

			`movq %r9,2*8(%rdi)`
			`movq %r10,3*8(%rdi)`

			`movq 4*8(%rsi),%r11`
			`movq 5*8(%rsi),%r8`

			`movq %r11,4*8(%rdi)`
			`movq %r8,5*8(%rdi)`

			`movq 6*8(%rsi),%r9`
			`movq 7*8(%rsi),%r10`

			`movq %r9,6*8(%rdi)`
			`movq %r10,7*8(%rdi)`

			`leaq 64(%rsi),%rsi`
			`leaq 64(%rdi),%rdi`
			`jnz .Lloop_64`

			`.Lhandle_tail:`
			`movl %edx,%ecx`
			`andl $63,%ecx`
			`shrl $3,%ecx`
			`jz .Lhandle_7`
			`.p2align 4`
			`.Lloop_8:`
			`decl %ecx`
			`movq (%rsi),%r8`
			`movq %r8,(%rdi)`
			`leaq 8(%rdi),%rdi`
			`leaq 8(%rsi),%rsi`
			`jnz .Lloop_8`

			`.Lhandle_7:`
			`movl %edx,%ecx`
			`andl $7,%ecx`
			`jz .Lende`
			`.p2align 4`
			`.Lloop_1:`
			`movb (%rsi),%r8b`
			`movb %r8b,(%rdi)`
			`incq %rdi`
			`incq %rsi`
			`decl %ecx`
			`jnz .Lloop_1`

			`.Lende:`
			`popq %rbx`
			`ret`
			`.Lfinal:`

			`/* C stepping K8 run faster using the string copy instructions.`
			`It is also a lot simpler. Use this when possible */`

			`.section .altinstructions,"a"`
			`.align 8`
			`.quad memcpy`
			`.quad memcpy_c`
			`.byte X86_FEATURE_K8_C`
			`.byte .Lfinal-memcpy`
			`.byte memcpy_c_end-memcpy_c`
			`.previous`

			`.section .altinstr_replacement,"ax"`
			`/* rdi destination`
			`* rsi source`
			`* rdx count`
			`*/`
			`memcpy_c:`
			`movq %rdi,%rax`
			`movl %edx,%ecx`
			`shrl $3,%ecx`
			`andl $7,%edx`
			`rep`
			`movsq`
			`movl %edx,%ecx`
			`rep`
			`movsb`
			`ret`
			`memcpy_c_end:`
			`.previous`