kernel_optimize_test/arch/powerpc/lib/string.S
Anton Blanchard 15c2d45d17 powerpc: Add 64bit optimised memcmp
I noticed ksm spending quite a lot of time in memcmp on a large
KVM box. The current memcmp loop is very unoptimised - byte at a
time compares with no loop unrolling. We can do much much better.

Optimise the loop in a few ways:

- Unroll the byte at a time loop

- For large (at least 32 byte) comparisons that are also 8 byte
  aligned, use an unrolled modulo scheduled loop using 8 byte
  loads. This is similar to our glibc memcmp.

A simple microbenchmark testing 10000000 iterations of an 8192 byte
memcmp was used to measure the performance:

baseline:	29.93 s

modified:	 1.70 s

Just over 17x faster.

v2: Incorporated some suggestions from Segher:

- Use andi. instead of rdlicl.

- Convert bdnzt eq, to bdnz. It's just duplicating the earlier compare
  and was a relic from a previous version.

- Don't use cr5, we have plans to use that CR field for fast local
  atomics.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2015-01-23 14:02:55 +11:00

167 lines
2.5 KiB
ArmAsm

/*
* String handling functions for PowerPC.
*
* Copyright (C) 1996 Paul Mackerras.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
.section __ex_table,"a"
PPC_LONG_ALIGN
.text
_GLOBAL(strcpy)
addi r5,r3,-1
addi r4,r4,-1
1: lbzu r0,1(r4)
cmpwi 0,r0,0
stbu r0,1(r5)
bne 1b
blr
/* This clears out any unused part of the destination buffer,
just as the libc version does. -- paulus */
_GLOBAL(strncpy)
PPC_LCMPI 0,r5,0
beqlr
mtctr r5
addi r6,r3,-1
addi r4,r4,-1
1: lbzu r0,1(r4)
cmpwi 0,r0,0
stbu r0,1(r6)
bdnzf 2,1b /* dec ctr, branch if ctr != 0 && !cr0.eq */
bnelr /* if we didn't hit a null char, we're done */
mfctr r5
PPC_LCMPI 0,r5,0 /* any space left in destination buffer? */
beqlr /* we know r0 == 0 here */
2: stbu r0,1(r6) /* clear it out if so */
bdnz 2b
blr
_GLOBAL(strcat)
addi r5,r3,-1
addi r4,r4,-1
1: lbzu r0,1(r5)
cmpwi 0,r0,0
bne 1b
addi r5,r5,-1
1: lbzu r0,1(r4)
cmpwi 0,r0,0
stbu r0,1(r5)
bne 1b
blr
_GLOBAL(strcmp)
addi r5,r3,-1
addi r4,r4,-1
1: lbzu r3,1(r5)
cmpwi 1,r3,0
lbzu r0,1(r4)
subf. r3,r0,r3
beqlr 1
beq 1b
blr
_GLOBAL(strncmp)
PPC_LCMPI 0,r5,0
beq- 2f
mtctr r5
addi r5,r3,-1
addi r4,r4,-1
1: lbzu r3,1(r5)
cmpwi 1,r3,0
lbzu r0,1(r4)
subf. r3,r0,r3
beqlr 1
bdnzt eq,1b
blr
2: li r3,0
blr
_GLOBAL(strlen)
addi r4,r3,-1
1: lbzu r0,1(r4)
cmpwi 0,r0,0
bne 1b
subf r3,r3,r4
blr
#ifdef CONFIG_PPC32
_GLOBAL(memcmp)
PPC_LCMPI 0,r5,0
beq- 2f
mtctr r5
addi r6,r3,-1
addi r4,r4,-1
1: lbzu r3,1(r6)
lbzu r0,1(r4)
subf. r3,r0,r3
bdnzt 2,1b
blr
2: li r3,0
blr
#endif
_GLOBAL(memchr)
PPC_LCMPI 0,r5,0
beq- 2f
mtctr r5
addi r3,r3,-1
1: lbzu r0,1(r3)
cmpw 0,r0,r4
bdnzf 2,1b
beqlr
2: li r3,0
blr
#ifdef CONFIG_PPC32
_GLOBAL(__clear_user)
addi r6,r3,-4
li r3,0
li r5,0
cmplwi 0,r4,4
blt 7f
/* clear a single word */
11: stwu r5,4(r6)
beqlr
/* clear word sized chunks */
andi. r0,r6,3
add r4,r0,r4
subf r6,r0,r6
srwi r0,r4,2
andi. r4,r4,3
mtctr r0
bdz 7f
1: stwu r5,4(r6)
bdnz 1b
/* clear byte sized chunks */
7: cmpwi 0,r4,0
beqlr
mtctr r4
addi r6,r6,3
8: stbu r5,1(r6)
bdnz 8b
blr
90: mr r3,r4
blr
91: mfctr r3
slwi r3,r3,2
add r3,r3,r4
blr
92: mfctr r3
blr
.section __ex_table,"a"
PPC_LONG 11b,90b
PPC_LONG 1b,91b
PPC_LONG 8b,92b
.text
#endif