sh: __copy_user() optimizations for small copies.

This implements a fast-path for small (less than 12 bytes) copies,
with the existing path treated as the slow-path and left as the default
behaviour for all other copy sizes.

Signed-off-by: Stuart Menefy <stuart.menefy@st.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
This commit is contained in:
Stuart Menefy 2007-09-28 12:36:35 +09:00 committed by Paul Mundt
parent 24eb17e081
commit 023ef184ff

View File

@ -141,47 +141,38 @@ ENTRY(__copy_user_page)
.long 9999b, 6000f ; \ .long 9999b, 6000f ; \
.previous .previous
ENTRY(__copy_user) ENTRY(__copy_user)
tst r6,r6 ! Check explicitly for zero ! Check if small number of bytes
bf 1f mov #11,r0
rts
mov #0,r0 ! normal return
1:
mov.l r10,@-r15
mov.l r9,@-r15
mov.l r8,@-r15
mov r4,r3 mov r4,r3
add r6,r3 ! last destination address cmp/gt r0,r6 ! r6 (len) > r0 (11)
mov #12,r0 ! Check if small number of bytes bf/s .L_cleanup_loop_no_pop
cmp/gt r0,r6 add r6,r3 ! last destination address
bt 2f
bra .L_cleanup_loop
nop
2:
neg r5,r0 ! Calculate bytes needed to align source
add #4,r0
and #3,r0
tst r0,r0
bt .L_jump
mov r0,r1
.L_loop1: ! Calculate bytes needed to align to src
! Copy bytes to align source mov.l r11,@-r15
EX( mov.b @r5+,r0 ) neg r5,r0
dt r1 mov.l r10,@-r15
EX( mov.b r0,@r4 ) add #4,r0
mov.l r9,@-r15
and #3,r0
mov.l r8,@-r15
tst r0,r0
bt 2f
1:
! Copy bytes to long word align src
EX( mov.b @r5+,r1 )
dt r0
add #-1,r6 add #-1,r6
bf/s .L_loop1 EX( mov.b r1,@r4 )
bf/s 1b
add #1,r4 add #1,r4
.L_jump: ! Jump to appropriate routine depending on dest
mov r6,r2 ! Calculate number of longwords to copy 2: mov #3,r1
mov r6, r2
and r4,r1
shlr2 r2 shlr2 r2
tst r2,r2
bt .L_cleanup
mov r4,r0 ! Jump to appropriate routine
and #3,r0
mov r0,r1
shll2 r1 shll2 r1
mova .L_jump_tbl,r0 mova .L_jump_tbl,r0
mov.l @(r0,r1),r1 mov.l @(r0,r1),r1
@ -195,43 +186,97 @@ EX( mov.b r0,@r4 )
.long .L_dest10 .long .L_dest10
.long .L_dest11 .long .L_dest11
/*
* Come here if there are fewer than 12 bytes to copy.
*
* Keep the branch target close, so that the bf/s displacement doesn't
* overflow and force a more expensive branch sequence to be inserted.
* This is the fast-path for small copies; the jump via the jump table
* will hit the default slow-path cleanup. -PFM.
*/
! Small-copy fast path: copies r6 bytes one at a time from @r5 to @r4.
! NOTE(review): the entry code appears to branch here before pushing any
! callee-saved registers, which is why this path returns with a bare rts
! and no pops -- confirm against the full __copy_user prologue.
.L_cleanup_loop_no_pop:
tst r6,r6 ! Check explicitly for zero
bt 1f
! Byte loop: r5 = source pointer (post-incremented by the load),
! r4 = destination pointer, r6 = bytes still to copy.
2:
EX( mov.b @r5+,r0 )
! dt decrements r6 and sets T once it reaches zero.
dt r6
EX( mov.b r0,@r4 )
! Loop while T is clear. The add sits in the bf/s delay slot, so r4 is
! advanced on every pass, including the final one.
bf/s 2b
add #1,r4
1: mov #0,r0 ! normal return
! 5000 is the common return point; the fixup code below jumps back here
! with r0 already set, skipping the mov #0,r0 above.
5000:
# Exception handler:
.section .fixup, "ax"
! 6000 is the fixup target named by the EX() accesses above
! (presumably via their exception table entries -- macro not fully visible).
6000:
mov.l 8000f,r1
! r0 = r3 - r4, with the subtract executed in the jmp delay slot. r3 holds
! the last destination address computed on entry, so this is presumably
! the count of bytes left uncopied -- TODO confirm against callers.
mov r3,r0
jmp @r1
sub r4,r0
.align 2
! Literal holding the address of the 5000 return point above.
8000: .long 5000b
.previous
! Return to the caller; nothing to restore from the stack on this path.
rts
nop
! Destination = 00 ! Destination = 00
.L_dest00: .L_dest00:
mov r2,r7 ! Skip the large copy for small transfers
shlr2 r7 mov #(32+32-4), r0
shlr r7 cmp/gt r6, r0 ! r0 (60) > r6 (len)
tst r7,r7 bt 1f
mov #7,r0
bt/s 1f ! Align dest to a 32 byte boundary
and r0,r2 neg r4,r0
.align 2 add #0x20, r0
and #0x1f, r0
tst r0, r0
bt 2f
sub r0, r6
shlr2 r0
3:
EX( mov.l @r5+,r1 )
dt r0
EX( mov.l r1,@r4 )
bf/s 3b
add #4,r4
2: 2:
EX( mov.l @r5+,r0 ) EX( mov.l @r5+,r0 )
EX( mov.l @r5+,r1 )
EX( mov.l @r5+,r2 )
EX( mov.l @r5+,r7 )
EX( mov.l @r5+,r8 ) EX( mov.l @r5+,r8 )
EX( mov.l @r5+,r9 ) EX( mov.l @r5+,r9 )
EX( mov.l @r5+,r10 ) EX( mov.l @r5+,r10 )
EX( mov.l r0,@r4 ) EX( mov.l @r5+,r11 )
EX( mov.l r8,@(4,r4) ) EX( movca.l r0,@r4 )
EX( mov.l r9,@(8,r4) ) add #-32, r6
EX( mov.l r10,@(12,r4) ) EX( mov.l r1,@(4,r4) )
EX( mov.l @r5+,r0 ) mov #32, r0
EX( mov.l @r5+,r8 ) EX( mov.l r2,@(8,r4) )
EX( mov.l @r5+,r9 ) cmp/gt r6, r0 ! r0 (32) > r6 (len)
EX( mov.l @r5+,r10 ) EX( mov.l r7,@(12,r4) )
dt r7 EX( mov.l r8,@(16,r4) )
EX( mov.l r0,@(16,r4) ) EX( mov.l r9,@(20,r4) )
EX( mov.l r8,@(20,r4) ) EX( mov.l r10,@(24,r4) )
EX( mov.l r9,@(24,r4) ) EX( mov.l r11,@(28,r4) )
EX( mov.l r10,@(28,r4) )
bf/s 2b bf/s 2b
add #32,r4 add #32,r4
tst r2,r2
1: mov r6, r0
shlr2 r0
tst r0, r0
bt .L_cleanup bt .L_cleanup
1: 1:
EX( mov.l @r5+,r0 ) EX( mov.l @r5+,r1 )
dt r2 dt r0
EX( mov.l r0,@r4 ) EX( mov.l r1,@r4 )
bf/s 1b bf/s 1b
add #4,r4 add #4,r4
@ -250,7 +295,7 @@ EX( mov.l r0,@r4 )
and r0,r2 and r0,r2
2: 2:
dt r7 dt r7
#ifdef __LITTLE_ENDIAN__ #ifdef CONFIG_CPU_LITTLE_ENDIAN
EX( mov.l @r5+,r0 ) EX( mov.l @r5+,r0 )
EX( mov.l @r5+,r1 ) EX( mov.l @r5+,r1 )
EX( mov.l @r5+,r8 ) EX( mov.l @r5+,r8 )
@ -320,7 +365,7 @@ EX( mov.w r0,@(2,r4) )
1: ! Read longword, write two words per iteration 1: ! Read longword, write two words per iteration
EX( mov.l @r5+,r0 ) EX( mov.l @r5+,r0 )
dt r2 dt r2
#ifdef __LITTLE_ENDIAN__ #ifdef CONFIG_CPU_LITTLE_ENDIAN
EX( mov.w r0,@r4 ) EX( mov.w r0,@r4 )
shlr16 r0 shlr16 r0
EX( mov.w r0,@(2,r4) ) EX( mov.w r0,@(2,r4) )
@ -342,7 +387,7 @@ EX( mov.w r0,@r4 )
! Read longword, write byte, word, byte per iteration ! Read longword, write byte, word, byte per iteration
EX( mov.l @r5+,r0 ) EX( mov.l @r5+,r0 )
dt r2 dt r2
#ifdef __LITTLE_ENDIAN__ #ifdef CONFIG_CPU_LITTLE_ENDIAN
EX( mov.b r0,@r4 ) EX( mov.b r0,@r4 )
shlr8 r0 shlr8 r0
add #1,r4 add #1,r4
@ -379,6 +424,7 @@ EX( mov.b r0,@r4 )
.L_exit: .L_exit:
mov #0,r0 ! normal return mov #0,r0 ! normal return
5000: 5000:
# Exception handler: # Exception handler:
@ -394,5 +440,6 @@ EX( mov.b r0,@r4 )
.previous .previous
mov.l @r15+,r8 mov.l @r15+,r8
mov.l @r15+,r9 mov.l @r15+,r9
mov.l @r15+,r10
rts rts
mov.l @r15+,r10 mov.l @r15+,r11