crypto: x86/chacha20 - Support partial lengths in 4-block SSSE3 variant
Add a length argument to the quad block function for SSSE3, so the block
function may XOR only a partial length of four blocks.

As we already have the stack set up, the partial XORing does not need to
set it up again. This gives a slightly different function trailer, so we
keep that separate from the 1-block function.

Signed-off-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent e4e72063d3
commit db8e15a249
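For context, here is a minimal C sketch of how glue code can drive the new interface. The two prototypes and the CHACHA20_BLOCK_SIZE constant come from the glue code changed at the end of this diff; the chacha20_partial_sketch() wrapper and its trailing-bytes call are illustrative only, since this patch itself still invokes the 4-block variant from the existing full-four-block loop and merely forwards the remaining byte count.

#include <linux/kernel.h>
#include <linux/linkage.h>
#include <linux/types.h>

#define CHACHA20_BLOCK_SIZE	64

asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
					 unsigned int len);
asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
					  unsigned int len);

/* Illustrative caller, not part of this patch: with the new length
 * argument the 4-block variant can also finish a tail of fewer than
 * four blocks (1..255 bytes) in a single call. */
static void chacha20_partial_sketch(u32 *state, u8 *dst, const u8 *src,
				    unsigned int bytes)
{
	while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
		/* len >= 256: the assembly XORs four full blocks. */
		chacha20_4block_xor_ssse3(state, dst, src, bytes);
		bytes -= CHACHA20_BLOCK_SIZE * 4;
		src += CHACHA20_BLOCK_SIZE * 4;
		dst += CHACHA20_BLOCK_SIZE * 4;
		state[12] += 4;		/* advance the 32-bit block counter */
	}
	if (bytes) {
		/* len < 256: the assembly stops at the cmp/jl checks and,
		 * for a ragged 16-byte lane, uses the .Lxorpart4 trailer. */
		chacha20_4block_xor_ssse3(state, dst, src, bytes);
		state[12] += DIV_ROUND_UP(bytes, CHACHA20_BLOCK_SIZE);
	}
}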
@@ -191,8 +191,9 @@ ENDPROC(chacha20_block_xor_ssse3)
 
 ENTRY(chacha20_4block_xor_ssse3)
 	# %rdi: Input state matrix, s
-	# %rsi: 4 data blocks output, o
-	# %rdx: 4 data blocks input, i
+	# %rsi: up to 4 data blocks output, o
+	# %rdx: up to 4 data blocks input, i
+	# %rcx: input/output length in bytes
 
 	# This function encrypts four consecutive ChaCha20 blocks by loading the
 	# the state matrix in SSE registers four times. As we need some scratch
@@ -207,6 +208,7 @@ ENTRY(chacha20_4block_xor_ssse3)
 	lea	8(%rsp),%r10
 	sub	$0x80,%rsp
 	and	$~63,%rsp
+	mov	%rcx,%rax
 
 	# x0..15[0-3] = s0..3[0..3]
 	movq	0x00(%rdi),%xmm1
@@ -617,58 +619,143 @@ ENTRY(chacha20_4block_xor_ssse3)
 
 	# xor with corresponding input, write to output
 	movdqa	0x00(%rsp),%xmm0
+	cmp	$0x10,%rax
+	jl	.Lxorpart4
 	movdqu	0x00(%rdx),%xmm1
 	pxor	%xmm1,%xmm0
 	movdqu	%xmm0,0x00(%rsi)
-	movdqa	0x10(%rsp),%xmm0
-	movdqu	0x80(%rdx),%xmm1
+
+	movdqu	%xmm4,%xmm0
+	cmp	$0x20,%rax
+	jl	.Lxorpart4
+	movdqu	0x10(%rdx),%xmm1
 	pxor	%xmm1,%xmm0
-	movdqu	%xmm0,0x80(%rsi)
+	movdqu	%xmm0,0x10(%rsi)
+
+	movdqu	%xmm8,%xmm0
+	cmp	$0x30,%rax
+	jl	.Lxorpart4
+	movdqu	0x20(%rdx),%xmm1
+	pxor	%xmm1,%xmm0
+	movdqu	%xmm0,0x20(%rsi)
+
+	movdqu	%xmm12,%xmm0
+	cmp	$0x40,%rax
+	jl	.Lxorpart4
+	movdqu	0x30(%rdx),%xmm1
+	pxor	%xmm1,%xmm0
+	movdqu	%xmm0,0x30(%rsi)
+
 	movdqa	0x20(%rsp),%xmm0
+	cmp	$0x50,%rax
+	jl	.Lxorpart4
 	movdqu	0x40(%rdx),%xmm1
 	pxor	%xmm1,%xmm0
 	movdqu	%xmm0,0x40(%rsi)
+
+	movdqu	%xmm6,%xmm0
+	cmp	$0x60,%rax
+	jl	.Lxorpart4
+	movdqu	0x50(%rdx),%xmm1
+	pxor	%xmm1,%xmm0
+	movdqu	%xmm0,0x50(%rsi)
+
+	movdqu	%xmm10,%xmm0
+	cmp	$0x70,%rax
+	jl	.Lxorpart4
+	movdqu	0x60(%rdx),%xmm1
+	pxor	%xmm1,%xmm0
+	movdqu	%xmm0,0x60(%rsi)
+
+	movdqu	%xmm14,%xmm0
+	cmp	$0x80,%rax
+	jl	.Lxorpart4
+	movdqu	0x70(%rdx),%xmm1
+	pxor	%xmm1,%xmm0
+	movdqu	%xmm0,0x70(%rsi)
+
+	movdqa	0x10(%rsp),%xmm0
+	cmp	$0x90,%rax
+	jl	.Lxorpart4
+	movdqu	0x80(%rdx),%xmm1
+	pxor	%xmm1,%xmm0
+	movdqu	%xmm0,0x80(%rsi)
+
+	movdqu	%xmm5,%xmm0
+	cmp	$0xa0,%rax
+	jl	.Lxorpart4
+	movdqu	0x90(%rdx),%xmm1
+	pxor	%xmm1,%xmm0
+	movdqu	%xmm0,0x90(%rsi)
+
+	movdqu	%xmm9,%xmm0
+	cmp	$0xb0,%rax
+	jl	.Lxorpart4
+	movdqu	0xa0(%rdx),%xmm1
+	pxor	%xmm1,%xmm0
+	movdqu	%xmm0,0xa0(%rsi)
+
+	movdqu	%xmm13,%xmm0
+	cmp	$0xc0,%rax
+	jl	.Lxorpart4
+	movdqu	0xb0(%rdx),%xmm1
+	pxor	%xmm1,%xmm0
+	movdqu	%xmm0,0xb0(%rsi)
+
 	movdqa	0x30(%rsp),%xmm0
+	cmp	$0xd0,%rax
+	jl	.Lxorpart4
 	movdqu	0xc0(%rdx),%xmm1
 	pxor	%xmm1,%xmm0
 	movdqu	%xmm0,0xc0(%rsi)
-	movdqu	0x10(%rdx),%xmm1
-	pxor	%xmm1,%xmm4
-	movdqu	%xmm4,0x10(%rsi)
-	movdqu	0x90(%rdx),%xmm1
-	pxor	%xmm1,%xmm5
-	movdqu	%xmm5,0x90(%rsi)
-	movdqu	0x50(%rdx),%xmm1
-	pxor	%xmm1,%xmm6
-	movdqu	%xmm6,0x50(%rsi)
-	movdqu	0xd0(%rdx),%xmm1
-	pxor	%xmm1,%xmm7
-	movdqu	%xmm7,0xd0(%rsi)
-	movdqu	0x20(%rdx),%xmm1
-	pxor	%xmm1,%xmm8
-	movdqu	%xmm8,0x20(%rsi)
-	movdqu	0xa0(%rdx),%xmm1
-	pxor	%xmm1,%xmm9
-	movdqu	%xmm9,0xa0(%rsi)
-	movdqu	0x60(%rdx),%xmm1
-	pxor	%xmm1,%xmm10
-	movdqu	%xmm10,0x60(%rsi)
-	movdqu	0xe0(%rdx),%xmm1
-	pxor	%xmm1,%xmm11
-	movdqu	%xmm11,0xe0(%rsi)
-	movdqu	0x30(%rdx),%xmm1
-	pxor	%xmm1,%xmm12
-	movdqu	%xmm12,0x30(%rsi)
-	movdqu	0xb0(%rdx),%xmm1
-	pxor	%xmm1,%xmm13
-	movdqu	%xmm13,0xb0(%rsi)
-	movdqu	0x70(%rdx),%xmm1
-	pxor	%xmm1,%xmm14
-	movdqu	%xmm14,0x70(%rsi)
-	movdqu	0xf0(%rdx),%xmm1
-	pxor	%xmm1,%xmm15
-	movdqu	%xmm15,0xf0(%rsi)
+
+	movdqu	%xmm7,%xmm0
+	cmp	$0xe0,%rax
+	jl	.Lxorpart4
+	movdqu	0xd0(%rdx),%xmm1
+	pxor	%xmm1,%xmm0
+	movdqu	%xmm0,0xd0(%rsi)
+
+	movdqu	%xmm11,%xmm0
+	cmp	$0xf0,%rax
+	jl	.Lxorpart4
+	movdqu	0xe0(%rdx),%xmm1
+	pxor	%xmm1,%xmm0
+	movdqu	%xmm0,0xe0(%rsi)
+
+	movdqu	%xmm15,%xmm0
+	cmp	$0x100,%rax
+	jl	.Lxorpart4
+	movdqu	0xf0(%rdx),%xmm1
+	pxor	%xmm1,%xmm0
+	movdqu	%xmm0,0xf0(%rsi)
 
+.Ldone4:
 	lea	-8(%r10),%rsp
 	ret
+
+.Lxorpart4:
+	# xor remaining bytes from partial register into output
+	mov	%rax,%r9
+	and	$0x0f,%r9
+	jz	.Ldone4
+	and	$~0x0f,%rax
+
+	mov	%rsi,%r11
+
+	lea	(%rdx,%rax),%rsi
+	mov	%rsp,%rdi
+	mov	%r9,%rcx
+	rep movsb
+
+	pxor	0x00(%rsp),%xmm0
+	movdqa	%xmm0,0x00(%rsp)
+
+	mov	%rsp,%rsi
+	lea	(%r11,%rax),%rdi
+	mov	%r9,%rcx
+	rep movsb
+
+	jmp	.Ldone4
+
 ENDPROC(chacha20_4block_xor_ssse3)
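As an aside, a rough C analogue of the trailer added above may help when reading the assembly: each 16-byte lane is gated by a cmp/jl pair against the remaining length in %rax, and .Lxorpart4 finishes a ragged lane by bouncing it through the aligned stack buffer with two rep movsb copies. All identifiers below are hypothetical and for illustration only; the real code operates on XMM registers and the per-call stack area.

#include <linux/string.h>
#include <linux/types.h>

/* Hypothetical C model of the per-lane XOR loop and the .Lxorpart4 path.
 * keystream[] stands in for the 16 lanes of generated keystream that the
 * assembly keeps in XMM registers and in the stack buffer. */
static void chacha20_4block_xor_model(const u8 keystream[16][16], u8 *dst,
				      const u8 *src, unsigned int len)
{
	u8 buf[16] = {};
	unsigned int lane, i;

	for (lane = 0; lane < 16; lane++) {
		unsigned int off = lane * 16;

		if (len >= off + 16) {
			/* Full lane: movdqu load, pxor, movdqu store. */
			for (i = 0; i < 16; i++)
				dst[off + i] = src[off + i] ^ keystream[lane][i];
			continue;
		}

		/* Partial lane (.Lxorpart4): copy the remaining bytes into a
		 * 16-byte scratch buffer, XOR the whole lane, and copy the
		 * same number of bytes back out; bytes past the end of the
		 * request are never written to the output. */
		memcpy(buf, src + off, len - off);
		for (i = 0; i < 16; i++)
			buf[i] ^= keystream[lane][i];
		memcpy(dst + off, buf, len - off);
		return;
	}
}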
@@ -21,7 +21,8 @@
 
 asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
					 unsigned int len);
-asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+					  unsigned int len);
 #ifdef CONFIG_AS_AVX2
 asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
 static bool chacha20_use_avx2;
@@ -42,7 +43,7 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
 	}
 #endif
 	while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
-		chacha20_4block_xor_ssse3(state, dst, src);
+		chacha20_4block_xor_ssse3(state, dst, src, bytes);
 		bytes -= CHACHA20_BLOCK_SIZE * 4;
 		src += CHACHA20_BLOCK_SIZE * 4;
 		dst += CHACHA20_BLOCK_SIZE * 4;