/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */

/*
 * BLAKE2s block compression, x86-64, GNU as (AT&T operand order).
 *
 * Both entry points below read their arguments from the same registers:
 *   %rdi  pointer to 48 bytes that are loaded on entry and stored back on
 *         exit: two 16-byte chaining rows at +0x00/+0x10 and a 16-byte
 *         counter/flag row at +0x20
 *   %rsi  pointer to the input, consumed in 64-byte blocks
 *   %rdx  number of 64-byte blocks to process
 *   %rcx  amount added to the low 64 bits of the +0x20 row before each block
 * NOTE(review): the C prototype lives in the glue code, which is not part of
 * this file; confirm the argument names and types there.
 *
 * Only caller-saved registers are written (%rax, %rcx, %rdx, %rsi, %r8 and
 * xmm/ymm registers); the stack is not used.
 */

#include <linux/linkage.h>

/* Initial row values: IV+0x00 seeds row 2, IV+0x10 is XORed into row 3. */
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F

/* pshufb mask: rotates every 32-bit lane by 16 bits. */
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302

/* pshufb mask: rotates every 32-bit lane right by 8 bits. */
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201

/*
 * Message schedule for the SSSE3 routine: 10 rows (one per round) of 16
 * byte-sized word indices into the current 64-byte block, 160 bytes total.
 * Each group of four indices is gathered into one vector of message words.
 */
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12

#ifdef CONFIG_AS_AVX512
/*
 * Message schedule for the AVX-512 routine: 10 rows of 16 dword indices
 * (640 bytes), consumed as vpermi2d index vectors.  The routine overwrites
 * its message registers with the permuted result every round, so each row
 * is applied to the previous round's ordering, not to the original block.
 */
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
.long  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.long  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
.long 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
.long 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
.long  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
.long  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
.long  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
.long  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
.long 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
.long  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
#endif /* CONFIG_AS_AVX512 */

.text
/*
 * blake2s_compress_ssse3
 *
 * In:    %rdi, %rsi, %rdx, %rcx as described at the top of the file.
 *        Returns immediately, touching nothing, when %rdx is zero.
 * Out:   the 48 bytes at (%rdi) are rewritten.
 * Clobb: %rax, %rcx, %rdx, %rsi, %r8, xmm0-xmm8, xmm10-xmm15, flags.
 *
 * Register roles inside the loops:
 *   xmm0, xmm1    working rows 0 and 1 (start each block as the chaining rows)
 *   xmm2, xmm3    working rows 2 and 3
 *   xmm4-xmm7     gathered message words
 *   xmm8          scratch for the shift/or rotations
 *   xmm10, xmm11  chaining rows as they were on entry to the current block
 *   xmm12, xmm13  ROT16 / ROR328 shuffle masks
 *   xmm14         counter/flag row, xmm15 = per-block increment
 *   %rcx          cursor into SIGMA, %r8 = end of SIGMA
 */
SYM_FUNC_START(blake2s_compress_ssse3)
	testq		%rdx,%rdx
	je		.Lendofloop
	movdqu		(%rdi),%xmm0
	movdqu		0x10(%rdi),%xmm1
	movdqa		ROT16(%rip),%xmm12
	movdqa		ROR328(%rip),%xmm13
	movdqu		0x20(%rdi),%xmm14
	movq		%rcx,%xmm15		/* high 64 bits are zeroed */
	leaq		SIGMA+0xa0(%rip),%r8	/* 0xa0 = 10 rows * 16 bytes */
	jmp		.Lbeginofloop
	.align		32
.Lbeginofloop:
	/* Per block: snapshot the chaining rows, bump the counter, seed rows 2/3. */
	movdqa		%xmm0,%xmm10
	movdqa		%xmm1,%xmm11
	paddq		%xmm15,%xmm14
	movdqa		IV(%rip),%xmm2
	movdqa		%xmm14,%xmm3
	pxor		IV+0x10(%rip),%xmm3
	leaq		SIGMA(%rip),%rcx	/* %rcx is free: its value lives in xmm15 */
.Lroundloop:
	/* Gather message words selected by SIGMA bytes 0-3 into xmm4. */
	movzbl		(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0x1(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x2(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x3(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
	paddd		%xmm4,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3		/* rotate lanes by 16 */
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8		/* rotate lanes right by 12 */
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	/* Gather message words selected by SIGMA bytes 4-7 into xmm5. */
	movzbl		0x4(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x5(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x6(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0x7(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	paddd		%xmm5,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3		/* rotate lanes right by 8 */
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8		/* rotate lanes right by 7 */
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	/* Rotate the lanes of rows 0, 3 and 2 against row 1 for the second half. */
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2
	/* Gather message words selected by SIGMA bytes 8-11 into xmm6. */
	movzbl		0x8(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x9(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xa(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xb(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	paddd		%xmm6,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3		/* rotate lanes by 16 */
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8		/* rotate lanes right by 12 */
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	/* Gather message words selected by SIGMA bytes 12-15 into xmm7. */
	movzbl		0xc(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xd(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xe(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0xf(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	paddd		%xmm7,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3		/* rotate lanes right by 8 */
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8		/* rotate lanes right by 7 */
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	/* Undo the lane rotation applied before the second half. */
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2
	addq		$0x10,%rcx		/* next SIGMA row */
	cmpq		%r8,%rcx
	jnz		.Lroundloop
	/* Fold rows 2/3 and the saved chaining rows into the new chaining rows. */
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1
	addq		$0x40,%rsi		/* next 64-byte block */
	decq		%rdx
	jnz		.Lbeginofloop
	movdqu		%xmm0,(%rdi)
	movdqu		%xmm1,0x10(%rdi)
	movdqu		%xmm14,0x20(%rdi)
.Lendofloop:
	RET
SYM_FUNC_END(blake2s_compress_ssse3)

#ifdef CONFIG_AS_AVX512
/*
 * blake2s_compress_avx512
 *
 * Same register interface and result as blake2s_compress_ssse3, built on
 * vprord for the rotations and vpermi2d (indexed by SIGMA2) for the message
 * schedule.  Like the SSSE3 routine it returns immediately, touching
 * nothing, when %rdx is zero; without that check a zero count would wrap
 * the block counter in the main loop.
 *
 * Clobb: %rax, %rcx, %rdx, %rsi, xmm0-xmm5, ymm6-ymm9, xmm10, xmm11,
 *        xmm14, xmm15, flags.
 *
 * Register roles inside the loops:
 *   xmm0-xmm3     working rows 0-3
 *   xmm4          counter/flag row, xmm5 = per-block increment
 *   ymm6, ymm7    message words in the previous round's order
 *   ymm8, ymm9    message words permuted for the current round
 *   xmm10, xmm11  chaining rows as they were on entry to the current block
 *   xmm14, xmm15  IV+0x00 / IV+0x10
 *   %rax          cursor into SIGMA2, %cl = rounds remaining
 */
SYM_FUNC_START(blake2s_compress_avx512)
	testq		%rdx,%rdx
	je		.Lblake2s_compress_avx512_done
	vmovdqu		(%rdi),%xmm0
	vmovdqu		0x10(%rdi),%xmm1
	vmovdqu		0x20(%rdi),%xmm4
	vmovq		%rcx,%xmm5		/* high 64 bits are zeroed */
	vmovdqa		IV(%rip),%xmm14
	vmovdqa		IV+16(%rip),%xmm15
	jmp		.Lblake2s_compress_avx512_mainloop
.align 32
.Lblake2s_compress_avx512_mainloop:
	/* Per block: snapshot the chaining rows, bump the counter, seed rows 2/3. */
	vmovdqa		%xmm0,%xmm10
	vmovdqa		%xmm1,%xmm11
	vpaddq		%xmm5,%xmm4,%xmm4
	vmovdqa		%xmm14,%xmm2
	vpxor		%xmm15,%xmm4,%xmm3
	vmovdqu		(%rsi),%ymm6		/* message words 0-7 */
	vmovdqu		0x20(%rsi),%ymm7	/* message words 8-15 */
	addq		$0x40,%rsi
	leaq		SIGMA2(%rip),%rax
	movb		$0xa,%cl		/* 10 rounds; %rcx value lives in xmm5 */
.Lblake2s_compress_avx512_roundloop:
	addq		$0x40,%rax		/* one SIGMA2 row = 16 dwords */
	vmovdqa		-0x40(%rax),%ymm8
	vmovdqa		-0x20(%rax),%ymm9
	vpermi2d	%ymm7,%ymm6,%ymm8	/* index vectors become the data */
	vpermi2d	%ymm7,%ymm6,%ymm9
	vmovdqa		%ymm8,%ymm6		/* carry this ordering to the next round */
	vmovdqa		%ymm9,%ymm7
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm8,%xmm8	/* upper four words of ymm8 */
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	/* Rotate the lanes of rows 0, 3 and 2 against row 1 for the second half. */
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm9,%xmm9	/* upper four words of ymm9 */
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	/* Undo the lane rotation applied before the second half. */
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2
	decb		%cl
	jne		.Lblake2s_compress_avx512_roundloop
	/* Fold rows 2/3 and the saved chaining rows into the new chaining rows. */
	vpxor		%xmm10,%xmm0,%xmm0
	vpxor		%xmm11,%xmm1,%xmm1
	vpxor		%xmm2,%xmm0,%xmm0
	vpxor		%xmm3,%xmm1,%xmm1
	decq		%rdx
	jne		.Lblake2s_compress_avx512_mainloop
	vmovdqu		%xmm0,(%rdi)
	vmovdqu		%xmm1,0x10(%rdi)
	vmovdqu		%xmm4,0x20(%rdi)
	vzeroupper				/* ymm6-ymm9 were written above */
.Lblake2s_compress_avx512_done:
	RET
SYM_FUNC_END(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */