/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */

/*
 * BLAKE2s block compression for x86-64 (GNU as, AT&T syntax, SysV AMD64
 * argument registers).
 *
 * Both routines below take the same four register arguments:
 *
 *   %rdi  pointer to 48 bytes of hash state that are read on entry and
 *         written back on exit: 32 bytes of chaining value followed by
 *         16 bytes whose low 64 bits are a byte counter.
 *         NOTE(review): presumably the h[8], t[2], f[2] words of the
 *         caller's state structure; the C declaration is not part of
 *         this file -- confirm against the glue code.
 *   %rsi  pointer to the input; 64 bytes are consumed per block.
 *   %rdx  number of 64-byte blocks to compress.
 *   %rcx  amount added to the 64-bit counter before each block.
 *
 * Neither routine uses the stack or any callee-saved general register.
 */

#include <linux/linkage.h>

/*
 * Initialisation vector: the eight 32-bit IV words, stored as two
 * 16-byte rows (words 0-3, then words 4-7).
 */
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F

/* pshufb mask: rotate every 32-bit lane right by 16 bits. */
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302

/* pshufb mask: rotate every 32-bit lane right by 8 bits. */
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201

/*
 * Message schedule for the SSSE3 routine: 10 rounds x 16 byte-sized
 * indices into the 16 message words of the current block.  Each group
 * of four indices is gathered into one vector, in the order in which
 * the round loop consumes them.
 */
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12
#ifdef CONFIG_AS_AVX512
/*
 * Message schedule for the AVX-512 routine: 10 rounds x 16 dword
 * indices used as vpermi2d selectors.  The AVX-512 round loop permutes
 * the message registers in place, so every row is applied to the
 * output of the previous row rather than to the original block; that
 * is why only the first row matches SIGMA.
 */
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
.long  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.long  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
.long 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
.long 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
.long  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
.long  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
.long  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
.long  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
.long 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
.long  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
#endif /* CONFIG_AS_AVX512 */

.text
/*
 * blake2s_compress_ssse3 - compress %rdx blocks using SSE2/SSSE3.
 *
 * Arguments: see the top of this file.  Returns immediately, leaving
 * the state untouched, when %rdx is zero.
 *
 * Register roles inside the loops:
 *   %xmm0, %xmm1   working rows a, b (start as the chaining value)
 *   %xmm2, %xmm3   working rows c, d
 *   %xmm4-%xmm7    gathered message words
 *   %xmm8          scratch for the shift-based rotates
 *   %xmm10, %xmm11 chaining value saved at the start of the block
 *   %xmm12, %xmm13 ROT16 / ROR328 byte-shuffle masks
 *   %xmm14         counter/flags row, %xmm15 = counter increment
 *   %rcx           cursor into SIGMA, %r8 = end of SIGMA
 *
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8, %xmm0-%xmm8, %xmm10-%xmm15, flags.
 */
SYM_FUNC_START(blake2s_compress_ssse3)
	testq	%rdx,%rdx
	je	.Lendofloop			/* nothing to do for zero blocks */
	movdqu	(%rdi),%xmm0			/* chaining value, words 0-3 */
	movdqu	0x10(%rdi),%xmm1		/* chaining value, words 4-7 */
	movdqa	ROT16(%rip),%xmm12
	movdqa	ROR328(%rip),%xmm13
	movdqu	0x20(%rdi),%xmm14		/* counter/flags row */
	movq	%rcx,%xmm15			/* increment in low qword, high qword zero */
	leaq	SIGMA+0xa0(%rip),%r8		/* one past the last of the 10 rows */
	jmp	.Lbeginofloop			/* skip the alignment padding */
	.align	32
.Lbeginofloop:
	/* Per-block setup: save h, bump the counter, build rows c and d. */
	movdqa	%xmm0,%xmm10
	movdqa	%xmm1,%xmm11
	paddq	%xmm15,%xmm14			/* 64-bit counter += increment */
	movdqa	IV(%rip),%xmm2			/* c = IV[0..3] */
	movdqa	%xmm14,%xmm3
	pxor	IV+0x10(%rip),%xmm3		/* d = counter/flags ^ IV[4..7] */
	leaq	SIGMA(%rip),%rcx
.Lroundloop:
	/* Column step, first half: gather m[SIGMA[0..3]] into %xmm4. */
	movzbl	(%rcx),%eax
	movd	(%rsi,%rax,4),%xmm4
	movzbl	0x1(%rcx),%eax
	movd	(%rsi,%rax,4),%xmm5
	movzbl	0x2(%rcx),%eax
	movd	(%rsi,%rax,4),%xmm6
	movzbl	0x3(%rcx),%eax
	movd	(%rsi,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
	paddd	%xmm4,%xmm0			/* a += m */
	paddd	%xmm1,%xmm0			/* a += b */
	pxor	%xmm0,%xmm3			/* d ^= a */
	pshufb	%xmm12,%xmm3			/* d = ror32(d, 16) */
	paddd	%xmm3,%xmm2			/* c += d */
	pxor	%xmm2,%xmm1			/* b ^= c */
	movdqa	%xmm1,%xmm8
	psrld	$0xc,%xmm1
	pslld	$0x14,%xmm8
	por	%xmm8,%xmm1			/* b = ror32(b, 12) */
	/* Column step, second half: gather m[SIGMA[4..7]] into %xmm5. */
	movzbl	0x4(%rcx),%eax
	movd	(%rsi,%rax,4),%xmm5
	movzbl	0x5(%rcx),%eax
	movd	(%rsi,%rax,4),%xmm6
	movzbl	0x6(%rcx),%eax
	movd	(%rsi,%rax,4),%xmm7
	movzbl	0x7(%rcx),%eax
	movd	(%rsi,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	paddd	%xmm5,%xmm0			/* a += m */
	paddd	%xmm1,%xmm0			/* a += b */
	pxor	%xmm0,%xmm3			/* d ^= a */
	pshufb	%xmm13,%xmm3			/* d = ror32(d, 8) */
	paddd	%xmm3,%xmm2			/* c += d */
	pxor	%xmm2,%xmm1			/* b ^= c */
	movdqa	%xmm1,%xmm8
	psrld	$0x7,%xmm1
	pslld	$0x19,%xmm8
	por	%xmm8,%xmm1			/* b = ror32(b, 7) */
	/* Rotate lanes of a, d, c so the diagonals line up as columns. */
	pshufd	$0x93,%xmm0,%xmm0
	pshufd	$0x4e,%xmm3,%xmm3
	pshufd	$0x39,%xmm2,%xmm2
	/* Diagonal step, first half: gather m[SIGMA[8..11]] into %xmm6. */
	movzbl	0x8(%rcx),%eax
	movd	(%rsi,%rax,4),%xmm6
	movzbl	0x9(%rcx),%eax
	movd	(%rsi,%rax,4),%xmm7
	movzbl	0xa(%rcx),%eax
	movd	(%rsi,%rax,4),%xmm4
	movzbl	0xb(%rcx),%eax
	movd	(%rsi,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	paddd	%xmm6,%xmm0			/* a += m */
	paddd	%xmm1,%xmm0			/* a += b */
	pxor	%xmm0,%xmm3			/* d ^= a */
	pshufb	%xmm12,%xmm3			/* d = ror32(d, 16) */
	paddd	%xmm3,%xmm2			/* c += d */
	pxor	%xmm2,%xmm1			/* b ^= c */
	movdqa	%xmm1,%xmm8
	psrld	$0xc,%xmm1
	pslld	$0x14,%xmm8
	por	%xmm8,%xmm1			/* b = ror32(b, 12) */
	/* Diagonal step, second half: gather m[SIGMA[12..15]] into %xmm7. */
	movzbl	0xc(%rcx),%eax
	movd	(%rsi,%rax,4),%xmm7
	movzbl	0xd(%rcx),%eax
	movd	(%rsi,%rax,4),%xmm4
	movzbl	0xe(%rcx),%eax
	movd	(%rsi,%rax,4),%xmm5
	movzbl	0xf(%rcx),%eax
	movd	(%rsi,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	paddd	%xmm7,%xmm0			/* a += m */
	paddd	%xmm1,%xmm0			/* a += b */
	pxor	%xmm0,%xmm3			/* d ^= a */
	pshufb	%xmm13,%xmm3			/* d = ror32(d, 8) */
	paddd	%xmm3,%xmm2			/* c += d */
	pxor	%xmm2,%xmm1			/* b ^= c */
	movdqa	%xmm1,%xmm8
	psrld	$0x7,%xmm1
	pslld	$0x19,%xmm8
	por	%xmm8,%xmm1			/* b = ror32(b, 7) */
	/* Undo the lane rotation applied before the diagonal step. */
	pshufd	$0x39,%xmm0,%xmm0
	pshufd	$0x4e,%xmm3,%xmm3
	pshufd	$0x93,%xmm2,%xmm2
	addq	$0x10,%rcx			/* next 16-byte SIGMA row */
	cmpq	%r8,%rcx
	jnz	.Lroundloop
	/* Feed-forward: h = h_saved ^ (a,b) ^ (c,d). */
	pxor	%xmm2,%xmm0
	pxor	%xmm3,%xmm1
	pxor	%xmm10,%xmm0
	pxor	%xmm11,%xmm1
	addq	$0x40,%rsi			/* advance to the next 64-byte block */
	decq	%rdx
	jnz	.Lbeginofloop
	/* Write back the chaining value and the updated counter row. */
	movdqu	%xmm0,(%rdi)
	movdqu	%xmm1,0x10(%rdi)
	movdqu	%xmm14,0x20(%rdi)
.Lendofloop:
	RET
SYM_FUNC_END(blake2s_compress_ssse3)

#ifdef CONFIG_AS_AVX512
/*
 * blake2s_compress_avx512 - compress %rdx blocks using AVX-512
 * instructions (vpermi2d and vprord on 128/256-bit registers).
 *
 * Arguments: see the top of this file.  Returns immediately, leaving
 * the state untouched, when %rdx is zero -- the same contract as
 * blake2s_compress_ssse3.  Without that guard the first decq of a zero
 * count would wrap and the block loop would run off the end of the
 * input.
 *
 * Register roles inside the loops:
 *   %xmm0, %xmm1   working rows a, b (start as the chaining value)
 *   %xmm2, %xmm3   working rows c, d
 *   %xmm4          counter/flags row, %xmm5 = counter increment
 *   %ymm6, %ymm7   the 16 message words, re-permuted every round
 *   %ymm8, %ymm9   SIGMA2 selectors, then the permuted message words
 *   %xmm10, %xmm11 chaining value saved at the start of the block
 *   %xmm14, %xmm15 IV rows
 *   %rax           cursor into SIGMA2, %cl = rounds remaining
 *
 * Clobbers: %rax, %rcx, %rdx, %rsi, %ymm0-%ymm11, %xmm14, %xmm15, flags.
 */
SYM_FUNC_START(blake2s_compress_avx512)
	testq	%rdx,%rdx
	je	.Lblake2s_compress_avx512_end	/* nothing to do for zero blocks */
	vmovdqu	(%rdi),%xmm0			/* chaining value, words 0-3 */
	vmovdqu	0x10(%rdi),%xmm1		/* chaining value, words 4-7 */
	vmovdqu	0x20(%rdi),%xmm4		/* counter/flags row */
	vmovq	%rcx,%xmm5			/* increment in low qword, high qword zero */
	vmovdqa	IV(%rip),%xmm14
	vmovdqa	IV+16(%rip),%xmm15
	jmp	.Lblake2s_compress_avx512_mainloop	/* skip the alignment padding */
.align 32
.Lblake2s_compress_avx512_mainloop:
	/* Per-block setup: save h, bump the counter, build rows c and d. */
	vmovdqa	%xmm0,%xmm10
	vmovdqa	%xmm1,%xmm11
	vpaddq	%xmm5,%xmm4,%xmm4		/* 64-bit counter += increment */
	vmovdqa	%xmm14,%xmm2			/* c = IV[0..3] */
	vpxor	%xmm15,%xmm4,%xmm3		/* d = counter/flags ^ IV[4..7] */
	vmovdqu	(%rsi),%ymm6			/* message words 0-7 */
	vmovdqu	0x20(%rsi),%ymm7		/* message words 8-15 */
	addq	$0x40,%rsi			/* advance to the next 64-byte block */
	leaq	SIGMA2(%rip),%rax
	movb	$0xa,%cl			/* 10 rounds; %rcx was already copied to %xmm5 */
.Lblake2s_compress_avx512_roundloop:
	addq	$0x40,%rax			/* step to the next row; this round's row is at -0x40 */
	vmovdqa	-0x40(%rax),%ymm8		/* selectors for the column step */
	vmovdqa	-0x20(%rax),%ymm9		/* selectors for the diagonal step */
	vpermi2d	%ymm7,%ymm6,%ymm8	/* pick 8 words out of %ymm6:%ymm7 */
	vpermi2d	%ymm7,%ymm6,%ymm9
	vmovdqa	%ymm8,%ymm6			/* permuted words become next round's input */
	vmovdqa	%ymm9,%ymm7
	/* Column step, first half (low 128 bits of %ymm8). */
	vpaddd	%xmm8,%xmm0,%xmm0		/* a += m */
	vpaddd	%xmm1,%xmm0,%xmm0		/* a += b */
	vpxor	%xmm0,%xmm3,%xmm3		/* d ^= a */
	vprord	$0x10,%xmm3,%xmm3		/* d = ror32(d, 16) */
	vpaddd	%xmm3,%xmm2,%xmm2		/* c += d */
	vpxor	%xmm2,%xmm1,%xmm1		/* b ^= c */
	vprord	$0xc,%xmm1,%xmm1		/* b = ror32(b, 12) */
	/* Column step, second half (high 128 bits of %ymm8). */
	vextracti128	$0x1,%ymm8,%xmm8
	vpaddd	%xmm8,%xmm0,%xmm0		/* a += m */
	vpaddd	%xmm1,%xmm0,%xmm0		/* a += b */
	vpxor	%xmm0,%xmm3,%xmm3		/* d ^= a */
	vprord	$0x8,%xmm3,%xmm3		/* d = ror32(d, 8) */
	vpaddd	%xmm3,%xmm2,%xmm2		/* c += d */
	vpxor	%xmm2,%xmm1,%xmm1		/* b ^= c */
	vprord	$0x7,%xmm1,%xmm1		/* b = ror32(b, 7) */
	/* Rotate lanes of a, d, c so the diagonals line up as columns. */
	vpshufd	$0x93,%xmm0,%xmm0
	vpshufd	$0x4e,%xmm3,%xmm3
	vpshufd	$0x39,%xmm2,%xmm2
	/* Diagonal step, first half (low 128 bits of %ymm9). */
	vpaddd	%xmm9,%xmm0,%xmm0		/* a += m */
	vpaddd	%xmm1,%xmm0,%xmm0		/* a += b */
	vpxor	%xmm0,%xmm3,%xmm3		/* d ^= a */
	vprord	$0x10,%xmm3,%xmm3		/* d = ror32(d, 16) */
	vpaddd	%xmm3,%xmm2,%xmm2		/* c += d */
	vpxor	%xmm2,%xmm1,%xmm1		/* b ^= c */
	vprord	$0xc,%xmm1,%xmm1		/* b = ror32(b, 12) */
	/* Diagonal step, second half (high 128 bits of %ymm9). */
	vextracti128	$0x1,%ymm9,%xmm9
	vpaddd	%xmm9,%xmm0,%xmm0		/* a += m */
	vpaddd	%xmm1,%xmm0,%xmm0		/* a += b */
	vpxor	%xmm0,%xmm3,%xmm3		/* d ^= a */
	vprord	$0x8,%xmm3,%xmm3		/* d = ror32(d, 8) */
	vpaddd	%xmm3,%xmm2,%xmm2		/* c += d */
	vpxor	%xmm2,%xmm1,%xmm1		/* b ^= c */
	vprord	$0x7,%xmm1,%xmm1		/* b = ror32(b, 7) */
	/* Undo the lane rotation applied before the diagonal step. */
	vpshufd	$0x39,%xmm0,%xmm0
	vpshufd	$0x4e,%xmm3,%xmm3
	vpshufd	$0x93,%xmm2,%xmm2
	decb	%cl
	jne	.Lblake2s_compress_avx512_roundloop
	/* Feed-forward: h = h_saved ^ (a,b) ^ (c,d). */
	vpxor	%xmm10,%xmm0,%xmm0
	vpxor	%xmm11,%xmm1,%xmm1
	vpxor	%xmm2,%xmm0,%xmm0
	vpxor	%xmm3,%xmm1,%xmm1
	decq	%rdx
	jne	.Lblake2s_compress_avx512_mainloop
	/* Write back the chaining value and the updated counter row. */
	vmovdqu	%xmm0,(%rdi)
	vmovdqu	%xmm1,0x10(%rdi)
	vmovdqu	%xmm4,0x20(%rdi)
	vzeroupper				/* %ymm6-%ymm9 were written; clear upper halves */
.Lblake2s_compress_avx512_end:
	RET
SYM_FUNC_END(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */