/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */

#include <linux/linkage.h>

/*
 * Read-only constants shared by the compress routines below.
 *
 * IV: eight 32-bit words held as two 128-bit values. Both routines use the
 * low 16 bytes as the initial third working row and XOR the high 16 bytes
 * with the 16 bytes loaded from 0x20(%rdi) to form the fourth working row.
 */
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F

/* pshufb mask: rotate every 32-bit lane right by 16 bits. */
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302

/* pshufb mask: rotate every 32-bit lane right by 8 bits. */
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201

/*
 * Message-word schedule for the SSSE3 routine: 10 rows (one per round) of
 * 16 byte-sized indices, 160 bytes in total. Each row is consumed as four
 * groups of four indices; every group selects the four 32-bit message words
 * added to the first working row in one vectorised quarter-step.
 */
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12
#ifdef CONFIG_AS_AVX512
/*
 * Message-word schedule for the AVX-512 routine: 10 rows of 16 dword
 * indices (640 bytes), used as vpermi2d index vectors. The round loop
 * overwrites the message registers with the permuted copy after every
 * round, so each row after the first permutes the message as the previous
 * round left it rather than the message as originally loaded.
 */
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
.long  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.long  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
.long 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
.long 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
.long  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
.long  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
.long  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
.long  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
.long 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
.long  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
#endif /* CONFIG_AS_AVX512 */

.text
/*
 * blake2s_compress_ssse3 - compress 64-byte message blocks into the hash
 * state using SSE2/SSSE3 instructions.
 *
 * NOTE(review): the C prototype is not visible in this file; the register
 * roles below are derived from the code, presumably matching
 *   (state *, const u8 *block, size_t nblocks, u32 inc) - confirm against
 * the glue code.
 *
 * In:	%rdi = state; 48 bytes are read and written at offsets 0x00,
 *	       0x10 and 0x20 (unaligned access is tolerated)
 *	%rsi = message; advanced by 0x40 bytes per block
 *	%rdx = number of blocks; zero returns without touching the state
 *	%rcx = value added (as one 64-bit add) to the low quadword at
 *	       0x20(%rdi) before every block
 * Out:	state updated in place, once, after the last block.
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8, %xmm0-%xmm8, %xmm10-%xmm15, flags.
 *
 * Working rows inside the round loop:
 *	%xmm0 = a, %xmm1 = b, %xmm2 = c, %xmm3 = d
 */
SYM_FUNC_START(blake2s_compress_ssse3)
	testq		%rdx,%rdx
	je		.Lendofloop
	movdqu		(%rdi),%xmm0		/* chaining value, low half */
	movdqu		0x10(%rdi),%xmm1	/* chaining value, high half */
	movdqa		ROT16(%rip),%xmm12	/* rotate-by-16 byte shuffle */
	movdqa		ROR328(%rip),%xmm13	/* rotate-by-8 byte shuffle */
	movdqu		0x20(%rdi),%xmm14	/* third 16-byte state row */
	movq		%rcx,%xmm15		/* per-block increment */
	leaq		SIGMA+0xa0(%rip),%r8	/* end of the 10 schedule rows */
	jmp		.Lbeginofloop
	.align		32
.Lbeginofloop:
	/* Per block: save the chaining value and build rows c and d. */
	movdqa		%xmm0,%xmm10
	movdqa		%xmm1,%xmm11
	paddq		%xmm15,%xmm14
	movdqa		IV(%rip),%xmm2
	movdqa		%xmm14,%xmm3
	pxor		IV+0x10(%rip),%xmm3
	leaq		SIGMA(%rip),%rcx	/* %rcx now walks the schedule */
.Lroundloop:
	/* Gather message words selected by schedule bytes 0..3. */
	movzbl		(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0x1(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x2(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x3(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
	/* a += m + b; d = ror(d ^ a, 16); c += d; b = ror(b ^ c, 12) */
	paddd		%xmm4,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	/* Gather message words selected by schedule bytes 4..7. */
	movzbl		0x4(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x5(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x6(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0x7(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	/* a += m + b; d = ror(d ^ a, 8); c += d; b = ror(b ^ c, 7) */
	paddd		%xmm5,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	/*
	 * Rotate the lanes of rows a, d and c (row b stays put) so the next
	 * two quarter-steps work on the diagonals.
	 */
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2
	/* Gather message words selected by schedule bytes 8..11. */
	movzbl		0x8(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x9(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xa(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xb(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	/* a += m + b; d = ror(d ^ a, 16); c += d; b = ror(b ^ c, 12) */
	paddd		%xmm6,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	/* Gather message words selected by schedule bytes 12..15. */
	movzbl		0xc(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xd(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xe(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0xf(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	/* a += m + b; d = ror(d ^ a, 8); c += d; b = ror(b ^ c, 7) */
	paddd		%xmm7,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	/* Undo the lane rotation: rows are back in column order. */
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2
	/* Next schedule row; stop after the tenth. */
	addq		$0x10,%rcx
	cmpq		%r8,%rcx
	jnz		.Lroundloop
	/* Feed-forward: h = h_saved ^ (a,b) ^ (c,d). */
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1
	addq		$0x40,%rsi		/* next 64-byte block */
	decq		%rdx
	jnz		.Lbeginofloop
	/* Write the state back once, after the final block. */
	movdqu		%xmm0,(%rdi)
	movdqu		%xmm1,0x10(%rdi)
	movdqu		%xmm14,0x20(%rdi)
.Lendofloop:
	RET
SYM_FUNC_END(blake2s_compress_ssse3)

#ifdef CONFIG_AS_AVX512
/*
 * blake2s_compress_avx512 - compress 64-byte message blocks into the hash
 * state using EVEX-encoded instructions (vpermi2d, vprord) on 128/256-bit
 * registers.
 *
 * NOTE(review): the C prototype is not visible in this file; the register
 * roles below are derived from the code, presumably matching
 *   (state *, const u8 *block, size_t nblocks, u32 inc) - confirm against
 * the glue code.
 *
 * In:	%rdi = state; 48 bytes are read and written at offsets 0x00,
 *	       0x10 and 0x20 (unaligned access is tolerated)
 *	%rsi = message; advanced by 0x40 bytes per block
 *	%rdx = number of blocks; zero returns without touching the state,
 *	       matching blake2s_compress_ssse3 (without the guard a zero
 *	       count would wrap on decq and run off the message buffer)
 *	%rcx = value added (as one 64-bit add) to the low quadword at
 *	       0x20(%rdi) before every block
 * Out:	state updated in place, once, after the last block.
 * Clobbers: %rax, %rcx, %rdx, %rsi, %xmm0-%xmm5, %ymm6-%ymm9,
 *	     %xmm10, %xmm11, %xmm14, %xmm15, flags.
 *
 * Working rows inside the round loop:
 *	%xmm0 = a, %xmm1 = b, %xmm2 = c, %xmm3 = d
 */
SYM_FUNC_START(blake2s_compress_avx512)
	testq		%rdx,%rdx
	je		.Lblake2s_compress_avx512_done
	vmovdqu		(%rdi),%xmm0		/* chaining value, low half */
	vmovdqu		0x10(%rdi),%xmm1	/* chaining value, high half */
	vmovdqu		0x20(%rdi),%xmm4	/* third 16-byte state row */
	vmovq		%rcx,%xmm5		/* per-block increment */
	vmovdqa		IV(%rip),%xmm14
	vmovdqa		IV+16(%rip),%xmm15
	jmp		.Lblake2s_compress_avx512_mainloop
.align 32
.Lblake2s_compress_avx512_mainloop:
	/* Per block: save the chaining value and build rows c and d. */
	vmovdqa		%xmm0,%xmm10
	vmovdqa		%xmm1,%xmm11
	vpaddq		%xmm5,%xmm4,%xmm4
	vmovdqa		%xmm14,%xmm2
	vpxor		%xmm15,%xmm4,%xmm3
	/* Whole 64-byte message block lives in %ymm6:%ymm7. */
	vmovdqu		(%rsi),%ymm6
	vmovdqu		0x20(%rsi),%ymm7
	addq		$0x40,%rsi
	leaq		SIGMA2(%rip),%rax
	movb		$0xa,%cl		/* 10 rounds; %rcx is free now */
.Lblake2s_compress_avx512_roundloop:
	/*
	 * Permute the message with this round's index row and keep the
	 * permuted copy as the input of the next round's permutation.
	 */
	addq		$0x40,%rax
	vmovdqa		-0x40(%rax),%ymm8
	vmovdqa		-0x20(%rax),%ymm9
	vpermi2d	%ymm7,%ymm6,%ymm8
	vpermi2d	%ymm7,%ymm6,%ymm9
	vmovdqa		%ymm8,%ymm6
	vmovdqa		%ymm9,%ymm7
	/* a += m + b; d = ror(d ^ a, 16); c += d; b = ror(b ^ c, 12) */
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	/* Upper half of %ymm8 holds the next four message words. */
	vextracti128	$0x1,%ymm8,%xmm8
	/* a += m + b; d = ror(d ^ a, 8); c += d; b = ror(b ^ c, 7) */
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	/*
	 * Rotate the lanes of rows a, d and c (row b stays put) so the next
	 * two quarter-steps work on the diagonals.
	 */
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2
	/* a += m + b; d = ror(d ^ a, 16); c += d; b = ror(b ^ c, 12) */
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	/* Upper half of %ymm9 holds the last four message words. */
	vextracti128	$0x1,%ymm9,%xmm9
	/* a += m + b; d = ror(d ^ a, 8); c += d; b = ror(b ^ c, 7) */
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	/* Undo the lane rotation: rows are back in column order. */
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2
	decb		%cl
	jne		.Lblake2s_compress_avx512_roundloop
	/* Feed-forward: h = h_saved ^ (a,b) ^ (c,d). */
	vpxor		%xmm10,%xmm0,%xmm0
	vpxor		%xmm11,%xmm1,%xmm1
	vpxor		%xmm2,%xmm0,%xmm0
	vpxor		%xmm3,%xmm1,%xmm1
	decq		%rdx
	jne		.Lblake2s_compress_avx512_mainloop
	/* Write the state back once, after the final block. */
	vmovdqu		%xmm0,(%rdi)
	vmovdqu		%xmm1,0x10(%rdi)
	vmovdqu		%xmm4,0x20(%rdi)
	/* Upper YMM halves were dirtied by the message registers. */
	vzeroupper
.Lblake2s_compress_avx512_done:
	RET
SYM_FUNC_END(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */
257