/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */
662306a36Sopenharmony_ci
762306a36Sopenharmony_ci#include <linux/linkage.h>
862306a36Sopenharmony_ci
/*
 * BLAKE2s initialization vector: eight 32-bit words stored as two
 * little-endian 128-bit constants (words 0-3, then words 4-7).
 */
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F

/* pshufb control mask: rotates every 32-bit lane by 16 bits. */
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302

/* pshufb control mask: rotates every 32-bit lane right by 8 bits. */
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201

/*
 * Message schedule for the SSSE3 routine: ten rows (one per round) of
 * sixteen byte-sized indices into the 16-word message block.  Within a
 * row the indices appear in the order the round loop loads them, four
 * per vector addition.  The table is 10 * 16 = 160 (0xa0) bytes long.
 */
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12
#ifdef CONFIG_AS_AVX512
/*
 * vpermi2d index vectors for the AVX-512 routine: ten rows of sixteen
 * 32-bit indices (64 bytes per row, 640 bytes in total).  That routine
 * replaces its message registers with the permuted result after every
 * round, so each row after the first selects from the previous round's
 * ordering rather than from the original message block.
 */
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
.long  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.long  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
.long 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
.long 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
.long  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
.long  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
.long  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
.long  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
.long 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
.long  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
#endif /* CONFIG_AS_AVX512 */
4762306a36Sopenharmony_ci
.text
/*
 * blake2s_compress_ssse3 - BLAKE2s compression over a run of 64-byte blocks.
 *
 * In:
 *   %rdi  state pointer.  Bytes 0x00-0x1f hold the eight 32-bit chaining
 *         words; bytes 0x20-0x2f are loaded as one 128-bit value whose low
 *         64 bits are advanced by %rcx for every block (presumably the
 *         block counter followed by the finalization flags - confirm
 *         against the C state structure).
 *   %rsi  message pointer; 0x40 bytes are consumed per block.
 *   %rdx  number of blocks.  Zero returns at once, leaving the state as is.
 *   %rcx  amount added to the counter before each block is compressed.
 *
 * Out:
 *   The 0x30 bytes at (%rdi) are rewritten once, after the last block.
 *
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8, flags, %xmm0-%xmm8, %xmm10-%xmm15.
 *
 * Working matrix: %xmm0..%xmm3 hold rows 0..3 (four 32-bit lanes each).
 */
SYM_FUNC_START(blake2s_compress_ssse3)
	testq		%rdx,%rdx
	je		.Lendofloop			/* nothing to hash */
	movdqu		(%rdi),%xmm0			/* chaining words 0-3 */
	movdqu		0x10(%rdi),%xmm1		/* chaining words 4-7 */
	movdqa		ROT16(%rip),%xmm12		/* pshufb mask: rotate by 16 */
	movdqa		ROR328(%rip),%xmm13		/* pshufb mask: rotate right by 8 */
	movdqu		0x20(%rdi),%xmm14		/* counter/flags words */
	movq		%rcx,%xmm15			/* increment in the low qword, high qword zero */
	leaq		SIGMA+0xa0(%rip),%r8		/* one past the last SIGMA row */
	jmp		.Lbeginofloop			/* hop over the alignment padding */
	.align		32
.Lbeginofloop:
	/* Keep the incoming chaining value for the final feed-forward. */
	movdqa		%xmm0,%xmm10
	movdqa		%xmm1,%xmm11
	paddq		%xmm15,%xmm14			/* advance the counter */
	movdqa		IV(%rip),%xmm2			/* row 2 = IV words 0-3 */
	movdqa		%xmm14,%xmm3
	pxor		IV+0x10(%rip),%xmm3		/* row 3 = counter/flags ^ IV words 4-7 */
	leaq		SIGMA(%rip),%rcx		/* %rcx now walks the schedule rows */
.Lroundloop:
	/* Gather message words selected by SIGMA bytes 0-3 into %xmm4. */
	movzbl		(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0x1(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x2(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x3(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
	/* Column step, first half: rotations by 16 and 12. */
	paddd		%xmm4,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1			/* row 1 rotated right by 12 */
	/* Gather message words selected by SIGMA bytes 4-7 into %xmm5. */
	movzbl		0x4(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x5(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x6(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0x7(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	/* Column step, second half: rotations by 8 and 7. */
	paddd		%xmm5,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1			/* row 1 rotated right by 7 */
	/* Rotate the lanes of rows 0, 3 and 2 so the diagonals become columns. */
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2
	/* Gather message words selected by SIGMA bytes 8-11 into %xmm6. */
	movzbl		0x8(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x9(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xa(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xb(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	/* Diagonal step, first half: rotations by 16 and 12. */
	paddd		%xmm6,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1			/* row 1 rotated right by 12 */
	/* Gather message words selected by SIGMA bytes 12-15 into %xmm7. */
	movzbl		0xc(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xd(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xe(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0xf(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	/* Diagonal step, second half: rotations by 8 and 7. */
	paddd		%xmm7,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1			/* row 1 rotated right by 7 */
	/* Undo the lane rotation applied before the diagonal step. */
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2
	addq		$0x10,%rcx			/* next 16-byte SIGMA row */
	cmpq		%r8,%rcx
	jnz		.Lroundloop
	/* Feed-forward: new chaining value = old ^ rows 0,1 ^ rows 2,3. */
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1
	addq		$0x40,%rsi			/* next message block */
	decq		%rdx
	jnz		.Lbeginofloop
	/* Write the chaining value and the advanced counter back. */
	movdqu		%xmm0,(%rdi)
	movdqu		%xmm1,0x10(%rdi)
	movdqu		%xmm14,0x20(%rdi)
.Lendofloop:
	RET
SYM_FUNC_END(blake2s_compress_ssse3)
17662306a36Sopenharmony_ci
#ifdef CONFIG_AS_AVX512
/*
 * blake2s_compress_avx512 - BLAKE2s compression over a run of 64-byte
 * blocks, using EVEX-encoded vprord and vpermi2d on xmm/ymm registers.
 *
 * In:
 *   %rdi  state pointer.  Bytes 0x00-0x1f hold the eight 32-bit chaining
 *         words; bytes 0x20-0x2f are loaded as one 128-bit value whose low
 *         64 bits are advanced by %rcx for every block (presumably the
 *         block counter followed by the finalization flags - confirm
 *         against the C state structure).
 *   %rsi  message pointer; 0x40 bytes are consumed per block.
 *   %rdx  number of blocks.  Zero returns at once, leaving the state as is,
 *         matching blake2s_compress_ssse3.
 *   %rcx  amount added to the counter before each block is compressed.
 *
 * Out:
 *   The 0x30 bytes at (%rdi) are rewritten once, after the last block.
 *
 * Clobbers: %rax, %rcx, %rdx, %rsi, flags, %xmm0-%xmm11, %xmm14, %xmm15
 * (%ymm6-%ymm9 are used at full width).
 *
 * Working matrix: %xmm0..%xmm3 hold rows 0..3 (four 32-bit lanes each).
 */
SYM_FUNC_START(blake2s_compress_avx512)
	/*
	 * Without this guard a zero count would wrap in the decq/jne pair
	 * at the bottom of the main loop and read far past the input.
	 * No vector register has been touched yet, so the early exit can
	 * skip vzeroupper.
	 */
	testq		%rdx,%rdx
	je		.Lblake2s_compress_avx512_done
	vmovdqu		(%rdi),%xmm0			/* chaining words 0-3 */
	vmovdqu		0x10(%rdi),%xmm1		/* chaining words 4-7 */
	vmovdqu		0x20(%rdi),%xmm4		/* counter/flags words */
	vmovq		%rcx,%xmm5			/* increment; frees %cl for the round count */
	vmovdqa		IV(%rip),%xmm14			/* IV words 0-3, kept across blocks */
	vmovdqa		IV+16(%rip),%xmm15		/* IV words 4-7, kept across blocks */
	jmp		.Lblake2s_compress_avx512_mainloop	/* hop over the alignment padding */
.align 32
.Lblake2s_compress_avx512_mainloop:
	/* Keep the incoming chaining value for the final feed-forward. */
	vmovdqa		%xmm0,%xmm10
	vmovdqa		%xmm1,%xmm11
	vpaddq		%xmm5,%xmm4,%xmm4		/* advance the counter */
	vmovdqa		%xmm14,%xmm2			/* row 2 = IV words 0-3 */
	vpxor		%xmm15,%xmm4,%xmm3		/* row 3 = counter/flags ^ IV words 4-7 */
	vmovdqu		(%rsi),%ymm6			/* message words 0-7 */
	vmovdqu		0x20(%rsi),%ymm7		/* message words 8-15 */
	addq		$0x40,%rsi			/* next message block */
	leaq		SIGMA2(%rip),%rax
	movb		$0xa,%cl			/* ten rounds */
.Lblake2s_compress_avx512_roundloop:
	addq		$0x40,%rax			/* step to the next SIGMA2 row ... */
	vmovdqa		-0x40(%rax),%ymm8		/* ... and read the current one */
	vmovdqa		-0x20(%rax),%ymm9
	/* Permute the 16 message words into this round's order. */
	vpermi2d	%ymm7,%ymm6,%ymm8
	vpermi2d	%ymm7,%ymm6,%ymm9
	/* The permuted words become the source for the next round. */
	vmovdqa		%ymm8,%ymm6
	vmovdqa		%ymm9,%ymm7
	/* Column step, first half: rotations by 16 and 12. */
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	/* Column step, second half: rotations by 8 and 7. */
	vextracti128	$0x1,%ymm8,%xmm8		/* upper four permuted words */
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	/* Rotate the lanes of rows 0, 3 and 2 so the diagonals become columns. */
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2
	/* Diagonal step, first half: rotations by 16 and 12. */
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	/* Diagonal step, second half: rotations by 8 and 7. */
	vextracti128	$0x1,%ymm9,%xmm9		/* upper four permuted words */
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	/* Undo the lane rotation applied before the diagonal step. */
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2
	decb		%cl
	jne		.Lblake2s_compress_avx512_roundloop
	/* Feed-forward: new chaining value = old ^ rows 0,1 ^ rows 2,3. */
	vpxor		%xmm10,%xmm0,%xmm0
	vpxor		%xmm11,%xmm1,%xmm1
	vpxor		%xmm2,%xmm0,%xmm0
	vpxor		%xmm3,%xmm1,%xmm1
	decq		%rdx
	jne		.Lblake2s_compress_avx512_mainloop
	/* Write the chaining value and the advanced counter back. */
	vmovdqu		%xmm0,(%rdi)
	vmovdqu		%xmm1,0x10(%rdi)
	vmovdqu		%xmm4,0x20(%rdi)
	vzeroupper					/* ymm registers were used above */
.Lblake2s_compress_avx512_done:
	RET
SYM_FUNC_END(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */
257