/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
 *
 * Copyright (C) 2015 Martin Willi
 */
762306a36Sopenharmony_ci
#include <linux/linkage.h>

/*
 * Constant tables.
 *
 * Each table is 32 bytes long, 32-byte aligned and placed in its own
 * mergeable section (flags "aM", entity size 32), so it can be loaded with
 * vmovdqa or used directly as a ymm memory operand.
 */

/*
 * vpshufb control mask, identical in both 128-bit lanes: rotates every
 * 32-bit word left by 8 bits (destination byte 0 takes source byte 3, ...).
 */
.section	.rodata.cst32.ROT8, "aM", @progbits, 32
.align 32
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
	.octa 0x0e0d0c0f0a09080b0605040702010003

/*
 * vpshufb control mask, identical in both 128-bit lanes: rotates every
 * 32-bit word left by 16 bits (swaps the two 16-bit halves of each word).
 */
.section	.rodata.cst32.ROT16, "aM", @progbits, 32
.align 32
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
	.octa 0x0d0c0f0e09080b0a0504070601000302

/*
 * Eight 32-bit values 0..7. The 8-block function adds them to the eight
 * copies of state word 12 so that every block gets its own counter value.
 */
.section	.rodata.cst32.CTRINC, "aM", @progbits, 32
.align 32
CTRINC:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004

/*
 * Increment for state row 3 of a block pair: +0 on the first 32-bit word
 * (state word 12) in the low lane, +1 in the high lane. Used with vpaddd,
 * so no carry propagates out of that word.
 */
.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL:	.octa 0x00000000000000000000000000000000
	.octa 0x00000000000000000000000000000001

/*
 * Same layout as CTR2BL, but for the second block pair of the 4-block
 * function: +2 in the low lane, +3 in the high lane.
 */
.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL:	.octa 0x00000000000000000000000000000002
	.octa 0x00000000000000000000000000000003

.text
3662306a36Sopenharmony_ci
/*
 * chacha_2block_xor_avx2: generate up to two 64-byte ChaCha keystream
 * blocks and XOR them with the input.
 *
 *	%rdi: input state matrix, s (only read)
 *	%rsi: up to 2 data blocks output, o
 *	%rdx: up to 2 data blocks input, i
 *	%rcx: input/output length in bytes; at most 0x80 bytes are processed
 *	%r8d: nrounds; the round loop subtracts 2 per pass and stops only at
 *	      exactly zero, so this has to be a positive even number
 *
 * Register use:
 *	ymm0-ymm3   working rows x0..x3; the low lane is the first block, the
 *	            high lane the second (its word 12 is bumped via CTR2BL)
 *	ymm4, ymm5  vpshufb masks ROT8 and ROT16
 *	ymm6, ymm7  scratch; ymm7 also holds the pending keystream chunk
 *	ymm8-ymm11  copy of the initial rows, added back after the rounds
 *	rax         copy of the length (%rcx is reused as the rep movsb count)
 *
 * Clobbers: rax, r8, r9-r11, ymm0-ymm11, flags. The partial-chunk path also
 * overwrites rcx, rsi and rdi and temporarily uses a 32-byte aligned scratch
 * slot below the stack pointer.
 *
 * The length is compared with the unsigned condition (jb): a byte count is
 * never negative, and for all lengths below 2^63 the result is the same as
 * with a signed compare.
 */
SYM_FUNC_START(chacha_2block_xor_avx2)
	# This function encrypts two ChaCha blocks by loading the state
	# matrix twice across four AVX registers. It performs matrix operations
	# on four words in each matrix in parallel, but requires shuffling to
	# rearrange the words after each round.

	vzeroupper

	# x0..3[0-1] = s0..3, each 16-byte row copied into both lanes
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	# give the second block (high lane) its own counter: word 12 += 1
	vpaddd		CTR2BL(%rip),%ymm3,%ymm3

	# keep the initial rows for the final x + s addition
	vmovdqa		%ymm0,%ymm8
	vmovdqa		%ymm1,%ymm9
	vmovdqa		%ymm2,%ymm10
	vmovdqa		%ymm3,%ymm11

	vmovdqa		ROT8(%rip),%ymm4
	vmovdqa		ROT16(%rip),%ymm5

	# rcx is needed by rep movsb later on, so track the length in rax
	mov		%rcx,%rax

.Ldoubleround:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm5,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm6
	vpslld		$12,%ymm6,%ymm6
	vpsrld		$20,%ymm1,%ymm1
	vpor		%ymm6,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm4,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm7
	vpslld		$7,%ymm7,%ymm7
	vpsrld		$25,%ymm1,%ymm1
	vpor		%ymm7,%ymm1,%ymm1

	# rotate rows 1..3 so that the diagonals line up as columns
	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm3,%ymm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm5,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm6
	vpslld		$12,%ymm6,%ymm6
	vpsrld		$20,%ymm1,%ymm1
	vpor		%ymm6,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm4,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm7
	vpslld		$7,%ymm7,%ymm7
	vpsrld		$25,%ymm1,%ymm1
	vpor		%ymm7,%ymm1,%ymm1

	# undo the row rotation
	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm3,%ymm3

	# one pass is two rounds
	sub		$2,%r8d
	jnz		.Ldoubleround

	# Output stage. Every step first places the next 16 keystream bytes
	# in xmm7 and only then checks the length, so that .Lxorpart2 finds
	# the chunk for a trailing partial piece in xmm7. The high lanes
	# (second block) are parked in xmm0..xmm3 on the way.

	# o0 = i0 ^ (x0 + s0)
	vpaddd		%ymm8,%ymm0,%ymm7
	cmp		$0x10,%rax
	jb		.Lxorpart2
	vpxor		0x00(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x00(%rsi)
	vextracti128	$1,%ymm7,%xmm0
	# o1 = i1 ^ (x1 + s1)
	vpaddd		%ymm9,%ymm1,%ymm7
	cmp		$0x20,%rax
	jb		.Lxorpart2
	vpxor		0x10(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x10(%rsi)
	vextracti128	$1,%ymm7,%xmm1
	# o2 = i2 ^ (x2 + s2)
	vpaddd		%ymm10,%ymm2,%ymm7
	cmp		$0x30,%rax
	jb		.Lxorpart2
	vpxor		0x20(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x20(%rsi)
	vextracti128	$1,%ymm7,%xmm2
	# o3 = i3 ^ (x3 + s3)
	vpaddd		%ymm11,%ymm3,%ymm7
	cmp		$0x40,%rax
	jb		.Lxorpart2
	vpxor		0x30(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x30(%rsi)
	vextracti128	$1,%ymm7,%xmm3

	# xor and write second block
	vmovdqa		%xmm0,%xmm7
	cmp		$0x50,%rax
	jb		.Lxorpart2
	vpxor		0x40(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x40(%rsi)

	vmovdqa		%xmm1,%xmm7
	cmp		$0x60,%rax
	jb		.Lxorpart2
	vpxor		0x50(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x50(%rsi)

	vmovdqa		%xmm2,%xmm7
	cmp		$0x70,%rax
	jb		.Lxorpart2
	vpxor		0x60(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x60(%rsi)

	vmovdqa		%xmm3,%xmm7
	cmp		$0x80,%rax
	jb		.Lxorpart2
	vpxor		0x70(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x70(%rsi)

.Ldone2:
	vzeroupper
	RET

.Lxorpart2:
	# xor remaining bytes from partial register into output
	# r9 = number of trailing bytes (length mod 16), done when zero
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone2
	# rax = offset of the partial chunk (length rounded down to 16)
	and		$~0x0f,%rax

	# keep the output pointer, rsi becomes the rep movsb source
	mov		%rsi,%r11

	# carve a 32-byte aligned scratch slot out of the stack; r10 keeps
	# the old stack pointer (+8) for the restore at the end
	lea		8(%rsp),%r10
	sub		$0x10,%rsp
	and		$~31,%rsp

	# copy the trailing input bytes into the scratch slot
	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	# xor the slot with the pending keystream chunk in xmm7
	vpxor		0x00(%rsp),%xmm7,%xmm7
	vmovdqa		%xmm7,0x00(%rsp)

	# copy only the r9 valid result bytes to the output
	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	# restore the original stack pointer
	lea		-8(%r10),%rsp
	jmp		.Ldone2

SYM_FUNC_END(chacha_2block_xor_avx2)
22862306a36Sopenharmony_ci
/*
 * chacha_4block_xor_avx2: generate up to four 64-byte ChaCha keystream
 * blocks and XOR them with the input.
 *
 *	%rdi: input state matrix, s (only read)
 *	%rsi: up to 4 data blocks output, o
 *	%rdx: up to 4 data blocks input, i
 *	%rcx: input/output length in bytes; at most 0x100 bytes are processed
 *	%r8d: nrounds; the round loop subtracts 2 per pass and stops only at
 *	      exactly zero, so this has to be a positive even number
 *
 * Register use:
 *	ymm0-ymm3    working rows of blocks 0 (low lane) and 1 (high lane)
 *	ymm4-ymm7    working rows of blocks 2 (low lane) and 3 (high lane)
 *	ymm8, ymm9   vpshufb masks ROT8 and ROT16; ymm9 is reused as scratch
 *	             in the output stage
 *	ymm10        scratch; holds the pending keystream chunk on output
 *	ymm11-ymm13  initial rows 0..2 (the same for all four blocks)
 *	ymm14        initial row 3 of blocks 0/1 (word 12 + 0 / + 1)
 *	ymm15        initial row 3 of blocks 2/3 (word 12 + 2 / + 3)
 *	rax          copy of the length (%rcx is reused as the rep movsb count)
 *
 * Clobbers: rax, r8, r9-r11, ymm0-ymm15, flags. The partial-chunk path also
 * overwrites rcx, rsi and rdi and temporarily uses a 32-byte aligned scratch
 * slot below the stack pointer.
 *
 * The length is compared with the unsigned condition (jb): a byte count is
 * never negative, and for all lengths below 2^63 the result is the same as
 * with a signed compare.
 */
SYM_FUNC_START(chacha_4block_xor_avx2)
	# This function encrypts four ChaCha blocks by loading the state
	# matrix four times across eight AVX registers. It performs matrix
	# operations on four words in two matrices in parallel, sequentially
	# to the operations on the four words of the other two matrices. The
	# required word shuffling has a rather high latency, we can do the
	# arithmetic on two matrix-pairs without much slowdown.

	vzeroupper

	# x0..3[0-3] = s0..3, each 16-byte row copied into both lanes
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	# second block pair starts from the same rows
	vmovdqa		%ymm0,%ymm4
	vmovdqa		%ymm1,%ymm5
	vmovdqa		%ymm2,%ymm6
	vmovdqa		%ymm3,%ymm7

	# per-block counters: word 12 += 0/1 (first pair), 2/3 (second pair)
	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
	vpaddd		CTR4BL(%rip),%ymm7,%ymm7

	# keep the initial rows for the final x + s addition; rows 0..2 are
	# shared, row 3 differs per pair because of the counters
	vmovdqa		%ymm0,%ymm11
	vmovdqa		%ymm1,%ymm12
	vmovdqa		%ymm2,%ymm13
	vmovdqa		%ymm3,%ymm14
	vmovdqa		%ymm7,%ymm15

	vmovdqa		ROT8(%rip),%ymm8
	vmovdqa		ROT16(%rip),%ymm9

	# rcx is needed by rep movsb later on, so track the length in rax
	mov		%rcx,%rax

.Ldoubleround4:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm9,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxor		%ymm4,%ymm7,%ymm7
	vpshufb		%ymm9,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm10
	vpslld		$12,%ymm10,%ymm10
	vpsrld		$20,%ymm1,%ymm1
	vpor		%ymm10,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxor		%ymm6,%ymm5,%ymm5
	vmovdqa		%ymm5,%ymm10
	vpslld		$12,%ymm10,%ymm10
	vpsrld		$20,%ymm5,%ymm5
	vpor		%ymm10,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm8,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxor		%ymm4,%ymm7,%ymm7
	vpshufb		%ymm8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm10
	vpslld		$7,%ymm10,%ymm10
	vpsrld		$25,%ymm1,%ymm1
	vpor		%ymm10,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxor		%ymm6,%ymm5,%ymm5
	vmovdqa		%ymm5,%ymm10
	vpslld		$7,%ymm10,%ymm10
	vpsrld		$25,%ymm5,%ymm5
	vpor		%ymm10,%ymm5,%ymm5

	# rotate rows 1..3 so that the diagonals line up as columns
	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm1,%ymm1
	vpshufd		$0x39,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	vpshufd		$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm3,%ymm3
	vpshufd		$0x93,%ymm7,%ymm7

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm9,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxor		%ymm4,%ymm7,%ymm7
	vpshufb		%ymm9,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm10
	vpslld		$12,%ymm10,%ymm10
	vpsrld		$20,%ymm1,%ymm1
	vpor		%ymm10,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxor		%ymm6,%ymm5,%ymm5
	vmovdqa		%ymm5,%ymm10
	vpslld		$12,%ymm10,%ymm10
	vpsrld		$20,%ymm5,%ymm5
	vpor		%ymm10,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm8,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxor		%ymm4,%ymm7,%ymm7
	vpshufb		%ymm8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm10
	vpslld		$7,%ymm10,%ymm10
	vpsrld		$25,%ymm1,%ymm1
	vpor		%ymm10,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxor		%ymm6,%ymm5,%ymm5
	vmovdqa		%ymm5,%ymm10
	vpslld		$7,%ymm10,%ymm10
	vpsrld		$25,%ymm5,%ymm5
	vpor		%ymm10,%ymm5,%ymm5

	# undo the row rotation
	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm1,%ymm1
	vpshufd		$0x93,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	vpshufd		$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm3,%ymm3
	vpshufd		$0x39,%ymm7,%ymm7

	# one pass is two rounds
	sub		$2,%r8d
	jnz		.Ldoubleround4

	# Output stage. Every step first places the next 16 keystream bytes
	# in xmm10 and only then checks the length, so that .Lxorpart4 finds
	# the chunk for a trailing partial piece in xmm10. The high lanes
	# (second and fourth block) are parked in xmm0..xmm3 and xmm4..xmm7.

	# o0 = i0 ^ (x0 + s0), first block
	vpaddd		%ymm11,%ymm0,%ymm10
	cmp		$0x10,%rax
	jb		.Lxorpart4
	vpxor		0x00(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x00(%rsi)
	vextracti128	$1,%ymm10,%xmm0
	# o1 = i1 ^ (x1 + s1), first block
	vpaddd		%ymm12,%ymm1,%ymm10
	cmp		$0x20,%rax
	jb		.Lxorpart4
	vpxor		0x10(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x10(%rsi)
	vextracti128	$1,%ymm10,%xmm1
	# o2 = i2 ^ (x2 + s2), first block
	vpaddd		%ymm13,%ymm2,%ymm10
	cmp		$0x30,%rax
	jb		.Lxorpart4
	vpxor		0x20(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x20(%rsi)
	vextracti128	$1,%ymm10,%xmm2
	# o3 = i3 ^ (x3 + s3), first block
	vpaddd		%ymm14,%ymm3,%ymm10
	cmp		$0x40,%rax
	jb		.Lxorpart4
	vpxor		0x30(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x30(%rsi)
	vextracti128	$1,%ymm10,%xmm3

	# xor and write second block
	vmovdqa		%xmm0,%xmm10
	cmp		$0x50,%rax
	jb		.Lxorpart4
	vpxor		0x40(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x40(%rsi)

	vmovdqa		%xmm1,%xmm10
	cmp		$0x60,%rax
	jb		.Lxorpart4
	vpxor		0x50(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x50(%rsi)

	vmovdqa		%xmm2,%xmm10
	cmp		$0x70,%rax
	jb		.Lxorpart4
	vpxor		0x60(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x60(%rsi)

	vmovdqa		%xmm3,%xmm10
	cmp		$0x80,%rax
	jb		.Lxorpart4
	vpxor		0x70(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x70(%rsi)

	# o0 = i0 ^ (x0 + s0), third block
	vpaddd		%ymm11,%ymm4,%ymm10
	cmp		$0x90,%rax
	jb		.Lxorpart4
	vpxor		0x80(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x80(%rsi)
	vextracti128	$1,%ymm10,%xmm4
	# o1 = i1 ^ (x1 + s1), third block
	vpaddd		%ymm12,%ymm5,%ymm10
	cmp		$0xa0,%rax
	jb		.Lxorpart4
	vpxor		0x90(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x90(%rsi)
	vextracti128	$1,%ymm10,%xmm5
	# o2 = i2 ^ (x2 + s2), third block
	vpaddd		%ymm13,%ymm6,%ymm10
	cmp		$0xb0,%rax
	jb		.Lxorpart4
	vpxor		0xa0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xa0(%rsi)
	vextracti128	$1,%ymm10,%xmm6
	# o3 = i3 ^ (x3 + s3), third block
	vpaddd		%ymm15,%ymm7,%ymm10
	cmp		$0xc0,%rax
	jb		.Lxorpart4
	vpxor		0xb0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xb0(%rsi)
	vextracti128	$1,%ymm10,%xmm7

	# xor and write fourth block
	vmovdqa		%xmm4,%xmm10
	cmp		$0xd0,%rax
	jb		.Lxorpart4
	vpxor		0xc0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xc0(%rsi)

	vmovdqa		%xmm5,%xmm10
	cmp		$0xe0,%rax
	jb		.Lxorpart4
	vpxor		0xd0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xd0(%rsi)

	vmovdqa		%xmm6,%xmm10
	cmp		$0xf0,%rax
	jb		.Lxorpart4
	vpxor		0xe0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xe0(%rsi)

	vmovdqa		%xmm7,%xmm10
	cmp		$0x100,%rax
	jb		.Lxorpart4
	vpxor		0xf0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xf0(%rsi)

.Ldone4:
	vzeroupper
	RET

.Lxorpart4:
	# xor remaining bytes from partial register into output
	# r9 = number of trailing bytes (length mod 16), done when zero
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone4
	# rax = offset of the partial chunk (length rounded down to 16)
	and		$~0x0f,%rax

	# keep the output pointer, rsi becomes the rep movsb source
	mov		%rsi,%r11

	# carve a 32-byte aligned scratch slot out of the stack; r10 keeps
	# the old stack pointer (+8) for the restore at the end
	lea		8(%rsp),%r10
	sub		$0x10,%rsp
	and		$~31,%rsp

	# copy the trailing input bytes into the scratch slot
	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	# xor the slot with the pending keystream chunk in xmm10
	vpxor		0x00(%rsp),%xmm10,%xmm10
	vmovdqa		%xmm10,0x00(%rsp)

	# copy only the r9 valid result bytes to the output
	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	# restore the original stack pointer
	lea		-8(%r10),%rsp
	jmp		.Ldone4

SYM_FUNC_END(chacha_4block_xor_avx2)
53362306a36Sopenharmony_ci
53462306a36Sopenharmony_ciSYM_FUNC_START(chacha_8block_xor_avx2)
53562306a36Sopenharmony_ci	# %rdi: Input state matrix, s
53662306a36Sopenharmony_ci	# %rsi: up to 8 data blocks output, o
53762306a36Sopenharmony_ci	# %rdx: up to 8 data blocks input, i
53862306a36Sopenharmony_ci	# %rcx: input/output length in bytes
53962306a36Sopenharmony_ci	# %r8d: nrounds
54062306a36Sopenharmony_ci
54162306a36Sopenharmony_ci	# This function encrypts eight consecutive ChaCha blocks by loading
54262306a36Sopenharmony_ci	# the state matrix in AVX registers eight times. As we need some
54362306a36Sopenharmony_ci	# scratch registers, we save the first four registers on the stack. The
54462306a36Sopenharmony_ci	# algorithm performs each operation on the corresponding word of each
54562306a36Sopenharmony_ci	# state matrix, hence requires no word shuffling. For final XORing step
54662306a36Sopenharmony_ci	# we transpose the matrix by interleaving 32-, 64- and then 128-bit
54762306a36Sopenharmony_ci	# words, which allows us to do XOR in AVX registers. 8/16-bit word
54862306a36Sopenharmony_ci	# rotation is done with the slightly better performing byte shuffling,
54962306a36Sopenharmony_ci	# 7/12-bit word rotation uses traditional shift+OR.
55062306a36Sopenharmony_ci
55162306a36Sopenharmony_ci	vzeroupper
55262306a36Sopenharmony_ci	# 4 * 32 byte stack, 32-byte aligned
55362306a36Sopenharmony_ci	lea		8(%rsp),%r10
55462306a36Sopenharmony_ci	and		$~31, %rsp
55562306a36Sopenharmony_ci	sub		$0x80, %rsp
55662306a36Sopenharmony_ci	mov		%rcx,%rax
55762306a36Sopenharmony_ci
55862306a36Sopenharmony_ci	# x0..15[0-7] = s[0..15]
55962306a36Sopenharmony_ci	vpbroadcastd	0x00(%rdi),%ymm0
56062306a36Sopenharmony_ci	vpbroadcastd	0x04(%rdi),%ymm1
56162306a36Sopenharmony_ci	vpbroadcastd	0x08(%rdi),%ymm2
56262306a36Sopenharmony_ci	vpbroadcastd	0x0c(%rdi),%ymm3
56362306a36Sopenharmony_ci	vpbroadcastd	0x10(%rdi),%ymm4
56462306a36Sopenharmony_ci	vpbroadcastd	0x14(%rdi),%ymm5
56562306a36Sopenharmony_ci	vpbroadcastd	0x18(%rdi),%ymm6
56662306a36Sopenharmony_ci	vpbroadcastd	0x1c(%rdi),%ymm7
56762306a36Sopenharmony_ci	vpbroadcastd	0x20(%rdi),%ymm8
56862306a36Sopenharmony_ci	vpbroadcastd	0x24(%rdi),%ymm9
56962306a36Sopenharmony_ci	vpbroadcastd	0x28(%rdi),%ymm10
57062306a36Sopenharmony_ci	vpbroadcastd	0x2c(%rdi),%ymm11
57162306a36Sopenharmony_ci	vpbroadcastd	0x30(%rdi),%ymm12
57262306a36Sopenharmony_ci	vpbroadcastd	0x34(%rdi),%ymm13
57362306a36Sopenharmony_ci	vpbroadcastd	0x38(%rdi),%ymm14
57462306a36Sopenharmony_ci	vpbroadcastd	0x3c(%rdi),%ymm15
57562306a36Sopenharmony_ci	# x0..3 on stack
57662306a36Sopenharmony_ci	vmovdqa		%ymm0,0x00(%rsp)
57762306a36Sopenharmony_ci	vmovdqa		%ymm1,0x20(%rsp)
57862306a36Sopenharmony_ci	vmovdqa		%ymm2,0x40(%rsp)
57962306a36Sopenharmony_ci	vmovdqa		%ymm3,0x60(%rsp)
58062306a36Sopenharmony_ci
58162306a36Sopenharmony_ci	vmovdqa		CTRINC(%rip),%ymm1
58262306a36Sopenharmony_ci	vmovdqa		ROT8(%rip),%ymm2
58362306a36Sopenharmony_ci	vmovdqa		ROT16(%rip),%ymm3
58462306a36Sopenharmony_ci
58562306a36Sopenharmony_ci	# x12 += counter values 0-3
58662306a36Sopenharmony_ci	vpaddd		%ymm1,%ymm12,%ymm12
58762306a36Sopenharmony_ci
58862306a36Sopenharmony_ci.Ldoubleround8:
58962306a36Sopenharmony_ci	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
59062306a36Sopenharmony_ci	vpaddd		0x00(%rsp),%ymm4,%ymm0
59162306a36Sopenharmony_ci	vmovdqa		%ymm0,0x00(%rsp)
59262306a36Sopenharmony_ci	vpxor		%ymm0,%ymm12,%ymm12
59362306a36Sopenharmony_ci	vpshufb		%ymm3,%ymm12,%ymm12
59462306a36Sopenharmony_ci	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
59562306a36Sopenharmony_ci	vpaddd		0x20(%rsp),%ymm5,%ymm0
59662306a36Sopenharmony_ci	vmovdqa		%ymm0,0x20(%rsp)
59762306a36Sopenharmony_ci	vpxor		%ymm0,%ymm13,%ymm13
59862306a36Sopenharmony_ci	vpshufb		%ymm3,%ymm13,%ymm13
59962306a36Sopenharmony_ci	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
60062306a36Sopenharmony_ci	vpaddd		0x40(%rsp),%ymm6,%ymm0
60162306a36Sopenharmony_ci	vmovdqa		%ymm0,0x40(%rsp)
60262306a36Sopenharmony_ci	vpxor		%ymm0,%ymm14,%ymm14
60362306a36Sopenharmony_ci	vpshufb		%ymm3,%ymm14,%ymm14
60462306a36Sopenharmony_ci	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
60562306a36Sopenharmony_ci	vpaddd		0x60(%rsp),%ymm7,%ymm0
60662306a36Sopenharmony_ci	vmovdqa		%ymm0,0x60(%rsp)
60762306a36Sopenharmony_ci	vpxor		%ymm0,%ymm15,%ymm15
60862306a36Sopenharmony_ci	vpshufb		%ymm3,%ymm15,%ymm15
60962306a36Sopenharmony_ci
61062306a36Sopenharmony_ci	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
61162306a36Sopenharmony_ci	vpaddd		%ymm12,%ymm8,%ymm8
61262306a36Sopenharmony_ci	vpxor		%ymm8,%ymm4,%ymm4
61362306a36Sopenharmony_ci	vpslld		$12,%ymm4,%ymm0
61462306a36Sopenharmony_ci	vpsrld		$20,%ymm4,%ymm4
61562306a36Sopenharmony_ci	vpor		%ymm0,%ymm4,%ymm4
61662306a36Sopenharmony_ci	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
61762306a36Sopenharmony_ci	vpaddd		%ymm13,%ymm9,%ymm9
61862306a36Sopenharmony_ci	vpxor		%ymm9,%ymm5,%ymm5
61962306a36Sopenharmony_ci	vpslld		$12,%ymm5,%ymm0
62062306a36Sopenharmony_ci	vpsrld		$20,%ymm5,%ymm5
62162306a36Sopenharmony_ci	vpor		%ymm0,%ymm5,%ymm5
62262306a36Sopenharmony_ci	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
62362306a36Sopenharmony_ci	vpaddd		%ymm14,%ymm10,%ymm10
62462306a36Sopenharmony_ci	vpxor		%ymm10,%ymm6,%ymm6
62562306a36Sopenharmony_ci	vpslld		$12,%ymm6,%ymm0
62662306a36Sopenharmony_ci	vpsrld		$20,%ymm6,%ymm6
62762306a36Sopenharmony_ci	vpor		%ymm0,%ymm6,%ymm6
62862306a36Sopenharmony_ci	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
62962306a36Sopenharmony_ci	vpaddd		%ymm15,%ymm11,%ymm11
63062306a36Sopenharmony_ci	vpxor		%ymm11,%ymm7,%ymm7
63162306a36Sopenharmony_ci	vpslld		$12,%ymm7,%ymm0
63262306a36Sopenharmony_ci	vpsrld		$20,%ymm7,%ymm7
63362306a36Sopenharmony_ci	vpor		%ymm0,%ymm7,%ymm7
63462306a36Sopenharmony_ci
63562306a36Sopenharmony_ci	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
63662306a36Sopenharmony_ci	vpaddd		0x00(%rsp),%ymm4,%ymm0
63762306a36Sopenharmony_ci	vmovdqa		%ymm0,0x00(%rsp)
63862306a36Sopenharmony_ci	vpxor		%ymm0,%ymm12,%ymm12
63962306a36Sopenharmony_ci	vpshufb		%ymm2,%ymm12,%ymm12
64062306a36Sopenharmony_ci	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
64162306a36Sopenharmony_ci	vpaddd		0x20(%rsp),%ymm5,%ymm0
64262306a36Sopenharmony_ci	vmovdqa		%ymm0,0x20(%rsp)
64362306a36Sopenharmony_ci	vpxor		%ymm0,%ymm13,%ymm13
64462306a36Sopenharmony_ci	vpshufb		%ymm2,%ymm13,%ymm13
64562306a36Sopenharmony_ci	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
64662306a36Sopenharmony_ci	vpaddd		0x40(%rsp),%ymm6,%ymm0
64762306a36Sopenharmony_ci	vmovdqa		%ymm0,0x40(%rsp)
64862306a36Sopenharmony_ci	vpxor		%ymm0,%ymm14,%ymm14
64962306a36Sopenharmony_ci	vpshufb		%ymm2,%ymm14,%ymm14
65062306a36Sopenharmony_ci	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
65162306a36Sopenharmony_ci	vpaddd		0x60(%rsp),%ymm7,%ymm0
65262306a36Sopenharmony_ci	vmovdqa		%ymm0,0x60(%rsp)
65362306a36Sopenharmony_ci	vpxor		%ymm0,%ymm15,%ymm15
65462306a36Sopenharmony_ci	vpshufb		%ymm2,%ymm15,%ymm15
65562306a36Sopenharmony_ci
65662306a36Sopenharmony_ci	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
65762306a36Sopenharmony_ci	vpaddd		%ymm12,%ymm8,%ymm8
65862306a36Sopenharmony_ci	vpxor		%ymm8,%ymm4,%ymm4
65962306a36Sopenharmony_ci	vpslld		$7,%ymm4,%ymm0
66062306a36Sopenharmony_ci	vpsrld		$25,%ymm4,%ymm4
66162306a36Sopenharmony_ci	vpor		%ymm0,%ymm4,%ymm4
66262306a36Sopenharmony_ci	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
66362306a36Sopenharmony_ci	vpaddd		%ymm13,%ymm9,%ymm9
66462306a36Sopenharmony_ci	vpxor		%ymm9,%ymm5,%ymm5
66562306a36Sopenharmony_ci	vpslld		$7,%ymm5,%ymm0
66662306a36Sopenharmony_ci	vpsrld		$25,%ymm5,%ymm5
66762306a36Sopenharmony_ci	vpor		%ymm0,%ymm5,%ymm5
66862306a36Sopenharmony_ci	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
66962306a36Sopenharmony_ci	vpaddd		%ymm14,%ymm10,%ymm10
67062306a36Sopenharmony_ci	vpxor		%ymm10,%ymm6,%ymm6
67162306a36Sopenharmony_ci	vpslld		$7,%ymm6,%ymm0
67262306a36Sopenharmony_ci	vpsrld		$25,%ymm6,%ymm6
67362306a36Sopenharmony_ci	vpor		%ymm0,%ymm6,%ymm6
67462306a36Sopenharmony_ci	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
67562306a36Sopenharmony_ci	vpaddd		%ymm15,%ymm11,%ymm11
67662306a36Sopenharmony_ci	vpxor		%ymm11,%ymm7,%ymm7
67762306a36Sopenharmony_ci	vpslld		$7,%ymm7,%ymm0
67862306a36Sopenharmony_ci	vpsrld		$25,%ymm7,%ymm7
67962306a36Sopenharmony_ci	vpor		%ymm0,%ymm7,%ymm7
68062306a36Sopenharmony_ci
68162306a36Sopenharmony_ci	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
68262306a36Sopenharmony_ci	vpaddd		0x00(%rsp),%ymm5,%ymm0
68362306a36Sopenharmony_ci	vmovdqa		%ymm0,0x00(%rsp)
68462306a36Sopenharmony_ci	vpxor		%ymm0,%ymm15,%ymm15
68562306a36Sopenharmony_ci	vpshufb		%ymm3,%ymm15,%ymm15
68662306a36Sopenharmony_ci	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
68762306a36Sopenharmony_ci	vpaddd		0x20(%rsp),%ymm6,%ymm0
68862306a36Sopenharmony_ci	vmovdqa		%ymm0,0x20(%rsp)
68962306a36Sopenharmony_ci	vpxor		%ymm0,%ymm12,%ymm12
69062306a36Sopenharmony_ci	vpshufb		%ymm3,%ymm12,%ymm12
69162306a36Sopenharmony_ci	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
69262306a36Sopenharmony_ci	vpaddd		0x40(%rsp),%ymm7,%ymm0
69362306a36Sopenharmony_ci	vmovdqa		%ymm0,0x40(%rsp)
69462306a36Sopenharmony_ci	vpxor		%ymm0,%ymm13,%ymm13
69562306a36Sopenharmony_ci	vpshufb		%ymm3,%ymm13,%ymm13
69662306a36Sopenharmony_ci	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
69762306a36Sopenharmony_ci	vpaddd		0x60(%rsp),%ymm4,%ymm0
69862306a36Sopenharmony_ci	vmovdqa		%ymm0,0x60(%rsp)
69962306a36Sopenharmony_ci	vpxor		%ymm0,%ymm14,%ymm14
70062306a36Sopenharmony_ci	vpshufb		%ymm3,%ymm14,%ymm14
70162306a36Sopenharmony_ci
70262306a36Sopenharmony_ci	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
70362306a36Sopenharmony_ci	vpaddd		%ymm15,%ymm10,%ymm10
70462306a36Sopenharmony_ci	vpxor		%ymm10,%ymm5,%ymm5
70562306a36Sopenharmony_ci	vpslld		$12,%ymm5,%ymm0
70662306a36Sopenharmony_ci	vpsrld		$20,%ymm5,%ymm5
70762306a36Sopenharmony_ci	vpor		%ymm0,%ymm5,%ymm5
70862306a36Sopenharmony_ci	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
70962306a36Sopenharmony_ci	vpaddd		%ymm12,%ymm11,%ymm11
71062306a36Sopenharmony_ci	vpxor		%ymm11,%ymm6,%ymm6
71162306a36Sopenharmony_ci	vpslld		$12,%ymm6,%ymm0
71262306a36Sopenharmony_ci	vpsrld		$20,%ymm6,%ymm6
71362306a36Sopenharmony_ci	vpor		%ymm0,%ymm6,%ymm6
71462306a36Sopenharmony_ci	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
71562306a36Sopenharmony_ci	vpaddd		%ymm13,%ymm8,%ymm8
71662306a36Sopenharmony_ci	vpxor		%ymm8,%ymm7,%ymm7
71762306a36Sopenharmony_ci	vpslld		$12,%ymm7,%ymm0
71862306a36Sopenharmony_ci	vpsrld		$20,%ymm7,%ymm7
71962306a36Sopenharmony_ci	vpor		%ymm0,%ymm7,%ymm7
72062306a36Sopenharmony_ci	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
72162306a36Sopenharmony_ci	vpaddd		%ymm14,%ymm9,%ymm9
72262306a36Sopenharmony_ci	vpxor		%ymm9,%ymm4,%ymm4
72362306a36Sopenharmony_ci	vpslld		$12,%ymm4,%ymm0
72462306a36Sopenharmony_ci	vpsrld		$20,%ymm4,%ymm4
72562306a36Sopenharmony_ci	vpor		%ymm0,%ymm4,%ymm4
72662306a36Sopenharmony_ci
72762306a36Sopenharmony_ci	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
72862306a36Sopenharmony_ci	vpaddd		0x00(%rsp),%ymm5,%ymm0
72962306a36Sopenharmony_ci	vmovdqa		%ymm0,0x00(%rsp)
73062306a36Sopenharmony_ci	vpxor		%ymm0,%ymm15,%ymm15
73162306a36Sopenharmony_ci	vpshufb		%ymm2,%ymm15,%ymm15
73262306a36Sopenharmony_ci	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
73362306a36Sopenharmony_ci	vpaddd		0x20(%rsp),%ymm6,%ymm0
73462306a36Sopenharmony_ci	vmovdqa		%ymm0,0x20(%rsp)
73562306a36Sopenharmony_ci	vpxor		%ymm0,%ymm12,%ymm12
73662306a36Sopenharmony_ci	vpshufb		%ymm2,%ymm12,%ymm12
73762306a36Sopenharmony_ci	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
73862306a36Sopenharmony_ci	vpaddd		0x40(%rsp),%ymm7,%ymm0
73962306a36Sopenharmony_ci	vmovdqa		%ymm0,0x40(%rsp)
74062306a36Sopenharmony_ci	vpxor		%ymm0,%ymm13,%ymm13
74162306a36Sopenharmony_ci	vpshufb		%ymm2,%ymm13,%ymm13
74262306a36Sopenharmony_ci	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
74362306a36Sopenharmony_ci	vpaddd		0x60(%rsp),%ymm4,%ymm0
74462306a36Sopenharmony_ci	vmovdqa		%ymm0,0x60(%rsp)
74562306a36Sopenharmony_ci	vpxor		%ymm0,%ymm14,%ymm14
74662306a36Sopenharmony_ci	vpshufb		%ymm2,%ymm14,%ymm14
74762306a36Sopenharmony_ci
74862306a36Sopenharmony_ci	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
74962306a36Sopenharmony_ci	vpaddd		%ymm15,%ymm10,%ymm10
75062306a36Sopenharmony_ci	vpxor		%ymm10,%ymm5,%ymm5
75162306a36Sopenharmony_ci	vpslld		$7,%ymm5,%ymm0
75262306a36Sopenharmony_ci	vpsrld		$25,%ymm5,%ymm5
75362306a36Sopenharmony_ci	vpor		%ymm0,%ymm5,%ymm5
75462306a36Sopenharmony_ci	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
75562306a36Sopenharmony_ci	vpaddd		%ymm12,%ymm11,%ymm11
75662306a36Sopenharmony_ci	vpxor		%ymm11,%ymm6,%ymm6
75762306a36Sopenharmony_ci	vpslld		$7,%ymm6,%ymm0
75862306a36Sopenharmony_ci	vpsrld		$25,%ymm6,%ymm6
75962306a36Sopenharmony_ci	vpor		%ymm0,%ymm6,%ymm6
76062306a36Sopenharmony_ci	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
76162306a36Sopenharmony_ci	vpaddd		%ymm13,%ymm8,%ymm8
76262306a36Sopenharmony_ci	vpxor		%ymm8,%ymm7,%ymm7
76362306a36Sopenharmony_ci	vpslld		$7,%ymm7,%ymm0
76462306a36Sopenharmony_ci	vpsrld		$25,%ymm7,%ymm7
76562306a36Sopenharmony_ci	vpor		%ymm0,%ymm7,%ymm7
76662306a36Sopenharmony_ci	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
76762306a36Sopenharmony_ci	vpaddd		%ymm14,%ymm9,%ymm9
76862306a36Sopenharmony_ci	vpxor		%ymm9,%ymm4,%ymm4
76962306a36Sopenharmony_ci	vpslld		$7,%ymm4,%ymm0
77062306a36Sopenharmony_ci	vpsrld		$25,%ymm4,%ymm4
77162306a36Sopenharmony_ci	vpor		%ymm0,%ymm4,%ymm4
77262306a36Sopenharmony_ci
77362306a36Sopenharmony_ci	sub		$2,%r8d
77462306a36Sopenharmony_ci	jnz		.Ldoubleround8
77562306a36Sopenharmony_ci
77662306a36Sopenharmony_ci	# x0..15[0-7] += s[0..15]
77762306a36Sopenharmony_ci	vpbroadcastd	0x00(%rdi),%ymm0
77862306a36Sopenharmony_ci	vpaddd		0x00(%rsp),%ymm0,%ymm0
77962306a36Sopenharmony_ci	vmovdqa		%ymm0,0x00(%rsp)
78062306a36Sopenharmony_ci	vpbroadcastd	0x04(%rdi),%ymm0
78162306a36Sopenharmony_ci	vpaddd		0x20(%rsp),%ymm0,%ymm0
78262306a36Sopenharmony_ci	vmovdqa		%ymm0,0x20(%rsp)
78362306a36Sopenharmony_ci	vpbroadcastd	0x08(%rdi),%ymm0
78462306a36Sopenharmony_ci	vpaddd		0x40(%rsp),%ymm0,%ymm0
78562306a36Sopenharmony_ci	vmovdqa		%ymm0,0x40(%rsp)
78662306a36Sopenharmony_ci	vpbroadcastd	0x0c(%rdi),%ymm0
78762306a36Sopenharmony_ci	vpaddd		0x60(%rsp),%ymm0,%ymm0
78862306a36Sopenharmony_ci	vmovdqa		%ymm0,0x60(%rsp)
78962306a36Sopenharmony_ci	vpbroadcastd	0x10(%rdi),%ymm0
79062306a36Sopenharmony_ci	vpaddd		%ymm0,%ymm4,%ymm4
79162306a36Sopenharmony_ci	vpbroadcastd	0x14(%rdi),%ymm0
79262306a36Sopenharmony_ci	vpaddd		%ymm0,%ymm5,%ymm5
79362306a36Sopenharmony_ci	vpbroadcastd	0x18(%rdi),%ymm0
79462306a36Sopenharmony_ci	vpaddd		%ymm0,%ymm6,%ymm6
79562306a36Sopenharmony_ci	vpbroadcastd	0x1c(%rdi),%ymm0
79662306a36Sopenharmony_ci	vpaddd		%ymm0,%ymm7,%ymm7
79762306a36Sopenharmony_ci	vpbroadcastd	0x20(%rdi),%ymm0
79862306a36Sopenharmony_ci	vpaddd		%ymm0,%ymm8,%ymm8
79962306a36Sopenharmony_ci	vpbroadcastd	0x24(%rdi),%ymm0
80062306a36Sopenharmony_ci	vpaddd		%ymm0,%ymm9,%ymm9
80162306a36Sopenharmony_ci	vpbroadcastd	0x28(%rdi),%ymm0
80262306a36Sopenharmony_ci	vpaddd		%ymm0,%ymm10,%ymm10
80362306a36Sopenharmony_ci	vpbroadcastd	0x2c(%rdi),%ymm0
80462306a36Sopenharmony_ci	vpaddd		%ymm0,%ymm11,%ymm11
80562306a36Sopenharmony_ci	vpbroadcastd	0x30(%rdi),%ymm0
80662306a36Sopenharmony_ci	vpaddd		%ymm0,%ymm12,%ymm12
80762306a36Sopenharmony_ci	vpbroadcastd	0x34(%rdi),%ymm0
80862306a36Sopenharmony_ci	vpaddd		%ymm0,%ymm13,%ymm13
80962306a36Sopenharmony_ci	vpbroadcastd	0x38(%rdi),%ymm0
81062306a36Sopenharmony_ci	vpaddd		%ymm0,%ymm14,%ymm14
81162306a36Sopenharmony_ci	vpbroadcastd	0x3c(%rdi),%ymm0
81262306a36Sopenharmony_ci	vpaddd		%ymm0,%ymm15,%ymm15
81362306a36Sopenharmony_ci
81462306a36Sopenharmony_ci	# x12 += counter values 0-7
81562306a36Sopenharmony_ci	vpaddd		%ymm1,%ymm12,%ymm12
81662306a36Sopenharmony_ci
81762306a36Sopenharmony_ci	# interleave 32-bit words in state n, n+1
81862306a36Sopenharmony_ci	vmovdqa		0x00(%rsp),%ymm0
81962306a36Sopenharmony_ci	vmovdqa		0x20(%rsp),%ymm1
82062306a36Sopenharmony_ci	vpunpckldq	%ymm1,%ymm0,%ymm2
82162306a36Sopenharmony_ci	vpunpckhdq	%ymm1,%ymm0,%ymm1
82262306a36Sopenharmony_ci	vmovdqa		%ymm2,0x00(%rsp)
82362306a36Sopenharmony_ci	vmovdqa		%ymm1,0x20(%rsp)
82462306a36Sopenharmony_ci	vmovdqa		0x40(%rsp),%ymm0
82562306a36Sopenharmony_ci	vmovdqa		0x60(%rsp),%ymm1
82662306a36Sopenharmony_ci	vpunpckldq	%ymm1,%ymm0,%ymm2
82762306a36Sopenharmony_ci	vpunpckhdq	%ymm1,%ymm0,%ymm1
82862306a36Sopenharmony_ci	vmovdqa		%ymm2,0x40(%rsp)
82962306a36Sopenharmony_ci	vmovdqa		%ymm1,0x60(%rsp)
83062306a36Sopenharmony_ci	vmovdqa		%ymm4,%ymm0
83162306a36Sopenharmony_ci	vpunpckldq	%ymm5,%ymm0,%ymm4
83262306a36Sopenharmony_ci	vpunpckhdq	%ymm5,%ymm0,%ymm5
83362306a36Sopenharmony_ci	vmovdqa		%ymm6,%ymm0
83462306a36Sopenharmony_ci	vpunpckldq	%ymm7,%ymm0,%ymm6
83562306a36Sopenharmony_ci	vpunpckhdq	%ymm7,%ymm0,%ymm7
83662306a36Sopenharmony_ci	vmovdqa		%ymm8,%ymm0
83762306a36Sopenharmony_ci	vpunpckldq	%ymm9,%ymm0,%ymm8
83862306a36Sopenharmony_ci	vpunpckhdq	%ymm9,%ymm0,%ymm9
83962306a36Sopenharmony_ci	vmovdqa		%ymm10,%ymm0
84062306a36Sopenharmony_ci	vpunpckldq	%ymm11,%ymm0,%ymm10
84162306a36Sopenharmony_ci	vpunpckhdq	%ymm11,%ymm0,%ymm11
84262306a36Sopenharmony_ci	vmovdqa		%ymm12,%ymm0
84362306a36Sopenharmony_ci	vpunpckldq	%ymm13,%ymm0,%ymm12
84462306a36Sopenharmony_ci	vpunpckhdq	%ymm13,%ymm0,%ymm13
84562306a36Sopenharmony_ci	vmovdqa		%ymm14,%ymm0
84662306a36Sopenharmony_ci	vpunpckldq	%ymm15,%ymm0,%ymm14
84762306a36Sopenharmony_ci	vpunpckhdq	%ymm15,%ymm0,%ymm15
84862306a36Sopenharmony_ci
84962306a36Sopenharmony_ci	# interleave 64-bit words in state n, n+2
85062306a36Sopenharmony_ci	vmovdqa		0x00(%rsp),%ymm0
85162306a36Sopenharmony_ci	vmovdqa		0x40(%rsp),%ymm2
85262306a36Sopenharmony_ci	vpunpcklqdq	%ymm2,%ymm0,%ymm1
85362306a36Sopenharmony_ci	vpunpckhqdq	%ymm2,%ymm0,%ymm2
85462306a36Sopenharmony_ci	vmovdqa		%ymm1,0x00(%rsp)
85562306a36Sopenharmony_ci	vmovdqa		%ymm2,0x40(%rsp)
85662306a36Sopenharmony_ci	vmovdqa		0x20(%rsp),%ymm0
85762306a36Sopenharmony_ci	vmovdqa		0x60(%rsp),%ymm2
85862306a36Sopenharmony_ci	vpunpcklqdq	%ymm2,%ymm0,%ymm1
85962306a36Sopenharmony_ci	vpunpckhqdq	%ymm2,%ymm0,%ymm2
86062306a36Sopenharmony_ci	vmovdqa		%ymm1,0x20(%rsp)
86162306a36Sopenharmony_ci	vmovdqa		%ymm2,0x60(%rsp)
86262306a36Sopenharmony_ci	vmovdqa		%ymm4,%ymm0
86362306a36Sopenharmony_ci	vpunpcklqdq	%ymm6,%ymm0,%ymm4
86462306a36Sopenharmony_ci	vpunpckhqdq	%ymm6,%ymm0,%ymm6
86562306a36Sopenharmony_ci	vmovdqa		%ymm5,%ymm0
86662306a36Sopenharmony_ci	vpunpcklqdq	%ymm7,%ymm0,%ymm5
86762306a36Sopenharmony_ci	vpunpckhqdq	%ymm7,%ymm0,%ymm7
86862306a36Sopenharmony_ci	vmovdqa		%ymm8,%ymm0
86962306a36Sopenharmony_ci	vpunpcklqdq	%ymm10,%ymm0,%ymm8
87062306a36Sopenharmony_ci	vpunpckhqdq	%ymm10,%ymm0,%ymm10
87162306a36Sopenharmony_ci	vmovdqa		%ymm9,%ymm0
87262306a36Sopenharmony_ci	vpunpcklqdq	%ymm11,%ymm0,%ymm9
87362306a36Sopenharmony_ci	vpunpckhqdq	%ymm11,%ymm0,%ymm11
87462306a36Sopenharmony_ci	vmovdqa		%ymm12,%ymm0
87562306a36Sopenharmony_ci	vpunpcklqdq	%ymm14,%ymm0,%ymm12
87662306a36Sopenharmony_ci	vpunpckhqdq	%ymm14,%ymm0,%ymm14
87762306a36Sopenharmony_ci	vmovdqa		%ymm13,%ymm0
87862306a36Sopenharmony_ci	vpunpcklqdq	%ymm15,%ymm0,%ymm13
87962306a36Sopenharmony_ci	vpunpckhqdq	%ymm15,%ymm0,%ymm15
88062306a36Sopenharmony_ci
88162306a36Sopenharmony_ci	# interleave 128-bit words in state n, n+4
88262306a36Sopenharmony_ci	# xor/write first four blocks
88362306a36Sopenharmony_ci	vmovdqa		0x00(%rsp),%ymm1
88462306a36Sopenharmony_ci	vperm2i128	$0x20,%ymm4,%ymm1,%ymm0
88562306a36Sopenharmony_ci	cmp		$0x0020,%rax
88662306a36Sopenharmony_ci	jl		.Lxorpart8
88762306a36Sopenharmony_ci	vpxor		0x0000(%rdx),%ymm0,%ymm0
88862306a36Sopenharmony_ci	vmovdqu		%ymm0,0x0000(%rsi)
88962306a36Sopenharmony_ci	vperm2i128	$0x31,%ymm4,%ymm1,%ymm4
89062306a36Sopenharmony_ci
89162306a36Sopenharmony_ci	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
89262306a36Sopenharmony_ci	cmp		$0x0040,%rax
89362306a36Sopenharmony_ci	jl		.Lxorpart8
89462306a36Sopenharmony_ci	vpxor		0x0020(%rdx),%ymm0,%ymm0
89562306a36Sopenharmony_ci	vmovdqu		%ymm0,0x0020(%rsi)
89662306a36Sopenharmony_ci	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12
89762306a36Sopenharmony_ci
89862306a36Sopenharmony_ci	vmovdqa		0x40(%rsp),%ymm1
89962306a36Sopenharmony_ci	vperm2i128	$0x20,%ymm6,%ymm1,%ymm0
90062306a36Sopenharmony_ci	cmp		$0x0060,%rax
90162306a36Sopenharmony_ci	jl		.Lxorpart8
90262306a36Sopenharmony_ci	vpxor		0x0040(%rdx),%ymm0,%ymm0
90362306a36Sopenharmony_ci	vmovdqu		%ymm0,0x0040(%rsi)
90462306a36Sopenharmony_ci	vperm2i128	$0x31,%ymm6,%ymm1,%ymm6
90562306a36Sopenharmony_ci
90662306a36Sopenharmony_ci	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
90762306a36Sopenharmony_ci	cmp		$0x0080,%rax
90862306a36Sopenharmony_ci	jl		.Lxorpart8
90962306a36Sopenharmony_ci	vpxor		0x0060(%rdx),%ymm0,%ymm0
91062306a36Sopenharmony_ci	vmovdqu		%ymm0,0x0060(%rsi)
91162306a36Sopenharmony_ci	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14
91262306a36Sopenharmony_ci
91362306a36Sopenharmony_ci	vmovdqa		0x20(%rsp),%ymm1
91462306a36Sopenharmony_ci	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
91562306a36Sopenharmony_ci	cmp		$0x00a0,%rax
91662306a36Sopenharmony_ci	jl		.Lxorpart8
91762306a36Sopenharmony_ci	vpxor		0x0080(%rdx),%ymm0,%ymm0
91862306a36Sopenharmony_ci	vmovdqu		%ymm0,0x0080(%rsi)
91962306a36Sopenharmony_ci	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5
92062306a36Sopenharmony_ci
92162306a36Sopenharmony_ci	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
92262306a36Sopenharmony_ci	cmp		$0x00c0,%rax
92362306a36Sopenharmony_ci	jl		.Lxorpart8
92462306a36Sopenharmony_ci	vpxor		0x00a0(%rdx),%ymm0,%ymm0
92562306a36Sopenharmony_ci	vmovdqu		%ymm0,0x00a0(%rsi)
92662306a36Sopenharmony_ci	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13
92762306a36Sopenharmony_ci
92862306a36Sopenharmony_ci	vmovdqa		0x60(%rsp),%ymm1
92962306a36Sopenharmony_ci	vperm2i128	$0x20,%ymm7,%ymm1,%ymm0
93062306a36Sopenharmony_ci	cmp		$0x00e0,%rax
93162306a36Sopenharmony_ci	jl		.Lxorpart8
93262306a36Sopenharmony_ci	vpxor		0x00c0(%rdx),%ymm0,%ymm0
93362306a36Sopenharmony_ci	vmovdqu		%ymm0,0x00c0(%rsi)
93462306a36Sopenharmony_ci	vperm2i128	$0x31,%ymm7,%ymm1,%ymm7
93562306a36Sopenharmony_ci
93662306a36Sopenharmony_ci	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
93762306a36Sopenharmony_ci	cmp		$0x0100,%rax
93862306a36Sopenharmony_ci	jl		.Lxorpart8
93962306a36Sopenharmony_ci	vpxor		0x00e0(%rdx),%ymm0,%ymm0
94062306a36Sopenharmony_ci	vmovdqu		%ymm0,0x00e0(%rsi)
94162306a36Sopenharmony_ci	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15
94262306a36Sopenharmony_ci
94362306a36Sopenharmony_ci	# xor remaining blocks, write to output
94462306a36Sopenharmony_ci	vmovdqa		%ymm4,%ymm0
94562306a36Sopenharmony_ci	cmp		$0x0120,%rax
94662306a36Sopenharmony_ci	jl		.Lxorpart8
94762306a36Sopenharmony_ci	vpxor		0x0100(%rdx),%ymm0,%ymm0
94862306a36Sopenharmony_ci	vmovdqu		%ymm0,0x0100(%rsi)
94962306a36Sopenharmony_ci
95062306a36Sopenharmony_ci	vmovdqa		%ymm12,%ymm0
95162306a36Sopenharmony_ci	cmp		$0x0140,%rax
95262306a36Sopenharmony_ci	jl		.Lxorpart8
95362306a36Sopenharmony_ci	vpxor		0x0120(%rdx),%ymm0,%ymm0
95462306a36Sopenharmony_ci	vmovdqu		%ymm0,0x0120(%rsi)
95562306a36Sopenharmony_ci
95662306a36Sopenharmony_ci	vmovdqa		%ymm6,%ymm0
95762306a36Sopenharmony_ci	cmp		$0x0160,%rax
95862306a36Sopenharmony_ci	jl		.Lxorpart8
95962306a36Sopenharmony_ci	vpxor		0x0140(%rdx),%ymm0,%ymm0
96062306a36Sopenharmony_ci	vmovdqu		%ymm0,0x0140(%rsi)
96162306a36Sopenharmony_ci
96262306a36Sopenharmony_ci	vmovdqa		%ymm14,%ymm0
96362306a36Sopenharmony_ci	cmp		$0x0180,%rax
96462306a36Sopenharmony_ci	jl		.Lxorpart8
96562306a36Sopenharmony_ci	vpxor		0x0160(%rdx),%ymm0,%ymm0
96662306a36Sopenharmony_ci	vmovdqu		%ymm0,0x0160(%rsi)
96762306a36Sopenharmony_ci
96862306a36Sopenharmony_ci	vmovdqa		%ymm5,%ymm0
96962306a36Sopenharmony_ci	cmp		$0x01a0,%rax
97062306a36Sopenharmony_ci	jl		.Lxorpart8
97162306a36Sopenharmony_ci	vpxor		0x0180(%rdx),%ymm0,%ymm0
97262306a36Sopenharmony_ci	vmovdqu		%ymm0,0x0180(%rsi)
97362306a36Sopenharmony_ci
97462306a36Sopenharmony_ci	vmovdqa		%ymm13,%ymm0
97562306a36Sopenharmony_ci	cmp		$0x01c0,%rax
97662306a36Sopenharmony_ci	jl		.Lxorpart8
97762306a36Sopenharmony_ci	vpxor		0x01a0(%rdx),%ymm0,%ymm0
97862306a36Sopenharmony_ci	vmovdqu		%ymm0,0x01a0(%rsi)
97962306a36Sopenharmony_ci
98062306a36Sopenharmony_ci	vmovdqa		%ymm7,%ymm0
98162306a36Sopenharmony_ci	cmp		$0x01e0,%rax
98262306a36Sopenharmony_ci	jl		.Lxorpart8
98362306a36Sopenharmony_ci	vpxor		0x01c0(%rdx),%ymm0,%ymm0
98462306a36Sopenharmony_ci	vmovdqu		%ymm0,0x01c0(%rsi)
98562306a36Sopenharmony_ci
98662306a36Sopenharmony_ci	vmovdqa		%ymm15,%ymm0
98762306a36Sopenharmony_ci	cmp		$0x0200,%rax
98862306a36Sopenharmony_ci	jl		.Lxorpart8
98962306a36Sopenharmony_ci	vpxor		0x01e0(%rdx),%ymm0,%ymm0
99062306a36Sopenharmony_ci	vmovdqu		%ymm0,0x01e0(%rsi)
99162306a36Sopenharmony_ci
99262306a36Sopenharmony_ci.Ldone8:
99362306a36Sopenharmony_ci	vzeroupper
99462306a36Sopenharmony_ci	lea		-8(%r10),%rsp
99562306a36Sopenharmony_ci	RET
99662306a36Sopenharmony_ci
99762306a36Sopenharmony_ci.Lxorpart8:
99862306a36Sopenharmony_ci	# xor remaining bytes from partial register into output
99962306a36Sopenharmony_ci	mov		%rax,%r9
100062306a36Sopenharmony_ci	and		$0x1f,%r9
100162306a36Sopenharmony_ci	jz		.Ldone8
100262306a36Sopenharmony_ci	and		$~0x1f,%rax
100362306a36Sopenharmony_ci
100462306a36Sopenharmony_ci	mov		%rsi,%r11
100562306a36Sopenharmony_ci
100662306a36Sopenharmony_ci	lea		(%rdx,%rax),%rsi
100762306a36Sopenharmony_ci	mov		%rsp,%rdi
100862306a36Sopenharmony_ci	mov		%r9,%rcx
100962306a36Sopenharmony_ci	rep movsb
101062306a36Sopenharmony_ci
101162306a36Sopenharmony_ci	vpxor		0x00(%rsp),%ymm0,%ymm0
101262306a36Sopenharmony_ci	vmovdqa		%ymm0,0x00(%rsp)
101362306a36Sopenharmony_ci
101462306a36Sopenharmony_ci	mov		%rsp,%rsi
101562306a36Sopenharmony_ci	lea		(%r11,%rax),%rdi
101662306a36Sopenharmony_ci	mov		%r9,%rcx
101762306a36Sopenharmony_ci	rep movsb
101862306a36Sopenharmony_ci
101962306a36Sopenharmony_ci	jmp		.Ldone8
102062306a36Sopenharmony_ci
102162306a36Sopenharmony_ciSYM_FUNC_END(chacha_8block_xor_avx2)
1022