18c2ecf20Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-or-later */
28c2ecf20Sopenharmony_ci/*
38c2ecf20Sopenharmony_ci * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
48c2ecf20Sopenharmony_ci *
58c2ecf20Sopenharmony_ci * Copyright (C) 2015 Martin Willi
68c2ecf20Sopenharmony_ci */
78c2ecf20Sopenharmony_ci
88c2ecf20Sopenharmony_ci#include <linux/linkage.h>
98c2ecf20Sopenharmony_ci#include <asm/frame.h>
108c2ecf20Sopenharmony_ci
/*
 * Constants used by the SSSE3 ChaCha code.  Each one sits in its own
 * mergeable 16-byte read-only section and is 16-byte aligned, because the
 * code loads them with movdqa.
 */

/* ROT8: pshufb byte-selection mask, rotates every 32-bit lane left by 8 */
.section	.rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003

/* ROT16: pshufb byte-selection mask, rotates every 32-bit lane left by 16 */
.section	.rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302

/*
 * CTRINC: 32-bit lanes hold 0, 1, 2, 3 (lowest lane first).  Added to the
 * x12 word in chacha_4block_xor_ssse3 ("counter values 0-3").
 */
.section	.rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC:	.octa 0x00000003000000020000000100000000

.text

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3 (one
 * matrix row per register).  This function performs matrix operations on four
 * words in parallel, but requires shuffling to rearrange the words after each
 * round.  8/16-bit word rotation is done with the slightly better performing
 * SSSE3 byte shuffling, 7/12-bit word rotation uses traditional shift+OR.
 *
 * The round count is given in %r8d.  Each loop iteration performs two rounds
 * and subtracts 2 from %r8d, leaving the loop only when the result is exactly
 * zero, so the count must be even and non-zero.
 *
 * In:       %xmm0-%xmm3 = state rows 0-3, %r8d = round count
 * Out:      %xmm0-%xmm3 = permuted state rows 0-3
 * Clobbers: %r8d, %xmm4-%xmm7
 */
SYM_FUNC_START_LOCAL(chacha_permute)

	# pshufb masks: %xmm4 rotates lanes left by 8, %xmm5 by 16
	movdqa		ROT8(%rip),%xmm4
	movdqa		ROT16(%rip),%xmm5

.Ldoubleround:
	# ---- first round of the pair: rows as loaded ----

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm6
	pslld		$12,%xmm6
	psrld		$20,%xmm1
	por		%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm7
	pslld		$7,%xmm7
	psrld		$25,%xmm1
	por		%xmm7,%xmm1

	# Rotate rows 1, 2, 3 by one, two and three lanes so that the same
	# lane-wise steps below operate on the diagonals of the matrix.
	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm3,%xmm3

	# ---- second round of the pair: rows rotated ----

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm6
	pslld		$12,%xmm6
	psrld		$20,%xmm1
	por		%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm7
	pslld		$7,%xmm7
	psrld		$25,%xmm1
	por		%xmm7,%xmm1

	# Inverse lane rotation: put rows 1-3 back in their original order.
	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm3,%xmm3

	# two rounds done; repeat until the remaining count reaches zero
	sub		$2,%r8d
	jnz		.Ldoubleround

	RET
SYM_FUNC_END(chacha_permute)

/*
 * chacha_block_xor_ssse3 - XOR up to one 64-byte block with ChaCha keystream
 *
 * Loads the state matrix, runs chacha_permute on it, adds the original state
 * back in and XORs the result with the input, 16 bytes at a time.  A trailing
 * chunk shorter than 16 bytes goes through an aligned scratch slot on the
 * stack.  At most 64 bytes are written; a larger length is treated as 64.
 * The state matrix in memory is only read.
 *
 * %rdi: Input state matrix, s
 * %rsi: up to 1 data block output, o
 * %rdx: up to 1 data block input, i
 * %rcx: input/output length in bytes
 * %r8d: nrounds (see chacha_permute for the constraints)
 *
 * May clobber: %rax, %rcx, %rsi, %rdi, %r8d, %r9-%r11, %xmm0-%xmm11
 */
SYM_FUNC_START(chacha_block_xor_ssse3)
	FRAME_BEGIN

	# x0..3 = s0..3
	movdqu		0x00(%rdi),%xmm0
	movdqu		0x10(%rdi),%xmm1
	movdqu		0x20(%rdi),%xmm2
	movdqu		0x30(%rdi),%xmm3
	# keep a copy of the input state in xmm8-11 for the final addition
	movdqa		%xmm0,%xmm8
	movdqa		%xmm1,%xmm9
	movdqa		%xmm2,%xmm10
	movdqa		%xmm3,%xmm11

	# length lives in %rax from here on; %rcx is reused below as the
	# rep movsb count
	mov		%rcx,%rax
	call		chacha_permute

	# In each step below %xmm0 holds the keystream for the current 16-byte
	# chunk when control reaches .Lxorpart.  The length is a byte count,
	# so the comparisons are unsigned.

	# o0 = i0 ^ (x0 + s0)
	paddd		%xmm8,%xmm0
	cmp		$0x10,%rax
	jb		.Lxorpart
	movdqu		0x00(%rdx),%xmm4
	pxor		%xmm4,%xmm0
	movdqu		%xmm0,0x00(%rsi)
	# o1 = i1 ^ (x1 + s1)
	paddd		%xmm9,%xmm1
	movdqa		%xmm1,%xmm0
	cmp		$0x20,%rax
	jb		.Lxorpart
	movdqu		0x10(%rdx),%xmm0
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x10(%rsi)
	# o2 = i2 ^ (x2 + s2)
	paddd		%xmm10,%xmm2
	movdqa		%xmm2,%xmm0
	cmp		$0x30,%rax
	jb		.Lxorpart
	movdqu		0x20(%rdx),%xmm0
	pxor		%xmm2,%xmm0
	movdqu		%xmm0,0x20(%rsi)
	# o3 = i3 ^ (x3 + s3)
	paddd		%xmm11,%xmm3
	movdqa		%xmm3,%xmm0
	cmp		$0x40,%rax
	jb		.Lxorpart
	movdqu		0x30(%rdx),%xmm0
	pxor		%xmm3,%xmm0
	movdqu		%xmm0,0x30(%rsi)

.Ldone:
	FRAME_END
	RET

.Lxorpart:
	# xor remaining bytes from partial register into output
	# r9  = number of trailing bytes (length mod 16), nothing to do when 0
	# rax = offset of the partial chunk (length rounded down to 16)
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone
	and		$~0x0f,%rax

	# rsi is needed as the rep movsb source, keep the output pointer in r11
	mov		%rsi,%r11

	# carve an aligned 16-byte scratch slot out of the stack; r10
	# remembers the old stack pointer (plus 8) for the restore below
	lea		8(%rsp),%r10
	sub		$0x10,%rsp
	and		$~31,%rsp

	# scratch[0..r9) = input tail
	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	# XOR the whole slot with the keystream; the bytes past r9 are
	# whatever was on the stack and are never copied out
	pxor		0x00(%rsp),%xmm0
	movdqa		%xmm0,0x00(%rsp)

	# output tail = scratch[0..r9)
	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	# restore the stack pointer saved in r10
	# NOTE(review): the scratch slot is not wiped before it is released
	lea		-8(%r10),%rsp
	jmp		.Ldone

SYM_FUNC_END(chacha_block_xor_ssse3)

/*
 * hchacha_block_ssse3 - permute the state and output rows 0 and 3
 *
 * Loads the 64-byte state matrix, runs chacha_permute on it and stores the
 * first and the last row of the permuted state (8 32-bit words in total).
 * Unlike chacha_block_xor_ssse3, the original state is not added back in.
 *
 * %rdi: Input state matrix, s
 * %rsi: output (8 32-bit words)
 * %edx: nrounds (see chacha_permute for the constraints)
 *
 * Clobbers: %r8d, %xmm0-%xmm7
 */
SYM_FUNC_START(hchacha_block_ssse3)
	FRAME_BEGIN

	# x0..3 = s0..3
	movdqu		0x00(%rdi),%xmm0
	movdqu		0x10(%rdi),%xmm1
	movdqu		0x20(%rdi),%xmm2
	movdqu		0x30(%rdi),%xmm3

	# chacha_permute takes the round count in %r8d
	mov		%edx,%r8d
	call		chacha_permute

	# output words 0-3 = row 0, output words 4-7 = row 3
	movdqu		%xmm0,0x00(%rsi)
	movdqu		%xmm3,0x10(%rsi)

	FRAME_END
	RET
SYM_FUNC_END(hchacha_block_ssse3)

2238c2ecf20Sopenharmony_ciSYM_FUNC_START(chacha_4block_xor_ssse3)
2248c2ecf20Sopenharmony_ci	# %rdi: Input state matrix, s
2258c2ecf20Sopenharmony_ci	# %rsi: up to 4 data blocks output, o
2268c2ecf20Sopenharmony_ci	# %rdx: up to 4 data blocks input, i
2278c2ecf20Sopenharmony_ci	# %rcx: input/output length in bytes
2288c2ecf20Sopenharmony_ci	# %r8d: nrounds
2298c2ecf20Sopenharmony_ci
	# This function encrypts four consecutive ChaCha blocks by loading the
	# state matrix in SSE registers four times. As we need some scratch
2328c2ecf20Sopenharmony_ci	# registers, we save the first four registers on the stack. The
2338c2ecf20Sopenharmony_ci	# algorithm performs each operation on the corresponding word of each
2348c2ecf20Sopenharmony_ci	# state matrix, hence requires no word shuffling. For final XORing step
2358c2ecf20Sopenharmony_ci	# we transpose the matrix by interleaving 32- and then 64-bit words,
2368c2ecf20Sopenharmony_ci	# which allows us to do XOR in SSE registers. 8/16-bit word rotation is
2378c2ecf20Sopenharmony_ci	# done with the slightly better performing SSSE3 byte shuffling,
2388c2ecf20Sopenharmony_ci	# 7/12-bit word rotation uses traditional shift+OR.
2398c2ecf20Sopenharmony_ci
2408c2ecf20Sopenharmony_ci	lea		8(%rsp),%r10
2418c2ecf20Sopenharmony_ci	sub		$0x80,%rsp
2428c2ecf20Sopenharmony_ci	and		$~63,%rsp
2438c2ecf20Sopenharmony_ci	mov		%rcx,%rax
2448c2ecf20Sopenharmony_ci
2458c2ecf20Sopenharmony_ci	# x0..15[0-3] = s0..3[0..3]
2468c2ecf20Sopenharmony_ci	movq		0x00(%rdi),%xmm1
2478c2ecf20Sopenharmony_ci	pshufd		$0x00,%xmm1,%xmm0
2488c2ecf20Sopenharmony_ci	pshufd		$0x55,%xmm1,%xmm1
2498c2ecf20Sopenharmony_ci	movq		0x08(%rdi),%xmm3
2508c2ecf20Sopenharmony_ci	pshufd		$0x00,%xmm3,%xmm2
2518c2ecf20Sopenharmony_ci	pshufd		$0x55,%xmm3,%xmm3
2528c2ecf20Sopenharmony_ci	movq		0x10(%rdi),%xmm5
2538c2ecf20Sopenharmony_ci	pshufd		$0x00,%xmm5,%xmm4
2548c2ecf20Sopenharmony_ci	pshufd		$0x55,%xmm5,%xmm5
2558c2ecf20Sopenharmony_ci	movq		0x18(%rdi),%xmm7
2568c2ecf20Sopenharmony_ci	pshufd		$0x00,%xmm7,%xmm6
2578c2ecf20Sopenharmony_ci	pshufd		$0x55,%xmm7,%xmm7
2588c2ecf20Sopenharmony_ci	movq		0x20(%rdi),%xmm9
2598c2ecf20Sopenharmony_ci	pshufd		$0x00,%xmm9,%xmm8
2608c2ecf20Sopenharmony_ci	pshufd		$0x55,%xmm9,%xmm9
2618c2ecf20Sopenharmony_ci	movq		0x28(%rdi),%xmm11
2628c2ecf20Sopenharmony_ci	pshufd		$0x00,%xmm11,%xmm10
2638c2ecf20Sopenharmony_ci	pshufd		$0x55,%xmm11,%xmm11
2648c2ecf20Sopenharmony_ci	movq		0x30(%rdi),%xmm13
2658c2ecf20Sopenharmony_ci	pshufd		$0x00,%xmm13,%xmm12
2668c2ecf20Sopenharmony_ci	pshufd		$0x55,%xmm13,%xmm13
2678c2ecf20Sopenharmony_ci	movq		0x38(%rdi),%xmm15
2688c2ecf20Sopenharmony_ci	pshufd		$0x00,%xmm15,%xmm14
2698c2ecf20Sopenharmony_ci	pshufd		$0x55,%xmm15,%xmm15
2708c2ecf20Sopenharmony_ci	# x0..3 on stack
2718c2ecf20Sopenharmony_ci	movdqa		%xmm0,0x00(%rsp)
2728c2ecf20Sopenharmony_ci	movdqa		%xmm1,0x10(%rsp)
2738c2ecf20Sopenharmony_ci	movdqa		%xmm2,0x20(%rsp)
2748c2ecf20Sopenharmony_ci	movdqa		%xmm3,0x30(%rsp)
2758c2ecf20Sopenharmony_ci
2768c2ecf20Sopenharmony_ci	movdqa		CTRINC(%rip),%xmm1
2778c2ecf20Sopenharmony_ci	movdqa		ROT8(%rip),%xmm2
2788c2ecf20Sopenharmony_ci	movdqa		ROT16(%rip),%xmm3
2798c2ecf20Sopenharmony_ci
2808c2ecf20Sopenharmony_ci	# x12 += counter values 0-3
2818c2ecf20Sopenharmony_ci	paddd		%xmm1,%xmm12
2828c2ecf20Sopenharmony_ci
2838c2ecf20Sopenharmony_ci.Ldoubleround4:
2848c2ecf20Sopenharmony_ci	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
2858c2ecf20Sopenharmony_ci	movdqa		0x00(%rsp),%xmm0
2868c2ecf20Sopenharmony_ci	paddd		%xmm4,%xmm0
2878c2ecf20Sopenharmony_ci	movdqa		%xmm0,0x00(%rsp)
2888c2ecf20Sopenharmony_ci	pxor		%xmm0,%xmm12
2898c2ecf20Sopenharmony_ci	pshufb		%xmm3,%xmm12
2908c2ecf20Sopenharmony_ci	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
2918c2ecf20Sopenharmony_ci	movdqa		0x10(%rsp),%xmm0
2928c2ecf20Sopenharmony_ci	paddd		%xmm5,%xmm0
2938c2ecf20Sopenharmony_ci	movdqa		%xmm0,0x10(%rsp)
2948c2ecf20Sopenharmony_ci	pxor		%xmm0,%xmm13
2958c2ecf20Sopenharmony_ci	pshufb		%xmm3,%xmm13
2968c2ecf20Sopenharmony_ci	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
2978c2ecf20Sopenharmony_ci	movdqa		0x20(%rsp),%xmm0
2988c2ecf20Sopenharmony_ci	paddd		%xmm6,%xmm0
2998c2ecf20Sopenharmony_ci	movdqa		%xmm0,0x20(%rsp)
3008c2ecf20Sopenharmony_ci	pxor		%xmm0,%xmm14
3018c2ecf20Sopenharmony_ci	pshufb		%xmm3,%xmm14
3028c2ecf20Sopenharmony_ci	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
3038c2ecf20Sopenharmony_ci	movdqa		0x30(%rsp),%xmm0
3048c2ecf20Sopenharmony_ci	paddd		%xmm7,%xmm0
3058c2ecf20Sopenharmony_ci	movdqa		%xmm0,0x30(%rsp)
3068c2ecf20Sopenharmony_ci	pxor		%xmm0,%xmm15
3078c2ecf20Sopenharmony_ci	pshufb		%xmm3,%xmm15
3088c2ecf20Sopenharmony_ci
3098c2ecf20Sopenharmony_ci	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
3108c2ecf20Sopenharmony_ci	paddd		%xmm12,%xmm8
3118c2ecf20Sopenharmony_ci	pxor		%xmm8,%xmm4
3128c2ecf20Sopenharmony_ci	movdqa		%xmm4,%xmm0
3138c2ecf20Sopenharmony_ci	pslld		$12,%xmm0
3148c2ecf20Sopenharmony_ci	psrld		$20,%xmm4
3158c2ecf20Sopenharmony_ci	por		%xmm0,%xmm4
3168c2ecf20Sopenharmony_ci	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
3178c2ecf20Sopenharmony_ci	paddd		%xmm13,%xmm9
3188c2ecf20Sopenharmony_ci	pxor		%xmm9,%xmm5
3198c2ecf20Sopenharmony_ci	movdqa		%xmm5,%xmm0
3208c2ecf20Sopenharmony_ci	pslld		$12,%xmm0
3218c2ecf20Sopenharmony_ci	psrld		$20,%xmm5
3228c2ecf20Sopenharmony_ci	por		%xmm0,%xmm5
3238c2ecf20Sopenharmony_ci	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
3248c2ecf20Sopenharmony_ci	paddd		%xmm14,%xmm10
3258c2ecf20Sopenharmony_ci	pxor		%xmm10,%xmm6
3268c2ecf20Sopenharmony_ci	movdqa		%xmm6,%xmm0
3278c2ecf20Sopenharmony_ci	pslld		$12,%xmm0
3288c2ecf20Sopenharmony_ci	psrld		$20,%xmm6
3298c2ecf20Sopenharmony_ci	por		%xmm0,%xmm6
3308c2ecf20Sopenharmony_ci	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
3318c2ecf20Sopenharmony_ci	paddd		%xmm15,%xmm11
3328c2ecf20Sopenharmony_ci	pxor		%xmm11,%xmm7
3338c2ecf20Sopenharmony_ci	movdqa		%xmm7,%xmm0
3348c2ecf20Sopenharmony_ci	pslld		$12,%xmm0
3358c2ecf20Sopenharmony_ci	psrld		$20,%xmm7
3368c2ecf20Sopenharmony_ci	por		%xmm0,%xmm7
3378c2ecf20Sopenharmony_ci
3388c2ecf20Sopenharmony_ci	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
3398c2ecf20Sopenharmony_ci	movdqa		0x00(%rsp),%xmm0
3408c2ecf20Sopenharmony_ci	paddd		%xmm4,%xmm0
3418c2ecf20Sopenharmony_ci	movdqa		%xmm0,0x00(%rsp)
3428c2ecf20Sopenharmony_ci	pxor		%xmm0,%xmm12
3438c2ecf20Sopenharmony_ci	pshufb		%xmm2,%xmm12
3448c2ecf20Sopenharmony_ci	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
3458c2ecf20Sopenharmony_ci	movdqa		0x10(%rsp),%xmm0
3468c2ecf20Sopenharmony_ci	paddd		%xmm5,%xmm0
3478c2ecf20Sopenharmony_ci	movdqa		%xmm0,0x10(%rsp)
3488c2ecf20Sopenharmony_ci	pxor		%xmm0,%xmm13
3498c2ecf20Sopenharmony_ci	pshufb		%xmm2,%xmm13
3508c2ecf20Sopenharmony_ci	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
3518c2ecf20Sopenharmony_ci	movdqa		0x20(%rsp),%xmm0
3528c2ecf20Sopenharmony_ci	paddd		%xmm6,%xmm0
3538c2ecf20Sopenharmony_ci	movdqa		%xmm0,0x20(%rsp)
3548c2ecf20Sopenharmony_ci	pxor		%xmm0,%xmm14
3558c2ecf20Sopenharmony_ci	pshufb		%xmm2,%xmm14
3568c2ecf20Sopenharmony_ci	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
3578c2ecf20Sopenharmony_ci	movdqa		0x30(%rsp),%xmm0
3588c2ecf20Sopenharmony_ci	paddd		%xmm7,%xmm0
3598c2ecf20Sopenharmony_ci	movdqa		%xmm0,0x30(%rsp)
3608c2ecf20Sopenharmony_ci	pxor		%xmm0,%xmm15
3618c2ecf20Sopenharmony_ci	pshufb		%xmm2,%xmm15
3628c2ecf20Sopenharmony_ci
3638c2ecf20Sopenharmony_ci	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
3648c2ecf20Sopenharmony_ci	paddd		%xmm12,%xmm8
3658c2ecf20Sopenharmony_ci	pxor		%xmm8,%xmm4
3668c2ecf20Sopenharmony_ci	movdqa		%xmm4,%xmm0
3678c2ecf20Sopenharmony_ci	pslld		$7,%xmm0
3688c2ecf20Sopenharmony_ci	psrld		$25,%xmm4
3698c2ecf20Sopenharmony_ci	por		%xmm0,%xmm4
3708c2ecf20Sopenharmony_ci	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
3718c2ecf20Sopenharmony_ci	paddd		%xmm13,%xmm9
3728c2ecf20Sopenharmony_ci	pxor		%xmm9,%xmm5
3738c2ecf20Sopenharmony_ci	movdqa		%xmm5,%xmm0
3748c2ecf20Sopenharmony_ci	pslld		$7,%xmm0
3758c2ecf20Sopenharmony_ci	psrld		$25,%xmm5
3768c2ecf20Sopenharmony_ci	por		%xmm0,%xmm5
3778c2ecf20Sopenharmony_ci	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
3788c2ecf20Sopenharmony_ci	paddd		%xmm14,%xmm10
3798c2ecf20Sopenharmony_ci	pxor		%xmm10,%xmm6
3808c2ecf20Sopenharmony_ci	movdqa		%xmm6,%xmm0
3818c2ecf20Sopenharmony_ci	pslld		$7,%xmm0
3828c2ecf20Sopenharmony_ci	psrld		$25,%xmm6
3838c2ecf20Sopenharmony_ci	por		%xmm0,%xmm6
3848c2ecf20Sopenharmony_ci	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
3858c2ecf20Sopenharmony_ci	paddd		%xmm15,%xmm11
3868c2ecf20Sopenharmony_ci	pxor		%xmm11,%xmm7
3878c2ecf20Sopenharmony_ci	movdqa		%xmm7,%xmm0
3888c2ecf20Sopenharmony_ci	pslld		$7,%xmm0
3898c2ecf20Sopenharmony_ci	psrld		$25,%xmm7
3908c2ecf20Sopenharmony_ci	por		%xmm0,%xmm7
3918c2ecf20Sopenharmony_ci
3928c2ecf20Sopenharmony_ci	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
3938c2ecf20Sopenharmony_ci	movdqa		0x00(%rsp),%xmm0
3948c2ecf20Sopenharmony_ci	paddd		%xmm5,%xmm0
3958c2ecf20Sopenharmony_ci	movdqa		%xmm0,0x00(%rsp)
3968c2ecf20Sopenharmony_ci	pxor		%xmm0,%xmm15
3978c2ecf20Sopenharmony_ci	pshufb		%xmm3,%xmm15
3988c2ecf20Sopenharmony_ci	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
3998c2ecf20Sopenharmony_ci	movdqa		0x10(%rsp),%xmm0
4008c2ecf20Sopenharmony_ci	paddd		%xmm6,%xmm0
4018c2ecf20Sopenharmony_ci	movdqa		%xmm0,0x10(%rsp)
4028c2ecf20Sopenharmony_ci	pxor		%xmm0,%xmm12
4038c2ecf20Sopenharmony_ci	pshufb		%xmm3,%xmm12
4048c2ecf20Sopenharmony_ci	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
4058c2ecf20Sopenharmony_ci	movdqa		0x20(%rsp),%xmm0
4068c2ecf20Sopenharmony_ci	paddd		%xmm7,%xmm0
4078c2ecf20Sopenharmony_ci	movdqa		%xmm0,0x20(%rsp)
4088c2ecf20Sopenharmony_ci	pxor		%xmm0,%xmm13
4098c2ecf20Sopenharmony_ci	pshufb		%xmm3,%xmm13
4108c2ecf20Sopenharmony_ci	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
4118c2ecf20Sopenharmony_ci	movdqa		0x30(%rsp),%xmm0
4128c2ecf20Sopenharmony_ci	paddd		%xmm4,%xmm0
4138c2ecf20Sopenharmony_ci	movdqa		%xmm0,0x30(%rsp)
4148c2ecf20Sopenharmony_ci	pxor		%xmm0,%xmm14
4158c2ecf20Sopenharmony_ci	pshufb		%xmm3,%xmm14
4168c2ecf20Sopenharmony_ci
4178c2ecf20Sopenharmony_ci	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
4188c2ecf20Sopenharmony_ci	paddd		%xmm15,%xmm10
4198c2ecf20Sopenharmony_ci	pxor		%xmm10,%xmm5
4208c2ecf20Sopenharmony_ci	movdqa		%xmm5,%xmm0
4218c2ecf20Sopenharmony_ci	pslld		$12,%xmm0
4228c2ecf20Sopenharmony_ci	psrld		$20,%xmm5
4238c2ecf20Sopenharmony_ci	por		%xmm0,%xmm5
4248c2ecf20Sopenharmony_ci	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
4258c2ecf20Sopenharmony_ci	paddd		%xmm12,%xmm11
4268c2ecf20Sopenharmony_ci	pxor		%xmm11,%xmm6
4278c2ecf20Sopenharmony_ci	movdqa		%xmm6,%xmm0
4288c2ecf20Sopenharmony_ci	pslld		$12,%xmm0
4298c2ecf20Sopenharmony_ci	psrld		$20,%xmm6
4308c2ecf20Sopenharmony_ci	por		%xmm0,%xmm6
4318c2ecf20Sopenharmony_ci	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
4328c2ecf20Sopenharmony_ci	paddd		%xmm13,%xmm8
4338c2ecf20Sopenharmony_ci	pxor		%xmm8,%xmm7
4348c2ecf20Sopenharmony_ci	movdqa		%xmm7,%xmm0
4358c2ecf20Sopenharmony_ci	pslld		$12,%xmm0
4368c2ecf20Sopenharmony_ci	psrld		$20,%xmm7
4378c2ecf20Sopenharmony_ci	por		%xmm0,%xmm7
4388c2ecf20Sopenharmony_ci	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
4398c2ecf20Sopenharmony_ci	paddd		%xmm14,%xmm9
4408c2ecf20Sopenharmony_ci	pxor		%xmm9,%xmm4
4418c2ecf20Sopenharmony_ci	movdqa		%xmm4,%xmm0
4428c2ecf20Sopenharmony_ci	pslld		$12,%xmm0
4438c2ecf20Sopenharmony_ci	psrld		$20,%xmm4
4448c2ecf20Sopenharmony_ci	por		%xmm0,%xmm4
4458c2ecf20Sopenharmony_ci
4468c2ecf20Sopenharmony_ci	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
4478c2ecf20Sopenharmony_ci	movdqa		0x00(%rsp),%xmm0
4488c2ecf20Sopenharmony_ci	paddd		%xmm5,%xmm0
4498c2ecf20Sopenharmony_ci	movdqa		%xmm0,0x00(%rsp)
4508c2ecf20Sopenharmony_ci	pxor		%xmm0,%xmm15
4518c2ecf20Sopenharmony_ci	pshufb		%xmm2,%xmm15
4528c2ecf20Sopenharmony_ci	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
4538c2ecf20Sopenharmony_ci	movdqa		0x10(%rsp),%xmm0
4548c2ecf20Sopenharmony_ci	paddd		%xmm6,%xmm0
4558c2ecf20Sopenharmony_ci	movdqa		%xmm0,0x10(%rsp)
4568c2ecf20Sopenharmony_ci	pxor		%xmm0,%xmm12
4578c2ecf20Sopenharmony_ci	pshufb		%xmm2,%xmm12
4588c2ecf20Sopenharmony_ci	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
4598c2ecf20Sopenharmony_ci	movdqa		0x20(%rsp),%xmm0
4608c2ecf20Sopenharmony_ci	paddd		%xmm7,%xmm0
4618c2ecf20Sopenharmony_ci	movdqa		%xmm0,0x20(%rsp)
4628c2ecf20Sopenharmony_ci	pxor		%xmm0,%xmm13
4638c2ecf20Sopenharmony_ci	pshufb		%xmm2,%xmm13
4648c2ecf20Sopenharmony_ci	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
4658c2ecf20Sopenharmony_ci	movdqa		0x30(%rsp),%xmm0
4668c2ecf20Sopenharmony_ci	paddd		%xmm4,%xmm0
4678c2ecf20Sopenharmony_ci	movdqa		%xmm0,0x30(%rsp)
4688c2ecf20Sopenharmony_ci	pxor		%xmm0,%xmm14
4698c2ecf20Sopenharmony_ci	pshufb		%xmm2,%xmm14
4708c2ecf20Sopenharmony_ci
4718c2ecf20Sopenharmony_ci	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
4728c2ecf20Sopenharmony_ci	paddd		%xmm15,%xmm10
4738c2ecf20Sopenharmony_ci	pxor		%xmm10,%xmm5
4748c2ecf20Sopenharmony_ci	movdqa		%xmm5,%xmm0
4758c2ecf20Sopenharmony_ci	pslld		$7,%xmm0
4768c2ecf20Sopenharmony_ci	psrld		$25,%xmm5
4778c2ecf20Sopenharmony_ci	por		%xmm0,%xmm5
4788c2ecf20Sopenharmony_ci	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
4798c2ecf20Sopenharmony_ci	paddd		%xmm12,%xmm11
4808c2ecf20Sopenharmony_ci	pxor		%xmm11,%xmm6
4818c2ecf20Sopenharmony_ci	movdqa		%xmm6,%xmm0
4828c2ecf20Sopenharmony_ci	pslld		$7,%xmm0
4838c2ecf20Sopenharmony_ci	psrld		$25,%xmm6
4848c2ecf20Sopenharmony_ci	por		%xmm0,%xmm6
4858c2ecf20Sopenharmony_ci	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
4868c2ecf20Sopenharmony_ci	paddd		%xmm13,%xmm8
4878c2ecf20Sopenharmony_ci	pxor		%xmm8,%xmm7
4888c2ecf20Sopenharmony_ci	movdqa		%xmm7,%xmm0
4898c2ecf20Sopenharmony_ci	pslld		$7,%xmm0
4908c2ecf20Sopenharmony_ci	psrld		$25,%xmm7
4918c2ecf20Sopenharmony_ci	por		%xmm0,%xmm7
4928c2ecf20Sopenharmony_ci	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
4938c2ecf20Sopenharmony_ci	paddd		%xmm14,%xmm9
4948c2ecf20Sopenharmony_ci	pxor		%xmm9,%xmm4
4958c2ecf20Sopenharmony_ci	movdqa		%xmm4,%xmm0
4968c2ecf20Sopenharmony_ci	pslld		$7,%xmm0
4978c2ecf20Sopenharmony_ci	psrld		$25,%xmm4
4988c2ecf20Sopenharmony_ci	por		%xmm0,%xmm4
4998c2ecf20Sopenharmony_ci
5008c2ecf20Sopenharmony_ci	sub		$2,%r8d
5018c2ecf20Sopenharmony_ci	jnz		.Ldoubleround4
5028c2ecf20Sopenharmony_ci
5038c2ecf20Sopenharmony_ci	# x0[0-3] += s0[0]
5048c2ecf20Sopenharmony_ci	# x1[0-3] += s0[1]
5058c2ecf20Sopenharmony_ci	movq		0x00(%rdi),%xmm3
5068c2ecf20Sopenharmony_ci	pshufd		$0x00,%xmm3,%xmm2
5078c2ecf20Sopenharmony_ci	pshufd		$0x55,%xmm3,%xmm3
5088c2ecf20Sopenharmony_ci	paddd		0x00(%rsp),%xmm2
5098c2ecf20Sopenharmony_ci	movdqa		%xmm2,0x00(%rsp)
5108c2ecf20Sopenharmony_ci	paddd		0x10(%rsp),%xmm3
5118c2ecf20Sopenharmony_ci	movdqa		%xmm3,0x10(%rsp)
5128c2ecf20Sopenharmony_ci	# x2[0-3] += s0[2]
5138c2ecf20Sopenharmony_ci	# x3[0-3] += s0[3]
5148c2ecf20Sopenharmony_ci	movq		0x08(%rdi),%xmm3
5158c2ecf20Sopenharmony_ci	pshufd		$0x00,%xmm3,%xmm2
5168c2ecf20Sopenharmony_ci	pshufd		$0x55,%xmm3,%xmm3
5178c2ecf20Sopenharmony_ci	paddd		0x20(%rsp),%xmm2
5188c2ecf20Sopenharmony_ci	movdqa		%xmm2,0x20(%rsp)
5198c2ecf20Sopenharmony_ci	paddd		0x30(%rsp),%xmm3
5208c2ecf20Sopenharmony_ci	movdqa		%xmm3,0x30(%rsp)
5218c2ecf20Sopenharmony_ci
5228c2ecf20Sopenharmony_ci	# x4[0-3] += s1[0]
5238c2ecf20Sopenharmony_ci	# x5[0-3] += s1[1]
5248c2ecf20Sopenharmony_ci	movq		0x10(%rdi),%xmm3
5258c2ecf20Sopenharmony_ci	pshufd		$0x00,%xmm3,%xmm2
5268c2ecf20Sopenharmony_ci	pshufd		$0x55,%xmm3,%xmm3
5278c2ecf20Sopenharmony_ci	paddd		%xmm2,%xmm4
5288c2ecf20Sopenharmony_ci	paddd		%xmm3,%xmm5
5298c2ecf20Sopenharmony_ci	# x6[0-3] += s1[2]
5308c2ecf20Sopenharmony_ci	# x7[0-3] += s1[3]
5318c2ecf20Sopenharmony_ci	movq		0x18(%rdi),%xmm3
5328c2ecf20Sopenharmony_ci	pshufd		$0x00,%xmm3,%xmm2
5338c2ecf20Sopenharmony_ci	pshufd		$0x55,%xmm3,%xmm3
5348c2ecf20Sopenharmony_ci	paddd		%xmm2,%xmm6
5358c2ecf20Sopenharmony_ci	paddd		%xmm3,%xmm7
5368c2ecf20Sopenharmony_ci
5378c2ecf20Sopenharmony_ci	# x8[0-3] += s2[0]
5388c2ecf20Sopenharmony_ci	# x9[0-3] += s2[1]
5398c2ecf20Sopenharmony_ci	movq		0x20(%rdi),%xmm3
5408c2ecf20Sopenharmony_ci	pshufd		$0x00,%xmm3,%xmm2
5418c2ecf20Sopenharmony_ci	pshufd		$0x55,%xmm3,%xmm3
5428c2ecf20Sopenharmony_ci	paddd		%xmm2,%xmm8
5438c2ecf20Sopenharmony_ci	paddd		%xmm3,%xmm9
5448c2ecf20Sopenharmony_ci	# x10[0-3] += s2[2]
5458c2ecf20Sopenharmony_ci	# x11[0-3] += s2[3]
5468c2ecf20Sopenharmony_ci	movq		0x28(%rdi),%xmm3
5478c2ecf20Sopenharmony_ci	pshufd		$0x00,%xmm3,%xmm2
5488c2ecf20Sopenharmony_ci	pshufd		$0x55,%xmm3,%xmm3
5498c2ecf20Sopenharmony_ci	paddd		%xmm2,%xmm10
5508c2ecf20Sopenharmony_ci	paddd		%xmm3,%xmm11
5518c2ecf20Sopenharmony_ci
5528c2ecf20Sopenharmony_ci	# x12[0-3] += s3[0]
5538c2ecf20Sopenharmony_ci	# x13[0-3] += s3[1]
5548c2ecf20Sopenharmony_ci	movq		0x30(%rdi),%xmm3
5558c2ecf20Sopenharmony_ci	pshufd		$0x00,%xmm3,%xmm2
5568c2ecf20Sopenharmony_ci	pshufd		$0x55,%xmm3,%xmm3
5578c2ecf20Sopenharmony_ci	paddd		%xmm2,%xmm12
5588c2ecf20Sopenharmony_ci	paddd		%xmm3,%xmm13
5598c2ecf20Sopenharmony_ci	# x14[0-3] += s3[2]
5608c2ecf20Sopenharmony_ci	# x15[0-3] += s3[3]
5618c2ecf20Sopenharmony_ci	movq		0x38(%rdi),%xmm3
5628c2ecf20Sopenharmony_ci	pshufd		$0x00,%xmm3,%xmm2
5638c2ecf20Sopenharmony_ci	pshufd		$0x55,%xmm3,%xmm3
5648c2ecf20Sopenharmony_ci	paddd		%xmm2,%xmm14
5658c2ecf20Sopenharmony_ci	paddd		%xmm3,%xmm15
5668c2ecf20Sopenharmony_ci
5678c2ecf20Sopenharmony_ci	# x12 += counter values 0-3
5688c2ecf20Sopenharmony_ci	paddd		%xmm1,%xmm12
5698c2ecf20Sopenharmony_ci
5708c2ecf20Sopenharmony_ci	# interleave 32-bit words in state n, n+1
5718c2ecf20Sopenharmony_ci	movdqa		0x00(%rsp),%xmm0
5728c2ecf20Sopenharmony_ci	movdqa		0x10(%rsp),%xmm1
5738c2ecf20Sopenharmony_ci	movdqa		%xmm0,%xmm2
5748c2ecf20Sopenharmony_ci	punpckldq	%xmm1,%xmm2
5758c2ecf20Sopenharmony_ci	punpckhdq	%xmm1,%xmm0
5768c2ecf20Sopenharmony_ci	movdqa		%xmm2,0x00(%rsp)
5778c2ecf20Sopenharmony_ci	movdqa		%xmm0,0x10(%rsp)
5788c2ecf20Sopenharmony_ci	movdqa		0x20(%rsp),%xmm0
5798c2ecf20Sopenharmony_ci	movdqa		0x30(%rsp),%xmm1
5808c2ecf20Sopenharmony_ci	movdqa		%xmm0,%xmm2
5818c2ecf20Sopenharmony_ci	punpckldq	%xmm1,%xmm2
5828c2ecf20Sopenharmony_ci	punpckhdq	%xmm1,%xmm0
5838c2ecf20Sopenharmony_ci	movdqa		%xmm2,0x20(%rsp)
5848c2ecf20Sopenharmony_ci	movdqa		%xmm0,0x30(%rsp)
5858c2ecf20Sopenharmony_ci	movdqa		%xmm4,%xmm0
5868c2ecf20Sopenharmony_ci	punpckldq	%xmm5,%xmm4
5878c2ecf20Sopenharmony_ci	punpckhdq	%xmm5,%xmm0
5888c2ecf20Sopenharmony_ci	movdqa		%xmm0,%xmm5
5898c2ecf20Sopenharmony_ci	movdqa		%xmm6,%xmm0
5908c2ecf20Sopenharmony_ci	punpckldq	%xmm7,%xmm6
5918c2ecf20Sopenharmony_ci	punpckhdq	%xmm7,%xmm0
5928c2ecf20Sopenharmony_ci	movdqa		%xmm0,%xmm7
5938c2ecf20Sopenharmony_ci	movdqa		%xmm8,%xmm0
5948c2ecf20Sopenharmony_ci	punpckldq	%xmm9,%xmm8
5958c2ecf20Sopenharmony_ci	punpckhdq	%xmm9,%xmm0
5968c2ecf20Sopenharmony_ci	movdqa		%xmm0,%xmm9
5978c2ecf20Sopenharmony_ci	movdqa		%xmm10,%xmm0
5988c2ecf20Sopenharmony_ci	punpckldq	%xmm11,%xmm10
5998c2ecf20Sopenharmony_ci	punpckhdq	%xmm11,%xmm0
6008c2ecf20Sopenharmony_ci	movdqa		%xmm0,%xmm11
6018c2ecf20Sopenharmony_ci	movdqa		%xmm12,%xmm0
6028c2ecf20Sopenharmony_ci	punpckldq	%xmm13,%xmm12
6038c2ecf20Sopenharmony_ci	punpckhdq	%xmm13,%xmm0
6048c2ecf20Sopenharmony_ci	movdqa		%xmm0,%xmm13
6058c2ecf20Sopenharmony_ci	movdqa		%xmm14,%xmm0
6068c2ecf20Sopenharmony_ci	punpckldq	%xmm15,%xmm14
6078c2ecf20Sopenharmony_ci	punpckhdq	%xmm15,%xmm0
6088c2ecf20Sopenharmony_ci	movdqa		%xmm0,%xmm15
6098c2ecf20Sopenharmony_ci
6108c2ecf20Sopenharmony_ci	# interleave 64-bit words in state n, n+2
6118c2ecf20Sopenharmony_ci	movdqa		0x00(%rsp),%xmm0
6128c2ecf20Sopenharmony_ci	movdqa		0x20(%rsp),%xmm1
6138c2ecf20Sopenharmony_ci	movdqa		%xmm0,%xmm2
6148c2ecf20Sopenharmony_ci	punpcklqdq	%xmm1,%xmm2
6158c2ecf20Sopenharmony_ci	punpckhqdq	%xmm1,%xmm0
6168c2ecf20Sopenharmony_ci	movdqa		%xmm2,0x00(%rsp)
6178c2ecf20Sopenharmony_ci	movdqa		%xmm0,0x20(%rsp)
6188c2ecf20Sopenharmony_ci	movdqa		0x10(%rsp),%xmm0
6198c2ecf20Sopenharmony_ci	movdqa		0x30(%rsp),%xmm1
6208c2ecf20Sopenharmony_ci	movdqa		%xmm0,%xmm2
6218c2ecf20Sopenharmony_ci	punpcklqdq	%xmm1,%xmm2
6228c2ecf20Sopenharmony_ci	punpckhqdq	%xmm1,%xmm0
6238c2ecf20Sopenharmony_ci	movdqa		%xmm2,0x10(%rsp)
6248c2ecf20Sopenharmony_ci	movdqa		%xmm0,0x30(%rsp)
6258c2ecf20Sopenharmony_ci	movdqa		%xmm4,%xmm0
6268c2ecf20Sopenharmony_ci	punpcklqdq	%xmm6,%xmm4
6278c2ecf20Sopenharmony_ci	punpckhqdq	%xmm6,%xmm0
6288c2ecf20Sopenharmony_ci	movdqa		%xmm0,%xmm6
6298c2ecf20Sopenharmony_ci	movdqa		%xmm5,%xmm0
6308c2ecf20Sopenharmony_ci	punpcklqdq	%xmm7,%xmm5
6318c2ecf20Sopenharmony_ci	punpckhqdq	%xmm7,%xmm0
6328c2ecf20Sopenharmony_ci	movdqa		%xmm0,%xmm7
6338c2ecf20Sopenharmony_ci	movdqa		%xmm8,%xmm0
6348c2ecf20Sopenharmony_ci	punpcklqdq	%xmm10,%xmm8
6358c2ecf20Sopenharmony_ci	punpckhqdq	%xmm10,%xmm0
6368c2ecf20Sopenharmony_ci	movdqa		%xmm0,%xmm10
6378c2ecf20Sopenharmony_ci	movdqa		%xmm9,%xmm0
6388c2ecf20Sopenharmony_ci	punpcklqdq	%xmm11,%xmm9
6398c2ecf20Sopenharmony_ci	punpckhqdq	%xmm11,%xmm0
6408c2ecf20Sopenharmony_ci	movdqa		%xmm0,%xmm11
6418c2ecf20Sopenharmony_ci	movdqa		%xmm12,%xmm0
6428c2ecf20Sopenharmony_ci	punpcklqdq	%xmm14,%xmm12
6438c2ecf20Sopenharmony_ci	punpckhqdq	%xmm14,%xmm0
6448c2ecf20Sopenharmony_ci	movdqa		%xmm0,%xmm14
6458c2ecf20Sopenharmony_ci	movdqa		%xmm13,%xmm0
6468c2ecf20Sopenharmony_ci	punpcklqdq	%xmm15,%xmm13
6478c2ecf20Sopenharmony_ci	punpckhqdq	%xmm15,%xmm0
6488c2ecf20Sopenharmony_ci	movdqa		%xmm0,%xmm15
6498c2ecf20Sopenharmony_ci
6508c2ecf20Sopenharmony_ci	# xor with corresponding input, write to output
6518c2ecf20Sopenharmony_ci	movdqa		0x00(%rsp),%xmm0
6528c2ecf20Sopenharmony_ci	cmp		$0x10,%rax
6538c2ecf20Sopenharmony_ci	jl		.Lxorpart4
6548c2ecf20Sopenharmony_ci	movdqu		0x00(%rdx),%xmm1
6558c2ecf20Sopenharmony_ci	pxor		%xmm1,%xmm0
6568c2ecf20Sopenharmony_ci	movdqu		%xmm0,0x00(%rsi)
6578c2ecf20Sopenharmony_ci
6588c2ecf20Sopenharmony_ci	movdqu		%xmm4,%xmm0
6598c2ecf20Sopenharmony_ci	cmp		$0x20,%rax
6608c2ecf20Sopenharmony_ci	jl		.Lxorpart4
6618c2ecf20Sopenharmony_ci	movdqu		0x10(%rdx),%xmm1
6628c2ecf20Sopenharmony_ci	pxor		%xmm1,%xmm0
6638c2ecf20Sopenharmony_ci	movdqu		%xmm0,0x10(%rsi)
6648c2ecf20Sopenharmony_ci
6658c2ecf20Sopenharmony_ci	movdqu		%xmm8,%xmm0
6668c2ecf20Sopenharmony_ci	cmp		$0x30,%rax
6678c2ecf20Sopenharmony_ci	jl		.Lxorpart4
6688c2ecf20Sopenharmony_ci	movdqu		0x20(%rdx),%xmm1
6698c2ecf20Sopenharmony_ci	pxor		%xmm1,%xmm0
6708c2ecf20Sopenharmony_ci	movdqu		%xmm0,0x20(%rsi)
6718c2ecf20Sopenharmony_ci
6728c2ecf20Sopenharmony_ci	movdqu		%xmm12,%xmm0
6738c2ecf20Sopenharmony_ci	cmp		$0x40,%rax
6748c2ecf20Sopenharmony_ci	jl		.Lxorpart4
6758c2ecf20Sopenharmony_ci	movdqu		0x30(%rdx),%xmm1
6768c2ecf20Sopenharmony_ci	pxor		%xmm1,%xmm0
6778c2ecf20Sopenharmony_ci	movdqu		%xmm0,0x30(%rsi)
6788c2ecf20Sopenharmony_ci
6798c2ecf20Sopenharmony_ci	movdqa		0x20(%rsp),%xmm0
6808c2ecf20Sopenharmony_ci	cmp		$0x50,%rax
6818c2ecf20Sopenharmony_ci	jl		.Lxorpart4
6828c2ecf20Sopenharmony_ci	movdqu		0x40(%rdx),%xmm1
6838c2ecf20Sopenharmony_ci	pxor		%xmm1,%xmm0
6848c2ecf20Sopenharmony_ci	movdqu		%xmm0,0x40(%rsi)
6858c2ecf20Sopenharmony_ci
6868c2ecf20Sopenharmony_ci	movdqu		%xmm6,%xmm0
6878c2ecf20Sopenharmony_ci	cmp		$0x60,%rax
6888c2ecf20Sopenharmony_ci	jl		.Lxorpart4
6898c2ecf20Sopenharmony_ci	movdqu		0x50(%rdx),%xmm1
6908c2ecf20Sopenharmony_ci	pxor		%xmm1,%xmm0
6918c2ecf20Sopenharmony_ci	movdqu		%xmm0,0x50(%rsi)
6928c2ecf20Sopenharmony_ci
6938c2ecf20Sopenharmony_ci	movdqu		%xmm10,%xmm0
6948c2ecf20Sopenharmony_ci	cmp		$0x70,%rax
6958c2ecf20Sopenharmony_ci	jl		.Lxorpart4
6968c2ecf20Sopenharmony_ci	movdqu		0x60(%rdx),%xmm1
6978c2ecf20Sopenharmony_ci	pxor		%xmm1,%xmm0
6988c2ecf20Sopenharmony_ci	movdqu		%xmm0,0x60(%rsi)
6998c2ecf20Sopenharmony_ci
7008c2ecf20Sopenharmony_ci	movdqu		%xmm14,%xmm0
7018c2ecf20Sopenharmony_ci	cmp		$0x80,%rax
7028c2ecf20Sopenharmony_ci	jl		.Lxorpart4
7038c2ecf20Sopenharmony_ci	movdqu		0x70(%rdx),%xmm1
7048c2ecf20Sopenharmony_ci	pxor		%xmm1,%xmm0
7058c2ecf20Sopenharmony_ci	movdqu		%xmm0,0x70(%rsi)
7068c2ecf20Sopenharmony_ci
7078c2ecf20Sopenharmony_ci	movdqa		0x10(%rsp),%xmm0
7088c2ecf20Sopenharmony_ci	cmp		$0x90,%rax
7098c2ecf20Sopenharmony_ci	jl		.Lxorpart4
7108c2ecf20Sopenharmony_ci	movdqu		0x80(%rdx),%xmm1
7118c2ecf20Sopenharmony_ci	pxor		%xmm1,%xmm0
7128c2ecf20Sopenharmony_ci	movdqu		%xmm0,0x80(%rsi)
7138c2ecf20Sopenharmony_ci
7148c2ecf20Sopenharmony_ci	movdqu		%xmm5,%xmm0
7158c2ecf20Sopenharmony_ci	cmp		$0xa0,%rax
7168c2ecf20Sopenharmony_ci	jl		.Lxorpart4
7178c2ecf20Sopenharmony_ci	movdqu		0x90(%rdx),%xmm1
7188c2ecf20Sopenharmony_ci	pxor		%xmm1,%xmm0
7198c2ecf20Sopenharmony_ci	movdqu		%xmm0,0x90(%rsi)
7208c2ecf20Sopenharmony_ci
7218c2ecf20Sopenharmony_ci	movdqu		%xmm9,%xmm0
7228c2ecf20Sopenharmony_ci	cmp		$0xb0,%rax
7238c2ecf20Sopenharmony_ci	jl		.Lxorpart4
7248c2ecf20Sopenharmony_ci	movdqu		0xa0(%rdx),%xmm1
7258c2ecf20Sopenharmony_ci	pxor		%xmm1,%xmm0
7268c2ecf20Sopenharmony_ci	movdqu		%xmm0,0xa0(%rsi)
7278c2ecf20Sopenharmony_ci
7288c2ecf20Sopenharmony_ci	movdqu		%xmm13,%xmm0
7298c2ecf20Sopenharmony_ci	cmp		$0xc0,%rax
7308c2ecf20Sopenharmony_ci	jl		.Lxorpart4
7318c2ecf20Sopenharmony_ci	movdqu		0xb0(%rdx),%xmm1
7328c2ecf20Sopenharmony_ci	pxor		%xmm1,%xmm0
7338c2ecf20Sopenharmony_ci	movdqu		%xmm0,0xb0(%rsi)
7348c2ecf20Sopenharmony_ci
7358c2ecf20Sopenharmony_ci	movdqa		0x30(%rsp),%xmm0
7368c2ecf20Sopenharmony_ci	cmp		$0xd0,%rax
7378c2ecf20Sopenharmony_ci	jl		.Lxorpart4
7388c2ecf20Sopenharmony_ci	movdqu		0xc0(%rdx),%xmm1
7398c2ecf20Sopenharmony_ci	pxor		%xmm1,%xmm0
7408c2ecf20Sopenharmony_ci	movdqu		%xmm0,0xc0(%rsi)
7418c2ecf20Sopenharmony_ci
7428c2ecf20Sopenharmony_ci	movdqu		%xmm7,%xmm0
7438c2ecf20Sopenharmony_ci	cmp		$0xe0,%rax
7448c2ecf20Sopenharmony_ci	jl		.Lxorpart4
7458c2ecf20Sopenharmony_ci	movdqu		0xd0(%rdx),%xmm1
7468c2ecf20Sopenharmony_ci	pxor		%xmm1,%xmm0
7478c2ecf20Sopenharmony_ci	movdqu		%xmm0,0xd0(%rsi)
7488c2ecf20Sopenharmony_ci
7498c2ecf20Sopenharmony_ci	movdqu		%xmm11,%xmm0
7508c2ecf20Sopenharmony_ci	cmp		$0xf0,%rax
7518c2ecf20Sopenharmony_ci	jl		.Lxorpart4
7528c2ecf20Sopenharmony_ci	movdqu		0xe0(%rdx),%xmm1
7538c2ecf20Sopenharmony_ci	pxor		%xmm1,%xmm0
7548c2ecf20Sopenharmony_ci	movdqu		%xmm0,0xe0(%rsi)
7558c2ecf20Sopenharmony_ci
7568c2ecf20Sopenharmony_ci	movdqu		%xmm15,%xmm0
7578c2ecf20Sopenharmony_ci	cmp		$0x100,%rax
7588c2ecf20Sopenharmony_ci	jl		.Lxorpart4
7598c2ecf20Sopenharmony_ci	movdqu		0xf0(%rdx),%xmm1
7608c2ecf20Sopenharmony_ci	pxor		%xmm1,%xmm0
7618c2ecf20Sopenharmony_ci	movdqu		%xmm0,0xf0(%rsi)
7628c2ecf20Sopenharmony_ci
7638c2ecf20Sopenharmony_ci.Ldone4:
7648c2ecf20Sopenharmony_ci	lea		-8(%r10),%rsp
7658c2ecf20Sopenharmony_ci	RET
7668c2ecf20Sopenharmony_ci
7678c2ecf20Sopenharmony_ci.Lxorpart4:
7688c2ecf20Sopenharmony_ci	# xor remaining bytes from partial register into output
7698c2ecf20Sopenharmony_ci	mov		%rax,%r9
7708c2ecf20Sopenharmony_ci	and		$0x0f,%r9
7718c2ecf20Sopenharmony_ci	jz		.Ldone4
7728c2ecf20Sopenharmony_ci	and		$~0x0f,%rax
7738c2ecf20Sopenharmony_ci
7748c2ecf20Sopenharmony_ci	mov		%rsi,%r11
7758c2ecf20Sopenharmony_ci
7768c2ecf20Sopenharmony_ci	lea		(%rdx,%rax),%rsi
7778c2ecf20Sopenharmony_ci	mov		%rsp,%rdi
7788c2ecf20Sopenharmony_ci	mov		%r9,%rcx
7798c2ecf20Sopenharmony_ci	rep movsb
7808c2ecf20Sopenharmony_ci
7818c2ecf20Sopenharmony_ci	pxor		0x00(%rsp),%xmm0
7828c2ecf20Sopenharmony_ci	movdqa		%xmm0,0x00(%rsp)
7838c2ecf20Sopenharmony_ci
7848c2ecf20Sopenharmony_ci	mov		%rsp,%rsi
7858c2ecf20Sopenharmony_ci	lea		(%r11,%rax),%rdi
7868c2ecf20Sopenharmony_ci	mov		%r9,%rcx
7878c2ecf20Sopenharmony_ci	rep movsb
7888c2ecf20Sopenharmony_ci
7898c2ecf20Sopenharmony_ci	jmp		.Ldone4
7908c2ecf20Sopenharmony_ci
7918c2ecf20Sopenharmony_ciSYM_FUNC_END(chacha_4block_xor_ssse3)
792