/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/linkage.h>

.section	.rodata.cst32.ROT8, "aM", @progbits, 32
.align 32
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
	.octa 0x0e0d0c0f0a09080b0605040702010003

.section	.rodata.cst32.ROT16, "aM", @progbits, 32
.align 32
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
	.octa 0x0d0c0f0e09080b0a0504070601000302

.section	.rodata.cst32.CTRINC, "aM", @progbits, 32
.align 32
CTRINC:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004

.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL:	.octa 0x00000000000000000000000000000000
	.octa 0x00000000000000000000000000000001

.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL:	.octa 0x00000000000000000000000000000002
	.octa 0x00000000000000000000000000000003
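# ROT8 and ROT16 are vpshufb masks for the byte-granular 8- and 16-bit
# rotations of the quarter-round. CTRINC adds block-counter offsets 0..7
# across the eight 32-bit lanes of x12 in the 8-block path; CTR2BL and
# CTR4BL add offsets {0,1} and {2,3} to the counter word of the fourth
# state row in the 2- and 4-block paths.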

.text

SYM_FUNC_START(chacha_2block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 2 data blocks output, o
	# %rdx: up to 2 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds
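	#
	# For reference, the C-side declaration in the x86 chacha glue code
	# looks roughly like this (shown as an approximation, not verified
	# against a particular kernel version):
	#   void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
	#                               unsigned int len, int nrounds);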

	# This function encrypts two ChaCha blocks by loading the state
	# matrix twice across four AVX registers. It performs matrix operations
	# on four words in each matrix in parallel, but requires shuffling to
	# rearrange the words after each round.
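	#
	# As a reference for the vector code below, one scalar ChaCha
	# quarter-round QR(a, b, c, d) is (pseudocode only, not assembled):
	#
	#	a += b;  d ^= a;  d = rol32(d, 16);
	#	c += d;  b ^= c;  b = rol32(b, 12);
	#	a += b;  d ^= a;  d = rol32(d, 8);
	#	c += d;  b ^= c;  b = rol32(b, 7);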

	vzeroupper

	# x0..3[0-1] = s0..3
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vpaddd		CTR2BL(%rip),%ymm3,%ymm3

	vmovdqa		%ymm0,%ymm8
	vmovdqa		%ymm1,%ymm9
	vmovdqa		%ymm2,%ymm10
	vmovdqa		%ymm3,%ymm11

	vmovdqa		ROT8(%rip),%ymm4
	vmovdqa		ROT16(%rip),%ymm5

	mov		%rcx,%rax

.Ldoubleround:
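	# Each iteration performs one column round (on the rows as loaded)
	# followed by one diagonal round (after the vpshufd rotations below),
	# i.e. one ChaCha double round; the loop runs nrounds/2 times.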

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm5,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm6
	vpslld		$12,%ymm6,%ymm6
	vpsrld		$20,%ymm1,%ymm1
	vpor		%ymm6,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm4,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm7
	vpslld		$7,%ymm7,%ymm7
	vpsrld		$25,%ymm1,%ymm1
	vpor		%ymm7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm3,%ymm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm5,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm6
	vpslld		$12,%ymm6,%ymm6
	vpsrld		$20,%ymm1,%ymm1
	vpor		%ymm6,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm4,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm7
	vpslld		$7,%ymm7,%ymm7
	vpsrld		$25,%ymm1,%ymm1
	vpor		%ymm7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm3,%ymm3

	sub		$2,%r8d
	jnz		.Ldoubleround

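	# Add the saved input state back in and XOR with the source data.
	# Each ymm register holds one state row for both blocks: the low
	# 128-bit lane belongs to block 0 and the high lane to block 1, which
	# is peeled off with vextracti128. The cmp/jl checks bail out to the
	# partial-block path as soon as the requested length is exhausted.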
	# o0 = i0 ^ (x0 + s0)
	vpaddd		%ymm8,%ymm0,%ymm7
	cmp		$0x10,%rax
	jl		.Lxorpart2
	vpxor		0x00(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x00(%rsi)
	vextracti128	$1,%ymm7,%xmm0
	# o1 = i1 ^ (x1 + s1)
	vpaddd		%ymm9,%ymm1,%ymm7
	cmp		$0x20,%rax
	jl		.Lxorpart2
	vpxor		0x10(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x10(%rsi)
	vextracti128	$1,%ymm7,%xmm1
	# o2 = i2 ^ (x2 + s2)
	vpaddd		%ymm10,%ymm2,%ymm7
	cmp		$0x30,%rax
	jl		.Lxorpart2
	vpxor		0x20(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x20(%rsi)
	vextracti128	$1,%ymm7,%xmm2
	# o3 = i3 ^ (x3 + s3)
	vpaddd		%ymm11,%ymm3,%ymm7
	cmp		$0x40,%rax
	jl		.Lxorpart2
	vpxor		0x30(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x30(%rsi)
	vextracti128	$1,%ymm7,%xmm3

	# xor and write second block
	vmovdqa		%xmm0,%xmm7
	cmp		$0x50,%rax
	jl		.Lxorpart2
	vpxor		0x40(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x40(%rsi)

	vmovdqa		%xmm1,%xmm7
	cmp		$0x60,%rax
	jl		.Lxorpart2
	vpxor		0x50(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x50(%rsi)

	vmovdqa		%xmm2,%xmm7
	cmp		$0x70,%rax
	jl		.Lxorpart2
	vpxor		0x60(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x60(%rsi)

	vmovdqa		%xmm3,%xmm7
	cmp		$0x80,%rax
	jl		.Lxorpart2
	vpxor		0x70(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x70(%rsi)

.Ldone2:
	vzeroupper
	RET

.Lxorpart2:
	# xor remaining bytes from partial register into output
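	# %rax holds the total length and %xmm7 still holds the keystream for
	# the 16-byte chunk that did not fit. Round the length down to a
	# 16-byte boundary, bounce the remaining 1..15 input bytes through an
	# aligned scratch slot on the stack, XOR them there, and copy the
	# result to the output.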
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone2
	and		$~0x0f,%rax

	mov		%rsi,%r11

	lea		8(%rsp),%r10
	sub		$0x10,%rsp
	and		$~31,%rsp

	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	vpxor		0x00(%rsp),%xmm7,%xmm7
	vmovdqa		%xmm7,0x00(%rsp)

	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	lea		-8(%r10),%rsp
	jmp		.Ldone2

SYM_FUNC_END(chacha_2block_xor_avx2)

SYM_FUNC_START(chacha_4block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four ChaCha blocks by loading the state
	# matrix four times across eight AVX registers. It performs matrix
	# operations on four words in two matrices in parallel, sequentially
	# to the operations on the four words of the other two matrices. Since
	# the required word shuffling has a rather high latency, the arithmetic
	# on the two matrix pairs can be interleaved without much slowdown.
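	#
	# Concretely, %ymm0..%ymm3 hold the state rows for blocks 0-1 and
	# %ymm4..%ymm7 those for blocks 2-3; every quarter-round step below is
	# issued for both register sets back to back, so the shuffle latency
	# of one pair is hidden behind the arithmetic of the other.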

	vzeroupper

	# x0..3[0-3] = s0..3
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vmovdqa		%ymm0,%ymm4
	vmovdqa		%ymm1,%ymm5
	vmovdqa		%ymm2,%ymm6
	vmovdqa		%ymm3,%ymm7

	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
	vpaddd		CTR4BL(%rip),%ymm7,%ymm7

	vmovdqa		%ymm0,%ymm11
	vmovdqa		%ymm1,%ymm12
	vmovdqa		%ymm2,%ymm13
	vmovdqa		%ymm3,%ymm14
	vmovdqa		%ymm7,%ymm15

	vmovdqa		ROT8(%rip),%ymm8
	vmovdqa		ROT16(%rip),%ymm9

	mov		%rcx,%rax

.Ldoubleround4:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm9,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxor		%ymm4,%ymm7,%ymm7
	vpshufb		%ymm9,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm10
	vpslld		$12,%ymm10,%ymm10
	vpsrld		$20,%ymm1,%ymm1
	vpor		%ymm10,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxor		%ymm6,%ymm5,%ymm5
	vmovdqa		%ymm5,%ymm10
	vpslld		$12,%ymm10,%ymm10
	vpsrld		$20,%ymm5,%ymm5
	vpor		%ymm10,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm8,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxor		%ymm4,%ymm7,%ymm7
	vpshufb		%ymm8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm10
	vpslld		$7,%ymm10,%ymm10
	vpsrld		$25,%ymm1,%ymm1
	vpor		%ymm10,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxor		%ymm6,%ymm5,%ymm5
	vmovdqa		%ymm5,%ymm10
	vpslld		$7,%ymm10,%ymm10
	vpsrld		$25,%ymm5,%ymm5
	vpor		%ymm10,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm1,%ymm1
	vpshufd		$0x39,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	vpshufd		$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm3,%ymm3
	vpshufd		$0x93,%ymm7,%ymm7

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm9,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxor		%ymm4,%ymm7,%ymm7
	vpshufb		%ymm9,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm10
	vpslld		$12,%ymm10,%ymm10
	vpsrld		$20,%ymm1,%ymm1
	vpor		%ymm10,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxor		%ymm6,%ymm5,%ymm5
	vmovdqa		%ymm5,%ymm10
	vpslld		$12,%ymm10,%ymm10
	vpsrld		$20,%ymm5,%ymm5
	vpor		%ymm10,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm8,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxor		%ymm4,%ymm7,%ymm7
	vpshufb		%ymm8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm10
	vpslld		$7,%ymm10,%ymm10
	vpsrld		$25,%ymm1,%ymm1
	vpor		%ymm10,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxor		%ymm6,%ymm5,%ymm5
	vmovdqa		%ymm5,%ymm10
	vpslld		$7,%ymm10,%ymm10
	vpsrld		$25,%ymm5,%ymm5
	vpor		%ymm10,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm1,%ymm1
	vpshufd		$0x93,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	vpshufd		$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm3,%ymm3
	vpshufd		$0x39,%ymm7,%ymm7

	sub		$2,%r8d
	jnz		.Ldoubleround4

	# o0 = i0 ^ (x0 + s0), first block
	vpaddd		%ymm11,%ymm0,%ymm10
	cmp		$0x10,%rax
	jl		.Lxorpart4
	vpxor		0x00(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x00(%rsi)
	vextracti128	$1,%ymm10,%xmm0
	# o1 = i1 ^ (x1 + s1), first block
	vpaddd		%ymm12,%ymm1,%ymm10
	cmp		$0x20,%rax
	jl		.Lxorpart4
	vpxor		0x10(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x10(%rsi)
	vextracti128	$1,%ymm10,%xmm1
	# o2 = i2 ^ (x2 + s2), first block
	vpaddd		%ymm13,%ymm2,%ymm10
	cmp		$0x30,%rax
	jl		.Lxorpart4
	vpxor		0x20(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x20(%rsi)
	vextracti128	$1,%ymm10,%xmm2
	# o3 = i3 ^ (x3 + s3), first block
	vpaddd		%ymm14,%ymm3,%ymm10
	cmp		$0x40,%rax
	jl		.Lxorpart4
	vpxor		0x30(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x30(%rsi)
	vextracti128	$1,%ymm10,%xmm3

	# xor and write second block
	vmovdqa		%xmm0,%xmm10
	cmp		$0x50,%rax
	jl		.Lxorpart4
	vpxor		0x40(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x40(%rsi)

	vmovdqa		%xmm1,%xmm10
	cmp		$0x60,%rax
	jl		.Lxorpart4
	vpxor		0x50(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x50(%rsi)

	vmovdqa		%xmm2,%xmm10
	cmp		$0x70,%rax
	jl		.Lxorpart4
	vpxor		0x60(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x60(%rsi)

	vmovdqa		%xmm3,%xmm10
	cmp		$0x80,%rax
	jl		.Lxorpart4
	vpxor		0x70(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x70(%rsi)

	# o0 = i0 ^ (x0 + s0), third block
	vpaddd		%ymm11,%ymm4,%ymm10
	cmp		$0x90,%rax
	jl		.Lxorpart4
	vpxor		0x80(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x80(%rsi)
	vextracti128	$1,%ymm10,%xmm4
	# o1 = i1 ^ (x1 + s1), third block
	vpaddd		%ymm12,%ymm5,%ymm10
	cmp		$0xa0,%rax
	jl		.Lxorpart4
	vpxor		0x90(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x90(%rsi)
	vextracti128	$1,%ymm10,%xmm5
	# o2 = i2 ^ (x2 + s2), third block
	vpaddd		%ymm13,%ymm6,%ymm10
	cmp		$0xb0,%rax
	jl		.Lxorpart4
	vpxor		0xa0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xa0(%rsi)
	vextracti128	$1,%ymm10,%xmm6
	# o3 = i3 ^ (x3 + s3), third block
	vpaddd		%ymm15,%ymm7,%ymm10
	cmp		$0xc0,%rax
	jl		.Lxorpart4
	vpxor		0xb0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xb0(%rsi)
	vextracti128	$1,%ymm10,%xmm7

	# xor and write fourth block
	vmovdqa		%xmm4,%xmm10
	cmp		$0xd0,%rax
	jl		.Lxorpart4
	vpxor		0xc0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xc0(%rsi)

	vmovdqa		%xmm5,%xmm10
	cmp		$0xe0,%rax
	jl		.Lxorpart4
	vpxor		0xd0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xd0(%rsi)

	vmovdqa		%xmm6,%xmm10
	cmp		$0xf0,%rax
	jl		.Lxorpart4
	vpxor		0xe0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xe0(%rsi)

	vmovdqa		%xmm7,%xmm10
	cmp		$0x100,%rax
	jl		.Lxorpart4
	vpxor		0xf0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xf0(%rsi)

.Ldone4:
	vzeroupper
	RET

.Lxorpart4:
	# xor remaining bytes from partial register into output
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone4
	and		$~0x0f,%rax

	mov		%rsi,%r11

	lea		8(%rsp),%r10
	sub		$0x10,%rsp
	and		$~31,%rsp

	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	vpxor		0x00(%rsp),%xmm10,%xmm10
	vmovdqa		%xmm10,0x00(%rsp)

	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	lea		-8(%r10),%rsp
	jmp		.Ldone4

SYM_FUNC_END(chacha_4block_xor_avx2)

SYM_FUNC_START(chacha_8block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 8 data blocks output, o
	# %rdx: up to 8 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts eight consecutive ChaCha blocks by loading
	# the state matrix in AVX registers eight times. As we need some
	# scratch registers, we save the first four registers on the stack.
	# The algorithm performs each operation on the corresponding word of
	# each state matrix, hence requires no word shuffling. For the final
	# XOR step we transpose the matrix by interleaving 32-, 64- and then
	# 128-bit words, which allows us to do the XOR in AVX registers. The
	# 16- and 8-bit word rotations are done with the slightly better
	# performing byte shuffling; the 12- and 7-bit rotations use the
	# traditional shift+OR.
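	#
	# During the rounds, state words x0..x3 live in the 4 * 32-byte stack
	# area at 0x00/0x20/0x40/0x60(%rsp) and x4..x15 in %ymm4..%ymm15; each
	# vector holds the same state word for all eight blocks, one block per
	# 32-bit lane. As a reference, one double round touches the words in
	# this order (pseudocode only, not assembled):
	#
	#	QR(x0, x4,  x8, x12)  QR(x1, x5,  x9, x13)	# column round
	#	QR(x2, x6, x10, x14)  QR(x3, x7, x11, x15)
	#	QR(x0, x5, x10, x15)  QR(x1, x6, x11, x12)	# diagonal round
	#	QR(x2, x7,  x8, x13)  QR(x3, x4,  x9, x14)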

	vzeroupper
	# 4 * 32 byte stack, 32-byte aligned
	lea		8(%rsp),%r10
	and		$~31, %rsp
	sub		$0x80, %rsp
	mov		%rcx,%rax

	# x0..15[0-7] = s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpbroadcastd	0x04(%rdi),%ymm1
	vpbroadcastd	0x08(%rdi),%ymm2
	vpbroadcastd	0x0c(%rdi),%ymm3
	vpbroadcastd	0x10(%rdi),%ymm4
	vpbroadcastd	0x14(%rdi),%ymm5
	vpbroadcastd	0x18(%rdi),%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm7
	vpbroadcastd	0x20(%rdi),%ymm8
	vpbroadcastd	0x24(%rdi),%ymm9
	vpbroadcastd	0x28(%rdi),%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm11
	vpbroadcastd	0x30(%rdi),%ymm12
	vpbroadcastd	0x34(%rdi),%ymm13
	vpbroadcastd	0x38(%rdi),%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm15
	# x0..3 on stack
	vmovdqa		%ymm0,0x00(%rsp)
	vmovdqa		%ymm1,0x20(%rsp)
	vmovdqa		%ymm2,0x40(%rsp)
	vmovdqa		%ymm3,0x60(%rsp)

	vmovdqa		CTRINC(%rip),%ymm1
	vmovdqa		ROT8(%rip),%ymm2
	vmovdqa		ROT16(%rip),%ymm3

	# x12 += counter values 0-7
	vpaddd		%ymm1,%ymm12,%ymm12

.Ldoubleround8:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd		0x00(%rsp),%ymm4,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)
	vpxor		%ymm0,%ymm12,%ymm12
	vpshufb		%ymm3,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd		0x20(%rsp),%ymm5,%ymm0
	vmovdqa		%ymm0,0x20(%rsp)
	vpxor		%ymm0,%ymm13,%ymm13
	vpshufb		%ymm3,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd		0x40(%rsp),%ymm6,%ymm0
	vmovdqa		%ymm0,0x40(%rsp)
	vpxor		%ymm0,%ymm14,%ymm14
	vpshufb		%ymm3,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd		0x60(%rsp),%ymm7,%ymm0
	vmovdqa		%ymm0,0x60(%rsp)
	vpxor		%ymm0,%ymm15,%ymm15
	vpshufb		%ymm3,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd		%ymm12,%ymm8,%ymm8
	vpxor		%ymm8,%ymm4,%ymm4
	vpslld		$12,%ymm4,%ymm0
	vpsrld		$20,%ymm4,%ymm4
	vpor		%ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd		%ymm13,%ymm9,%ymm9
	vpxor		%ymm9,%ymm5,%ymm5
	vpslld		$12,%ymm5,%ymm0
	vpsrld		$20,%ymm5,%ymm5
	vpor		%ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd		%ymm14,%ymm10,%ymm10
	vpxor		%ymm10,%ymm6,%ymm6
	vpslld		$12,%ymm6,%ymm0
	vpsrld		$20,%ymm6,%ymm6
	vpor		%ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd		%ymm15,%ymm11,%ymm11
	vpxor		%ymm11,%ymm7,%ymm7
	vpslld		$12,%ymm7,%ymm0
	vpsrld		$20,%ymm7,%ymm7
	vpor		%ymm0,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd		0x00(%rsp),%ymm4,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)
	vpxor		%ymm0,%ymm12,%ymm12
	vpshufb		%ymm2,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd		0x20(%rsp),%ymm5,%ymm0
	vmovdqa		%ymm0,0x20(%rsp)
	vpxor		%ymm0,%ymm13,%ymm13
	vpshufb		%ymm2,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd		0x40(%rsp),%ymm6,%ymm0
	vmovdqa		%ymm0,0x40(%rsp)
	vpxor		%ymm0,%ymm14,%ymm14
	vpshufb		%ymm2,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd		0x60(%rsp),%ymm7,%ymm0
	vmovdqa		%ymm0,0x60(%rsp)
	vpxor		%ymm0,%ymm15,%ymm15
	vpshufb		%ymm2,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd		%ymm12,%ymm8,%ymm8
	vpxor		%ymm8,%ymm4,%ymm4
	vpslld		$7,%ymm4,%ymm0
	vpsrld		$25,%ymm4,%ymm4
	vpor		%ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd		%ymm13,%ymm9,%ymm9
	vpxor		%ymm9,%ymm5,%ymm5
	vpslld		$7,%ymm5,%ymm0
	vpsrld		$25,%ymm5,%ymm5
	vpor		%ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd		%ymm14,%ymm10,%ymm10
	vpxor		%ymm10,%ymm6,%ymm6
	vpslld		$7,%ymm6,%ymm0
	vpsrld		$25,%ymm6,%ymm6
	vpor		%ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd		%ymm15,%ymm11,%ymm11
	vpxor		%ymm11,%ymm7,%ymm7
	vpslld		$7,%ymm7,%ymm0
	vpsrld		$25,%ymm7,%ymm7
	vpor		%ymm0,%ymm7,%ymm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd		0x00(%rsp),%ymm5,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)
	vpxor		%ymm0,%ymm15,%ymm15
	vpshufb		%ymm3,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd		0x20(%rsp),%ymm6,%ymm0
	vmovdqa		%ymm0,0x20(%rsp)
	vpxor		%ymm0,%ymm12,%ymm12
	vpshufb		%ymm3,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd		0x40(%rsp),%ymm7,%ymm0
	vmovdqa		%ymm0,0x40(%rsp)
	vpxor		%ymm0,%ymm13,%ymm13
	vpshufb		%ymm3,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd		0x60(%rsp),%ymm4,%ymm0
	vmovdqa		%ymm0,0x60(%rsp)
	vpxor		%ymm0,%ymm14,%ymm14
	vpshufb		%ymm3,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd		%ymm15,%ymm10,%ymm10
	vpxor		%ymm10,%ymm5,%ymm5
	vpslld		$12,%ymm5,%ymm0
	vpsrld		$20,%ymm5,%ymm5
	vpor		%ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd		%ymm12,%ymm11,%ymm11
	vpxor		%ymm11,%ymm6,%ymm6
	vpslld		$12,%ymm6,%ymm0
	vpsrld		$20,%ymm6,%ymm6
	vpor		%ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd		%ymm13,%ymm8,%ymm8
	vpxor		%ymm8,%ymm7,%ymm7
	vpslld		$12,%ymm7,%ymm0
	vpsrld		$20,%ymm7,%ymm7
	vpor		%ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd		%ymm14,%ymm9,%ymm9
	vpxor		%ymm9,%ymm4,%ymm4
	vpslld		$12,%ymm4,%ymm0
	vpsrld		$20,%ymm4,%ymm4
	vpor		%ymm0,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd		0x00(%rsp),%ymm5,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)
	vpxor		%ymm0,%ymm15,%ymm15
	vpshufb		%ymm2,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd		0x20(%rsp),%ymm6,%ymm0
	vmovdqa		%ymm0,0x20(%rsp)
	vpxor		%ymm0,%ymm12,%ymm12
	vpshufb		%ymm2,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd		0x40(%rsp),%ymm7,%ymm0
	vmovdqa		%ymm0,0x40(%rsp)
	vpxor		%ymm0,%ymm13,%ymm13
	vpshufb		%ymm2,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd		0x60(%rsp),%ymm4,%ymm0
	vmovdqa		%ymm0,0x60(%rsp)
	vpxor		%ymm0,%ymm14,%ymm14
	vpshufb		%ymm2,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd		%ymm15,%ymm10,%ymm10
	vpxor		%ymm10,%ymm5,%ymm5
	vpslld		$7,%ymm5,%ymm0
	vpsrld		$25,%ymm5,%ymm5
	vpor		%ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd		%ymm12,%ymm11,%ymm11
	vpxor		%ymm11,%ymm6,%ymm6
	vpslld		$7,%ymm6,%ymm0
	vpsrld		$25,%ymm6,%ymm6
	vpor		%ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd		%ymm13,%ymm8,%ymm8
	vpxor		%ymm8,%ymm7,%ymm7
	vpslld		$7,%ymm7,%ymm0
	vpsrld		$25,%ymm7,%ymm7
	vpor		%ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd		%ymm14,%ymm9,%ymm9
	vpxor		%ymm9,%ymm4,%ymm4
	vpslld		$7,%ymm4,%ymm0
	vpsrld		$25,%ymm4,%ymm4
	vpor		%ymm0,%ymm4,%ymm4

	sub		$2,%r8d
	jnz		.Ldoubleround8

	# x0..15[0-7] += s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpaddd		0x00(%rsp),%ymm0,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)
	vpbroadcastd	0x04(%rdi),%ymm0
	vpaddd		0x20(%rsp),%ymm0,%ymm0
	vmovdqa		%ymm0,0x20(%rsp)
	vpbroadcastd	0x08(%rdi),%ymm0
	vpaddd		0x40(%rsp),%ymm0,%ymm0
	vmovdqa		%ymm0,0x40(%rsp)
	vpbroadcastd	0x0c(%rdi),%ymm0
	vpaddd		0x60(%rsp),%ymm0,%ymm0
	vmovdqa		%ymm0,0x60(%rsp)
	vpbroadcastd	0x10(%rdi),%ymm0
	vpaddd		%ymm0,%ymm4,%ymm4
	vpbroadcastd	0x14(%rdi),%ymm0
	vpaddd		%ymm0,%ymm5,%ymm5
	vpbroadcastd	0x18(%rdi),%ymm0
	vpaddd		%ymm0,%ymm6,%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm0
	vpaddd		%ymm0,%ymm7,%ymm7
	vpbroadcastd	0x20(%rdi),%ymm0
	vpaddd		%ymm0,%ymm8,%ymm8
	vpbroadcastd	0x24(%rdi),%ymm0
	vpaddd		%ymm0,%ymm9,%ymm9
	vpbroadcastd	0x28(%rdi),%ymm0
	vpaddd		%ymm0,%ymm10,%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm0
	vpaddd		%ymm0,%ymm11,%ymm11
	vpbroadcastd	0x30(%rdi),%ymm0
	vpaddd		%ymm0,%ymm12,%ymm12
	vpbroadcastd	0x34(%rdi),%ymm0
	vpaddd		%ymm0,%ymm13,%ymm13
	vpbroadcastd	0x38(%rdi),%ymm0
	vpaddd		%ymm0,%ymm14,%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm0
	vpaddd		%ymm0,%ymm15,%ymm15

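	# The per-block counter offsets must be re-added here: the broadcast
	# of s[12] above restored only the base counter, while block n of the
	# output was generated with initial counter s[12] + n.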
	# x12 += counter values 0-7
	vpaddd		%ymm1,%ymm12,%ymm12

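	# The data is currently word-major: each vector holds one state word
	# for all eight blocks. The three interleave passes below (32-bit,
	# 64-bit, then 128-bit) transpose it to block-major form, so that each
	# 32-byte vector ends up holding a contiguous 32-byte half of one
	# block's keystream, ready to be XORed against the input.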
	# interleave 32-bit words in state n, n+1
	vmovdqa		0x00(%rsp),%ymm0
	vmovdqa		0x20(%rsp),%ymm1
	vpunpckldq	%ymm1,%ymm0,%ymm2
	vpunpckhdq	%ymm1,%ymm0,%ymm1
	vmovdqa		%ymm2,0x00(%rsp)
	vmovdqa		%ymm1,0x20(%rsp)
	vmovdqa		0x40(%rsp),%ymm0
	vmovdqa		0x60(%rsp),%ymm1
	vpunpckldq	%ymm1,%ymm0,%ymm2
	vpunpckhdq	%ymm1,%ymm0,%ymm1
	vmovdqa		%ymm2,0x40(%rsp)
	vmovdqa		%ymm1,0x60(%rsp)
	vmovdqa		%ymm4,%ymm0
	vpunpckldq	%ymm5,%ymm0,%ymm4
	vpunpckhdq	%ymm5,%ymm0,%ymm5
	vmovdqa		%ymm6,%ymm0
	vpunpckldq	%ymm7,%ymm0,%ymm6
	vpunpckhdq	%ymm7,%ymm0,%ymm7
	vmovdqa		%ymm8,%ymm0
	vpunpckldq	%ymm9,%ymm0,%ymm8
	vpunpckhdq	%ymm9,%ymm0,%ymm9
	vmovdqa		%ymm10,%ymm0
	vpunpckldq	%ymm11,%ymm0,%ymm10
	vpunpckhdq	%ymm11,%ymm0,%ymm11
	vmovdqa		%ymm12,%ymm0
	vpunpckldq	%ymm13,%ymm0,%ymm12
	vpunpckhdq	%ymm13,%ymm0,%ymm13
	vmovdqa		%ymm14,%ymm0
	vpunpckldq	%ymm15,%ymm0,%ymm14
	vpunpckhdq	%ymm15,%ymm0,%ymm15

	# interleave 64-bit words in state n, n+2
	vmovdqa		0x00(%rsp),%ymm0
	vmovdqa		0x40(%rsp),%ymm2
	vpunpcklqdq	%ymm2,%ymm0,%ymm1
	vpunpckhqdq	%ymm2,%ymm0,%ymm2
	vmovdqa		%ymm1,0x00(%rsp)
	vmovdqa		%ymm2,0x40(%rsp)
	vmovdqa		0x20(%rsp),%ymm0
	vmovdqa		0x60(%rsp),%ymm2
	vpunpcklqdq	%ymm2,%ymm0,%ymm1
	vpunpckhqdq	%ymm2,%ymm0,%ymm2
	vmovdqa		%ymm1,0x20(%rsp)
	vmovdqa		%ymm2,0x60(%rsp)
	vmovdqa		%ymm4,%ymm0
	vpunpcklqdq	%ymm6,%ymm0,%ymm4
	vpunpckhqdq	%ymm6,%ymm0,%ymm6
	vmovdqa		%ymm5,%ymm0
	vpunpcklqdq	%ymm7,%ymm0,%ymm5
	vpunpckhqdq	%ymm7,%ymm0,%ymm7
	vmovdqa		%ymm8,%ymm0
	vpunpcklqdq	%ymm10,%ymm0,%ymm8
	vpunpckhqdq	%ymm10,%ymm0,%ymm10
	vmovdqa		%ymm9,%ymm0
	vpunpcklqdq	%ymm11,%ymm0,%ymm9
	vpunpckhqdq	%ymm11,%ymm0,%ymm11
	vmovdqa		%ymm12,%ymm0
	vpunpcklqdq	%ymm14,%ymm0,%ymm12
	vpunpckhqdq	%ymm14,%ymm0,%ymm14
	vmovdqa		%ymm13,%ymm0
	vpunpcklqdq	%ymm15,%ymm0,%ymm13
	vpunpckhqdq	%ymm15,%ymm0,%ymm15

	# interleave 128-bit words in state n, n+4
	# xor/write first four blocks
	vmovdqa		0x00(%rsp),%ymm1
	vperm2i128	$0x20,%ymm4,%ymm1,%ymm0
	cmp		$0x0020,%rax
	jl		.Lxorpart8
	vpxor		0x0000(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0000(%rsi)
	vperm2i128	$0x31,%ymm4,%ymm1,%ymm4

	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
	cmp		$0x0040,%rax
	jl		.Lxorpart8
	vpxor		0x0020(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0020(%rsi)
	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12

	vmovdqa		0x40(%rsp),%ymm1
	vperm2i128	$0x20,%ymm6,%ymm1,%ymm0
	cmp		$0x0060,%rax
	jl		.Lxorpart8
	vpxor		0x0040(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0040(%rsi)
	vperm2i128	$0x31,%ymm6,%ymm1,%ymm6

	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
	cmp		$0x0080,%rax
	jl		.Lxorpart8
	vpxor		0x0060(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0060(%rsi)
	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14

	vmovdqa		0x20(%rsp),%ymm1
	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
	cmp		$0x00a0,%rax
	jl		.Lxorpart8
	vpxor		0x0080(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0080(%rsi)
	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5

	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
	cmp		$0x00c0,%rax
	jl		.Lxorpart8
	vpxor		0x00a0(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x00a0(%rsi)
	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13

	vmovdqa		0x60(%rsp),%ymm1
	vperm2i128	$0x20,%ymm7,%ymm1,%ymm0
	cmp		$0x00e0,%rax
	jl		.Lxorpart8
	vpxor		0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x00c0(%rsi)
	vperm2i128	$0x31,%ymm7,%ymm1,%ymm7

	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
	cmp		$0x0100,%rax
	jl		.Lxorpart8
	vpxor		0x00e0(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x00e0(%rsi)
	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15

	# xor remaining blocks, write to output
	vmovdqa		%ymm4,%ymm0
	cmp		$0x0120,%rax
	jl		.Lxorpart8
	vpxor		0x0100(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0100(%rsi)

	vmovdqa		%ymm12,%ymm0
	cmp		$0x0140,%rax
	jl		.Lxorpart8
	vpxor		0x0120(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0120(%rsi)

	vmovdqa		%ymm6,%ymm0
	cmp		$0x0160,%rax
	jl		.Lxorpart8
	vpxor		0x0140(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0140(%rsi)

	vmovdqa		%ymm14,%ymm0
	cmp		$0x0180,%rax
	jl		.Lxorpart8
	vpxor		0x0160(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0160(%rsi)

	vmovdqa		%ymm5,%ymm0
	cmp		$0x01a0,%rax
	jl		.Lxorpart8
	vpxor		0x0180(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0180(%rsi)

	vmovdqa		%ymm13,%ymm0
	cmp		$0x01c0,%rax
	jl		.Lxorpart8
	vpxor		0x01a0(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x01a0(%rsi)

	vmovdqa		%ymm7,%ymm0
	cmp		$0x01e0,%rax
	jl		.Lxorpart8
	vpxor		0x01c0(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x01c0(%rsi)

	vmovdqa		%ymm15,%ymm0
	cmp		$0x0200,%rax
	jl		.Lxorpart8
	vpxor		0x01e0(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x01e0(%rsi)

.Ldone8:
	vzeroupper
	lea		-8(%r10),%rsp
	RET

.Lxorpart8:
	# xor remaining bytes from partial register into output
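	# Same scheme as the smaller variants, but with 32-byte granularity:
	# %ymm0 still holds the keystream for the 32-byte chunk that did not
	# fit, and the already 32-byte-aligned scratch area at (%rsp) is used
	# to XOR and copy out the remaining 1..31 bytes.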
	mov		%rax,%r9
	and		$0x1f,%r9
	jz		.Ldone8
	and		$~0x1f,%rax

	mov		%rsi,%r11

	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	vpxor		0x00(%rsp),%ymm0,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)

	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	jmp		.Ldone8

SYM_FUNC_END(chacha_8block_xor_avx2)