/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Originally based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

	.text
	.align		6

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers v0-v3.  It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in w3.
 *
 * Clobbers: w3, x10, v4, v12
 */
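
/*
 * For reference, a single ChaCha quarter-round QR(a, b, c, d) in C-like
 * form is roughly (illustrative only; rotl32() denotes a 32-bit rotate
 * left, matching the per-instruction comments below):
 *
 *	a += b; d = rotl32(d ^ a, 16);
 *	c += d; b = rotl32(b ^ c, 12);
 *	a += b; d = rotl32(d ^ a,  8);
 *	c += d; b = rotl32(b ^ c,  7);
 *
 * Here v0-v3 hold rows 0-3 of the 4x4 state (words 0-3, 4-7, 8-11 and
 * 12-15), so each vector instruction performs one step of all four
 * column (or, after the ext shuffles, diagonal) quarter-rounds at once,
 * and each pass through the loop below is one ChaCha double round.
 */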
SYM_FUNC_START_LOCAL(chacha_permute)

	adr_l		x10, ROT8
	ld1		{v12.4s}, [x10]

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	ext		v1.16b, v1.16b, v1.16b, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	ext		v3.16b, v3.16b, v3.16b, #12

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	ext		v1.16b, v1.16b, v1.16b, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	ext		v3.16b, v3.16b, v3.16b, #4

	subs		w3, w3, #2
	b.ne		.Ldoubleround

	ret
SYM_FUNC_END(chacha_permute)
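
/*
 * chacha_block_xor_neon - encrypt/decrypt one 64-byte block
 *
 * Roughly equivalent C, with o and i viewed as arrays of 32-bit words
 * (illustrative sketch only, ignoring the endianness of the byte stream):
 *
 *	u32 x[16];
 *	int n;
 *
 *	memcpy(x, s, sizeof(x));
 *	chacha_permute(x, nrounds);
 *	for (n = 0; n < 16; n++)
 *		o[n] = i[n] ^ (x[n] + s[n]);	// feed-forward, then XOR
 */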
SYM_FUNC_START(chacha_block_xor_neon)
	// x0: Input state matrix, s
	// x1: 1 data block output, o
	// x2: 1 data block input, i
	// w3: nrounds

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	// x0..3 = s0..3
	ld1		{v0.4s-v3.4s}, [x0]
	ld1		{v8.4s-v11.4s}, [x0]

	bl		chacha_permute

	ld1		{v4.16b-v7.16b}, [x2]

	// o0 = i0 ^ (x0 + s0)
	add		v0.4s, v0.4s, v8.4s
	eor		v0.16b, v0.16b, v4.16b

	// o1 = i1 ^ (x1 + s1)
	add		v1.4s, v1.4s, v9.4s
	eor		v1.16b, v1.16b, v5.16b

	// o2 = i2 ^ (x2 + s2)
	add		v2.4s, v2.4s, v10.4s
	eor		v2.16b, v2.16b, v6.16b

	// o3 = i3 ^ (x3 + s3)
	add		v3.4s, v3.4s, v11.4s
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1]

	ldp		x29, x30, [sp], #16
	ret
SYM_FUNC_END(chacha_block_xor_neon)
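
/*
 * hchacha_block_neon - HChaCha core transform, used for XChaCha key
 * derivation
 *
 * Unlike the block function above, HChaCha returns words 0-3 and 12-15 of
 * the permuted state directly, without the feed-forward addition of the
 * input state. Roughly (illustrative sketch only):
 *
 *	u32 x[16];
 *
 *	memcpy(x, s, sizeof(x));
 *	chacha_permute(x, nrounds);
 *	memcpy(&out[0], &x[0], 16);	// words 0-3
 *	memcpy(&out[4], &x[12], 16);	// words 12-15
 */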
SYM_FUNC_START(hchacha_block_neon)
	// x0: Input state matrix, s
	// x1: output (8 32-bit words)
	// w2: nrounds

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v0.4s-v3.4s}, [x0]

	mov		w3, w2
	bl		chacha_permute

	st1		{v0.4s}, [x1], #16
	st1		{v3.4s}, [x1]

	ldp		x29, x30, [sp], #16
	ret
SYM_FUNC_END(hchacha_block_neon)

	a0		.req	w12
	a1		.req	w13
	a2		.req	w14
	a3		.req	w15
	a4		.req	w16
	a5		.req	w17
	a6		.req	w19
	a7		.req	w20
	a8		.req	w21
	a9		.req	w22
	a10		.req	w23
	a11		.req	w24
	a12		.req	w25
	a13		.req	w26
	a14		.req	w27
	a15		.req	w28

	.align		6
SYM_FUNC_START(chacha_4block_xor_neon)
	frame_push	10

	// x0: Input state matrix, s
	// x1: 4 data blocks output, o
	// x2: 4 data blocks input, i
	// w3: nrounds
	// x4: byte count

	adr_l		x10, .Lpermute
	and		x5, x4, #63
	add		x10, x10, x5
	add		x11, x10, #64

	//
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix into NEON registers four times. The algorithm
	// performs each operation on the corresponding word of each state
	// matrix, and hence requires no word shuffling. For the final XOR
	// step we transpose the matrix by interleaving 32-bit and then
	// 64-bit words, which allows us to do the XOR in NEON registers.
	//
	// At the same time, a fifth block is encrypted in parallel using
	// scalar registers.
	//
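	// Illustrative NEON register layout: once the per-block counter
	// increments have been applied, lane j of v<i> holds word i of
	// block j, i.e.
	//
	//	v<i>.s[j] == x<i> of block j		(i = 0..15, j = 0..3)
	//
	// so every vector instruction advances the same word of all four
	// blocks at once, and the zip1/zip2 sequences after the rounds undo
	// this transposition before the keystream is XORed with the input.
	//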
	adr_l		x9, CTRINC		// ... and ROT8
	ld1		{v30.4s-v31.4s}, [x9]

	// x0..15[0-3] = s0..3[0..3]
	add		x8, x0, #16
	ld4r		{ v0.4s- v3.4s}, [x0]
	ld4r		{ v4.4s- v7.4s}, [x8], #16
	ld4r		{ v8.4s-v11.4s}, [x8], #16
	ld4r		{v12.4s-v15.4s}, [x8]

	mov		a0, v0.s[0]
	mov		a1, v1.s[0]
	mov		a2, v2.s[0]
	mov		a3, v3.s[0]
	mov		a4, v4.s[0]
	mov		a5, v5.s[0]
	mov		a6, v6.s[0]
	mov		a7, v7.s[0]
	mov		a8, v8.s[0]
	mov		a9, v9.s[0]
	mov		a10, v10.s[0]
	mov		a11, v11.s[0]
	mov		a12, v12.s[0]
	mov		a13, v13.s[0]
	mov		a14, v14.s[0]
	mov		a15, v15.s[0]

	// x12 += counter values 1-4
	add		v12.4s, v12.4s, v30.4s

.Ldoubleround4:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	add		v0.4s, v0.4s, v4.4s
	  add		a0, a0, a4
	add		v1.4s, v1.4s, v5.4s
	  add		a1, a1, a5
	add		v2.4s, v2.4s, v6.4s
	  add		a2, a2, a6
	add		v3.4s, v3.4s, v7.4s
	  add		a3, a3, a7

	eor		v12.16b, v12.16b, v0.16b
	  eor		a12, a12, a0
	eor		v13.16b, v13.16b, v1.16b
	  eor		a13, a13, a1
	eor		v14.16b, v14.16b, v2.16b
	  eor		a14, a14, a2
	eor		v15.16b, v15.16b, v3.16b
	  eor		a15, a15, a3

	rev32		v12.8h, v12.8h
	  ror		a12, a12, #16
	rev32		v13.8h, v13.8h
	  ror		a13, a13, #16
	rev32		v14.8h, v14.8h
	  ror		a14, a14, #16
	rev32		v15.8h, v15.8h
	  ror		a15, a15, #16

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	add		v8.4s, v8.4s, v12.4s
	  add		a8, a8, a12
	add		v9.4s, v9.4s, v13.4s
	  add		a9, a9, a13
	add		v10.4s, v10.4s, v14.4s
	  add		a10, a10, a14
	add		v11.4s, v11.4s, v15.4s
	  add		a11, a11, a15

	eor		v16.16b, v4.16b, v8.16b
	  eor		a4, a4, a8
	eor		v17.16b, v5.16b, v9.16b
	  eor		a5, a5, a9
	eor		v18.16b, v6.16b, v10.16b
	  eor		a6, a6, a10
	eor		v19.16b, v7.16b, v11.16b
	  eor		a7, a7, a11

	shl		v4.4s, v16.4s, #12
	shl		v5.4s, v17.4s, #12
	shl		v6.4s, v18.4s, #12
	shl		v7.4s, v19.4s, #12

	sri		v4.4s, v16.4s, #20
	  ror		a4, a4, #20
	sri		v5.4s, v17.4s, #20
	  ror		a5, a5, #20
	sri		v6.4s, v18.4s, #20
	  ror		a6, a6, #20
	sri		v7.4s, v19.4s, #20
	  ror		a7, a7, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	add		v0.4s, v0.4s, v4.4s
	  add		a0, a0, a4
	add		v1.4s, v1.4s, v5.4s
	  add		a1, a1, a5
	add		v2.4s, v2.4s, v6.4s
	  add		a2, a2, a6
	add		v3.4s, v3.4s, v7.4s
	  add		a3, a3, a7

	eor		v12.16b, v12.16b, v0.16b
	  eor		a12, a12, a0
	eor		v13.16b, v13.16b, v1.16b
	  eor		a13, a13, a1
	eor		v14.16b, v14.16b, v2.16b
	  eor		a14, a14, a2
	eor		v15.16b, v15.16b, v3.16b
	  eor		a15, a15, a3

	tbl		v12.16b, {v12.16b}, v31.16b
	  ror		a12, a12, #24
	tbl		v13.16b, {v13.16b}, v31.16b
	  ror		a13, a13, #24
	tbl		v14.16b, {v14.16b}, v31.16b
	  ror		a14, a14, #24
	tbl		v15.16b, {v15.16b}, v31.16b
	  ror		a15, a15, #24

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	add		v8.4s, v8.4s, v12.4s
	  add		a8, a8, a12
	add		v9.4s, v9.4s, v13.4s
	  add		a9, a9, a13
	add		v10.4s, v10.4s, v14.4s
	  add		a10, a10, a14
	add		v11.4s, v11.4s, v15.4s
	  add		a11, a11, a15

	eor		v16.16b, v4.16b, v8.16b
	  eor		a4, a4, a8
	eor		v17.16b, v5.16b, v9.16b
	  eor		a5, a5, a9
	eor		v18.16b, v6.16b, v10.16b
	  eor		a6, a6, a10
	eor		v19.16b, v7.16b, v11.16b
	  eor		a7, a7, a11

	shl		v4.4s, v16.4s, #7
	shl		v5.4s, v17.4s, #7
	shl		v6.4s, v18.4s, #7
	shl		v7.4s, v19.4s, #7

	sri		v4.4s, v16.4s, #25
	  ror		a4, a4, #25
	sri		v5.4s, v17.4s, #25
	  ror		a5, a5, #25
	sri		v6.4s, v18.4s, #25
	  ror		a6, a6, #25
	sri		v7.4s, v19.4s, #25
	  ror		a7, a7, #25

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	add		v0.4s, v0.4s, v5.4s
	  add		a0, a0, a5
	add		v1.4s, v1.4s, v6.4s
	  add		a1, a1, a6
	add		v2.4s, v2.4s, v7.4s
	  add		a2, a2, a7
	add		v3.4s, v3.4s, v4.4s
	  add		a3, a3, a4

	eor		v15.16b, v15.16b, v0.16b
	  eor		a15, a15, a0
	eor		v12.16b, v12.16b, v1.16b
	  eor		a12, a12, a1
	eor		v13.16b, v13.16b, v2.16b
	  eor		a13, a13, a2
	eor		v14.16b, v14.16b, v3.16b
	  eor		a14, a14, a3

	rev32		v15.8h, v15.8h
	  ror		a15, a15, #16
	rev32		v12.8h, v12.8h
	  ror		a12, a12, #16
	rev32		v13.8h, v13.8h
	  ror		a13, a13, #16
	rev32		v14.8h, v14.8h
	  ror		a14, a14, #16

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	add		v10.4s, v10.4s, v15.4s
	  add		a10, a10, a15
	add		v11.4s, v11.4s, v12.4s
	  add		a11, a11, a12
	add		v8.4s, v8.4s, v13.4s
	  add		a8, a8, a13
	add		v9.4s, v9.4s, v14.4s
	  add		a9, a9, a14

	eor		v16.16b, v5.16b, v10.16b
	  eor		a5, a5, a10
	eor		v17.16b, v6.16b, v11.16b
	  eor		a6, a6, a11
	eor		v18.16b, v7.16b, v8.16b
	  eor		a7, a7, a8
	eor		v19.16b, v4.16b, v9.16b
	  eor		a4, a4, a9

	shl		v5.4s, v16.4s, #12
	shl		v6.4s, v17.4s, #12
	shl		v7.4s, v18.4s, #12
	shl		v4.4s, v19.4s, #12

	sri		v5.4s, v16.4s, #20
	  ror		a5, a5, #20
	sri		v6.4s, v17.4s, #20
	  ror		a6, a6, #20
	sri		v7.4s, v18.4s, #20
	  ror		a7, a7, #20
	sri		v4.4s, v19.4s, #20
	  ror		a4, a4, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	add		v0.4s, v0.4s, v5.4s
	  add		a0, a0, a5
	add		v1.4s, v1.4s, v6.4s
	  add		a1, a1, a6
	add		v2.4s, v2.4s, v7.4s
	  add		a2, a2, a7
	add		v3.4s, v3.4s, v4.4s
	  add		a3, a3, a4

	eor		v15.16b, v15.16b, v0.16b
	  eor		a15, a15, a0
	eor		v12.16b, v12.16b, v1.16b
	  eor		a12, a12, a1
	eor		v13.16b, v13.16b, v2.16b
	  eor		a13, a13, a2
	eor		v14.16b, v14.16b, v3.16b
	  eor		a14, a14, a3

	tbl		v15.16b, {v15.16b}, v31.16b
	  ror		a15, a15, #24
	tbl		v12.16b, {v12.16b}, v31.16b
	  ror		a12, a12, #24
	tbl		v13.16b, {v13.16b}, v31.16b
	  ror		a13, a13, #24
	tbl		v14.16b, {v14.16b}, v31.16b
	  ror		a14, a14, #24

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	add		v10.4s, v10.4s, v15.4s
	  add		a10, a10, a15
	add		v11.4s, v11.4s, v12.4s
	  add		a11, a11, a12
	add		v8.4s, v8.4s, v13.4s
	  add		a8, a8, a13
	add		v9.4s, v9.4s, v14.4s
	  add		a9, a9, a14

	eor		v16.16b, v5.16b, v10.16b
	  eor		a5, a5, a10
	eor		v17.16b, v6.16b, v11.16b
	  eor		a6, a6, a11
	eor		v18.16b, v7.16b, v8.16b
	  eor		a7, a7, a8
	eor		v19.16b, v4.16b, v9.16b
	  eor		a4, a4, a9

	shl		v5.4s, v16.4s, #7
	shl		v6.4s, v17.4s, #7
	shl		v7.4s, v18.4s, #7
	shl		v4.4s, v19.4s, #7

	sri		v5.4s, v16.4s, #25
	  ror		a5, a5, #25
	sri		v6.4s, v17.4s, #25
	  ror		a6, a6, #25
	sri		v7.4s, v18.4s, #25
	  ror		a7, a7, #25
	sri		v4.4s, v19.4s, #25
	  ror		a4, a4, #25

	subs		w3, w3, #2
	b.ne		.Ldoubleround4

	ld4r		{v16.4s-v19.4s}, [x0], #16
	ld4r		{v20.4s-v23.4s}, [x0], #16

	// x12 += counter values 1-4
	add		v12.4s, v12.4s, v30.4s

	// x0[0-3] += s0[0]
	// x1[0-3] += s0[1]
	// x2[0-3] += s0[2]
	// x3[0-3] += s0[3]
	add		v0.4s, v0.4s, v16.4s
	  mov		w6, v16.s[0]
	  mov		w7, v17.s[0]
	add		v1.4s, v1.4s, v17.4s
	  mov		w8, v18.s[0]
	  mov		w9, v19.s[0]
	add		v2.4s, v2.4s, v18.4s
	  add		a0, a0, w6
	  add		a1, a1, w7
	add		v3.4s, v3.4s, v19.4s
	  add		a2, a2, w8
	  add		a3, a3, w9
CPU_BE(	  rev		a0, a0		)
CPU_BE(	  rev		a1, a1		)
CPU_BE(	  rev		a2, a2		)
CPU_BE(	  rev		a3, a3		)

	ld4r		{v24.4s-v27.4s}, [x0], #16
	ld4r		{v28.4s-v31.4s}, [x0]

	// x4[0-3] += s1[0]
	// x5[0-3] += s1[1]
	// x6[0-3] += s1[2]
	// x7[0-3] += s1[3]
	add		v4.4s, v4.4s, v20.4s
	  mov		w6, v20.s[0]
	  mov		w7, v21.s[0]
	add		v5.4s, v5.4s, v21.4s
	  mov		w8, v22.s[0]
	  mov		w9, v23.s[0]
	add		v6.4s, v6.4s, v22.4s
	  add		a4, a4, w6
	  add		a5, a5, w7
	add		v7.4s, v7.4s, v23.4s
	  add		a6, a6, w8
	  add		a7, a7, w9
CPU_BE(	  rev		a4, a4		)
CPU_BE(	  rev		a5, a5		)
CPU_BE(	  rev		a6, a6		)
CPU_BE(	  rev		a7, a7		)

	// x8[0-3] += s2[0]
	// x9[0-3] += s2[1]
	// x10[0-3] += s2[2]
	// x11[0-3] += s2[3]
	add		v8.4s, v8.4s, v24.4s
	  mov		w6, v24.s[0]
	  mov		w7, v25.s[0]
	add		v9.4s, v9.4s, v25.4s
	  mov		w8, v26.s[0]
	  mov		w9, v27.s[0]
	add		v10.4s, v10.4s, v26.4s
	  add		a8, a8, w6
	  add		a9, a9, w7
	add		v11.4s, v11.4s, v27.4s
	  add		a10, a10, w8
	  add		a11, a11, w9
CPU_BE(	  rev		a8, a8		)
CPU_BE(	  rev		a9, a9		)
CPU_BE(	  rev		a10, a10	)
CPU_BE(	  rev		a11, a11	)

	// x12[0-3] += s3[0]
	// x13[0-3] += s3[1]
	// x14[0-3] += s3[2]
	// x15[0-3] += s3[3]
	add		v12.4s, v12.4s, v28.4s
	  mov		w6, v28.s[0]
	  mov		w7, v29.s[0]
	add		v13.4s, v13.4s, v29.4s
	  mov		w8, v30.s[0]
	  mov		w9, v31.s[0]
	add		v14.4s, v14.4s, v30.4s
	  add		a12, a12, w6
	  add		a13, a13, w7
	add		v15.4s, v15.4s, v31.4s
	  add		a14, a14, w8
	  add		a15, a15, w9
CPU_BE(	  rev		a12, a12	)
CPU_BE(	  rev		a13, a13	)
CPU_BE(	  rev		a14, a14	)
CPU_BE(	  rev		a15, a15	)

	// interleave 32-bit words in state n, n+1
	  ldp		w6, w7, [x2], #64
	zip1		v16.4s, v0.4s, v1.4s
	  ldp		w8, w9, [x2, #-56]
	  eor		a0, a0, w6
	zip2		v17.4s, v0.4s, v1.4s
	  eor		a1, a1, w7
	zip1		v18.4s, v2.4s, v3.4s
	  eor		a2, a2, w8
	zip2		v19.4s, v2.4s, v3.4s
	  eor		a3, a3, w9
	  ldp		w6, w7, [x2, #-48]
	zip1		v20.4s, v4.4s, v5.4s
	  ldp		w8, w9, [x2, #-40]
	  eor		a4, a4, w6
	zip2		v21.4s, v4.4s, v5.4s
	  eor		a5, a5, w7
	zip1		v22.4s, v6.4s, v7.4s
	  eor		a6, a6, w8
	zip2		v23.4s, v6.4s, v7.4s
	  eor		a7, a7, w9
	  ldp		w6, w7, [x2, #-32]
	zip1		v24.4s, v8.4s, v9.4s
	  ldp		w8, w9, [x2, #-24]
	  eor		a8, a8, w6
	zip2		v25.4s, v8.4s, v9.4s
	  eor		a9, a9, w7
	zip1		v26.4s, v10.4s, v11.4s
	  eor		a10, a10, w8
	zip2		v27.4s, v10.4s, v11.4s
	  eor		a11, a11, w9
	  ldp		w6, w7, [x2, #-16]
	zip1		v28.4s, v12.4s, v13.4s
	  ldp		w8, w9, [x2, #-8]
	  eor		a12, a12, w6
	zip2		v29.4s, v12.4s, v13.4s
	  eor		a13, a13, w7
	zip1		v30.4s, v14.4s, v15.4s
	  eor		a14, a14, w8
	zip2		v31.4s, v14.4s, v15.4s
	  eor		a15, a15, w9

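	// x3 is the post-increment applied to the input pointer by the ld1
	// loads below: 64 while at least a full 64 bytes of input remain
	// beyond the block being loaded, 0 once the end of the input has
	// been reached. In the latter case x2 is also rewound so that the
	// final (possibly overlapping) 64-byte load ends exactly at the end
	// of the input buffer; the partial block is then handled by the
	// tail code at 0:/1:/2:/3: below.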
	mov		x3, #64
	subs		x5, x4, #128
	add		x6, x5, x2
	csel		x3, x3, xzr, ge
	csel		x2, x2, x6, ge

	// interleave 64-bit words in state n, n+2
	zip1		v0.2d, v16.2d, v18.2d
	zip2		v4.2d, v16.2d, v18.2d
	  stp		a0, a1, [x1], #64
	zip1		v8.2d, v17.2d, v19.2d
	zip2		v12.2d, v17.2d, v19.2d
	  stp		a2, a3, [x1, #-56]
	ld1		{v16.16b-v19.16b}, [x2], x3

	subs		x6, x4, #192
	ccmp		x3, xzr, #4, lt
	add		x7, x6, x2
	csel		x3, x3, xzr, eq
	csel		x2, x2, x7, eq

	zip1		v1.2d, v20.2d, v22.2d
	zip2		v5.2d, v20.2d, v22.2d
	  stp		a4, a5, [x1, #-48]
	zip1		v9.2d, v21.2d, v23.2d
	zip2		v13.2d, v21.2d, v23.2d
	  stp		a6, a7, [x1, #-40]
	ld1		{v20.16b-v23.16b}, [x2], x3

	subs		x7, x4, #256
	ccmp		x3, xzr, #4, lt
	add		x8, x7, x2
	csel		x3, x3, xzr, eq
	csel		x2, x2, x8, eq

	zip1		v2.2d, v24.2d, v26.2d
	zip2		v6.2d, v24.2d, v26.2d
	  stp		a8, a9, [x1, #-32]
	zip1		v10.2d, v25.2d, v27.2d
	zip2		v14.2d, v25.2d, v27.2d
	  stp		a10, a11, [x1, #-24]
	ld1		{v24.16b-v27.16b}, [x2], x3

	subs		x8, x4, #320
	ccmp		x3, xzr, #4, lt
	add		x9, x8, x2
	csel		x2, x2, x9, eq

	zip1		v3.2d, v28.2d, v30.2d
	zip2		v7.2d, v28.2d, v30.2d
	  stp		a12, a13, [x1, #-16]
	zip1		v11.2d, v29.2d, v31.2d
	zip2		v15.2d, v29.2d, v31.2d
	  stp		a14, a15, [x1, #-8]
	ld1		{v28.16b-v31.16b}, [x2]

	// xor with corresponding input, write to output
	tbnz		x5, #63, 0f
	eor		v16.16b, v16.16b, v0.16b
	eor		v17.16b, v17.16b, v1.16b
	eor		v18.16b, v18.16b, v2.16b
	eor		v19.16b, v19.16b, v3.16b
	st1		{v16.16b-v19.16b}, [x1], #64
	cbz		x5, .Lout

	tbnz		x6, #63, 1f
	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b
	st1		{v20.16b-v23.16b}, [x1], #64
	cbz		x6, .Lout

	tbnz		x7, #63, 2f
	eor		v24.16b, v24.16b, v8.16b
	eor		v25.16b, v25.16b, v9.16b
	eor		v26.16b, v26.16b, v10.16b
	eor		v27.16b, v27.16b, v11.16b
	st1		{v24.16b-v27.16b}, [x1], #64
	cbz		x7, .Lout

	tbnz		x8, #63, 3f
	eor		v28.16b, v28.16b, v12.16b
	eor		v29.16b, v29.16b, v13.16b
	eor		v30.16b, v30.16b, v14.16b
	eor		v31.16b, v31.16b, v15.16b
	st1		{v28.16b-v31.16b}, [x1]

.Lout:	frame_pop
	ret

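	// Partial-block tails: these paths are taken when the last 64-byte
	// block of the chunk is incomplete. The index vectors derived from
	// .Lpermute are used with tbl to build a copy of the keystream
	// shifted to line up with the end of the buffer (out-of-range
	// indices yield zero) and with tbx to pull in the bytes already
	// produced for the preceding block, so that a full, overlapping
	// 64-byte store ending exactly at the end of the output buffer
	// writes the tail without going past it.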
	// fewer than 128 bytes of in/output
0:	ld1		{v8.16b}, [x10]
	ld1		{v9.16b}, [x11]
	movi		v10.16b, #16
	sub		x2, x1, #64
	add		x1, x1, x5
	ld1		{v16.16b-v19.16b}, [x2]
	tbl		v4.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v5.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v6.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v7.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b

	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b
	st1		{v20.16b-v23.16b}, [x1]
	b		.Lout

	// fewer than 192 bytes of in/output
1:	ld1		{v8.16b}, [x10]
	ld1		{v9.16b}, [x11]
	movi		v10.16b, #16
	add		x1, x1, x6
	tbl		v0.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v1.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v2.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v3.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b

	eor		v20.16b, v20.16b, v0.16b
	eor		v21.16b, v21.16b, v1.16b
	eor		v22.16b, v22.16b, v2.16b
	eor		v23.16b, v23.16b, v3.16b
	st1		{v20.16b-v23.16b}, [x1]
	b		.Lout

	// fewer than 256 bytes of in/output
2:	ld1		{v4.16b}, [x10]
	ld1		{v5.16b}, [x11]
	movi		v6.16b, #16
	add		x1, x1, x7
	tbl		v0.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v24.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v1.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v25.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v2.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v26.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v3.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v27.16b, {v20.16b-v23.16b}, v5.16b

	eor		v24.16b, v24.16b, v0.16b
	eor		v25.16b, v25.16b, v1.16b
	eor		v26.16b, v26.16b, v2.16b
	eor		v27.16b, v27.16b, v3.16b
	st1		{v24.16b-v27.16b}, [x1]
	b		.Lout

	// fewer than 320 bytes of in/output
3:	ld1		{v4.16b}, [x10]
	ld1		{v5.16b}, [x11]
	movi		v6.16b, #16
	add		x1, x1, x8
	tbl		v0.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v28.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v1.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v29.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v2.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v30.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v3.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v31.16b, {v24.16b-v27.16b}, v5.16b

	eor		v28.16b, v28.16b, v0.16b
	eor		v29.16b, v29.16b, v1.16b
	eor		v30.16b, v30.16b, v2.16b
	eor		v31.16b, v31.16b, v3.16b
	st1		{v28.16b-v31.16b}, [x1]
	b		.Lout
SYM_FUNC_END(chacha_4block_xor_neon)

	.section	".rodata", "a", %progbits
	.align		L1_CACHE_SHIFT
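// .Lpermute holds the 192 consecutive byte values -64..127. Indexing it at
// offset (byte count % 64) yields the tbl/tbx index vectors used by the
// partial-block tail handling in chacha_4block_xor_neon above.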
.Lpermute:
	.set		.Li, 0
	.rept		192
	.byte		(.Li - 64)
	.set		.Li, .Li + 1
	.endr

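// CTRINC supplies the block counter increments for the four NEON blocks;
// the fifth block, computed in scalar registers, keeps the unincremented
// counter. ROT8, viewed as the byte lanes {3, 0, 1, 2} within each 32-bit
// word, is the tbl permutation used to rotate each word left by 8 bits.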
CTRINC:	.word		1, 2, 3, 4
ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f