1 /* SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause */
2 /*
3  * AES CTR mode by8 optimization with AVX instructions. (x86_64)
4  *
5  * Copyright(c) 2014 Intel Corporation.
6  *
7  * Contact Information:
8  * James Guilford <james.guilford@intel.com>
9  * Sean Gulley <sean.m.gulley@intel.com>
10  * Chandramouli Narayanan <mouli@linux.intel.com>
11  */
12 /*
13  * This is AES128/192/256 CTR mode optimization implementation. It requires
14  * the support of Intel(R) AESNI and AVX instructions.
15  *
16  * This work was inspired by the AES CTR mode optimization published
17  * in Intel Optimized IPSEC Cryptographic library.
18  * Additional information on it can be found at:
19  *    https://github.com/intel/intel-ipsec-mb
20  */
21 
22 #include <linux/linkage.h>
23 
#define VMOVDQ		vmovdqu		// unaligned load/store for plain/ciphertext

/*
 * Note: the "x" prefix in these aliases means "this is an xmm register".  The
 * alias prefixes have no relation to XCTR where the "X" prefix means "XOR
 * counter".
 */
#define xdata0		%xmm0	// keystream/data blocks 0..7
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
#define xcounter	%xmm8	// CTR mode only
#define xiv		%xmm8	// XCTR mode only
#define xbyteswap	%xmm9	// CTR mode only
#define xtmp		%xmm9	// XCTR mode only
#define xkey0		%xmm10	// cached round key 0
#define xkey4		%xmm11	// cached round key 4 (3 for AES-128)
#define xkey8		%xmm12	// cached round key 8 (6 for AES-128)
#define xkey12		%xmm13	// cached round key 12 (9 for AES-128)
#define xkeyA		%xmm14	// scratch for the non-cached round keys,
#define xkeyB		%xmm15	// reused as data temporaries after the rounds

/* SysV AMD64 argument registers, matching the C prototypes below */
#define p_in		%rdi	// arg1: input (plain/ciphertext)
#define p_iv		%rsi	// arg2: IV (CTR) / nonce (XCTR)
#define p_keys		%rdx	// arg3: expanded round keys
#define p_out		%rcx	// arg4: output
#define num_bytes	%r8	// arg5: byte count
#define counter		%r9	// XCTR mode only: arg6, byte counter on entry
#define tmp		%r10	// scratch
#define	DDQ_DATA	0	// tag values tested by the 'club' macro;
#define	XDATA		1	// DDQ_DATA appears unused in this file (historic)
#define KEY_128		1	// key-length selectors for the macros below
#define KEY_192		2
#define KEY_256		3
62 
.section .rodata
.align 16

/* vpshufb mask that reverses the byte order of a 128-bit value */
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
/* selects the low 64 bits of a 128-bit value (used with vptest) */
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* adds one to the high 64-bit half (carry out of the low half) */
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
/*
 * ddq_add_N: adds N to the low 64-bit half of a counter.  do_aes indexes
 * this as a table via (ddq_add_1 + 16 * i), so these eight entries must
 * stay contiguous and in ascending order.
 */
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008
88 
89 .text
90 
/*
 * setxdata n - make var_xdata an alias for xmm register number 'n'.
 * Only invoked through 'club' below, which uses altmacro mode to turn
 * the symbolic loop counter into the literal digit \n.
 */
.macro setxdata n
	var_xdata = %xmm\n
.endm
97 
/*
 * club name, id - combine the numeric 'id' with the symbol class 'name':
 * for XDATA this points var_xdata at %xmm<id>.  altmacro mode is needed
 * so that %\id evaluates the assembler symbol 'id' to its numeric value
 * before setxdata pastes it onto "%xmm".
 */
.macro club name, id
.altmacro
	.if \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm
107 
/*
 * do_aes b, k, key_len, xctr
 *
 * Generate 'b' (1..8) keystream blocks in parallel, XOR them with the
 * input at p_in, and store the result at i*16(p_out).  'k' (load_keys)
 * selects whether the sticky round-key registers xkey0/4/8/12 are
 * (re)loaded from p_keys; the by-8 main loop passes 0 to keep them cached.
 * This increments p_in, but not p_out.
 */
.macro do_aes b, k, key_len, xctr
	.set by, \b		/* number of blocks processed in parallel */
	.set load_keys, \k	/* nonzero: reload the cached round keys */
	.set klen, \key_len	/* KEY_128 / KEY_192 / KEY_256 */

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	.if \xctr
		/* XCTR: keystream block i encrypts (counter + i + 1) ^ iv */
		movq counter, xtmp
		.set i, 0
		.rept (by)
			club XDATA, i
			vpaddq	(ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata
			.set i, (i +1)
		.endr
		.set i, 0
		.rept (by)
			club	XDATA, i
			vpxor	xiv, var_xdata, var_xdata
			.set i, (i +1)
		.endr
	.else
		/*
		 * CTR: xcounter is kept byte-reversed so vpaddq can do the
		 * 64-bit arithmetic; vpshufb restores big-endian order in
		 * each per-block copy.
		 */
		vpshufb	xbyteswap, xcounter, xdata0
		.set i, 1
		.rept (by - 1)
			club XDATA, i
			vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
			/*
			 * If the low 64-bit half wrapped to zero, propagate
			 * the carry into the high half — both in this block
			 * and in xcounter so later blocks see it too.
			 */
			vptest	ddq_low_msk(%rip), var_xdata
			jnz 1f
			vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
			vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
			1:
			vpshufb	xbyteswap, var_xdata, var_xdata
			.set i, (i +1)
		.endr
	.endif

	vmovdqa	1*16(p_keys), xkeyA

	/* round 0: whitening with xkey0 */
	vpxor	xkey0, xdata0, xdata0
	.if \xctr
		add $by, counter
	.else
		/* advance xcounter past the consumed blocks, with carry */
		vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
		vptest	ddq_low_msk(%rip), xcounter
		jnz	1f
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
	.endif

	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i +1)
	.endr

	/*
	 * The remaining rounds alternate between xkeyA/xkeyB and the sticky
	 * registers; which register holds a given round key depends on the
	 * key length (AES-128 caches rounds 3/6/9 in xkey4/8/12, while
	 * AES-192/256 cache rounds 4/8/12).  Key loads are interleaved with
	 * the aesenc rounds to hide their latency.
	 */
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i +1)
	.endr

	/* advance the input pointer now; loads below use negative offsets */
	add	$(16*by), p_in

	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 3 */
		.if (klen == KEY_128)
			vaesenc	xkey4, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 4 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey4, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i +1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 6 */
		.if (klen == KEY_128)
			vaesenc	xkey8, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 8 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey8, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		/* key 9 */
		.if (klen == KEY_128)
			vaesenc	xkey12, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10: final round for AES-128 */
		.if (klen == KEY_128)
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			.if (klen == KEY_256)
				/* key 12 */
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				/* key 12: final round for AES-192 */
				vaesenclast xkey12, var_xdata, var_xdata
			.endif
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i +1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14: final round for AES-256 */
				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i +1)
			.endr
		.endif
	.endif

	/*
	 * XOR the keystream with the input, two blocks per iteration.  The
	 * round-key scratch registers are dead here, so xkeyA/xkeyB are
	 * reused as data temporaries; p_in was already advanced above,
	 * hence the negative offsets.
	 */
	.set i, 0
	.rept (by / 2)
		.set j, (i+1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i+2)
	.endr

	/* odd block count: XOR the last remaining block */
	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	/* store all output blocks */
	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i+1)
	.endr
.endm
399 
/* encrypt \val blocks, (re)loading the sticky round-key registers */
.macro do_aes_load val, key_len, xctr
	do_aes \val, 1, \key_len, \xctr
.endm
403 
/* encrypt \val blocks with the sticky round keys already cached */
.macro do_aes_noload val, key_len, xctr
	do_aes \val, 0, \key_len, \xctr
.endm
407 
/*
 * do_aes_ctrmain key_len, xctr - main body of each CTR/XCTR routine
 *
 * Handles the (num_bytes mod 8 blocks) remainder with one sized
 * do_aes_load pass, then processes the rest in a by-8 main loop with the
 * sticky round keys kept in registers.  num_bytes is expected to be a
 * multiple of 16; fewer than 16 bytes is a no-op.  CTR mode writes the
 * updated counter back to (p_iv) on return.
 *
 * NOTE(review): num_bytes and the XCTR byte counter arrive as 32-bit C
 * arguments but are used as full 64-bit registers here; this relies on the
 * caller zero-extending them — confirm against the C glue code.
 */
.macro do_aes_ctrmain key_len, xctr
	.if \xctr
		cmp	$16, num_bytes
		jb	.Ldo_return2\xctr\key_len
		shr	$4, counter		/* byte counter -> block counter */
		vmovdqu	(p_iv), xiv
	.else
		/*
		 * Load the byteswap mask and counter BEFORE the size check:
		 * the early-return path below writes xcounter back to
		 * (p_iv), so reaching it with these registers uninitialized
		 * would clobber the caller's IV with garbage.  With the
		 * loads done first, an early return stores the original IV
		 * unchanged (the two vpshufb byte swaps cancel out).
		 */
		vmovdqa	byteswap_const(%rip), xbyteswap
		vmovdqu	(p_iv), xcounter
		vpshufb	xbyteswap, xcounter, xcounter
		cmp	$16, num_bytes
		jb	.Ldo_return2\xctr\key_len
	.endif

	/* tmp = remainder bytes modulo 8 blocks: 0, 16, ..., 112 */
	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\xctr\key_len

	/* 1..7 whole blocks remain; binary dispatch on the block count */
	cmp	$(4*16), tmp
	jg	.Lgt4\xctr\key_len
	je	.Leq4\xctr\key_len

.Llt4\xctr\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\xctr\key_len
	je	.Leq2\xctr\key_len

.Leq1\xctr\key_len:
	do_aes_load	1, \key_len, \xctr
	add	$(1*16), p_out
	and	$(~7*16), num_bytes	/* drop the handled remainder */
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq2\xctr\key_len:
	do_aes_load	2, \key_len, \xctr
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq3\xctr\key_len:
	do_aes_load	3, \key_len, \xctr
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq4\xctr\key_len:
	do_aes_load	4, \key_len, \xctr
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Lgt4\xctr\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\xctr\key_len
	je	.Leq6\xctr\key_len

.Leq5\xctr\key_len:
	do_aes_load	5, \key_len, \xctr
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq6\xctr\key_len:
	do_aes_load	6, \key_len, \xctr
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq7\xctr\key_len:
	do_aes_load	7, \key_len, \xctr
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Lmult_of_8_blks\xctr\key_len:
	/* preload the sticky round keys for the noload main loop */
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\xctr\key_len:
	/* num_bytes is a multiple of 8 blocks (128 bytes) and > 0 */
	do_aes_noload	8, \key_len, \xctr
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\xctr\key_len

.Ldo_return2\xctr\key_len:
	.if !\xctr
		/* return updated IV: swap back to big-endian and store */
		vpshufb	xbyteswap, xcounter, xcounter
		vmovdqu	xcounter, (p_iv)
	.endif
	RET
.endm
520 
521 /*
522  * routine to do AES128 CTR enc/decrypt "by8"
523  * XMM registers are clobbered.
524  * Saving/restoring must be done at a higher level
525  * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
526  *			unsigned int num_bytes)
527  */
SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
	/* AES-128 key schedule, CTR mode (xctr = 0) */
	do_aes_ctrmain KEY_128 0

SYM_FUNC_END(aes_ctr_enc_128_avx_by8)
533 
534 /*
535  * routine to do AES192 CTR enc/decrypt "by8"
536  * XMM registers are clobbered.
537  * Saving/restoring must be done at a higher level
538  * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
539  *			unsigned int num_bytes)
540  */
SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
	/* AES-192 key schedule, CTR mode (xctr = 0) */
	do_aes_ctrmain KEY_192 0

SYM_FUNC_END(aes_ctr_enc_192_avx_by8)
546 
547 /*
548  * routine to do AES256 CTR enc/decrypt "by8"
549  * XMM registers are clobbered.
550  * Saving/restoring must be done at a higher level
551  * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
552  *			unsigned int num_bytes)
553  */
SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
	/* AES-256 key schedule, CTR mode (xctr = 0) */
	do_aes_ctrmain KEY_256 0

SYM_FUNC_END(aes_ctr_enc_256_avx_by8)
559 
560 /*
561  * routine to do AES128 XCTR enc/decrypt "by8"
562  * XMM registers are clobbered.
563  * Saving/restoring must be done at a higher level
564  * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys,
565  * 	u8* out, unsigned int num_bytes, unsigned int byte_ctr)
566  */
SYM_FUNC_START(aes_xctr_enc_128_avx_by8)
	/* AES-128 key schedule, XCTR mode (xctr = 1) */
	do_aes_ctrmain KEY_128 1

SYM_FUNC_END(aes_xctr_enc_128_avx_by8)
572 
573 /*
574  * routine to do AES192 XCTR enc/decrypt "by8"
575  * XMM registers are clobbered.
576  * Saving/restoring must be done at a higher level
577  * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys,
578  * 	u8* out, unsigned int num_bytes, unsigned int byte_ctr)
579  */
SYM_FUNC_START(aes_xctr_enc_192_avx_by8)
	/* AES-192 key schedule, XCTR mode (xctr = 1) */
	do_aes_ctrmain KEY_192 1

SYM_FUNC_END(aes_xctr_enc_192_avx_by8)
585 
586 /*
587  * routine to do AES256 XCTR enc/decrypt "by8"
588  * XMM registers are clobbered.
589  * Saving/restoring must be done at a higher level
590  * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys,
591  * 	u8* out, unsigned int num_bytes, unsigned int byte_ctr)
592  */
SYM_FUNC_START(aes_xctr_enc_256_avx_by8)
	/* AES-256 key schedule, XCTR mode (xctr = 1) */
	do_aes_ctrmain KEY_256 1

SYM_FUNC_END(aes_xctr_enc_256_avx_by8)
598