1 /*
2  *	Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
3  *
4  * This is AES128/192/256 CTR mode optimization implementation. It requires
5  * the support of Intel(R) AESNI and AVX instructions.
6  *
7  * This work was inspired by the AES CTR mode optimization published
 * in Intel Optimized IPSEC Cryptographic library.
9  * Additional information on it can be found at:
10  *    http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
11  *
12  * This file is provided under a dual BSD/GPLv2 license.  When using or
13  * redistributing this file, you may do so under either license.
14  *
15  * GPL LICENSE SUMMARY
16  *
17  * Copyright(c) 2014 Intel Corporation.
18  *
19  * This program is free software; you can redistribute it and/or modify
20  * it under the terms of version 2 of the GNU General Public License as
21  * published by the Free Software Foundation.
22  *
23  * This program is distributed in the hope that it will be useful, but
24  * WITHOUT ANY WARRANTY; without even the implied warranty of
25  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
26  * General Public License for more details.
27  *
28  * Contact Information:
29  * James Guilford <james.guilford@intel.com>
30  * Sean Gulley <sean.m.gulley@intel.com>
31  * Chandramouli Narayanan <mouli@linux.intel.com>
32  *
33  * BSD LICENSE
34  *
35  * Copyright(c) 2014 Intel Corporation.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  *
41  * Redistributions of source code must retain the above copyright
42  * notice, this list of conditions and the following disclaimer.
43  * Redistributions in binary form must reproduce the above copyright
44  * notice, this list of conditions and the following disclaimer in
45  * the documentation and/or other materials provided with the
46  * distribution.
47  * Neither the name of Intel Corporation nor the names of its
48  * contributors may be used to endorse or promote products derived
49  * from this software without specific prior written permission.
50  *
51  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
52  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
53  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
54  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
55  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
56  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
57  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
58  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
59  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
60  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62  *
63  */
64 
65 #include <linux/linkage.h>
66 
/* in/out buffers need not be 16-byte aligned, so use unaligned moves */
#define VMOVDQ		vmovdqu

/* data registers: up to eight AES blocks processed in parallel */
#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
/* CTR counter, kept byte-swapped (little endian) between do_aes calls */
#define xcounter	%xmm8
/* vpshufb mask used to convert the counter to/from big endian */
#define xbyteswap	%xmm9
/* round keys cached across main-loop iterations (see do_aes_noload);
 * which key schedule slots these hold depends on the key length */
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
/* scratch: remaining round keys, and input blocks during the final XOR */
#define xkeyA		%xmm14
#define xkeyB		%xmm15

/* function arguments (System V AMD64 calling convention) */
#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8

#define tmp		%r10
/* selectors for the club macro; DDQ_DATA appears unused in this file */
#define	DDQ_DATA	0
#define	XDATA		1
/* key-length selectors passed to the macros below */
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3
98 
.section .rodata
.align 16

/* vpshufb mask that reverses all 16 bytes (endianness conversion) */
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
/* selects the low 64 bits of the 128-bit counter (for vptest) */
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* adds 1 to the high qword: the carry term when the low qword wraps */
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
/* ddq_add_N adds N to the low qword of the counter.  These entries form
 * a contiguous table addressed as (ddq_add_1 + 16 * (N - 1)) by do_aes,
 * so their order and spacing must not change. */
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008
125 .text
126 
/* the ddq_add_x constants above are addressed inline as
 * (ddq_add_1 + 16 * (i - 1))(%rip); no macro is needed for them */
128 
129 /* generate a unique variable for xmm register */
.macro setxdata n
	/* bind var_xdata to %xmm<n>; relies on .altmacro being active so
	 * the caller can pass an evaluated numeric id via %\id */
	var_xdata = %xmm\n
.endm
133 
134 /* club the numeric 'id' to the symbol 'name' */
135 
.macro club name, id
.altmacro
	/* only XDATA is clubbed here: makes var_xdata alias %xmm<id> */
	.if \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm
143 
/*
 * do_aes num_in_par load_keys key_len
 *
 * Encrypt 'num_in_par' (1..8) consecutive counter blocks and XOR them
 * with the input to produce 'num_in_par' blocks of CTR output.
 * 'load_keys' selects whether the cached round keys xkey0/xkey4/xkey8/
 * xkey12 are (re)loaded from p_keys; the main loop keeps them resident.
 * This increments p_in, but not p_out.
 */
.macro do_aes b, k, key_len
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	/* block 0 is the current counter, byte-swapped to big endian */
	vpshufb	xbyteswap, xcounter, xdata0

	/* Derive blocks 1..by-1 by adding i to the counter.  vpaddq adds
	 * the 64-bit lanes independently (ddq_add_i has a zero high
	 * qword), so the carry out of the low qword must be propagated by
	 * hand: when the low qword of the sum is exactly zero it wrapped,
	 * and 1 is added to the high qword of both the derived block and
	 * xcounter itself.  At most one i in 1..by can wrap per call, and
	 * later blocks pick the carry up from the updated xcounter. */
	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
		vptest	ddq_low_msk(%rip), var_xdata
		jnz 1f
		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
		vpshufb	xbyteswap, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	1*16(p_keys), xkeyA

	/* round 0: whiten block 0, and advance xcounter by 'by' with the
	 * same explicit low-to-high qword carry as above (skipped when a
	 * wrap was already handled inside the loop above) */
	vpxor	xkey0, xdata0, xdata0
	vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
	vptest	ddq_low_msk(%rip), xcounter
	jnz	1f
	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
	1:

	/* round 0 for blocks 1..by-1 */
	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i +1)
	.endr

	/* From here on the round keys alternate through xkeyA/xkeyB, and
	 * the cached registers hold different schedule slots per key
	 * length: xkey4/xkey8/xkey12 cache rounds 3/6/9 for AES-128 but
	 * rounds 4/8/12 for AES-192/256. */
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i +1)
	.endr

	/* advance p_in now; the plaintext loads at the bottom use negative
	 * offsets relative to the already-updated pointer */
	add	$(16*by), p_in

	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 3 */
		.if (klen == KEY_128)
			vaesenc	xkey4, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 4 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey4, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i +1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 6 */
		.if (klen == KEY_128)
			vaesenc	xkey8, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 8 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey8, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		/* key 9 */
		.if (klen == KEY_128)
			vaesenc	xkey12, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10: final round for AES-128, ordinary round otherwise */
		.if (klen == KEY_128)
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			.if (klen == KEY_256)
				/* key 12 */
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				/* key 12: final round for AES-192 */
				vaesenclast xkey12, var_xdata, var_xdata
			.endif
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i +1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14: final round for AES-256 */
				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i +1)
			.endr
		.endif
	.endif

	/* XOR the encrypted counter blocks with the input, two blocks at a
	 * time; xkeyA/xkeyB are free to reuse as scratch at this point */
	.set i, 0
	.rept (by / 2)
		.set j, (i+1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i+2)
	.endr

	/* odd 'by': one block left over after the pairwise XOR */
	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	/* store the output; p_out is advanced by the caller, not here */
	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i+1)
	.endr
.endm
416 
.macro do_aes_load val, key_len
	/* encrypt \val blocks, (re)loading the cached keys xkey0/4/8/12 */
	do_aes \val, 1, \key_len
.endm
420 
.macro do_aes_noload val, key_len
	/* encrypt \val blocks, reusing the already-cached xkey0/4/8/12 */
	do_aes \val, 0, \key_len
.endm
424 
/* main body of the AES CTR by8 routines; key_len selects AES-128/192/256 */
426 
.macro do_aes_ctrmain key_len
	/* NOTE: num_bytes must be a multiple of AES_BLOCK_SIZE (16) —
	 * presumably masked by the C caller; the ~(7*16) masking and the
	 * sub $(8*16) loop below would never terminate otherwise.
	 * Anything smaller than one block returns immediately. */
	cmp	$16, num_bytes
	jb	.Ldo_return2\key_len

	vmovdqa	byteswap_const(%rip), xbyteswap
	vmovdqu	(p_iv), xcounter
	/* the counter is kept byte-swapped internally */
	vpshufb	xbyteswap, xcounter, xcounter

	/* tmp = bytes in the partial group of 8 blocks (0..7 blocks) */
	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\key_len

	/* 1 to 7 leftover blocks (tmp = 16..112): dispatch by comparison */
	cmp	$(4*16), tmp
	jg	.Lgt4\key_len
	je	.Leq4\key_len

.Llt4\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\key_len
	je	.Leq2\key_len

	/* each .LeqN stub below encrypts the N odd blocks (loading the
	 * round keys into the cached registers as a side effect), rounds
	 * num_bytes down to whole groups of 8 blocks, and either returns
	 * or falls into the by-8 main loop */
.Leq1\key_len:
	do_aes_load	1, \key_len
	add	$(1*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq2\key_len:
	do_aes_load	2, \key_len
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len


.Leq3\key_len:
	do_aes_load	3, \key_len
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq4\key_len:
	do_aes_load	4, \key_len
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lgt4\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	do_aes_load	5, \key_len
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq6\key_len:
	do_aes_load	6, \key_len
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq7\key_len:
	do_aes_load	7, \key_len
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lmult_of_8_blks\key_len:
	/* no odd blocks: preload the four cached round keys here, since
	 * the main loop uses do_aes_noload; the cached schedule slots
	 * differ by key length (see do_aes) */
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\key_len:
	/* num_bytes is a multiple of 8 blocks (128 bytes) and > 0 */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len

.Ldo_return2\key_len:
	/* return updated IV, byte-swapped back to its external form */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
	RET
.endm
530 
531 /*
532  * routine to do AES128 CTR enc/decrypt "by8"
533  * XMM registers are clobbered.
534  * Saving/restoring must be done at a higher level
535  * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
536  *			unsigned int num_bytes)
537  */
SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
	/* rdi = in, rsi = iv, rdx = expanded keys, rcx = out, r8 = num_bytes */
	/* call the aes main loop */
	do_aes_ctrmain KEY_128

SYM_FUNC_END(aes_ctr_enc_128_avx_by8)
543 
544 /*
545  * routine to do AES192 CTR enc/decrypt "by8"
546  * XMM registers are clobbered.
547  * Saving/restoring must be done at a higher level
548  * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
549  *			unsigned int num_bytes)
550  */
SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
	/* rdi = in, rsi = iv, rdx = expanded keys, rcx = out, r8 = num_bytes */
	/* call the aes main loop */
	do_aes_ctrmain KEY_192

SYM_FUNC_END(aes_ctr_enc_192_avx_by8)
556 
557 /*
558  * routine to do AES256 CTR enc/decrypt "by8"
559  * XMM registers are clobbered.
560  * Saving/restoring must be done at a higher level
561  * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
562  *			unsigned int num_bytes)
563  */
SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
	/* rdi = in, rsi = iv, rdx = expanded keys, rcx = out, r8 = num_bytes */
	/* call the aes main loop */
	do_aes_ctrmain KEY_256

SYM_FUNC_END(aes_ctr_enc_256_avx_by8)
569