1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3  * Implement AES algorithm in Intel AES-NI instructions.
4  *
5  * The white paper of AES-NI instructions can be downloaded from:
6  *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
7  *
8  * Copyright (C) 2008, Intel Corp.
9  *    Author: Huang Ying <ying.huang@intel.com>
10  *            Vinodh Gopal <vinodh.gopal@intel.com>
11  *            Kahraman Akdemir
12  *
13  * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
14  * interface for 64-bit kernels.
15  *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
16  *             Aidan O'Mahony (aidan.o.mahony@intel.com)
17  *             Adrian Hoban <adrian.hoban@intel.com>
18  *             James Guilford (james.guilford@intel.com)
19  *             Gabriele Paoloni <gabriele.paoloni@intel.com>
20  *             Tadeusz Struk (tadeusz.struk@intel.com)
21  *             Wajdi Feghali (wajdi.k.feghali@intel.com)
22  *    Copyright (c) 2010, Intel Corporation.
23  *
24  * Ported x86_64 version to x86:
25  *    Author: Mathias Krause <minipli@googlemail.com>
26  */
27 
28 #include <linux/linkage.h>
29 #include <asm/frame.h>
30 #include <asm/nospec-branch.h>
31 
32 /*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register.  This can be done for either FP or integer values; for FP
 * use movaps (move aligned packed single) and for integer use movdqa (move
 * double quad aligned).  There has been no performance difference between the
 * two instructions since Nehalem (the original Core i7) was released.  However,
 * movaps is a byte shorter, so that is the one we use for now (same for the
 * unaligned variants).
39  */
40 #define MOVADQ	movaps
41 #define MOVUDQ	movups
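
/*
 * Usage sketch (illustrative only): MOVADQ is for 16-byte-aligned sources such
 * as the .rodata constants below, MOVUDQ for possibly unaligned pointers, e.g.
 *
 *	MOVADQ	SHUF_MASK(%rip), %xmm14		# aligned constant
 *	MOVUDQ	(%r10), %xmm1			# arbitrary buffer
 */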
42 
43 #ifdef __x86_64__
44 
45 # constants in mergeable sections, linker can reorder and merge
46 .section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
47 .align 16
48 .Lgf128mul_x_ble_mask:
49 	.octa 0x00000000000000010000000000000087
50 .section	.rodata.cst16.POLY, "aM", @progbits, 16
51 .align 16
52 POLY:   .octa 0xC2000000000000000000000000000001
53 .section	.rodata.cst16.TWOONE, "aM", @progbits, 16
54 .align 16
55 TWOONE: .octa 0x00000001000000000000000000000001
56 
57 .section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
58 .align 16
59 SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
60 .section	.rodata.cst16.MASK1, "aM", @progbits, 16
61 .align 16
62 MASK1:      .octa 0x0000000000000000ffffffffffffffff
63 .section	.rodata.cst16.MASK2, "aM", @progbits, 16
64 .align 16
65 MASK2:      .octa 0xffffffffffffffff0000000000000000
66 .section	.rodata.cst16.ONE, "aM", @progbits, 16
67 .align 16
68 ONE:        .octa 0x00000000000000000000000000000001
69 .section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
70 .align 16
71 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
72 .section	.rodata.cst16.dec, "aM", @progbits, 16
73 .align 16
74 dec:        .octa 0x1
75 .section	.rodata.cst16.enc, "aM", @progbits, 16
76 .align 16
77 enc:        .octa 0x2
78 
79 # order of these constants should not change.
80 # more specifically, ALL_F should follow SHIFT_MASK,
81 # and zero should follow ALL_F
82 .section	.rodata, "a", @progbits
83 .align 16
84 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
85 ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
86             .octa 0x00000000000000000000000000000000
87 
88 .text
89 
90 
91 #define	STACK_OFFSET    8*3
92 
93 #define AadHash 16*0
94 #define AadLen 16*1
95 #define InLen (16*1)+8
96 #define PBlockEncKey 16*2
97 #define OrigIV 16*3
98 #define CurCount 16*4
99 #define PBlockLen 16*5
100 #define	HashKey		16*6	// store HashKey <<1 mod poly here
101 #define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
102 #define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
103 #define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
104 #define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
105 				// bits of  HashKey <<1 mod poly here
106 				//(for Karatsuba purposes)
107 #define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
108 				// bits of  HashKey^2 <<1 mod poly here
109 				// (for Karatsuba purposes)
110 #define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
111 				// bits of  HashKey^3 <<1 mod poly here
112 				// (for Karatsuba purposes)
113 #define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
114 				// bits of  HashKey^4 <<1 mod poly here
115 				// (for Karatsuba purposes)
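//
// Taken together, the offsets above imply a gcm_context_data layout roughly
// like the C sketch below (for orientation only -- the authoritative
// definition lives on the C side; field names here are illustrative):
//
//	struct gcm_context_data {
//		u8  aad_hash[16];              // AadHash,      16*0
//		u64 aad_length;                // AadLen,       16*1
//		u64 in_length;                 // InLen,        16*1 + 8
//		u8  partial_block_enc_key[16]; // PBlockEncKey, 16*2
//		u8  orig_IV[16];               // OrigIV,       16*3
//		u8  current_counter[16];       // CurCount,     16*4
//		u64 partial_block_length;      // PBlockLen,    16*5
//		u64 unused;                    // pad up to     16*6
//		u8  hash_keys[16 * 8];         // HashKey .. HashKey_4_k
//	};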
116 
117 #define arg1 rdi
118 #define arg2 rsi
119 #define arg3 rdx
120 #define arg4 rcx
121 #define arg5 r8
122 #define arg6 r9
123 #define arg7 STACK_OFFSET+8(%rsp)
124 #define arg8 STACK_OFFSET+16(%rsp)
125 #define arg9 STACK_OFFSET+24(%rsp)
126 #define arg10 STACK_OFFSET+32(%rsp)
127 #define arg11 STACK_OFFSET+40(%rsp)
128 #define keysize 2*15*16(%arg1)
129 #endif
130 
131 
132 #define STATE1	%xmm0
133 #define STATE2	%xmm4
134 #define STATE3	%xmm5
135 #define STATE4	%xmm6
136 #define STATE	STATE1
137 #define IN1	%xmm1
138 #define IN2	%xmm7
139 #define IN3	%xmm8
140 #define IN4	%xmm9
141 #define IN	IN1
142 #define KEY	%xmm2
143 #define IV	%xmm3
144 
145 #define BSWAP_MASK %xmm10
146 #define CTR	%xmm11
147 #define INC	%xmm12
148 
149 #define GF128MUL_MASK %xmm10
150 
151 #ifdef __x86_64__
152 #define AREG	%rax
153 #define KEYP	%rdi
154 #define OUTP	%rsi
155 #define UKEYP	OUTP
156 #define INP	%rdx
157 #define LEN	%rcx
158 #define IVP	%r8
159 #define KLEN	%r9d
160 #define T1	%r10
161 #define TKEYP	T1
162 #define T2	%r11
163 #define TCTR_LOW T2
164 #else
165 #define AREG	%eax
166 #define KEYP	%edi
167 #define OUTP	AREG
168 #define UKEYP	OUTP
169 #define INP	%edx
170 #define LEN	%esi
171 #define IVP	%ebp
172 #define KLEN	%ebx
173 #define T1	%ecx
174 #define TKEYP	T1
175 #endif
176 
177 .macro FUNC_SAVE
178 	push	%r12
179 	push	%r13
180 	push	%r14
181 #
182 # states of %xmm registers %xmm6:%xmm15 not saved
183 # all %xmm registers are clobbered
184 #
185 .endm
186 
187 
188 .macro FUNC_RESTORE
189 	pop	%r14
190 	pop	%r13
191 	pop	%r12
192 .endm
193 
194 # Precompute hashkeys.
195 # Input: Hash subkey.
196 # Output: HashKeys stored in gcm_context_data.  Only needs to be called
197 # once per key.
198 # clobbers r12, and tmp xmm registers.
199 .macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
200 	mov	\SUBKEY, %r12
201 	movdqu	(%r12), \TMP3
202 	movdqa	SHUF_MASK(%rip), \TMP2
203 	pshufb	\TMP2, \TMP3
204 
205 	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
206 
207 	movdqa	\TMP3, \TMP2
208 	psllq	$1, \TMP3
209 	psrlq	$63, \TMP2
210 	movdqa	\TMP2, \TMP1
211 	pslldq	$8, \TMP2
212 	psrldq	$8, \TMP1
213 	por	\TMP2, \TMP3
214 
215 	# reduce HashKey<<1
216 
217 	pshufd	$0x24, \TMP1, \TMP2
218 	pcmpeqd TWOONE(%rip), \TMP2
219 	pand	POLY(%rip), \TMP2
220 	pxor	\TMP2, \TMP3
221 	movdqu	\TMP3, HashKey(%arg2)
222 
223 	movdqa	   \TMP3, \TMP5
224 	pshufd	   $78, \TMP3, \TMP1
225 	pxor	   \TMP3, \TMP1
226 	movdqu	   \TMP1, HashKey_k(%arg2)
227 
228 	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
229 # TMP5 = HashKey^2<<1 (mod poly)
230 	movdqu	   \TMP5, HashKey_2(%arg2)
231 # HashKey_2 = HashKey^2<<1 (mod poly)
232 	pshufd	   $78, \TMP5, \TMP1
233 	pxor	   \TMP5, \TMP1
234 	movdqu	   \TMP1, HashKey_2_k(%arg2)
235 
236 	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
237 # TMP5 = HashKey^3<<1 (mod poly)
238 	movdqu	   \TMP5, HashKey_3(%arg2)
239 	pshufd	   $78, \TMP5, \TMP1
240 	pxor	   \TMP5, \TMP1
241 	movdqu	   \TMP1, HashKey_3_k(%arg2)
242 
243 	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
245 	movdqu	   \TMP5, HashKey_4(%arg2)
246 	pshufd	   $78, \TMP5, \TMP1
247 	pxor	   \TMP5, \TMP1
248 	movdqu	   \TMP1, HashKey_4_k(%arg2)
249 .endm
250 
# GCM_INIT initializes a gcm_context_data struct to prepare for encryption/decryption.
252 # Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
253 .macro GCM_INIT Iv SUBKEY AAD AADLEN
254 	mov \AADLEN, %r11
255 	mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
256 	xor %r11d, %r11d
257 	mov %r11, InLen(%arg2) # ctx_data.in_length = 0
258 	mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
259 	mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
260 	mov \Iv, %rax
261 	movdqu (%rax), %xmm0
262 	movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
263 
264 	movdqa  SHUF_MASK(%rip), %xmm2
265 	pshufb %xmm2, %xmm0
266 	movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
267 
268 	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
269 	movdqu HashKey(%arg2), %xmm13
270 
271 	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
272 	%xmm4, %xmm5, %xmm6
273 .endm
274 
# GCM_ENC_DEC encrypts/decrypts the given data. Assumes that the passed
# gcm_context_data struct has been initialized by GCM_INIT.
# Requires the input data to be at least 1 byte long because of READ_PARTIAL_BLOCK
278 # Clobbers rax, r10-r13, and xmm0-xmm15
279 .macro GCM_ENC_DEC operation
280 	movdqu AadHash(%arg2), %xmm8
281 	movdqu HashKey(%arg2), %xmm13
282 	add %arg5, InLen(%arg2)
283 
284 	xor %r11d, %r11d # initialise the data pointer offset as zero
285 	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
286 
287 	sub %r11, %arg5		# sub partial block data used
288 	mov %arg5, %r13		# save the number of bytes
289 
290 	and $-16, %r13		# %r13 = %r13 - (%r13 mod 16)
291 	mov %r13, %r12
292 	# Encrypt/Decrypt first few blocks
293 
294 	and	$(3<<4), %r12
295 	jz	_initial_num_blocks_is_0_\@
296 	cmp	$(2<<4), %r12
297 	jb	_initial_num_blocks_is_1_\@
298 	je	_initial_num_blocks_is_2_\@
299 _initial_num_blocks_is_3_\@:
300 	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
301 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
302 	sub	$48, %r13
303 	jmp	_initial_blocks_\@
304 _initial_num_blocks_is_2_\@:
305 	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
306 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
307 	sub	$32, %r13
308 	jmp	_initial_blocks_\@
309 _initial_num_blocks_is_1_\@:
310 	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
311 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
312 	sub	$16, %r13
313 	jmp	_initial_blocks_\@
314 _initial_num_blocks_is_0_\@:
315 	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
316 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
317 _initial_blocks_\@:
318 
319 	# Main loop - Encrypt/Decrypt remaining blocks
320 
321 	test	%r13, %r13
322 	je	_zero_cipher_left_\@
323 	sub	$64, %r13
324 	je	_four_cipher_left_\@
325 _crypt_by_4_\@:
326 	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
327 	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
328 	%xmm7, %xmm8, enc
329 	add	$64, %r11
330 	sub	$64, %r13
331 	jne	_crypt_by_4_\@
332 _four_cipher_left_\@:
333 	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
334 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
335 _zero_cipher_left_\@:
336 	movdqu %xmm8, AadHash(%arg2)
337 	movdqu %xmm0, CurCount(%arg2)
338 
339 	mov	%arg5, %r13
340 	and	$15, %r13			# %r13 = arg5 (mod 16)
341 	je	_multiple_of_16_bytes_\@
342 
343 	mov %r13, PBlockLen(%arg2)
344 
345 	# Handle the last <16 Byte block separately
346 	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
347 	movdqu %xmm0, CurCount(%arg2)
348 	movdqa SHUF_MASK(%rip), %xmm10
349 	pshufb %xmm10, %xmm0
350 
351 	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
352 	movdqu %xmm0, PBlockEncKey(%arg2)
353 
354 	cmp	$16, %arg5
355 	jge _large_enough_update_\@
356 
357 	lea (%arg4,%r11,1), %r10
358 	mov %r13, %r12
359 	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
360 	jmp _data_read_\@
361 
362 _large_enough_update_\@:
363 	sub	$16, %r11
364 	add	%r13, %r11
365 
366 	# receive the last <16 Byte block
367 	movdqu	(%arg4, %r11, 1), %xmm1
368 
369 	sub	%r13, %r11
370 	add	$16, %r11
371 
372 	lea	SHIFT_MASK+16(%rip), %r12
373 	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
374 	# (r13 is the number of bytes in plaintext mod 16)
375 	sub	%r13, %r12
376 	# get the appropriate shuffle mask
377 	movdqu	(%r12), %xmm2
378 	# shift right 16-r13 bytes
379 	pshufb  %xmm2, %xmm1
380 
381 _data_read_\@:
382 	lea ALL_F+16(%rip), %r12
383 	sub %r13, %r12
384 
385 .ifc \operation, dec
386 	movdqa  %xmm1, %xmm2
387 .endif
388 	pxor	%xmm1, %xmm0            # XOR Encrypt(K, Yn)
389 	movdqu	(%r12), %xmm1
390 	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
391 	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
392 .ifc \operation, dec
393 	pand    %xmm1, %xmm2
394 	movdqa SHUF_MASK(%rip), %xmm10
395 	pshufb %xmm10 ,%xmm2
396 
397 	pxor %xmm2, %xmm8
398 .else
399 	movdqa SHUF_MASK(%rip), %xmm10
400 	pshufb %xmm10,%xmm0
401 
402 	pxor	%xmm0, %xmm8
403 .endif
404 
405 	movdqu %xmm8, AadHash(%arg2)
406 .ifc \operation, enc
407 	# GHASH computation for the last <16 byte block
408 	movdqa SHUF_MASK(%rip), %xmm10
409 	# shuffle xmm0 back to output as ciphertext
410 	pshufb %xmm10, %xmm0
411 .endif
412 
413 	# Output %r13 bytes
414 	movq %xmm0, %rax
415 	cmp $8, %r13
416 	jle _less_than_8_bytes_left_\@
417 	mov %rax, (%arg3 , %r11, 1)
418 	add $8, %r11
419 	psrldq $8, %xmm0
420 	movq %xmm0, %rax
421 	sub $8, %r13
422 _less_than_8_bytes_left_\@:
423 	mov %al,  (%arg3, %r11, 1)
424 	add $1, %r11
425 	shr $8, %rax
426 	sub $1, %r13
427 	jne _less_than_8_bytes_left_\@
428 _multiple_of_16_bytes_\@:
429 .endm
430 
# GCM_COMPLETE finishes computing the tag, folding in the last partial block if any.
# Output: Authentication Tag (AUTH_TAG)
433 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
434 .macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
435 	movdqu AadHash(%arg2), %xmm8
436 	movdqu HashKey(%arg2), %xmm13
437 
438 	mov PBlockLen(%arg2), %r12
439 
440 	test %r12, %r12
441 	je _partial_done\@
442 
443 	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
444 
445 _partial_done\@:
	mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
447 	shl	$3, %r12		  # convert into number of bits
448 	movd	%r12d, %xmm15		  # len(A) in %xmm15
449 	mov InLen(%arg2), %r12
	shl     $3, %r12                  # len(C) in bits (*8)
451 	movq    %r12, %xmm1
452 
453 	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
454 	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
455 	pxor	%xmm15, %xmm8
456 	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
457 	# final GHASH computation
458 	movdqa SHUF_MASK(%rip), %xmm10
459 	pshufb %xmm10, %xmm8
460 
461 	movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
462 	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
463 	pxor	%xmm8, %xmm0
464 _return_T_\@:
465 	mov	\AUTHTAG, %r10                     # %r10 = authTag
466 	mov	\AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
467 	cmp	$16, %r11
468 	je	_T_16_\@
469 	cmp	$8, %r11
470 	jl	_T_4_\@
471 _T_8_\@:
472 	movq	%xmm0, %rax
473 	mov	%rax, (%r10)
474 	add	$8, %r10
475 	sub	$8, %r11
476 	psrldq	$8, %xmm0
477 	test	%r11, %r11
478 	je	_return_T_done_\@
479 _T_4_\@:
480 	movd	%xmm0, %eax
481 	mov	%eax, (%r10)
482 	add	$4, %r10
483 	sub	$4, %r11
484 	psrldq	$4, %xmm0
485 	test	%r11, %r11
486 	je	_return_T_done_\@
487 _T_123_\@:
488 	movd	%xmm0, %eax
489 	cmp	$2, %r11
490 	jl	_T_1_\@
491 	mov	%ax, (%r10)
492 	cmp	$2, %r11
493 	je	_return_T_done_\@
494 	add	$2, %r10
495 	sar	$16, %eax
496 _T_1_\@:
497 	mov	%al, (%r10)
498 	jmp	_return_T_done_\@
499 _T_16_\@:
500 	movdqu	%xmm0, (%r10)
501 _return_T_done_\@:
502 .endm
503 
504 #ifdef __x86_64__
505 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
506 *
507 *
508 * Input: A and B (128-bits each, bit-reflected)
509 * Output: C = A*B*x mod poly, (i.e. >>1 )
510 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
511 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
512 *
513 */
514 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
515 	movdqa	  \GH, \TMP1
516 	pshufd	  $78, \GH, \TMP2
517 	pshufd	  $78, \HK, \TMP3
518 	pxor	  \GH, \TMP2            # TMP2 = a1+a0
519 	pxor	  \HK, \TMP3            # TMP3 = b1+b0
520 	pclmulqdq $0x11, \HK, \TMP1     # TMP1 = a1*b1
521 	pclmulqdq $0x00, \HK, \GH       # GH = a0*b0
522 	pclmulqdq $0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
523 	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2          # TMP2 = (a1*b0)+(a0*b1)
525 	movdqa	  \TMP2, \TMP3
526 	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
527 	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
528 	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
530 
531         # first phase of the reduction
532 
533 	movdqa    \GH, \TMP2
534 	movdqa    \GH, \TMP3
	movdqa    \GH, \TMP4            # copy GH into TMP2, TMP3 and TMP4
					# in order to perform three
					# independent shifts
	pslld     $31, \TMP2            # packed left shift <<31
	pslld     $30, \TMP3            # packed left shift <<30
	pslld     $25, \TMP4            # packed left shift <<25
541 	pxor      \TMP3, \TMP2          # xor the shifted versions
542 	pxor      \TMP4, \TMP2
543 	movdqa    \TMP2, \TMP5
544 	psrldq    $4, \TMP5             # right shift TMP5 1 DW
545 	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
546 	pxor      \TMP2, \GH
547 
548         # second phase of the reduction
549 
	movdqa    \GH,\TMP2             # copy GH into TMP2, TMP3 and TMP4
					# in order to perform three
					# independent shifts
553 	movdqa    \GH,\TMP3
554 	movdqa    \GH,\TMP4
	psrld     $1,\TMP2              # packed right shift >>1
	psrld     $2,\TMP3              # packed right shift >>2
	psrld     $7,\TMP4              # packed right shift >>7
558 	pxor      \TMP3,\TMP2		# xor the shifted versions
559 	pxor      \TMP4,\TMP2
560 	pxor      \TMP5, \TMP2
561 	pxor      \TMP2, \GH
	pxor      \TMP1, \GH            # result is in GH
563 .endm
564 
565 # Reads DLEN bytes starting at DPTR and stores in XMMDst
566 # where 0 < DLEN < 16
567 # Clobbers %rax, DLEN and XMM1
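# For example (illustrative): DLEN = 5 with input bytes b0..b4 leaves
# XMMDst = {b0,b1,b2,b3,b4,0,...,0} -- the bytes land in the low-order lanes of
# XMMDst in their original order and the remaining lanes are zeroed.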
568 .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
569         cmp $8, \DLEN
570         jl _read_lt8_\@
571         mov (\DPTR), %rax
572         movq %rax, \XMMDst
573         sub $8, \DLEN
574         jz _done_read_partial_block_\@
575 	xor %eax, %eax
576 _read_next_byte_\@:
577         shl $8, %rax
578         mov 7(\DPTR, \DLEN, 1), %al
579         dec \DLEN
580         jnz _read_next_byte_\@
581         movq %rax, \XMM1
582 	pslldq $8, \XMM1
583         por \XMM1, \XMMDst
584 	jmp _done_read_partial_block_\@
585 _read_lt8_\@:
586 	xor %eax, %eax
587 _read_next_byte_lt8_\@:
588         shl $8, %rax
589         mov -1(\DPTR, \DLEN, 1), %al
590         dec \DLEN
591         jnz _read_next_byte_lt8_\@
592         movq %rax, \XMMDst
593 _done_read_partial_block_\@:
594 .endm
595 
596 # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
597 # clobbers r10-11, xmm14
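# For example (illustrative): with 20 bytes of AAD, one full 16-byte block is
# folded in via _get_AAD_blocks and the remaining 4 bytes are fetched with
# READ_PARTIAL_BLOCK, zero-padded, and folded in as the final block.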
598 .macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
599 	TMP6 TMP7
600 	MOVADQ	   SHUF_MASK(%rip), %xmm14
601 	mov	   \AAD, %r10		# %r10 = AAD
602 	mov	   \AADLEN, %r11		# %r11 = aadLen
603 	pxor	   \TMP7, \TMP7
604 	pxor	   \TMP6, \TMP6
605 
606 	cmp	   $16, %r11
607 	jl	   _get_AAD_rest\@
608 _get_AAD_blocks\@:
609 	movdqu	   (%r10), \TMP7
610 	pshufb	   %xmm14, \TMP7 # byte-reflect the AAD data
611 	pxor	   \TMP7, \TMP6
612 	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
613 	add	   $16, %r10
614 	sub	   $16, %r11
615 	cmp	   $16, %r11
616 	jge	   _get_AAD_blocks\@
617 
618 	movdqu	   \TMP6, \TMP7
619 
620 	/* read the last <16B of AAD */
621 _get_AAD_rest\@:
622 	test	   %r11, %r11
623 	je	   _get_AAD_done\@
624 
625 	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
626 	pshufb	   %xmm14, \TMP7 # byte-reflect the AAD data
627 	pxor	   \TMP6, \TMP7
628 	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
629 	movdqu \TMP7, \TMP6
630 
631 _get_AAD_done\@:
632 	movdqu \TMP6, AadHash(%arg2)
633 .endm
634 
# PARTIAL_BLOCK: Handles encryption/decryption and hashing of the partial block
# carried over between update calls.
# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
639 # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
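# For example (illustrative): if a previous update left PBlockLen = 5, up to 11
# bytes of this call's input are XORed against the saved PBlockEncKey
# (= E(K, Yn)), written out, and accumulated into AAD_HASH; the GHASH multiply
# is only performed once the 16-byte block is complete, otherwise PBlockLen
# grows and the multiply is deferred to a later call.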
640 .macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
641 	AAD_HASH operation
642 	mov 	PBlockLen(%arg2), %r13
643 	test	%r13, %r13
644 	je	_partial_block_done_\@	# Leave Macro if no partial blocks
	# Read in input data without over-reading
646 	cmp	$16, \PLAIN_CYPH_LEN
647 	jl	_fewer_than_16_bytes_\@
648 	movups	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
649 	jmp	_data_read_\@
650 
651 _fewer_than_16_bytes_\@:
652 	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
653 	mov	\PLAIN_CYPH_LEN, %r12
654 	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
655 
656 	mov PBlockLen(%arg2), %r13
657 
658 _data_read_\@:				# Finished reading in data
659 
660 	movdqu	PBlockEncKey(%arg2), %xmm9
661 	movdqu	HashKey(%arg2), %xmm13
662 
663 	lea	SHIFT_MASK(%rip), %r12
664 
665 	# adjust the shuffle mask pointer to be able to shift r13 bytes
	# (16-r13 is the number of bytes in plaintext mod 16)
667 	add	%r13, %r12
668 	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
669 	pshufb	%xmm2, %xmm9		# shift right r13 bytes
670 
671 .ifc \operation, dec
672 	movdqa	%xmm1, %xmm3
673 	pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)
674 
675 	mov	\PLAIN_CYPH_LEN, %r10
676 	add	%r13, %r10
	# Set r10 to be the amount of data left in PLAIN_CYPH_IN after filling
678 	sub	$16, %r10
	# Determine if the partial block is not being filled and
	# shift the mask accordingly
681 	jge	_no_extra_mask_1_\@
682 	sub	%r10, %r12
683 _no_extra_mask_1_\@:
684 
685 	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
686 	# get the appropriate mask to mask out bottom r13 bytes of xmm9
687 	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9
688 
689 	pand	%xmm1, %xmm3
690 	movdqa	SHUF_MASK(%rip), %xmm10
691 	pshufb	%xmm10, %xmm3
692 	pshufb	%xmm2, %xmm3
693 	pxor	%xmm3, \AAD_HASH
694 
695 	test	%r10, %r10
696 	jl	_partial_incomplete_1_\@
697 
698 	# GHASH computation for the last <16 Byte block
699 	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
700 	xor	%eax, %eax
701 
702 	mov	%rax, PBlockLen(%arg2)
703 	jmp	_dec_done_\@
704 _partial_incomplete_1_\@:
705 	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
706 _dec_done_\@:
707 	movdqu	\AAD_HASH, AadHash(%arg2)
708 .else
709 	pxor	%xmm1, %xmm9			# Plaintext XOR E(K, Yn)
710 
711 	mov	\PLAIN_CYPH_LEN, %r10
712 	add	%r13, %r10
	# Set r10 to be the amount of data left in PLAIN_CYPH_IN after filling
714 	sub	$16, %r10
	# Determine if the partial block is not being filled and
	# shift the mask accordingly
717 	jge	_no_extra_mask_2_\@
718 	sub	%r10, %r12
719 _no_extra_mask_2_\@:
720 
721 	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
722 	# get the appropriate mask to mask out bottom r13 bytes of xmm9
723 	pand	%xmm1, %xmm9
724 
725 	movdqa	SHUF_MASK(%rip), %xmm1
726 	pshufb	%xmm1, %xmm9
727 	pshufb	%xmm2, %xmm9
728 	pxor	%xmm9, \AAD_HASH
729 
730 	test	%r10, %r10
731 	jl	_partial_incomplete_2_\@
732 
733 	# GHASH computation for the last <16 Byte block
734 	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
735 	xor	%eax, %eax
736 
737 	mov	%rax, PBlockLen(%arg2)
738 	jmp	_encode_done_\@
739 _partial_incomplete_2_\@:
740 	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
741 _encode_done_\@:
742 	movdqu	\AAD_HASH, AadHash(%arg2)
743 
744 	movdqa	SHUF_MASK(%rip), %xmm10
745 	# shuffle xmm9 back to output as ciphertext
746 	pshufb	%xmm10, %xmm9
747 	pshufb	%xmm2, %xmm9
748 .endif
749 	# output encrypted Bytes
750 	test	%r10, %r10
751 	jl	_partial_fill_\@
752 	mov	%r13, %r12
753 	mov	$16, %r13
754 	# Set r13 to be the number of bytes to write out
755 	sub	%r12, %r13
756 	jmp	_count_set_\@
757 _partial_fill_\@:
758 	mov	\PLAIN_CYPH_LEN, %r13
759 _count_set_\@:
760 	movdqa	%xmm9, %xmm0
761 	movq	%xmm0, %rax
762 	cmp	$8, %r13
763 	jle	_less_than_8_bytes_left_\@
764 
765 	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
766 	add	$8, \DATA_OFFSET
767 	psrldq	$8, %xmm0
768 	movq	%xmm0, %rax
769 	sub	$8, %r13
770 _less_than_8_bytes_left_\@:
771 	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
772 	add	$1, \DATA_OFFSET
773 	shr	$8, %rax
774 	sub	$1, %r13
775 	jne	_less_than_8_bytes_left_\@
776 _partial_block_done_\@:
777 .endm # PARTIAL_BLOCK
778 
779 /*
780 * if a = number of total plaintext bytes
781 * b = floor(a/16)
782 * num_initial_blocks = b mod 4
783 * encrypt the initial num_initial_blocks blocks and apply ghash on
784 * the ciphertext
785 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
786 * are clobbered
787 * arg1, %arg2, %arg3 are used as a pointer only, not modified
788 */
789 
790 
791 .macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
792 	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
793 	MOVADQ		SHUF_MASK(%rip), %xmm14
794 
	movdqu AadHash(%arg2), %xmm\i		    # load the current hash value
796 
797 	# start AES for num_initial_blocks blocks
798 
799 	movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0
800 
801 .if (\i == 5) || (\i == 6) || (\i == 7)
802 
803 	MOVADQ		ONE(%RIP),\TMP1
804 	MOVADQ		0(%arg1),\TMP2
805 .irpc index, \i_seq
806 	paddd		\TMP1, \XMM0                 # INCR Y0
807 .ifc \operation, dec
808         movdqa     \XMM0, %xmm\index
809 .else
810 	MOVADQ		\XMM0, %xmm\index
811 .endif
812 	pshufb	%xmm14, %xmm\index      # perform a 16 byte swap
813 	pxor		\TMP2, %xmm\index
814 .endr
815 	lea	0x10(%arg1),%r10
816 	mov	keysize,%eax
817 	shr	$2,%eax				# 128->4, 192->6, 256->8
818 	add	$5,%eax			      # 128->9, 192->11, 256->13
819 
820 aes_loop_initial_\@:
821 	MOVADQ	(%r10),\TMP1
822 .irpc	index, \i_seq
823 	aesenc	\TMP1, %xmm\index
824 .endr
825 	add	$16,%r10
826 	sub	$1,%eax
827 	jnz	aes_loop_initial_\@
828 
829 	MOVADQ	(%r10), \TMP1
830 .irpc index, \i_seq
831 	aesenclast \TMP1, %xmm\index         # Last Round
832 .endr
833 .irpc index, \i_seq
834 	movdqu	   (%arg4 , %r11, 1), \TMP1
835 	pxor	   \TMP1, %xmm\index
836 	movdqu	   %xmm\index, (%arg3 , %r11, 1)
837 	# write back plaintext/ciphertext for num_initial_blocks
838 	add	   $16, %r11
839 
840 .ifc \operation, dec
841 	movdqa     \TMP1, %xmm\index
842 .endif
843 	pshufb	   %xmm14, %xmm\index
844 
845 		# prepare plaintext/ciphertext for GHASH computation
846 .endr
847 .endif
848 
849         # apply GHASH on num_initial_blocks blocks
850 
851 .if \i == 5
852         pxor       %xmm5, %xmm6
853 	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
854         pxor       %xmm6, %xmm7
855 	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
856         pxor       %xmm7, %xmm8
857 	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
858 .elseif \i == 6
859         pxor       %xmm6, %xmm7
860 	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
861         pxor       %xmm7, %xmm8
862 	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
863 .elseif \i == 7
864         pxor       %xmm7, %xmm8
865 	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
866 .endif
867 	cmp	   $64, %r13
868 	jl	_initial_blocks_done\@
869 	# no need for precomputed values
870 /*
871 *
872 * Precomputations for HashKey parallel with encryption of first 4 blocks.
873 * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
874 */
875 	MOVADQ	   ONE(%RIP),\TMP1
876 	paddd	   \TMP1, \XMM0              # INCR Y0
877 	MOVADQ	   \XMM0, \XMM1
878 	pshufb  %xmm14, \XMM1        # perform a 16 byte swap
879 
880 	paddd	   \TMP1, \XMM0              # INCR Y0
881 	MOVADQ	   \XMM0, \XMM2
882 	pshufb  %xmm14, \XMM2        # perform a 16 byte swap
883 
884 	paddd	   \TMP1, \XMM0              # INCR Y0
885 	MOVADQ	   \XMM0, \XMM3
886 	pshufb %xmm14, \XMM3        # perform a 16 byte swap
887 
888 	paddd	   \TMP1, \XMM0              # INCR Y0
889 	MOVADQ	   \XMM0, \XMM4
890 	pshufb %xmm14, \XMM4        # perform a 16 byte swap
891 
892 	MOVADQ	   0(%arg1),\TMP1
893 	pxor	   \TMP1, \XMM1
894 	pxor	   \TMP1, \XMM2
895 	pxor	   \TMP1, \XMM3
896 	pxor	   \TMP1, \XMM4
897 .irpc index, 1234 # do 4 rounds
898 	movaps 0x10*\index(%arg1), \TMP1
899 	aesenc	   \TMP1, \XMM1
900 	aesenc	   \TMP1, \XMM2
901 	aesenc	   \TMP1, \XMM3
902 	aesenc	   \TMP1, \XMM4
903 .endr
904 .irpc index, 56789 # do next 5 rounds
905 	movaps 0x10*\index(%arg1), \TMP1
906 	aesenc	   \TMP1, \XMM1
907 	aesenc	   \TMP1, \XMM2
908 	aesenc	   \TMP1, \XMM3
909 	aesenc	   \TMP1, \XMM4
910 .endr
911 	lea	   0xa0(%arg1),%r10
912 	mov	   keysize,%eax
913 	shr	   $2,%eax			# 128->4, 192->6, 256->8
914 	sub	   $4,%eax			# 128->0, 192->2, 256->4
915 	jz	   aes_loop_pre_done\@
916 
917 aes_loop_pre_\@:
918 	MOVADQ	   (%r10),\TMP2
919 .irpc	index, 1234
920 	aesenc	   \TMP2, %xmm\index
921 .endr
922 	add	   $16,%r10
923 	sub	   $1,%eax
924 	jnz	   aes_loop_pre_\@
925 
926 aes_loop_pre_done\@:
927 	MOVADQ	   (%r10), \TMP2
928 	aesenclast \TMP2, \XMM1
929 	aesenclast \TMP2, \XMM2
930 	aesenclast \TMP2, \XMM3
931 	aesenclast \TMP2, \XMM4
932 	movdqu	   16*0(%arg4 , %r11 , 1), \TMP1
933 	pxor	   \TMP1, \XMM1
934 .ifc \operation, dec
935 	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
936 	movdqa     \TMP1, \XMM1
937 .endif
938 	movdqu	   16*1(%arg4 , %r11 , 1), \TMP1
939 	pxor	   \TMP1, \XMM2
940 .ifc \operation, dec
941 	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
942 	movdqa     \TMP1, \XMM2
943 .endif
944 	movdqu	   16*2(%arg4 , %r11 , 1), \TMP1
945 	pxor	   \TMP1, \XMM3
946 .ifc \operation, dec
947 	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
948 	movdqa     \TMP1, \XMM3
949 .endif
950 	movdqu	   16*3(%arg4 , %r11 , 1), \TMP1
951 	pxor	   \TMP1, \XMM4
952 .ifc \operation, dec
953 	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
954 	movdqa     \TMP1, \XMM4
955 .else
956 	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
957 	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
958 	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
959 	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
960 .endif
961 
962 	add	   $64, %r11
963 	pshufb %xmm14, \XMM1 # perform a 16 byte swap
964 	pxor	   \XMMDst, \XMM1
965 # combine GHASHed value with the corresponding ciphertext
966 	pshufb %xmm14, \XMM2 # perform a 16 byte swap
967 	pshufb %xmm14, \XMM3 # perform a 16 byte swap
968 	pshufb %xmm14, \XMM4 # perform a 16 byte swap
969 
970 _initial_blocks_done\@:
971 
972 .endm
973 
974 /*
975 * encrypt 4 blocks at a time
976 * ghash the 4 previously encrypted ciphertext blocks
977 * arg1, %arg3, %arg4 are used as pointers only, not modified
978 * %r11 is the data offset value
979 */
980 .macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
981 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
982 
983 	movdqa	  \XMM1, \XMM5
984 	movdqa	  \XMM2, \XMM6
985 	movdqa	  \XMM3, \XMM7
986 	movdqa	  \XMM4, \XMM8
987 
988         movdqa    SHUF_MASK(%rip), %xmm15
        # multiply XMM5 * HashKey_4 using Karatsuba
990 
991 	movdqa	  \XMM5, \TMP4
992 	pshufd	  $78, \XMM5, \TMP6
993 	pxor	  \XMM5, \TMP6
994 	paddd     ONE(%rip), \XMM0		# INCR CNT
995 	movdqu	  HashKey_4(%arg2), \TMP5
996 	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
997 	movdqa    \XMM0, \XMM1
998 	paddd     ONE(%rip), \XMM0		# INCR CNT
999 	movdqa    \XMM0, \XMM2
1000 	paddd     ONE(%rip), \XMM0		# INCR CNT
1001 	movdqa    \XMM0, \XMM3
1002 	paddd     ONE(%rip), \XMM0		# INCR CNT
1003 	movdqa    \XMM0, \XMM4
1004 	pshufb %xmm15, \XMM1	# perform a 16 byte swap
1005 	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1006 	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1007 	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1008 	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1009 
1010 	pxor	  (%arg1), \XMM1
1011 	pxor	  (%arg1), \XMM2
1012 	pxor	  (%arg1), \XMM3
1013 	pxor	  (%arg1), \XMM4
1014 	movdqu	  HashKey_4_k(%arg2), \TMP5
1015 	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1016 	movaps 0x10(%arg1), \TMP1
1017 	aesenc	  \TMP1, \XMM1              # Round 1
1018 	aesenc	  \TMP1, \XMM2
1019 	aesenc	  \TMP1, \XMM3
1020 	aesenc	  \TMP1, \XMM4
1021 	movaps 0x20(%arg1), \TMP1
1022 	aesenc	  \TMP1, \XMM1              # Round 2
1023 	aesenc	  \TMP1, \XMM2
1024 	aesenc	  \TMP1, \XMM3
1025 	aesenc	  \TMP1, \XMM4
1026 	movdqa	  \XMM6, \TMP1
1027 	pshufd	  $78, \XMM6, \TMP2
1028 	pxor	  \XMM6, \TMP2
1029 	movdqu	  HashKey_3(%arg2), \TMP5
1030 	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1031 	movaps 0x30(%arg1), \TMP3
1032 	aesenc    \TMP3, \XMM1              # Round 3
1033 	aesenc    \TMP3, \XMM2
1034 	aesenc    \TMP3, \XMM3
1035 	aesenc    \TMP3, \XMM4
1036 	pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1037 	movaps 0x40(%arg1), \TMP3
1038 	aesenc	  \TMP3, \XMM1              # Round 4
1039 	aesenc	  \TMP3, \XMM2
1040 	aesenc	  \TMP3, \XMM3
1041 	aesenc	  \TMP3, \XMM4
1042 	movdqu	  HashKey_3_k(%arg2), \TMP5
1043 	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1044 	movaps 0x50(%arg1), \TMP3
1045 	aesenc	  \TMP3, \XMM1              # Round 5
1046 	aesenc	  \TMP3, \XMM2
1047 	aesenc	  \TMP3, \XMM3
1048 	aesenc	  \TMP3, \XMM4
1049 	pxor	  \TMP1, \TMP4
1050 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1051 	pxor	  \XMM6, \XMM5
1052 	pxor	  \TMP2, \TMP6
1053 	movdqa	  \XMM7, \TMP1
1054 	pshufd	  $78, \XMM7, \TMP2
1055 	pxor	  \XMM7, \TMP2
1056 	movdqu	  HashKey_2(%arg2), \TMP5
1057 
        # Multiply XMM7 * HashKey_2 using Karatsuba
1059 
1060 	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1061 	movaps 0x60(%arg1), \TMP3
1062 	aesenc	  \TMP3, \XMM1              # Round 6
1063 	aesenc	  \TMP3, \XMM2
1064 	aesenc	  \TMP3, \XMM3
1065 	aesenc	  \TMP3, \XMM4
1066 	pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1067 	movaps 0x70(%arg1), \TMP3
1068 	aesenc	  \TMP3, \XMM1              # Round 7
1069 	aesenc	  \TMP3, \XMM2
1070 	aesenc	  \TMP3, \XMM3
1071 	aesenc	  \TMP3, \XMM4
1072 	movdqu	  HashKey_2_k(%arg2), \TMP5
1073 	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1074 	movaps 0x80(%arg1), \TMP3
1075 	aesenc	  \TMP3, \XMM1              # Round 8
1076 	aesenc	  \TMP3, \XMM2
1077 	aesenc	  \TMP3, \XMM3
1078 	aesenc	  \TMP3, \XMM4
1079 	pxor	  \TMP1, \TMP4
1080 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1081 	pxor	  \XMM7, \XMM5
1082 	pxor	  \TMP2, \TMP6
1083 
1084         # Multiply XMM8 * HashKey
1085         # XMM8 and TMP5 hold the values for the two operands
1086 
1087 	movdqa	  \XMM8, \TMP1
1088 	pshufd	  $78, \XMM8, \TMP2
1089 	pxor	  \XMM8, \TMP2
1090 	movdqu	  HashKey(%arg2), \TMP5
1091 	pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1092 	movaps 0x90(%arg1), \TMP3
1093 	aesenc	  \TMP3, \XMM1             # Round 9
1094 	aesenc	  \TMP3, \XMM2
1095 	aesenc	  \TMP3, \XMM3
1096 	aesenc	  \TMP3, \XMM4
1097 	pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1098 	lea	  0xa0(%arg1),%r10
1099 	mov	  keysize,%eax
1100 	shr	  $2,%eax			# 128->4, 192->6, 256->8
1101 	sub	  $4,%eax			# 128->0, 192->2, 256->4
1102 	jz	  aes_loop_par_enc_done\@
1103 
1104 aes_loop_par_enc\@:
1105 	MOVADQ	  (%r10),\TMP3
1106 .irpc	index, 1234
1107 	aesenc	  \TMP3, %xmm\index
1108 .endr
1109 	add	  $16,%r10
1110 	sub	  $1,%eax
1111 	jnz	  aes_loop_par_enc\@
1112 
1113 aes_loop_par_enc_done\@:
1114 	MOVADQ	  (%r10), \TMP3
1115 	aesenclast \TMP3, \XMM1           # Round 10
1116 	aesenclast \TMP3, \XMM2
1117 	aesenclast \TMP3, \XMM3
1118 	aesenclast \TMP3, \XMM4
1119 	movdqu    HashKey_k(%arg2), \TMP5
1120 	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1121 	movdqu	  (%arg4,%r11,1), \TMP3
1122 	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1123 	movdqu	  16(%arg4,%r11,1), \TMP3
1124 	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1125 	movdqu	  32(%arg4,%r11,1), \TMP3
1126 	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1127 	movdqu	  48(%arg4,%r11,1), \TMP3
1128 	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1129         movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
1130         movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
1131         movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
1132         movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
1133 	pshufb %xmm15, \XMM1        # perform a 16 byte swap
1134 	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1135 	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1136 	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1137 
1138 	pxor	  \TMP4, \TMP1
1139 	pxor	  \XMM8, \XMM5
1140 	pxor	  \TMP6, \TMP2
1141 	pxor	  \TMP1, \TMP2
1142 	pxor	  \XMM5, \TMP2
1143 	movdqa	  \TMP2, \TMP3
1144 	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1145 	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1146 	pxor	  \TMP3, \XMM5
1147 	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1148 
1149         # first phase of reduction
1150 
1151 	movdqa    \XMM5, \TMP2
1152 	movdqa    \XMM5, \TMP3
1153 	movdqa    \XMM5, \TMP4
1154 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld     $31, \TMP2                   # packed left shift << 31
	pslld     $30, \TMP3                   # packed left shift << 30
	pslld     $25, \TMP4                   # packed left shift << 25
1158 	pxor      \TMP3, \TMP2	               # xor the shifted versions
1159 	pxor      \TMP4, \TMP2
1160 	movdqa    \TMP2, \TMP5
1161 	psrldq    $4, \TMP5                    # right shift T5 1 DW
1162 	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1163 	pxor      \TMP2, \XMM5
1164 
1165         # second phase of reduction
1166 
1167 	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1168 	movdqa    \XMM5,\TMP3
1169 	movdqa    \XMM5,\TMP4
	psrld     $1, \TMP2                    # packed right shift >>1
	psrld     $2, \TMP3                    # packed right shift >>2
	psrld     $7, \TMP4                    # packed right shift >>7
1173 	pxor      \TMP3,\TMP2		       # xor the shifted versions
1174 	pxor      \TMP4,\TMP2
1175 	pxor      \TMP5, \TMP2
1176 	pxor      \TMP2, \XMM5
	pxor      \TMP1, \XMM5                 # result is in XMM5
1178 
1179 	pxor	  \XMM5, \XMM1
1180 .endm
1181 
1182 /*
1183 * decrypt 4 blocks at a time
1184 * ghash the 4 previously decrypted ciphertext blocks
1185 * arg1, %arg3, %arg4 are used as pointers only, not modified
1186 * %r11 is the data offset value
1187 */
1188 .macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
1189 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1190 
1191 	movdqa	  \XMM1, \XMM5
1192 	movdqa	  \XMM2, \XMM6
1193 	movdqa	  \XMM3, \XMM7
1194 	movdqa	  \XMM4, \XMM8
1195 
1196         movdqa    SHUF_MASK(%rip), %xmm15
        # multiply XMM5 * HashKey_4 using Karatsuba
1198 
1199 	movdqa	  \XMM5, \TMP4
1200 	pshufd	  $78, \XMM5, \TMP6
1201 	pxor	  \XMM5, \TMP6
1202 	paddd     ONE(%rip), \XMM0		# INCR CNT
1203 	movdqu	  HashKey_4(%arg2), \TMP5
1204 	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
1205 	movdqa    \XMM0, \XMM1
1206 	paddd     ONE(%rip), \XMM0		# INCR CNT
1207 	movdqa    \XMM0, \XMM2
1208 	paddd     ONE(%rip), \XMM0		# INCR CNT
1209 	movdqa    \XMM0, \XMM3
1210 	paddd     ONE(%rip), \XMM0		# INCR CNT
1211 	movdqa    \XMM0, \XMM4
1212 	pshufb %xmm15, \XMM1	# perform a 16 byte swap
1213 	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1214 	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1215 	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1216 	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1217 
1218 	pxor	  (%arg1), \XMM1
1219 	pxor	  (%arg1), \XMM2
1220 	pxor	  (%arg1), \XMM3
1221 	pxor	  (%arg1), \XMM4
1222 	movdqu	  HashKey_4_k(%arg2), \TMP5
1223 	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1224 	movaps 0x10(%arg1), \TMP1
1225 	aesenc	  \TMP1, \XMM1              # Round 1
1226 	aesenc	  \TMP1, \XMM2
1227 	aesenc	  \TMP1, \XMM3
1228 	aesenc	  \TMP1, \XMM4
1229 	movaps 0x20(%arg1), \TMP1
1230 	aesenc	  \TMP1, \XMM1              # Round 2
1231 	aesenc	  \TMP1, \XMM2
1232 	aesenc	  \TMP1, \XMM3
1233 	aesenc	  \TMP1, \XMM4
1234 	movdqa	  \XMM6, \TMP1
1235 	pshufd	  $78, \XMM6, \TMP2
1236 	pxor	  \XMM6, \TMP2
1237 	movdqu	  HashKey_3(%arg2), \TMP5
1238 	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1239 	movaps 0x30(%arg1), \TMP3
1240 	aesenc    \TMP3, \XMM1              # Round 3
1241 	aesenc    \TMP3, \XMM2
1242 	aesenc    \TMP3, \XMM3
1243 	aesenc    \TMP3, \XMM4
1244 	pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1245 	movaps 0x40(%arg1), \TMP3
1246 	aesenc	  \TMP3, \XMM1              # Round 4
1247 	aesenc	  \TMP3, \XMM2
1248 	aesenc	  \TMP3, \XMM3
1249 	aesenc	  \TMP3, \XMM4
1250 	movdqu	  HashKey_3_k(%arg2), \TMP5
1251 	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1252 	movaps 0x50(%arg1), \TMP3
1253 	aesenc	  \TMP3, \XMM1              # Round 5
1254 	aesenc	  \TMP3, \XMM2
1255 	aesenc	  \TMP3, \XMM3
1256 	aesenc	  \TMP3, \XMM4
1257 	pxor	  \TMP1, \TMP4
1258 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1259 	pxor	  \XMM6, \XMM5
1260 	pxor	  \TMP2, \TMP6
1261 	movdqa	  \XMM7, \TMP1
1262 	pshufd	  $78, \XMM7, \TMP2
1263 	pxor	  \XMM7, \TMP2
1264 	movdqu	  HashKey_2(%arg2), \TMP5
1265 
        # Multiply XMM7 * HashKey_2 using Karatsuba
1267 
1268 	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1269 	movaps 0x60(%arg1), \TMP3
1270 	aesenc	  \TMP3, \XMM1              # Round 6
1271 	aesenc	  \TMP3, \XMM2
1272 	aesenc	  \TMP3, \XMM3
1273 	aesenc	  \TMP3, \XMM4
1274 	pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1275 	movaps 0x70(%arg1), \TMP3
1276 	aesenc	  \TMP3, \XMM1              # Round 7
1277 	aesenc	  \TMP3, \XMM2
1278 	aesenc	  \TMP3, \XMM3
1279 	aesenc	  \TMP3, \XMM4
1280 	movdqu	  HashKey_2_k(%arg2), \TMP5
1281 	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1282 	movaps 0x80(%arg1), \TMP3
1283 	aesenc	  \TMP3, \XMM1              # Round 8
1284 	aesenc	  \TMP3, \XMM2
1285 	aesenc	  \TMP3, \XMM3
1286 	aesenc	  \TMP3, \XMM4
1287 	pxor	  \TMP1, \TMP4
1288 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1289 	pxor	  \XMM7, \XMM5
1290 	pxor	  \TMP2, \TMP6
1291 
1292         # Multiply XMM8 * HashKey
1293         # XMM8 and TMP5 hold the values for the two operands
1294 
1295 	movdqa	  \XMM8, \TMP1
1296 	pshufd	  $78, \XMM8, \TMP2
1297 	pxor	  \XMM8, \TMP2
1298 	movdqu	  HashKey(%arg2), \TMP5
1299 	pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1300 	movaps 0x90(%arg1), \TMP3
1301 	aesenc	  \TMP3, \XMM1             # Round 9
1302 	aesenc	  \TMP3, \XMM2
1303 	aesenc	  \TMP3, \XMM3
1304 	aesenc	  \TMP3, \XMM4
1305 	pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1306 	lea	  0xa0(%arg1),%r10
1307 	mov	  keysize,%eax
1308 	shr	  $2,%eax		        # 128->4, 192->6, 256->8
1309 	sub	  $4,%eax			# 128->0, 192->2, 256->4
1310 	jz	  aes_loop_par_dec_done\@
1311 
1312 aes_loop_par_dec\@:
1313 	MOVADQ	  (%r10),\TMP3
1314 .irpc	index, 1234
1315 	aesenc	  \TMP3, %xmm\index
1316 .endr
1317 	add	  $16,%r10
1318 	sub	  $1,%eax
1319 	jnz	  aes_loop_par_dec\@
1320 
1321 aes_loop_par_dec_done\@:
1322 	MOVADQ	  (%r10), \TMP3
1323 	aesenclast \TMP3, \XMM1           # last round
1324 	aesenclast \TMP3, \XMM2
1325 	aesenclast \TMP3, \XMM3
1326 	aesenclast \TMP3, \XMM4
1327 	movdqu    HashKey_k(%arg2), \TMP5
1328 	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1329 	movdqu	  (%arg4,%r11,1), \TMP3
1330 	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1331 	movdqu	  \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
1332 	movdqa    \TMP3, \XMM1
1333 	movdqu	  16(%arg4,%r11,1), \TMP3
1334 	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1335 	movdqu	  \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
1336 	movdqa    \TMP3, \XMM2
1337 	movdqu	  32(%arg4,%r11,1), \TMP3
1338 	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1339 	movdqu	  \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
1340 	movdqa    \TMP3, \XMM3
1341 	movdqu	  48(%arg4,%r11,1), \TMP3
1342 	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1343 	movdqu	  \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
1344 	movdqa    \TMP3, \XMM4
1345 	pshufb %xmm15, \XMM1        # perform a 16 byte swap
1346 	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1347 	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1348 	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1349 
1350 	pxor	  \TMP4, \TMP1
1351 	pxor	  \XMM8, \XMM5
1352 	pxor	  \TMP6, \TMP2
1353 	pxor	  \TMP1, \TMP2
1354 	pxor	  \XMM5, \TMP2
1355 	movdqa	  \TMP2, \TMP3
1356 	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1357 	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1358 	pxor	  \TMP3, \XMM5
1359 	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1360 
1361         # first phase of reduction
1362 
1363 	movdqa    \XMM5, \TMP2
1364 	movdqa    \XMM5, \TMP3
1365 	movdqa    \XMM5, \TMP4
1366 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld     $31, \TMP2                   # packed left shift << 31
	pslld     $30, \TMP3                   # packed left shift << 30
	pslld     $25, \TMP4                   # packed left shift << 25
1370 	pxor      \TMP3, \TMP2	               # xor the shifted versions
1371 	pxor      \TMP4, \TMP2
1372 	movdqa    \TMP2, \TMP5
1373 	psrldq    $4, \TMP5                    # right shift T5 1 DW
1374 	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1375 	pxor      \TMP2, \XMM5
1376 
1377         # second phase of reduction
1378 
1379 	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1380 	movdqa    \XMM5,\TMP3
1381 	movdqa    \XMM5,\TMP4
	psrld     $1, \TMP2                    # packed right shift >>1
	psrld     $2, \TMP3                    # packed right shift >>2
	psrld     $7, \TMP4                    # packed right shift >>7
1385 	pxor      \TMP3,\TMP2		       # xor the shifted versions
1386 	pxor      \TMP4,\TMP2
1387 	pxor      \TMP5, \TMP2
1388 	pxor      \TMP2, \XMM5
	pxor      \TMP1, \XMM5                 # result is in XMM5
1390 
1391 	pxor	  \XMM5, \XMM1
1392 .endm
1393 
1394 /* GHASH the last 4 ciphertext blocks. */
1395 .macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1396 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1397 
1398         # Multiply TMP6 * HashKey (using Karatsuba)
1399 
1400 	movdqa	  \XMM1, \TMP6
1401 	pshufd	  $78, \XMM1, \TMP2
1402 	pxor	  \XMM1, \TMP2
1403 	movdqu	  HashKey_4(%arg2), \TMP5
1404 	pclmulqdq $0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1405 	pclmulqdq $0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1406 	movdqu	  HashKey_4_k(%arg2), \TMP4
1407 	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1408 	movdqa	  \XMM1, \XMMDst
1409 	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1410 
1411         # Multiply TMP1 * HashKey (using Karatsuba)
1412 
1413 	movdqa	  \XMM2, \TMP1
1414 	pshufd	  $78, \XMM2, \TMP2
1415 	pxor	  \XMM2, \TMP2
1416 	movdqu	  HashKey_3(%arg2), \TMP5
1417 	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1418 	pclmulqdq $0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1419 	movdqu	  HashKey_3_k(%arg2), \TMP4
1420 	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1421 	pxor	  \TMP1, \TMP6
1422 	pxor	  \XMM2, \XMMDst
1423 	pxor	  \TMP2, \XMM1
1424 # results accumulated in TMP6, XMMDst, XMM1
1425 
1426         # Multiply TMP1 * HashKey (using Karatsuba)
1427 
1428 	movdqa	  \XMM3, \TMP1
1429 	pshufd	  $78, \XMM3, \TMP2
1430 	pxor	  \XMM3, \TMP2
1431 	movdqu	  HashKey_2(%arg2), \TMP5
1432 	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1433 	pclmulqdq $0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1434 	movdqu	  HashKey_2_k(%arg2), \TMP4
1435 	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1436 	pxor	  \TMP1, \TMP6
1437 	pxor	  \XMM3, \XMMDst
1438 	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1439 
1440         # Multiply TMP1 * HashKey (using Karatsuba)
1441 	movdqa	  \XMM4, \TMP1
1442 	pshufd	  $78, \XMM4, \TMP2
1443 	pxor	  \XMM4, \TMP2
1444 	movdqu	  HashKey(%arg2), \TMP5
1445 	pclmulqdq $0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
1446 	pclmulqdq $0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1447 	movdqu	  HashKey_k(%arg2), \TMP4
1448 	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1449 	pxor	  \TMP1, \TMP6
1450 	pxor	  \XMM4, \XMMDst
1451 	pxor	  \XMM1, \TMP2
1452 	pxor	  \TMP6, \TMP2
1453 	pxor	  \XMMDst, \TMP2
1454 	# middle section of the temp results combined as in karatsuba algorithm
1455 	movdqa	  \TMP2, \TMP4
1456 	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
1457 	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
1458 	pxor	  \TMP4, \XMMDst
1459 	pxor	  \TMP2, \TMP6
1460 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1461 	# first phase of the reduction
1462 	movdqa    \XMMDst, \TMP2
1463 	movdqa    \XMMDst, \TMP3
1464 	movdqa    \XMMDst, \TMP4
1465 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld     $31, \TMP2                # packed left shifting << 31
	pslld     $30, \TMP3                # packed left shifting << 30
	pslld     $25, \TMP4                # packed left shifting << 25
1469 	pxor      \TMP3, \TMP2              # xor the shifted versions
1470 	pxor      \TMP4, \TMP2
1471 	movdqa    \TMP2, \TMP7
1472 	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1473 	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1474 	pxor      \TMP2, \XMMDst
1475 
1476         # second phase of the reduction
1477 	movdqa    \XMMDst, \TMP2
1478 	# make 3 copies of XMMDst for doing 3 shift operations
1479 	movdqa    \XMMDst, \TMP3
1480 	movdqa    \XMMDst, \TMP4
	psrld     $1, \TMP2                 # packed right shift >> 1
	psrld     $2, \TMP3                 # packed right shift >> 2
	psrld     $7, \TMP4                 # packed right shift >> 7
1484 	pxor      \TMP3, \TMP2              # xor the shifted versions
1485 	pxor      \TMP4, \TMP2
1486 	pxor      \TMP7, \TMP2
1487 	pxor      \TMP2, \XMMDst
1488 	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1489 .endm
1490 
1491 
1492 /* Encryption of a single block
1493 * uses eax & r10
1494 */
1495 
1496 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1497 
1498 	pxor		(%arg1), \XMM0
1499 	mov		keysize,%eax
1500 	shr		$2,%eax			# 128->4, 192->6, 256->8
1501 	add		$5,%eax			# 128->9, 192->11, 256->13
1502 	lea		16(%arg1), %r10	  # get first expanded key address
1503 
1504 _esb_loop_\@:
1505 	MOVADQ		(%r10),\TMP1
1506 	aesenc		\TMP1,\XMM0
1507 	add		$16,%r10
1508 	sub		$1,%eax
1509 	jnz		_esb_loop_\@
1510 
1511 	MOVADQ		(%r10),\TMP1
1512 	aesenclast	\TMP1,\XMM0
1513 .endm
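
/*
* Round-count arithmetic (for reference): keysize holds the AES key length in
* bytes, so keysize>>2 + 5 gives 16/4+5 = 9, 24/4+5 = 11 and 32/4+5 = 13
* aesenc rounds, followed by a single aesenclast -- i.e. the standard 10/12/14
* rounds for 128/192/256-bit keys.
*/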
1514 /*****************************************************************************
1515 * void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1516 *                   struct gcm_context_data *data
1517 *                                      // Context data
1518 *                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
1519 *                   const u8 *in,      // Ciphertext input
1520 *                   u64 plaintext_len, // Length of data in bytes for decryption.
1521 *                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1522 *                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1523 *                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1524 *                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1525 *                   const u8 *aad,     // Additional Authentication Data (AAD)
1526 *                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1527 *                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1528 *                                      // given authentication tag and only return the plaintext if they match.
1529 *                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1530 *                                      // (most likely), 12 or 8.
1531 *
1532 * Assumptions:
1533 *
1534 * keys:
1535 *       keys are pre-expanded and aligned to 16 bytes. we are using the first
1536 *       set of 11 keys in the data structure void *aes_ctx
1537 *
1538 * iv:
1539 *       0                   1                   2                   3
1540 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1541 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1542 *       |                             Salt  (From the SA)               |
1543 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1544 *       |                     Initialization Vector                     |
1545 *       |         (This is the sequence number from IPSec header)       |
1546 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1547 *       |                              0x1                              |
1548 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1549 *
1550 *
1551 *
1552 * AAD:
1553 *       AAD padded to 128 bits with 0
1554 *       for example, assume AAD is a u32 vector
1555 *
1556 *       if AAD is 8 bytes:
1557 *       AAD[3] = {A0, A1};
1558 *       padded AAD in xmm register = {A1 A0 0 0}
1559 *
1560 *       0                   1                   2                   3
1561 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1562 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1563 *       |                               SPI (A1)                        |
1564 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1565 *       |                     32-bit Sequence Number (A0)               |
1566 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1567 *       |                              0x0                              |
1568 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1569 *
1570 *                                       AAD Format with 32-bit Sequence Number
1571 *
1572 *       if AAD is 12 bytes:
1573 *       AAD[3] = {A0, A1, A2};
1574 *       padded AAD in xmm register = {A2 A1 A0 0}
1575 *
1576 *       0                   1                   2                   3
1577 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1578 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1579 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1580 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1581 *       |                               SPI (A2)                        |
1582 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1583 *       |                 64-bit Extended Sequence Number {A1,A0}       |
1584 *       |                                                               |
1585 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1586 *       |                              0x0                              |
1587 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1588 *
1589 *                        AAD Format with 64-bit Extended Sequence Number
1590 *
1591 * poly = x^128 + x^127 + x^126 + x^121 + 1
1592 *
1593 *****************************************************************************/
1594 SYM_FUNC_START(aesni_gcm_dec)
1595 	FUNC_SAVE
1596 
1597 	GCM_INIT %arg6, arg7, arg8, arg9
1598 	GCM_ENC_DEC dec
1599 	GCM_COMPLETE arg10, arg11
1600 	FUNC_RESTORE
1601 	RET
1602 SYM_FUNC_END(aesni_gcm_dec)
1603 
1604 
1605 /*****************************************************************************
1606 * void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1607 *                    struct gcm_context_data *data
1608 *                                        // Context data
1609 *                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1610 *                    const u8 *in,       // Plaintext input
1611 *                    u64 plaintext_len,  // Length of data in bytes for encryption.
1612 *                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1613 *                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1614 *                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1615 *                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1616 *                    const u8 *aad,      // Additional Authentication Data (AAD)
1617 *                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1618 *                    u8 *auth_tag,       // Authenticated Tag output.
1619 *                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1620 *                                        // 12 or 8.
1621 *
1622 * Assumptions:
1623 *
1624 * keys:
1625 *       keys are pre-expanded and aligned to 16 bytes. we are using the
1626 *       first set of 11 keys in the data structure void *aes_ctx
1627 *
1628 *
1629 * iv:
1630 *       0                   1                   2                   3
1631 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1632 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1633 *       |                             Salt  (From the SA)               |
1634 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1635 *       |                     Initialization Vector                     |
1636 *       |         (This is the sequence number from IPSec header)       |
1637 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1638 *       |                              0x1                              |
1639 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1640 *
1641 *
1642 *
1643 * AAD:
1644 *       AAD padded to 128 bits with 0
1645 *       for example, assume AAD is a u32 vector
1646 *
1647 *       if AAD is 8 bytes:
*       AAD[2] = {A0, A1};
1649 *       padded AAD in xmm register = {A1 A0 0 0}
1650 *
1651 *       0                   1                   2                   3
1652 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1653 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1654 *       |                               SPI (A1)                        |
1655 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1656 *       |                     32-bit Sequence Number (A0)               |
1657 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1658 *       |                              0x0                              |
1659 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1660 *
1661 *                                 AAD Format with 32-bit Sequence Number
1662 *
1663 *       if AAD is 12 bytes:
1664 *       AAD[3] = {A0, A1, A2};
1665 *       padded AAD in xmm register = {A2 A1 A0 0}
1666 *
1667 *       0                   1                   2                   3
1668 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1669 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1670 *       |                               SPI (A2)                        |
1671 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1672 *       |                 64-bit Extended Sequence Number {A1,A0}       |
1673 *       |                                                               |
1674 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1675 *       |                              0x0                              |
1676 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1677 *
1678 *                         AAD Format with 64-bit Extended Sequence Number
1679 *
1680 * poly = x^128 + x^127 + x^126 + x^121 + 1
1681 ***************************************************************************/
1682 SYM_FUNC_START(aesni_gcm_enc)
1683 	FUNC_SAVE
1684 
1685 	GCM_INIT %arg6, arg7, arg8, arg9
1686 	GCM_ENC_DEC enc
1687 
1688 	GCM_COMPLETE arg10, arg11
1689 	FUNC_RESTORE
1690 	RET
1691 SYM_FUNC_END(aesni_gcm_enc)
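
/*
 * A rough C sketch of how the 16-byte pre-counter block j0 passed via
 * the iv argument above is laid out: the 4-byte salt from the SA,
 * followed by the 8-byte per-packet IV from the ESP payload, followed
 * by the big-endian constant 0x00000001.  The helper name and types
 * are illustrative only:
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void build_j0(uint8_t j0[16], const uint8_t salt[4],
 *			     const uint8_t iv[8])
 *	{
 *		memcpy(j0, salt, 4);		// salt from the SA
 *		memcpy(j0 + 4, iv, 8);		// IV from the ESP payload
 *		j0[12] = 0x00;			// 0x00000001, big endian
 *		j0[13] = 0x00;
 *		j0[14] = 0x00;
 *		j0[15] = 0x01;
 *	}
 */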
1692 
1693 /*****************************************************************************
1694 * void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1695 *                     struct gcm_context_data *data,
1696 *                                         // context data
1697 *                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1698 *                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1699 *                                         // concatenated with 0x00000001. 16-byte aligned pointer.
1700 *                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1701 *                     const u8 *aad,      // Additional Authentication Data (AAD)
1702 *                     u64 aad_len)        // Length of AAD in bytes.
1703 */
1704 SYM_FUNC_START(aesni_gcm_init)
1705 	FUNC_SAVE
	GCM_INIT %arg3, %arg4, %arg5, %arg6
1707 	FUNC_RESTORE
1708 	RET
1709 SYM_FUNC_END(aesni_gcm_init)
1710 
1711 /*****************************************************************************
1712 * void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1713 *                    struct gcm_context_data *data,
1714 *                                        // context data
1715 *                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1716 *                    const u8 *in,       // Plaintext input
*                    u64 plaintext_len); // Length of data in bytes for encryption.
1718 */
1719 SYM_FUNC_START(aesni_gcm_enc_update)
1720 	FUNC_SAVE
1721 	GCM_ENC_DEC enc
1722 	FUNC_RESTORE
1723 	RET
1724 SYM_FUNC_END(aesni_gcm_enc_update)
1725 
1726 /*****************************************************************************
1727 * void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1728 *                    struct gcm_context_data *data,
1729 *                                        // context data
*                    u8 *out,            // Plaintext output. Decrypt in-place is allowed.
*                    const u8 *in,       // Ciphertext input
*                    u64 plaintext_len); // Length of data in bytes for decryption.
1733 */
1734 SYM_FUNC_START(aesni_gcm_dec_update)
1735 	FUNC_SAVE
1736 	GCM_ENC_DEC dec
1737 	FUNC_RESTORE
1738 	RET
1739 SYM_FUNC_END(aesni_gcm_dec_update)
1740 
1741 /*****************************************************************************
1742 * void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1743 *                    struct gcm_context_data *data,
1744 *                                        // context data
1745 *                    u8 *auth_tag,       // Authenticated Tag output.
1746 *                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1747 *                                        // 12 or 8.
1748 */
1749 SYM_FUNC_START(aesni_gcm_finalize)
1750 	FUNC_SAVE
1751 	GCM_COMPLETE %arg3 %arg4
1752 	FUNC_RESTORE
1753 	RET
1754 SYM_FUNC_END(aesni_gcm_finalize)
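
/*
 * A rough sketch of the intended calling sequence for the three-part
 * interface above (illustrative only; the loop condition and chunk
 * bookkeeping are hypothetical, the prototypes are the ones documented
 * above):
 *
 *	struct gcm_context_data data;
 *	u8 tag[16];
 *
 *	aesni_gcm_init(aes_ctx, &data, iv, hash_subkey, aad, aad_len);
 *	while (have_more_data())	// hypothetical: one pass per chunk
 *		aesni_gcm_enc_update(aes_ctx, &data, out, in, chunk_len);
 *	aesni_gcm_finalize(aes_ctx, &data, tag, sizeof(tag));
 */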
1755 
1756 #endif
1757 
1758 
1759 SYM_FUNC_START_LOCAL_ALIAS(_key_expansion_128)
1760 SYM_FUNC_START_LOCAL(_key_expansion_256a)
1761 	pshufd $0b11111111, %xmm1, %xmm1
1762 	shufps $0b00010000, %xmm0, %xmm4
1763 	pxor %xmm4, %xmm0
1764 	shufps $0b10001100, %xmm0, %xmm4
1765 	pxor %xmm4, %xmm0
1766 	pxor %xmm1, %xmm0
1767 	movaps %xmm0, (TKEYP)
1768 	add $0x10, TKEYP
1769 	RET
1770 SYM_FUNC_END(_key_expansion_256a)
1771 SYM_FUNC_END_ALIAS(_key_expansion_128)
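
/*
 * What the shufps/pxor sequence above computes, as a rough C sketch
 * (illustrative only).  For _key_expansion_128, AESKEYGENASSIST
 * supplies t = SubWord(RotWord(w[3])) ^ Rcon in the dword that the
 * pshufd broadcasts, and the next round key is a running XOR across
 * the previous one (the 192/256-bit helpers differ in which dword of
 * the assist result they pick and in how the extra key words are
 * folded in):
 *
 *	static void expand_key_128_round(u32 w[4], u32 t)
 *	{
 *		w[0] ^= t;	// n0 = w0 ^ t
 *		w[1] ^= w[0];	// n1 = n0 ^ w1
 *		w[2] ^= w[1];	// n2 = n1 ^ w2
 *		w[3] ^= w[2];	// n3 = n2 ^ w3
 *	}
 */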
1772 
1773 SYM_FUNC_START_LOCAL(_key_expansion_192a)
1774 	pshufd $0b01010101, %xmm1, %xmm1
1775 	shufps $0b00010000, %xmm0, %xmm4
1776 	pxor %xmm4, %xmm0
1777 	shufps $0b10001100, %xmm0, %xmm4
1778 	pxor %xmm4, %xmm0
1779 	pxor %xmm1, %xmm0
1780 
1781 	movaps %xmm2, %xmm5
1782 	movaps %xmm2, %xmm6
1783 	pslldq $4, %xmm5
1784 	pshufd $0b11111111, %xmm0, %xmm3
1785 	pxor %xmm3, %xmm2
1786 	pxor %xmm5, %xmm2
1787 
1788 	movaps %xmm0, %xmm1
1789 	shufps $0b01000100, %xmm0, %xmm6
1790 	movaps %xmm6, (TKEYP)
1791 	shufps $0b01001110, %xmm2, %xmm1
1792 	movaps %xmm1, 0x10(TKEYP)
1793 	add $0x20, TKEYP
1794 	RET
1795 SYM_FUNC_END(_key_expansion_192a)
1796 
1797 SYM_FUNC_START_LOCAL(_key_expansion_192b)
1798 	pshufd $0b01010101, %xmm1, %xmm1
1799 	shufps $0b00010000, %xmm0, %xmm4
1800 	pxor %xmm4, %xmm0
1801 	shufps $0b10001100, %xmm0, %xmm4
1802 	pxor %xmm4, %xmm0
1803 	pxor %xmm1, %xmm0
1804 
1805 	movaps %xmm2, %xmm5
1806 	pslldq $4, %xmm5
1807 	pshufd $0b11111111, %xmm0, %xmm3
1808 	pxor %xmm3, %xmm2
1809 	pxor %xmm5, %xmm2
1810 
1811 	movaps %xmm0, (TKEYP)
1812 	add $0x10, TKEYP
1813 	RET
1814 SYM_FUNC_END(_key_expansion_192b)
1815 
1816 SYM_FUNC_START_LOCAL(_key_expansion_256b)
1817 	pshufd $0b10101010, %xmm1, %xmm1
1818 	shufps $0b00010000, %xmm2, %xmm4
1819 	pxor %xmm4, %xmm2
1820 	shufps $0b10001100, %xmm2, %xmm4
1821 	pxor %xmm4, %xmm2
1822 	pxor %xmm1, %xmm2
1823 	movaps %xmm2, (TKEYP)
1824 	add $0x10, TKEYP
1825 	RET
1826 SYM_FUNC_END(_key_expansion_256b)
1827 
1828 /*
1829  * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1830  *                   unsigned int key_len)
1831  */
1832 SYM_FUNC_START(aesni_set_key)
1833 	FRAME_BEGIN
1834 #ifndef __x86_64__
1835 	pushl KEYP
1836 	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
1837 	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
1838 	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
1839 #endif
1840 	movups (UKEYP), %xmm0		# user key (first 16 bytes)
1841 	movaps %xmm0, (KEYP)
1842 	lea 0x10(KEYP), TKEYP		# key addr
1843 	movl %edx, 480(KEYP)
1844 	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
1845 	cmp $24, %dl
1846 	jb .Lenc_key128
1847 	je .Lenc_key192
1848 	movups 0x10(UKEYP), %xmm2	# other user key
1849 	movaps %xmm2, (TKEYP)
1850 	add $0x10, TKEYP
1851 	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
1852 	call _key_expansion_256a
1853 	aeskeygenassist $0x1, %xmm0, %xmm1
1854 	call _key_expansion_256b
1855 	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
1856 	call _key_expansion_256a
1857 	aeskeygenassist $0x2, %xmm0, %xmm1
1858 	call _key_expansion_256b
1859 	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
1860 	call _key_expansion_256a
1861 	aeskeygenassist $0x4, %xmm0, %xmm1
1862 	call _key_expansion_256b
1863 	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
1864 	call _key_expansion_256a
1865 	aeskeygenassist $0x8, %xmm0, %xmm1
1866 	call _key_expansion_256b
1867 	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
1868 	call _key_expansion_256a
1869 	aeskeygenassist $0x10, %xmm0, %xmm1
1870 	call _key_expansion_256b
1871 	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
1872 	call _key_expansion_256a
1873 	aeskeygenassist $0x20, %xmm0, %xmm1
1874 	call _key_expansion_256b
1875 	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
1876 	call _key_expansion_256a
1877 	jmp .Ldec_key
1878 .Lenc_key192:
1879 	movq 0x10(UKEYP), %xmm2		# other user key
1880 	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
1881 	call _key_expansion_192a
1882 	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
1883 	call _key_expansion_192b
1884 	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
1885 	call _key_expansion_192a
1886 	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
1887 	call _key_expansion_192b
1888 	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
1889 	call _key_expansion_192a
1890 	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
1891 	call _key_expansion_192b
1892 	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
1893 	call _key_expansion_192a
1894 	aeskeygenassist $0x80, %xmm2, %xmm1	# round 8
1895 	call _key_expansion_192b
1896 	jmp .Ldec_key
1897 .Lenc_key128:
1898 	aeskeygenassist $0x1, %xmm0, %xmm1	# round 1
1899 	call _key_expansion_128
1900 	aeskeygenassist $0x2, %xmm0, %xmm1	# round 2
1901 	call _key_expansion_128
1902 	aeskeygenassist $0x4, %xmm0, %xmm1	# round 3
1903 	call _key_expansion_128
1904 	aeskeygenassist $0x8, %xmm0, %xmm1	# round 4
1905 	call _key_expansion_128
1906 	aeskeygenassist $0x10, %xmm0, %xmm1	# round 5
1907 	call _key_expansion_128
1908 	aeskeygenassist $0x20, %xmm0, %xmm1	# round 6
1909 	call _key_expansion_128
1910 	aeskeygenassist $0x40, %xmm0, %xmm1	# round 7
1911 	call _key_expansion_128
1912 	aeskeygenassist $0x80, %xmm0, %xmm1	# round 8
1913 	call _key_expansion_128
1914 	aeskeygenassist $0x1b, %xmm0, %xmm1	# round 9
1915 	call _key_expansion_128
1916 	aeskeygenassist $0x36, %xmm0, %xmm1	# round 10
1917 	call _key_expansion_128
1918 .Ldec_key:
1919 	sub $0x10, TKEYP
1920 	movaps (KEYP), %xmm0
1921 	movaps (TKEYP), %xmm1
1922 	movaps %xmm0, 240(TKEYP)
1923 	movaps %xmm1, 240(KEYP)
1924 	add $0x10, KEYP
1925 	lea 240-16(TKEYP), UKEYP
1926 .align 4
1927 .Ldec_key_loop:
1928 	movaps (KEYP), %xmm0
1929 	aesimc %xmm0, %xmm1
1930 	movaps %xmm1, (UKEYP)
1931 	add $0x10, KEYP
1932 	sub $0x10, UKEYP
1933 	cmp TKEYP, KEYP
1934 	jb .Ldec_key_loop
1935 	xor AREG, AREG
1936 #ifndef __x86_64__
1937 	popl KEYP
1938 #endif
1939 	FRAME_END
1940 	RET
1941 SYM_FUNC_END(aesni_set_key)
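
/*
 * A rough sketch, using C intrinsics, of what the .Ldec_key tail above
 * produces (illustrative only; the fixed 10-round count is for a
 * 128-bit key, the real loop handles 10/12/14 rounds the same way).
 * The decryption schedule stored from byte offset 240 onwards is the
 * encryption schedule in reverse order, with AESIMC applied to every
 * round key except the first and last:
 *
 *	#include <wmmintrin.h>
 *
 *	static void make_dec_schedule(__m128i dec[11], const __m128i enc[11])
 *	{
 *		int i;
 *
 *		dec[0]  = enc[10];	// last enc round key comes first
 *		dec[10] = enc[0];	// original key comes last
 *		for (i = 1; i < 10; i++)
 *			dec[i] = _mm_aesimc_si128(enc[10 - i]);
 *	}
 */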
1942 
1943 /*
1944  * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
1945  */
1946 SYM_FUNC_START(aesni_enc)
1947 	FRAME_BEGIN
1948 #ifndef __x86_64__
1949 	pushl KEYP
1950 	pushl KLEN
1951 	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
1952 	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
1953 	movl (FRAME_OFFSET+20)(%esp), INP	# src
1954 #endif
1955 	movl 480(KEYP), KLEN		# key length
1956 	movups (INP), STATE		# input
1957 	call _aesni_enc1
1958 	movups STATE, (OUTP)		# output
1959 #ifndef __x86_64__
1960 	popl KLEN
1961 	popl KEYP
1962 #endif
1963 	FRAME_END
1964 	RET
1965 SYM_FUNC_END(aesni_enc)
1966 
1967 /*
1968  * _aesni_enc1:		internal ABI
1969  * input:
1970  *	KEYP:		key struct pointer
 *	KLEN:		key length
1972  *	STATE:		initial state (input)
1973  * output:
 *	STATE:		final state (output)
1975  * changed:
1976  *	KEY
1977  *	TKEYP (T1)
1978  */
1979 SYM_FUNC_START_LOCAL(_aesni_enc1)
1980 	movaps (KEYP), KEY		# key
1981 	mov KEYP, TKEYP
1982 	pxor KEY, STATE		# round 0
1983 	add $0x30, TKEYP
1984 	cmp $24, KLEN
1985 	jb .Lenc128
1986 	lea 0x20(TKEYP), TKEYP
1987 	je .Lenc192
1988 	add $0x20, TKEYP
1989 	movaps -0x60(TKEYP), KEY
1990 	aesenc KEY, STATE
1991 	movaps -0x50(TKEYP), KEY
1992 	aesenc KEY, STATE
1993 .align 4
1994 .Lenc192:
1995 	movaps -0x40(TKEYP), KEY
1996 	aesenc KEY, STATE
1997 	movaps -0x30(TKEYP), KEY
1998 	aesenc KEY, STATE
1999 .align 4
2000 .Lenc128:
2001 	movaps -0x20(TKEYP), KEY
2002 	aesenc KEY, STATE
2003 	movaps -0x10(TKEYP), KEY
2004 	aesenc KEY, STATE
2005 	movaps (TKEYP), KEY
2006 	aesenc KEY, STATE
2007 	movaps 0x10(TKEYP), KEY
2008 	aesenc KEY, STATE
2009 	movaps 0x20(TKEYP), KEY
2010 	aesenc KEY, STATE
2011 	movaps 0x30(TKEYP), KEY
2012 	aesenc KEY, STATE
2013 	movaps 0x40(TKEYP), KEY
2014 	aesenc KEY, STATE
2015 	movaps 0x50(TKEYP), KEY
2016 	aesenc KEY, STATE
2017 	movaps 0x60(TKEYP), KEY
2018 	aesenc KEY, STATE
2019 	movaps 0x70(TKEYP), KEY
2020 	aesenclast KEY, STATE
2021 	RET
2022 SYM_FUNC_END(_aesni_enc1)
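
/*
 * For reference, a rough C intrinsics equivalent of _aesni_enc1 above
 * (illustrative only).  nrounds is 10, 12 or 14 for 128/192/256-bit
 * keys; the asm derives it from the key length stored at offset 480 of
 * the context, which is why it branches on "cmp $24, KLEN":
 *
 *	#include <wmmintrin.h>
 *
 *	static __m128i aes_encrypt_block(__m128i state, const __m128i *rk,
 *					 int nrounds)
 *	{
 *		int i;
 *
 *		state = _mm_xor_si128(state, rk[0]);		// round 0
 *		for (i = 1; i < nrounds; i++)
 *			state = _mm_aesenc_si128(state, rk[i]);
 *		return _mm_aesenclast_si128(state, rk[nrounds]);
 *	}
 */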
2023 
2024 /*
2025  * _aesni_enc4:	internal ABI
2026  * input:
2027  *	KEYP:		key struct pointer
 *	KLEN:		key length
2029  *	STATE1:		initial state (input)
2030  *	STATE2
2031  *	STATE3
2032  *	STATE4
2033  * output:
 *	STATE1:		final state (output)
2035  *	STATE2
2036  *	STATE3
2037  *	STATE4
2038  * changed:
2039  *	KEY
2040  *	TKEYP (T1)
2041  */
2042 SYM_FUNC_START_LOCAL(_aesni_enc4)
2043 	movaps (KEYP), KEY		# key
2044 	mov KEYP, TKEYP
2045 	pxor KEY, STATE1		# round 0
2046 	pxor KEY, STATE2
2047 	pxor KEY, STATE3
2048 	pxor KEY, STATE4
2049 	add $0x30, TKEYP
2050 	cmp $24, KLEN
2051 	jb .L4enc128
2052 	lea 0x20(TKEYP), TKEYP
2053 	je .L4enc192
2054 	add $0x20, TKEYP
2055 	movaps -0x60(TKEYP), KEY
2056 	aesenc KEY, STATE1
2057 	aesenc KEY, STATE2
2058 	aesenc KEY, STATE3
2059 	aesenc KEY, STATE4
2060 	movaps -0x50(TKEYP), KEY
2061 	aesenc KEY, STATE1
2062 	aesenc KEY, STATE2
2063 	aesenc KEY, STATE3
2064 	aesenc KEY, STATE4
2065 #.align 4
2066 .L4enc192:
2067 	movaps -0x40(TKEYP), KEY
2068 	aesenc KEY, STATE1
2069 	aesenc KEY, STATE2
2070 	aesenc KEY, STATE3
2071 	aesenc KEY, STATE4
2072 	movaps -0x30(TKEYP), KEY
2073 	aesenc KEY, STATE1
2074 	aesenc KEY, STATE2
2075 	aesenc KEY, STATE3
2076 	aesenc KEY, STATE4
2077 #.align 4
2078 .L4enc128:
2079 	movaps -0x20(TKEYP), KEY
2080 	aesenc KEY, STATE1
2081 	aesenc KEY, STATE2
2082 	aesenc KEY, STATE3
2083 	aesenc KEY, STATE4
2084 	movaps -0x10(TKEYP), KEY
2085 	aesenc KEY, STATE1
2086 	aesenc KEY, STATE2
2087 	aesenc KEY, STATE3
2088 	aesenc KEY, STATE4
2089 	movaps (TKEYP), KEY
2090 	aesenc KEY, STATE1
2091 	aesenc KEY, STATE2
2092 	aesenc KEY, STATE3
2093 	aesenc KEY, STATE4
2094 	movaps 0x10(TKEYP), KEY
2095 	aesenc KEY, STATE1
2096 	aesenc KEY, STATE2
2097 	aesenc KEY, STATE3
2098 	aesenc KEY, STATE4
2099 	movaps 0x20(TKEYP), KEY
2100 	aesenc KEY, STATE1
2101 	aesenc KEY, STATE2
2102 	aesenc KEY, STATE3
2103 	aesenc KEY, STATE4
2104 	movaps 0x30(TKEYP), KEY
2105 	aesenc KEY, STATE1
2106 	aesenc KEY, STATE2
2107 	aesenc KEY, STATE3
2108 	aesenc KEY, STATE4
2109 	movaps 0x40(TKEYP), KEY
2110 	aesenc KEY, STATE1
2111 	aesenc KEY, STATE2
2112 	aesenc KEY, STATE3
2113 	aesenc KEY, STATE4
2114 	movaps 0x50(TKEYP), KEY
2115 	aesenc KEY, STATE1
2116 	aesenc KEY, STATE2
2117 	aesenc KEY, STATE3
2118 	aesenc KEY, STATE4
2119 	movaps 0x60(TKEYP), KEY
2120 	aesenc KEY, STATE1
2121 	aesenc KEY, STATE2
2122 	aesenc KEY, STATE3
2123 	aesenc KEY, STATE4
2124 	movaps 0x70(TKEYP), KEY
2125 	aesenclast KEY, STATE1		# last round
2126 	aesenclast KEY, STATE2
2127 	aesenclast KEY, STATE3
2128 	aesenclast KEY, STATE4
2129 	RET
2130 SYM_FUNC_END(_aesni_enc4)
2131 
2132 /*
 * void aesni_dec(const void *ctx, u8 *dst, const u8 *src)
2134  */
2135 SYM_FUNC_START(aesni_dec)
2136 	FRAME_BEGIN
2137 #ifndef __x86_64__
2138 	pushl KEYP
2139 	pushl KLEN
2140 	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
2141 	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
2142 	movl (FRAME_OFFSET+20)(%esp), INP	# src
2143 #endif
2144 	mov 480(KEYP), KLEN		# key length
2145 	add $240, KEYP
2146 	movups (INP), STATE		# input
2147 	call _aesni_dec1
2148 	movups STATE, (OUTP)		#output
2149 #ifndef __x86_64__
2150 	popl KLEN
2151 	popl KEYP
2152 #endif
2153 	FRAME_END
2154 	RET
2155 SYM_FUNC_END(aesni_dec)
2156 
2157 /*
2158  * _aesni_dec1:		internal ABI
2159  * input:
2160  *	KEYP:		key struct pointer
2161  *	KLEN:		key length
2162  *	STATE:		initial state (input)
2163  * output:
 *	STATE:		final state (output)
2165  * changed:
2166  *	KEY
2167  *	TKEYP (T1)
2168  */
2169 SYM_FUNC_START_LOCAL(_aesni_dec1)
2170 	movaps (KEYP), KEY		# key
2171 	mov KEYP, TKEYP
2172 	pxor KEY, STATE		# round 0
2173 	add $0x30, TKEYP
2174 	cmp $24, KLEN
2175 	jb .Ldec128
2176 	lea 0x20(TKEYP), TKEYP
2177 	je .Ldec192
2178 	add $0x20, TKEYP
2179 	movaps -0x60(TKEYP), KEY
2180 	aesdec KEY, STATE
2181 	movaps -0x50(TKEYP), KEY
2182 	aesdec KEY, STATE
2183 .align 4
2184 .Ldec192:
2185 	movaps -0x40(TKEYP), KEY
2186 	aesdec KEY, STATE
2187 	movaps -0x30(TKEYP), KEY
2188 	aesdec KEY, STATE
2189 .align 4
2190 .Ldec128:
2191 	movaps -0x20(TKEYP), KEY
2192 	aesdec KEY, STATE
2193 	movaps -0x10(TKEYP), KEY
2194 	aesdec KEY, STATE
2195 	movaps (TKEYP), KEY
2196 	aesdec KEY, STATE
2197 	movaps 0x10(TKEYP), KEY
2198 	aesdec KEY, STATE
2199 	movaps 0x20(TKEYP), KEY
2200 	aesdec KEY, STATE
2201 	movaps 0x30(TKEYP), KEY
2202 	aesdec KEY, STATE
2203 	movaps 0x40(TKEYP), KEY
2204 	aesdec KEY, STATE
2205 	movaps 0x50(TKEYP), KEY
2206 	aesdec KEY, STATE
2207 	movaps 0x60(TKEYP), KEY
2208 	aesdec KEY, STATE
2209 	movaps 0x70(TKEYP), KEY
2210 	aesdeclast KEY, STATE
2211 	RET
2212 SYM_FUNC_END(_aesni_dec1)
2213 
2214 /*
2215  * _aesni_dec4:	internal ABI
2216  * input:
2217  *	KEYP:		key struct pointer
2218  *	KLEN:		key length
2219  *	STATE1:		initial state (input)
2220  *	STATE2
2221  *	STATE3
2222  *	STATE4
2223  * output:
 *	STATE1:		final state (output)
2225  *	STATE2
2226  *	STATE3
2227  *	STATE4
2228  * changed:
2229  *	KEY
2230  *	TKEYP (T1)
2231  */
2232 SYM_FUNC_START_LOCAL(_aesni_dec4)
2233 	movaps (KEYP), KEY		# key
2234 	mov KEYP, TKEYP
2235 	pxor KEY, STATE1		# round 0
2236 	pxor KEY, STATE2
2237 	pxor KEY, STATE3
2238 	pxor KEY, STATE4
2239 	add $0x30, TKEYP
2240 	cmp $24, KLEN
2241 	jb .L4dec128
2242 	lea 0x20(TKEYP), TKEYP
2243 	je .L4dec192
2244 	add $0x20, TKEYP
2245 	movaps -0x60(TKEYP), KEY
2246 	aesdec KEY, STATE1
2247 	aesdec KEY, STATE2
2248 	aesdec KEY, STATE3
2249 	aesdec KEY, STATE4
2250 	movaps -0x50(TKEYP), KEY
2251 	aesdec KEY, STATE1
2252 	aesdec KEY, STATE2
2253 	aesdec KEY, STATE3
2254 	aesdec KEY, STATE4
2255 .align 4
2256 .L4dec192:
2257 	movaps -0x40(TKEYP), KEY
2258 	aesdec KEY, STATE1
2259 	aesdec KEY, STATE2
2260 	aesdec KEY, STATE3
2261 	aesdec KEY, STATE4
2262 	movaps -0x30(TKEYP), KEY
2263 	aesdec KEY, STATE1
2264 	aesdec KEY, STATE2
2265 	aesdec KEY, STATE3
2266 	aesdec KEY, STATE4
2267 .align 4
2268 .L4dec128:
2269 	movaps -0x20(TKEYP), KEY
2270 	aesdec KEY, STATE1
2271 	aesdec KEY, STATE2
2272 	aesdec KEY, STATE3
2273 	aesdec KEY, STATE4
2274 	movaps -0x10(TKEYP), KEY
2275 	aesdec KEY, STATE1
2276 	aesdec KEY, STATE2
2277 	aesdec KEY, STATE3
2278 	aesdec KEY, STATE4
2279 	movaps (TKEYP), KEY
2280 	aesdec KEY, STATE1
2281 	aesdec KEY, STATE2
2282 	aesdec KEY, STATE3
2283 	aesdec KEY, STATE4
2284 	movaps 0x10(TKEYP), KEY
2285 	aesdec KEY, STATE1
2286 	aesdec KEY, STATE2
2287 	aesdec KEY, STATE3
2288 	aesdec KEY, STATE4
2289 	movaps 0x20(TKEYP), KEY
2290 	aesdec KEY, STATE1
2291 	aesdec KEY, STATE2
2292 	aesdec KEY, STATE3
2293 	aesdec KEY, STATE4
2294 	movaps 0x30(TKEYP), KEY
2295 	aesdec KEY, STATE1
2296 	aesdec KEY, STATE2
2297 	aesdec KEY, STATE3
2298 	aesdec KEY, STATE4
2299 	movaps 0x40(TKEYP), KEY
2300 	aesdec KEY, STATE1
2301 	aesdec KEY, STATE2
2302 	aesdec KEY, STATE3
2303 	aesdec KEY, STATE4
2304 	movaps 0x50(TKEYP), KEY
2305 	aesdec KEY, STATE1
2306 	aesdec KEY, STATE2
2307 	aesdec KEY, STATE3
2308 	aesdec KEY, STATE4
2309 	movaps 0x60(TKEYP), KEY
2310 	aesdec KEY, STATE1
2311 	aesdec KEY, STATE2
2312 	aesdec KEY, STATE3
2313 	aesdec KEY, STATE4
2314 	movaps 0x70(TKEYP), KEY
2315 	aesdeclast KEY, STATE1		# last round
2316 	aesdeclast KEY, STATE2
2317 	aesdeclast KEY, STATE3
2318 	aesdeclast KEY, STATE4
2319 	RET
2320 SYM_FUNC_END(_aesni_dec4)
2321 
2322 /*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2324  *		      size_t len)
2325  */
2326 SYM_FUNC_START(aesni_ecb_enc)
2327 	FRAME_BEGIN
2328 #ifndef __x86_64__
2329 	pushl LEN
2330 	pushl KEYP
2331 	pushl KLEN
2332 	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
2333 	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
2334 	movl (FRAME_OFFSET+24)(%esp), INP	# src
2335 	movl (FRAME_OFFSET+28)(%esp), LEN	# len
2336 #endif
2337 	test LEN, LEN		# check length
2338 	jz .Lecb_enc_ret
2339 	mov 480(KEYP), KLEN
2340 	cmp $16, LEN
2341 	jb .Lecb_enc_ret
2342 	cmp $64, LEN
2343 	jb .Lecb_enc_loop1
2344 .align 4
2345 .Lecb_enc_loop4:
2346 	movups (INP), STATE1
2347 	movups 0x10(INP), STATE2
2348 	movups 0x20(INP), STATE3
2349 	movups 0x30(INP), STATE4
2350 	call _aesni_enc4
2351 	movups STATE1, (OUTP)
2352 	movups STATE2, 0x10(OUTP)
2353 	movups STATE3, 0x20(OUTP)
2354 	movups STATE4, 0x30(OUTP)
2355 	sub $64, LEN
2356 	add $64, INP
2357 	add $64, OUTP
2358 	cmp $64, LEN
2359 	jge .Lecb_enc_loop4
2360 	cmp $16, LEN
2361 	jb .Lecb_enc_ret
2362 .align 4
2363 .Lecb_enc_loop1:
2364 	movups (INP), STATE1
2365 	call _aesni_enc1
2366 	movups STATE1, (OUTP)
2367 	sub $16, LEN
2368 	add $16, INP
2369 	add $16, OUTP
2370 	cmp $16, LEN
2371 	jge .Lecb_enc_loop1
2372 .Lecb_enc_ret:
2373 #ifndef __x86_64__
2374 	popl KLEN
2375 	popl KEYP
2376 	popl LEN
2377 #endif
2378 	FRAME_END
2379 	RET
2380 SYM_FUNC_END(aesni_ecb_enc)
2381 
2382 /*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2384  *		      size_t len);
2385  */
2386 SYM_FUNC_START(aesni_ecb_dec)
2387 	FRAME_BEGIN
2388 #ifndef __x86_64__
2389 	pushl LEN
2390 	pushl KEYP
2391 	pushl KLEN
2392 	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
2393 	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
2394 	movl (FRAME_OFFSET+24)(%esp), INP	# src
2395 	movl (FRAME_OFFSET+28)(%esp), LEN	# len
2396 #endif
2397 	test LEN, LEN
2398 	jz .Lecb_dec_ret
2399 	mov 480(KEYP), KLEN
2400 	add $240, KEYP
2401 	cmp $16, LEN
2402 	jb .Lecb_dec_ret
2403 	cmp $64, LEN
2404 	jb .Lecb_dec_loop1
2405 .align 4
2406 .Lecb_dec_loop4:
2407 	movups (INP), STATE1
2408 	movups 0x10(INP), STATE2
2409 	movups 0x20(INP), STATE3
2410 	movups 0x30(INP), STATE4
2411 	call _aesni_dec4
2412 	movups STATE1, (OUTP)
2413 	movups STATE2, 0x10(OUTP)
2414 	movups STATE3, 0x20(OUTP)
2415 	movups STATE4, 0x30(OUTP)
2416 	sub $64, LEN
2417 	add $64, INP
2418 	add $64, OUTP
2419 	cmp $64, LEN
2420 	jge .Lecb_dec_loop4
2421 	cmp $16, LEN
2422 	jb .Lecb_dec_ret
2423 .align 4
2424 .Lecb_dec_loop1:
2425 	movups (INP), STATE1
2426 	call _aesni_dec1
2427 	movups STATE1, (OUTP)
2428 	sub $16, LEN
2429 	add $16, INP
2430 	add $16, OUTP
2431 	cmp $16, LEN
2432 	jge .Lecb_dec_loop1
2433 .Lecb_dec_ret:
2434 #ifndef __x86_64__
2435 	popl KLEN
2436 	popl KEYP
2437 	popl LEN
2438 #endif
2439 	FRAME_END
2440 	RET
2441 SYM_FUNC_END(aesni_ecb_dec)
2442 
2443 /*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2445  *		      size_t len, u8 *iv)
2446  */
2447 SYM_FUNC_START(aesni_cbc_enc)
2448 	FRAME_BEGIN
2449 #ifndef __x86_64__
2450 	pushl IVP
2451 	pushl LEN
2452 	pushl KEYP
2453 	pushl KLEN
2454 	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2455 	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2456 	movl (FRAME_OFFSET+28)(%esp), INP	# src
2457 	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2458 	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2459 #endif
2460 	cmp $16, LEN
2461 	jb .Lcbc_enc_ret
2462 	mov 480(KEYP), KLEN
2463 	movups (IVP), STATE	# load iv as initial state
2464 .align 4
2465 .Lcbc_enc_loop:
2466 	movups (INP), IN	# load input
2467 	pxor IN, STATE
2468 	call _aesni_enc1
2469 	movups STATE, (OUTP)	# store output
2470 	sub $16, LEN
2471 	add $16, INP
2472 	add $16, OUTP
2473 	cmp $16, LEN
2474 	jge .Lcbc_enc_loop
2475 	movups STATE, (IVP)
2476 .Lcbc_enc_ret:
2477 #ifndef __x86_64__
2478 	popl KLEN
2479 	popl KEYP
2480 	popl LEN
2481 	popl IVP
2482 #endif
2483 	FRAME_END
2484 	RET
2485 SYM_FUNC_END(aesni_cbc_enc)
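
/*
 * A rough C sketch of the CBC chaining implemented above (illustrative
 * only; encrypt_block() stands in for _aesni_enc1 and is hypothetical).
 * Each ciphertext block is fed back as the whitening value for the
 * next plaintext block, and the last ciphertext block is written back
 * as the new IV:
 *
 *	u8 state[16];
 *
 *	memcpy(state, iv, 16);
 *	while (len >= 16) {
 *		for (i = 0; i < 16; i++)
 *			state[i] ^= in[i];	// chain plaintext into state
 *		encrypt_block(state);		// state = E_K(state)
 *		memcpy(out, state, 16);
 *		in += 16; out += 16; len -= 16;
 *	}
 *	memcpy(iv, state, 16);			// final block becomes new IV
 */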
2486 
2487 /*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2489  *		      size_t len, u8 *iv)
2490  */
2491 SYM_FUNC_START(aesni_cbc_dec)
2492 	FRAME_BEGIN
2493 #ifndef __x86_64__
2494 	pushl IVP
2495 	pushl LEN
2496 	pushl KEYP
2497 	pushl KLEN
2498 	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2499 	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2500 	movl (FRAME_OFFSET+28)(%esp), INP	# src
2501 	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2502 	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2503 #endif
2504 	cmp $16, LEN
2505 	jb .Lcbc_dec_just_ret
2506 	mov 480(KEYP), KLEN
2507 	add $240, KEYP
2508 	movups (IVP), IV
2509 	cmp $64, LEN
2510 	jb .Lcbc_dec_loop1
2511 .align 4
2512 .Lcbc_dec_loop4:
2513 	movups (INP), IN1
2514 	movaps IN1, STATE1
2515 	movups 0x10(INP), IN2
2516 	movaps IN2, STATE2
2517 #ifdef __x86_64__
2518 	movups 0x20(INP), IN3
2519 	movaps IN3, STATE3
2520 	movups 0x30(INP), IN4
2521 	movaps IN4, STATE4
2522 #else
2523 	movups 0x20(INP), IN1
2524 	movaps IN1, STATE3
2525 	movups 0x30(INP), IN2
2526 	movaps IN2, STATE4
2527 #endif
2528 	call _aesni_dec4
2529 	pxor IV, STATE1
2530 #ifdef __x86_64__
2531 	pxor IN1, STATE2
2532 	pxor IN2, STATE3
2533 	pxor IN3, STATE4
2534 	movaps IN4, IV
2535 #else
2536 	pxor IN1, STATE4
2537 	movaps IN2, IV
2538 	movups (INP), IN1
2539 	pxor IN1, STATE2
2540 	movups 0x10(INP), IN2
2541 	pxor IN2, STATE3
2542 #endif
2543 	movups STATE1, (OUTP)
2544 	movups STATE2, 0x10(OUTP)
2545 	movups STATE3, 0x20(OUTP)
2546 	movups STATE4, 0x30(OUTP)
2547 	sub $64, LEN
2548 	add $64, INP
2549 	add $64, OUTP
2550 	cmp $64, LEN
2551 	jge .Lcbc_dec_loop4
2552 	cmp $16, LEN
2553 	jb .Lcbc_dec_ret
2554 .align 4
2555 .Lcbc_dec_loop1:
2556 	movups (INP), IN
2557 	movaps IN, STATE
2558 	call _aesni_dec1
2559 	pxor IV, STATE
2560 	movups STATE, (OUTP)
2561 	movaps IN, IV
2562 	sub $16, LEN
2563 	add $16, INP
2564 	add $16, OUTP
2565 	cmp $16, LEN
2566 	jge .Lcbc_dec_loop1
2567 .Lcbc_dec_ret:
2568 	movups IV, (IVP)
2569 .Lcbc_dec_just_ret:
2570 #ifndef __x86_64__
2571 	popl KLEN
2572 	popl KEYP
2573 	popl LEN
2574 	popl IVP
2575 #endif
2576 	FRAME_END
2577 	RET
2578 SYM_FUNC_END(aesni_cbc_dec)
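
/*
 * Unlike CBC encryption, CBC decryption has no serial dependency
 * between blocks, which is why the loop above can run four aesdec
 * streams in parallel and only afterwards XOR each result with the
 * preceding ciphertext block (or the IV for the first block).  A rough
 * per-block C sketch (illustrative only; decrypt_block() stands in for
 * _aesni_dec1 and is hypothetical):
 *
 *	// P[i] = D_K(C[i]) ^ C[i-1],  with C[-1] = IV
 *	memcpy(prev_ct, iv, 16);
 *	while (len >= 16) {
 *		memcpy(saved_ct, in, 16);	// keep C[i] for the next block
 *		decrypt_block(state, in);	// state = D_K(C[i])
 *		for (i = 0; i < 16; i++)
 *			state[i] ^= prev_ct[i];
 *		memcpy(out, state, 16);
 *		memcpy(prev_ct, saved_ct, 16);
 *		in += 16; out += 16; len -= 16;
 *	}
 *	memcpy(iv, prev_ct, 16);		// write back for the next call
 */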
2579 
2580 #ifdef __x86_64__
2581 .pushsection .rodata
2582 .align 16
2583 .Lbswap_mask:
2584 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2585 .popsection
2586 
2587 /*
2588  * _aesni_inc_init:	internal ABI
 *	set up registers used by _aesni_inc
2590  * input:
2591  *	IV
2592  * output:
2593  *	CTR:	== IV, in little endian
2594  *	TCTR_LOW: == lower qword of CTR
2595  *	INC:	== 1, in little endian
2596  *	BSWAP_MASK == endian swapping mask
2597  */
2598 SYM_FUNC_START_LOCAL(_aesni_inc_init)
2599 	movaps .Lbswap_mask, BSWAP_MASK
2600 	movaps IV, CTR
2601 	pshufb BSWAP_MASK, CTR
2602 	mov $1, TCTR_LOW
2603 	movq TCTR_LOW, INC
2604 	movq CTR, TCTR_LOW
2605 	RET
2606 SYM_FUNC_END(_aesni_inc_init)
2607 
2608 /*
2609  * _aesni_inc:		internal ABI
 *	Increment IV by 1; IV is in big endian
2611  * input:
2612  *	IV
2613  *	CTR:	== IV, in little endian
2614  *	TCTR_LOW: == lower qword of CTR
2615  *	INC:	== 1, in little endian
2616  *	BSWAP_MASK == endian swapping mask
2617  * output:
 *	IV:	incremented by 1
2619  * changed:
2620  *	CTR:	== output IV, in little endian
2621  *	TCTR_LOW: == lower qword of CTR
2622  */
2623 SYM_FUNC_START_LOCAL(_aesni_inc)
2624 	paddq INC, CTR
2625 	add $1, TCTR_LOW
2626 	jnc .Linc_low
2627 	pslldq $8, INC
2628 	paddq INC, CTR
2629 	psrldq $8, INC
2630 .Linc_low:
2631 	movaps CTR, IV
2632 	pshufb BSWAP_MASK, IV
2633 	RET
2634 SYM_FUNC_END(_aesni_inc)
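
/*
 * A rough C sketch of the carry handling above (illustrative only).
 * The counter is kept as two little-endian 64-bit halves; the high
 * half only needs to change when the low half wraps, which is what the
 * "jnc .Linc_low" shortcut implements.  aesni_ctr_enc below calls this
 * once per block to produce successive big-endian counter values:
 *
 *	static void ctr128_inc(u64 *lo, u64 *hi)
 *	{
 *		if (++(*lo) == 0)	// low qword wrapped: propagate carry
 *			++(*hi);
 *	}
 */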
2635 
2636 /*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2638  *		      size_t len, u8 *iv)
2639  */
2640 SYM_FUNC_START(aesni_ctr_enc)
2641 	FRAME_BEGIN
2642 	cmp $16, LEN
2643 	jb .Lctr_enc_just_ret
2644 	mov 480(KEYP), KLEN
2645 	movups (IVP), IV
2646 	call _aesni_inc_init
2647 	cmp $64, LEN
2648 	jb .Lctr_enc_loop1
2649 .align 4
2650 .Lctr_enc_loop4:
2651 	movaps IV, STATE1
2652 	call _aesni_inc
2653 	movups (INP), IN1
2654 	movaps IV, STATE2
2655 	call _aesni_inc
2656 	movups 0x10(INP), IN2
2657 	movaps IV, STATE3
2658 	call _aesni_inc
2659 	movups 0x20(INP), IN3
2660 	movaps IV, STATE4
2661 	call _aesni_inc
2662 	movups 0x30(INP), IN4
2663 	call _aesni_enc4
2664 	pxor IN1, STATE1
2665 	movups STATE1, (OUTP)
2666 	pxor IN2, STATE2
2667 	movups STATE2, 0x10(OUTP)
2668 	pxor IN3, STATE3
2669 	movups STATE3, 0x20(OUTP)
2670 	pxor IN4, STATE4
2671 	movups STATE4, 0x30(OUTP)
2672 	sub $64, LEN
2673 	add $64, INP
2674 	add $64, OUTP
2675 	cmp $64, LEN
2676 	jge .Lctr_enc_loop4
2677 	cmp $16, LEN
2678 	jb .Lctr_enc_ret
2679 .align 4
2680 .Lctr_enc_loop1:
2681 	movaps IV, STATE
2682 	call _aesni_inc
2683 	movups (INP), IN
2684 	call _aesni_enc1
2685 	pxor IN, STATE
2686 	movups STATE, (OUTP)
2687 	sub $16, LEN
2688 	add $16, INP
2689 	add $16, OUTP
2690 	cmp $16, LEN
2691 	jge .Lctr_enc_loop1
2692 .Lctr_enc_ret:
2693 	movups IV, (IVP)
2694 .Lctr_enc_just_ret:
2695 	FRAME_END
2696 	RET
2697 SYM_FUNC_END(aesni_ctr_enc)
2698 
2699 /*
2700  * _aesni_gf128mul_x_ble:		internal ABI
2701  *	Multiply in GF(2^128) for XTS IVs
2702  * input:
2703  *	IV:	current IV
2704  *	GF128MUL_MASK == mask with 0x87 and 0x01
2705  * output:
2706  *	IV:	next IV
2707  * changed:
2708  *	CTR:	== temporary value
2709  */
2710 #define _aesni_gf128mul_x_ble() \
2711 	pshufd $0x13, IV, CTR; \
2712 	paddq IV, IV; \
2713 	psrad $31, CTR; \
2714 	pand GF128MUL_MASK, CTR; \
2715 	pxor CTR, IV;
2716 
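/*
 * A rough C sketch of the tweak doubling above (illustrative only).
 * The 128-bit tweak is treated as a little-endian value split into two
 * 64-bit halves; multiplying by x is a one-bit left shift, and the bit
 * shifted out of the top is folded back in as 0x87, the low byte of
 * the XTS reduction polynomial x^128 + x^7 + x^2 + x + 1 (that is what
 * the 0x87/0x01 mask loaded into GF128MUL_MASK encodes):
 *
 *	static void gf128mul_x_ble(u64 *lo, u64 *hi)	// lo = low 64 bits
 *	{
 *		u64 carry = *hi >> 63;	// bit about to fall off the top
 *
 *		*hi = (*hi << 1) | (*lo >> 63);
 *		*lo = (*lo << 1) ^ (carry ? 0x87 : 0);
 *	}
 */
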
2717 /*
2718  * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
2719  *			  const u8 *src, unsigned int len, le128 *iv)
2720  */
2721 SYM_FUNC_START(aesni_xts_encrypt)
2722 	FRAME_BEGIN
2723 
2724 	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2725 	movups (IVP), IV
2726 
2727 	mov 480(KEYP), KLEN
2728 
2729 .Lxts_enc_loop4:
2730 	movdqa IV, STATE1
2731 	movdqu 0x00(INP), INC
2732 	pxor INC, STATE1
2733 	movdqu IV, 0x00(OUTP)
2734 
2735 	_aesni_gf128mul_x_ble()
2736 	movdqa IV, STATE2
2737 	movdqu 0x10(INP), INC
2738 	pxor INC, STATE2
2739 	movdqu IV, 0x10(OUTP)
2740 
2741 	_aesni_gf128mul_x_ble()
2742 	movdqa IV, STATE3
2743 	movdqu 0x20(INP), INC
2744 	pxor INC, STATE3
2745 	movdqu IV, 0x20(OUTP)
2746 
2747 	_aesni_gf128mul_x_ble()
2748 	movdqa IV, STATE4
2749 	movdqu 0x30(INP), INC
2750 	pxor INC, STATE4
2751 	movdqu IV, 0x30(OUTP)
2752 
2753 	call _aesni_enc4
2754 
2755 	movdqu 0x00(OUTP), INC
2756 	pxor INC, STATE1
2757 	movdqu STATE1, 0x00(OUTP)
2758 
2759 	movdqu 0x10(OUTP), INC
2760 	pxor INC, STATE2
2761 	movdqu STATE2, 0x10(OUTP)
2762 
2763 	movdqu 0x20(OUTP), INC
2764 	pxor INC, STATE3
2765 	movdqu STATE3, 0x20(OUTP)
2766 
2767 	movdqu 0x30(OUTP), INC
2768 	pxor INC, STATE4
2769 	movdqu STATE4, 0x30(OUTP)
2770 
2771 	_aesni_gf128mul_x_ble()
2772 
2773 	add $64, INP
2774 	add $64, OUTP
2775 	sub $64, LEN
2776 	ja .Lxts_enc_loop4
2777 
2778 	movups IV, (IVP)
2779 
2780 	FRAME_END
2781 	RET
2782 SYM_FUNC_END(aesni_xts_encrypt)
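
/*
 * A rough C sketch of one block of the XTS loop above (illustrative
 * only; encrypt_block() stands in for _aesni_enc4, which handles four
 * blocks per pass).  The asm stashes the tweak in the output buffer
 * between the two XORs so it does not need an extra register per
 * block:
 *
 *	// one block:  C[i] = E_K(P[i] ^ T) ^ T
 *	for (i = 0; i < 16; i++)
 *		block[i] = in[i] ^ tweak[i];
 *	encrypt_block(block);		// stands in for _aesni_enc4
 *	for (i = 0; i < 16; i++)
 *		out[i] = block[i] ^ tweak[i];
 *	// then advance the tweak: T = T * x, as in _aesni_gf128mul_x_ble
 */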
2783 
2784 /*
2785  * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
2786  *			  const u8 *src, unsigned int len, le128 *iv)
2787  */
2788 SYM_FUNC_START(aesni_xts_decrypt)
2789 	FRAME_BEGIN
2790 
2791 	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2792 	movups (IVP), IV
2793 
2794 	mov 480(KEYP), KLEN
2795 	add $240, KEYP
2796 
2797 .Lxts_dec_loop4:
2798 	movdqa IV, STATE1
2799 	movdqu 0x00(INP), INC
2800 	pxor INC, STATE1
2801 	movdqu IV, 0x00(OUTP)
2802 
2803 	_aesni_gf128mul_x_ble()
2804 	movdqa IV, STATE2
2805 	movdqu 0x10(INP), INC
2806 	pxor INC, STATE2
2807 	movdqu IV, 0x10(OUTP)
2808 
2809 	_aesni_gf128mul_x_ble()
2810 	movdqa IV, STATE3
2811 	movdqu 0x20(INP), INC
2812 	pxor INC, STATE3
2813 	movdqu IV, 0x20(OUTP)
2814 
2815 	_aesni_gf128mul_x_ble()
2816 	movdqa IV, STATE4
2817 	movdqu 0x30(INP), INC
2818 	pxor INC, STATE4
2819 	movdqu IV, 0x30(OUTP)
2820 
2821 	call _aesni_dec4
2822 
2823 	movdqu 0x00(OUTP), INC
2824 	pxor INC, STATE1
2825 	movdqu STATE1, 0x00(OUTP)
2826 
2827 	movdqu 0x10(OUTP), INC
2828 	pxor INC, STATE2
2829 	movdqu STATE2, 0x10(OUTP)
2830 
2831 	movdqu 0x20(OUTP), INC
2832 	pxor INC, STATE3
2833 	movdqu STATE3, 0x20(OUTP)
2834 
2835 	movdqu 0x30(OUTP), INC
2836 	pxor INC, STATE4
2837 	movdqu STATE4, 0x30(OUTP)
2838 
2839 	_aesni_gf128mul_x_ble()
2840 
2841 	add $64, INP
2842 	add $64, OUTP
2843 	sub $64, LEN
2844 	ja .Lxts_dec_loop4
2845 
2846 	movups IV, (IVP)
2847 
2848 	FRAME_END
2849 	RET
2850 SYM_FUNC_END(aesni_xts_decrypt)
2851 
2852 #endif
2853