1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3  * Implement AES algorithm in Intel AES-NI instructions.
4  *
5  * The white paper of AES-NI instructions can be downloaded from:
6  *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
7  *
8  * Copyright (C) 2008, Intel Corp.
9  *    Author: Huang Ying <ying.huang@intel.com>
10  *            Vinodh Gopal <vinodh.gopal@intel.com>
11  *            Kahraman Akdemir
12  *
13  * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
14  * interface for 64-bit kernels.
15  *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
16  *             Aidan O'Mahony (aidan.o.mahony@intel.com)
17  *             Adrian Hoban <adrian.hoban@intel.com>
18  *             James Guilford (james.guilford@intel.com)
19  *             Gabriele Paoloni <gabriele.paoloni@intel.com>
20  *             Tadeusz Struk (tadeusz.struk@intel.com)
21  *             Wajdi Feghali (wajdi.k.feghali@intel.com)
22  *    Copyright (c) 2010, Intel Corporation.
23  *
24  * Ported x86_64 version to x86:
25  *    Author: Mathias Krause <minipli@googlemail.com>
26  */
27 
28 #include <linux/linkage.h>
29 #include <asm/frame.h>
30 #include <asm/nospec-branch.h>
31 
32 /*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register.  This can be done with either FP or integer instructions:
 * movaps (move aligned packed single) for FP, or movdqa (move double quad
 * aligned) for integer.  Since Nehalem (the original Core i7) there has been
 * no performance difference between the two, but movaps is a byte shorter,
 * so that is the one we'll use for now (same for the unaligned forms).
39  */
40 #define MOVADQ	movaps
41 #define MOVUDQ	movups
42 
43 #ifdef __x86_64__
44 
45 # constants in mergeable sections, linker can reorder and merge
46 .section	.rodata.cst16.POLY, "aM", @progbits, 16
47 .align 16
48 POLY:   .octa 0xC2000000000000000000000000000001
49 .section	.rodata.cst16.TWOONE, "aM", @progbits, 16
50 .align 16
51 TWOONE: .octa 0x00000001000000000000000000000001
52 
53 .section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
54 .align 16
55 SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
56 .section	.rodata.cst16.MASK1, "aM", @progbits, 16
57 .align 16
58 MASK1:      .octa 0x0000000000000000ffffffffffffffff
59 .section	.rodata.cst16.MASK2, "aM", @progbits, 16
60 .align 16
61 MASK2:      .octa 0xffffffffffffffff0000000000000000
62 .section	.rodata.cst16.ONE, "aM", @progbits, 16
63 .align 16
64 ONE:        .octa 0x00000000000000000000000000000001
65 .section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
66 .align 16
67 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
68 .section	.rodata.cst16.dec, "aM", @progbits, 16
69 .align 16
70 dec:        .octa 0x1
71 .section	.rodata.cst16.enc, "aM", @progbits, 16
72 .align 16
73 enc:        .octa 0x2
74 
75 # order of these constants should not change.
76 # more specifically, ALL_F should follow SHIFT_MASK,
77 # and zero should follow ALL_F
78 .section	.rodata, "a", @progbits
79 .align 16
80 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
81 ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
82             .octa 0x00000000000000000000000000000000
83 
84 .text
85 
86 
87 #define	STACK_OFFSET    8*3
88 
89 #define AadHash 16*0
90 #define AadLen 16*1
91 #define InLen (16*1)+8
92 #define PBlockEncKey 16*2
93 #define OrigIV 16*3
94 #define CurCount 16*4
95 #define PBlockLen 16*5
96 #define	HashKey		16*6	// store HashKey <<1 mod poly here
97 #define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
98 #define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
99 #define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
100 #define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
101 				// bits of  HashKey <<1 mod poly here
102 				//(for Karatsuba purposes)
103 #define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
104 				// bits of  HashKey^2 <<1 mod poly here
105 				// (for Karatsuba purposes)
106 #define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
107 				// bits of  HashKey^3 <<1 mod poly here
108 				// (for Karatsuba purposes)
109 #define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
110 				// bits of  HashKey^4 <<1 mod poly here
111 				// (for Karatsuba purposes)
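
/*
 * Rough C-side sketch of the layout the offsets above assume.  This is
 * illustrative only -- the authoritative struct gcm_context_data lives in
 * the C glue code, and the field names below are made up to mirror the
 * offset names:
 *
 *	struct gcm_context_data {
 *		u8  aad_hash[16];		// AadHash	(16*0)
 *		u64 aad_length;			// AadLen	(16*1)
 *		u64 in_length;			// InLen	(16*1)+8
 *		u8  partial_block_enc_key[16];	// PBlockEncKey	(16*2)
 *		u8  orig_IV[16];		// OrigIV	(16*3)
 *		u8  current_counter[16];	// CurCount	(16*4)
 *		u64 partial_block_length;	// PBlockLen	(16*5)
 *		u64 unused;
 *		u8  hash_keys[8 * 16];		// HashKey .. HashKey_4_k
 *	};
 */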
112 
113 #define arg1 rdi
114 #define arg2 rsi
115 #define arg3 rdx
116 #define arg4 rcx
117 #define arg5 r8
118 #define arg6 r9
119 #define arg7 STACK_OFFSET+8(%rsp)
120 #define arg8 STACK_OFFSET+16(%rsp)
121 #define arg9 STACK_OFFSET+24(%rsp)
122 #define arg10 STACK_OFFSET+32(%rsp)
123 #define arg11 STACK_OFFSET+40(%rsp)
124 #define keysize 2*15*16(%arg1)
125 #endif
126 
127 
128 #define STATE1	%xmm0
129 #define STATE2	%xmm4
130 #define STATE3	%xmm5
131 #define STATE4	%xmm6
132 #define STATE	STATE1
133 #define IN1	%xmm1
134 #define IN2	%xmm7
135 #define IN3	%xmm8
136 #define IN4	%xmm9
137 #define IN	IN1
138 #define KEY	%xmm2
139 #define IV	%xmm3
140 
141 #define BSWAP_MASK %xmm10
142 #define CTR	%xmm11
143 #define INC	%xmm12
144 
145 #define GF128MUL_MASK %xmm7
146 
147 #ifdef __x86_64__
148 #define AREG	%rax
149 #define KEYP	%rdi
150 #define OUTP	%rsi
151 #define UKEYP	OUTP
152 #define INP	%rdx
153 #define LEN	%rcx
154 #define IVP	%r8
155 #define KLEN	%r9d
156 #define T1	%r10
157 #define TKEYP	T1
158 #define T2	%r11
159 #define TCTR_LOW T2
160 #else
161 #define AREG	%eax
162 #define KEYP	%edi
163 #define OUTP	AREG
164 #define UKEYP	OUTP
165 #define INP	%edx
166 #define LEN	%esi
167 #define IVP	%ebp
168 #define KLEN	%ebx
169 #define T1	%ecx
170 #define TKEYP	T1
171 #endif
172 
173 .macro FUNC_SAVE
174 	push	%r12
175 	push	%r13
176 	push	%r14
177 #
178 # states of %xmm registers %xmm6:%xmm15 not saved
179 # all %xmm registers are clobbered
180 #
181 .endm
182 
183 
184 .macro FUNC_RESTORE
185 	pop	%r14
186 	pop	%r13
187 	pop	%r12
188 .endm
189 
190 # Precompute hashkeys.
191 # Input: Hash subkey.
192 # Output: HashKeys stored in gcm_context_data.  Only needs to be called
193 # once per key.
194 # clobbers r12, and tmp xmm registers.
195 .macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
196 	mov	\SUBKEY, %r12
197 	movdqu	(%r12), \TMP3
198 	movdqa	SHUF_MASK(%rip), \TMP2
199 	pshufb	\TMP2, \TMP3
200 
201 	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
202 
203 	movdqa	\TMP3, \TMP2
204 	psllq	$1, \TMP3
205 	psrlq	$63, \TMP2
206 	movdqa	\TMP2, \TMP1
207 	pslldq	$8, \TMP2
208 	psrldq	$8, \TMP1
209 	por	\TMP2, \TMP3
210 
211 	# reduce HashKey<<1
212 
213 	pshufd	$0x24, \TMP1, \TMP2
214 	pcmpeqd TWOONE(%rip), \TMP2
215 	pand	POLY(%rip), \TMP2
216 	pxor	\TMP2, \TMP3
217 	movdqu	\TMP3, HashKey(%arg2)
218 
219 	movdqa	   \TMP3, \TMP5
220 	pshufd	   $78, \TMP3, \TMP1
221 	pxor	   \TMP3, \TMP1
222 	movdqu	   \TMP1, HashKey_k(%arg2)
223 
224 	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
225 # TMP5 = HashKey^2<<1 (mod poly)
226 	movdqu	   \TMP5, HashKey_2(%arg2)
227 # HashKey_2 = HashKey^2<<1 (mod poly)
228 	pshufd	   $78, \TMP5, \TMP1
229 	pxor	   \TMP5, \TMP1
230 	movdqu	   \TMP1, HashKey_2_k(%arg2)
231 
232 	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
233 # TMP5 = HashKey^3<<1 (mod poly)
234 	movdqu	   \TMP5, HashKey_3(%arg2)
235 	pshufd	   $78, \TMP5, \TMP1
236 	pxor	   \TMP5, \TMP1
237 	movdqu	   \TMP1, HashKey_3_k(%arg2)
238 
239 	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
241 	movdqu	   \TMP5, HashKey_4(%arg2)
242 	pshufd	   $78, \TMP5, \TMP1
243 	pxor	   \TMP5, \TMP1
244 	movdqu	   \TMP1, HashKey_4_k(%arg2)
245 .endm
246 
247 # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
248 # Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
249 .macro GCM_INIT Iv SUBKEY AAD AADLEN
250 	mov \AADLEN, %r11
251 	mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
252 	xor %r11d, %r11d
253 	mov %r11, InLen(%arg2) # ctx_data.in_length = 0
254 	mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
255 	mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
256 	mov \Iv, %rax
257 	movdqu (%rax), %xmm0
258 	movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
259 
260 	movdqa  SHUF_MASK(%rip), %xmm2
261 	pshufb %xmm2, %xmm0
262 	movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
263 
264 	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
265 	movdqu HashKey(%arg2), %xmm13
266 
267 	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
268 	%xmm4, %xmm5, %xmm6
269 .endm
270 
271 # GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
272 # struct has been initialized by GCM_INIT.
273 # Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
274 # Clobbers rax, r10-r13, and xmm0-xmm15
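# Rough flow sketch (illustrative only): once any partial block buffered by a
# previous update call has been consumed,
#	full_block_bytes = len & ~15
#	initial_bytes    = full_block_bytes & (3<<4)	# 0..3 blocks, done singly
#	main_bytes       = full_block_bytes - initial_bytes	# done 4 blocks/iter
#	tail             = len & 15		# buffered as a partial block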
275 .macro GCM_ENC_DEC operation
276 	movdqu AadHash(%arg2), %xmm8
277 	movdqu HashKey(%arg2), %xmm13
278 	add %arg5, InLen(%arg2)
279 
280 	xor %r11d, %r11d # initialise the data pointer offset as zero
281 	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
282 
283 	sub %r11, %arg5		# sub partial block data used
284 	mov %arg5, %r13		# save the number of bytes
285 
286 	and $-16, %r13		# %r13 = %r13 - (%r13 mod 16)
287 	mov %r13, %r12
288 	# Encrypt/Decrypt first few blocks
289 
290 	and	$(3<<4), %r12
291 	jz	.L_initial_num_blocks_is_0_\@
292 	cmp	$(2<<4), %r12
293 	jb	.L_initial_num_blocks_is_1_\@
294 	je	.L_initial_num_blocks_is_2_\@
295 .L_initial_num_blocks_is_3_\@:
296 	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
297 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
298 	sub	$48, %r13
299 	jmp	.L_initial_blocks_\@
300 .L_initial_num_blocks_is_2_\@:
301 	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
302 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
303 	sub	$32, %r13
304 	jmp	.L_initial_blocks_\@
305 .L_initial_num_blocks_is_1_\@:
306 	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
307 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
308 	sub	$16, %r13
309 	jmp	.L_initial_blocks_\@
310 .L_initial_num_blocks_is_0_\@:
311 	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
312 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
313 .L_initial_blocks_\@:
314 
315 	# Main loop - Encrypt/Decrypt remaining blocks
316 
317 	test	%r13, %r13
318 	je	.L_zero_cipher_left_\@
319 	sub	$64, %r13
320 	je	.L_four_cipher_left_\@
321 .L_crypt_by_4_\@:
322 	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
323 	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
324 	%xmm7, %xmm8, enc
325 	add	$64, %r11
326 	sub	$64, %r13
327 	jne	.L_crypt_by_4_\@
328 .L_four_cipher_left_\@:
329 	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
330 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
331 .L_zero_cipher_left_\@:
332 	movdqu %xmm8, AadHash(%arg2)
333 	movdqu %xmm0, CurCount(%arg2)
334 
335 	mov	%arg5, %r13
336 	and	$15, %r13			# %r13 = arg5 (mod 16)
337 	je	.L_multiple_of_16_bytes_\@
338 
339 	mov %r13, PBlockLen(%arg2)
340 
341 	# Handle the last <16 Byte block separately
342 	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
343 	movdqu %xmm0, CurCount(%arg2)
344 	movdqa SHUF_MASK(%rip), %xmm10
345 	pshufb %xmm10, %xmm0
346 
347 	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
348 	movdqu %xmm0, PBlockEncKey(%arg2)
349 
350 	cmp	$16, %arg5
351 	jge	.L_large_enough_update_\@
352 
353 	lea (%arg4,%r11,1), %r10
354 	mov %r13, %r12
355 	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
356 	jmp	.L_data_read_\@
357 
358 .L_large_enough_update_\@:
359 	sub	$16, %r11
360 	add	%r13, %r11
361 
362 	# receive the last <16 Byte block
363 	movdqu	(%arg4, %r11, 1), %xmm1
364 
365 	sub	%r13, %r11
366 	add	$16, %r11
367 
368 	lea	SHIFT_MASK+16(%rip), %r12
369 	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
370 	# (r13 is the number of bytes in plaintext mod 16)
371 	sub	%r13, %r12
372 	# get the appropriate shuffle mask
373 	movdqu	(%r12), %xmm2
374 	# shift right 16-r13 bytes
375 	pshufb  %xmm2, %xmm1
376 
377 .L_data_read_\@:
378 	lea ALL_F+16(%rip), %r12
379 	sub %r13, %r12
380 
381 .ifc \operation, dec
382 	movdqa  %xmm1, %xmm2
383 .endif
384 	pxor	%xmm1, %xmm0            # XOR Encrypt(K, Yn)
385 	movdqu	(%r12), %xmm1
386 	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
387 	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
388 .ifc \operation, dec
389 	pand    %xmm1, %xmm2
390 	movdqa SHUF_MASK(%rip), %xmm10
391 	pshufb %xmm10 ,%xmm2
392 
393 	pxor %xmm2, %xmm8
394 .else
395 	movdqa SHUF_MASK(%rip), %xmm10
396 	pshufb %xmm10,%xmm0
397 
398 	pxor	%xmm0, %xmm8
399 .endif
400 
401 	movdqu %xmm8, AadHash(%arg2)
402 .ifc \operation, enc
403 	# GHASH computation for the last <16 byte block
404 	movdqa SHUF_MASK(%rip), %xmm10
405 	# shuffle xmm0 back to output as ciphertext
406 	pshufb %xmm10, %xmm0
407 .endif
408 
409 	# Output %r13 bytes
410 	movq %xmm0, %rax
411 	cmp $8, %r13
412 	jle .L_less_than_8_bytes_left_\@
413 	mov %rax, (%arg3 , %r11, 1)
414 	add $8, %r11
415 	psrldq $8, %xmm0
416 	movq %xmm0, %rax
417 	sub $8, %r13
418 .L_less_than_8_bytes_left_\@:
419 	mov %al,  (%arg3, %r11, 1)
420 	add $1, %r11
421 	shr $8, %rax
422 	sub $1, %r13
423 	jne .L_less_than_8_bytes_left_\@
424 .L_multiple_of_16_bytes_\@:
425 .endm
426 
427 # GCM_COMPLETE Finishes update of tag of last partial block
# Output: Authentication Tag (AUTH_TAG)
429 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
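# For reference, the tag computed here follows the GCM definition (a sketch,
# with A and C 0-padded to a multiple of 128 bits):
#	S = GHASH_H( A || C || [len(A)]_64 || [len(C)]_64 )
#	T = MSB_auth_tag_len( E(K, Y0) XOR S )
# where Y0 is the original counter block saved in OrigIV.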
430 .macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
431 	movdqu AadHash(%arg2), %xmm8
432 	movdqu HashKey(%arg2), %xmm13
433 
434 	mov PBlockLen(%arg2), %r12
435 
436 	test %r12, %r12
437 	je .L_partial_done\@
438 
439 	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
440 
441 .L_partial_done\@:
	mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
443 	shl	$3, %r12		  # convert into number of bits
444 	movd	%r12d, %xmm15		  # len(A) in %xmm15
445 	mov InLen(%arg2), %r12
	shl     $3, %r12                  # len(C) in bits (*8)
447 	movq    %r12, %xmm1
448 
449 	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
450 	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
451 	pxor	%xmm15, %xmm8
452 	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
453 	# final GHASH computation
454 	movdqa SHUF_MASK(%rip), %xmm10
455 	pshufb %xmm10, %xmm8
456 
457 	movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
458 	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
459 	pxor	%xmm8, %xmm0
460 .L_return_T_\@:
461 	mov	\AUTHTAG, %r10                     # %r10 = authTag
462 	mov	\AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
463 	cmp	$16, %r11
464 	je	.L_T_16_\@
465 	cmp	$8, %r11
466 	jl	.L_T_4_\@
467 .L_T_8_\@:
468 	movq	%xmm0, %rax
469 	mov	%rax, (%r10)
470 	add	$8, %r10
471 	sub	$8, %r11
472 	psrldq	$8, %xmm0
473 	test	%r11, %r11
474 	je	.L_return_T_done_\@
475 .L_T_4_\@:
476 	movd	%xmm0, %eax
477 	mov	%eax, (%r10)
478 	add	$4, %r10
479 	sub	$4, %r11
480 	psrldq	$4, %xmm0
481 	test	%r11, %r11
482 	je	.L_return_T_done_\@
483 .L_T_123_\@:
484 	movd	%xmm0, %eax
485 	cmp	$2, %r11
486 	jl	.L_T_1_\@
487 	mov	%ax, (%r10)
488 	cmp	$2, %r11
489 	je	.L_return_T_done_\@
490 	add	$2, %r10
491 	sar	$16, %eax
492 .L_T_1_\@:
493 	mov	%al, (%r10)
494 	jmp	.L_return_T_done_\@
495 .L_T_16_\@:
496 	movdqu	%xmm0, (%r10)
497 .L_return_T_done_\@:
498 .endm
499 
500 #ifdef __x86_64__
501 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
502 *
503 *
504 * Input: A and B (128-bits each, bit-reflected)
505 * Output: C = A*B*x mod poly, (i.e. >>1 )
506 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
507 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
508 *
509 */
510 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
511 	movdqa	  \GH, \TMP1
512 	pshufd	  $78, \GH, \TMP2
513 	pshufd	  $78, \HK, \TMP3
514 	pxor	  \GH, \TMP2            # TMP2 = a1+a0
515 	pxor	  \HK, \TMP3            # TMP3 = b1+b0
516 	pclmulqdq $0x11, \HK, \TMP1     # TMP1 = a1*b1
517 	pclmulqdq $0x00, \HK, \GH       # GH = a0*b0
518 	pclmulqdq $0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
519 	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2          # TMP2 = a1*b0 + a0*b1 (middle terms)
521 	movdqa	  \TMP2, \TMP3
522 	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
523 	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
524 	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
526 
527         # first phase of the reduction
528 
529 	movdqa    \GH, \TMP2
530 	movdqa    \GH, \TMP3
	movdqa    \GH, \TMP4            # copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld     $31, \TMP2            # packed left shift <<31
	pslld     $30, \TMP3            # packed left shift <<30
	pslld     $25, \TMP4            # packed left shift <<25
537 	pxor      \TMP3, \TMP2          # xor the shifted versions
538 	pxor      \TMP4, \TMP2
539 	movdqa    \TMP2, \TMP5
540 	psrldq    $4, \TMP5             # right shift TMP5 1 DW
541 	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
542 	pxor      \TMP2, \GH
543 
544         # second phase of the reduction
545 
	movdqa    \GH,\TMP2             # copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
549 	movdqa    \GH,\TMP3
550 	movdqa    \GH,\TMP4
	psrld     $1,\TMP2              # packed right shift >>1
	psrld     $2,\TMP3              # packed right shift >>2
	psrld     $7,\TMP4              # packed right shift >>7
554 	pxor      \TMP3,\TMP2		# xor the shifted versions
555 	pxor      \TMP4,\TMP2
556 	pxor      \TMP5, \TMP2
557 	pxor      \TMP2, \GH
	pxor      \TMP1, \GH            # result is in GH
559 .endm
560 
561 # Reads DLEN bytes starting at DPTR and stores in XMMDst
562 # where 0 < DLEN < 16
563 # Clobbers %rax, DLEN and XMM1
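# Hedged C sketch (illustrative only, not part of the build) of the result:
# the DLEN input bytes end up in the low bytes of XMMDst and the rest of the
# register is zero, without reading past the end of the DLEN-byte buffer:
#
#	static void read_partial_block(const u8 *src, unsigned int len,
#				       u8 dst[16])
#	{
#		memset(dst, 0, 16);		// clear the whole block
#		memcpy(dst, src, len);		// len is 1..15 here
#	}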
564 .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
565         cmp $8, \DLEN
566         jl .L_read_lt8_\@
567         mov (\DPTR), %rax
568         movq %rax, \XMMDst
569         sub $8, \DLEN
570         jz .L_done_read_partial_block_\@
571 	xor %eax, %eax
572 .L_read_next_byte_\@:
573         shl $8, %rax
574         mov 7(\DPTR, \DLEN, 1), %al
575         dec \DLEN
576         jnz .L_read_next_byte_\@
577         movq %rax, \XMM1
578 	pslldq $8, \XMM1
579         por \XMM1, \XMMDst
580 	jmp .L_done_read_partial_block_\@
581 .L_read_lt8_\@:
582 	xor %eax, %eax
583 .L_read_next_byte_lt8_\@:
584         shl $8, %rax
585         mov -1(\DPTR, \DLEN, 1), %al
586         dec \DLEN
587         jnz .L_read_next_byte_lt8_\@
588         movq %rax, \XMMDst
589 .L_done_read_partial_block_\@:
590 .endm
591 
592 # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
593 # clobbers r10-11, xmm14
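# Sketch of what this computes (illustrative only): for each full 16-byte
# AAD block B,
#	hash = GHASH_MUL(hash XOR byte_reflect(B), HashKey)
# a trailing partial block is zero-padded via READ_PARTIAL_BLOCK and folded
# in the same way; the result is written to AadHash in the context.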
594 .macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
595 	TMP6 TMP7
596 	MOVADQ	   SHUF_MASK(%rip), %xmm14
597 	mov	   \AAD, %r10		# %r10 = AAD
598 	mov	   \AADLEN, %r11		# %r11 = aadLen
599 	pxor	   \TMP7, \TMP7
600 	pxor	   \TMP6, \TMP6
601 
602 	cmp	   $16, %r11
603 	jl	   .L_get_AAD_rest\@
604 .L_get_AAD_blocks\@:
605 	movdqu	   (%r10), \TMP7
606 	pshufb	   %xmm14, \TMP7 # byte-reflect the AAD data
607 	pxor	   \TMP7, \TMP6
608 	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
609 	add	   $16, %r10
610 	sub	   $16, %r11
611 	cmp	   $16, %r11
612 	jge	   .L_get_AAD_blocks\@
613 
614 	movdqu	   \TMP6, \TMP7
615 
616 	/* read the last <16B of AAD */
617 .L_get_AAD_rest\@:
618 	test	   %r11, %r11
619 	je	   .L_get_AAD_done\@
620 
621 	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
622 	pshufb	   %xmm14, \TMP7 # byte-reflect the AAD data
623 	pxor	   \TMP6, \TMP7
624 	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
625 	movdqu \TMP7, \TMP6
626 
627 .L_get_AAD_done\@:
628 	movdqu \TMP6, AadHash(%arg2)
629 .endm
630 
631 # PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
632 # between update calls.
633 # Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
634 # Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
635 # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
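# Rough sketch of the bookkeeping (illustrative only):
#	if (PBlockLen != 0) {
#		n = min(16 - PBlockLen, PLAIN_CYPH_LEN);  # bytes to fill block
#		XOR the next n input bytes with the buffered keystream
#		E(K, Yn), write them out, and accumulate them into AadHash;
#		the GHASH multiply runs only once the 16-byte block is complete;
#		PBlockLen = (PBlockLen + n == 16) ? 0 : PBlockLen + n;
#		DATA_OFFSET += n;
#	}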
636 .macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
637 	AAD_HASH operation
638 	mov 	PBlockLen(%arg2), %r13
639 	test	%r13, %r13
640 	je	.L_partial_block_done_\@	# Leave Macro if no partial blocks
641 	# Read in input data without over reading
642 	cmp	$16, \PLAIN_CYPH_LEN
643 	jl	.L_fewer_than_16_bytes_\@
644 	movups	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
645 	jmp	.L_data_read_\@
646 
647 .L_fewer_than_16_bytes_\@:
648 	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
649 	mov	\PLAIN_CYPH_LEN, %r12
650 	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
651 
652 	mov PBlockLen(%arg2), %r13
653 
654 .L_data_read_\@:				# Finished reading in data
655 
656 	movdqu	PBlockEncKey(%arg2), %xmm9
657 	movdqu	HashKey(%arg2), %xmm13
658 
659 	lea	SHIFT_MASK(%rip), %r12
660 
	# adjust the shuffle mask pointer to be able to shift r13 bytes
	# (r13 is the number of bytes already buffered in the partial block,
	# i.e. the total data length so far mod 16)
663 	add	%r13, %r12
664 	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
665 	pshufb	%xmm2, %xmm9		# shift right r13 bytes
666 
667 .ifc \operation, dec
668 	movdqa	%xmm1, %xmm3
669 	pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)
670 
671 	mov	\PLAIN_CYPH_LEN, %r10
672 	add	%r13, %r10
	# Set r10 to be the amount of data left in PLAIN_CYPH_IN after filling
674 	sub	$16, %r10
	# Determine if the partial block is not being completely filled and
	# adjust the shift mask accordingly
677 	jge	.L_no_extra_mask_1_\@
678 	sub	%r10, %r12
679 .L_no_extra_mask_1_\@:
680 
681 	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
682 	# get the appropriate mask to mask out bottom r13 bytes of xmm9
683 	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9
684 
685 	pand	%xmm1, %xmm3
686 	movdqa	SHUF_MASK(%rip), %xmm10
687 	pshufb	%xmm10, %xmm3
688 	pshufb	%xmm2, %xmm3
689 	pxor	%xmm3, \AAD_HASH
690 
691 	test	%r10, %r10
692 	jl	.L_partial_incomplete_1_\@
693 
694 	# GHASH computation for the last <16 Byte block
695 	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
696 	xor	%eax, %eax
697 
698 	mov	%rax, PBlockLen(%arg2)
699 	jmp	.L_dec_done_\@
700 .L_partial_incomplete_1_\@:
701 	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
702 .L_dec_done_\@:
703 	movdqu	\AAD_HASH, AadHash(%arg2)
704 .else
705 	pxor	%xmm1, %xmm9			# Plaintext XOR E(K, Yn)
706 
707 	mov	\PLAIN_CYPH_LEN, %r10
708 	add	%r13, %r10
	# Set r10 to be the amount of data left in PLAIN_CYPH_IN after filling
710 	sub	$16, %r10
	# Determine if the partial block is not being completely filled and
	# adjust the shift mask accordingly
713 	jge	.L_no_extra_mask_2_\@
714 	sub	%r10, %r12
715 .L_no_extra_mask_2_\@:
716 
717 	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
718 	# get the appropriate mask to mask out bottom r13 bytes of xmm9
719 	pand	%xmm1, %xmm9
720 
721 	movdqa	SHUF_MASK(%rip), %xmm1
722 	pshufb	%xmm1, %xmm9
723 	pshufb	%xmm2, %xmm9
724 	pxor	%xmm9, \AAD_HASH
725 
726 	test	%r10, %r10
727 	jl	.L_partial_incomplete_2_\@
728 
729 	# GHASH computation for the last <16 Byte block
730 	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
731 	xor	%eax, %eax
732 
733 	mov	%rax, PBlockLen(%arg2)
734 	jmp	.L_encode_done_\@
735 .L_partial_incomplete_2_\@:
736 	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
737 .L_encode_done_\@:
738 	movdqu	\AAD_HASH, AadHash(%arg2)
739 
740 	movdqa	SHUF_MASK(%rip), %xmm10
741 	# shuffle xmm9 back to output as ciphertext
742 	pshufb	%xmm10, %xmm9
743 	pshufb	%xmm2, %xmm9
744 .endif
745 	# output encrypted Bytes
746 	test	%r10, %r10
747 	jl	.L_partial_fill_\@
748 	mov	%r13, %r12
749 	mov	$16, %r13
750 	# Set r13 to be the number of bytes to write out
751 	sub	%r12, %r13
752 	jmp	.L_count_set_\@
753 .L_partial_fill_\@:
754 	mov	\PLAIN_CYPH_LEN, %r13
755 .L_count_set_\@:
756 	movdqa	%xmm9, %xmm0
757 	movq	%xmm0, %rax
758 	cmp	$8, %r13
759 	jle	.L_less_than_8_bytes_left_\@
760 
761 	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
762 	add	$8, \DATA_OFFSET
763 	psrldq	$8, %xmm0
764 	movq	%xmm0, %rax
765 	sub	$8, %r13
766 .L_less_than_8_bytes_left_\@:
767 	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
768 	add	$1, \DATA_OFFSET
769 	shr	$8, %rax
770 	sub	$1, %r13
771 	jne	.L_less_than_8_bytes_left_\@
772 .L_partial_block_done_\@:
773 .endm # PARTIAL_BLOCK
774 
775 /*
776 * if a = number of total plaintext bytes
777 * b = floor(a/16)
778 * num_initial_blocks = b mod 4
779 * encrypt the initial num_initial_blocks blocks and apply ghash on
780 * the ciphertext
781 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
782 * are clobbered
* arg1, %arg2, %arg3 are used as pointers only, not modified
784 */
785 
786 
787 .macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
788 	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
789 	MOVADQ		SHUF_MASK(%rip), %xmm14
790 
	movdqu AadHash(%arg2), %xmm\i		    # load the current hash value
792 
793 	# start AES for num_initial_blocks blocks
794 
795 	movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0
796 
797 .if (\i == 5) || (\i == 6) || (\i == 7)
798 
799 	MOVADQ		ONE(%RIP),\TMP1
800 	MOVADQ		0(%arg1),\TMP2
801 .irpc index, \i_seq
802 	paddd		\TMP1, \XMM0                 # INCR Y0
803 .ifc \operation, dec
804         movdqa     \XMM0, %xmm\index
805 .else
806 	MOVADQ		\XMM0, %xmm\index
807 .endif
808 	pshufb	%xmm14, %xmm\index      # perform a 16 byte swap
809 	pxor		\TMP2, %xmm\index
810 .endr
811 	lea	0x10(%arg1),%r10
812 	mov	keysize,%eax
813 	shr	$2,%eax				# 128->4, 192->6, 256->8
814 	add	$5,%eax			      # 128->9, 192->11, 256->13
815 
816 .Laes_loop_initial_\@:
817 	MOVADQ	(%r10),\TMP1
818 .irpc	index, \i_seq
819 	aesenc	\TMP1, %xmm\index
820 .endr
821 	add	$16,%r10
822 	sub	$1,%eax
823 	jnz	.Laes_loop_initial_\@
824 
825 	MOVADQ	(%r10), \TMP1
826 .irpc index, \i_seq
827 	aesenclast \TMP1, %xmm\index         # Last Round
828 .endr
829 .irpc index, \i_seq
830 	movdqu	   (%arg4 , %r11, 1), \TMP1
831 	pxor	   \TMP1, %xmm\index
832 	movdqu	   %xmm\index, (%arg3 , %r11, 1)
833 	# write back plaintext/ciphertext for num_initial_blocks
834 	add	   $16, %r11
835 
836 .ifc \operation, dec
837 	movdqa     \TMP1, %xmm\index
838 .endif
839 	pshufb	   %xmm14, %xmm\index
840 
841 		# prepare plaintext/ciphertext for GHASH computation
842 .endr
843 .endif
844 
845         # apply GHASH on num_initial_blocks blocks
846 
847 .if \i == 5
848         pxor       %xmm5, %xmm6
849 	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
850         pxor       %xmm6, %xmm7
851 	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
852         pxor       %xmm7, %xmm8
853 	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
854 .elseif \i == 6
855         pxor       %xmm6, %xmm7
856 	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
857         pxor       %xmm7, %xmm8
858 	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
859 .elseif \i == 7
860         pxor       %xmm7, %xmm8
861 	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
862 .endif
863 	cmp	   $64, %r13
864 	jl	.L_initial_blocks_done\@
865 	# no need for precomputed values
866 /*
867 *
868 * Precomputations for HashKey parallel with encryption of first 4 blocks.
869 * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
870 */
871 	MOVADQ	   ONE(%RIP),\TMP1
872 	paddd	   \TMP1, \XMM0              # INCR Y0
873 	MOVADQ	   \XMM0, \XMM1
874 	pshufb  %xmm14, \XMM1        # perform a 16 byte swap
875 
876 	paddd	   \TMP1, \XMM0              # INCR Y0
877 	MOVADQ	   \XMM0, \XMM2
878 	pshufb  %xmm14, \XMM2        # perform a 16 byte swap
879 
880 	paddd	   \TMP1, \XMM0              # INCR Y0
881 	MOVADQ	   \XMM0, \XMM3
882 	pshufb %xmm14, \XMM3        # perform a 16 byte swap
883 
884 	paddd	   \TMP1, \XMM0              # INCR Y0
885 	MOVADQ	   \XMM0, \XMM4
886 	pshufb %xmm14, \XMM4        # perform a 16 byte swap
887 
888 	MOVADQ	   0(%arg1),\TMP1
889 	pxor	   \TMP1, \XMM1
890 	pxor	   \TMP1, \XMM2
891 	pxor	   \TMP1, \XMM3
892 	pxor	   \TMP1, \XMM4
893 .irpc index, 1234 # do 4 rounds
894 	movaps 0x10*\index(%arg1), \TMP1
895 	aesenc	   \TMP1, \XMM1
896 	aesenc	   \TMP1, \XMM2
897 	aesenc	   \TMP1, \XMM3
898 	aesenc	   \TMP1, \XMM4
899 .endr
900 .irpc index, 56789 # do next 5 rounds
901 	movaps 0x10*\index(%arg1), \TMP1
902 	aesenc	   \TMP1, \XMM1
903 	aesenc	   \TMP1, \XMM2
904 	aesenc	   \TMP1, \XMM3
905 	aesenc	   \TMP1, \XMM4
906 .endr
907 	lea	   0xa0(%arg1),%r10
908 	mov	   keysize,%eax
909 	shr	   $2,%eax			# 128->4, 192->6, 256->8
910 	sub	   $4,%eax			# 128->0, 192->2, 256->4
911 	jz	   .Laes_loop_pre_done\@
912 
913 .Laes_loop_pre_\@:
914 	MOVADQ	   (%r10),\TMP2
915 .irpc	index, 1234
916 	aesenc	   \TMP2, %xmm\index
917 .endr
918 	add	   $16,%r10
919 	sub	   $1,%eax
920 	jnz	   .Laes_loop_pre_\@
921 
922 .Laes_loop_pre_done\@:
923 	MOVADQ	   (%r10), \TMP2
924 	aesenclast \TMP2, \XMM1
925 	aesenclast \TMP2, \XMM2
926 	aesenclast \TMP2, \XMM3
927 	aesenclast \TMP2, \XMM4
928 	movdqu	   16*0(%arg4 , %r11 , 1), \TMP1
929 	pxor	   \TMP1, \XMM1
930 .ifc \operation, dec
931 	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
932 	movdqa     \TMP1, \XMM1
933 .endif
934 	movdqu	   16*1(%arg4 , %r11 , 1), \TMP1
935 	pxor	   \TMP1, \XMM2
936 .ifc \operation, dec
937 	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
938 	movdqa     \TMP1, \XMM2
939 .endif
940 	movdqu	   16*2(%arg4 , %r11 , 1), \TMP1
941 	pxor	   \TMP1, \XMM3
942 .ifc \operation, dec
943 	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
944 	movdqa     \TMP1, \XMM3
945 .endif
946 	movdqu	   16*3(%arg4 , %r11 , 1), \TMP1
947 	pxor	   \TMP1, \XMM4
948 .ifc \operation, dec
949 	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
950 	movdqa     \TMP1, \XMM4
951 .else
952 	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
953 	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
954 	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
955 	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
956 .endif
957 
958 	add	   $64, %r11
959 	pshufb %xmm14, \XMM1 # perform a 16 byte swap
960 	pxor	   \XMMDst, \XMM1
961 # combine GHASHed value with the corresponding ciphertext
962 	pshufb %xmm14, \XMM2 # perform a 16 byte swap
963 	pshufb %xmm14, \XMM3 # perform a 16 byte swap
964 	pshufb %xmm14, \XMM4 # perform a 16 byte swap
965 
966 .L_initial_blocks_done\@:
967 
968 .endm
969 
970 /*
971 * encrypt 4 blocks at a time
972 * ghash the 4 previously encrypted ciphertext blocks
973 * arg1, %arg3, %arg4 are used as pointers only, not modified
974 * %r11 is the data offset value
975 */
976 .macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
977 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
978 
979 	movdqa	  \XMM1, \XMM5
980 	movdqa	  \XMM2, \XMM6
981 	movdqa	  \XMM3, \XMM7
982 	movdqa	  \XMM4, \XMM8
983 
984         movdqa    SHUF_MASK(%rip), %xmm15
985         # multiply TMP5 * HashKey using karatsuba
986 
987 	movdqa	  \XMM5, \TMP4
988 	pshufd	  $78, \XMM5, \TMP6
989 	pxor	  \XMM5, \TMP6
990 	paddd     ONE(%rip), \XMM0		# INCR CNT
991 	movdqu	  HashKey_4(%arg2), \TMP5
992 	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
993 	movdqa    \XMM0, \XMM1
994 	paddd     ONE(%rip), \XMM0		# INCR CNT
995 	movdqa    \XMM0, \XMM2
996 	paddd     ONE(%rip), \XMM0		# INCR CNT
997 	movdqa    \XMM0, \XMM3
998 	paddd     ONE(%rip), \XMM0		# INCR CNT
999 	movdqa    \XMM0, \XMM4
1000 	pshufb %xmm15, \XMM1	# perform a 16 byte swap
1001 	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1002 	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1003 	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1004 	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1005 
1006 	pxor	  (%arg1), \XMM1
1007 	pxor	  (%arg1), \XMM2
1008 	pxor	  (%arg1), \XMM3
1009 	pxor	  (%arg1), \XMM4
1010 	movdqu	  HashKey_4_k(%arg2), \TMP5
1011 	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1012 	movaps 0x10(%arg1), \TMP1
1013 	aesenc	  \TMP1, \XMM1              # Round 1
1014 	aesenc	  \TMP1, \XMM2
1015 	aesenc	  \TMP1, \XMM3
1016 	aesenc	  \TMP1, \XMM4
1017 	movaps 0x20(%arg1), \TMP1
1018 	aesenc	  \TMP1, \XMM1              # Round 2
1019 	aesenc	  \TMP1, \XMM2
1020 	aesenc	  \TMP1, \XMM3
1021 	aesenc	  \TMP1, \XMM4
1022 	movdqa	  \XMM6, \TMP1
1023 	pshufd	  $78, \XMM6, \TMP2
1024 	pxor	  \XMM6, \TMP2
1025 	movdqu	  HashKey_3(%arg2), \TMP5
1026 	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1027 	movaps 0x30(%arg1), \TMP3
1028 	aesenc    \TMP3, \XMM1              # Round 3
1029 	aesenc    \TMP3, \XMM2
1030 	aesenc    \TMP3, \XMM3
1031 	aesenc    \TMP3, \XMM4
1032 	pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1033 	movaps 0x40(%arg1), \TMP3
1034 	aesenc	  \TMP3, \XMM1              # Round 4
1035 	aesenc	  \TMP3, \XMM2
1036 	aesenc	  \TMP3, \XMM3
1037 	aesenc	  \TMP3, \XMM4
1038 	movdqu	  HashKey_3_k(%arg2), \TMP5
1039 	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1040 	movaps 0x50(%arg1), \TMP3
1041 	aesenc	  \TMP3, \XMM1              # Round 5
1042 	aesenc	  \TMP3, \XMM2
1043 	aesenc	  \TMP3, \XMM3
1044 	aesenc	  \TMP3, \XMM4
1045 	pxor	  \TMP1, \TMP4
1046 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1047 	pxor	  \XMM6, \XMM5
1048 	pxor	  \TMP2, \TMP6
1049 	movdqa	  \XMM7, \TMP1
1050 	pshufd	  $78, \XMM7, \TMP2
1051 	pxor	  \XMM7, \TMP2
1052 	movdqu	  HashKey_2(%arg2), \TMP5
1053 
1054         # Multiply TMP5 * HashKey using karatsuba
1055 
1056 	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1057 	movaps 0x60(%arg1), \TMP3
1058 	aesenc	  \TMP3, \XMM1              # Round 6
1059 	aesenc	  \TMP3, \XMM2
1060 	aesenc	  \TMP3, \XMM3
1061 	aesenc	  \TMP3, \XMM4
1062 	pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1063 	movaps 0x70(%arg1), \TMP3
1064 	aesenc	  \TMP3, \XMM1              # Round 7
1065 	aesenc	  \TMP3, \XMM2
1066 	aesenc	  \TMP3, \XMM3
1067 	aesenc	  \TMP3, \XMM4
1068 	movdqu	  HashKey_2_k(%arg2), \TMP5
1069 	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1070 	movaps 0x80(%arg1), \TMP3
1071 	aesenc	  \TMP3, \XMM1              # Round 8
1072 	aesenc	  \TMP3, \XMM2
1073 	aesenc	  \TMP3, \XMM3
1074 	aesenc	  \TMP3, \XMM4
1075 	pxor	  \TMP1, \TMP4
1076 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1077 	pxor	  \XMM7, \XMM5
1078 	pxor	  \TMP2, \TMP6
1079 
1080         # Multiply XMM8 * HashKey
1081         # XMM8 and TMP5 hold the values for the two operands
1082 
1083 	movdqa	  \XMM8, \TMP1
1084 	pshufd	  $78, \XMM8, \TMP2
1085 	pxor	  \XMM8, \TMP2
1086 	movdqu	  HashKey(%arg2), \TMP5
1087 	pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1088 	movaps 0x90(%arg1), \TMP3
1089 	aesenc	  \TMP3, \XMM1             # Round 9
1090 	aesenc	  \TMP3, \XMM2
1091 	aesenc	  \TMP3, \XMM3
1092 	aesenc	  \TMP3, \XMM4
1093 	pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1094 	lea	  0xa0(%arg1),%r10
1095 	mov	  keysize,%eax
1096 	shr	  $2,%eax			# 128->4, 192->6, 256->8
1097 	sub	  $4,%eax			# 128->0, 192->2, 256->4
1098 	jz	  .Laes_loop_par_enc_done\@
1099 
1100 .Laes_loop_par_enc\@:
1101 	MOVADQ	  (%r10),\TMP3
1102 .irpc	index, 1234
1103 	aesenc	  \TMP3, %xmm\index
1104 .endr
1105 	add	  $16,%r10
1106 	sub	  $1,%eax
1107 	jnz	  .Laes_loop_par_enc\@
1108 
1109 .Laes_loop_par_enc_done\@:
1110 	MOVADQ	  (%r10), \TMP3
	aesenclast \TMP3, \XMM1           # last round
1112 	aesenclast \TMP3, \XMM2
1113 	aesenclast \TMP3, \XMM3
1114 	aesenclast \TMP3, \XMM4
1115 	movdqu    HashKey_k(%arg2), \TMP5
1116 	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1117 	movdqu	  (%arg4,%r11,1), \TMP3
1118 	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1119 	movdqu	  16(%arg4,%r11,1), \TMP3
1120 	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1121 	movdqu	  32(%arg4,%r11,1), \TMP3
1122 	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1123 	movdqu	  48(%arg4,%r11,1), \TMP3
1124 	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1125         movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
1126         movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
1127         movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
1128         movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
1129 	pshufb %xmm15, \XMM1        # perform a 16 byte swap
1130 	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1131 	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1132 	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1133 
1134 	pxor	  \TMP4, \TMP1
1135 	pxor	  \XMM8, \XMM5
1136 	pxor	  \TMP6, \TMP2
1137 	pxor	  \TMP1, \TMP2
1138 	pxor	  \XMM5, \TMP2
1139 	movdqa	  \TMP2, \TMP3
1140 	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1141 	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1142 	pxor	  \TMP3, \XMM5
1143 	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1144 
1145         # first phase of reduction
1146 
1147 	movdqa    \XMM5, \TMP2
1148 	movdqa    \XMM5, \TMP3
1149 	movdqa    \XMM5, \TMP4
1150 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld     $31, \TMP2                   # packed left shift << 31
	pslld     $30, \TMP3                   # packed left shift << 30
	pslld     $25, \TMP4                   # packed left shift << 25
1154 	pxor      \TMP3, \TMP2	               # xor the shifted versions
1155 	pxor      \TMP4, \TMP2
1156 	movdqa    \TMP2, \TMP5
1157 	psrldq    $4, \TMP5                    # right shift T5 1 DW
1158 	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1159 	pxor      \TMP2, \XMM5
1160 
1161         # second phase of reduction
1162 
1163 	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1164 	movdqa    \XMM5,\TMP3
1165 	movdqa    \XMM5,\TMP4
	psrld     $1, \TMP2                    # packed right shift >>1
	psrld     $2, \TMP3                    # packed right shift >>2
	psrld     $7, \TMP4                    # packed right shift >>7
1169 	pxor      \TMP3,\TMP2		       # xor the shifted versions
1170 	pxor      \TMP4,\TMP2
1171 	pxor      \TMP5, \TMP2
1172 	pxor      \TMP2, \XMM5
	pxor      \TMP1, \XMM5                 # result is in XMM5
1174 
1175 	pxor	  \XMM5, \XMM1
1176 .endm
1177 
1178 /*
1179 * decrypt 4 blocks at a time
1180 * ghash the 4 previously decrypted ciphertext blocks
1181 * arg1, %arg3, %arg4 are used as pointers only, not modified
1182 * %r11 is the data offset value
1183 */
1184 .macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
1185 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1186 
1187 	movdqa	  \XMM1, \XMM5
1188 	movdqa	  \XMM2, \XMM6
1189 	movdqa	  \XMM3, \XMM7
1190 	movdqa	  \XMM4, \XMM8
1191 
1192         movdqa    SHUF_MASK(%rip), %xmm15
1193         # multiply TMP5 * HashKey using karatsuba
1194 
1195 	movdqa	  \XMM5, \TMP4
1196 	pshufd	  $78, \XMM5, \TMP6
1197 	pxor	  \XMM5, \TMP6
1198 	paddd     ONE(%rip), \XMM0		# INCR CNT
1199 	movdqu	  HashKey_4(%arg2), \TMP5
1200 	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
1201 	movdqa    \XMM0, \XMM1
1202 	paddd     ONE(%rip), \XMM0		# INCR CNT
1203 	movdqa    \XMM0, \XMM2
1204 	paddd     ONE(%rip), \XMM0		# INCR CNT
1205 	movdqa    \XMM0, \XMM3
1206 	paddd     ONE(%rip), \XMM0		# INCR CNT
1207 	movdqa    \XMM0, \XMM4
1208 	pshufb %xmm15, \XMM1	# perform a 16 byte swap
1209 	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1210 	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1211 	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1212 	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1213 
1214 	pxor	  (%arg1), \XMM1
1215 	pxor	  (%arg1), \XMM2
1216 	pxor	  (%arg1), \XMM3
1217 	pxor	  (%arg1), \XMM4
1218 	movdqu	  HashKey_4_k(%arg2), \TMP5
1219 	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1220 	movaps 0x10(%arg1), \TMP1
1221 	aesenc	  \TMP1, \XMM1              # Round 1
1222 	aesenc	  \TMP1, \XMM2
1223 	aesenc	  \TMP1, \XMM3
1224 	aesenc	  \TMP1, \XMM4
1225 	movaps 0x20(%arg1), \TMP1
1226 	aesenc	  \TMP1, \XMM1              # Round 2
1227 	aesenc	  \TMP1, \XMM2
1228 	aesenc	  \TMP1, \XMM3
1229 	aesenc	  \TMP1, \XMM4
1230 	movdqa	  \XMM6, \TMP1
1231 	pshufd	  $78, \XMM6, \TMP2
1232 	pxor	  \XMM6, \TMP2
1233 	movdqu	  HashKey_3(%arg2), \TMP5
1234 	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1235 	movaps 0x30(%arg1), \TMP3
1236 	aesenc    \TMP3, \XMM1              # Round 3
1237 	aesenc    \TMP3, \XMM2
1238 	aesenc    \TMP3, \XMM3
1239 	aesenc    \TMP3, \XMM4
1240 	pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1241 	movaps 0x40(%arg1), \TMP3
1242 	aesenc	  \TMP3, \XMM1              # Round 4
1243 	aesenc	  \TMP3, \XMM2
1244 	aesenc	  \TMP3, \XMM3
1245 	aesenc	  \TMP3, \XMM4
1246 	movdqu	  HashKey_3_k(%arg2), \TMP5
1247 	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1248 	movaps 0x50(%arg1), \TMP3
1249 	aesenc	  \TMP3, \XMM1              # Round 5
1250 	aesenc	  \TMP3, \XMM2
1251 	aesenc	  \TMP3, \XMM3
1252 	aesenc	  \TMP3, \XMM4
1253 	pxor	  \TMP1, \TMP4
1254 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1255 	pxor	  \XMM6, \XMM5
1256 	pxor	  \TMP2, \TMP6
1257 	movdqa	  \XMM7, \TMP1
1258 	pshufd	  $78, \XMM7, \TMP2
1259 	pxor	  \XMM7, \TMP2
1260 	movdqu	  HashKey_2(%arg2), \TMP5
1261 
1262         # Multiply TMP5 * HashKey using karatsuba
1263 
1264 	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1265 	movaps 0x60(%arg1), \TMP3
1266 	aesenc	  \TMP3, \XMM1              # Round 6
1267 	aesenc	  \TMP3, \XMM2
1268 	aesenc	  \TMP3, \XMM3
1269 	aesenc	  \TMP3, \XMM4
1270 	pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1271 	movaps 0x70(%arg1), \TMP3
1272 	aesenc	  \TMP3, \XMM1              # Round 7
1273 	aesenc	  \TMP3, \XMM2
1274 	aesenc	  \TMP3, \XMM3
1275 	aesenc	  \TMP3, \XMM4
1276 	movdqu	  HashKey_2_k(%arg2), \TMP5
1277 	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1278 	movaps 0x80(%arg1), \TMP3
1279 	aesenc	  \TMP3, \XMM1              # Round 8
1280 	aesenc	  \TMP3, \XMM2
1281 	aesenc	  \TMP3, \XMM3
1282 	aesenc	  \TMP3, \XMM4
1283 	pxor	  \TMP1, \TMP4
1284 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1285 	pxor	  \XMM7, \XMM5
1286 	pxor	  \TMP2, \TMP6
1287 
1288         # Multiply XMM8 * HashKey
1289         # XMM8 and TMP5 hold the values for the two operands
1290 
1291 	movdqa	  \XMM8, \TMP1
1292 	pshufd	  $78, \XMM8, \TMP2
1293 	pxor	  \XMM8, \TMP2
1294 	movdqu	  HashKey(%arg2), \TMP5
1295 	pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1296 	movaps 0x90(%arg1), \TMP3
1297 	aesenc	  \TMP3, \XMM1             # Round 9
1298 	aesenc	  \TMP3, \XMM2
1299 	aesenc	  \TMP3, \XMM3
1300 	aesenc	  \TMP3, \XMM4
1301 	pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1302 	lea	  0xa0(%arg1),%r10
1303 	mov	  keysize,%eax
1304 	shr	  $2,%eax		        # 128->4, 192->6, 256->8
1305 	sub	  $4,%eax			# 128->0, 192->2, 256->4
1306 	jz	  .Laes_loop_par_dec_done\@
1307 
1308 .Laes_loop_par_dec\@:
1309 	MOVADQ	  (%r10),\TMP3
1310 .irpc	index, 1234
1311 	aesenc	  \TMP3, %xmm\index
1312 .endr
1313 	add	  $16,%r10
1314 	sub	  $1,%eax
1315 	jnz	  .Laes_loop_par_dec\@
1316 
1317 .Laes_loop_par_dec_done\@:
1318 	MOVADQ	  (%r10), \TMP3
1319 	aesenclast \TMP3, \XMM1           # last round
1320 	aesenclast \TMP3, \XMM2
1321 	aesenclast \TMP3, \XMM3
1322 	aesenclast \TMP3, \XMM4
1323 	movdqu    HashKey_k(%arg2), \TMP5
1324 	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1325 	movdqu	  (%arg4,%r11,1), \TMP3
1326 	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1327 	movdqu	  \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
1328 	movdqa    \TMP3, \XMM1
1329 	movdqu	  16(%arg4,%r11,1), \TMP3
1330 	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1331 	movdqu	  \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
1332 	movdqa    \TMP3, \XMM2
1333 	movdqu	  32(%arg4,%r11,1), \TMP3
1334 	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1335 	movdqu	  \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
1336 	movdqa    \TMP3, \XMM3
1337 	movdqu	  48(%arg4,%r11,1), \TMP3
1338 	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1339 	movdqu	  \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
1340 	movdqa    \TMP3, \XMM4
1341 	pshufb %xmm15, \XMM1        # perform a 16 byte swap
1342 	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1343 	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1344 	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1345 
1346 	pxor	  \TMP4, \TMP1
1347 	pxor	  \XMM8, \XMM5
1348 	pxor	  \TMP6, \TMP2
1349 	pxor	  \TMP1, \TMP2
1350 	pxor	  \XMM5, \TMP2
1351 	movdqa	  \TMP2, \TMP3
1352 	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1353 	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1354 	pxor	  \TMP3, \XMM5
1355 	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1356 
1357         # first phase of reduction
1358 
1359 	movdqa    \XMM5, \TMP2
1360 	movdqa    \XMM5, \TMP3
1361 	movdqa    \XMM5, \TMP4
1362 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld     $31, \TMP2                   # packed left shift << 31
	pslld     $30, \TMP3                   # packed left shift << 30
	pslld     $25, \TMP4                   # packed left shift << 25
1366 	pxor      \TMP3, \TMP2	               # xor the shifted versions
1367 	pxor      \TMP4, \TMP2
1368 	movdqa    \TMP2, \TMP5
1369 	psrldq    $4, \TMP5                    # right shift T5 1 DW
1370 	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1371 	pxor      \TMP2, \XMM5
1372 
1373         # second phase of reduction
1374 
1375 	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1376 	movdqa    \XMM5,\TMP3
1377 	movdqa    \XMM5,\TMP4
	psrld     $1, \TMP2                    # packed right shift >>1
	psrld     $2, \TMP3                    # packed right shift >>2
	psrld     $7, \TMP4                    # packed right shift >>7
1381 	pxor      \TMP3,\TMP2		       # xor the shifted versions
1382 	pxor      \TMP4,\TMP2
1383 	pxor      \TMP5, \TMP2
1384 	pxor      \TMP2, \XMM5
	pxor      \TMP1, \XMM5                 # result is in XMM5
1386 
1387 	pxor	  \XMM5, \XMM1
1388 .endm
1389 
1390 /* GHASH the last 4 ciphertext blocks. */
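/*
* Sketch (illustrative only): folds the last four byte-reflected ciphertext
* blocks into a single GHASH value with one reduction (all additions XORs):
*	XMMDst = XMM1*H^4 + XMM2*H^3 + XMM3*H^2 + XMM4*H
*/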
1391 .macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1392 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1393 
1394         # Multiply TMP6 * HashKey (using Karatsuba)
1395 
1396 	movdqa	  \XMM1, \TMP6
1397 	pshufd	  $78, \XMM1, \TMP2
1398 	pxor	  \XMM1, \TMP2
1399 	movdqu	  HashKey_4(%arg2), \TMP5
1400 	pclmulqdq $0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1401 	pclmulqdq $0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1402 	movdqu	  HashKey_4_k(%arg2), \TMP4
1403 	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1404 	movdqa	  \XMM1, \XMMDst
1405 	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1406 
1407         # Multiply TMP1 * HashKey (using Karatsuba)
1408 
1409 	movdqa	  \XMM2, \TMP1
1410 	pshufd	  $78, \XMM2, \TMP2
1411 	pxor	  \XMM2, \TMP2
1412 	movdqu	  HashKey_3(%arg2), \TMP5
1413 	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1414 	pclmulqdq $0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1415 	movdqu	  HashKey_3_k(%arg2), \TMP4
1416 	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1417 	pxor	  \TMP1, \TMP6
1418 	pxor	  \XMM2, \XMMDst
1419 	pxor	  \TMP2, \XMM1
1420 # results accumulated in TMP6, XMMDst, XMM1
1421 
1422         # Multiply TMP1 * HashKey (using Karatsuba)
1423 
1424 	movdqa	  \XMM3, \TMP1
1425 	pshufd	  $78, \XMM3, \TMP2
1426 	pxor	  \XMM3, \TMP2
1427 	movdqu	  HashKey_2(%arg2), \TMP5
1428 	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1429 	pclmulqdq $0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1430 	movdqu	  HashKey_2_k(%arg2), \TMP4
1431 	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1432 	pxor	  \TMP1, \TMP6
1433 	pxor	  \XMM3, \XMMDst
1434 	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1435 
1436         # Multiply TMP1 * HashKey (using Karatsuba)
1437 	movdqa	  \XMM4, \TMP1
1438 	pshufd	  $78, \XMM4, \TMP2
1439 	pxor	  \XMM4, \TMP2
1440 	movdqu	  HashKey(%arg2), \TMP5
1441 	pclmulqdq $0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
1442 	pclmulqdq $0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1443 	movdqu	  HashKey_k(%arg2), \TMP4
1444 	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1445 	pxor	  \TMP1, \TMP6
1446 	pxor	  \XMM4, \XMMDst
1447 	pxor	  \XMM1, \TMP2
1448 	pxor	  \TMP6, \TMP2
1449 	pxor	  \XMMDst, \TMP2
1450 	# middle section of the temp results combined as in karatsuba algorithm
1451 	movdqa	  \TMP2, \TMP4
1452 	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
1453 	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
1454 	pxor	  \TMP4, \XMMDst
1455 	pxor	  \TMP2, \TMP6
1456 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1457 	# first phase of the reduction
1458 	movdqa    \XMMDst, \TMP2
1459 	movdqa    \XMMDst, \TMP3
1460 	movdqa    \XMMDst, \TMP4
1461 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld     $31, \TMP2                # packed left shifting << 31
	pslld     $30, \TMP3                # packed left shifting << 30
	pslld     $25, \TMP4                # packed left shifting << 25
1465 	pxor      \TMP3, \TMP2              # xor the shifted versions
1466 	pxor      \TMP4, \TMP2
1467 	movdqa    \TMP2, \TMP7
1468 	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1469 	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1470 	pxor      \TMP2, \XMMDst
1471 
1472         # second phase of the reduction
1473 	movdqa    \XMMDst, \TMP2
1474 	# make 3 copies of XMMDst for doing 3 shift operations
1475 	movdqa    \XMMDst, \TMP3
1476 	movdqa    \XMMDst, \TMP4
	psrld     $1, \TMP2                 # packed right shift >> 1
	psrld     $2, \TMP3                 # packed right shift >> 2
	psrld     $7, \TMP4                 # packed right shift >> 7
1480 	pxor      \TMP3, \TMP2              # xor the shifted versions
1481 	pxor      \TMP4, \TMP2
1482 	pxor      \TMP7, \TMP2
1483 	pxor      \TMP2, \XMMDst
1484 	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1485 .endm
1486 
1487 
1488 /* Encryption of a single block
1489 * uses eax & r10
1490 */
1491 
1492 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1493 
1494 	pxor		(%arg1), \XMM0
1495 	mov		keysize,%eax
1496 	shr		$2,%eax			# 128->4, 192->6, 256->8
1497 	add		$5,%eax			# 128->9, 192->11, 256->13
1498 	lea		16(%arg1), %r10	  # get first expanded key address
1499 
1500 _esb_loop_\@:
1501 	MOVADQ		(%r10),\TMP1
1502 	aesenc		\TMP1,\XMM0
1503 	add		$16,%r10
1504 	sub		$1,%eax
1505 	jnz		_esb_loop_\@
1506 
1507 	MOVADQ		(%r10),\TMP1
1508 	aesenclast	\TMP1,\XMM0
1509 .endm
1510 /*****************************************************************************
1511 * void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1512 *                   struct gcm_context_data *data
1513 *                                      // Context data
1514 *                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
1515 *                   const u8 *in,      // Ciphertext input
1516 *                   u64 plaintext_len, // Length of data in bytes for decryption.
1517 *                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1518 *                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1519 *                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1520 *                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1521 *                   const u8 *aad,     // Additional Authentication Data (AAD)
1522 *                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1523 *                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1524 *                                      // given authentication tag and only return the plaintext if they match.
1525 *                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1526 *                                      // (most likely), 12 or 8.
1527 *
1528 * Assumptions:
1529 *
1530 * keys:
1531 *       keys are pre-expanded and aligned to 16 bytes. we are using the first
1532 *       set of 11 keys in the data structure void *aes_ctx
1533 *
1534 * iv:
1535 *       0                   1                   2                   3
1536 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1537 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1538 *       |                             Salt  (From the SA)               |
1539 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1540 *       |                     Initialization Vector                     |
1541 *       |         (This is the sequence number from IPSec header)       |
1542 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1543 *       |                              0x1                              |
1544 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1545 *
1546 *
1547 *
1548 * AAD:
1549 *       AAD padded to 128 bits with 0
1550 *       for example, assume AAD is a u32 vector
1551 *
1552 *       if AAD is 8 bytes:
1553 *       AAD[3] = {A0, A1};
1554 *       padded AAD in xmm register = {A1 A0 0 0}
1555 *
1556 *       0                   1                   2                   3
1557 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1558 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1559 *       |                               SPI (A1)                        |
1560 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1561 *       |                     32-bit Sequence Number (A0)               |
1562 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1563 *       |                              0x0                              |
1564 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1565 *
1566 *                                       AAD Format with 32-bit Sequence Number
1567 *
1568 *       if AAD is 12 bytes:
1569 *       AAD[3] = {A0, A1, A2};
1570 *       padded AAD in xmm register = {A2 A1 A0 0}
1571 *
1572 *       0                   1                   2                   3
1573 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1574 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1575 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1576 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1577 *       |                               SPI (A2)                        |
1578 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1579 *       |                 64-bit Extended Sequence Number {A1,A0}       |
1580 *       |                                                               |
1581 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1582 *       |                              0x0                              |
1583 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1584 *
1585 *                        AAD Format with 64-bit Extended Sequence Number
1586 *
1587 * poly = x^128 + x^127 + x^126 + x^121 + 1
1588 *
1589 *****************************************************************************/
1590 SYM_FUNC_START(aesni_gcm_dec)
1591 	FUNC_SAVE
1592 
1593 	GCM_INIT %arg6, arg7, arg8, arg9
1594 	GCM_ENC_DEC dec
1595 	GCM_COMPLETE arg10, arg11
1596 	FUNC_RESTORE
1597 	RET
1598 SYM_FUNC_END(aesni_gcm_dec)
1599 
1600 
1601 /*****************************************************************************
1602 * void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1603 *                    struct gcm_context_data *data
1604 *                                        // Context data
1605 *                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1606 *                    const u8 *in,       // Plaintext input
1607 *                    u64 plaintext_len,  // Length of data in bytes for encryption.
1608 *                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1609 *                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1610 *                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1611 *                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1612 *                    const u8 *aad,      // Additional Authentication Data (AAD)
1613 *                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1614 *                    u8 *auth_tag,       // Authenticated Tag output.
1615 *                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1616 *                                        // 12 or 8.
1617 *
1618 * Assumptions:
1619 *
1620 * keys:
1621 *       Keys are pre-expanded and aligned to 16 bytes. We use the
1622 *       first set of 11 keys in the data structure void *aes_ctx.
1623 *
1624 *
1625 * iv:
1626 *       0                   1                   2                   3
1627 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1628 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1629 *       |                             Salt  (From the SA)               |
1630 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1631 *       |                     Initialization Vector                     |
1632 *       |         (This is the sequence number from IPSec header)       |
1633 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1634 *       |                              0x1                              |
1635 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1636 *
1637 *
1638 *
1639 * AAD:
1640 *       AAD padded to 128 bits with 0
1641 *       for example, assume AAD is a u32 vector
1642 *
1643 *       if AAD is 8 bytes:
1644 *       AAD[2] = {A0, A1};
1645 *       padded AAD in xmm register = {A1 A0 0 0}
1646 *
1647 *       0                   1                   2                   3
1648 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1649 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1650 *       |                               SPI (A1)                        |
1651 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1652 *       |                     32-bit Sequence Number (A0)               |
1653 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1654 *       |                              0x0                              |
1655 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1656 *
1657 *                                 AAD Format with 32-bit Sequence Number
1658 *
1659 *       if AAD is 12 bytes:
1660 *       AAD[3] = {A0, A1, A2};
1661 *       padded AAD in xmm register = {A2 A1 A0 0}
1662 *
1663 *       0                   1                   2                   3
1664 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1665 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1666 *       |                               SPI (A2)                        |
1667 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1668 *       |                 64-bit Extended Sequence Number {A1,A0}       |
1669 *       |                                                               |
1670 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1671 *       |                              0x0                              |
1672 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1673 *
1674 *                         AAD Format with 64-bit Extended Sequence Number
1675 *
1676 * poly = x^128 + x^127 + x^126 + x^121 + 1
1677 ***************************************************************************/
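/*
 * For reference, a rough C-level sketch of how the caller is expected to
 * assemble the 16-byte pre-counter block j0 described above (hypothetical
 * buffer names; salt comes from the SA, seq_iv from the ESP payload):
 *
 *	memcpy(j0,     salt,   4);		// 4-byte salt
 *	memcpy(j0 + 4, seq_iv, 8);		// 8-byte explicit IV
 *	put_unaligned_be32(1, j0 + 12);		// trailing 0x00000001
 */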
1678 SYM_FUNC_START(aesni_gcm_enc)
1679 	FUNC_SAVE
1680 
1681 	GCM_INIT %arg6, arg7, arg8, arg9
1682 	GCM_ENC_DEC enc
1683 
1684 	GCM_COMPLETE arg10, arg11
1685 	FUNC_RESTORE
1686 	RET
1687 SYM_FUNC_END(aesni_gcm_enc)
1688 
1689 /*****************************************************************************
1690 * void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1691 *                     struct gcm_context_data *data,
1692 *                                         // context data
1693 *                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1694 *                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1695 *                                         // concatenated with 0x00000001. 16-byte aligned pointer.
1696 *                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1697 *                     const u8 *aad,      // Additional Authentication Data (AAD)
1698 *                     u64 aad_len)        // Length of AAD in bytes.
1699 */
1700 SYM_FUNC_START(aesni_gcm_init)
1701 	FUNC_SAVE
1702 	GCM_INIT %arg3, %arg4, %arg5, %arg6
1703 	FUNC_RESTORE
1704 	RET
1705 SYM_FUNC_END(aesni_gcm_init)
1706 
1707 /*****************************************************************************
1708 * void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1709 *                    struct gcm_context_data *data,
1710 *                                        // context data
1711 *                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1712 *                    const u8 *in,       // Plaintext input
1713 *                    u64 plaintext_len)  // Length of data in bytes for encryption.
1714 */
1715 SYM_FUNC_START(aesni_gcm_enc_update)
1716 	FUNC_SAVE
1717 	GCM_ENC_DEC enc
1718 	FUNC_RESTORE
1719 	RET
1720 SYM_FUNC_END(aesni_gcm_enc_update)
1721 
1722 /*****************************************************************************
1723 * void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1724 *                    struct gcm_context_data *data,
1725 *                                        // context data
1726 *                    u8 *out,            // Plaintext output. Decrypt in-place is allowed.
1727 *                    const u8 *in,       // Ciphertext input
1728 *                    u64 plaintext_len)  // Length of data in bytes for decryption.
1729 */
1730 SYM_FUNC_START(aesni_gcm_dec_update)
1731 	FUNC_SAVE
1732 	GCM_ENC_DEC dec
1733 	FUNC_RESTORE
1734 	RET
1735 SYM_FUNC_END(aesni_gcm_dec_update)
1736 
1737 /*****************************************************************************
1738 * void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1739 *                    struct gcm_context_data *data,
1740 *                                        // context data
1741 *                    u8 *auth_tag,       // Authenticated Tag output.
1742 *                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1743 *                                        // 12 or 8.
1744 */
1745 SYM_FUNC_START(aesni_gcm_finalize)
1746 	FUNC_SAVE
1747 	GCM_COMPLETE %arg3, %arg4
1748 	FUNC_RESTORE
1749 	RET
1750 SYM_FUNC_END(aesni_gcm_finalize)
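
/*
 * Taken together, aesni_gcm_init/_enc_update/_dec_update/_finalize form a
 * streaming interface.  A rough sketch of one encryption pass, using the
 * prototypes documented above (hypothetical buffers; iv must already be
 * the 16-byte pre-counter block j0):
 *
 *	struct gcm_context_data data;
 *
 *	aesni_gcm_init(aes_ctx, &data, iv, hash_subkey, aad, aad_len);
 *	aesni_gcm_enc_update(aes_ctx, &data, out, in, plaintext_len);
 *	aesni_gcm_finalize(aes_ctx, &data, auth_tag, auth_tag_len);
 *
 * The _update step may be repeated over successive chunks before the tag
 * is produced by _finalize.
 */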
1751 
1752 #endif
1753 
1754 SYM_FUNC_START_LOCAL(_key_expansion_256a)
1755 	pshufd $0b11111111, %xmm1, %xmm1
1756 	shufps $0b00010000, %xmm0, %xmm4
1757 	pxor %xmm4, %xmm0
1758 	shufps $0b10001100, %xmm0, %xmm4
1759 	pxor %xmm4, %xmm0
1760 	pxor %xmm1, %xmm0
1761 	movaps %xmm0, (TKEYP)
1762 	add $0x10, TKEYP
1763 	RET
1764 SYM_FUNC_END(_key_expansion_256a)
1765 SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)
1766 
1767 SYM_FUNC_START_LOCAL(_key_expansion_192a)
1768 	pshufd $0b01010101, %xmm1, %xmm1
1769 	shufps $0b00010000, %xmm0, %xmm4
1770 	pxor %xmm4, %xmm0
1771 	shufps $0b10001100, %xmm0, %xmm4
1772 	pxor %xmm4, %xmm0
1773 	pxor %xmm1, %xmm0
1774 
1775 	movaps %xmm2, %xmm5
1776 	movaps %xmm2, %xmm6
1777 	pslldq $4, %xmm5
1778 	pshufd $0b11111111, %xmm0, %xmm3
1779 	pxor %xmm3, %xmm2
1780 	pxor %xmm5, %xmm2
1781 
1782 	movaps %xmm0, %xmm1
1783 	shufps $0b01000100, %xmm0, %xmm6
1784 	movaps %xmm6, (TKEYP)
1785 	shufps $0b01001110, %xmm2, %xmm1
1786 	movaps %xmm1, 0x10(TKEYP)
1787 	add $0x20, TKEYP
1788 	RET
1789 SYM_FUNC_END(_key_expansion_192a)
1790 
1791 SYM_FUNC_START_LOCAL(_key_expansion_192b)
1792 	pshufd $0b01010101, %xmm1, %xmm1
1793 	shufps $0b00010000, %xmm0, %xmm4
1794 	pxor %xmm4, %xmm0
1795 	shufps $0b10001100, %xmm0, %xmm4
1796 	pxor %xmm4, %xmm0
1797 	pxor %xmm1, %xmm0
1798 
1799 	movaps %xmm2, %xmm5
1800 	pslldq $4, %xmm5
1801 	pshufd $0b11111111, %xmm0, %xmm3
1802 	pxor %xmm3, %xmm2
1803 	pxor %xmm5, %xmm2
1804 
1805 	movaps %xmm0, (TKEYP)
1806 	add $0x10, TKEYP
1807 	RET
1808 SYM_FUNC_END(_key_expansion_192b)
1809 
1810 SYM_FUNC_START_LOCAL(_key_expansion_256b)
1811 	pshufd $0b10101010, %xmm1, %xmm1
1812 	shufps $0b00010000, %xmm2, %xmm4
1813 	pxor %xmm4, %xmm2
1814 	shufps $0b10001100, %xmm2, %xmm4
1815 	pxor %xmm4, %xmm2
1816 	pxor %xmm1, %xmm2
1817 	movaps %xmm2, (TKEYP)
1818 	add $0x10, TKEYP
1819 	RET
1820 SYM_FUNC_END(_key_expansion_256b)
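
/*
 * The _key_expansion_* helpers above each perform one step of the
 * FIPS-197 key schedule.  Roughly, for AES-128 the recurrence over
 * 32-bit words w[] is:
 *
 *	w[i] = w[i-4] ^ SubWord(RotWord(w[i-1])) ^ Rcon[i/4]	(i % 4 == 0)
 *	w[i] = w[i-4] ^ w[i-1]					(otherwise)
 *
 * aeskeygenassist supplies the SubWord/RotWord/Rcon term (broadcast by
 * pshufd), while the shufps/pxor pairs use the zeroed %xmm4 to build the
 * running xor of the previous round key's words without leaving XMM
 * registers.  The 192/256-bit variants follow the same pattern with
 * larger strides.
 */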
1821 
1822 /*
1823  * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1824  *                   unsigned int key_len)
1825  */
1826 SYM_FUNC_START(aesni_set_key)
1827 	FRAME_BEGIN
1828 #ifndef __x86_64__
1829 	pushl KEYP
1830 	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
1831 	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
1832 	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
1833 #endif
1834 	movups (UKEYP), %xmm0		# user key (first 16 bytes)
1835 	movaps %xmm0, (KEYP)
1836 	lea 0x10(KEYP), TKEYP		# key addr
1837 	movl %edx, 480(KEYP)
1838 	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
1839 	cmp $24, %dl
1840 	jb .Lenc_key128
1841 	je .Lenc_key192
1842 	movups 0x10(UKEYP), %xmm2	# other user key
1843 	movaps %xmm2, (TKEYP)
1844 	add $0x10, TKEYP
1845 	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
1846 	call _key_expansion_256a
1847 	aeskeygenassist $0x1, %xmm0, %xmm1
1848 	call _key_expansion_256b
1849 	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
1850 	call _key_expansion_256a
1851 	aeskeygenassist $0x2, %xmm0, %xmm1
1852 	call _key_expansion_256b
1853 	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
1854 	call _key_expansion_256a
1855 	aeskeygenassist $0x4, %xmm0, %xmm1
1856 	call _key_expansion_256b
1857 	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
1858 	call _key_expansion_256a
1859 	aeskeygenassist $0x8, %xmm0, %xmm1
1860 	call _key_expansion_256b
1861 	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
1862 	call _key_expansion_256a
1863 	aeskeygenassist $0x10, %xmm0, %xmm1
1864 	call _key_expansion_256b
1865 	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
1866 	call _key_expansion_256a
1867 	aeskeygenassist $0x20, %xmm0, %xmm1
1868 	call _key_expansion_256b
1869 	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
1870 	call _key_expansion_256a
1871 	jmp .Ldec_key
1872 .Lenc_key192:
1873 	movq 0x10(UKEYP), %xmm2		# other user key
1874 	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
1875 	call _key_expansion_192a
1876 	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
1877 	call _key_expansion_192b
1878 	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
1879 	call _key_expansion_192a
1880 	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
1881 	call _key_expansion_192b
1882 	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
1883 	call _key_expansion_192a
1884 	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
1885 	call _key_expansion_192b
1886 	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
1887 	call _key_expansion_192a
1888 	aeskeygenassist $0x80, %xmm2, %xmm1	# round 8
1889 	call _key_expansion_192b
1890 	jmp .Ldec_key
1891 .Lenc_key128:
1892 	aeskeygenassist $0x1, %xmm0, %xmm1	# round 1
1893 	call _key_expansion_128
1894 	aeskeygenassist $0x2, %xmm0, %xmm1	# round 2
1895 	call _key_expansion_128
1896 	aeskeygenassist $0x4, %xmm0, %xmm1	# round 3
1897 	call _key_expansion_128
1898 	aeskeygenassist $0x8, %xmm0, %xmm1	# round 4
1899 	call _key_expansion_128
1900 	aeskeygenassist $0x10, %xmm0, %xmm1	# round 5
1901 	call _key_expansion_128
1902 	aeskeygenassist $0x20, %xmm0, %xmm1	# round 6
1903 	call _key_expansion_128
1904 	aeskeygenassist $0x40, %xmm0, %xmm1	# round 7
1905 	call _key_expansion_128
1906 	aeskeygenassist $0x80, %xmm0, %xmm1	# round 8
1907 	call _key_expansion_128
1908 	aeskeygenassist $0x1b, %xmm0, %xmm1	# round 9
1909 	call _key_expansion_128
1910 	aeskeygenassist $0x36, %xmm0, %xmm1	# round 10
1911 	call _key_expansion_128
1912 .Ldec_key:
1913 	sub $0x10, TKEYP
1914 	movaps (KEYP), %xmm0
1915 	movaps (TKEYP), %xmm1
1916 	movaps %xmm0, 240(TKEYP)
1917 	movaps %xmm1, 240(KEYP)
1918 	add $0x10, KEYP
1919 	lea 240-16(TKEYP), UKEYP
1920 .align 4
1921 .Ldec_key_loop:
1922 	movaps (KEYP), %xmm0
1923 	aesimc %xmm0, %xmm1
1924 	movaps %xmm1, (UKEYP)
1925 	add $0x10, KEYP
1926 	sub $0x10, UKEYP
1927 	cmp TKEYP, KEYP
1928 	jb .Ldec_key_loop
1929 	xor AREG, AREG
1930 #ifndef __x86_64__
1931 	popl KEYP
1932 #endif
1933 	FRAME_END
1934 	RET
1935 SYM_FUNC_END(aesni_set_key)
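
/*
 * Note on the .Ldec_key tail above: it derives the decryption schedule
 * for the "equivalent inverse cipher".  In rough C terms (the decryption
 * keys live 240 bytes past the encryption keys, hence the add $240 seen
 * before calls to _aesni_dec1/_aesni_dec4 elsewhere in this file):
 *
 *	dec_key[0]       = enc_key[nrounds];
 *	for (i = 1; i < nrounds; i++)
 *		dec_key[i] = InvMixColumns(enc_key[nrounds - i]);	// aesimc
 *	dec_key[nrounds] = enc_key[0];
 */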
1936 
1937 /*
1938  * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
1939  */
1940 SYM_FUNC_START(aesni_enc)
1941 	FRAME_BEGIN
1942 #ifndef __x86_64__
1943 	pushl KEYP
1944 	pushl KLEN
1945 	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
1946 	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
1947 	movl (FRAME_OFFSET+20)(%esp), INP	# src
1948 #endif
1949 	movl 480(KEYP), KLEN		# key length
1950 	movups (INP), STATE		# input
1951 	call _aesni_enc1
1952 	movups STATE, (OUTP)		# output
1953 #ifndef __x86_64__
1954 	popl KLEN
1955 	popl KEYP
1956 #endif
1957 	FRAME_END
1958 	RET
1959 SYM_FUNC_END(aesni_enc)
1960 
1961 /*
1962  * _aesni_enc1:		internal ABI
1963  * input:
1964  *	KEYP:		key struct pointer
1965  *	KLEN:		key length
1966  *	STATE:		initial state (input)
1967  * output:
1968  *	STATE:		final state (output)
1969  * changed:
1970  *	KEY
1971  *	TKEYP (T1)
1972  */
1973 SYM_FUNC_START_LOCAL(_aesni_enc1)
1974 	movaps (KEYP), KEY		# key
1975 	mov KEYP, TKEYP
1976 	pxor KEY, STATE		# round 0
1977 	add $0x30, TKEYP
1978 	cmp $24, KLEN
1979 	jb .Lenc128
1980 	lea 0x20(TKEYP), TKEYP
1981 	je .Lenc192
1982 	add $0x20, TKEYP
1983 	movaps -0x60(TKEYP), KEY
1984 	aesenc KEY, STATE
1985 	movaps -0x50(TKEYP), KEY
1986 	aesenc KEY, STATE
1987 .align 4
1988 .Lenc192:
1989 	movaps -0x40(TKEYP), KEY
1990 	aesenc KEY, STATE
1991 	movaps -0x30(TKEYP), KEY
1992 	aesenc KEY, STATE
1993 .align 4
1994 .Lenc128:
1995 	movaps -0x20(TKEYP), KEY
1996 	aesenc KEY, STATE
1997 	movaps -0x10(TKEYP), KEY
1998 	aesenc KEY, STATE
1999 	movaps (TKEYP), KEY
2000 	aesenc KEY, STATE
2001 	movaps 0x10(TKEYP), KEY
2002 	aesenc KEY, STATE
2003 	movaps 0x20(TKEYP), KEY
2004 	aesenc KEY, STATE
2005 	movaps 0x30(TKEYP), KEY
2006 	aesenc KEY, STATE
2007 	movaps 0x40(TKEYP), KEY
2008 	aesenc KEY, STATE
2009 	movaps 0x50(TKEYP), KEY
2010 	aesenc KEY, STATE
2011 	movaps 0x60(TKEYP), KEY
2012 	aesenc KEY, STATE
2013 	movaps 0x70(TKEYP), KEY
2014 	aesenclast KEY, STATE
2015 	RET
2016 SYM_FUNC_END(_aesni_enc1)
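
/*
 * In rough C terms (nrounds is 10/12/14 for 128/192/256-bit keys; the
 * jb/je above simply skip the rounds a shorter key does not have):
 *
 *	state ^= rk[0];					// whitening
 *	for (i = 1; i < nrounds; i++)
 *		state = AESENC(state, rk[i]);		// full round
 *	state = AESENCLAST(state, rk[nrounds]);		// final round, no MixColumns
 */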
2017 
2018 /*
2019  * _aesni_enc4:	internal ABI
2020  * input:
2021  *	KEYP:		key struct pointer
2022  *	KLEN:		key length
2023  *	STATE1:		initial state (input)
2024  *	STATE2
2025  *	STATE3
2026  *	STATE4
2027  * output:
2028  *	STATE1:		final state (output)
2029  *	STATE2
2030  *	STATE3
2031  *	STATE4
2032  * changed:
2033  *	KEY
2034  *	TKEYP (T1)
2035  */
2036 SYM_FUNC_START_LOCAL(_aesni_enc4)
2037 	movaps (KEYP), KEY		# key
2038 	mov KEYP, TKEYP
2039 	pxor KEY, STATE1		# round 0
2040 	pxor KEY, STATE2
2041 	pxor KEY, STATE3
2042 	pxor KEY, STATE4
2043 	add $0x30, TKEYP
2044 	cmp $24, KLEN
2045 	jb .L4enc128
2046 	lea 0x20(TKEYP), TKEYP
2047 	je .L4enc192
2048 	add $0x20, TKEYP
2049 	movaps -0x60(TKEYP), KEY
2050 	aesenc KEY, STATE1
2051 	aesenc KEY, STATE2
2052 	aesenc KEY, STATE3
2053 	aesenc KEY, STATE4
2054 	movaps -0x50(TKEYP), KEY
2055 	aesenc KEY, STATE1
2056 	aesenc KEY, STATE2
2057 	aesenc KEY, STATE3
2058 	aesenc KEY, STATE4
2059 #.align 4
2060 .L4enc192:
2061 	movaps -0x40(TKEYP), KEY
2062 	aesenc KEY, STATE1
2063 	aesenc KEY, STATE2
2064 	aesenc KEY, STATE3
2065 	aesenc KEY, STATE4
2066 	movaps -0x30(TKEYP), KEY
2067 	aesenc KEY, STATE1
2068 	aesenc KEY, STATE2
2069 	aesenc KEY, STATE3
2070 	aesenc KEY, STATE4
2071 #.align 4
2072 .L4enc128:
2073 	movaps -0x20(TKEYP), KEY
2074 	aesenc KEY, STATE1
2075 	aesenc KEY, STATE2
2076 	aesenc KEY, STATE3
2077 	aesenc KEY, STATE4
2078 	movaps -0x10(TKEYP), KEY
2079 	aesenc KEY, STATE1
2080 	aesenc KEY, STATE2
2081 	aesenc KEY, STATE3
2082 	aesenc KEY, STATE4
2083 	movaps (TKEYP), KEY
2084 	aesenc KEY, STATE1
2085 	aesenc KEY, STATE2
2086 	aesenc KEY, STATE3
2087 	aesenc KEY, STATE4
2088 	movaps 0x10(TKEYP), KEY
2089 	aesenc KEY, STATE1
2090 	aesenc KEY, STATE2
2091 	aesenc KEY, STATE3
2092 	aesenc KEY, STATE4
2093 	movaps 0x20(TKEYP), KEY
2094 	aesenc KEY, STATE1
2095 	aesenc KEY, STATE2
2096 	aesenc KEY, STATE3
2097 	aesenc KEY, STATE4
2098 	movaps 0x30(TKEYP), KEY
2099 	aesenc KEY, STATE1
2100 	aesenc KEY, STATE2
2101 	aesenc KEY, STATE3
2102 	aesenc KEY, STATE4
2103 	movaps 0x40(TKEYP), KEY
2104 	aesenc KEY, STATE1
2105 	aesenc KEY, STATE2
2106 	aesenc KEY, STATE3
2107 	aesenc KEY, STATE4
2108 	movaps 0x50(TKEYP), KEY
2109 	aesenc KEY, STATE1
2110 	aesenc KEY, STATE2
2111 	aesenc KEY, STATE3
2112 	aesenc KEY, STATE4
2113 	movaps 0x60(TKEYP), KEY
2114 	aesenc KEY, STATE1
2115 	aesenc KEY, STATE2
2116 	aesenc KEY, STATE3
2117 	aesenc KEY, STATE4
2118 	movaps 0x70(TKEYP), KEY
2119 	aesenclast KEY, STATE1		# last round
2120 	aesenclast KEY, STATE2
2121 	aesenclast KEY, STATE3
2122 	aesenclast KEY, STATE4
2123 	RET
2124 SYM_FUNC_END(_aesni_enc4)
2125 
2126 /*
2127  * void aesni_dec(const void *ctx, u8 *dst, const u8 *src)
2128  */
2129 SYM_FUNC_START(aesni_dec)
2130 	FRAME_BEGIN
2131 #ifndef __x86_64__
2132 	pushl KEYP
2133 	pushl KLEN
2134 	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
2135 	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
2136 	movl (FRAME_OFFSET+20)(%esp), INP	# src
2137 #endif
2138 	mov 480(KEYP), KLEN		# key length
2139 	add $240, KEYP
2140 	movups (INP), STATE		# input
2141 	call _aesni_dec1
2142 	movups STATE, (OUTP)		# output
2143 #ifndef __x86_64__
2144 	popl KLEN
2145 	popl KEYP
2146 #endif
2147 	FRAME_END
2148 	RET
2149 SYM_FUNC_END(aesni_dec)
2150 
2151 /*
2152  * _aesni_dec1:		internal ABI
2153  * input:
2154  *	KEYP:		key struct pointer
2155  *	KLEN:		key length
2156  *	STATE:		initial state (input)
2157  * output:
2158  *	STATE:		final state (output)
2159  * changed:
2160  *	KEY
2161  *	TKEYP (T1)
2162  */
2163 SYM_FUNC_START_LOCAL(_aesni_dec1)
2164 	movaps (KEYP), KEY		# key
2165 	mov KEYP, TKEYP
2166 	pxor KEY, STATE		# round 0
2167 	add $0x30, TKEYP
2168 	cmp $24, KLEN
2169 	jb .Ldec128
2170 	lea 0x20(TKEYP), TKEYP
2171 	je .Ldec192
2172 	add $0x20, TKEYP
2173 	movaps -0x60(TKEYP), KEY
2174 	aesdec KEY, STATE
2175 	movaps -0x50(TKEYP), KEY
2176 	aesdec KEY, STATE
2177 .align 4
2178 .Ldec192:
2179 	movaps -0x40(TKEYP), KEY
2180 	aesdec KEY, STATE
2181 	movaps -0x30(TKEYP), KEY
2182 	aesdec KEY, STATE
2183 .align 4
2184 .Ldec128:
2185 	movaps -0x20(TKEYP), KEY
2186 	aesdec KEY, STATE
2187 	movaps -0x10(TKEYP), KEY
2188 	aesdec KEY, STATE
2189 	movaps (TKEYP), KEY
2190 	aesdec KEY, STATE
2191 	movaps 0x10(TKEYP), KEY
2192 	aesdec KEY, STATE
2193 	movaps 0x20(TKEYP), KEY
2194 	aesdec KEY, STATE
2195 	movaps 0x30(TKEYP), KEY
2196 	aesdec KEY, STATE
2197 	movaps 0x40(TKEYP), KEY
2198 	aesdec KEY, STATE
2199 	movaps 0x50(TKEYP), KEY
2200 	aesdec KEY, STATE
2201 	movaps 0x60(TKEYP), KEY
2202 	aesdec KEY, STATE
2203 	movaps 0x70(TKEYP), KEY
2204 	aesdeclast KEY, STATE
2205 	RET
2206 SYM_FUNC_END(_aesni_dec1)
2207 
2208 /*
2209  * _aesni_dec4:	internal ABI
2210  * input:
2211  *	KEYP:		key struct pointer
2212  *	KLEN:		key length
2213  *	STATE1:		initial state (input)
2214  *	STATE2
2215  *	STATE3
2216  *	STATE4
2217  * output:
2218  *	STATE1:		final state (output)
2219  *	STATE2
2220  *	STATE3
2221  *	STATE4
2222  * changed:
2223  *	KEY
2224  *	TKEYP (T1)
2225  */
2226 SYM_FUNC_START_LOCAL(_aesni_dec4)
2227 	movaps (KEYP), KEY		# key
2228 	mov KEYP, TKEYP
2229 	pxor KEY, STATE1		# round 0
2230 	pxor KEY, STATE2
2231 	pxor KEY, STATE3
2232 	pxor KEY, STATE4
2233 	add $0x30, TKEYP
2234 	cmp $24, KLEN
2235 	jb .L4dec128
2236 	lea 0x20(TKEYP), TKEYP
2237 	je .L4dec192
2238 	add $0x20, TKEYP
2239 	movaps -0x60(TKEYP), KEY
2240 	aesdec KEY, STATE1
2241 	aesdec KEY, STATE2
2242 	aesdec KEY, STATE3
2243 	aesdec KEY, STATE4
2244 	movaps -0x50(TKEYP), KEY
2245 	aesdec KEY, STATE1
2246 	aesdec KEY, STATE2
2247 	aesdec KEY, STATE3
2248 	aesdec KEY, STATE4
2249 .align 4
2250 .L4dec192:
2251 	movaps -0x40(TKEYP), KEY
2252 	aesdec KEY, STATE1
2253 	aesdec KEY, STATE2
2254 	aesdec KEY, STATE3
2255 	aesdec KEY, STATE4
2256 	movaps -0x30(TKEYP), KEY
2257 	aesdec KEY, STATE1
2258 	aesdec KEY, STATE2
2259 	aesdec KEY, STATE3
2260 	aesdec KEY, STATE4
2261 .align 4
2262 .L4dec128:
2263 	movaps -0x20(TKEYP), KEY
2264 	aesdec KEY, STATE1
2265 	aesdec KEY, STATE2
2266 	aesdec KEY, STATE3
2267 	aesdec KEY, STATE4
2268 	movaps -0x10(TKEYP), KEY
2269 	aesdec KEY, STATE1
2270 	aesdec KEY, STATE2
2271 	aesdec KEY, STATE3
2272 	aesdec KEY, STATE4
2273 	movaps (TKEYP), KEY
2274 	aesdec KEY, STATE1
2275 	aesdec KEY, STATE2
2276 	aesdec KEY, STATE3
2277 	aesdec KEY, STATE4
2278 	movaps 0x10(TKEYP), KEY
2279 	aesdec KEY, STATE1
2280 	aesdec KEY, STATE2
2281 	aesdec KEY, STATE3
2282 	aesdec KEY, STATE4
2283 	movaps 0x20(TKEYP), KEY
2284 	aesdec KEY, STATE1
2285 	aesdec KEY, STATE2
2286 	aesdec KEY, STATE3
2287 	aesdec KEY, STATE4
2288 	movaps 0x30(TKEYP), KEY
2289 	aesdec KEY, STATE1
2290 	aesdec KEY, STATE2
2291 	aesdec KEY, STATE3
2292 	aesdec KEY, STATE4
2293 	movaps 0x40(TKEYP), KEY
2294 	aesdec KEY, STATE1
2295 	aesdec KEY, STATE2
2296 	aesdec KEY, STATE3
2297 	aesdec KEY, STATE4
2298 	movaps 0x50(TKEYP), KEY
2299 	aesdec KEY, STATE1
2300 	aesdec KEY, STATE2
2301 	aesdec KEY, STATE3
2302 	aesdec KEY, STATE4
2303 	movaps 0x60(TKEYP), KEY
2304 	aesdec KEY, STATE1
2305 	aesdec KEY, STATE2
2306 	aesdec KEY, STATE3
2307 	aesdec KEY, STATE4
2308 	movaps 0x70(TKEYP), KEY
2309 	aesdeclast KEY, STATE1		# last round
2310 	aesdeclast KEY, STATE2
2311 	aesdeclast KEY, STATE3
2312 	aesdeclast KEY, STATE4
2313 	RET
2314 SYM_FUNC_END(_aesni_dec4)
2315 
2316 /*
2317  * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2318  *		      size_t len)
2319  */
2320 SYM_FUNC_START(aesni_ecb_enc)
2321 	FRAME_BEGIN
2322 #ifndef __x86_64__
2323 	pushl LEN
2324 	pushl KEYP
2325 	pushl KLEN
2326 	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
2327 	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
2328 	movl (FRAME_OFFSET+24)(%esp), INP	# src
2329 	movl (FRAME_OFFSET+28)(%esp), LEN	# len
2330 #endif
2331 	test LEN, LEN		# check length
2332 	jz .Lecb_enc_ret
2333 	mov 480(KEYP), KLEN
2334 	cmp $16, LEN
2335 	jb .Lecb_enc_ret
2336 	cmp $64, LEN
2337 	jb .Lecb_enc_loop1
2338 .align 4
2339 .Lecb_enc_loop4:
2340 	movups (INP), STATE1
2341 	movups 0x10(INP), STATE2
2342 	movups 0x20(INP), STATE3
2343 	movups 0x30(INP), STATE4
2344 	call _aesni_enc4
2345 	movups STATE1, (OUTP)
2346 	movups STATE2, 0x10(OUTP)
2347 	movups STATE3, 0x20(OUTP)
2348 	movups STATE4, 0x30(OUTP)
2349 	sub $64, LEN
2350 	add $64, INP
2351 	add $64, OUTP
2352 	cmp $64, LEN
2353 	jge .Lecb_enc_loop4
2354 	cmp $16, LEN
2355 	jb .Lecb_enc_ret
2356 .align 4
2357 .Lecb_enc_loop1:
2358 	movups (INP), STATE1
2359 	call _aesni_enc1
2360 	movups STATE1, (OUTP)
2361 	sub $16, LEN
2362 	add $16, INP
2363 	add $16, OUTP
2364 	cmp $16, LEN
2365 	jge .Lecb_enc_loop1
2366 .Lecb_enc_ret:
2367 #ifndef __x86_64__
2368 	popl KLEN
2369 	popl KEYP
2370 	popl LEN
2371 #endif
2372 	FRAME_END
2373 	RET
2374 SYM_FUNC_END(aesni_ecb_enc)
2375 
2376 /*
2377  * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2378  *		      size_t len);
2379  */
2380 SYM_FUNC_START(aesni_ecb_dec)
2381 	FRAME_BEGIN
2382 #ifndef __x86_64__
2383 	pushl LEN
2384 	pushl KEYP
2385 	pushl KLEN
2386 	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
2387 	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
2388 	movl (FRAME_OFFSET+24)(%esp), INP	# src
2389 	movl (FRAME_OFFSET+28)(%esp), LEN	# len
2390 #endif
2391 	test LEN, LEN
2392 	jz .Lecb_dec_ret
2393 	mov 480(KEYP), KLEN
2394 	add $240, KEYP
2395 	cmp $16, LEN
2396 	jb .Lecb_dec_ret
2397 	cmp $64, LEN
2398 	jb .Lecb_dec_loop1
2399 .align 4
2400 .Lecb_dec_loop4:
2401 	movups (INP), STATE1
2402 	movups 0x10(INP), STATE2
2403 	movups 0x20(INP), STATE3
2404 	movups 0x30(INP), STATE4
2405 	call _aesni_dec4
2406 	movups STATE1, (OUTP)
2407 	movups STATE2, 0x10(OUTP)
2408 	movups STATE3, 0x20(OUTP)
2409 	movups STATE4, 0x30(OUTP)
2410 	sub $64, LEN
2411 	add $64, INP
2412 	add $64, OUTP
2413 	cmp $64, LEN
2414 	jge .Lecb_dec_loop4
2415 	cmp $16, LEN
2416 	jb .Lecb_dec_ret
2417 .align 4
2418 .Lecb_dec_loop1:
2419 	movups (INP), STATE1
2420 	call _aesni_dec1
2421 	movups STATE1, (OUTP)
2422 	sub $16, LEN
2423 	add $16, INP
2424 	add $16, OUTP
2425 	cmp $16, LEN
2426 	jge .Lecb_dec_loop1
2427 .Lecb_dec_ret:
2428 #ifndef __x86_64__
2429 	popl KLEN
2430 	popl KEYP
2431 	popl LEN
2432 #endif
2433 	FRAME_END
2434 	RET
2435 SYM_FUNC_END(aesni_ecb_dec)
2436 
2437 /*
2438  * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2439  *		      size_t len, u8 *iv)
2440  */
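/*
 * A rough C-level sketch of the loop below (hypothetical helpers; only
 * full 16-byte blocks are processed and the last ciphertext block is
 * written back to *iv so chaining can continue across calls):
 *
 *	while (len >= 16) {
 *		xor_block(iv, src);		// iv ^= P_i
 *		aes_encrypt(ctx, iv, iv);	// iv  = E_K(iv) = C_i
 *		memcpy(dst, iv, 16);
 *		src += 16; dst += 16; len -= 16;
 *	}
 */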
2441 SYM_FUNC_START(aesni_cbc_enc)
2442 	FRAME_BEGIN
2443 #ifndef __x86_64__
2444 	pushl IVP
2445 	pushl LEN
2446 	pushl KEYP
2447 	pushl KLEN
2448 	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2449 	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2450 	movl (FRAME_OFFSET+28)(%esp), INP	# src
2451 	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2452 	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2453 #endif
2454 	cmp $16, LEN
2455 	jb .Lcbc_enc_ret
2456 	mov 480(KEYP), KLEN
2457 	movups (IVP), STATE	# load iv as initial state
2458 .align 4
2459 .Lcbc_enc_loop:
2460 	movups (INP), IN	# load input
2461 	pxor IN, STATE
2462 	call _aesni_enc1
2463 	movups STATE, (OUTP)	# store output
2464 	sub $16, LEN
2465 	add $16, INP
2466 	add $16, OUTP
2467 	cmp $16, LEN
2468 	jge .Lcbc_enc_loop
2469 	movups STATE, (IVP)
2470 .Lcbc_enc_ret:
2471 #ifndef __x86_64__
2472 	popl KLEN
2473 	popl KEYP
2474 	popl LEN
2475 	popl IVP
2476 #endif
2477 	FRAME_END
2478 	RET
2479 SYM_FUNC_END(aesni_cbc_enc)
2480 
2481 /*
2482  * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2483  *		      size_t len, u8 *iv)
2484  */
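/*
 * Decryption inverts the chaining:  P_i = D_K(C_i) ^ C_{i-1}, with
 * C_0 = IV.  Unlike encryption there is no serial dependency between
 * blocks, so the loop below decrypts four blocks at a time and xors the
 * saved ciphertext blocks in afterwards.
 */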
2485 SYM_FUNC_START(aesni_cbc_dec)
2486 	FRAME_BEGIN
2487 #ifndef __x86_64__
2488 	pushl IVP
2489 	pushl LEN
2490 	pushl KEYP
2491 	pushl KLEN
2492 	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2493 	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2494 	movl (FRAME_OFFSET+28)(%esp), INP	# src
2495 	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2496 	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2497 #endif
2498 	cmp $16, LEN
2499 	jb .Lcbc_dec_just_ret
2500 	mov 480(KEYP), KLEN
2501 	add $240, KEYP
2502 	movups (IVP), IV
2503 	cmp $64, LEN
2504 	jb .Lcbc_dec_loop1
2505 .align 4
2506 .Lcbc_dec_loop4:
2507 	movups (INP), IN1
2508 	movaps IN1, STATE1
2509 	movups 0x10(INP), IN2
2510 	movaps IN2, STATE2
2511 #ifdef __x86_64__
2512 	movups 0x20(INP), IN3
2513 	movaps IN3, STATE3
2514 	movups 0x30(INP), IN4
2515 	movaps IN4, STATE4
2516 #else
2517 	movups 0x20(INP), IN1
2518 	movaps IN1, STATE3
2519 	movups 0x30(INP), IN2
2520 	movaps IN2, STATE4
2521 #endif
2522 	call _aesni_dec4
2523 	pxor IV, STATE1
2524 #ifdef __x86_64__
2525 	pxor IN1, STATE2
2526 	pxor IN2, STATE3
2527 	pxor IN3, STATE4
2528 	movaps IN4, IV
2529 #else
2530 	pxor IN1, STATE4
2531 	movaps IN2, IV
2532 	movups (INP), IN1
2533 	pxor IN1, STATE2
2534 	movups 0x10(INP), IN2
2535 	pxor IN2, STATE3
2536 #endif
2537 	movups STATE1, (OUTP)
2538 	movups STATE2, 0x10(OUTP)
2539 	movups STATE3, 0x20(OUTP)
2540 	movups STATE4, 0x30(OUTP)
2541 	sub $64, LEN
2542 	add $64, INP
2543 	add $64, OUTP
2544 	cmp $64, LEN
2545 	jge .Lcbc_dec_loop4
2546 	cmp $16, LEN
2547 	jb .Lcbc_dec_ret
2548 .align 4
2549 .Lcbc_dec_loop1:
2550 	movups (INP), IN
2551 	movaps IN, STATE
2552 	call _aesni_dec1
2553 	pxor IV, STATE
2554 	movups STATE, (OUTP)
2555 	movaps IN, IV
2556 	sub $16, LEN
2557 	add $16, INP
2558 	add $16, OUTP
2559 	cmp $16, LEN
2560 	jge .Lcbc_dec_loop1
2561 .Lcbc_dec_ret:
2562 	movups IV, (IVP)
2563 .Lcbc_dec_just_ret:
2564 #ifndef __x86_64__
2565 	popl KLEN
2566 	popl KEYP
2567 	popl LEN
2568 	popl IVP
2569 #endif
2570 	FRAME_END
2571 	RET
2572 SYM_FUNC_END(aesni_cbc_dec)
2573 
2574 /*
2575  * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2576  *			  size_t len, u8 *iv)
2577  */
2578 SYM_FUNC_START(aesni_cts_cbc_enc)
2579 	FRAME_BEGIN
2580 #ifndef __x86_64__
2581 	pushl IVP
2582 	pushl LEN
2583 	pushl KEYP
2584 	pushl KLEN
2585 	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2586 	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2587 	movl (FRAME_OFFSET+28)(%esp), INP	# src
2588 	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2589 	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2590 	lea .Lcts_permute_table, T1
2591 #else
2592 	lea .Lcts_permute_table(%rip), T1
2593 #endif
2594 	mov 480(KEYP), KLEN
2595 	movups (IVP), STATE
2596 	sub $16, LEN
2597 	mov T1, IVP
2598 	add $32, IVP
2599 	add LEN, T1
2600 	sub LEN, IVP
2601 	movups (T1), %xmm4
2602 	movups (IVP), %xmm5
2603 
2604 	movups (INP), IN1
2605 	add LEN, INP
2606 	movups (INP), IN2
2607 
2608 	pxor IN1, STATE
2609 	call _aesni_enc1
2610 
2611 	pshufb %xmm5, IN2
2612 	pxor STATE, IN2
2613 	pshufb %xmm4, STATE
2614 	add OUTP, LEN
2615 	movups STATE, (LEN)
2616 
2617 	movaps IN2, STATE
2618 	call _aesni_enc1
2619 	movups STATE, (OUTP)
2620 
2621 #ifndef __x86_64__
2622 	popl KLEN
2623 	popl KEYP
2624 	popl LEN
2625 	popl IVP
2626 #endif
2627 	FRAME_END
2628 	RET
2629 SYM_FUNC_END(aesni_cts_cbc_enc)
2630 
2631 /*
2632  * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2633  *			  size_t len, u8 *iv)
2634  */
2635 SYM_FUNC_START(aesni_cts_cbc_dec)
2636 	FRAME_BEGIN
2637 #ifndef __x86_64__
2638 	pushl IVP
2639 	pushl LEN
2640 	pushl KEYP
2641 	pushl KLEN
2642 	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2643 	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2644 	movl (FRAME_OFFSET+28)(%esp), INP	# src
2645 	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2646 	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2647 	lea .Lcts_permute_table, T1
2648 #else
2649 	lea .Lcts_permute_table(%rip), T1
2650 #endif
2651 	mov 480(KEYP), KLEN
2652 	add $240, KEYP
2653 	movups (IVP), IV
2654 	sub $16, LEN
2655 	mov T1, IVP
2656 	add $32, IVP
2657 	add LEN, T1
2658 	sub LEN, IVP
2659 	movups (T1), %xmm4
2660 
2661 	movups (INP), STATE
2662 	add LEN, INP
2663 	movups (INP), IN1
2664 
2665 	call _aesni_dec1
2666 	movaps STATE, IN2
2667 	pshufb %xmm4, STATE
2668 	pxor IN1, STATE
2669 
2670 	add OUTP, LEN
2671 	movups STATE, (LEN)
2672 
2673 	movups (IVP), %xmm0
2674 	pshufb %xmm0, IN1
2675 	pblendvb IN2, IN1
2676 	movaps IN1, STATE
2677 	call _aesni_dec1
2678 
2679 	pxor IV, STATE
2680 	movups STATE, (OUTP)
2681 
2682 #ifndef __x86_64__
2683 	popl KLEN
2684 	popl KEYP
2685 	popl LEN
2686 	popl IVP
2687 #endif
2688 	FRAME_END
2689 	RET
2690 SYM_FUNC_END(aesni_cts_cbc_dec)
2691 
2692 .pushsection .rodata
2693 .align 16
2694 .Lcts_permute_table:
2695 	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2696 	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2697 	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
2698 	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
2699 	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2700 	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2701 #ifdef __x86_64__
2702 .Lbswap_mask:
2703 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2704 #endif
2705 .popsection
2706 
2707 #ifdef __x86_64__
2708 /*
2709  * _aesni_inc_init:	internal ABI
2710  *	setup registers used by _aesni_inc
2711  * input:
2712  *	IV
2713  * output:
2714  *	CTR:	== IV, in little endian
2715  *	TCTR_LOW: == lower qword of CTR
2716  *	INC:	== 1, in little endian
2717  *	BSWAP_MASK == endian swapping mask
2718  */
2719 SYM_FUNC_START_LOCAL(_aesni_inc_init)
2720 	movaps .Lbswap_mask(%rip), BSWAP_MASK
2721 	movaps IV, CTR
2722 	pshufb BSWAP_MASK, CTR
2723 	mov $1, TCTR_LOW
2724 	movq TCTR_LOW, INC
2725 	movq CTR, TCTR_LOW
2726 	RET
2727 SYM_FUNC_END(_aesni_inc_init)
2728 
2729 /*
2730  * _aesni_inc:		internal ABI
2731  *	Increase IV by 1, IV is in big endian
2732  * input:
2733  *	IV
2734  *	CTR:	== IV, in little endian
2735  *	TCTR_LOW: == lower qword of CTR
2736  *	INC:	== 1, in little endian
2737  *	BSWAP_MASK == endian swapping mask
2738  * output:
2739  *	IV:	Increase by 1
2740  * changed:
2741  *	CTR:	== output IV, in little endian
2742  *	TCTR_LOW: == lower qword of CTR
2743  */
2744 SYM_FUNC_START_LOCAL(_aesni_inc)
2745 	paddq INC, CTR
2746 	add $1, TCTR_LOW
2747 	jnc .Linc_low
2748 	pslldq $8, INC
2749 	paddq INC, CTR
2750 	psrldq $8, INC
2751 .Linc_low:
2752 	movaps CTR, IV
2753 	pshufb BSWAP_MASK, IV
2754 	RET
2755 SYM_FUNC_END(_aesni_inc)
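
/*
 * Equivalent C for the two helpers above, assuming the usual big-endian
 * 128-bit counter block (the asm keeps a byte-swapped copy in CTR so the
 * common case is a single 64-bit add):
 *
 *	static void ctr128_inc(u8 ctr[16])
 *	{
 *		int i;
 *
 *		for (i = 15; i >= 0; i--)
 *			if (++ctr[i])		// stop once a byte did not wrap
 *				break;
 *	}
 */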
2756 
2757 /*
2758  * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2759  *		      size_t len, u8 *iv)
2760  */
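/*
 * Counter mode in rough C terms (hypothetical helpers; the incremented
 * counter is written back to *iv on exit so a later call can continue):
 *
 *	while (len >= 16) {
 *		aes_encrypt(ctx, ks, ctr);	// ks  = E_K(counter block)
 *		xor_blocks(dst, src, ks, 16);	// C_i = P_i ^ ks
 *		ctr128_inc(ctr);
 *		src += 16; dst += 16; len -= 16;
 *	}
 */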
2761 SYM_FUNC_START(aesni_ctr_enc)
2762 	FRAME_BEGIN
2763 	cmp $16, LEN
2764 	jb .Lctr_enc_just_ret
2765 	mov 480(KEYP), KLEN
2766 	movups (IVP), IV
2767 	call _aesni_inc_init
2768 	cmp $64, LEN
2769 	jb .Lctr_enc_loop1
2770 .align 4
2771 .Lctr_enc_loop4:
2772 	movaps IV, STATE1
2773 	call _aesni_inc
2774 	movups (INP), IN1
2775 	movaps IV, STATE2
2776 	call _aesni_inc
2777 	movups 0x10(INP), IN2
2778 	movaps IV, STATE3
2779 	call _aesni_inc
2780 	movups 0x20(INP), IN3
2781 	movaps IV, STATE4
2782 	call _aesni_inc
2783 	movups 0x30(INP), IN4
2784 	call _aesni_enc4
2785 	pxor IN1, STATE1
2786 	movups STATE1, (OUTP)
2787 	pxor IN2, STATE2
2788 	movups STATE2, 0x10(OUTP)
2789 	pxor IN3, STATE3
2790 	movups STATE3, 0x20(OUTP)
2791 	pxor IN4, STATE4
2792 	movups STATE4, 0x30(OUTP)
2793 	sub $64, LEN
2794 	add $64, INP
2795 	add $64, OUTP
2796 	cmp $64, LEN
2797 	jge .Lctr_enc_loop4
2798 	cmp $16, LEN
2799 	jb .Lctr_enc_ret
2800 .align 4
2801 .Lctr_enc_loop1:
2802 	movaps IV, STATE
2803 	call _aesni_inc
2804 	movups (INP), IN
2805 	call _aesni_enc1
2806 	pxor IN, STATE
2807 	movups STATE, (OUTP)
2808 	sub $16, LEN
2809 	add $16, INP
2810 	add $16, OUTP
2811 	cmp $16, LEN
2812 	jge .Lctr_enc_loop1
2813 .Lctr_enc_ret:
2814 	movups IV, (IVP)
2815 .Lctr_enc_just_ret:
2816 	FRAME_END
2817 	RET
2818 SYM_FUNC_END(aesni_ctr_enc)
2819 
2820 #endif
2821 
2822 .section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
2823 .align 16
2824 .Lgf128mul_x_ble_mask:
2825 	.octa 0x00000000000000010000000000000087
2826 .previous
2827 
2828 /*
2829  * _aesni_gf128mul_x_ble:		internal ABI
2830  *	Multiply in GF(2^128) for XTS IVs
2831  * input:
2832  *	IV:	current IV
2833  *	GF128MUL_MASK == mask with 0x87 and 0x01
2834  * output:
2835  *	IV:	next IV
2836  * changed:
2837  *	CTR:	== temporary value
2838  */
2839 #define _aesni_gf128mul_x_ble() \
2840 	pshufd $0x13, IV, KEY; \
2841 	paddq IV, IV; \
2842 	psrad $31, KEY; \
2843 	pand GF128MUL_MASK, KEY; \
2844 	pxor KEY, IV;
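
/*
 * A rough C equivalent of the macro above (assumption: the tweak is the
 * usual XTS little-endian block, handled here as two 64-bit halves,
 * t[0] = low, t[1] = high):
 *
 *	static void gf128mul_x_ble(u64 t[2])
 *	{
 *		u64 carry = t[1] >> 63;		// bit shifted out of the top
 *
 *		t[1] = (t[1] << 1) | (t[0] >> 63);
 *		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
 *	}
 */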
2845 
2846 /*
2847  * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
2848  *			  const u8 *src, unsigned int len, le128 *iv)
2849  */
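/*
 * Per 16-byte block, XTS computes (T is the tweak loaded from *iv):
 *
 *	C_i     = E_K(P_i ^ T_i) ^ T_i;
 *	T_(i+1) = gf128mul_x_ble(T_i);
 *
 * The 4-way loop below interleaves four such blocks, temporarily parking
 * each block's tweak in the output buffer so it can be xored back in
 * after the shared aesenc pass.
 */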
2850 SYM_FUNC_START(aesni_xts_encrypt)
2851 	FRAME_BEGIN
2852 #ifndef __x86_64__
2853 	pushl IVP
2854 	pushl LEN
2855 	pushl KEYP
2856 	pushl KLEN
2857 	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2858 	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2859 	movl (FRAME_OFFSET+28)(%esp), INP	# src
2860 	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2861 	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2862 	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2863 #else
2864 	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
2865 #endif
2866 	movups (IVP), IV
2867 
2868 	mov 480(KEYP), KLEN
2869 
2870 .Lxts_enc_loop4:
2871 	sub $64, LEN
2872 	jl .Lxts_enc_1x
2873 
2874 	movdqa IV, STATE1
2875 	movdqu 0x00(INP), IN
2876 	pxor IN, STATE1
2877 	movdqu IV, 0x00(OUTP)
2878 
2879 	_aesni_gf128mul_x_ble()
2880 	movdqa IV, STATE2
2881 	movdqu 0x10(INP), IN
2882 	pxor IN, STATE2
2883 	movdqu IV, 0x10(OUTP)
2884 
2885 	_aesni_gf128mul_x_ble()
2886 	movdqa IV, STATE3
2887 	movdqu 0x20(INP), IN
2888 	pxor IN, STATE3
2889 	movdqu IV, 0x20(OUTP)
2890 
2891 	_aesni_gf128mul_x_ble()
2892 	movdqa IV, STATE4
2893 	movdqu 0x30(INP), IN
2894 	pxor IN, STATE4
2895 	movdqu IV, 0x30(OUTP)
2896 
2897 	call _aesni_enc4
2898 
2899 	movdqu 0x00(OUTP), IN
2900 	pxor IN, STATE1
2901 	movdqu STATE1, 0x00(OUTP)
2902 
2903 	movdqu 0x10(OUTP), IN
2904 	pxor IN, STATE2
2905 	movdqu STATE2, 0x10(OUTP)
2906 
2907 	movdqu 0x20(OUTP), IN
2908 	pxor IN, STATE3
2909 	movdqu STATE3, 0x20(OUTP)
2910 
2911 	movdqu 0x30(OUTP), IN
2912 	pxor IN, STATE4
2913 	movdqu STATE4, 0x30(OUTP)
2914 
2915 	_aesni_gf128mul_x_ble()
2916 
2917 	add $64, INP
2918 	add $64, OUTP
2919 	test LEN, LEN
2920 	jnz .Lxts_enc_loop4
2921 
2922 .Lxts_enc_ret_iv:
2923 	movups IV, (IVP)
2924 
2925 .Lxts_enc_ret:
2926 #ifndef __x86_64__
2927 	popl KLEN
2928 	popl KEYP
2929 	popl LEN
2930 	popl IVP
2931 #endif
2932 	FRAME_END
2933 	RET
2934 
2935 .Lxts_enc_1x:
2936 	add $64, LEN
2937 	jz .Lxts_enc_ret_iv
2938 	sub $16, LEN
2939 	jl .Lxts_enc_cts4
2940 
2941 .Lxts_enc_loop1:
2942 	movdqu (INP), STATE
2943 	pxor IV, STATE
2944 	call _aesni_enc1
2945 	pxor IV, STATE
2946 	_aesni_gf128mul_x_ble()
2947 
2948 	test LEN, LEN
2949 	jz .Lxts_enc_out
2950 
2951 	add $16, INP
2952 	sub $16, LEN
2953 	jl .Lxts_enc_cts1
2954 
2955 	movdqu STATE, (OUTP)
2956 	add $16, OUTP
2957 	jmp .Lxts_enc_loop1
2958 
2959 .Lxts_enc_out:
2960 	movdqu STATE, (OUTP)
2961 	jmp .Lxts_enc_ret_iv
2962 
2963 .Lxts_enc_cts4:
2964 	movdqa STATE4, STATE
2965 	sub $16, OUTP
2966 
2967 .Lxts_enc_cts1:
2968 #ifndef __x86_64__
2969 	lea .Lcts_permute_table, T1
2970 #else
2971 	lea .Lcts_permute_table(%rip), T1
2972 #endif
2973 	add LEN, INP		/* rewind input pointer */
2974 	add $16, LEN		/* # bytes in final block */
2975 	movups (INP), IN1
2976 
2977 	mov T1, IVP
2978 	add $32, IVP
2979 	add LEN, T1
2980 	sub LEN, IVP
2981 	add OUTP, LEN
2982 
2983 	movups (T1), %xmm4
2984 	movaps STATE, IN2
2985 	pshufb %xmm4, STATE
2986 	movups STATE, (LEN)
2987 
2988 	movups (IVP), %xmm0
2989 	pshufb %xmm0, IN1
2990 	pblendvb IN2, IN1
2991 	movaps IN1, STATE
2992 
2993 	pxor IV, STATE
2994 	call _aesni_enc1
2995 	pxor IV, STATE
2996 
2997 	movups STATE, (OUTP)
2998 	jmp .Lxts_enc_ret
2999 SYM_FUNC_END(aesni_xts_encrypt)
3000 
3001 /*
3002  * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
3003  *			  const u8 *src, unsigned int len, le128 *iv)
3004  */
3005 SYM_FUNC_START(aesni_xts_decrypt)
3006 	FRAME_BEGIN
3007 #ifndef __x86_64__
3008 	pushl IVP
3009 	pushl LEN
3010 	pushl KEYP
3011 	pushl KLEN
3012 	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
3013 	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
3014 	movl (FRAME_OFFSET+28)(%esp), INP	# src
3015 	movl (FRAME_OFFSET+32)(%esp), LEN	# len
3016 	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
3017 	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
3018 #else
3019 	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
3020 #endif
3021 	movups (IVP), IV
3022 
3023 	mov 480(KEYP), KLEN
3024 	add $240, KEYP
3025 
3026 	test $15, LEN
3027 	jz .Lxts_dec_loop4
3028 	sub $16, LEN
3029 
3030 .Lxts_dec_loop4:
3031 	sub $64, LEN
3032 	jl .Lxts_dec_1x
3033 
3034 	movdqa IV, STATE1
3035 	movdqu 0x00(INP), IN
3036 	pxor IN, STATE1
3037 	movdqu IV, 0x00(OUTP)
3038 
3039 	_aesni_gf128mul_x_ble()
3040 	movdqa IV, STATE2
3041 	movdqu 0x10(INP), IN
3042 	pxor IN, STATE2
3043 	movdqu IV, 0x10(OUTP)
3044 
3045 	_aesni_gf128mul_x_ble()
3046 	movdqa IV, STATE3
3047 	movdqu 0x20(INP), IN
3048 	pxor IN, STATE3
3049 	movdqu IV, 0x20(OUTP)
3050 
3051 	_aesni_gf128mul_x_ble()
3052 	movdqa IV, STATE4
3053 	movdqu 0x30(INP), IN
3054 	pxor IN, STATE4
3055 	movdqu IV, 0x30(OUTP)
3056 
3057 	call _aesni_dec4
3058 
3059 	movdqu 0x00(OUTP), IN
3060 	pxor IN, STATE1
3061 	movdqu STATE1, 0x00(OUTP)
3062 
3063 	movdqu 0x10(OUTP), IN
3064 	pxor IN, STATE2
3065 	movdqu STATE2, 0x10(OUTP)
3066 
3067 	movdqu 0x20(OUTP), IN
3068 	pxor IN, STATE3
3069 	movdqu STATE3, 0x20(OUTP)
3070 
3071 	movdqu 0x30(OUTP), IN
3072 	pxor IN, STATE4
3073 	movdqu STATE4, 0x30(OUTP)
3074 
3075 	_aesni_gf128mul_x_ble()
3076 
3077 	add $64, INP
3078 	add $64, OUTP
3079 	test LEN, LEN
3080 	jnz .Lxts_dec_loop4
3081 
3082 .Lxts_dec_ret_iv:
3083 	movups IV, (IVP)
3084 
3085 .Lxts_dec_ret:
3086 #ifndef __x86_64__
3087 	popl KLEN
3088 	popl KEYP
3089 	popl LEN
3090 	popl IVP
3091 #endif
3092 	FRAME_END
3093 	RET
3094 
3095 .Lxts_dec_1x:
3096 	add $64, LEN
3097 	jz .Lxts_dec_ret_iv
3098 
3099 .Lxts_dec_loop1:
3100 	movdqu (INP), STATE
3101 
3102 	add $16, INP
3103 	sub $16, LEN
3104 	jl .Lxts_dec_cts1
3105 
3106 	pxor IV, STATE
3107 	call _aesni_dec1
3108 	pxor IV, STATE
3109 	_aesni_gf128mul_x_ble()
3110 
3111 	test LEN, LEN
3112 	jz .Lxts_dec_out
3113 
3114 	movdqu STATE, (OUTP)
3115 	add $16, OUTP
3116 	jmp .Lxts_dec_loop1
3117 
3118 .Lxts_dec_out:
3119 	movdqu STATE, (OUTP)
3120 	jmp .Lxts_dec_ret_iv
3121 
3122 .Lxts_dec_cts1:
3123 	movdqa IV, STATE4
3124 	_aesni_gf128mul_x_ble()
3125 
3126 	pxor IV, STATE
3127 	call _aesni_dec1
3128 	pxor IV, STATE
3129 
3130 #ifndef __x86_64__
3131 	lea .Lcts_permute_table, T1
3132 #else
3133 	lea .Lcts_permute_table(%rip), T1
3134 #endif
3135 	add LEN, INP		/* rewind input pointer */
3136 	add $16, LEN		/* # bytes in final block */
3137 	movups (INP), IN1
3138 
3139 	mov T1, IVP
3140 	add $32, IVP
3141 	add LEN, T1
3142 	sub LEN, IVP
3143 	add OUTP, LEN
3144 
3145 	movups (T1), %xmm4
3146 	movaps STATE, IN2
3147 	pshufb %xmm4, STATE
3148 	movups STATE, (LEN)
3149 
3150 	movups (IVP), %xmm0
3151 	pshufb %xmm0, IN1
3152 	pblendvb IN2, IN1
3153 	movaps IN1, STATE
3154 
3155 	pxor STATE4, STATE
3156 	call _aesni_dec1
3157 	pxor STATE4, STATE
3158 
3159 	movups STATE, (OUTP)
3160 	jmp .Lxts_dec_ret
3161 SYM_FUNC_END(aesni_xts_decrypt)
3162