1 ########################################################################
2 # Copyright (c) 2013, Intel Corporation
3 #
4 # This software is available to you under a choice of one of two
5 # licenses.  You may choose to be licensed under the terms of the GNU
6 # General Public License (GPL) Version 2, available from the file
7 # COPYING in the main directory of this source tree, or the
8 # OpenIB.org BSD license below:
9 #
10 # Redistribution and use in source and binary forms, with or without
11 # modification, are permitted provided that the following conditions are
12 # met:
13 #
14 # * Redistributions of source code must retain the above copyright
15 #   notice, this list of conditions and the following disclaimer.
16 #
17 # * Redistributions in binary form must reproduce the above copyright
18 #   notice, this list of conditions and the following disclaimer in the
19 #   documentation and/or other materials provided with the
20 #   distribution.
21 #
22 # * Neither the name of the Intel Corporation nor the names of its
23 #   contributors may be used to endorse or promote products derived from
24 #   this software without specific prior written permission.
25 #
26 #
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
28 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 ########################################################################
39 ##
40 ## Authors:
41 ##	Erdinc Ozturk <erdinc.ozturk@intel.com>
42 ##	Vinodh Gopal <vinodh.gopal@intel.com>
43 ##	James Guilford <james.guilford@intel.com>
44 ##	Tim Chen <tim.c.chen@linux.intel.com>
45 ##
46 ## References:
##       This code was derived and highly optimized from the code described in the paper:
##               Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
##			on Intel Architecture Processors. August, 2010.
##       The details of the implementation are explained in:
##               Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
##			on Intel Architecture Processors. October, 2012.
53 ##
54 ## Assumptions:
55 ##
56 ##
57 ##
58 ## iv:
59 ##       0                   1                   2                   3
60 ##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62 ##       |                             Salt  (From the SA)               |
63 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64 ##       |                     Initialization Vector                     |
65 ##       |         (This is the sequence number from IPSec header)       |
66 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
67 ##       |                              0x1                              |
68 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
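##
##       Example (illustrative values only): with salt 0xcafebabe and the
##       8-byte sequence number 0x0011223344556677, the 16-byte counter block
##       Y0 is, byte for byte:
##               ca fe ba be 00 11 22 33 44 55 66 77 00 00 00 01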
69 ##
70 ##
71 ##
72 ## AAD:
73 ##       AAD padded to 128 bits with 0
74 ##       for example, assume AAD is a u32 vector
75 ##
76 ##       if AAD is 8 bytes:
##       AAD[3] = {A0, A1}
78 ##       padded AAD in xmm register = {A1 A0 0 0}
79 ##
80 ##       0                   1                   2                   3
81 ##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
83 ##       |                               SPI (A1)                        |
84 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85 ##       |                     32-bit Sequence Number (A0)               |
86 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
87 ##       |                              0x0                              |
88 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
89 ##
90 ##                                       AAD Format with 32-bit Sequence Number
91 ##
92 ##       if AAD is 12 bytes:
##       AAD[3] = {A0, A1, A2}
94 ##       padded AAD in xmm register = {A2 A1 A0 0}
95 ##
96 ##       0                   1                   2                   3
97 ##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99 ##       |                               SPI (A2)                        |
100 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101 ##       |                 64-bit Extended Sequence Number {A1,A0}       |
102 ##       |                                                               |
103 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
104 ##       |                              0x0                              |
105 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
106 ##
107 ##        AAD Format with 64-bit Extended Sequence Number
108 ##
109 ##
110 ## aadLen:
##       per the definition in the spec, aadLen can only be 8 or 12 bytes.
##	 The code additionally supports an aadLen of 16 bytes.
113 ##
114 ## TLen:
##       per the definition in the spec, TLen can only be 8, 12 or 16 bytes.
116 ##
117 ## poly = x^128 + x^127 + x^126 + x^121 + 1
## Throughout the code, one-tab and two-tab indentations are used: one tab
## for the GHASH part, two tabs for the AES part.
120 ##
121 
122 #include <linux/linkage.h>
123 
124 # constants in mergeable sections, linker can reorder and merge
125 .section	.rodata.cst16.POLY, "aM", @progbits, 16
126 .align 16
127 POLY:            .octa     0xC2000000000000000000000000000001
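# POLY is the reduction polynomial above with its x^128 term dropped: bits
# 127, 126 and 121 (the 0xC2 in the top byte) plus bit 0 (the trailing 01).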
128 
129 .section	.rodata.cst16.POLY2, "aM", @progbits, 16
130 .align 16
131 POLY2:           .octa     0xC20000000000000000000001C2000000
132 
133 .section	.rodata.cst16.TWOONE, "aM", @progbits, 16
134 .align 16
135 TWOONE:          .octa     0x00000001000000000000000000000001
136 
137 .section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
138 .align 16
139 SHUF_MASK:       .octa     0x000102030405060708090A0B0C0D0E0F
140 
141 .section	.rodata.cst16.ONE, "aM", @progbits, 16
142 .align 16
143 ONE:             .octa     0x00000000000000000000000000000001
144 
145 .section	.rodata.cst16.ONEf, "aM", @progbits, 16
146 .align 16
147 ONEf:            .octa     0x01000000000000000000000000000000
148 
149 # order of these constants should not change.
150 # more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
151 .section	.rodata, "a", @progbits
152 .align 16
153 SHIFT_MASK:      .octa     0x0f0e0d0c0b0a09080706050403020100
154 ALL_F:           .octa     0xffffffffffffffffffffffffffffffff
155                  .octa     0x00000000000000000000000000000000
156 
157 .text
158 
159 
160 #define AadHash 16*0
161 #define AadLen 16*1
162 #define InLen (16*1)+8
163 #define PBlockEncKey 16*2
164 #define OrigIV 16*3
165 #define CurCount 16*4
166 #define PBlockLen 16*5
167 
168 HashKey        = 16*6   # store HashKey <<1 mod poly here
169 HashKey_2      = 16*7   # store HashKey^2 <<1 mod poly here
170 HashKey_3      = 16*8   # store HashKey^3 <<1 mod poly here
171 HashKey_4      = 16*9   # store HashKey^4 <<1 mod poly here
172 HashKey_5      = 16*10   # store HashKey^5 <<1 mod poly here
173 HashKey_6      = 16*11   # store HashKey^6 <<1 mod poly here
174 HashKey_7      = 16*12   # store HashKey^7 <<1 mod poly here
175 HashKey_8      = 16*13   # store HashKey^8 <<1 mod poly here
176 HashKey_k      = 16*14   # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
177 HashKey_2_k    = 16*15   # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
178 HashKey_3_k    = 16*16   # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
179 HashKey_4_k    = 16*17   # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
180 HashKey_5_k    = 16*18   # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
181 HashKey_6_k    = 16*19   # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
182 HashKey_7_k    = 16*20   # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
183 HashKey_8_k    = 16*21   # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
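
# Note on the *_k slots above (a sketch of the Karatsuba trick used below):
# a 128x128 carry-less product a*b, with a = a1:a0 and b = b1:b0, splits as
#       a*b = a1*b1*x^128 + (a1*b0 + a0*b1)*x^64 + a0*b0
# and the middle term can be computed as
#       (a1 xor a0)*(b1 xor b0) + a1*b1 + a0*b0
# Caching (b1 xor b0) for every HashKey power in the HashKey_i_k slots saves
# recomputing it for each ciphertext block in the hot loops.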
184 
185 #define arg1 %rdi
186 #define arg2 %rsi
187 #define arg3 %rdx
188 #define arg4 %rcx
189 #define arg5 %r8
190 #define arg6 %r9
191 #define keysize 2*15*16(arg1)
192 
193 i = 0
194 j = 0
195 
196 out_order = 0
197 in_order = 1
198 DEC = 0
199 ENC = 1
200 
201 .macro define_reg r n
202 reg_\r = %xmm\n
203 .endm
204 
205 .macro setreg
206 .altmacro
207 define_reg i %i
208 define_reg j %j
209 .noaltmacro
210 .endm
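
# Example: with i = 3, "setreg" evaluates %i via .altmacro so the assignment
# becomes reg_i = %xmm3; reg_i/reg_j can then be used inside .rep loops where
# the register number depends on the loop counter.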
211 
212 TMP1 =   16*0    # Temporary storage for AAD
213 TMP2 =   16*1    # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
214 TMP3 =   16*2    # Temporary storage for AES State 3
215 TMP4 =   16*3    # Temporary storage for AES State 4
216 TMP5 =   16*4    # Temporary storage for AES State 5
217 TMP6 =   16*5    # Temporary storage for AES State 6
218 TMP7 =   16*6    # Temporary storage for AES State 7
219 TMP8 =   16*7    # Temporary storage for AES State 8
220 
221 VARIABLE_OFFSET = 16*8
222 
223 ################################
224 # Utility Macros
225 ################################
226 
227 .macro FUNC_SAVE
228         push    %r12
229         push    %r13
230         push    %r15
231 
232 	push	%rbp
233 	mov	%rsp, %rbp
234 
235         sub     $VARIABLE_OFFSET, %rsp
236         and     $~63, %rsp                    # align rsp to 64 bytes
237 .endm
238 
239 .macro FUNC_RESTORE
240         mov     %rbp, %rsp
241 	pop	%rbp
242 
243         pop     %r15
244         pop     %r13
245         pop     %r12
246 .endm
247 
248 # Encryption of a single block
249 .macro ENCRYPT_SINGLE_BLOCK REP XMM0
250                 vpxor    (arg1), \XMM0, \XMM0
251                i = 1
252                setreg
253 .rep \REP
254                 vaesenc  16*i(arg1), \XMM0, \XMM0
255                i = (i+1)
256                setreg
257 .endr
258                 vaesenclast 16*i(arg1), \XMM0, \XMM0
259 .endm
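
# For example, if the caller passes \REP = 9 (an AES-128 key schedule), the
# macro above expands to the round-0 whitening vpxor, nine vaesenc rounds with
# round keys 1..9, and a final vaesenclast with round key 10.  (The REP values
# are assumed to be 9/11/13 for AES-128/192/256.)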
260 
261 # combined for GCM encrypt and decrypt functions
262 # clobbering all xmm registers
263 # clobbering r10, r11, r12, r13, r15, rax
264 .macro  GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
265         vmovdqu AadHash(arg2), %xmm8
266         vmovdqu  HashKey(arg2), %xmm13      # xmm13 = HashKey
267         add arg5, InLen(arg2)
268 
269         # initialize the data pointer offset as zero
270         xor     %r11d, %r11d
271 
272         PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
273         sub %r11, arg5
274 
275         mov     arg5, %r13                  # save the number of bytes of plaintext/ciphertext
276         and     $-16, %r13                  # r13 = r13 - (r13 mod 16)
277 
278         mov     %r13, %r12
279         shr     $4, %r12
280         and     $7, %r12
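        # r12 = number of blocks to handle before the 8-blocks-at-a-time loop.
        # E.g. (illustrative) arg5 = 100 bytes: r13 = 96, 96>>4 = 6, 6&7 = 6
        # initial blocks; the remaining 4 bytes are handled later as a
        # partial block.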
281         jz      .L_initial_num_blocks_is_0\@
282 
283         cmp     $7, %r12
284         je      .L_initial_num_blocks_is_7\@
285         cmp     $6, %r12
286         je      .L_initial_num_blocks_is_6\@
287         cmp     $5, %r12
288         je      .L_initial_num_blocks_is_5\@
289         cmp     $4, %r12
290         je      .L_initial_num_blocks_is_4\@
291         cmp     $3, %r12
292         je      .L_initial_num_blocks_is_3\@
293         cmp     $2, %r12
294         je      .L_initial_num_blocks_is_2\@
295 
296         jmp     .L_initial_num_blocks_is_1\@
297 
298 .L_initial_num_blocks_is_7\@:
299         \INITIAL_BLOCKS  \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
300         sub     $16*7, %r13
301         jmp     .L_initial_blocks_encrypted\@
302 
303 .L_initial_num_blocks_is_6\@:
304         \INITIAL_BLOCKS  \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
305         sub     $16*6, %r13
306         jmp     .L_initial_blocks_encrypted\@
307 
308 .L_initial_num_blocks_is_5\@:
309         \INITIAL_BLOCKS  \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
310         sub     $16*5, %r13
311         jmp     .L_initial_blocks_encrypted\@
312 
313 .L_initial_num_blocks_is_4\@:
314         \INITIAL_BLOCKS  \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
315         sub     $16*4, %r13
316         jmp     .L_initial_blocks_encrypted\@
317 
318 .L_initial_num_blocks_is_3\@:
319         \INITIAL_BLOCKS  \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
320         sub     $16*3, %r13
321         jmp     .L_initial_blocks_encrypted\@
322 
323 .L_initial_num_blocks_is_2\@:
324         \INITIAL_BLOCKS  \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
325         sub     $16*2, %r13
326         jmp     .L_initial_blocks_encrypted\@
327 
328 .L_initial_num_blocks_is_1\@:
329         \INITIAL_BLOCKS  \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
330         sub     $16*1, %r13
331         jmp     .L_initial_blocks_encrypted\@
332 
333 .L_initial_num_blocks_is_0\@:
334         \INITIAL_BLOCKS  \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
335 
336 
337 .L_initial_blocks_encrypted\@:
338         test    %r13, %r13
339         je      .L_zero_cipher_left\@
340 
341         sub     $128, %r13
342         je      .L_eight_cipher_left\@
343 
344 
345 
346 
347         vmovd   %xmm9, %r15d
348         and     $255, %r15d
349         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
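
        # r15d caches the low byte of the 32-bit counter word.  While adding 8
        # cannot overflow that byte, the out_order path keeps the counter
        # blocks byte-swapped and increments them with ONEf, saving a vpshufb
        # per block; otherwise the in_order path increments the native counter
        # with ONE and byte-swaps afterwards.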
350 
351 
352 .L_encrypt_by_8_new\@:
353         cmp     $(255-8), %r15d
354         jg      .L_encrypt_by_8\@
355 
356 
357 
358         add     $8, %r15b
359         \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
360         add     $128, %r11
361         sub     $128, %r13
362         jne     .L_encrypt_by_8_new\@
363 
364         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
365         jmp     .L_eight_cipher_left\@
366 
367 .L_encrypt_by_8\@:
368         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
369         add     $8, %r15b
370         \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
371         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
372         add     $128, %r11
373         sub     $128, %r13
374         jne     .L_encrypt_by_8_new\@
375 
376         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
377 
378 
379 
380 
381 .L_eight_cipher_left\@:
382         \GHASH_LAST_8    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
383 
384 
385 .L_zero_cipher_left\@:
386         vmovdqu %xmm14, AadHash(arg2)
387         vmovdqu %xmm9, CurCount(arg2)
388 
389         # check for 0 length
390         mov     arg5, %r13
391         and     $15, %r13                            # r13 = (arg5 mod 16)
392 
393         je      .L_multiple_of_16_bytes\@
394 
395         # handle the last <16 Byte block separately
396 
397         mov %r13, PBlockLen(arg2)
398 
399         vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
400         vmovdqu %xmm9, CurCount(arg2)
401         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
402 
403         ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Yn)
404         vmovdqu %xmm9, PBlockEncKey(arg2)
405 
406         cmp $16, arg5
407         jge .L_large_enough_update\@
408 
409         lea (arg4,%r11,1), %r10
410         mov %r13, %r12
411 
412         READ_PARTIAL_BLOCK %r10 %r12 %xmm1
413 
414         lea     SHIFT_MASK+16(%rip), %r12
        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
                                                     # able to shift 16-r13 bytes (r13 is the
                                                     # number of bytes in plaintext mod 16)
418 
419         jmp .L_final_ghash_mul\@
420 
421 .L_large_enough_update\@:
422         sub $16, %r11
423         add %r13, %r11
424 
425         # receive the last <16 Byte block
426         vmovdqu	(arg4, %r11, 1), %xmm1
427 
428         sub	%r13, %r11
429         add	$16, %r11
430 
431         lea	SHIFT_MASK+16(%rip), %r12
432         # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
433         # (r13 is the number of bytes in plaintext mod 16)
434         sub	%r13, %r12
435         # get the appropriate shuffle mask
436         vmovdqu	(%r12), %xmm2
437         # shift right 16-r13 bytes
438         vpshufb  %xmm2, %xmm1, %xmm1
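        # Worked example (illustrative): if r13 = 5 then r12 = SHIFT_MASK+11,
        # so the mask is {0x0b..0x0f, 0xff x 11}: the vpshufb moves the 5 tail
        # bytes of xmm1 down to positions 0..4 and zeroes the rest, while
        # ALL_F-SHIFT_MASK(%r12) later yields 5 x 0xff followed by 11 zero
        # bytes to keep only those 5 bytes.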
439 
440 .L_final_ghash_mul\@:
441         .if  \ENC_DEC ==  DEC
442         vmovdqa %xmm1, %xmm2
443         vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
444         vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
445 						     # mask out top 16-r13 bytes of xmm9
446         vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
447         vpand   %xmm1, %xmm2, %xmm2
448         vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
449         vpxor   %xmm2, %xmm14, %xmm14
450 
451         vmovdqu %xmm14, AadHash(arg2)
452         .else
453         vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
454         vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
455 						     # mask out top 16-r13 bytes of xmm9
456         vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
457         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
458         vpxor   %xmm9, %xmm14, %xmm14
459 
460         vmovdqu %xmm14, AadHash(arg2)
461         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
462         .endif
463 
464 
465         #############################
466         # output r13 Bytes
467         vmovq   %xmm9, %rax
468         cmp     $8, %r13
469         jle     .L_less_than_8_bytes_left\@
470 
471         mov     %rax, (arg3 , %r11)
472         add     $8, %r11
473         vpsrldq $8, %xmm9, %xmm9
474         vmovq   %xmm9, %rax
475         sub     $8, %r13
476 
477 .L_less_than_8_bytes_left\@:
478         movb    %al, (arg3 , %r11)
479         add     $1, %r11
480         shr     $8, %rax
481         sub     $1, %r13
482         jne     .L_less_than_8_bytes_left\@
483         #############################
484 
485 .L_multiple_of_16_bytes\@:
486 .endm
487 
488 
# GCM_COMPLETE: finishes the tag computation, folding in any last partial block
# Output: Authentication Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
492 .macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
493         vmovdqu AadHash(arg2), %xmm14
494         vmovdqu HashKey(arg2), %xmm13
495 
496         mov PBlockLen(arg2), %r12
497         test %r12, %r12
498         je .L_partial_done\@
499 
500 	#GHASH computation for the last <16 Byte block
501         \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
502 
503 .L_partial_done\@:
504         mov AadLen(arg2), %r12                          # r12 = aadLen (number of bytes)
505         shl     $3, %r12                             # convert into number of bits
506         vmovd   %r12d, %xmm15                        # len(A) in xmm15
507 
508         mov InLen(arg2), %r12
        shl     $3, %r12                        # len(C) in bits (*8)
510         vmovq   %r12, %xmm1
511         vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
512         vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
513 
514         vpxor   %xmm15, %xmm14, %xmm14
515         \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
516         vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap
517 
518         vmovdqu OrigIV(arg2), %xmm9
519 
520         ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Y0)
521 
522         vpxor   %xmm14, %xmm9, %xmm9
523 
524 
525 
526 .L_return_T\@:
527         mov     \AUTH_TAG, %r10              # r10 = authTag
528         mov     \AUTH_TAG_LEN, %r11              # r11 = auth_tag_len
529 
530         cmp     $16, %r11
531         je      .L_T_16\@
532 
533         cmp     $8, %r11
534         jl      .L_T_4\@
535 
536 .L_T_8\@:
537         vmovq   %xmm9, %rax
538         mov     %rax, (%r10)
539         add     $8, %r10
540         sub     $8, %r11
541         vpsrldq $8, %xmm9, %xmm9
542         test    %r11, %r11
543         je     .L_return_T_done\@
544 .L_T_4\@:
545         vmovd   %xmm9, %eax
546         mov     %eax, (%r10)
547         add     $4, %r10
548         sub     $4, %r11
549         vpsrldq     $4, %xmm9, %xmm9
550         test    %r11, %r11
551         je     .L_return_T_done\@
552 .L_T_123\@:
553         vmovd     %xmm9, %eax
554         cmp     $2, %r11
555         jl     .L_T_1\@
556         mov     %ax, (%r10)
557         cmp     $2, %r11
558         je     .L_return_T_done\@
559         add     $2, %r10
560         sar     $16, %eax
561 .L_T_1\@:
562         mov     %al, (%r10)
563         jmp     .L_return_T_done\@
564 
565 .L_T_16\@:
566         vmovdqu %xmm9, (%r10)
567 
568 .L_return_T_done\@:
569 .endm
570 
571 .macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
572 
573 	mov     \AAD, %r10                      # r10 = AAD
574 	mov     \AADLEN, %r12                      # r12 = aadLen
575 
576 
577 	mov     %r12, %r11
578 
579 	vpxor   \T8, \T8, \T8
580 	vpxor   \T7, \T7, \T7
581 	cmp     $16, %r11
582 	jl      .L_get_AAD_rest8\@
583 .L_get_AAD_blocks\@:
584 	vmovdqu (%r10), \T7
585 	vpshufb SHUF_MASK(%rip), \T7, \T7
586 	vpxor   \T7, \T8, \T8
587 	\GHASH_MUL       \T8, \T2, \T1, \T3, \T4, \T5, \T6
588 	add     $16, %r10
589 	sub     $16, %r12
590 	sub     $16, %r11
591 	cmp     $16, %r11
592 	jge     .L_get_AAD_blocks\@
593 	vmovdqu \T8, \T7
594 	test    %r11, %r11
595 	je      .L_get_AAD_done\@
596 
597 	vpxor   \T7, \T7, \T7
598 
599 	/* read the last <16B of AAD. since we have at least 4B of
600 	data right after the AAD (the ICV, and maybe some CT), we can
601 	read 4B/8B blocks safely, and then get rid of the extra stuff */
602 .L_get_AAD_rest8\@:
603 	cmp     $4, %r11
604 	jle     .L_get_AAD_rest4\@
605 	movq    (%r10), \T1
606 	add     $8, %r10
607 	sub     $8, %r11
608 	vpslldq $8, \T1, \T1
609 	vpsrldq $8, \T7, \T7
610 	vpxor   \T1, \T7, \T7
611 	jmp     .L_get_AAD_rest8\@
612 .L_get_AAD_rest4\@:
613 	test    %r11, %r11
614 	jle     .L_get_AAD_rest0\@
615 	mov     (%r10), %eax
616 	movq    %rax, \T1
617 	add     $4, %r10
618 	sub     $4, %r11
619 	vpslldq $12, \T1, \T1
620 	vpsrldq $4, \T7, \T7
621 	vpxor   \T1, \T7, \T7
622 .L_get_AAD_rest0\@:
623 	/* finalize: shift out the extra bytes we read, and align
624 	left. since pslldq can only shift by an immediate, we use
625 	vpshufb and a pair of shuffle masks */
626 	leaq	ALL_F(%rip), %r11
627 	subq	%r12, %r11
628 	vmovdqu	16(%r11), \T1
629 	andq	$~3, %r11
630 	vpshufb (%r11), \T7, \T7
631 	vpand	\T1, \T7, \T7
632 .L_get_AAD_rest_final\@:
633 	vpshufb SHUF_MASK(%rip), \T7, \T7
634 	vpxor   \T8, \T7, \T7
635 	\GHASH_MUL       \T7, \T2, \T1, \T3, \T4, \T5, \T6
636 
637 .L_get_AAD_done\@:
638         vmovdqu \T7, AadHash(arg2)
639 .endm
640 
641 .macro INIT GHASH_MUL PRECOMPUTE
642         mov arg6, %r11
643         mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
644         xor %r11d, %r11d
645         mov %r11, InLen(arg2) # ctx_data.in_length = 0
646 
647         mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
648         mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
649         mov arg3, %rax
650         movdqu (%rax), %xmm0
651         movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
652 
653         vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
654         movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
655 
656         vmovdqu  (arg4), %xmm6              # xmm6 = HashKey
657 
658         vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
659         ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
660         vmovdqa  %xmm6, %xmm2
661         vpsllq   $1, %xmm6, %xmm6
662         vpsrlq   $63, %xmm2, %xmm2
663         vmovdqa  %xmm2, %xmm1
664         vpslldq  $8, %xmm2, %xmm2
665         vpsrldq  $8, %xmm1, %xmm1
666         vpor     %xmm2, %xmm6, %xmm6
667         #reduction
668         vpshufd  $0b00100100, %xmm1, %xmm2
669         vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
670         vpand    POLY(%rip), %xmm2, %xmm2
671         vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
672         #######################################################################
673         vmovdqu  %xmm6, HashKey(arg2)       # store HashKey<<1 mod poly
674 
675         CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
676 
677         \PRECOMPUTE  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
678 .endm
679 
680 
681 # Reads DLEN bytes starting at DPTR and stores in XMMDst
682 # where 0 < DLEN < 16
683 # Clobbers %rax, DLEN
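# Example: for DLEN = 11, the first 8 bytes are loaded as a qword into the low
# half of XMMDst, the remaining 3 bytes are gathered into %rax one byte at a
# time (highest address first), and inserted as the high qword.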
684 .macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
685         vpxor \XMMDst, \XMMDst, \XMMDst
686 
687         cmp $8, \DLEN
688         jl .L_read_lt8_\@
689         mov (\DPTR), %rax
690         vpinsrq $0, %rax, \XMMDst, \XMMDst
691         sub $8, \DLEN
692         jz .L_done_read_partial_block_\@
693         xor %eax, %eax
694 .L_read_next_byte_\@:
695         shl $8, %rax
696         mov 7(\DPTR, \DLEN, 1), %al
697         dec \DLEN
698         jnz .L_read_next_byte_\@
699         vpinsrq $1, %rax, \XMMDst, \XMMDst
700         jmp .L_done_read_partial_block_\@
701 .L_read_lt8_\@:
702         xor %eax, %eax
703 .L_read_next_byte_lt8_\@:
704         shl $8, %rax
705         mov -1(\DPTR, \DLEN, 1), %al
706         dec \DLEN
707         jnz .L_read_next_byte_lt8_\@
708         vpinsrq $0, %rax, \XMMDst, \XMMDst
709 .L_done_read_partial_block_\@:
710 .endm
711 
# PARTIAL_BLOCK: Handles the encryption/decryption and tag update for partial
# blocks carried over between update calls.
# Requires the input data to be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
717 .macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
718         AAD_HASH ENC_DEC
719         mov 	PBlockLen(arg2), %r13
720         test	%r13, %r13
721         je	.L_partial_block_done_\@	# Leave Macro if no partial blocks
        # Read in input data without over-reading
        cmp	$16, \PLAIN_CYPH_LEN
        jl	.L_fewer_than_16_bytes_\@
        vmovdqu	(\PLAIN_CYPH_IN), %xmm1	# If at least 16 bytes, just fill xmm
726         jmp	.L_data_read_\@
727 
728 .L_fewer_than_16_bytes_\@:
729         lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
730         mov	\PLAIN_CYPH_LEN, %r12
731         READ_PARTIAL_BLOCK %r10 %r12 %xmm1
732 
733         mov PBlockLen(arg2), %r13
734 
735 .L_data_read_\@:				# Finished reading in data
736 
737         vmovdqu	PBlockEncKey(arg2), %xmm9
738         vmovdqu	HashKey(arg2), %xmm13
739 
740         lea	SHIFT_MASK(%rip), %r12
741 
        # adjust the shuffle mask pointer to be able to shift r13 bytes
        # (r13 is the number of bytes already filled in the partial block)
744         add	%r13, %r12
745         vmovdqu	(%r12), %xmm2		# get the appropriate shuffle mask
746         vpshufb %xmm2, %xmm9, %xmm9		# shift right r13 bytes
747 
748 .if  \ENC_DEC ==  DEC
749         vmovdqa	%xmm1, %xmm3
750         pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)
751 
752         mov	\PLAIN_CYPH_LEN, %r10
753         add	%r13, %r10
        # Set r10 to be the amount of data left in PLAIN_CYPH_IN after filling
        sub	$16, %r10
        # Determine if the partial block is not being completely filled and
        # shift the mask accordingly
758         jge	.L_no_extra_mask_1_\@
759         sub	%r10, %r12
760 .L_no_extra_mask_1_\@:
761 
762         vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
763         # get the appropriate mask to mask out bottom r13 bytes of xmm9
764         vpand	%xmm1, %xmm9, %xmm9		# mask out bottom r13 bytes of xmm9
765 
766         vpand	%xmm1, %xmm3, %xmm3
767         vmovdqa	SHUF_MASK(%rip), %xmm10
768         vpshufb	%xmm10, %xmm3, %xmm3
769         vpshufb	%xmm2, %xmm3, %xmm3
770         vpxor	%xmm3, \AAD_HASH, \AAD_HASH
771 
772         test	%r10, %r10
773         jl	.L_partial_incomplete_1_\@
774 
775         # GHASH computation for the last <16 Byte block
776         \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
777         xor	%eax,%eax
778 
779         mov	%rax, PBlockLen(arg2)
780         jmp	.L_dec_done_\@
781 .L_partial_incomplete_1_\@:
782         add	\PLAIN_CYPH_LEN, PBlockLen(arg2)
783 .L_dec_done_\@:
784         vmovdqu	\AAD_HASH, AadHash(arg2)
785 .else
786         vpxor	%xmm1, %xmm9, %xmm9			# Plaintext XOR E(K, Yn)
787 
788         mov	\PLAIN_CYPH_LEN, %r10
789         add	%r13, %r10
        # Set r10 to be the amount of data left in PLAIN_CYPH_IN after filling
        sub	$16, %r10
        # Determine if the partial block is not being completely filled and
        # shift the mask accordingly
794         jge	.L_no_extra_mask_2_\@
795         sub	%r10, %r12
796 .L_no_extra_mask_2_\@:
797 
798         vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
799         # get the appropriate mask to mask out bottom r13 bytes of xmm9
800         vpand	%xmm1, %xmm9, %xmm9
801 
802         vmovdqa	SHUF_MASK(%rip), %xmm1
803         vpshufb %xmm1, %xmm9, %xmm9
804         vpshufb %xmm2, %xmm9, %xmm9
805         vpxor	%xmm9, \AAD_HASH, \AAD_HASH
806 
807         test	%r10, %r10
808         jl	.L_partial_incomplete_2_\@
809 
810         # GHASH computation for the last <16 Byte block
811         \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
812         xor	%eax,%eax
813 
814         mov	%rax, PBlockLen(arg2)
815         jmp	.L_encode_done_\@
816 .L_partial_incomplete_2_\@:
817         add	\PLAIN_CYPH_LEN, PBlockLen(arg2)
818 .L_encode_done_\@:
819         vmovdqu	\AAD_HASH, AadHash(arg2)
820 
821         vmovdqa	SHUF_MASK(%rip), %xmm10
822         # shuffle xmm9 back to output as ciphertext
823         vpshufb	%xmm10, %xmm9, %xmm9
824         vpshufb	%xmm2, %xmm9, %xmm9
825 .endif
826         # output encrypted Bytes
827         test	%r10, %r10
828         jl	.L_partial_fill_\@
829         mov	%r13, %r12
830         mov	$16, %r13
831         # Set r13 to be the number of bytes to write out
832         sub	%r12, %r13
833         jmp	.L_count_set_\@
834 .L_partial_fill_\@:
835         mov	\PLAIN_CYPH_LEN, %r13
836 .L_count_set_\@:
837         vmovdqa	%xmm9, %xmm0
838         vmovq	%xmm0, %rax
839         cmp	$8, %r13
840         jle	.L_less_than_8_bytes_left_\@
841 
842         mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
843         add	$8, \DATA_OFFSET
844         psrldq	$8, %xmm0
845         vmovq	%xmm0, %rax
846         sub	$8, %r13
847 .L_less_than_8_bytes_left_\@:
848         movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
849         add	$1, \DATA_OFFSET
850         shr	$8, %rax
851         sub	$1, %r13
852         jne	.L_less_than_8_bytes_left_\@
853 .L_partial_block_done_\@:
854 .endm # PARTIAL_BLOCK
855 
856 ###############################################################################
857 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
858 # Input: A and B (128-bits each, bit-reflected)
859 # Output: C = A*B*x mod poly, (i.e. >>1 )
860 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
861 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
862 ###############################################################################
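# The 256-bit carry-less product <T1:GH> built below is reduced with two
# phases of per-dword shifts and XORs; the shift counts (31/30/25 left,
# 1/2/7 right) correspond to the x^127, x^126 and x^121 terms of the
# polynomial (32-1, 32-2, 32-7).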
863 .macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
864 
865         vpshufd         $0b01001110, \GH, \T2
866         vpshufd         $0b01001110, \HK, \T3
867         vpxor           \GH     , \T2, \T2      # T2 = (a1+a0)
868         vpxor           \HK     , \T3, \T3      # T3 = (b1+b0)
869 
870         vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
871         vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
872         vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
873         vpxor           \GH, \T2,\T2
874         vpxor           \T1, \T2,\T2            # T2 = a0*b1+a1*b0
875 
876         vpslldq         $8, \T2,\T3             # shift-L T3 2 DWs
877         vpsrldq         $8, \T2,\T2             # shift-R T2 2 DWs
878         vpxor           \T3, \GH, \GH
879         vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK
880 
881         #first phase of the reduction
        vpslld  $31, \GH, \T2                   # packed left shift << 31
        vpslld  $30, \GH, \T3                   # packed left shift << 30
        vpslld  $25, \GH, \T4                   # packed left shift << 25
885 
886         vpxor   \T3, \T2, \T2                   # xor the shifted versions
887         vpxor   \T4, \T2, \T2
888 
889         vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW
890 
891         vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
892         vpxor   \T2, \GH, \GH                   # first phase of the reduction complete
893 
894         #second phase of the reduction
895 
        vpsrld  $1,\GH, \T2                     # packed right shift >> 1
        vpsrld  $2,\GH, \T3                     # packed right shift >> 2
        vpsrld  $7,\GH, \T4                     # packed right shift >> 7
899         vpxor   \T3, \T2, \T2                   # xor the shifted versions
900         vpxor   \T4, \T2, \T2
901 
902         vpxor   \T5, \T2, \T2
903         vpxor   \T2, \GH, \GH
904         vpxor   \T1, \GH, \GH                   # the result is in GH
905 
906 
907 .endm
908 
909 .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
910 
        # HashKey_i_k holds the XOR of the high and low 64-bit halves of HashKey_i
912         vmovdqa  \HK, \T5
913 
914         vpshufd  $0b01001110, \T5, \T1
915         vpxor    \T5, \T1, \T1
916         vmovdqu  \T1, HashKey_k(arg2)
917 
918         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
919         vmovdqu  \T5, HashKey_2(arg2)                    #  [HashKey_2] = HashKey^2<<1 mod poly
920         vpshufd  $0b01001110, \T5, \T1
921         vpxor    \T5, \T1, \T1
922         vmovdqu  \T1, HashKey_2_k(arg2)
923 
924         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
925         vmovdqu  \T5, HashKey_3(arg2)
926         vpshufd  $0b01001110, \T5, \T1
927         vpxor    \T5, \T1, \T1
928         vmovdqu  \T1, HashKey_3_k(arg2)
929 
930         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
931         vmovdqu  \T5, HashKey_4(arg2)
932         vpshufd  $0b01001110, \T5, \T1
933         vpxor    \T5, \T1, \T1
934         vmovdqu  \T1, HashKey_4_k(arg2)
935 
936         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
937         vmovdqu  \T5, HashKey_5(arg2)
938         vpshufd  $0b01001110, \T5, \T1
939         vpxor    \T5, \T1, \T1
940         vmovdqu  \T1, HashKey_5_k(arg2)
941 
942         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
943         vmovdqu  \T5, HashKey_6(arg2)
944         vpshufd  $0b01001110, \T5, \T1
945         vpxor    \T5, \T1, \T1
946         vmovdqu  \T1, HashKey_6_k(arg2)
947 
948         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
949         vmovdqu  \T5, HashKey_7(arg2)
950         vpshufd  $0b01001110, \T5, \T1
951         vpxor    \T5, \T1, \T1
952         vmovdqu  \T1, HashKey_7_k(arg2)
953 
954         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
955         vmovdqu  \T5, HashKey_8(arg2)
956         vpshufd  $0b01001110, \T5, \T1
957         vpxor    \T5, \T1, \T1
958         vmovdqu  \T1, HashKey_8_k(arg2)
959 
960 .endm
961 
962 ## if a = number of total plaintext bytes
963 ## b = floor(a/16)
## num_initial_blocks = b mod 8
965 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
966 ## r10, r11, r12, rax are clobbered
967 ## arg1, arg2, arg3, arg4 are used as pointers only, not modified
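##
## Register naming example (illustrative): with num_initial_blocks = 3 the
## running hash starts in reg_i = %xmm5 (i = 8-3) and the three AES states
## use %xmm6..%xmm8 (i counting up from 9-3); reg_i/reg_j are resolved by
## setreg.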
968 
969 .macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
970 	i = (8-\num_initial_blocks)
971 	setreg
972         vmovdqu AadHash(arg2), reg_i
973 
974 	# start AES for num_initial_blocks blocks
975 	vmovdqu CurCount(arg2), \CTR
976 
977 	i = (9-\num_initial_blocks)
978 	setreg
979 .rep \num_initial_blocks
980                 vpaddd  ONE(%rip), \CTR, \CTR		# INCR Y0
981                 vmovdqa \CTR, reg_i
982                 vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
983 	i = (i+1)
984 	setreg
985 .endr
986 
987 	vmovdqa  (arg1), \T_key
988 	i = (9-\num_initial_blocks)
989 	setreg
990 .rep \num_initial_blocks
991                 vpxor   \T_key, reg_i, reg_i
992 	i = (i+1)
993 	setreg
994 .endr
995 
996        j = 1
997        setreg
998 .rep \REP
999        vmovdqa  16*j(arg1), \T_key
1000 	i = (9-\num_initial_blocks)
1001 	setreg
1002 .rep \num_initial_blocks
1003         vaesenc \T_key, reg_i, reg_i
1004 	i = (i+1)
1005 	setreg
1006 .endr
1007 
1008        j = (j+1)
1009        setreg
1010 .endr
1011 
1012 	vmovdqa  16*j(arg1), \T_key
1013 	i = (9-\num_initial_blocks)
1014 	setreg
1015 .rep \num_initial_blocks
1016         vaesenclast      \T_key, reg_i, reg_i
1017 	i = (i+1)
1018 	setreg
1019 .endr
1020 
1021 	i = (9-\num_initial_blocks)
1022 	setreg
1023 .rep \num_initial_blocks
1024                 vmovdqu (arg4, %r11), \T1
1025                 vpxor   \T1, reg_i, reg_i
1026                 vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for num_initial_blocks blocks
1027                 add     $16, %r11
1028 .if  \ENC_DEC == DEC
1029                 vmovdqa \T1, reg_i
1030 .endif
1031                 vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
1032 	i = (i+1)
1033 	setreg
1034 .endr
1035 
1036 
1037 	i = (8-\num_initial_blocks)
1038 	j = (9-\num_initial_blocks)
1039 	setreg
1040 
1041 .rep \num_initial_blocks
1042         vpxor    reg_i, reg_j, reg_j
1043         GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1044 	i = (i+1)
1045 	j = (j+1)
1046 	setreg
1047 .endr
1048         # XMM8 has the combined result here
1049 
1050         vmovdqa  \XMM8, TMP1(%rsp)
1051         vmovdqa  \XMM8, \T3
1052 
1053         cmp     $128, %r13
1054         jl      .L_initial_blocks_done\@                  # no need for precomputed constants
1055 
1056 ###############################################################################
# HashKey_i_k holds the XOR of the high and low 64-bit halves of HashKey_i
1058                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1059                 vmovdqa  \CTR, \XMM1
1060                 vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
1061 
1062                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1063                 vmovdqa  \CTR, \XMM2
1064                 vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
1065 
1066                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1067                 vmovdqa  \CTR, \XMM3
1068                 vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
1069 
1070                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1071                 vmovdqa  \CTR, \XMM4
1072                 vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
1073 
1074                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1075                 vmovdqa  \CTR, \XMM5
1076                 vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
1077 
1078                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1079                 vmovdqa  \CTR, \XMM6
1080                 vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
1081 
1082                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1083                 vmovdqa  \CTR, \XMM7
1084                 vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
1085 
1086                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1087                 vmovdqa  \CTR, \XMM8
1088                 vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
1089 
1090                 vmovdqa  (arg1), \T_key
1091                 vpxor    \T_key, \XMM1, \XMM1
1092                 vpxor    \T_key, \XMM2, \XMM2
1093                 vpxor    \T_key, \XMM3, \XMM3
1094                 vpxor    \T_key, \XMM4, \XMM4
1095                 vpxor    \T_key, \XMM5, \XMM5
1096                 vpxor    \T_key, \XMM6, \XMM6
1097                 vpxor    \T_key, \XMM7, \XMM7
1098                 vpxor    \T_key, \XMM8, \XMM8
1099 
1100                i = 1
1101                setreg
1102 .rep    \REP       # do REP rounds
1103                 vmovdqa  16*i(arg1), \T_key
1104                 vaesenc  \T_key, \XMM1, \XMM1
1105                 vaesenc  \T_key, \XMM2, \XMM2
1106                 vaesenc  \T_key, \XMM3, \XMM3
1107                 vaesenc  \T_key, \XMM4, \XMM4
1108                 vaesenc  \T_key, \XMM5, \XMM5
1109                 vaesenc  \T_key, \XMM6, \XMM6
1110                 vaesenc  \T_key, \XMM7, \XMM7
1111                 vaesenc  \T_key, \XMM8, \XMM8
1112                i = (i+1)
1113                setreg
1114 .endr
1115 
1116                 vmovdqa  16*i(arg1), \T_key
1117                 vaesenclast  \T_key, \XMM1, \XMM1
1118                 vaesenclast  \T_key, \XMM2, \XMM2
1119                 vaesenclast  \T_key, \XMM3, \XMM3
1120                 vaesenclast  \T_key, \XMM4, \XMM4
1121                 vaesenclast  \T_key, \XMM5, \XMM5
1122                 vaesenclast  \T_key, \XMM6, \XMM6
1123                 vaesenclast  \T_key, \XMM7, \XMM7
1124                 vaesenclast  \T_key, \XMM8, \XMM8
1125 
1126                 vmovdqu  (arg4, %r11), \T1
1127                 vpxor    \T1, \XMM1, \XMM1
1128                 vmovdqu  \XMM1, (arg3 , %r11)
1129                 .if   \ENC_DEC == DEC
1130                 vmovdqa  \T1, \XMM1
1131                 .endif
1132 
1133                 vmovdqu  16*1(arg4, %r11), \T1
1134                 vpxor    \T1, \XMM2, \XMM2
1135                 vmovdqu  \XMM2, 16*1(arg3 , %r11)
1136                 .if   \ENC_DEC == DEC
1137                 vmovdqa  \T1, \XMM2
1138                 .endif
1139 
1140                 vmovdqu  16*2(arg4, %r11), \T1
1141                 vpxor    \T1, \XMM3, \XMM3
1142                 vmovdqu  \XMM3, 16*2(arg3 , %r11)
1143                 .if   \ENC_DEC == DEC
1144                 vmovdqa  \T1, \XMM3
1145                 .endif
1146 
1147                 vmovdqu  16*3(arg4, %r11), \T1
1148                 vpxor    \T1, \XMM4, \XMM4
1149                 vmovdqu  \XMM4, 16*3(arg3 , %r11)
1150                 .if   \ENC_DEC == DEC
1151                 vmovdqa  \T1, \XMM4
1152                 .endif
1153 
1154                 vmovdqu  16*4(arg4, %r11), \T1
1155                 vpxor    \T1, \XMM5, \XMM5
1156                 vmovdqu  \XMM5, 16*4(arg3 , %r11)
1157                 .if   \ENC_DEC == DEC
1158                 vmovdqa  \T1, \XMM5
1159                 .endif
1160 
1161                 vmovdqu  16*5(arg4, %r11), \T1
1162                 vpxor    \T1, \XMM6, \XMM6
1163                 vmovdqu  \XMM6, 16*5(arg3 , %r11)
1164                 .if   \ENC_DEC == DEC
1165                 vmovdqa  \T1, \XMM6
1166                 .endif
1167 
1168                 vmovdqu  16*6(arg4, %r11), \T1
1169                 vpxor    \T1, \XMM7, \XMM7
1170                 vmovdqu  \XMM7, 16*6(arg3 , %r11)
1171                 .if   \ENC_DEC == DEC
1172                 vmovdqa  \T1, \XMM7
1173                 .endif
1174 
1175                 vmovdqu  16*7(arg4, %r11), \T1
1176                 vpxor    \T1, \XMM8, \XMM8
1177                 vmovdqu  \XMM8, 16*7(arg3 , %r11)
1178                 .if   \ENC_DEC == DEC
1179                 vmovdqa  \T1, \XMM8
1180                 .endif
1181 
1182                 add     $128, %r11
1183 
1184                 vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
1185                 vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with the corresponding ciphertext
1186                 vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
1187                 vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
1188                 vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
1189                 vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
1190                 vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
1191                 vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
1192                 vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
1193 
1194 ###############################################################################
1195 
1196 .L_initial_blocks_done\@:
1197 
1198 .endm
1199 
1200 # encrypt 8 blocks at a time
1201 # ghash the 8 previously encrypted ciphertext blocks
1202 # arg1, arg2, arg3, arg4 are used as pointers only, not modified
1203 # r11 is the data offset value
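# The GHASH update is aggregated: the 8 previous blocks X1..X8 (X1 kept in a
# register, X2..X8 spilled to TMP2..TMP8) are folded as
#       X1*H^8 + X2*H^7 + ... + X8*H
# using the precomputed HashKey_8..HashKey powers, interleaved with the AES
# rounds of the next 8 counter blocks (two-tab indented) to hide latency.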
1204 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1205 
1206         vmovdqa \XMM1, \T2
1207         vmovdqa \XMM2, TMP2(%rsp)
1208         vmovdqa \XMM3, TMP3(%rsp)
1209         vmovdqa \XMM4, TMP4(%rsp)
1210         vmovdqa \XMM5, TMP5(%rsp)
1211         vmovdqa \XMM6, TMP6(%rsp)
1212         vmovdqa \XMM7, TMP7(%rsp)
1213         vmovdqa \XMM8, TMP8(%rsp)
1214 
1215 .if \loop_idx == in_order
1216                 vpaddd  ONE(%rip), \CTR, \XMM1           # INCR CNT
1217                 vpaddd  ONE(%rip), \XMM1, \XMM2
1218                 vpaddd  ONE(%rip), \XMM2, \XMM3
1219                 vpaddd  ONE(%rip), \XMM3, \XMM4
1220                 vpaddd  ONE(%rip), \XMM4, \XMM5
1221                 vpaddd  ONE(%rip), \XMM5, \XMM6
1222                 vpaddd  ONE(%rip), \XMM6, \XMM7
1223                 vpaddd  ONE(%rip), \XMM7, \XMM8
1224                 vmovdqa \XMM8, \CTR
1225 
1226                 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1    # perform a 16Byte swap
1227                 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2    # perform a 16Byte swap
1228                 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3    # perform a 16Byte swap
1229                 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4    # perform a 16Byte swap
1230                 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5    # perform a 16Byte swap
1231                 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6    # perform a 16Byte swap
1232                 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7    # perform a 16Byte swap
1233                 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8    # perform a 16Byte swap
1234 .else
1235                 vpaddd  ONEf(%rip), \CTR, \XMM1           # INCR CNT
1236                 vpaddd  ONEf(%rip), \XMM1, \XMM2
1237                 vpaddd  ONEf(%rip), \XMM2, \XMM3
1238                 vpaddd  ONEf(%rip), \XMM3, \XMM4
1239                 vpaddd  ONEf(%rip), \XMM4, \XMM5
1240                 vpaddd  ONEf(%rip), \XMM5, \XMM6
1241                 vpaddd  ONEf(%rip), \XMM6, \XMM7
1242                 vpaddd  ONEf(%rip), \XMM7, \XMM8
1243                 vmovdqa \XMM8, \CTR
1244 .endif
1245 
1246 
1247         #######################################################################
1248 
1249                 vmovdqu (arg1), \T1
1250                 vpxor   \T1, \XMM1, \XMM1
1251                 vpxor   \T1, \XMM2, \XMM2
1252                 vpxor   \T1, \XMM3, \XMM3
1253                 vpxor   \T1, \XMM4, \XMM4
1254                 vpxor   \T1, \XMM5, \XMM5
1255                 vpxor   \T1, \XMM6, \XMM6
1256                 vpxor   \T1, \XMM7, \XMM7
1257                 vpxor   \T1, \XMM8, \XMM8
1258 
1259         #######################################################################
1260 
1261 
1262 
1263 
1264 
1265                 vmovdqu 16*1(arg1), \T1
1266                 vaesenc \T1, \XMM1, \XMM1
1267                 vaesenc \T1, \XMM2, \XMM2
1268                 vaesenc \T1, \XMM3, \XMM3
1269                 vaesenc \T1, \XMM4, \XMM4
1270                 vaesenc \T1, \XMM5, \XMM5
1271                 vaesenc \T1, \XMM6, \XMM6
1272                 vaesenc \T1, \XMM7, \XMM7
1273                 vaesenc \T1, \XMM8, \XMM8
1274 
1275                 vmovdqu 16*2(arg1), \T1
1276                 vaesenc \T1, \XMM1, \XMM1
1277                 vaesenc \T1, \XMM2, \XMM2
1278                 vaesenc \T1, \XMM3, \XMM3
1279                 vaesenc \T1, \XMM4, \XMM4
1280                 vaesenc \T1, \XMM5, \XMM5
1281                 vaesenc \T1, \XMM6, \XMM6
1282                 vaesenc \T1, \XMM7, \XMM7
1283                 vaesenc \T1, \XMM8, \XMM8
1284 
1285 
1286         #######################################################################
1287 
1288         vmovdqu         HashKey_8(arg2), \T5
1289         vpclmulqdq      $0x11, \T5, \T2, \T4             # T4 = a1*b1
1290         vpclmulqdq      $0x00, \T5, \T2, \T7             # T7 = a0*b0
1291 
1292         vpshufd         $0b01001110, \T2, \T6
1293         vpxor           \T2, \T6, \T6
1294 
1295         vmovdqu         HashKey_8_k(arg2), \T5
1296         vpclmulqdq      $0x00, \T5, \T6, \T6
1297 
1298                 vmovdqu 16*3(arg1), \T1
1299                 vaesenc \T1, \XMM1, \XMM1
1300                 vaesenc \T1, \XMM2, \XMM2
1301                 vaesenc \T1, \XMM3, \XMM3
1302                 vaesenc \T1, \XMM4, \XMM4
1303                 vaesenc \T1, \XMM5, \XMM5
1304                 vaesenc \T1, \XMM6, \XMM6
1305                 vaesenc \T1, \XMM7, \XMM7
1306                 vaesenc \T1, \XMM8, \XMM8
1307 
1308         vmovdqa         TMP2(%rsp), \T1
1309         vmovdqu         HashKey_7(arg2), \T5
1310         vpclmulqdq      $0x11, \T5, \T1, \T3
1311         vpxor           \T3, \T4, \T4
1312         vpclmulqdq      $0x00, \T5, \T1, \T3
1313         vpxor           \T3, \T7, \T7
1314 
1315         vpshufd         $0b01001110, \T1, \T3
1316         vpxor           \T1, \T3, \T3
1317         vmovdqu         HashKey_7_k(arg2), \T5
1318         vpclmulqdq      $0x10, \T5, \T3, \T3
1319         vpxor           \T3, \T6, \T6
1320 
1321                 vmovdqu 16*4(arg1), \T1
1322                 vaesenc \T1, \XMM1, \XMM1
1323                 vaesenc \T1, \XMM2, \XMM2
1324                 vaesenc \T1, \XMM3, \XMM3
1325                 vaesenc \T1, \XMM4, \XMM4
1326                 vaesenc \T1, \XMM5, \XMM5
1327                 vaesenc \T1, \XMM6, \XMM6
1328                 vaesenc \T1, \XMM7, \XMM7
1329                 vaesenc \T1, \XMM8, \XMM8
1330 
1331         #######################################################################
1332 
1333         vmovdqa         TMP3(%rsp), \T1
1334         vmovdqu         HashKey_6(arg2), \T5
1335         vpclmulqdq      $0x11, \T5, \T1, \T3
1336         vpxor           \T3, \T4, \T4
1337         vpclmulqdq      $0x00, \T5, \T1, \T3
1338         vpxor           \T3, \T7, \T7
1339 
1340         vpshufd         $0b01001110, \T1, \T3
1341         vpxor           \T1, \T3, \T3
1342         vmovdqu         HashKey_6_k(arg2), \T5
1343         vpclmulqdq      $0x10, \T5, \T3, \T3
1344         vpxor           \T3, \T6, \T6
1345 
1346                 vmovdqu 16*5(arg1), \T1
1347                 vaesenc \T1, \XMM1, \XMM1
1348                 vaesenc \T1, \XMM2, \XMM2
1349                 vaesenc \T1, \XMM3, \XMM3
1350                 vaesenc \T1, \XMM4, \XMM4
1351                 vaesenc \T1, \XMM5, \XMM5
1352                 vaesenc \T1, \XMM6, \XMM6
1353                 vaesenc \T1, \XMM7, \XMM7
1354                 vaesenc \T1, \XMM8, \XMM8
1355 
1356         vmovdqa         TMP4(%rsp), \T1
1357         vmovdqu         HashKey_5(arg2), \T5
1358         vpclmulqdq      $0x11, \T5, \T1, \T3
1359         vpxor           \T3, \T4, \T4
1360         vpclmulqdq      $0x00, \T5, \T1, \T3
1361         vpxor           \T3, \T7, \T7
1362 
1363         vpshufd         $0b01001110, \T1, \T3
1364         vpxor           \T1, \T3, \T3
1365         vmovdqu         HashKey_5_k(arg2), \T5
1366         vpclmulqdq      $0x10, \T5, \T3, \T3
1367         vpxor           \T3, \T6, \T6
1368 
1369                 vmovdqu 16*6(arg1), \T1
1370                 vaesenc \T1, \XMM1, \XMM1
1371                 vaesenc \T1, \XMM2, \XMM2
1372                 vaesenc \T1, \XMM3, \XMM3
1373                 vaesenc \T1, \XMM4, \XMM4
1374                 vaesenc \T1, \XMM5, \XMM5
1375                 vaesenc \T1, \XMM6, \XMM6
1376                 vaesenc \T1, \XMM7, \XMM7
1377                 vaesenc \T1, \XMM8, \XMM8
1378 
1379 
1380         vmovdqa         TMP5(%rsp), \T1
1381         vmovdqu         HashKey_4(arg2), \T5
1382         vpclmulqdq      $0x11, \T5, \T1, \T3
1383         vpxor           \T3, \T4, \T4
1384         vpclmulqdq      $0x00, \T5, \T1, \T3
1385         vpxor           \T3, \T7, \T7
1386 
1387         vpshufd         $0b01001110, \T1, \T3
1388         vpxor           \T1, \T3, \T3
1389         vmovdqu         HashKey_4_k(arg2), \T5
1390         vpclmulqdq      $0x10, \T5, \T3, \T3
1391         vpxor           \T3, \T6, \T6
1392 
1393                 vmovdqu 16*7(arg1), \T1
1394                 vaesenc \T1, \XMM1, \XMM1
1395                 vaesenc \T1, \XMM2, \XMM2
1396                 vaesenc \T1, \XMM3, \XMM3
1397                 vaesenc \T1, \XMM4, \XMM4
1398                 vaesenc \T1, \XMM5, \XMM5
1399                 vaesenc \T1, \XMM6, \XMM6
1400                 vaesenc \T1, \XMM7, \XMM7
1401                 vaesenc \T1, \XMM8, \XMM8
1402 
1403         vmovdqa         TMP6(%rsp), \T1
1404         vmovdqu         HashKey_3(arg2), \T5
1405         vpclmulqdq      $0x11, \T5, \T1, \T3
1406         vpxor           \T3, \T4, \T4
1407         vpclmulqdq      $0x00, \T5, \T1, \T3
1408         vpxor           \T3, \T7, \T7
1409 
1410         vpshufd         $0b01001110, \T1, \T3
1411         vpxor           \T1, \T3, \T3
1412         vmovdqu         HashKey_3_k(arg2), \T5
1413         vpclmulqdq      $0x10, \T5, \T3, \T3
1414         vpxor           \T3, \T6, \T6
1415 
1416 
1417                 vmovdqu 16*8(arg1), \T1
1418                 vaesenc \T1, \XMM1, \XMM1
1419                 vaesenc \T1, \XMM2, \XMM2
1420                 vaesenc \T1, \XMM3, \XMM3
1421                 vaesenc \T1, \XMM4, \XMM4
1422                 vaesenc \T1, \XMM5, \XMM5
1423                 vaesenc \T1, \XMM6, \XMM6
1424                 vaesenc \T1, \XMM7, \XMM7
1425                 vaesenc \T1, \XMM8, \XMM8
1426 
1427         vmovdqa         TMP7(%rsp), \T1
1428         vmovdqu         HashKey_2(arg2), \T5
1429         vpclmulqdq      $0x11, \T5, \T1, \T3
1430         vpxor           \T3, \T4, \T4
1431         vpclmulqdq      $0x00, \T5, \T1, \T3
1432         vpxor           \T3, \T7, \T7
1433 
1434         vpshufd         $0b01001110, \T1, \T3
1435         vpxor           \T1, \T3, \T3
1436         vmovdqu         HashKey_2_k(arg2), \T5
1437         vpclmulqdq      $0x10, \T5, \T3, \T3
1438         vpxor           \T3, \T6, \T6
1439 
1440         #######################################################################
1441 
1442                 vmovdqu 16*9(arg1), \T5
1443                 vaesenc \T5, \XMM1, \XMM1
1444                 vaesenc \T5, \XMM2, \XMM2
1445                 vaesenc \T5, \XMM3, \XMM3
1446                 vaesenc \T5, \XMM4, \XMM4
1447                 vaesenc \T5, \XMM5, \XMM5
1448                 vaesenc \T5, \XMM6, \XMM6
1449                 vaesenc \T5, \XMM7, \XMM7
1450                 vaesenc \T5, \XMM8, \XMM8
1451 
1452         vmovdqa         TMP8(%rsp), \T1
1453         vmovdqu         HashKey(arg2), \T5
1454         vpclmulqdq      $0x11, \T5, \T1, \T3
1455         vpxor           \T3, \T4, \T4
1456         vpclmulqdq      $0x00, \T5, \T1, \T3
1457         vpxor           \T3, \T7, \T7
1458 
1459         vpshufd         $0b01001110, \T1, \T3
1460         vpxor           \T1, \T3, \T3
1461         vmovdqu         HashKey_k(arg2), \T5
1462         vpclmulqdq      $0x10, \T5, \T3, \T3
1463         vpxor           \T3, \T6, \T6
1464 
1465         vpxor           \T4, \T6, \T6
1466         vpxor           \T7, \T6, \T6
1467 
1468                 vmovdqu 16*10(arg1), \T5
1469 
1470         i = 11
1471         setreg
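        # \REP is 9, 11 or 13 for AES-128/192/256 (see the GCM_ENC_DEC
        # invocations below): the loop below runs the extra vaesenc rounds
        # needed for 192/256-bit keys and leaves the final round key in \T5
        # for the fused last-round pass that follows.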
1472 .rep (\REP-9)
1473 
1474         vaesenc \T5, \XMM1, \XMM1
1475         vaesenc \T5, \XMM2, \XMM2
1476         vaesenc \T5, \XMM3, \XMM3
1477         vaesenc \T5, \XMM4, \XMM4
1478         vaesenc \T5, \XMM5, \XMM5
1479         vaesenc \T5, \XMM6, \XMM6
1480         vaesenc \T5, \XMM7, \XMM7
1481         vaesenc \T5, \XMM8, \XMM8
1482 
1483         vmovdqu 16*i(arg1), \T5
1484         i = i + 1
1485         setreg
1486 .endr
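
	# The last AES round is fused with the CTR XOR: AESENCLAST ends by
	# XORing the round key, so pre-XORing each input block into the last
	# round key (\T2 = data ^ key) makes a single vaesenclast produce the
	# output block.  For decryption the ciphertext is reloaded into reg_j
	# so it can be GHASHed later, while the plaintext leaves through \T3.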
1487 
1488 	i = 0
1489 	j = 1
1490 	setreg
1491 .rep 8
1492 		vpxor	16*i(arg4, %r11), \T5, \T2
1493                 .if \ENC_DEC == ENC
1494                 vaesenclast     \T2, reg_j, reg_j
1495                 .else
1496                 vaesenclast     \T2, reg_j, \T3
1497                 vmovdqu 16*i(arg4, %r11), reg_j
1498                 vmovdqu \T3, 16*i(arg3, %r11)
1499                 .endif
1500 	i = (i+1)
1501 	j = (j+1)
1502 	setreg
1503 .endr
1504 	#######################################################################
1505 
1506 
1507 	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
	vpsrldq	$8, \T6, \T6				# shift-R T6 2 DWs
1509 	vpxor	\T3, \T7, \T7
1510 	vpxor	\T4, \T6, \T6				# accumulate the results in T6:T7
1511 
1512 
1513 
1514 	#######################################################################
1515 	#first phase of the reduction
1516 	#######################################################################
        vpslld  $31, \T7, \T2                           # packed left shifting << 31
        vpslld  $30, \T7, \T3                           # packed left shifting << 30
        vpslld  $25, \T7, \T4                           # packed left shifting << 25
1520 
1521         vpxor   \T3, \T2, \T2                           # xor the shifted versions
1522         vpxor   \T4, \T2, \T2
1523 
1524         vpsrldq $4, \T2, \T1                            # shift-R T1 1 DW
1525 
1526         vpslldq $12, \T2, \T2                           # shift-L T2 3 DWs
1527         vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
1528 	#######################################################################
1529                 .if \ENC_DEC == ENC
1530 		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
1531 		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
1532 		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
1533 		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
1534 		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
1535 		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
1536 		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
1537 		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
1538                 .endif
1539 
1540 	#######################################################################
1541 	#second phase of the reduction
        vpsrld  $1, \T7, \T2                            # packed right shifting >> 1
        vpsrld  $2, \T7, \T3                            # packed right shifting >> 2
        vpsrld  $7, \T7, \T4                            # packed right shifting >> 7
1545         vpxor   \T3, \T2, \T2                           # xor the shifted versions
1546         vpxor   \T4, \T2, \T2
1547 
1548         vpxor   \T1, \T2, \T2
1549         vpxor   \T2, \T7, \T7
1550         vpxor   \T7, \T6, \T6                           # the result is in T6
1551 	#######################################################################
1552 
1553 		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
1554 		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
1555 		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
1556 		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
1557 		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
1558 		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
1559 		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
1560 		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
1561 
1562 
1563 	vpxor	\T6, \XMM1, \XMM1
1564 
1565 
1566 
1567 .endm
1568 
1569 
# GHASH the last 8 ciphertext blocks.
1571 .macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1572 
1573         ## Karatsuba Method
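        ## With a = a1:a0 and b = b1:b0 split into 64-bit halves, the carry-less
        ## product is
        ##   a*b = a1*b1*x^128 ^ [(a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0]*x^64 ^ a0*b0
        ## so only three PCLMULQDQs are needed per block.  The (b1^b0) factors
        ## are the precomputed HashKey_i_k values; the a1*b1, a0*b0 and middle
        ## sums are accumulated in T6, T7 and XMM1 respectively.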
1574 
1575 
1576         vpshufd         $0b01001110, \XMM1, \T2
1577         vpxor           \XMM1, \T2, \T2
1578         vmovdqu         HashKey_8(arg2), \T5
1579         vpclmulqdq      $0x11, \T5, \XMM1, \T6
1580         vpclmulqdq      $0x00, \T5, \XMM1, \T7
1581 
1582         vmovdqu         HashKey_8_k(arg2), \T3
1583         vpclmulqdq      $0x00, \T3, \T2, \XMM1
1584 
1585         ######################
1586 
1587         vpshufd         $0b01001110, \XMM2, \T2
1588         vpxor           \XMM2, \T2, \T2
1589         vmovdqu         HashKey_7(arg2), \T5
1590         vpclmulqdq      $0x11, \T5, \XMM2, \T4
1591         vpxor           \T4, \T6, \T6
1592 
1593         vpclmulqdq      $0x00, \T5, \XMM2, \T4
1594         vpxor           \T4, \T7, \T7
1595 
1596         vmovdqu         HashKey_7_k(arg2), \T3
1597         vpclmulqdq      $0x00, \T3, \T2, \T2
1598         vpxor           \T2, \XMM1, \XMM1
1599 
1600         ######################
1601 
1602         vpshufd         $0b01001110, \XMM3, \T2
1603         vpxor           \XMM3, \T2, \T2
1604         vmovdqu         HashKey_6(arg2), \T5
1605         vpclmulqdq      $0x11, \T5, \XMM3, \T4
1606         vpxor           \T4, \T6, \T6
1607 
1608         vpclmulqdq      $0x00, \T5, \XMM3, \T4
1609         vpxor           \T4, \T7, \T7
1610 
1611         vmovdqu         HashKey_6_k(arg2), \T3
1612         vpclmulqdq      $0x00, \T3, \T2, \T2
1613         vpxor           \T2, \XMM1, \XMM1
1614 
1615         ######################
1616 
1617         vpshufd         $0b01001110, \XMM4, \T2
1618         vpxor           \XMM4, \T2, \T2
1619         vmovdqu         HashKey_5(arg2), \T5
1620         vpclmulqdq      $0x11, \T5, \XMM4, \T4
1621         vpxor           \T4, \T6, \T6
1622 
1623         vpclmulqdq      $0x00, \T5, \XMM4, \T4
1624         vpxor           \T4, \T7, \T7
1625 
1626         vmovdqu         HashKey_5_k(arg2), \T3
1627         vpclmulqdq      $0x00, \T3, \T2, \T2
1628         vpxor           \T2, \XMM1, \XMM1
1629 
1630         ######################
1631 
1632         vpshufd         $0b01001110, \XMM5, \T2
1633         vpxor           \XMM5, \T2, \T2
1634         vmovdqu         HashKey_4(arg2), \T5
1635         vpclmulqdq      $0x11, \T5, \XMM5, \T4
1636         vpxor           \T4, \T6, \T6
1637 
1638         vpclmulqdq      $0x00, \T5, \XMM5, \T4
1639         vpxor           \T4, \T7, \T7
1640 
1641         vmovdqu         HashKey_4_k(arg2), \T3
1642         vpclmulqdq      $0x00, \T3, \T2, \T2
1643         vpxor           \T2, \XMM1, \XMM1
1644 
1645         ######################
1646 
1647         vpshufd         $0b01001110, \XMM6, \T2
1648         vpxor           \XMM6, \T2, \T2
1649         vmovdqu         HashKey_3(arg2), \T5
1650         vpclmulqdq      $0x11, \T5, \XMM6, \T4
1651         vpxor           \T4, \T6, \T6
1652 
1653         vpclmulqdq      $0x00, \T5, \XMM6, \T4
1654         vpxor           \T4, \T7, \T7
1655 
1656         vmovdqu         HashKey_3_k(arg2), \T3
1657         vpclmulqdq      $0x00, \T3, \T2, \T2
1658         vpxor           \T2, \XMM1, \XMM1
1659 
1660         ######################
1661 
1662         vpshufd         $0b01001110, \XMM7, \T2
1663         vpxor           \XMM7, \T2, \T2
1664         vmovdqu         HashKey_2(arg2), \T5
1665         vpclmulqdq      $0x11, \T5, \XMM7, \T4
1666         vpxor           \T4, \T6, \T6
1667 
1668         vpclmulqdq      $0x00, \T5, \XMM7, \T4
1669         vpxor           \T4, \T7, \T7
1670 
1671         vmovdqu         HashKey_2_k(arg2), \T3
1672         vpclmulqdq      $0x00, \T3, \T2, \T2
1673         vpxor           \T2, \XMM1, \XMM1
1674 
1675         ######################
1676 
1677         vpshufd         $0b01001110, \XMM8, \T2
1678         vpxor           \XMM8, \T2, \T2
1679         vmovdqu         HashKey(arg2), \T5
1680         vpclmulqdq      $0x11, \T5, \XMM8, \T4
1681         vpxor           \T4, \T6, \T6
1682 
1683         vpclmulqdq      $0x00, \T5, \XMM8, \T4
1684         vpxor           \T4, \T7, \T7
1685 
1686         vmovdqu         HashKey_k(arg2), \T3
1687         vpclmulqdq      $0x00, \T3, \T2, \T2
1688 
1689         vpxor           \T2, \XMM1, \XMM1
1690         vpxor           \T6, \XMM1, \XMM1
1691         vpxor           \T7, \XMM1, \T2
1692 
1693 
1694 
1695 
1696         vpslldq $8, \T2, \T4
1697         vpsrldq $8, \T2, \T2
1698 
1699         vpxor   \T4, \T7, \T7
1700         vpxor   \T2, \T6, \T6   # <T6:T7> holds the result of
1701 				# the accumulated carry-less multiplications
1702 
1703         #######################################################################
1704         #first phase of the reduction
        vpslld  $31, \T7, \T2   # packed left shifting << 31
        vpslld  $30, \T7, \T3   # packed left shifting << 30
        vpslld  $25, \T7, \T4   # packed left shifting << 25
1708 
1709         vpxor   \T3, \T2, \T2   # xor the shifted versions
1710         vpxor   \T4, \T2, \T2
1711 
1712         vpsrldq $4, \T2, \T1    # shift-R T1 1 DW
1713 
1714         vpslldq $12, \T2, \T2   # shift-L T2 3 DWs
1715         vpxor   \T2, \T7, \T7   # first phase of the reduction complete
1716         #######################################################################
1717 
1718 
1719         #second phase of the reduction
        vpsrld  $1, \T7, \T2    # packed right shifting >> 1
        vpsrld  $2, \T7, \T3    # packed right shifting >> 2
        vpsrld  $7, \T7, \T4    # packed right shifting >> 7
1723         vpxor   \T3, \T2, \T2   # xor the shifted versions
1724         vpxor   \T4, \T2, \T2
1725 
1726         vpxor   \T1, \T2, \T2
1727         vpxor   \T2, \T7, \T7
1728         vpxor   \T7, \T6, \T6   # the result is in T6
1729 
1730 .endm
1731 
1732 #############################################################
#void   aesni_gcm_init_avx_gen2
#        (gcm_data     *my_ctx_data,
#         gcm_context_data *data,
#        u8      *iv, /* Pre-counter block j0: 4 byte salt
#			(from Security Association) concatenated with 8 byte
#			Initialisation Vector (from IPSec ESP Payload)
#			concatenated with 0x00000001. 16-byte aligned pointer. */
#        u8     *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1743 #############################################################
1744 SYM_FUNC_START(aesni_gcm_init_avx_gen2)
1745         FUNC_SAVE
1746         INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
1747         FUNC_RESTORE
1748         RET
1749 SYM_FUNC_END(aesni_gcm_init_avx_gen2)
1750 
1751 ###############################################################################
1752 #void   aesni_gcm_enc_update_avx_gen2(
1753 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1754 #        gcm_context_data *data,
1755 #        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
1756 #        const   u8 *in, /* Plaintext input */
1757 #        u64     plaintext_len) /* Length of data in Bytes for encryption. */
1758 ###############################################################################
1759 SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
1760         FUNC_SAVE
1761         mov     keysize, %eax
1762         cmp     $32, %eax
1763         je      key_256_enc_update
1764         cmp     $16, %eax
1765         je      key_128_enc_update
1766         # must be 192
1767         GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
1768         FUNC_RESTORE
1769         RET
1770 key_128_enc_update:
1771         GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
1772         FUNC_RESTORE
1773         RET
1774 key_256_enc_update:
1775         GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
1776         FUNC_RESTORE
1777         RET
1778 SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
1779 
1780 ###############################################################################
1781 #void   aesni_gcm_dec_update_avx_gen2(
1782 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1783 #        gcm_context_data *data,
1784 #        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
1785 #        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
1787 ###############################################################################
1788 SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
1789         FUNC_SAVE
1790         mov     keysize,%eax
1791         cmp     $32, %eax
1792         je      key_256_dec_update
1793         cmp     $16, %eax
1794         je      key_128_dec_update
1795         # must be 192
1796         GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
1797         FUNC_RESTORE
1798         RET
1799 key_128_dec_update:
1800         GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
1801         FUNC_RESTORE
1802         RET
1803 key_256_dec_update:
1804         GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
1805         FUNC_RESTORE
1806         RET
1807 SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
1808 
1809 ###############################################################################
1810 #void   aesni_gcm_finalize_avx_gen2(
1811 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1812 #        gcm_context_data *data,
1813 #        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
1815 #				Valid values are 16 (most likely), 12 or 8. */
1816 ###############################################################################
1817 SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
1818         FUNC_SAVE
1819         mov	keysize,%eax
1820         cmp     $32, %eax
1821         je      key_256_finalize
1822         cmp     $16, %eax
1823         je      key_128_finalize
1824         # must be 192
1825         GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
1826         FUNC_RESTORE
1827         RET
1828 key_128_finalize:
1829         GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
1830         FUNC_RESTORE
1831         RET
1832 key_256_finalize:
1833         GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
1834         FUNC_RESTORE
1835         RET
1836 SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
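
###############################################################################
# Illustrative call order (hypothetical caller, for documentation only),
# using the parameter names from the prototypes above:
#
#       aesni_gcm_init_avx_gen2(ctx, data, iv, hash_subkey, aad, aad_len);
#       aesni_gcm_enc_update_avx_gen2(ctx, data, out, in, plaintext_len);
#       aesni_gcm_finalize_avx_gen2(ctx, data, auth_tag, auth_tag_len);
#
# Decryption substitutes aesni_gcm_dec_update_avx_gen2 for the enc update and
# checks the tag produced by the same finalize call; ctx/data stand for the
# gcm_data / gcm_context_data areas described above.
###############################################################################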
1837 
1838 ###############################################################################
1839 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1840 # Input: A and B (128-bits each, bit-reflected)
1841 # Output: C = A*B*x mod poly, (i.e. >>1 )
1842 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1843 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1844 ###############################################################################
1845 .macro  GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1846 
1847         vpclmulqdq      $0x11,\HK,\GH,\T1      # T1 = a1*b1
1848         vpclmulqdq      $0x00,\HK,\GH,\T2      # T2 = a0*b0
1849         vpclmulqdq      $0x01,\HK,\GH,\T3      # T3 = a1*b0
1850         vpclmulqdq      $0x10,\HK,\GH,\GH      # GH = a0*b1
1851         vpxor           \T3, \GH, \GH
1852 
1853 
1854         vpsrldq         $8 , \GH, \T3          # shift-R GH 2 DWs
1855         vpslldq         $8 , \GH, \GH          # shift-L GH 2 DWs
1856 
1857         vpxor           \T3, \T1, \T1
1858         vpxor           \T2, \GH, \GH
1859 
1860         #######################################################################
1861         #first phase of the reduction
1862         vmovdqa         POLY2(%rip), \T3
1863 
1864         vpclmulqdq      $0x01, \GH, \T3, \T2
1865         vpslldq         $8, \T2, \T2           # shift-L T2 2 DWs
1866 
1867         vpxor           \T2, \GH, \GH          # first phase of the reduction complete
1868         #######################################################################
1869         #second phase of the reduction
1870         vpclmulqdq      $0x00, \GH, \T3, \T2
1871         vpsrldq         $4, \T2, \T2           # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1872 
1873         vpclmulqdq      $0x10, \GH, \T3, \GH
1874         vpslldq         $4, \GH, \GH           # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1875 
1876         vpxor           \T2, \GH, \GH          # second phase of the reduction complete
1877         #######################################################################
1878         vpxor           \T1, \GH, \GH          # the result is in GH
1879 
1880 
1881 .endm
1882 
1883 .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1884 
        # Precompute the HashKey^i powers used by the 8-block GHASH macros.
        # Unlike the gen2 PRECOMPUTE_AVX, no HashKey_i_k values (XOR of the low
        # and high halves of HashKey^i) are stored; the gen4 GHASH macros build
        # those Karatsuba factors on the fly.
1886         vmovdqa  \HK, \T5
1887         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^2<<1 mod poly
1888         vmovdqu  \T5, HashKey_2(arg2)                       #  [HashKey_2] = HashKey^2<<1 mod poly
1889 
1890         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^3<<1 mod poly
1891         vmovdqu  \T5, HashKey_3(arg2)
1892 
1893         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^4<<1 mod poly
1894         vmovdqu  \T5, HashKey_4(arg2)
1895 
1896         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^5<<1 mod poly
1897         vmovdqu  \T5, HashKey_5(arg2)
1898 
1899         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^6<<1 mod poly
1900         vmovdqu  \T5, HashKey_6(arg2)
1901 
1902         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^7<<1 mod poly
1903         vmovdqu  \T5, HashKey_7(arg2)
1904 
1905         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^8<<1 mod poly
1906         vmovdqu  \T5, HashKey_8(arg2)
1907 
1908 .endm
1909 
1910 ## if a = number of total plaintext bytes
1911 ## b = floor(a/16)
## num_initial_blocks = b mod 8
## encrypt the initial num_initial_blocks blocks and apply GHASH on the ciphertext
1914 ## r10, r11, r12, rax are clobbered
1915 ## arg1, arg2, arg3, arg4 are used as pointers only, not modified
1916 
1917 .macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1918 	i = (8-\num_initial_blocks)
1919 	setreg
1920 	vmovdqu AadHash(arg2), reg_i
1921 
1922 	# start AES for num_initial_blocks blocks
1923 	vmovdqu CurCount(arg2), \CTR
1924 
1925 	i = (9-\num_initial_blocks)
1926 	setreg
1927 .rep \num_initial_blocks
1928                 vpaddd  ONE(%rip), \CTR, \CTR   # INCR Y0
1929                 vmovdqa \CTR, reg_i
1930                 vpshufb SHUF_MASK(%rip), reg_i, reg_i     # perform a 16Byte swap
1931 	i = (i+1)
1932 	setreg
1933 .endr
1934 
1935 	vmovdqa  (arg1), \T_key
1936 	i = (9-\num_initial_blocks)
1937 	setreg
1938 .rep \num_initial_blocks
1939                 vpxor   \T_key, reg_i, reg_i
1940 	i = (i+1)
1941 	setreg
1942 .endr
1943 
1944 	j = 1
1945 	setreg
1946 .rep \REP
1947 	vmovdqa  16*j(arg1), \T_key
1948 	i = (9-\num_initial_blocks)
1949 	setreg
1950 .rep \num_initial_blocks
1951         vaesenc \T_key, reg_i, reg_i
1952 	i = (i+1)
1953 	setreg
1954 .endr
1955 
1956 	j = (j+1)
1957 	setreg
1958 .endr
1959 
1960 
1961 	vmovdqa  16*j(arg1), \T_key
1962 	i = (9-\num_initial_blocks)
1963 	setreg
1964 .rep \num_initial_blocks
1965         vaesenclast      \T_key, reg_i, reg_i
1966 	i = (i+1)
1967 	setreg
1968 .endr
1969 
1970 	i = (9-\num_initial_blocks)
1971 	setreg
1972 .rep \num_initial_blocks
1973                 vmovdqu (arg4, %r11), \T1
1974                 vpxor   \T1, reg_i, reg_i
1975                 vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for
1976 						       # num_initial_blocks blocks
1977                 add     $16, %r11
1978 .if  \ENC_DEC == DEC
1979                 vmovdqa \T1, reg_i
1980 .endif
1981                 vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
1982 	i = (i+1)
1983 	setreg
1984 .endr
1985 
1986 
1987 	i = (8-\num_initial_blocks)
1988 	j = (9-\num_initial_blocks)
1989 	setreg
1990 
1991 .rep \num_initial_blocks
1992         vpxor    reg_i, reg_j, reg_j
1993         GHASH_MUL_AVX2       reg_j, \T2, \T1, \T3, \T4, \T5, \T6  # apply GHASH on num_initial_blocks blocks
1994 	i = (i+1)
1995 	j = (j+1)
1996 	setreg
1997 .endr
1998         # XMM8 has the combined result here
1999 
2000         vmovdqa  \XMM8, TMP1(%rsp)
2001         vmovdqa  \XMM8, \T3
2002 
2003         cmp     $128, %r13
2004         jl      .L_initial_blocks_done\@                  # no need for precomputed constants
2005 
2006 ###############################################################################
# Encrypt the next 8 counter blocks up front so the 8-block main loop always
# has a full set of previously encrypted blocks to GHASH.
2008                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2009                 vmovdqa  \CTR, \XMM1
2010                 vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
2011 
2012                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2013                 vmovdqa  \CTR, \XMM2
2014                 vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
2015 
2016                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2017                 vmovdqa  \CTR, \XMM3
2018                 vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
2019 
2020                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2021                 vmovdqa  \CTR, \XMM4
2022                 vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
2023 
2024                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2025                 vmovdqa  \CTR, \XMM5
2026                 vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
2027 
2028                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2029                 vmovdqa  \CTR, \XMM6
2030                 vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
2031 
2032                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2033                 vmovdqa  \CTR, \XMM7
2034                 vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
2035 
2036                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2037                 vmovdqa  \CTR, \XMM8
2038                 vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
2039 
2040                 vmovdqa  (arg1), \T_key
2041                 vpxor    \T_key, \XMM1, \XMM1
2042                 vpxor    \T_key, \XMM2, \XMM2
2043                 vpxor    \T_key, \XMM3, \XMM3
2044                 vpxor    \T_key, \XMM4, \XMM4
2045                 vpxor    \T_key, \XMM5, \XMM5
2046                 vpxor    \T_key, \XMM6, \XMM6
2047                 vpxor    \T_key, \XMM7, \XMM7
2048                 vpxor    \T_key, \XMM8, \XMM8
2049 
2050 		i = 1
2051 		setreg
2052 .rep    \REP       # do REP rounds
2053                 vmovdqa  16*i(arg1), \T_key
2054                 vaesenc  \T_key, \XMM1, \XMM1
2055                 vaesenc  \T_key, \XMM2, \XMM2
2056                 vaesenc  \T_key, \XMM3, \XMM3
2057                 vaesenc  \T_key, \XMM4, \XMM4
2058                 vaesenc  \T_key, \XMM5, \XMM5
2059                 vaesenc  \T_key, \XMM6, \XMM6
2060                 vaesenc  \T_key, \XMM7, \XMM7
2061                 vaesenc  \T_key, \XMM8, \XMM8
2062 		i = (i+1)
2063 		setreg
2064 .endr
2065 
2066 
2067                 vmovdqa  16*i(arg1), \T_key
2068                 vaesenclast  \T_key, \XMM1, \XMM1
2069                 vaesenclast  \T_key, \XMM2, \XMM2
2070                 vaesenclast  \T_key, \XMM3, \XMM3
2071                 vaesenclast  \T_key, \XMM4, \XMM4
2072                 vaesenclast  \T_key, \XMM5, \XMM5
2073                 vaesenclast  \T_key, \XMM6, \XMM6
2074                 vaesenclast  \T_key, \XMM7, \XMM7
2075                 vaesenclast  \T_key, \XMM8, \XMM8
2076 
2077                 vmovdqu  (arg4, %r11), \T1
2078                 vpxor    \T1, \XMM1, \XMM1
2079                 vmovdqu  \XMM1, (arg3 , %r11)
2080                 .if   \ENC_DEC == DEC
2081                 vmovdqa  \T1, \XMM1
2082                 .endif
2083 
2084                 vmovdqu  16*1(arg4, %r11), \T1
2085                 vpxor    \T1, \XMM2, \XMM2
2086                 vmovdqu  \XMM2, 16*1(arg3 , %r11)
2087                 .if   \ENC_DEC == DEC
2088                 vmovdqa  \T1, \XMM2
2089                 .endif
2090 
2091                 vmovdqu  16*2(arg4, %r11), \T1
2092                 vpxor    \T1, \XMM3, \XMM3
2093                 vmovdqu  \XMM3, 16*2(arg3 , %r11)
2094                 .if   \ENC_DEC == DEC
2095                 vmovdqa  \T1, \XMM3
2096                 .endif
2097 
2098                 vmovdqu  16*3(arg4, %r11), \T1
2099                 vpxor    \T1, \XMM4, \XMM4
2100                 vmovdqu  \XMM4, 16*3(arg3 , %r11)
2101                 .if   \ENC_DEC == DEC
2102                 vmovdqa  \T1, \XMM4
2103                 .endif
2104 
2105                 vmovdqu  16*4(arg4, %r11), \T1
2106                 vpxor    \T1, \XMM5, \XMM5
2107                 vmovdqu  \XMM5, 16*4(arg3 , %r11)
2108                 .if   \ENC_DEC == DEC
2109                 vmovdqa  \T1, \XMM5
2110                 .endif
2111 
2112                 vmovdqu  16*5(arg4, %r11), \T1
2113                 vpxor    \T1, \XMM6, \XMM6
2114                 vmovdqu  \XMM6, 16*5(arg3 , %r11)
2115                 .if   \ENC_DEC == DEC
2116                 vmovdqa  \T1, \XMM6
2117                 .endif
2118 
2119                 vmovdqu  16*6(arg4, %r11), \T1
2120                 vpxor    \T1, \XMM7, \XMM7
2121                 vmovdqu  \XMM7, 16*6(arg3 , %r11)
2122                 .if   \ENC_DEC == DEC
2123                 vmovdqa  \T1, \XMM7
2124                 .endif
2125 
2126                 vmovdqu  16*7(arg4, %r11), \T1
2127                 vpxor    \T1, \XMM8, \XMM8
2128                 vmovdqu  \XMM8, 16*7(arg3 , %r11)
2129                 .if   \ENC_DEC == DEC
2130                 vmovdqa  \T1, \XMM8
2131                 .endif
2132 
2133                 add     $128, %r11
2134 
2135                 vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
2136                 vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with
2137 							   # the corresponding ciphertext
2138                 vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
2139                 vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
2140                 vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
2141                 vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
2142                 vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
2143                 vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
2144                 vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
2145 
2146 ###############################################################################
2147 
2148 .L_initial_blocks_done\@:
2149 
2150 
2151 .endm
2152 
2153 
2154 
2155 # encrypt 8 blocks at a time
2156 # ghash the 8 previously encrypted ciphertext blocks
2157 # arg1, arg2, arg3, arg4 are used as pointers only, not modified
2158 # r11 is the data offset value
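# The AES rounds for the current 8 counter blocks are interleaved with the
# PCLMULQDQ/XOR steps that GHASH the 8 previous ciphertext blocks (saved in
# TMP2-TMP8 and \T2), so the AESENC and PCLMULQDQ work can overlap.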
2159 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2160 
2161         vmovdqa \XMM1, \T2
2162         vmovdqa \XMM2, TMP2(%rsp)
2163         vmovdqa \XMM3, TMP3(%rsp)
2164         vmovdqa \XMM4, TMP4(%rsp)
2165         vmovdqa \XMM5, TMP5(%rsp)
2166         vmovdqa \XMM6, TMP6(%rsp)
2167         vmovdqa \XMM7, TMP7(%rsp)
2168         vmovdqa \XMM8, TMP8(%rsp)
2169 
2170 .if \loop_idx == in_order
2171                 vpaddd  ONE(%rip), \CTR, \XMM1            # INCR CNT
2172                 vpaddd  ONE(%rip), \XMM1, \XMM2
2173                 vpaddd  ONE(%rip), \XMM2, \XMM3
2174                 vpaddd  ONE(%rip), \XMM3, \XMM4
2175                 vpaddd  ONE(%rip), \XMM4, \XMM5
2176                 vpaddd  ONE(%rip), \XMM5, \XMM6
2177                 vpaddd  ONE(%rip), \XMM6, \XMM7
2178                 vpaddd  ONE(%rip), \XMM7, \XMM8
2179                 vmovdqa \XMM8, \CTR
2180 
2181                 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
2182                 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
2183                 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
2184                 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
2185                 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
2186                 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
2187                 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
2188                 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
2189 .else
2190                 vpaddd  ONEf(%rip), \CTR, \XMM1            # INCR CNT
2191                 vpaddd  ONEf(%rip), \XMM1, \XMM2
2192                 vpaddd  ONEf(%rip), \XMM2, \XMM3
2193                 vpaddd  ONEf(%rip), \XMM3, \XMM4
2194                 vpaddd  ONEf(%rip), \XMM4, \XMM5
2195                 vpaddd  ONEf(%rip), \XMM5, \XMM6
2196                 vpaddd  ONEf(%rip), \XMM6, \XMM7
2197                 vpaddd  ONEf(%rip), \XMM7, \XMM8
2198                 vmovdqa \XMM8, \CTR
2199 .endif
2200 
2201 
2202         #######################################################################
2203 
2204                 vmovdqu (arg1), \T1
2205                 vpxor   \T1, \XMM1, \XMM1
2206                 vpxor   \T1, \XMM2, \XMM2
2207                 vpxor   \T1, \XMM3, \XMM3
2208                 vpxor   \T1, \XMM4, \XMM4
2209                 vpxor   \T1, \XMM5, \XMM5
2210                 vpxor   \T1, \XMM6, \XMM6
2211                 vpxor   \T1, \XMM7, \XMM7
2212                 vpxor   \T1, \XMM8, \XMM8
2213 
2214         #######################################################################
2215 
2216 
2217 
2218 
2219 
2220                 vmovdqu 16*1(arg1), \T1
2221                 vaesenc \T1, \XMM1, \XMM1
2222                 vaesenc \T1, \XMM2, \XMM2
2223                 vaesenc \T1, \XMM3, \XMM3
2224                 vaesenc \T1, \XMM4, \XMM4
2225                 vaesenc \T1, \XMM5, \XMM5
2226                 vaesenc \T1, \XMM6, \XMM6
2227                 vaesenc \T1, \XMM7, \XMM7
2228                 vaesenc \T1, \XMM8, \XMM8
2229 
2230                 vmovdqu 16*2(arg1), \T1
2231                 vaesenc \T1, \XMM1, \XMM1
2232                 vaesenc \T1, \XMM2, \XMM2
2233                 vaesenc \T1, \XMM3, \XMM3
2234                 vaesenc \T1, \XMM4, \XMM4
2235                 vaesenc \T1, \XMM5, \XMM5
2236                 vaesenc \T1, \XMM6, \XMM6
2237                 vaesenc \T1, \XMM7, \XMM7
2238                 vaesenc \T1, \XMM8, \XMM8
2239 
2240 
2241         #######################################################################
2242 
2243         vmovdqu         HashKey_8(arg2), \T5
2244         vpclmulqdq      $0x11, \T5, \T2, \T4              # T4 = a1*b1
2245         vpclmulqdq      $0x00, \T5, \T2, \T7              # T7 = a0*b0
2246         vpclmulqdq      $0x01, \T5, \T2, \T6              # T6 = a1*b0
2247         vpclmulqdq      $0x10, \T5, \T2, \T5              # T5 = a0*b1
2248         vpxor           \T5, \T6, \T6
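        # gen4 variant: each block contributes four PCLMULQDQ partial products
        # (a1*b1, a0*b0, a1*b0, a0*b1), accumulated in T4, T7 and T6, rather
        # than the three-multiply Karatsuba form with HashKey_i_k used by the
        # gen2 parallel macro.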
2249 
2250                 vmovdqu 16*3(arg1), \T1
2251                 vaesenc \T1, \XMM1, \XMM1
2252                 vaesenc \T1, \XMM2, \XMM2
2253                 vaesenc \T1, \XMM3, \XMM3
2254                 vaesenc \T1, \XMM4, \XMM4
2255                 vaesenc \T1, \XMM5, \XMM5
2256                 vaesenc \T1, \XMM6, \XMM6
2257                 vaesenc \T1, \XMM7, \XMM7
2258                 vaesenc \T1, \XMM8, \XMM8
2259 
2260         vmovdqa         TMP2(%rsp), \T1
2261         vmovdqu         HashKey_7(arg2), \T5
2262         vpclmulqdq      $0x11, \T5, \T1, \T3
2263         vpxor           \T3, \T4, \T4
2264 
2265         vpclmulqdq      $0x00, \T5, \T1, \T3
2266         vpxor           \T3, \T7, \T7
2267 
2268         vpclmulqdq      $0x01, \T5, \T1, \T3
2269         vpxor           \T3, \T6, \T6
2270 
2271         vpclmulqdq      $0x10, \T5, \T1, \T3
2272         vpxor           \T3, \T6, \T6
2273 
2274                 vmovdqu 16*4(arg1), \T1
2275                 vaesenc \T1, \XMM1, \XMM1
2276                 vaesenc \T1, \XMM2, \XMM2
2277                 vaesenc \T1, \XMM3, \XMM3
2278                 vaesenc \T1, \XMM4, \XMM4
2279                 vaesenc \T1, \XMM5, \XMM5
2280                 vaesenc \T1, \XMM6, \XMM6
2281                 vaesenc \T1, \XMM7, \XMM7
2282                 vaesenc \T1, \XMM8, \XMM8
2283 
2284         #######################################################################
2285 
2286         vmovdqa         TMP3(%rsp), \T1
2287         vmovdqu         HashKey_6(arg2), \T5
2288         vpclmulqdq      $0x11, \T5, \T1, \T3
2289         vpxor           \T3, \T4, \T4
2290 
2291         vpclmulqdq      $0x00, \T5, \T1, \T3
2292         vpxor           \T3, \T7, \T7
2293 
2294         vpclmulqdq      $0x01, \T5, \T1, \T3
2295         vpxor           \T3, \T6, \T6
2296 
2297         vpclmulqdq      $0x10, \T5, \T1, \T3
2298         vpxor           \T3, \T6, \T6
2299 
2300                 vmovdqu 16*5(arg1), \T1
2301                 vaesenc \T1, \XMM1, \XMM1
2302                 vaesenc \T1, \XMM2, \XMM2
2303                 vaesenc \T1, \XMM3, \XMM3
2304                 vaesenc \T1, \XMM4, \XMM4
2305                 vaesenc \T1, \XMM5, \XMM5
2306                 vaesenc \T1, \XMM6, \XMM6
2307                 vaesenc \T1, \XMM7, \XMM7
2308                 vaesenc \T1, \XMM8, \XMM8
2309 
2310         vmovdqa         TMP4(%rsp), \T1
2311         vmovdqu         HashKey_5(arg2), \T5
2312         vpclmulqdq      $0x11, \T5, \T1, \T3
2313         vpxor           \T3, \T4, \T4
2314 
2315         vpclmulqdq      $0x00, \T5, \T1, \T3
2316         vpxor           \T3, \T7, \T7
2317 
2318         vpclmulqdq      $0x01, \T5, \T1, \T3
2319         vpxor           \T3, \T6, \T6
2320 
2321         vpclmulqdq      $0x10, \T5, \T1, \T3
2322         vpxor           \T3, \T6, \T6
2323 
2324                 vmovdqu 16*6(arg1), \T1
2325                 vaesenc \T1, \XMM1, \XMM1
2326                 vaesenc \T1, \XMM2, \XMM2
2327                 vaesenc \T1, \XMM3, \XMM3
2328                 vaesenc \T1, \XMM4, \XMM4
2329                 vaesenc \T1, \XMM5, \XMM5
2330                 vaesenc \T1, \XMM6, \XMM6
2331                 vaesenc \T1, \XMM7, \XMM7
2332                 vaesenc \T1, \XMM8, \XMM8
2333 
2334 
2335         vmovdqa         TMP5(%rsp), \T1
2336         vmovdqu         HashKey_4(arg2), \T5
2337         vpclmulqdq      $0x11, \T5, \T1, \T3
2338         vpxor           \T3, \T4, \T4
2339 
2340         vpclmulqdq      $0x00, \T5, \T1, \T3
2341         vpxor           \T3, \T7, \T7
2342 
2343         vpclmulqdq      $0x01, \T5, \T1, \T3
2344         vpxor           \T3, \T6, \T6
2345 
2346         vpclmulqdq      $0x10, \T5, \T1, \T3
2347         vpxor           \T3, \T6, \T6
2348 
2349                 vmovdqu 16*7(arg1), \T1
2350                 vaesenc \T1, \XMM1, \XMM1
2351                 vaesenc \T1, \XMM2, \XMM2
2352                 vaesenc \T1, \XMM3, \XMM3
2353                 vaesenc \T1, \XMM4, \XMM4
2354                 vaesenc \T1, \XMM5, \XMM5
2355                 vaesenc \T1, \XMM6, \XMM6
2356                 vaesenc \T1, \XMM7, \XMM7
2357                 vaesenc \T1, \XMM8, \XMM8
2358 
2359         vmovdqa         TMP6(%rsp), \T1
2360         vmovdqu         HashKey_3(arg2), \T5
2361         vpclmulqdq      $0x11, \T5, \T1, \T3
2362         vpxor           \T3, \T4, \T4
2363 
2364         vpclmulqdq      $0x00, \T5, \T1, \T3
2365         vpxor           \T3, \T7, \T7
2366 
2367         vpclmulqdq      $0x01, \T5, \T1, \T3
2368         vpxor           \T3, \T6, \T6
2369 
2370         vpclmulqdq      $0x10, \T5, \T1, \T3
2371         vpxor           \T3, \T6, \T6
2372 
2373                 vmovdqu 16*8(arg1), \T1
2374                 vaesenc \T1, \XMM1, \XMM1
2375                 vaesenc \T1, \XMM2, \XMM2
2376                 vaesenc \T1, \XMM3, \XMM3
2377                 vaesenc \T1, \XMM4, \XMM4
2378                 vaesenc \T1, \XMM5, \XMM5
2379                 vaesenc \T1, \XMM6, \XMM6
2380                 vaesenc \T1, \XMM7, \XMM7
2381                 vaesenc \T1, \XMM8, \XMM8
2382 
2383         vmovdqa         TMP7(%rsp), \T1
2384         vmovdqu         HashKey_2(arg2), \T5
2385         vpclmulqdq      $0x11, \T5, \T1, \T3
2386         vpxor           \T3, \T4, \T4
2387 
2388         vpclmulqdq      $0x00, \T5, \T1, \T3
2389         vpxor           \T3, \T7, \T7
2390 
2391         vpclmulqdq      $0x01, \T5, \T1, \T3
2392         vpxor           \T3, \T6, \T6
2393 
2394         vpclmulqdq      $0x10, \T5, \T1, \T3
2395         vpxor           \T3, \T6, \T6
2396 
2397 
2398         #######################################################################
2399 
2400                 vmovdqu 16*9(arg1), \T5
2401                 vaesenc \T5, \XMM1, \XMM1
2402                 vaesenc \T5, \XMM2, \XMM2
2403                 vaesenc \T5, \XMM3, \XMM3
2404                 vaesenc \T5, \XMM4, \XMM4
2405                 vaesenc \T5, \XMM5, \XMM5
2406                 vaesenc \T5, \XMM6, \XMM6
2407                 vaesenc \T5, \XMM7, \XMM7
2408                 vaesenc \T5, \XMM8, \XMM8
2409 
2410         vmovdqa         TMP8(%rsp), \T1
2411         vmovdqu         HashKey(arg2), \T5
2412 
2413         vpclmulqdq      $0x00, \T5, \T1, \T3
2414         vpxor           \T3, \T7, \T7
2415 
2416         vpclmulqdq      $0x01, \T5, \T1, \T3
2417         vpxor           \T3, \T6, \T6
2418 
2419         vpclmulqdq      $0x10, \T5, \T1, \T3
2420         vpxor           \T3, \T6, \T6
2421 
2422         vpclmulqdq      $0x11, \T5, \T1, \T3
2423         vpxor           \T3, \T4, \T1
2424 
2425 
2426                 vmovdqu 16*10(arg1), \T5
2427 
2428         i = 11
2429         setreg
2430 .rep (\REP-9)
2431         vaesenc \T5, \XMM1, \XMM1
2432         vaesenc \T5, \XMM2, \XMM2
2433         vaesenc \T5, \XMM3, \XMM3
2434         vaesenc \T5, \XMM4, \XMM4
2435         vaesenc \T5, \XMM5, \XMM5
2436         vaesenc \T5, \XMM6, \XMM6
2437         vaesenc \T5, \XMM7, \XMM7
2438         vaesenc \T5, \XMM8, \XMM8
2439 
2440         vmovdqu 16*i(arg1), \T5
2441         i = i + 1
2442         setreg
2443 .endr
2444 
2445 	i = 0
2446 	j = 1
2447 	setreg
2448 .rep 8
2449 		vpxor	16*i(arg4, %r11), \T5, \T2
2450                 .if \ENC_DEC == ENC
2451                 vaesenclast     \T2, reg_j, reg_j
2452                 .else
2453                 vaesenclast     \T2, reg_j, \T3
2454                 vmovdqu 16*i(arg4, %r11), reg_j
2455                 vmovdqu \T3, 16*i(arg3, %r11)
2456                 .endif
2457 	i = (i+1)
2458 	j = (j+1)
2459 	setreg
2460 .endr
2461 	#######################################################################
2462 
2463 
2464 	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
	vpsrldq	$8, \T6, \T6				# shift-R T6 2 DWs
2466 	vpxor	\T3, \T7, \T7
2467 	vpxor	\T6, \T1, \T1				# accumulate the results in T1:T7
2468 
2469 
2470 
2471 	#######################################################################
2472 	#first phase of the reduction
2473 	vmovdqa         POLY2(%rip), \T3
2474 
2475 	vpclmulqdq	$0x01, \T7, \T3, \T2
	vpslldq		$8, \T2, \T2			# shift-L T2 2 DWs
2477 
2478 	vpxor		\T2, \T7, \T7			# first phase of the reduction complete
2479 	#######################################################################
2480                 .if \ENC_DEC == ENC
2481 		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
2482 		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
2483 		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
2484 		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
2485 		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
2486 		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
2487 		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
2488 		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
2489                 .endif
2490 
2491 	#######################################################################
2492 	#second phase of the reduction
2493 	vpclmulqdq	$0x00, \T7, \T3, \T2
	vpsrldq		$4, \T2, \T2			# shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2495 
2496 	vpclmulqdq	$0x10, \T7, \T3, \T4
	vpslldq		$4, \T4, \T4			# shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2498 
2499 	vpxor		\T2, \T4, \T4			# second phase of the reduction complete
2500 	#######################################################################
2501 	vpxor		\T4, \T1, \T1			# the result is in T1
2502 
2503 		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
2504 		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
2505 		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
2506 		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
2507 		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
2508 		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
2509 		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
2510 		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
2511 
2512 
2513 	vpxor	\T1, \XMM1, \XMM1
2514 
2515 
2516 
2517 .endm
2518 
2519 
# GHASH the last 8 ciphertext blocks.
2521 .macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2522 
2523         ## Karatsuba Method
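        ## As in the gen2 version each block uses three PCLMULQDQs (Karatsuba),
        ## but the XORed halves of HashKey^i are built on the fly below
        ## (vpshufd $0b01001110 swaps the two 64-bit halves, vpxor folds them)
        ## instead of being loaded from the HashKey_i_k table.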
2524 
2525         vmovdqu         HashKey_8(arg2), \T5
2526 
2527         vpshufd         $0b01001110, \XMM1, \T2
2528         vpshufd         $0b01001110, \T5, \T3
2529         vpxor           \XMM1, \T2, \T2
2530         vpxor           \T5, \T3, \T3
2531 
2532         vpclmulqdq      $0x11, \T5, \XMM1, \T6
2533         vpclmulqdq      $0x00, \T5, \XMM1, \T7
2534 
2535         vpclmulqdq      $0x00, \T3, \T2, \XMM1
2536 
2537         ######################
2538 
2539         vmovdqu         HashKey_7(arg2), \T5
2540         vpshufd         $0b01001110, \XMM2, \T2
2541         vpshufd         $0b01001110, \T5, \T3
2542         vpxor           \XMM2, \T2, \T2
2543         vpxor           \T5, \T3, \T3
2544 
2545         vpclmulqdq      $0x11, \T5, \XMM2, \T4
2546         vpxor           \T4, \T6, \T6
2547 
2548         vpclmulqdq      $0x00, \T5, \XMM2, \T4
2549         vpxor           \T4, \T7, \T7
2550 
2551         vpclmulqdq      $0x00, \T3, \T2, \T2
2552 
2553         vpxor           \T2, \XMM1, \XMM1
2554 
2555         ######################
2556 
2557         vmovdqu         HashKey_6(arg2), \T5
2558         vpshufd         $0b01001110, \XMM3, \T2
2559         vpshufd         $0b01001110, \T5, \T3
2560         vpxor           \XMM3, \T2, \T2
2561         vpxor           \T5, \T3, \T3
2562 
2563         vpclmulqdq      $0x11, \T5, \XMM3, \T4
2564         vpxor           \T4, \T6, \T6
2565 
2566         vpclmulqdq      $0x00, \T5, \XMM3, \T4
2567         vpxor           \T4, \T7, \T7
2568 
2569         vpclmulqdq      $0x00, \T3, \T2, \T2
2570 
2571         vpxor           \T2, \XMM1, \XMM1
2572 
2573         ######################
2574 
2575         vmovdqu         HashKey_5(arg2), \T5
2576         vpshufd         $0b01001110, \XMM4, \T2
2577         vpshufd         $0b01001110, \T5, \T3
2578         vpxor           \XMM4, \T2, \T2
2579         vpxor           \T5, \T3, \T3
2580 
2581         vpclmulqdq      $0x11, \T5, \XMM4, \T4
2582         vpxor           \T4, \T6, \T6
2583 
2584         vpclmulqdq      $0x00, \T5, \XMM4, \T4
2585         vpxor           \T4, \T7, \T7
2586 
2587         vpclmulqdq      $0x00, \T3, \T2, \T2
2588 
2589         vpxor           \T2, \XMM1, \XMM1
2590 
2591         ######################
2592 
2593         vmovdqu         HashKey_4(arg2), \T5
2594         vpshufd         $0b01001110, \XMM5, \T2
2595         vpshufd         $0b01001110, \T5, \T3
2596         vpxor           \XMM5, \T2, \T2
2597         vpxor           \T5, \T3, \T3
2598 
2599         vpclmulqdq      $0x11, \T5, \XMM5, \T4
2600         vpxor           \T4, \T6, \T6
2601 
2602         vpclmulqdq      $0x00, \T5, \XMM5, \T4
2603         vpxor           \T4, \T7, \T7
2604 
2605         vpclmulqdq      $0x00, \T3, \T2, \T2
2606 
2607         vpxor           \T2, \XMM1, \XMM1
2608 
2609         ######################
2610 
2611         vmovdqu         HashKey_3(arg2), \T5
2612         vpshufd         $0b01001110, \XMM6, \T2
2613         vpshufd         $0b01001110, \T5, \T3
2614         vpxor           \XMM6, \T2, \T2
2615         vpxor           \T5, \T3, \T3
2616 
2617         vpclmulqdq      $0x11, \T5, \XMM6, \T4
2618         vpxor           \T4, \T6, \T6
2619 
2620         vpclmulqdq      $0x00, \T5, \XMM6, \T4
2621         vpxor           \T4, \T7, \T7
2622 
2623         vpclmulqdq      $0x00, \T3, \T2, \T2
2624 
2625         vpxor           \T2, \XMM1, \XMM1
2626 
2627         ######################
2628 
2629         vmovdqu         HashKey_2(arg2), \T5
2630         vpshufd         $0b01001110, \XMM7, \T2
2631         vpshufd         $0b01001110, \T5, \T3
2632         vpxor           \XMM7, \T2, \T2
2633         vpxor           \T5, \T3, \T3
2634 
2635         vpclmulqdq      $0x11, \T5, \XMM7, \T4
2636         vpxor           \T4, \T6, \T6
2637 
2638         vpclmulqdq      $0x00, \T5, \XMM7, \T4
2639         vpxor           \T4, \T7, \T7
2640 
2641         vpclmulqdq      $0x00, \T3, \T2, \T2
2642 
2643         vpxor           \T2, \XMM1, \XMM1
2644 
2645         ######################
2646 
2647         vmovdqu         HashKey(arg2), \T5
2648         vpshufd         $0b01001110, \XMM8, \T2
2649         vpshufd         $0b01001110, \T5, \T3
2650         vpxor           \XMM8, \T2, \T2
2651         vpxor           \T5, \T3, \T3
2652 
2653         vpclmulqdq      $0x11, \T5, \XMM8, \T4
2654         vpxor           \T4, \T6, \T6
2655 
2656         vpclmulqdq      $0x00, \T5, \XMM8, \T4
2657         vpxor           \T4, \T7, \T7
2658 
2659         vpclmulqdq      $0x00, \T3, \T2, \T2
2660 
2661         vpxor           \T2, \XMM1, \XMM1
2662         vpxor           \T6, \XMM1, \XMM1
2663         vpxor           \T7, \XMM1, \T2
2664 
2665 
2666 
2667 
2668         vpslldq $8, \T2, \T4
2669         vpsrldq $8, \T2, \T2
2670 
2671         vpxor   \T4, \T7, \T7
2672         vpxor   \T2, \T6, \T6                      # <T6:T7> holds the result of the
2673 						   # accumulated carry-less multiplications
2674 
2675         #######################################################################
2676         #first phase of the reduction
2677         vmovdqa         POLY2(%rip), \T3
2678 
2679         vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq         $8, \T2, \T2               # shift-L T2 2 DWs
2681 
2682         vpxor           \T2, \T7, \T7              # first phase of the reduction complete
2683         #######################################################################
2684 
2685 
2686         #second phase of the reduction
2687         vpclmulqdq      $0x00, \T7, \T3, \T2
2688         vpsrldq         $4, \T2, \T2               # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2689 
2690         vpclmulqdq      $0x10, \T7, \T3, \T4
2691         vpslldq         $4, \T4, \T4               # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2692 
2693         vpxor           \T2, \T4, \T4              # second phase of the reduction complete
2694         #######################################################################
2695         vpxor           \T4, \T6, \T6              # the result is in T6
2696 .endm
2697 
2698 
2699 
2700 #############################################################
2701 #void   aesni_gcm_init_avx_gen4
2702 #        (gcm_data     *my_ctx_data,
2703 #         gcm_context_data *data,
2704 #        u8      *iv, /* Pre-counter block j0: 4 byte salt
2705 #			(from Security Association) concatenated with 8 byte
2706 #			Initialisation Vector (from IPSec ESP Payload)
2707 #			concatenated with 0x00000001. 16-byte aligned pointer. */
#        u8     *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
2709 #        const   u8 *aad, /* Additional Authentication Data (AAD)*/
2710 #        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2711 #############################################################
2712 SYM_FUNC_START(aesni_gcm_init_avx_gen4)
2713         FUNC_SAVE
2714         INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
2715         FUNC_RESTORE
2716         RET
2717 SYM_FUNC_END(aesni_gcm_init_avx_gen4)
2718 
2719 ###############################################################################
#void   aesni_gcm_enc_update_avx_gen4(
2721 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2722 #        gcm_context_data *data,
2723 #        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
2724 #        const   u8 *in, /* Plaintext input */
2725 #        u64     plaintext_len) /* Length of data in Bytes for encryption. */
2726 ###############################################################################
2727 SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
2728         FUNC_SAVE
2729         mov     keysize,%eax
2730         cmp     $32, %eax
2731         je      key_256_enc_update4
2732         cmp     $16, %eax
2733         je      key_128_enc_update4
2734         # must be 192
2735         GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
2736         FUNC_RESTORE
2737 	RET
2738 key_128_enc_update4:
2739         GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
2740         FUNC_RESTORE
2741 	RET
2742 key_256_enc_update4:
2743         GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
2744         FUNC_RESTORE
2745 	RET
2746 SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
2747 
2748 ###############################################################################
2749 #void   aesni_gcm_dec_update_avx_gen4(
2750 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2751 #        gcm_context_data *data,
2752 #        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
2753 #        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
2755 ###############################################################################
2756 SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
2757         FUNC_SAVE
2758         mov     keysize,%eax
2759         cmp     $32, %eax
2760         je      key_256_dec_update4
2761         cmp     $16, %eax
2762         je      key_128_dec_update4
2763         # must be 192
2764         GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
2765         FUNC_RESTORE
2766         RET
2767 key_128_dec_update4:
2768         GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
2769         FUNC_RESTORE
2770         RET
2771 key_256_dec_update4:
2772         GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
2773         FUNC_RESTORE
2774         RET
2775 SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
2776 
2777 ###############################################################################
2778 #void   aesni_gcm_finalize_avx_gen4(
2779 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2780 #        gcm_context_data *data,
2781 #        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
2783 #                              Valid values are 16 (most likely), 12 or 8. */
2784 ###############################################################################
2785 SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
2786         FUNC_SAVE
2787         mov	keysize,%eax
2788         cmp     $32, %eax
2789         je      key_256_finalize4
2790         cmp     $16, %eax
2791         je      key_128_finalize4
2792         # must be 192
2793         GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
2794         FUNC_RESTORE
2795         RET
2796 key_128_finalize4:
2797         GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
2798         FUNC_RESTORE
2799         RET
2800 key_256_finalize4:
2801         GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
2802         FUNC_RESTORE
2803         RET
2804 SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
2805