1 ########################################################################
2 # Copyright (c) 2013, Intel Corporation
3 #
4 # This software is available to you under a choice of one of two
5 # licenses.  You may choose to be licensed under the terms of the GNU
6 # General Public License (GPL) Version 2, available from the file
7 # COPYING in the main directory of this source tree, or the
8 # OpenIB.org BSD license below:
9 #
10 # Redistribution and use in source and binary forms, with or without
11 # modification, are permitted provided that the following conditions are
12 # met:
13 #
14 # * Redistributions of source code must retain the above copyright
15 #   notice, this list of conditions and the following disclaimer.
16 #
17 # * Redistributions in binary form must reproduce the above copyright
18 #   notice, this list of conditions and the following disclaimer in the
19 #   documentation and/or other materials provided with the
20 #   distribution.
21 #
22 # * Neither the name of the Intel Corporation nor the names of its
23 #   contributors may be used to endorse or promote products derived from
24 #   this software without specific prior written permission.
25 #
26 #
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
28 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 ########################################################################
39 ##
40 ## Authors:
41 ##	Erdinc Ozturk <erdinc.ozturk@intel.com>
42 ##	Vinodh Gopal <vinodh.gopal@intel.com>
43 ##	James Guilford <james.guilford@intel.com>
44 ##	Tim Chen <tim.c.chen@linux.intel.com>
45 ##
46 ## References:
47 ##       This code was derived and highly optimized from the code described in paper:
48 ##               Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
49 ##			on Intel Architecture Processors. August, 2010
##       The details of the implementation are explained in:
51 ##               Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
52 ##			on Intel Architecture Processors. October, 2012.
53 ##
54 ## Assumptions:
55 ##
56 ##
57 ##
58 ## iv:
59 ##       0                   1                   2                   3
60 ##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62 ##       |                             Salt  (From the SA)               |
63 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64 ##       |                     Initialization Vector                     |
65 ##       |         (This is the sequence number from IPSec header)       |
66 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
67 ##       |                              0x1                              |
68 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
69 ##
70 ##
71 ##
72 ## AAD:
73 ##       AAD padded to 128 bits with 0
74 ##       for example, assume AAD is a u32 vector
75 ##
76 ##       if AAD is 8 bytes:
##       AAD[3] = {A0, A1};
78 ##       padded AAD in xmm register = {A1 A0 0 0}
79 ##
80 ##       0                   1                   2                   3
81 ##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
83 ##       |                               SPI (A1)                        |
84 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85 ##       |                     32-bit Sequence Number (A0)               |
86 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
87 ##       |                              0x0                              |
88 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
89 ##
90 ##                                       AAD Format with 32-bit Sequence Number
91 ##
92 ##       if AAD is 12 bytes:
##       AAD[3] = {A0, A1, A2};
94 ##       padded AAD in xmm register = {A2 A1 A0 0}
95 ##
96 ##       0                   1                   2                   3
97 ##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99 ##       |                               SPI (A2)                        |
100 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101 ##       |                 64-bit Extended Sequence Number {A1,A0}       |
102 ##       |                                                               |
103 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
104 ##       |                              0x0                              |
105 ##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
106 ##
107 ##        AAD Format with 64-bit Extended Sequence Number
108 ##
109 ##
110 ## aadLen:
111 ##       from the definition of the spec, aadLen can only be 8 or 12 bytes.
112 ##	 The code additionally supports aadLen of length 16 bytes.
113 ##
114 ## TLen:
115 ##       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
116 ##
117 ## poly = x^128 + x^127 + x^126 + x^121 + 1
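## Note: the 128-bit POLY constant below encodes this (bit-reflected)
## polynomial with the x^128 term implicit: bits 127, 126 and 121 give the
## leading 0xC2 byte and bit 0 gives the trailing +1.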
## throughout the code, one-tab and two-tab indentation is used: one tab is
## for the GHASH part, two tabs are for the AES part.
120 ##
121 
122 #include <linux/linkage.h>
123 
124 # constants in mergeable sections, linker can reorder and merge
125 .section	.rodata.cst16.POLY, "aM", @progbits, 16
126 .align 16
127 POLY:            .octa     0xC2000000000000000000000000000001
128 
129 .section	.rodata.cst16.POLY2, "aM", @progbits, 16
130 .align 16
131 POLY2:           .octa     0xC20000000000000000000001C2000000
132 
133 .section	.rodata.cst16.TWOONE, "aM", @progbits, 16
134 .align 16
135 TWOONE:          .octa     0x00000001000000000000000000000001
136 
137 .section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
138 .align 16
139 SHUF_MASK:       .octa     0x000102030405060708090A0B0C0D0E0F
140 
141 .section	.rodata.cst16.ONE, "aM", @progbits, 16
142 .align 16
143 ONE:             .octa     0x00000000000000000000000000000001
144 
145 .section	.rodata.cst16.ONEf, "aM", @progbits, 16
146 .align 16
147 ONEf:            .octa     0x01000000000000000000000000000000
148 
149 # order of these constants should not change.
150 # more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
151 .section	.rodata, "a", @progbits
152 .align 16
153 SHIFT_MASK:      .octa     0x0f0e0d0c0b0a09080706050403020100
154 ALL_F:           .octa     0xffffffffffffffffffffffffffffffff
155                  .octa     0x00000000000000000000000000000000
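# Note: loading 16 bytes from an offset into ALL_F that straddles the zero
# block below it yields a run of 0xff bytes followed by 0x00 bytes; the
# partial-block code picks that offset (via ALL_F-SHIFT_MASK(%r12)) so that
# exactly the valid bytes of the last block are kept and the rest are cleared.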
156 
157 .section .rodata
158 .align 16
159 .type aad_shift_arr, @object
160 .size aad_shift_arr, 272
161 aad_shift_arr:
162         .octa     0xffffffffffffffffffffffffffffffff
163         .octa     0xffffffffffffffffffffffffffffff0C
164         .octa     0xffffffffffffffffffffffffffff0D0C
165         .octa     0xffffffffffffffffffffffffff0E0D0C
166         .octa     0xffffffffffffffffffffffff0F0E0D0C
167         .octa     0xffffffffffffffffffffff0C0B0A0908
168         .octa     0xffffffffffffffffffff0D0C0B0A0908
169         .octa     0xffffffffffffffffff0E0D0C0B0A0908
170         .octa     0xffffffffffffffff0F0E0D0C0B0A0908
171         .octa     0xffffffffffffff0C0B0A090807060504
172         .octa     0xffffffffffff0D0C0B0A090807060504
173         .octa     0xffffffffff0E0D0C0B0A090807060504
174         .octa     0xffffffff0F0E0D0C0B0A090807060504
175         .octa     0xffffff0C0B0A09080706050403020100
176         .octa     0xffff0D0C0B0A09080706050403020100
177         .octa     0xff0E0D0C0B0A09080706050403020100
178         .octa     0x0F0E0D0C0B0A09080706050403020100
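# aad_shift_arr holds vpshufb masks, indexed by 16*(aadLen mod 16), that
# CALC_AAD_HASH uses to discard the extra bytes pulled in by its 8-byte/4-byte
# tail reads (an 0xff lane makes vpshufb write a zero byte).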
179 
180 
181 .text
182 
183 
184 #define AadHash 16*0
185 #define AadLen 16*1
186 #define InLen (16*1)+8
187 #define PBlockEncKey 16*2
188 #define OrigIV 16*3
189 #define CurCount 16*4
190 #define PBlockLen 16*5
191 
192 HashKey        = 16*6   # store HashKey <<1 mod poly here
193 HashKey_2      = 16*7   # store HashKey^2 <<1 mod poly here
194 HashKey_3      = 16*8   # store HashKey^3 <<1 mod poly here
195 HashKey_4      = 16*9   # store HashKey^4 <<1 mod poly here
196 HashKey_5      = 16*10   # store HashKey^5 <<1 mod poly here
197 HashKey_6      = 16*11   # store HashKey^6 <<1 mod poly here
198 HashKey_7      = 16*12   # store HashKey^7 <<1 mod poly here
199 HashKey_8      = 16*13   # store HashKey^8 <<1 mod poly here
200 HashKey_k      = 16*14   # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
201 HashKey_2_k    = 16*15   # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
202 HashKey_3_k    = 16*16   # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
203 HashKey_4_k    = 16*17   # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
204 HashKey_5_k    = 16*18   # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
205 HashKey_6_k    = 16*19   # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
206 HashKey_7_k    = 16*20   # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
207 HashKey_8_k    = 16*21   # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
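# The HashKey_i_k slots hold, for each HashKey^i<<1 mod poly, the XOR of its
# high and low 64-bit halves; the parallel GHASH code uses them as the
# precomputed (b1 xor b0) operand of the Karatsuba middle product.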
208 
209 #define arg1 %rdi
210 #define arg2 %rsi
211 #define arg3 %rdx
212 #define arg4 %rcx
213 #define arg5 %r8
214 #define arg6 %r9
215 #define arg7 STACK_OFFSET+8*1(%r14)
216 #define arg8 STACK_OFFSET+8*2(%r14)
217 #define arg9 STACK_OFFSET+8*3(%r14)
218 #define arg10 STACK_OFFSET+8*4(%r14)
219 #define keysize 2*15*16(arg1)
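# args 1-6 arrive in registers; args 7-10 are read from the caller's stack
# through %r14, which FUNC_SAVE captures after pushing four registers (hence
# the STACK_OFFSET bias plus 8 bytes for the return address).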
220 
221 i = 0
222 j = 0
223 
224 out_order = 0
225 in_order = 1
226 DEC = 0
227 ENC = 1
228 
229 .macro define_reg r n
230 reg_\r = %xmm\n
231 .endm
232 
233 .macro setreg
234 .altmacro
235 define_reg i %i
236 define_reg j %j
237 .noaltmacro
238 .endm
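# define_reg/setreg rely on .altmacro evaluation (%i, %j) to turn the current
# values of the symbols i and j into literal register names, so reg_i/reg_j
# used in the .rep loops below expand to the matching %xmm registers.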
239 
# FUNC_SAVE pushes 4 registers onto the stack; STACK_OFFSET must match (4 * 8 bytes)
241 STACK_OFFSET = 8*4
242 
243 TMP1 =   16*0    # Temporary storage for AAD
244 TMP2 =   16*1    # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
245 TMP3 =   16*2    # Temporary storage for AES State 3
246 TMP4 =   16*3    # Temporary storage for AES State 4
247 TMP5 =   16*4    # Temporary storage for AES State 5
248 TMP6 =   16*5    # Temporary storage for AES State 6
249 TMP7 =   16*6    # Temporary storage for AES State 7
250 TMP8 =   16*7    # Temporary storage for AES State 8
251 
252 VARIABLE_OFFSET = 16*8
253 
254 ################################
255 # Utility Macros
256 ################################
257 
258 .macro FUNC_SAVE
        # the number of pushes must equal STACK_OFFSET
260         push    %r12
261         push    %r13
262         push    %r14
263         push    %r15
264 
265         mov     %rsp, %r14
266 
267 
268 
269         sub     $VARIABLE_OFFSET, %rsp
270         and     $~63, %rsp                    # align rsp to 64 bytes
271 .endm
272 
273 .macro FUNC_RESTORE
274         mov     %r14, %rsp
275 
276         pop     %r15
277         pop     %r14
278         pop     %r13
279         pop     %r12
280 .endm
281 
282 # Encryption of a single block
283 .macro ENCRYPT_SINGLE_BLOCK REP XMM0
284                 vpxor    (arg1), \XMM0, \XMM0
285                i = 1
286                setreg
287 .rep \REP
288                 vaesenc  16*i(arg1), \XMM0, \XMM0
289                i = (i+1)
290                setreg
291 .endr
292                 vaesenclast 16*i(arg1), \XMM0, \XMM0
293 .endm
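# Roughly equivalent C-style sketch (REP = number of full rounds, typically
# 9/11/13 for AES-128/192/256; the expanded round keys live at arg1):
#   block ^= round_key[0];                        /* whitening   */
#   for (r = 1; r <= REP; r++)
#           block = aesenc(block, round_key[r]);  /* full rounds */
#   block = aesenclast(block, round_key[REP+1]);  /* final round */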
294 
# GCM_ENC_DEC: combined body of the GCM encrypt and decrypt update functions
# Clobbers all xmm registers
# Clobbers r10, r11, r12, r13, r14, r15
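# Rough flow: reload the running GHASH and hash key from the context, fold in
# any partial block left over from a previous update call, encrypt/ghash up to
# 7 initial blocks so that the remaining full-block length is a multiple of
# 128 bytes, run the 8-blocks-at-a-time main loop, then handle the <16-byte tail.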
298 .macro  GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
299         vmovdqu AadHash(arg2), %xmm8
300         vmovdqu  HashKey(arg2), %xmm13      # xmm13 = HashKey
301         add arg5, InLen(arg2)
302 
303         # initialize the data pointer offset as zero
304         xor     %r11d, %r11d
305 
306         PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
307         sub %r11, arg5
308 
309         mov     arg5, %r13                  # save the number of bytes of plaintext/ciphertext
310         and     $-16, %r13                  # r13 = r13 - (r13 mod 16)
311 
312         mov     %r13, %r12
313         shr     $4, %r12
314         and     $7, %r12
315         jz      _initial_num_blocks_is_0\@
316 
317         cmp     $7, %r12
318         je      _initial_num_blocks_is_7\@
319         cmp     $6, %r12
320         je      _initial_num_blocks_is_6\@
321         cmp     $5, %r12
322         je      _initial_num_blocks_is_5\@
323         cmp     $4, %r12
324         je      _initial_num_blocks_is_4\@
325         cmp     $3, %r12
326         je      _initial_num_blocks_is_3\@
327         cmp     $2, %r12
328         je      _initial_num_blocks_is_2\@
329 
330         jmp     _initial_num_blocks_is_1\@
331 
332 _initial_num_blocks_is_7\@:
333         \INITIAL_BLOCKS  \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
334         sub     $16*7, %r13
335         jmp     _initial_blocks_encrypted\@
336 
337 _initial_num_blocks_is_6\@:
338         \INITIAL_BLOCKS  \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
339         sub     $16*6, %r13
340         jmp     _initial_blocks_encrypted\@
341 
342 _initial_num_blocks_is_5\@:
343         \INITIAL_BLOCKS  \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
344         sub     $16*5, %r13
345         jmp     _initial_blocks_encrypted\@
346 
347 _initial_num_blocks_is_4\@:
348         \INITIAL_BLOCKS  \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
349         sub     $16*4, %r13
350         jmp     _initial_blocks_encrypted\@
351 
352 _initial_num_blocks_is_3\@:
353         \INITIAL_BLOCKS  \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
354         sub     $16*3, %r13
355         jmp     _initial_blocks_encrypted\@
356 
357 _initial_num_blocks_is_2\@:
358         \INITIAL_BLOCKS  \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
359         sub     $16*2, %r13
360         jmp     _initial_blocks_encrypted\@
361 
362 _initial_num_blocks_is_1\@:
363         \INITIAL_BLOCKS  \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
364         sub     $16*1, %r13
365         jmp     _initial_blocks_encrypted\@
366 
367 _initial_num_blocks_is_0\@:
368         \INITIAL_BLOCKS  \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
369 
370 
371 _initial_blocks_encrypted\@:
372         test    %r13, %r13
373         je      _zero_cipher_left\@
374 
375         sub     $128, %r13
376         je      _eight_cipher_left\@
377 
378 
379 
380 
381         vmovd   %xmm9, %r15d
382         and     $255, %r15d
383         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
384 
385 
386 _encrypt_by_8_new\@:
387         cmp     $(255-8), %r15d
388         jg      _encrypt_by_8\@
389 
390 
391 
392         add     $8, %r15b
393         \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
394         add     $128, %r11
395         sub     $128, %r13
396         jne     _encrypt_by_8_new\@
397 
398         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
399         jmp     _eight_cipher_left\@
400 
401 _encrypt_by_8\@:
402         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
403         add     $8, %r15b
404         \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
405         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
406         add     $128, %r11
407         sub     $128, %r13
408         jne     _encrypt_by_8_new\@
409 
410         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
411 
412 
413 
414 
415 _eight_cipher_left\@:
416         \GHASH_LAST_8    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
417 
418 
419 _zero_cipher_left\@:
420         vmovdqu %xmm14, AadHash(arg2)
421         vmovdqu %xmm9, CurCount(arg2)
422 
423         # check for 0 length
424         mov     arg5, %r13
425         and     $15, %r13                            # r13 = (arg5 mod 16)
426 
427         je      _multiple_of_16_bytes\@
428 
429         # handle the last <16 Byte block separately
430 
431         mov %r13, PBlockLen(arg2)
432 
433         vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
434         vmovdqu %xmm9, CurCount(arg2)
435         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
436 
437         ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Yn)
438         vmovdqu %xmm9, PBlockEncKey(arg2)
439 
440         cmp $16, arg5
441         jge _large_enough_update\@
442 
443         lea (arg4,%r11,1), %r10
444         mov %r13, %r12
445 
446         READ_PARTIAL_BLOCK %r10 %r12 %xmm1
447 
448         lea     SHIFT_MASK+16(%rip), %r12
        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
                                                     # able to shift 16-r13 bytes (r13 is the
                                                     # number of bytes in plaintext mod 16)
452 
453         jmp _final_ghash_mul\@
454 
455 _large_enough_update\@:
456         sub $16, %r11
457         add %r13, %r11
458 
459         # receive the last <16 Byte block
460         vmovdqu	(arg4, %r11, 1), %xmm1
461 
462         sub	%r13, %r11
463         add	$16, %r11
464 
465         lea	SHIFT_MASK+16(%rip), %r12
466         # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
467         # (r13 is the number of bytes in plaintext mod 16)
468         sub	%r13, %r12
469         # get the appropriate shuffle mask
470         vmovdqu	(%r12), %xmm2
471         # shift right 16-r13 bytes
472         vpshufb  %xmm2, %xmm1, %xmm1
473 
474 _final_ghash_mul\@:
475         .if  \ENC_DEC ==  DEC
476         vmovdqa %xmm1, %xmm2
477         vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
478         vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
479 						     # mask out top 16-r13 bytes of xmm9
480         vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
481         vpand   %xmm1, %xmm2, %xmm2
482         vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
483         vpxor   %xmm2, %xmm14, %xmm14
484 
485         vmovdqu %xmm14, AadHash(arg2)
486         .else
487         vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
488         vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
489 						     # mask out top 16-r13 bytes of xmm9
490         vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
491         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
492         vpxor   %xmm9, %xmm14, %xmm14
493 
494         vmovdqu %xmm14, AadHash(arg2)
495         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
496         .endif
497 
498 
499         #############################
500         # output r13 Bytes
501         vmovq   %xmm9, %rax
502         cmp     $8, %r13
503         jle     _less_than_8_bytes_left\@
504 
505         mov     %rax, (arg3 , %r11)
506         add     $8, %r11
507         vpsrldq $8, %xmm9, %xmm9
508         vmovq   %xmm9, %rax
509         sub     $8, %r13
510 
511 _less_than_8_bytes_left\@:
512         movb    %al, (arg3 , %r11)
513         add     $1, %r11
514         shr     $8, %rax
515         sub     $1, %r13
516         jne     _less_than_8_bytes_left\@
517         #############################
518 
519 _multiple_of_16_bytes\@:
520 .endm
521 
522 
# GCM_COMPLETE: finishes the GHASH of any remaining partial block and computes
# the authentication tag.
# Output: Authentication Tag (AUTH_TAG)
525 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
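# In GCM terms this computes S = GHASH(AAD, C) with the final length block
# (len(A) || len(C), both in bits) folded in, and returns the first
# AUTH_TAG_LEN bytes of E(K, Y0) xor S as the tag.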
526 .macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
527         vmovdqu AadHash(arg2), %xmm14
528         vmovdqu HashKey(arg2), %xmm13
529 
530         mov PBlockLen(arg2), %r12
531         test %r12, %r12
532         je _partial_done\@
533 
534 	#GHASH computation for the last <16 Byte block
535         \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
536 
537 _partial_done\@:
538         mov AadLen(arg2), %r12                          # r12 = aadLen (number of bytes)
539         shl     $3, %r12                             # convert into number of bits
540         vmovd   %r12d, %xmm15                        # len(A) in xmm15
541 
542         mov InLen(arg2), %r12
        shl     $3, %r12                        # len(C) in bits  (*8)
544         vmovq   %r12, %xmm1
545         vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
546         vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
547 
548         vpxor   %xmm15, %xmm14, %xmm14
549         \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
550         vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap
551 
552         vmovdqu OrigIV(arg2), %xmm9
553 
554         ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Y0)
555 
556         vpxor   %xmm14, %xmm9, %xmm9
557 
558 
559 
560 _return_T\@:
561         mov     \AUTH_TAG, %r10              # r10 = authTag
562         mov     \AUTH_TAG_LEN, %r11              # r11 = auth_tag_len
563 
564         cmp     $16, %r11
565         je      _T_16\@
566 
567         cmp     $8, %r11
568         jl      _T_4\@
569 
570 _T_8\@:
571         vmovq   %xmm9, %rax
572         mov     %rax, (%r10)
573         add     $8, %r10
574         sub     $8, %r11
575         vpsrldq $8, %xmm9, %xmm9
576         test    %r11, %r11
577         je     _return_T_done\@
578 _T_4\@:
579         vmovd   %xmm9, %eax
580         mov     %eax, (%r10)
581         add     $4, %r10
582         sub     $4, %r11
583         vpsrldq     $4, %xmm9, %xmm9
584         test    %r11, %r11
585         je     _return_T_done\@
586 _T_123\@:
587         vmovd     %xmm9, %eax
588         cmp     $2, %r11
589         jl     _T_1\@
590         mov     %ax, (%r10)
591         cmp     $2, %r11
592         je     _return_T_done\@
593         add     $2, %r10
594         sar     $16, %eax
595 _T_1\@:
596         mov     %al, (%r10)
597         jmp     _return_T_done\@
598 
599 _T_16\@:
600         vmovdqu %xmm9, (%r10)
601 
602 _return_T_done\@:
603 .endm
604 
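# CALC_AAD_HASH: ghash the \AADLEN bytes of additional authenticated data at
# \AAD: full 16-byte blocks first, then the <16-byte remainder is read in
# 8-byte/4-byte chunks, realigned via aad_shift_arr and hashed.  The result is
# stored in AadHash(arg2).  Clobbers r10, r11, r12 and rax.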
605 .macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
606 
607 	mov     \AAD, %r10                      # r10 = AAD
608 	mov     \AADLEN, %r12                      # r12 = aadLen
609 
610 
611 	mov     %r12, %r11
612 
613 	vpxor   \T8, \T8, \T8
614 	vpxor   \T7, \T7, \T7
615 	cmp     $16, %r11
616 	jl      _get_AAD_rest8\@
617 _get_AAD_blocks\@:
618 	vmovdqu (%r10), \T7
619 	vpshufb SHUF_MASK(%rip), \T7, \T7
620 	vpxor   \T7, \T8, \T8
621 	\GHASH_MUL       \T8, \T2, \T1, \T3, \T4, \T5, \T6
622 	add     $16, %r10
623 	sub     $16, %r12
624 	sub     $16, %r11
625 	cmp     $16, %r11
626 	jge     _get_AAD_blocks\@
627 	vmovdqu \T8, \T7
628 	test    %r11, %r11
629 	je      _get_AAD_done\@
630 
631 	vpxor   \T7, \T7, \T7
632 
633 	/* read the last <16B of AAD. since we have at least 4B of
634 	data right after the AAD (the ICV, and maybe some CT), we can
635 	read 4B/8B blocks safely, and then get rid of the extra stuff */
636 _get_AAD_rest8\@:
637 	cmp     $4, %r11
638 	jle     _get_AAD_rest4\@
639 	movq    (%r10), \T1
640 	add     $8, %r10
641 	sub     $8, %r11
642 	vpslldq $8, \T1, \T1
643 	vpsrldq $8, \T7, \T7
644 	vpxor   \T1, \T7, \T7
645 	jmp     _get_AAD_rest8\@
646 _get_AAD_rest4\@:
647 	test    %r11, %r11
648 	jle      _get_AAD_rest0\@
649 	mov     (%r10), %eax
650 	movq    %rax, \T1
651 	add     $4, %r10
652 	sub     $4, %r11
653 	vpslldq $12, \T1, \T1
654 	vpsrldq $4, \T7, \T7
655 	vpxor   \T1, \T7, \T7
656 _get_AAD_rest0\@:
657 	/* finalize: shift out the extra bytes we read, and align
658 	left. since pslldq can only shift by an immediate, we use
659 	vpshufb and an array of shuffle masks */
660 	movq    %r12, %r11
661 	salq    $4, %r11
662 	vmovdqu  aad_shift_arr(%r11), \T1
663 	vpshufb \T1, \T7, \T7
664 _get_AAD_rest_final\@:
665 	vpshufb SHUF_MASK(%rip), \T7, \T7
666 	vpxor   \T8, \T7, \T7
667 	\GHASH_MUL       \T7, \T2, \T1, \T3, \T4, \T5, \T6
668 
669 _get_AAD_done\@:
670         vmovdqu \T7, AadHash(arg2)
671 .endm
672 
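# INIT: set up the per-request context: record aadLen, zero the running
# length and partial-block state, save the IV as OrigIV (and its byte-swapped
# form as CurCount), derive HashKey<<1 mod poly from the hash subkey supplied
# via arg4, hash the AAD, and precompute the HashKey powers.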
673 .macro INIT GHASH_MUL PRECOMPUTE
674         mov arg6, %r11
675         mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
676         xor %r11d, %r11d
677         mov %r11, InLen(arg2) # ctx_data.in_length = 0
678 
679         mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
680         mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
681         mov arg3, %rax
682         movdqu (%rax), %xmm0
683         movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
684 
685         vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
686         movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
687 
688         vmovdqu  (arg4), %xmm6              # xmm6 = HashKey
689 
690         vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
691         ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
692         vmovdqa  %xmm6, %xmm2
693         vpsllq   $1, %xmm6, %xmm6
694         vpsrlq   $63, %xmm2, %xmm2
695         vmovdqa  %xmm2, %xmm1
696         vpslldq  $8, %xmm2, %xmm2
697         vpsrldq  $8, %xmm1, %xmm1
698         vpor     %xmm2, %xmm6, %xmm6
699         #reduction
700         vpshufd  $0b00100100, %xmm1, %xmm2
701         vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
702         vpand    POLY(%rip), %xmm2, %xmm2
703         vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
704         #######################################################################
705         vmovdqu  %xmm6, HashKey(arg2)       # store HashKey<<1 mod poly
706 
707         CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
708 
709         \PRECOMPUTE  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
710 .endm
711 
712 
713 # Reads DLEN bytes starting at DPTR and stores in XMMDst
714 # where 0 < DLEN < 16
715 # Clobbers %rax, DLEN
716 .macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
717         vpxor \XMMDst, \XMMDst, \XMMDst
718 
719         cmp $8, \DLEN
720         jl _read_lt8_\@
721         mov (\DPTR), %rax
722         vpinsrq $0, %rax, \XMMDst, \XMMDst
723         sub $8, \DLEN
724         jz _done_read_partial_block_\@
725         xor %eax, %eax
726 _read_next_byte_\@:
727         shl $8, %rax
728         mov 7(\DPTR, \DLEN, 1), %al
729         dec \DLEN
730         jnz _read_next_byte_\@
731         vpinsrq $1, %rax, \XMMDst, \XMMDst
732         jmp _done_read_partial_block_\@
733 _read_lt8_\@:
734         xor %eax, %eax
735 _read_next_byte_lt8_\@:
736         shl $8, %rax
737         mov -1(\DPTR, \DLEN, 1), %al
738         dec \DLEN
739         jnz _read_next_byte_lt8_\@
740         vpinsrq $0, %rax, \XMMDst, \XMMDst
741 _done_read_partial_block_\@:
742 .endm
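# Example: for DLEN = 3 the second loop reads DPTR[2], DPTR[1], DPTR[0] in
# that order, leaving %rax with DPTR[0] in its least significant byte, and the
# three bytes end up in the low qword of XMMDst.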
743 
# PARTIAL_BLOCK: Handles the encryption/decryption and GHASH of partial blocks
# between update calls.
# Requires the input data to be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates the hash and partial-block state in the
# gcm context data
748 # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
749 .macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
750         AAD_HASH ENC_DEC
751         mov 	PBlockLen(arg2), %r13
752         test	%r13, %r13
753         je	_partial_block_done_\@	# Leave Macro if no partial blocks
754         # Read in input data without over reading
755         cmp	$16, \PLAIN_CYPH_LEN
756         jl	_fewer_than_16_bytes_\@
757         vmovdqu	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
758         jmp	_data_read_\@
759 
760 _fewer_than_16_bytes_\@:
761         lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
762         mov	\PLAIN_CYPH_LEN, %r12
763         READ_PARTIAL_BLOCK %r10 %r12 %xmm1
764 
765         mov PBlockLen(arg2), %r13
766 
767 _data_read_\@:				# Finished reading in data
768 
769         vmovdqu	PBlockEncKey(arg2), %xmm9
770         vmovdqu	HashKey(arg2), %xmm13
771 
772         lea	SHIFT_MASK(%rip), %r12
773 
        # adjust the shuffle mask pointer to be able to shift r13 bytes
        # (r13 is the number of bytes already in the partial block)
776         add	%r13, %r12
777         vmovdqu	(%r12), %xmm2		# get the appropriate shuffle mask
778         vpshufb %xmm2, %xmm9, %xmm9		# shift right r13 bytes
779 
780 .if  \ENC_DEC ==  DEC
781         vmovdqa	%xmm1, %xmm3
782         pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)
783 
784         mov	\PLAIN_CYPH_LEN, %r10
785         add	%r13, %r10
786         # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
787         sub	$16, %r10
        # Determine if the partial block is not being completely filled and
        # shift the mask accordingly
790         jge	_no_extra_mask_1_\@
791         sub	%r10, %r12
792 _no_extra_mask_1_\@:
793 
794         vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
795         # get the appropriate mask to mask out bottom r13 bytes of xmm9
796         vpand	%xmm1, %xmm9, %xmm9		# mask out bottom r13 bytes of xmm9
797 
798         vpand	%xmm1, %xmm3, %xmm3
799         vmovdqa	SHUF_MASK(%rip), %xmm10
800         vpshufb	%xmm10, %xmm3, %xmm3
801         vpshufb	%xmm2, %xmm3, %xmm3
802         vpxor	%xmm3, \AAD_HASH, \AAD_HASH
803 
804         test	%r10, %r10
805         jl	_partial_incomplete_1_\@
806 
807         # GHASH computation for the last <16 Byte block
808         \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
809         xor	%eax,%eax
810 
811         mov	%rax, PBlockLen(arg2)
812         jmp	_dec_done_\@
813 _partial_incomplete_1_\@:
814         add	\PLAIN_CYPH_LEN, PBlockLen(arg2)
815 _dec_done_\@:
816         vmovdqu	\AAD_HASH, AadHash(arg2)
817 .else
818         vpxor	%xmm1, %xmm9, %xmm9			# Plaintext XOR E(K, Yn)
819 
820         mov	\PLAIN_CYPH_LEN, %r10
821         add	%r13, %r10
822         # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
823         sub	$16, %r10
        # Determine if the partial block is not being completely filled and
        # shift the mask accordingly
826         jge	_no_extra_mask_2_\@
827         sub	%r10, %r12
828 _no_extra_mask_2_\@:
829 
830         vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
831         # get the appropriate mask to mask out bottom r13 bytes of xmm9
832         vpand	%xmm1, %xmm9, %xmm9
833 
834         vmovdqa	SHUF_MASK(%rip), %xmm1
835         vpshufb %xmm1, %xmm9, %xmm9
836         vpshufb %xmm2, %xmm9, %xmm9
837         vpxor	%xmm9, \AAD_HASH, \AAD_HASH
838 
839         test	%r10, %r10
840         jl	_partial_incomplete_2_\@
841 
842         # GHASH computation for the last <16 Byte block
843         \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
844         xor	%eax,%eax
845 
846         mov	%rax, PBlockLen(arg2)
847         jmp	_encode_done_\@
848 _partial_incomplete_2_\@:
849         add	\PLAIN_CYPH_LEN, PBlockLen(arg2)
850 _encode_done_\@:
851         vmovdqu	\AAD_HASH, AadHash(arg2)
852 
853         vmovdqa	SHUF_MASK(%rip), %xmm10
854         # shuffle xmm9 back to output as ciphertext
855         vpshufb	%xmm10, %xmm9, %xmm9
856         vpshufb	%xmm2, %xmm9, %xmm9
857 .endif
858         # output encrypted Bytes
859         test	%r10, %r10
860         jl	_partial_fill_\@
861         mov	%r13, %r12
862         mov	$16, %r13
863         # Set r13 to be the number of bytes to write out
864         sub	%r12, %r13
865         jmp	_count_set_\@
866 _partial_fill_\@:
867         mov	\PLAIN_CYPH_LEN, %r13
868 _count_set_\@:
869         vmovdqa	%xmm9, %xmm0
870         vmovq	%xmm0, %rax
871         cmp	$8, %r13
872         jle	_less_than_8_bytes_left_\@
873 
874         mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
875         add	$8, \DATA_OFFSET
876         psrldq	$8, %xmm0
877         vmovq	%xmm0, %rax
878         sub	$8, %r13
879 _less_than_8_bytes_left_\@:
880         movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
881         add	$1, \DATA_OFFSET
882         shr	$8, %rax
883         sub	$1, %r13
884         jne	_less_than_8_bytes_left_\@
885 _partial_block_done_\@:
886 .endm # PARTIAL_BLOCK
887 
888 ###############################################################################
889 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
890 # Input: A and B (128-bits each, bit-reflected)
891 # Output: C = A*B*x mod poly, (i.e. >>1 )
892 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
893 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
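# Karatsuba in GF(2^128): with A = a1:a0 and B = b1:b0 split into 64-bit
# halves, A*B = (a1*b1)<<128 xor ((a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0)<<64
# xor (a0*b0), so only three vpclmulqdq multiplies are needed per block
# before the two-phase reduction below.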
894 ###############################################################################
895 .macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
896 
897         vpshufd         $0b01001110, \GH, \T2
898         vpshufd         $0b01001110, \HK, \T3
899         vpxor           \GH     , \T2, \T2      # T2 = (a1+a0)
900         vpxor           \HK     , \T3, \T3      # T3 = (b1+b0)
901 
902         vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
903         vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
904         vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
905         vpxor           \GH, \T2,\T2
906         vpxor           \T1, \T2,\T2            # T2 = a0*b1+a1*b0
907 
908         vpslldq         $8, \T2,\T3             # shift-L T3 2 DWs
909         vpsrldq         $8, \T2,\T2             # shift-R T2 2 DWs
910         vpxor           \T3, \GH, \GH
911         vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK
912 
913         #first phase of the reduction
        vpslld  $31, \GH, \T2                   # packed left shift << 31
        vpslld  $30, \GH, \T3                   # packed left shift << 30
        vpslld  $25, \GH, \T4                   # packed left shift << 25
917 
918         vpxor   \T3, \T2, \T2                   # xor the shifted versions
919         vpxor   \T4, \T2, \T2
920 
921         vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW
922 
923         vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
924         vpxor   \T2, \GH, \GH                   # first phase of the reduction complete
925 
926         #second phase of the reduction
927 
        vpsrld  $1,\GH, \T2                     # packed right shift >> 1
        vpsrld  $2,\GH, \T3                     # packed right shift >> 2
        vpsrld  $7,\GH, \T4                     # packed right shift >> 7
931         vpxor   \T3, \T2, \T2                   # xor the shifted versions
932         vpxor   \T4, \T2, \T2
933 
934         vpxor   \T5, \T2, \T2
935         vpxor   \T2, \GH, \GH
936         vpxor   \T1, \GH, \GH                   # the result is in GH
937 
938 
939 .endm
940 
941 .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
942 
        # HashKey_i_k holds the XOR of the low and high 64-bit halves of HashKey^i
944         vmovdqa  \HK, \T5
945 
946         vpshufd  $0b01001110, \T5, \T1
947         vpxor    \T5, \T1, \T1
948         vmovdqu  \T1, HashKey_k(arg2)
949 
950         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
951         vmovdqu  \T5, HashKey_2(arg2)                    #  [HashKey_2] = HashKey^2<<1 mod poly
952         vpshufd  $0b01001110, \T5, \T1
953         vpxor    \T5, \T1, \T1
954         vmovdqu  \T1, HashKey_2_k(arg2)
955 
956         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
957         vmovdqu  \T5, HashKey_3(arg2)
958         vpshufd  $0b01001110, \T5, \T1
959         vpxor    \T5, \T1, \T1
960         vmovdqu  \T1, HashKey_3_k(arg2)
961 
962         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
963         vmovdqu  \T5, HashKey_4(arg2)
964         vpshufd  $0b01001110, \T5, \T1
965         vpxor    \T5, \T1, \T1
966         vmovdqu  \T1, HashKey_4_k(arg2)
967 
968         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
969         vmovdqu  \T5, HashKey_5(arg2)
970         vpshufd  $0b01001110, \T5, \T1
971         vpxor    \T5, \T1, \T1
972         vmovdqu  \T1, HashKey_5_k(arg2)
973 
974         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
975         vmovdqu  \T5, HashKey_6(arg2)
976         vpshufd  $0b01001110, \T5, \T1
977         vpxor    \T5, \T1, \T1
978         vmovdqu  \T1, HashKey_6_k(arg2)
979 
980         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
981         vmovdqu  \T5, HashKey_7(arg2)
982         vpshufd  $0b01001110, \T5, \T1
983         vpxor    \T5, \T1, \T1
984         vmovdqu  \T1, HashKey_7_k(arg2)
985 
986         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
987         vmovdqu  \T5, HashKey_8(arg2)
988         vpshufd  $0b01001110, \T5, \T1
989         vpxor    \T5, \T1, \T1
990         vmovdqu  \T1, HashKey_8_k(arg2)
991 
992 .endm
993 
## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 8
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg3, arg4, r14 are used as pointers only, not modified
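## The first num_initial_blocks counter blocks are encrypted and ghashed one
## at a time; if at least 128 bytes of full blocks remain, eight further
## counter blocks are AES-encrypted here as well, with their GHASH deferred
## (the running hash is parked in TMP1 on the stack and folded in later).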
1000 
1001 .macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
1002 	i = (8-\num_initial_blocks)
1003 	setreg
1004         vmovdqu AadHash(arg2), reg_i
1005 
1006 	# start AES for num_initial_blocks blocks
1007 	vmovdqu CurCount(arg2), \CTR
1008 
1009 	i = (9-\num_initial_blocks)
1010 	setreg
1011 .rep \num_initial_blocks
1012                 vpaddd  ONE(%rip), \CTR, \CTR		# INCR Y0
1013                 vmovdqa \CTR, reg_i
1014                 vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
1015 	i = (i+1)
1016 	setreg
1017 .endr
1018 
1019 	vmovdqa  (arg1), \T_key
1020 	i = (9-\num_initial_blocks)
1021 	setreg
1022 .rep \num_initial_blocks
1023                 vpxor   \T_key, reg_i, reg_i
1024 	i = (i+1)
1025 	setreg
1026 .endr
1027 
1028        j = 1
1029        setreg
1030 .rep \REP
1031        vmovdqa  16*j(arg1), \T_key
1032 	i = (9-\num_initial_blocks)
1033 	setreg
1034 .rep \num_initial_blocks
1035         vaesenc \T_key, reg_i, reg_i
1036 	i = (i+1)
1037 	setreg
1038 .endr
1039 
1040        j = (j+1)
1041        setreg
1042 .endr
1043 
1044 	vmovdqa  16*j(arg1), \T_key
1045 	i = (9-\num_initial_blocks)
1046 	setreg
1047 .rep \num_initial_blocks
1048         vaesenclast      \T_key, reg_i, reg_i
1049 	i = (i+1)
1050 	setreg
1051 .endr
1052 
1053 	i = (9-\num_initial_blocks)
1054 	setreg
1055 .rep \num_initial_blocks
1056                 vmovdqu (arg4, %r11), \T1
1057                 vpxor   \T1, reg_i, reg_i
1058                 vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for num_initial_blocks blocks
1059                 add     $16, %r11
1060 .if  \ENC_DEC == DEC
1061                 vmovdqa \T1, reg_i
1062 .endif
1063                 vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
1064 	i = (i+1)
1065 	setreg
1066 .endr
1067 
1068 
1069 	i = (8-\num_initial_blocks)
1070 	j = (9-\num_initial_blocks)
1071 	setreg
1072 
1073 .rep \num_initial_blocks
1074         vpxor    reg_i, reg_j, reg_j
1075         GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1076 	i = (i+1)
1077 	j = (j+1)
1078 	setreg
1079 .endr
1080         # XMM8 has the combined result here
1081 
1082         vmovdqa  \XMM8, TMP1(%rsp)
1083         vmovdqa  \XMM8, \T3
1084 
1085         cmp     $128, %r13
1086         jl      _initial_blocks_done\@                  # no need for precomputed constants
1087 
1088 ###############################################################################
# HashKey_i_k holds the XOR of the low and high 64-bit halves of HashKey^i
1090                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1091                 vmovdqa  \CTR, \XMM1
1092                 vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
1093 
1094                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1095                 vmovdqa  \CTR, \XMM2
1096                 vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
1097 
1098                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1099                 vmovdqa  \CTR, \XMM3
1100                 vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
1101 
1102                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1103                 vmovdqa  \CTR, \XMM4
1104                 vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
1105 
1106                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1107                 vmovdqa  \CTR, \XMM5
1108                 vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
1109 
1110                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1111                 vmovdqa  \CTR, \XMM6
1112                 vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
1113 
1114                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1115                 vmovdqa  \CTR, \XMM7
1116                 vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
1117 
1118                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1119                 vmovdqa  \CTR, \XMM8
1120                 vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
1121 
1122                 vmovdqa  (arg1), \T_key
1123                 vpxor    \T_key, \XMM1, \XMM1
1124                 vpxor    \T_key, \XMM2, \XMM2
1125                 vpxor    \T_key, \XMM3, \XMM3
1126                 vpxor    \T_key, \XMM4, \XMM4
1127                 vpxor    \T_key, \XMM5, \XMM5
1128                 vpxor    \T_key, \XMM6, \XMM6
1129                 vpxor    \T_key, \XMM7, \XMM7
1130                 vpxor    \T_key, \XMM8, \XMM8
1131 
1132                i = 1
1133                setreg
1134 .rep    \REP       # do REP rounds
1135                 vmovdqa  16*i(arg1), \T_key
1136                 vaesenc  \T_key, \XMM1, \XMM1
1137                 vaesenc  \T_key, \XMM2, \XMM2
1138                 vaesenc  \T_key, \XMM3, \XMM3
1139                 vaesenc  \T_key, \XMM4, \XMM4
1140                 vaesenc  \T_key, \XMM5, \XMM5
1141                 vaesenc  \T_key, \XMM6, \XMM6
1142                 vaesenc  \T_key, \XMM7, \XMM7
1143                 vaesenc  \T_key, \XMM8, \XMM8
1144                i = (i+1)
1145                setreg
1146 .endr
1147 
1148                 vmovdqa  16*i(arg1), \T_key
1149                 vaesenclast  \T_key, \XMM1, \XMM1
1150                 vaesenclast  \T_key, \XMM2, \XMM2
1151                 vaesenclast  \T_key, \XMM3, \XMM3
1152                 vaesenclast  \T_key, \XMM4, \XMM4
1153                 vaesenclast  \T_key, \XMM5, \XMM5
1154                 vaesenclast  \T_key, \XMM6, \XMM6
1155                 vaesenclast  \T_key, \XMM7, \XMM7
1156                 vaesenclast  \T_key, \XMM8, \XMM8
1157 
1158                 vmovdqu  (arg4, %r11), \T1
1159                 vpxor    \T1, \XMM1, \XMM1
1160                 vmovdqu  \XMM1, (arg3 , %r11)
1161                 .if   \ENC_DEC == DEC
1162                 vmovdqa  \T1, \XMM1
1163                 .endif
1164 
1165                 vmovdqu  16*1(arg4, %r11), \T1
1166                 vpxor    \T1, \XMM2, \XMM2
1167                 vmovdqu  \XMM2, 16*1(arg3 , %r11)
1168                 .if   \ENC_DEC == DEC
1169                 vmovdqa  \T1, \XMM2
1170                 .endif
1171 
1172                 vmovdqu  16*2(arg4, %r11), \T1
1173                 vpxor    \T1, \XMM3, \XMM3
1174                 vmovdqu  \XMM3, 16*2(arg3 , %r11)
1175                 .if   \ENC_DEC == DEC
1176                 vmovdqa  \T1, \XMM3
1177                 .endif
1178 
1179                 vmovdqu  16*3(arg4, %r11), \T1
1180                 vpxor    \T1, \XMM4, \XMM4
1181                 vmovdqu  \XMM4, 16*3(arg3 , %r11)
1182                 .if   \ENC_DEC == DEC
1183                 vmovdqa  \T1, \XMM4
1184                 .endif
1185 
1186                 vmovdqu  16*4(arg4, %r11), \T1
1187                 vpxor    \T1, \XMM5, \XMM5
1188                 vmovdqu  \XMM5, 16*4(arg3 , %r11)
1189                 .if   \ENC_DEC == DEC
1190                 vmovdqa  \T1, \XMM5
1191                 .endif
1192 
1193                 vmovdqu  16*5(arg4, %r11), \T1
1194                 vpxor    \T1, \XMM6, \XMM6
1195                 vmovdqu  \XMM6, 16*5(arg3 , %r11)
1196                 .if   \ENC_DEC == DEC
1197                 vmovdqa  \T1, \XMM6
1198                 .endif
1199 
1200                 vmovdqu  16*6(arg4, %r11), \T1
1201                 vpxor    \T1, \XMM7, \XMM7
1202                 vmovdqu  \XMM7, 16*6(arg3 , %r11)
1203                 .if   \ENC_DEC == DEC
1204                 vmovdqa  \T1, \XMM7
1205                 .endif
1206 
1207                 vmovdqu  16*7(arg4, %r11), \T1
1208                 vpxor    \T1, \XMM8, \XMM8
1209                 vmovdqu  \XMM8, 16*7(arg3 , %r11)
1210                 .if   \ENC_DEC == DEC
1211                 vmovdqa  \T1, \XMM8
1212                 .endif
1213 
1214                 add     $128, %r11
1215 
1216                 vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
1217                 vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with the corresponding ciphertext
1218                 vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
1219                 vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
1220                 vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
1221                 vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
1222                 vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
1223                 vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
1224                 vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
1225 
1226 ###############################################################################
1227 
1228 _initial_blocks_done\@:
1229 
1230 .endm
1231 
1232 # encrypt 8 blocks at a time
1233 # ghash the 8 previously encrypted ciphertext blocks
1234 # arg1, arg3, arg4 are used as pointers only, not modified
1235 # r11 is the data offset value
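# The 8 ciphertext blocks of the previous iteration (stashed in \T2 and
# TMP2-TMP8) are multiplied by HashKey^8..HashKey^1 and accumulated while the
# AES rounds of the next 8 counter blocks are issued, so the pclmulqdq work
# overlaps with the aesenc work.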
1236 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1237 
1238         vmovdqa \XMM1, \T2
1239         vmovdqa \XMM2, TMP2(%rsp)
1240         vmovdqa \XMM3, TMP3(%rsp)
1241         vmovdqa \XMM4, TMP4(%rsp)
1242         vmovdqa \XMM5, TMP5(%rsp)
1243         vmovdqa \XMM6, TMP6(%rsp)
1244         vmovdqa \XMM7, TMP7(%rsp)
1245         vmovdqa \XMM8, TMP8(%rsp)
1246 
1247 .if \loop_idx == in_order
1248                 vpaddd  ONE(%rip), \CTR, \XMM1           # INCR CNT
1249                 vpaddd  ONE(%rip), \XMM1, \XMM2
1250                 vpaddd  ONE(%rip), \XMM2, \XMM3
1251                 vpaddd  ONE(%rip), \XMM3, \XMM4
1252                 vpaddd  ONE(%rip), \XMM4, \XMM5
1253                 vpaddd  ONE(%rip), \XMM5, \XMM6
1254                 vpaddd  ONE(%rip), \XMM6, \XMM7
1255                 vpaddd  ONE(%rip), \XMM7, \XMM8
1256                 vmovdqa \XMM8, \CTR
1257 
1258                 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1    # perform a 16Byte swap
1259                 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2    # perform a 16Byte swap
1260                 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3    # perform a 16Byte swap
1261                 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4    # perform a 16Byte swap
1262                 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5    # perform a 16Byte swap
1263                 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6    # perform a 16Byte swap
1264                 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7    # perform a 16Byte swap
1265                 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8    # perform a 16Byte swap
1266 .else
1267                 vpaddd  ONEf(%rip), \CTR, \XMM1           # INCR CNT
1268                 vpaddd  ONEf(%rip), \XMM1, \XMM2
1269                 vpaddd  ONEf(%rip), \XMM2, \XMM3
1270                 vpaddd  ONEf(%rip), \XMM3, \XMM4
1271                 vpaddd  ONEf(%rip), \XMM4, \XMM5
1272                 vpaddd  ONEf(%rip), \XMM5, \XMM6
1273                 vpaddd  ONEf(%rip), \XMM6, \XMM7
1274                 vpaddd  ONEf(%rip), \XMM7, \XMM8
1275                 vmovdqa \XMM8, \CTR
1276 .endif
1277 
1278 
1279         #######################################################################
1280 
1281                 vmovdqu (arg1), \T1
1282                 vpxor   \T1, \XMM1, \XMM1
1283                 vpxor   \T1, \XMM2, \XMM2
1284                 vpxor   \T1, \XMM3, \XMM3
1285                 vpxor   \T1, \XMM4, \XMM4
1286                 vpxor   \T1, \XMM5, \XMM5
1287                 vpxor   \T1, \XMM6, \XMM6
1288                 vpxor   \T1, \XMM7, \XMM7
1289                 vpxor   \T1, \XMM8, \XMM8
1290 
1291         #######################################################################
1292 
1293 
1294 
1295 
1296 
1297                 vmovdqu 16*1(arg1), \T1
1298                 vaesenc \T1, \XMM1, \XMM1
1299                 vaesenc \T1, \XMM2, \XMM2
1300                 vaesenc \T1, \XMM3, \XMM3
1301                 vaesenc \T1, \XMM4, \XMM4
1302                 vaesenc \T1, \XMM5, \XMM5
1303                 vaesenc \T1, \XMM6, \XMM6
1304                 vaesenc \T1, \XMM7, \XMM7
1305                 vaesenc \T1, \XMM8, \XMM8
1306 
1307                 vmovdqu 16*2(arg1), \T1
1308                 vaesenc \T1, \XMM1, \XMM1
1309                 vaesenc \T1, \XMM2, \XMM2
1310                 vaesenc \T1, \XMM3, \XMM3
1311                 vaesenc \T1, \XMM4, \XMM4
1312                 vaesenc \T1, \XMM5, \XMM5
1313                 vaesenc \T1, \XMM6, \XMM6
1314                 vaesenc \T1, \XMM7, \XMM7
1315                 vaesenc \T1, \XMM8, \XMM8
1316 
1317 
1318         #######################################################################
1319 
1320         vmovdqu         HashKey_8(arg2), \T5
1321         vpclmulqdq      $0x11, \T5, \T2, \T4             # T4 = a1*b1
1322         vpclmulqdq      $0x00, \T5, \T2, \T7             # T7 = a0*b0
1323 
1324         vpshufd         $0b01001110, \T2, \T6
1325         vpxor           \T2, \T6, \T6
1326 
1327         vmovdqu         HashKey_8_k(arg2), \T5
1328         vpclmulqdq      $0x00, \T5, \T6, \T6
1329 
1330                 vmovdqu 16*3(arg1), \T1
1331                 vaesenc \T1, \XMM1, \XMM1
1332                 vaesenc \T1, \XMM2, \XMM2
1333                 vaesenc \T1, \XMM3, \XMM3
1334                 vaesenc \T1, \XMM4, \XMM4
1335                 vaesenc \T1, \XMM5, \XMM5
1336                 vaesenc \T1, \XMM6, \XMM6
1337                 vaesenc \T1, \XMM7, \XMM7
1338                 vaesenc \T1, \XMM8, \XMM8
1339 
1340         vmovdqa         TMP2(%rsp), \T1
1341         vmovdqu         HashKey_7(arg2), \T5
1342         vpclmulqdq      $0x11, \T5, \T1, \T3
1343         vpxor           \T3, \T4, \T4
1344         vpclmulqdq      $0x00, \T5, \T1, \T3
1345         vpxor           \T3, \T7, \T7
1346 
1347         vpshufd         $0b01001110, \T1, \T3
1348         vpxor           \T1, \T3, \T3
1349         vmovdqu         HashKey_7_k(arg2), \T5
1350         vpclmulqdq      $0x10, \T5, \T3, \T3
1351         vpxor           \T3, \T6, \T6
1352 
1353                 vmovdqu 16*4(arg1), \T1
1354                 vaesenc \T1, \XMM1, \XMM1
1355                 vaesenc \T1, \XMM2, \XMM2
1356                 vaesenc \T1, \XMM3, \XMM3
1357                 vaesenc \T1, \XMM4, \XMM4
1358                 vaesenc \T1, \XMM5, \XMM5
1359                 vaesenc \T1, \XMM6, \XMM6
1360                 vaesenc \T1, \XMM7, \XMM7
1361                 vaesenc \T1, \XMM8, \XMM8
1362 
1363         #######################################################################
1364 
1365         vmovdqa         TMP3(%rsp), \T1
1366         vmovdqu         HashKey_6(arg2), \T5
1367         vpclmulqdq      $0x11, \T5, \T1, \T3
1368         vpxor           \T3, \T4, \T4
1369         vpclmulqdq      $0x00, \T5, \T1, \T3
1370         vpxor           \T3, \T7, \T7
1371 
1372         vpshufd         $0b01001110, \T1, \T3
1373         vpxor           \T1, \T3, \T3
1374         vmovdqu         HashKey_6_k(arg2), \T5
1375         vpclmulqdq      $0x10, \T5, \T3, \T3
1376         vpxor           \T3, \T6, \T6
1377 
1378                 vmovdqu 16*5(arg1), \T1
1379                 vaesenc \T1, \XMM1, \XMM1
1380                 vaesenc \T1, \XMM2, \XMM2
1381                 vaesenc \T1, \XMM3, \XMM3
1382                 vaesenc \T1, \XMM4, \XMM4
1383                 vaesenc \T1, \XMM5, \XMM5
1384                 vaesenc \T1, \XMM6, \XMM6
1385                 vaesenc \T1, \XMM7, \XMM7
1386                 vaesenc \T1, \XMM8, \XMM8
1387 
1388         vmovdqa         TMP4(%rsp), \T1
1389         vmovdqu         HashKey_5(arg2), \T5
1390         vpclmulqdq      $0x11, \T5, \T1, \T3
1391         vpxor           \T3, \T4, \T4
1392         vpclmulqdq      $0x00, \T5, \T1, \T3
1393         vpxor           \T3, \T7, \T7
1394 
1395         vpshufd         $0b01001110, \T1, \T3
1396         vpxor           \T1, \T3, \T3
1397         vmovdqu         HashKey_5_k(arg2), \T5
1398         vpclmulqdq      $0x10, \T5, \T3, \T3
1399         vpxor           \T3, \T6, \T6
1400 
1401                 vmovdqu 16*6(arg1), \T1
1402                 vaesenc \T1, \XMM1, \XMM1
1403                 vaesenc \T1, \XMM2, \XMM2
1404                 vaesenc \T1, \XMM3, \XMM3
1405                 vaesenc \T1, \XMM4, \XMM4
1406                 vaesenc \T1, \XMM5, \XMM5
1407                 vaesenc \T1, \XMM6, \XMM6
1408                 vaesenc \T1, \XMM7, \XMM7
1409                 vaesenc \T1, \XMM8, \XMM8
1410 
1411 
1412         vmovdqa         TMP5(%rsp), \T1
1413         vmovdqu         HashKey_4(arg2), \T5
1414         vpclmulqdq      $0x11, \T5, \T1, \T3
1415         vpxor           \T3, \T4, \T4
1416         vpclmulqdq      $0x00, \T5, \T1, \T3
1417         vpxor           \T3, \T7, \T7
1418 
1419         vpshufd         $0b01001110, \T1, \T3
1420         vpxor           \T1, \T3, \T3
1421         vmovdqu         HashKey_4_k(arg2), \T5
1422         vpclmulqdq      $0x10, \T5, \T3, \T3
1423         vpxor           \T3, \T6, \T6
1424 
1425                 vmovdqu 16*7(arg1), \T1
1426                 vaesenc \T1, \XMM1, \XMM1
1427                 vaesenc \T1, \XMM2, \XMM2
1428                 vaesenc \T1, \XMM3, \XMM3
1429                 vaesenc \T1, \XMM4, \XMM4
1430                 vaesenc \T1, \XMM5, \XMM5
1431                 vaesenc \T1, \XMM6, \XMM6
1432                 vaesenc \T1, \XMM7, \XMM7
1433                 vaesenc \T1, \XMM8, \XMM8
1434 
1435         vmovdqa         TMP6(%rsp), \T1
1436         vmovdqu         HashKey_3(arg2), \T5
1437         vpclmulqdq      $0x11, \T5, \T1, \T3
1438         vpxor           \T3, \T4, \T4
1439         vpclmulqdq      $0x00, \T5, \T1, \T3
1440         vpxor           \T3, \T7, \T7
1441 
1442         vpshufd         $0b01001110, \T1, \T3
1443         vpxor           \T1, \T3, \T3
1444         vmovdqu         HashKey_3_k(arg2), \T5
1445         vpclmulqdq      $0x10, \T5, \T3, \T3
1446         vpxor           \T3, \T6, \T6
1447 
1448 
1449                 vmovdqu 16*8(arg1), \T1
1450                 vaesenc \T1, \XMM1, \XMM1
1451                 vaesenc \T1, \XMM2, \XMM2
1452                 vaesenc \T1, \XMM3, \XMM3
1453                 vaesenc \T1, \XMM4, \XMM4
1454                 vaesenc \T1, \XMM5, \XMM5
1455                 vaesenc \T1, \XMM6, \XMM6
1456                 vaesenc \T1, \XMM7, \XMM7
1457                 vaesenc \T1, \XMM8, \XMM8
1458 
1459         vmovdqa         TMP7(%rsp), \T1
1460         vmovdqu         HashKey_2(arg2), \T5
1461         vpclmulqdq      $0x11, \T5, \T1, \T3
1462         vpxor           \T3, \T4, \T4
1463         vpclmulqdq      $0x00, \T5, \T1, \T3
1464         vpxor           \T3, \T7, \T7
1465 
1466         vpshufd         $0b01001110, \T1, \T3
1467         vpxor           \T1, \T3, \T3
1468         vmovdqu         HashKey_2_k(arg2), \T5
1469         vpclmulqdq      $0x10, \T5, \T3, \T3
1470         vpxor           \T3, \T6, \T6
1471 
1472         #######################################################################
1473 
1474                 vmovdqu 16*9(arg1), \T5
1475                 vaesenc \T5, \XMM1, \XMM1
1476                 vaesenc \T5, \XMM2, \XMM2
1477                 vaesenc \T5, \XMM3, \XMM3
1478                 vaesenc \T5, \XMM4, \XMM4
1479                 vaesenc \T5, \XMM5, \XMM5
1480                 vaesenc \T5, \XMM6, \XMM6
1481                 vaesenc \T5, \XMM7, \XMM7
1482                 vaesenc \T5, \XMM8, \XMM8
1483 
1484         vmovdqa         TMP8(%rsp), \T1
1485         vmovdqu         HashKey(arg2), \T5
1486         vpclmulqdq      $0x11, \T5, \T1, \T3
1487         vpxor           \T3, \T4, \T4
1488         vpclmulqdq      $0x00, \T5, \T1, \T3
1489         vpxor           \T3, \T7, \T7
1490 
1491         vpshufd         $0b01001110, \T1, \T3
1492         vpxor           \T1, \T3, \T3
1493         vmovdqu         HashKey_k(arg2), \T5
1494         vpclmulqdq      $0x10, \T5, \T3, \T3
1495         vpxor           \T3, \T6, \T6
1496 
1497         vpxor           \T4, \T6, \T6
1498         vpxor           \T7, \T6, \T6
1499 
1500                 vmovdqu 16*10(arg1), \T5
1501 
1502         i = 11
1503         setreg
1504 .rep (\REP-9)
1505 
1506         vaesenc \T5, \XMM1, \XMM1
1507         vaesenc \T5, \XMM2, \XMM2
1508         vaesenc \T5, \XMM3, \XMM3
1509         vaesenc \T5, \XMM4, \XMM4
1510         vaesenc \T5, \XMM5, \XMM5
1511         vaesenc \T5, \XMM6, \XMM6
1512         vaesenc \T5, \XMM7, \XMM7
1513         vaesenc \T5, \XMM8, \XMM8
1514 
1515         vmovdqu 16*i(arg1), \T5
1516         i = i + 1
1517         setreg
1518 .endr
1519 
1520 	i = 0
1521 	j = 1
1522 	setreg
1523 .rep 8
1524 		vpxor	16*i(arg4, %r11), \T5, \T2
1525                 .if \ENC_DEC == ENC
1526                 vaesenclast     \T2, reg_j, reg_j
1527                 .else
1528                 vaesenclast     \T2, reg_j, \T3
1529                 vmovdqu 16*i(arg4, %r11), reg_j
1530                 vmovdqu \T3, 16*i(arg3, %r11)
1531                 .endif
1532 	i = (i+1)
1533 	j = (j+1)
1534 	setreg
1535 .endr
1536 	#######################################################################
1537 
1538 
	vpslldq	$8, \T6, \T3				# shift-L T6 2 DWs (low half into T3)
	vpsrldq	$8, \T6, \T6				# shift-R T6 2 DWs
1541 	vpxor	\T3, \T7, \T7
1542 	vpxor	\T4, \T6, \T6				# accumulate the results in T6:T7
1543 
1544 
1545 
1546 	#######################################################################
1547 	#first phase of the reduction
1548 	#######################################################################
        vpslld  $31, \T7, \T2                           # packed left shifting << 31
        vpslld  $30, \T7, \T3                           # packed left shifting << 30
        vpslld  $25, \T7, \T4                           # packed left shifting << 25
1552 
1553         vpxor   \T3, \T2, \T2                           # xor the shifted versions
1554         vpxor   \T4, \T2, \T2
1555 
1556         vpsrldq $4, \T2, \T1                            # shift-R T1 1 DW
1557 
1558         vpslldq $12, \T2, \T2                           # shift-L T2 3 DWs
1559         vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
1560 	#######################################################################
1561                 .if \ENC_DEC == ENC
1562 		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
1563 		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
1564 		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
1565 		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
1566 		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
1567 		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
1568 		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
1569 		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
1570                 .endif
1571 
1572 	#######################################################################
1573 	#second phase of the reduction
        vpsrld  $1, \T7, \T2                            # packed right shifting >> 1
        vpsrld  $2, \T7, \T3                            # packed right shifting >> 2
        vpsrld  $7, \T7, \T4                            # packed right shifting >> 7
1577         vpxor   \T3, \T2, \T2                           # xor the shifted versions
1578         vpxor   \T4, \T2, \T2
1579 
1580         vpxor   \T1, \T2, \T2
1581         vpxor   \T2, \T7, \T7
1582         vpxor   \T7, \T6, \T6                           # the result is in T6
1583 	#######################################################################
1584 
1585 		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
1586 		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
1587 		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
1588 		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
1589 		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
1590 		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
1591 		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
1592 		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
1593 
1594 
1595 	vpxor	\T6, \XMM1, \XMM1
1596 
1597 
1598 
1599 .endm
1600 
1601 
# GHASH the last 8 ciphertext blocks.
1603 .macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1604 
1605         ## Karatsuba Method
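        ## Rough outline of the per-block Karatsuba step used below: with a
        ## ciphertext block X split into 64-bit halves x1:x0 and a hash-key
        ## power H split into h1:h0 (all products carry-less),
        ##     X*H = (x1*h1 << 128) ^ (M << 64) ^ (x0*h0)
        ##     M   = (x1^x0)*(h1^h0) ^ (x1*h1) ^ (x0*h0)
        ## so only three VPCLMULQDQs are needed per block.  The high products
        ## are accumulated in T6, the low products in T7, the middle terms in
        ## XMM1, and HashKey_i_k holds the precomputed (h1^h0) halves.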
1606 
1607 
1608         vpshufd         $0b01001110, \XMM1, \T2
1609         vpxor           \XMM1, \T2, \T2
1610         vmovdqu         HashKey_8(arg2), \T5
1611         vpclmulqdq      $0x11, \T5, \XMM1, \T6
1612         vpclmulqdq      $0x00, \T5, \XMM1, \T7
1613 
1614         vmovdqu         HashKey_8_k(arg2), \T3
1615         vpclmulqdq      $0x00, \T3, \T2, \XMM1
1616 
1617         ######################
1618 
1619         vpshufd         $0b01001110, \XMM2, \T2
1620         vpxor           \XMM2, \T2, \T2
1621         vmovdqu         HashKey_7(arg2), \T5
1622         vpclmulqdq      $0x11, \T5, \XMM2, \T4
1623         vpxor           \T4, \T6, \T6
1624 
1625         vpclmulqdq      $0x00, \T5, \XMM2, \T4
1626         vpxor           \T4, \T7, \T7
1627 
1628         vmovdqu         HashKey_7_k(arg2), \T3
1629         vpclmulqdq      $0x00, \T3, \T2, \T2
1630         vpxor           \T2, \XMM1, \XMM1
1631 
1632         ######################
1633 
1634         vpshufd         $0b01001110, \XMM3, \T2
1635         vpxor           \XMM3, \T2, \T2
1636         vmovdqu         HashKey_6(arg2), \T5
1637         vpclmulqdq      $0x11, \T5, \XMM3, \T4
1638         vpxor           \T4, \T6, \T6
1639 
1640         vpclmulqdq      $0x00, \T5, \XMM3, \T4
1641         vpxor           \T4, \T7, \T7
1642 
1643         vmovdqu         HashKey_6_k(arg2), \T3
1644         vpclmulqdq      $0x00, \T3, \T2, \T2
1645         vpxor           \T2, \XMM1, \XMM1
1646 
1647         ######################
1648 
1649         vpshufd         $0b01001110, \XMM4, \T2
1650         vpxor           \XMM4, \T2, \T2
1651         vmovdqu         HashKey_5(arg2), \T5
1652         vpclmulqdq      $0x11, \T5, \XMM4, \T4
1653         vpxor           \T4, \T6, \T6
1654 
1655         vpclmulqdq      $0x00, \T5, \XMM4, \T4
1656         vpxor           \T4, \T7, \T7
1657 
1658         vmovdqu         HashKey_5_k(arg2), \T3
1659         vpclmulqdq      $0x00, \T3, \T2, \T2
1660         vpxor           \T2, \XMM1, \XMM1
1661 
1662         ######################
1663 
1664         vpshufd         $0b01001110, \XMM5, \T2
1665         vpxor           \XMM5, \T2, \T2
1666         vmovdqu         HashKey_4(arg2), \T5
1667         vpclmulqdq      $0x11, \T5, \XMM5, \T4
1668         vpxor           \T4, \T6, \T6
1669 
1670         vpclmulqdq      $0x00, \T5, \XMM5, \T4
1671         vpxor           \T4, \T7, \T7
1672 
1673         vmovdqu         HashKey_4_k(arg2), \T3
1674         vpclmulqdq      $0x00, \T3, \T2, \T2
1675         vpxor           \T2, \XMM1, \XMM1
1676 
1677         ######################
1678 
1679         vpshufd         $0b01001110, \XMM6, \T2
1680         vpxor           \XMM6, \T2, \T2
1681         vmovdqu         HashKey_3(arg2), \T5
1682         vpclmulqdq      $0x11, \T5, \XMM6, \T4
1683         vpxor           \T4, \T6, \T6
1684 
1685         vpclmulqdq      $0x00, \T5, \XMM6, \T4
1686         vpxor           \T4, \T7, \T7
1687 
1688         vmovdqu         HashKey_3_k(arg2), \T3
1689         vpclmulqdq      $0x00, \T3, \T2, \T2
1690         vpxor           \T2, \XMM1, \XMM1
1691 
1692         ######################
1693 
1694         vpshufd         $0b01001110, \XMM7, \T2
1695         vpxor           \XMM7, \T2, \T2
1696         vmovdqu         HashKey_2(arg2), \T5
1697         vpclmulqdq      $0x11, \T5, \XMM7, \T4
1698         vpxor           \T4, \T6, \T6
1699 
1700         vpclmulqdq      $0x00, \T5, \XMM7, \T4
1701         vpxor           \T4, \T7, \T7
1702 
1703         vmovdqu         HashKey_2_k(arg2), \T3
1704         vpclmulqdq      $0x00, \T3, \T2, \T2
1705         vpxor           \T2, \XMM1, \XMM1
1706 
1707         ######################
1708 
1709         vpshufd         $0b01001110, \XMM8, \T2
1710         vpxor           \XMM8, \T2, \T2
1711         vmovdqu         HashKey(arg2), \T5
1712         vpclmulqdq      $0x11, \T5, \XMM8, \T4
1713         vpxor           \T4, \T6, \T6
1714 
1715         vpclmulqdq      $0x00, \T5, \XMM8, \T4
1716         vpxor           \T4, \T7, \T7
1717 
1718         vmovdqu         HashKey_k(arg2), \T3
1719         vpclmulqdq      $0x00, \T3, \T2, \T2
1720 
1721         vpxor           \T2, \XMM1, \XMM1
1722         vpxor           \T6, \XMM1, \XMM1
1723         vpxor           \T7, \XMM1, \T2
1724 
1725 
1726 
1727 
1728         vpslldq $8, \T2, \T4
1729         vpsrldq $8, \T2, \T2
1730 
1731         vpxor   \T4, \T7, \T7
1732         vpxor   \T2, \T6, \T6   # <T6:T7> holds the result of
1733 				# the accumulated carry-less multiplications
1734 
1735         #######################################################################
1736         #first phase of the reduction
        vpslld  $31, \T7, \T2   # packed left shifting << 31
        vpslld  $30, \T7, \T3   # packed left shifting << 30
        vpslld  $25, \T7, \T4   # packed left shifting << 25
1740 
1741         vpxor   \T3, \T2, \T2   # xor the shifted versions
1742         vpxor   \T4, \T2, \T2
1743 
1744         vpsrldq $4, \T2, \T1    # shift-R T1 1 DW
1745 
1746         vpslldq $12, \T2, \T2   # shift-L T2 3 DWs
1747         vpxor   \T2, \T7, \T7   # first phase of the reduction complete
1748         #######################################################################
1749 
1750 
1751         #second phase of the reduction
        vpsrld  $1, \T7, \T2    # packed right shifting >> 1
        vpsrld  $2, \T7, \T3    # packed right shifting >> 2
        vpsrld  $7, \T7, \T4    # packed right shifting >> 7
1755         vpxor   \T3, \T2, \T2   # xor the shifted versions
1756         vpxor   \T4, \T2, \T2
1757 
1758         vpxor   \T1, \T2, \T2
1759         vpxor   \T2, \T7, \T7
1760         vpxor   \T7, \T6, \T6   # the result is in T6
1761 
1762 .endm
1763 
1764 #############################################################
#void   aesni_gcm_init_avx_gen2
#        (gcm_data     *my_ctx_data,
#         gcm_context_data *data,
#        u8      *iv, /* Pre-counter block j0: 4 byte salt
#			(from Security Association) concatenated with 8 byte
#			Initialisation Vector (from IPSec ESP Payload)
#			concatenated with 0x00000001. 16-byte aligned pointer. */
#        u8     *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1775 #############################################################
1776 SYM_FUNC_START(aesni_gcm_init_avx_gen2)
1777         FUNC_SAVE
1778         INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
1779         FUNC_RESTORE
1780         RET
1781 SYM_FUNC_END(aesni_gcm_init_avx_gen2)
1782 
1783 ###############################################################################
1784 #void   aesni_gcm_enc_update_avx_gen2(
1785 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1786 #        gcm_context_data *data,
1787 #        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
1788 #        const   u8 *in, /* Plaintext input */
1789 #        u64     plaintext_len) /* Length of data in Bytes for encryption. */
1790 ###############################################################################
1791 SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
1792         FUNC_SAVE
1793         mov     keysize, %eax
1794         cmp     $32, %eax
1795         je      key_256_enc_update
1796         cmp     $16, %eax
1797         je      key_128_enc_update
1798         # must be 192
1799         GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
1800         FUNC_RESTORE
1801         RET
1802 key_128_enc_update:
1803         GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
1804         FUNC_RESTORE
1805         RET
1806 key_256_enc_update:
1807         GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
1808         FUNC_RESTORE
1809         RET
1810 SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
1811 
1812 ###############################################################################
1813 #void   aesni_gcm_dec_update_avx_gen2(
1814 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1815 #        gcm_context_data *data,
1816 #        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
1817 #        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
1819 ###############################################################################
1820 SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
1821         FUNC_SAVE
1822         mov     keysize,%eax
1823         cmp     $32, %eax
1824         je      key_256_dec_update
1825         cmp     $16, %eax
1826         je      key_128_dec_update
1827         # must be 192
1828         GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
1829         FUNC_RESTORE
1830         RET
1831 key_128_dec_update:
1832         GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
1833         FUNC_RESTORE
1834         RET
1835 key_256_dec_update:
1836         GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
1837         FUNC_RESTORE
1838         RET
1839 SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
1840 
1841 ###############################################################################
1842 #void   aesni_gcm_finalize_avx_gen2(
1843 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1844 #        gcm_context_data *data,
1845 #        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
1847 #				Valid values are 16 (most likely), 12 or 8. */
1848 ###############################################################################
1849 SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
1850         FUNC_SAVE
1851         mov	keysize,%eax
1852         cmp     $32, %eax
1853         je      key_256_finalize
1854         cmp     $16, %eax
1855         je      key_128_finalize
1856         # must be 192
1857         GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
1858         FUNC_RESTORE
1859         RET
1860 key_128_finalize:
1861         GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
1862         FUNC_RESTORE
1863         RET
1864 key_256_finalize:
1865         GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
1866         FUNC_RESTORE
1867         RET
1868 SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
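
###############################################################################
# Illustrative call sequence for the gen2 entry points above (a C-style
# sketch only; the exact context types and glue-layer variable names are
# assumptions, see the prototype comments for the real parameter meanings):
#
#	struct gcm_context_data data;
#
#	aesni_gcm_init_avx_gen2(aes_ctx, &data, iv, hash_subkey, aad, aad_len);
#	aesni_gcm_enc_update_avx_gen2(aes_ctx, &data, out, in, plaintext_len);
#	aesni_gcm_finalize_avx_gen2(aes_ctx, &data, auth_tag, auth_tag_len);
#
# Decryption has the same shape with aesni_gcm_dec_update_avx_gen2, and the
# update step may be called repeatedly on successive chunks of data.
###############################################################################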
1869 
1870 ###############################################################################
1871 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1872 # Input: A and B (128-bits each, bit-reflected)
1873 # Output: C = A*B*x mod poly, (i.e. >>1 )
1874 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1875 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1876 ###############################################################################
1877 .macro  GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1878 
1879         vpclmulqdq      $0x11,\HK,\GH,\T1      # T1 = a1*b1
1880         vpclmulqdq      $0x00,\HK,\GH,\T2      # T2 = a0*b0
1881         vpclmulqdq      $0x01,\HK,\GH,\T3      # T3 = a1*b0
1882         vpclmulqdq      $0x10,\HK,\GH,\GH      # GH = a0*b1
1883         vpxor           \T3, \GH, \GH
1884 
1885 
1886         vpsrldq         $8 , \GH, \T3          # shift-R GH 2 DWs
1887         vpslldq         $8 , \GH, \GH          # shift-L GH 2 DWs
1888 
1889         vpxor           \T3, \T1, \T1
1890         vpxor           \T2, \GH, \GH
1891 
1892         #######################################################################
1893         #first phase of the reduction
1894         vmovdqa         POLY2(%rip), \T3
1895 
1896         vpclmulqdq      $0x01, \GH, \T3, \T2
1897         vpslldq         $8, \T2, \T2           # shift-L T2 2 DWs
1898 
1899         vpxor           \T2, \GH, \GH          # first phase of the reduction complete
1900         #######################################################################
1901         #second phase of the reduction
1902         vpclmulqdq      $0x00, \GH, \T3, \T2
1903         vpsrldq         $4, \T2, \T2           # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1904 
1905         vpclmulqdq      $0x10, \GH, \T3, \GH
1906         vpslldq         $4, \GH, \GH           # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1907 
1908         vpxor           \T2, \GH, \GH          # second phase of the reduction complete
1909         #######################################################################
1910         vpxor           \T1, \GH, \GH          # the result is in GH
1911 
1912 
1913 .endm
1914 
1915 .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1916 
        # Precompute HashKey^2 through HashKey^8 (each <<1 mod poly) for the
        # 8-block-parallel GHASH passes
1918         vmovdqa  \HK, \T5
1919         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^2<<1 mod poly
1920         vmovdqu  \T5, HashKey_2(arg2)                       #  [HashKey_2] = HashKey^2<<1 mod poly
1921 
1922         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^3<<1 mod poly
1923         vmovdqu  \T5, HashKey_3(arg2)
1924 
1925         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^4<<1 mod poly
1926         vmovdqu  \T5, HashKey_4(arg2)
1927 
1928         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^5<<1 mod poly
1929         vmovdqu  \T5, HashKey_5(arg2)
1930 
1931         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^6<<1 mod poly
1932         vmovdqu  \T5, HashKey_6(arg2)
1933 
1934         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^7<<1 mod poly
1935         vmovdqu  \T5, HashKey_7(arg2)
1936 
1937         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^8<<1 mod poly
1938         vmovdqu  \T5, HashKey_8(arg2)
1939 
1940 .endm
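
# The powers stored above let eight blocks be hashed per pass without a
# serial dependency: conceptually, for ciphertext blocks X1..X8 the update is
#
#     GHASH_new = (GHASH_old ^ X1)*HashKey_8 ^ X2*HashKey_7 ^ ... ^ X8*HashKey
#
# (all products carry-less, with a single reduction at the end), which is what
# GHASH_8_ENCRYPT_8_PARALLEL_AVX2 and GHASH_LAST_8_AVX2 compute below.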
1941 
## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 8
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg3, arg4, r14 are used as pointers only, not modified
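## e.g. (sketch): a = 200 plaintext bytes gives b = floor(200/16) = 12 full
## blocks, so num_initial_blocks = 12 mod 8 = 4; those 4 blocks are handled
## here and the remaining 8 full blocks go through the 8-wide parallel loop.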
1948 
1949 .macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1950 	i = (8-\num_initial_blocks)
1951 	setreg
1952 	vmovdqu AadHash(arg2), reg_i
1953 
1954 	# start AES for num_initial_blocks blocks
1955 	vmovdqu CurCount(arg2), \CTR
1956 
1957 	i = (9-\num_initial_blocks)
1958 	setreg
1959 .rep \num_initial_blocks
1960                 vpaddd  ONE(%rip), \CTR, \CTR   # INCR Y0
1961                 vmovdqa \CTR, reg_i
1962                 vpshufb SHUF_MASK(%rip), reg_i, reg_i     # perform a 16Byte swap
1963 	i = (i+1)
1964 	setreg
1965 .endr
1966 
1967 	vmovdqa  (arg1), \T_key
1968 	i = (9-\num_initial_blocks)
1969 	setreg
1970 .rep \num_initial_blocks
1971                 vpxor   \T_key, reg_i, reg_i
1972 	i = (i+1)
1973 	setreg
1974 .endr
1975 
1976 	j = 1
1977 	setreg
1978 .rep \REP
1979 	vmovdqa  16*j(arg1), \T_key
1980 	i = (9-\num_initial_blocks)
1981 	setreg
1982 .rep \num_initial_blocks
1983         vaesenc \T_key, reg_i, reg_i
1984 	i = (i+1)
1985 	setreg
1986 .endr
1987 
1988 	j = (j+1)
1989 	setreg
1990 .endr
1991 
1992 
1993 	vmovdqa  16*j(arg1), \T_key
1994 	i = (9-\num_initial_blocks)
1995 	setreg
1996 .rep \num_initial_blocks
1997         vaesenclast      \T_key, reg_i, reg_i
1998 	i = (i+1)
1999 	setreg
2000 .endr
2001 
2002 	i = (9-\num_initial_blocks)
2003 	setreg
2004 .rep \num_initial_blocks
2005                 vmovdqu (arg4, %r11), \T1
2006                 vpxor   \T1, reg_i, reg_i
2007                 vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for
2008 						       # num_initial_blocks blocks
2009                 add     $16, %r11
2010 .if  \ENC_DEC == DEC
2011                 vmovdqa \T1, reg_i
2012 .endif
2013                 vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
2014 	i = (i+1)
2015 	setreg
2016 .endr
2017 
2018 
2019 	i = (8-\num_initial_blocks)
2020 	j = (9-\num_initial_blocks)
2021 	setreg
2022 
2023 .rep \num_initial_blocks
2024         vpxor    reg_i, reg_j, reg_j
2025         GHASH_MUL_AVX2       reg_j, \T2, \T1, \T3, \T4, \T5, \T6  # apply GHASH on num_initial_blocks blocks
2026 	i = (i+1)
2027 	j = (j+1)
2028 	setreg
2029 .endr
2030         # XMM8 has the combined result here
2031 
2032         vmovdqa  \XMM8, TMP1(%rsp)
2033         vmovdqa  \XMM8, \T3
2034 
2035         cmp     $128, %r13
2036         jl      _initial_blocks_done\@                  # no need for precomputed constants
2037 
2038 ###############################################################################
# Prepare and encrypt 8 counter blocks for the first pass of the main loop
2040                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2041                 vmovdqa  \CTR, \XMM1
2042                 vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
2043 
2044                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2045                 vmovdqa  \CTR, \XMM2
2046                 vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
2047 
2048                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2049                 vmovdqa  \CTR, \XMM3
2050                 vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
2051 
2052                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2053                 vmovdqa  \CTR, \XMM4
2054                 vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
2055 
2056                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2057                 vmovdqa  \CTR, \XMM5
2058                 vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
2059 
2060                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2061                 vmovdqa  \CTR, \XMM6
2062                 vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
2063 
2064                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2065                 vmovdqa  \CTR, \XMM7
2066                 vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
2067 
2068                 vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2069                 vmovdqa  \CTR, \XMM8
2070                 vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
2071 
2072                 vmovdqa  (arg1), \T_key
2073                 vpxor    \T_key, \XMM1, \XMM1
2074                 vpxor    \T_key, \XMM2, \XMM2
2075                 vpxor    \T_key, \XMM3, \XMM3
2076                 vpxor    \T_key, \XMM4, \XMM4
2077                 vpxor    \T_key, \XMM5, \XMM5
2078                 vpxor    \T_key, \XMM6, \XMM6
2079                 vpxor    \T_key, \XMM7, \XMM7
2080                 vpxor    \T_key, \XMM8, \XMM8
2081 
2082 		i = 1
2083 		setreg
2084 .rep    \REP       # do REP rounds
2085                 vmovdqa  16*i(arg1), \T_key
2086                 vaesenc  \T_key, \XMM1, \XMM1
2087                 vaesenc  \T_key, \XMM2, \XMM2
2088                 vaesenc  \T_key, \XMM3, \XMM3
2089                 vaesenc  \T_key, \XMM4, \XMM4
2090                 vaesenc  \T_key, \XMM5, \XMM5
2091                 vaesenc  \T_key, \XMM6, \XMM6
2092                 vaesenc  \T_key, \XMM7, \XMM7
2093                 vaesenc  \T_key, \XMM8, \XMM8
2094 		i = (i+1)
2095 		setreg
2096 .endr
2097 
2098 
2099                 vmovdqa  16*i(arg1), \T_key
2100                 vaesenclast  \T_key, \XMM1, \XMM1
2101                 vaesenclast  \T_key, \XMM2, \XMM2
2102                 vaesenclast  \T_key, \XMM3, \XMM3
2103                 vaesenclast  \T_key, \XMM4, \XMM4
2104                 vaesenclast  \T_key, \XMM5, \XMM5
2105                 vaesenclast  \T_key, \XMM6, \XMM6
2106                 vaesenclast  \T_key, \XMM7, \XMM7
2107                 vaesenclast  \T_key, \XMM8, \XMM8
2108 
2109                 vmovdqu  (arg4, %r11), \T1
2110                 vpxor    \T1, \XMM1, \XMM1
2111                 vmovdqu  \XMM1, (arg3 , %r11)
2112                 .if   \ENC_DEC == DEC
2113                 vmovdqa  \T1, \XMM1
2114                 .endif
2115 
2116                 vmovdqu  16*1(arg4, %r11), \T1
2117                 vpxor    \T1, \XMM2, \XMM2
2118                 vmovdqu  \XMM2, 16*1(arg3 , %r11)
2119                 .if   \ENC_DEC == DEC
2120                 vmovdqa  \T1, \XMM2
2121                 .endif
2122 
2123                 vmovdqu  16*2(arg4, %r11), \T1
2124                 vpxor    \T1, \XMM3, \XMM3
2125                 vmovdqu  \XMM3, 16*2(arg3 , %r11)
2126                 .if   \ENC_DEC == DEC
2127                 vmovdqa  \T1, \XMM3
2128                 .endif
2129 
2130                 vmovdqu  16*3(arg4, %r11), \T1
2131                 vpxor    \T1, \XMM4, \XMM4
2132                 vmovdqu  \XMM4, 16*3(arg3 , %r11)
2133                 .if   \ENC_DEC == DEC
2134                 vmovdqa  \T1, \XMM4
2135                 .endif
2136 
2137                 vmovdqu  16*4(arg4, %r11), \T1
2138                 vpxor    \T1, \XMM5, \XMM5
2139                 vmovdqu  \XMM5, 16*4(arg3 , %r11)
2140                 .if   \ENC_DEC == DEC
2141                 vmovdqa  \T1, \XMM5
2142                 .endif
2143 
2144                 vmovdqu  16*5(arg4, %r11), \T1
2145                 vpxor    \T1, \XMM6, \XMM6
2146                 vmovdqu  \XMM6, 16*5(arg3 , %r11)
2147                 .if   \ENC_DEC == DEC
2148                 vmovdqa  \T1, \XMM6
2149                 .endif
2150 
2151                 vmovdqu  16*6(arg4, %r11), \T1
2152                 vpxor    \T1, \XMM7, \XMM7
2153                 vmovdqu  \XMM7, 16*6(arg3 , %r11)
2154                 .if   \ENC_DEC == DEC
2155                 vmovdqa  \T1, \XMM7
2156                 .endif
2157 
2158                 vmovdqu  16*7(arg4, %r11), \T1
2159                 vpxor    \T1, \XMM8, \XMM8
2160                 vmovdqu  \XMM8, 16*7(arg3 , %r11)
2161                 .if   \ENC_DEC == DEC
2162                 vmovdqa  \T1, \XMM8
2163                 .endif
2164 
2165                 add     $128, %r11
2166 
2167                 vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
2168                 vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with
2169 							   # the corresponding ciphertext
2170                 vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
2171                 vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
2172                 vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
2173                 vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
2174                 vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
2175                 vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
2176                 vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
2177 
2178 ###############################################################################
2179 
2180 _initial_blocks_done\@:
2181 
2182 
2183 .endm
2184 
2185 
2186 
2187 # encrypt 8 blocks at a time
2188 # ghash the 8 previously encrypted ciphertext blocks
2189 # arg1, arg3, arg4 are used as pointers only, not modified
2190 # r11 is the data offset value
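# The AES rounds for the next 8 counter blocks are interleaved with the
# carry-less multiplies that GHASH the 8 ciphertext blocks produced by the
# previous pass (saved into T2 and TMP2..TMP8 below); interleaving the two
# instruction streams helps hide the AESENC and VPCLMULQDQ latencies.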
2191 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2192 
2193         vmovdqa \XMM1, \T2
2194         vmovdqa \XMM2, TMP2(%rsp)
2195         vmovdqa \XMM3, TMP3(%rsp)
2196         vmovdqa \XMM4, TMP4(%rsp)
2197         vmovdqa \XMM5, TMP5(%rsp)
2198         vmovdqa \XMM6, TMP6(%rsp)
2199         vmovdqa \XMM7, TMP7(%rsp)
2200         vmovdqa \XMM8, TMP8(%rsp)
2201 
2202 .if \loop_idx == in_order
2203                 vpaddd  ONE(%rip), \CTR, \XMM1            # INCR CNT
2204                 vpaddd  ONE(%rip), \XMM1, \XMM2
2205                 vpaddd  ONE(%rip), \XMM2, \XMM3
2206                 vpaddd  ONE(%rip), \XMM3, \XMM4
2207                 vpaddd  ONE(%rip), \XMM4, \XMM5
2208                 vpaddd  ONE(%rip), \XMM5, \XMM6
2209                 vpaddd  ONE(%rip), \XMM6, \XMM7
2210                 vpaddd  ONE(%rip), \XMM7, \XMM8
2211                 vmovdqa \XMM8, \CTR
2212 
2213                 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
2214                 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
2215                 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
2216                 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
2217                 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
2218                 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
2219                 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
2220                 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
2221 .else
2222                 vpaddd  ONEf(%rip), \CTR, \XMM1            # INCR CNT
2223                 vpaddd  ONEf(%rip), \XMM1, \XMM2
2224                 vpaddd  ONEf(%rip), \XMM2, \XMM3
2225                 vpaddd  ONEf(%rip), \XMM3, \XMM4
2226                 vpaddd  ONEf(%rip), \XMM4, \XMM5
2227                 vpaddd  ONEf(%rip), \XMM5, \XMM6
2228                 vpaddd  ONEf(%rip), \XMM6, \XMM7
2229                 vpaddd  ONEf(%rip), \XMM7, \XMM8
2230                 vmovdqa \XMM8, \CTR
2231 .endif
2232 
2233 
2234         #######################################################################
2235 
2236                 vmovdqu (arg1), \T1
2237                 vpxor   \T1, \XMM1, \XMM1
2238                 vpxor   \T1, \XMM2, \XMM2
2239                 vpxor   \T1, \XMM3, \XMM3
2240                 vpxor   \T1, \XMM4, \XMM4
2241                 vpxor   \T1, \XMM5, \XMM5
2242                 vpxor   \T1, \XMM6, \XMM6
2243                 vpxor   \T1, \XMM7, \XMM7
2244                 vpxor   \T1, \XMM8, \XMM8
2245 
2246         #######################################################################
2247 
2248 
2249 
2250 
2251 
2252                 vmovdqu 16*1(arg1), \T1
2253                 vaesenc \T1, \XMM1, \XMM1
2254                 vaesenc \T1, \XMM2, \XMM2
2255                 vaesenc \T1, \XMM3, \XMM3
2256                 vaesenc \T1, \XMM4, \XMM4
2257                 vaesenc \T1, \XMM5, \XMM5
2258                 vaesenc \T1, \XMM6, \XMM6
2259                 vaesenc \T1, \XMM7, \XMM7
2260                 vaesenc \T1, \XMM8, \XMM8
2261 
2262                 vmovdqu 16*2(arg1), \T1
2263                 vaesenc \T1, \XMM1, \XMM1
2264                 vaesenc \T1, \XMM2, \XMM2
2265                 vaesenc \T1, \XMM3, \XMM3
2266                 vaesenc \T1, \XMM4, \XMM4
2267                 vaesenc \T1, \XMM5, \XMM5
2268                 vaesenc \T1, \XMM6, \XMM6
2269                 vaesenc \T1, \XMM7, \XMM7
2270                 vaesenc \T1, \XMM8, \XMM8
2271 
2272 
2273         #######################################################################
2274 
2275         vmovdqu         HashKey_8(arg2), \T5
2276         vpclmulqdq      $0x11, \T5, \T2, \T4              # T4 = a1*b1
2277         vpclmulqdq      $0x00, \T5, \T2, \T7              # T7 = a0*b0
2278         vpclmulqdq      $0x01, \T5, \T2, \T6              # T6 = a1*b0
2279         vpclmulqdq      $0x10, \T5, \T2, \T5              # T5 = a0*b1
2280         vpxor           \T5, \T6, \T6
2281 
2282                 vmovdqu 16*3(arg1), \T1
2283                 vaesenc \T1, \XMM1, \XMM1
2284                 vaesenc \T1, \XMM2, \XMM2
2285                 vaesenc \T1, \XMM3, \XMM3
2286                 vaesenc \T1, \XMM4, \XMM4
2287                 vaesenc \T1, \XMM5, \XMM5
2288                 vaesenc \T1, \XMM6, \XMM6
2289                 vaesenc \T1, \XMM7, \XMM7
2290                 vaesenc \T1, \XMM8, \XMM8
2291 
2292         vmovdqa         TMP2(%rsp), \T1
2293         vmovdqu         HashKey_7(arg2), \T5
2294         vpclmulqdq      $0x11, \T5, \T1, \T3
2295         vpxor           \T3, \T4, \T4
2296 
2297         vpclmulqdq      $0x00, \T5, \T1, \T3
2298         vpxor           \T3, \T7, \T7
2299 
2300         vpclmulqdq      $0x01, \T5, \T1, \T3
2301         vpxor           \T3, \T6, \T6
2302 
2303         vpclmulqdq      $0x10, \T5, \T1, \T3
2304         vpxor           \T3, \T6, \T6
2305 
2306                 vmovdqu 16*4(arg1), \T1
2307                 vaesenc \T1, \XMM1, \XMM1
2308                 vaesenc \T1, \XMM2, \XMM2
2309                 vaesenc \T1, \XMM3, \XMM3
2310                 vaesenc \T1, \XMM4, \XMM4
2311                 vaesenc \T1, \XMM5, \XMM5
2312                 vaesenc \T1, \XMM6, \XMM6
2313                 vaesenc \T1, \XMM7, \XMM7
2314                 vaesenc \T1, \XMM8, \XMM8
2315 
2316         #######################################################################
2317 
2318         vmovdqa         TMP3(%rsp), \T1
2319         vmovdqu         HashKey_6(arg2), \T5
2320         vpclmulqdq      $0x11, \T5, \T1, \T3
2321         vpxor           \T3, \T4, \T4
2322 
2323         vpclmulqdq      $0x00, \T5, \T1, \T3
2324         vpxor           \T3, \T7, \T7
2325 
2326         vpclmulqdq      $0x01, \T5, \T1, \T3
2327         vpxor           \T3, \T6, \T6
2328 
2329         vpclmulqdq      $0x10, \T5, \T1, \T3
2330         vpxor           \T3, \T6, \T6
2331 
2332                 vmovdqu 16*5(arg1), \T1
2333                 vaesenc \T1, \XMM1, \XMM1
2334                 vaesenc \T1, \XMM2, \XMM2
2335                 vaesenc \T1, \XMM3, \XMM3
2336                 vaesenc \T1, \XMM4, \XMM4
2337                 vaesenc \T1, \XMM5, \XMM5
2338                 vaesenc \T1, \XMM6, \XMM6
2339                 vaesenc \T1, \XMM7, \XMM7
2340                 vaesenc \T1, \XMM8, \XMM8
2341 
2342         vmovdqa         TMP4(%rsp), \T1
2343         vmovdqu         HashKey_5(arg2), \T5
2344         vpclmulqdq      $0x11, \T5, \T1, \T3
2345         vpxor           \T3, \T4, \T4
2346 
2347         vpclmulqdq      $0x00, \T5, \T1, \T3
2348         vpxor           \T3, \T7, \T7
2349 
2350         vpclmulqdq      $0x01, \T5, \T1, \T3
2351         vpxor           \T3, \T6, \T6
2352 
2353         vpclmulqdq      $0x10, \T5, \T1, \T3
2354         vpxor           \T3, \T6, \T6
2355 
2356                 vmovdqu 16*6(arg1), \T1
2357                 vaesenc \T1, \XMM1, \XMM1
2358                 vaesenc \T1, \XMM2, \XMM2
2359                 vaesenc \T1, \XMM3, \XMM3
2360                 vaesenc \T1, \XMM4, \XMM4
2361                 vaesenc \T1, \XMM5, \XMM5
2362                 vaesenc \T1, \XMM6, \XMM6
2363                 vaesenc \T1, \XMM7, \XMM7
2364                 vaesenc \T1, \XMM8, \XMM8
2365 
2366 
2367         vmovdqa         TMP5(%rsp), \T1
2368         vmovdqu         HashKey_4(arg2), \T5
2369         vpclmulqdq      $0x11, \T5, \T1, \T3
2370         vpxor           \T3, \T4, \T4
2371 
2372         vpclmulqdq      $0x00, \T5, \T1, \T3
2373         vpxor           \T3, \T7, \T7
2374 
2375         vpclmulqdq      $0x01, \T5, \T1, \T3
2376         vpxor           \T3, \T6, \T6
2377 
2378         vpclmulqdq      $0x10, \T5, \T1, \T3
2379         vpxor           \T3, \T6, \T6
2380 
2381                 vmovdqu 16*7(arg1), \T1
2382                 vaesenc \T1, \XMM1, \XMM1
2383                 vaesenc \T1, \XMM2, \XMM2
2384                 vaesenc \T1, \XMM3, \XMM3
2385                 vaesenc \T1, \XMM4, \XMM4
2386                 vaesenc \T1, \XMM5, \XMM5
2387                 vaesenc \T1, \XMM6, \XMM6
2388                 vaesenc \T1, \XMM7, \XMM7
2389                 vaesenc \T1, \XMM8, \XMM8
2390 
2391         vmovdqa         TMP6(%rsp), \T1
2392         vmovdqu         HashKey_3(arg2), \T5
2393         vpclmulqdq      $0x11, \T5, \T1, \T3
2394         vpxor           \T3, \T4, \T4
2395 
2396         vpclmulqdq      $0x00, \T5, \T1, \T3
2397         vpxor           \T3, \T7, \T7
2398 
2399         vpclmulqdq      $0x01, \T5, \T1, \T3
2400         vpxor           \T3, \T6, \T6
2401 
2402         vpclmulqdq      $0x10, \T5, \T1, \T3
2403         vpxor           \T3, \T6, \T6
2404 
2405                 vmovdqu 16*8(arg1), \T1
2406                 vaesenc \T1, \XMM1, \XMM1
2407                 vaesenc \T1, \XMM2, \XMM2
2408                 vaesenc \T1, \XMM3, \XMM3
2409                 vaesenc \T1, \XMM4, \XMM4
2410                 vaesenc \T1, \XMM5, \XMM5
2411                 vaesenc \T1, \XMM6, \XMM6
2412                 vaesenc \T1, \XMM7, \XMM7
2413                 vaesenc \T1, \XMM8, \XMM8
2414 
2415         vmovdqa         TMP7(%rsp), \T1
2416         vmovdqu         HashKey_2(arg2), \T5
2417         vpclmulqdq      $0x11, \T5, \T1, \T3
2418         vpxor           \T3, \T4, \T4
2419 
2420         vpclmulqdq      $0x00, \T5, \T1, \T3
2421         vpxor           \T3, \T7, \T7
2422 
2423         vpclmulqdq      $0x01, \T5, \T1, \T3
2424         vpxor           \T3, \T6, \T6
2425 
2426         vpclmulqdq      $0x10, \T5, \T1, \T3
2427         vpxor           \T3, \T6, \T6
2428 
2429 
2430         #######################################################################
2431 
2432                 vmovdqu 16*9(arg1), \T5
2433                 vaesenc \T5, \XMM1, \XMM1
2434                 vaesenc \T5, \XMM2, \XMM2
2435                 vaesenc \T5, \XMM3, \XMM3
2436                 vaesenc \T5, \XMM4, \XMM4
2437                 vaesenc \T5, \XMM5, \XMM5
2438                 vaesenc \T5, \XMM6, \XMM6
2439                 vaesenc \T5, \XMM7, \XMM7
2440                 vaesenc \T5, \XMM8, \XMM8
2441 
2442         vmovdqa         TMP8(%rsp), \T1
2443         vmovdqu         HashKey(arg2), \T5
2444 
2445         vpclmulqdq      $0x00, \T5, \T1, \T3
2446         vpxor           \T3, \T7, \T7
2447 
2448         vpclmulqdq      $0x01, \T5, \T1, \T3
2449         vpxor           \T3, \T6, \T6
2450 
2451         vpclmulqdq      $0x10, \T5, \T1, \T3
2452         vpxor           \T3, \T6, \T6
2453 
2454         vpclmulqdq      $0x11, \T5, \T1, \T3
2455         vpxor           \T3, \T4, \T1
2456 
2457 
2458                 vmovdqu 16*10(arg1), \T5
2459 
2460         i = 11
2461         setreg
2462 .rep (\REP-9)
2463         vaesenc \T5, \XMM1, \XMM1
2464         vaesenc \T5, \XMM2, \XMM2
2465         vaesenc \T5, \XMM3, \XMM3
2466         vaesenc \T5, \XMM4, \XMM4
2467         vaesenc \T5, \XMM5, \XMM5
2468         vaesenc \T5, \XMM6, \XMM6
2469         vaesenc \T5, \XMM7, \XMM7
2470         vaesenc \T5, \XMM8, \XMM8
2471 
2472         vmovdqu 16*i(arg1), \T5
2473         i = i + 1
2474         setreg
2475 .endr
2476 
2477 	i = 0
2478 	j = 1
2479 	setreg
2480 .rep 8
2481 		vpxor	16*i(arg4, %r11), \T5, \T2
2482                 .if \ENC_DEC == ENC
2483                 vaesenclast     \T2, reg_j, reg_j
2484                 .else
2485                 vaesenclast     \T2, reg_j, \T3
2486                 vmovdqu 16*i(arg4, %r11), reg_j
2487                 vmovdqu \T3, 16*i(arg3, %r11)
2488                 .endif
2489 	i = (i+1)
2490 	j = (j+1)
2491 	setreg
2492 .endr
2493 	#######################################################################
2494 
2495 
	vpslldq	$8, \T6, \T3				# shift-L T6 2 DWs (low half into T3)
	vpsrldq	$8, \T6, \T6				# shift-R T6 2 DWs
2498 	vpxor	\T3, \T7, \T7
2499 	vpxor	\T6, \T1, \T1				# accumulate the results in T1:T7
2500 
2501 
2502 
2503 	#######################################################################
2504 	#first phase of the reduction
2505 	vmovdqa         POLY2(%rip), \T3
2506 
2507 	vpclmulqdq	$0x01, \T7, \T3, \T2
	vpslldq		$8, \T2, \T2			# shift-L T2 2 DWs
2509 
2510 	vpxor		\T2, \T7, \T7			# first phase of the reduction complete
2511 	#######################################################################
2512                 .if \ENC_DEC == ENC
2513 		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
2514 		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
2515 		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
2516 		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
2517 		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
2518 		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
2519 		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
2520 		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
2521                 .endif
2522 
2523 	#######################################################################
2524 	#second phase of the reduction
2525 	vpclmulqdq	$0x00, \T7, \T3, \T2
	vpsrldq		$4, \T2, \T2			# shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2527 
2528 	vpclmulqdq	$0x10, \T7, \T3, \T4
	vpslldq		$4, \T4, \T4			# shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2530 
2531 	vpxor		\T2, \T4, \T4			# second phase of the reduction complete
2532 	#######################################################################
2533 	vpxor		\T4, \T1, \T1			# the result is in T1
2534 
2535 		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
2536 		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
2537 		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
2538 		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
2539 		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
2540 		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
2541 		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
2542 		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
2543 
2544 
2545 	vpxor	\T1, \XMM1, \XMM1
2546 
2547 
2548 
2549 .endm
2550 
2551 
# GHASH the last 8 ciphertext blocks.
2553 .macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2554 
2555         ## Karatsuba Method
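        ## Same per-block Karatsuba scheme as the gen2 GHASH_LAST_8_AVX above,
        ## except that the XOR of the two key halves is formed on the fly with
        ## a VPSHUFD + VPXOR on each HashKey_i instead of loading a
        ## precomputed HashKey_i_k value.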
2556 
2557         vmovdqu         HashKey_8(arg2), \T5
2558 
2559         vpshufd         $0b01001110, \XMM1, \T2
2560         vpshufd         $0b01001110, \T5, \T3
2561         vpxor           \XMM1, \T2, \T2
2562         vpxor           \T5, \T3, \T3
2563 
2564         vpclmulqdq      $0x11, \T5, \XMM1, \T6
2565         vpclmulqdq      $0x00, \T5, \XMM1, \T7
2566 
2567         vpclmulqdq      $0x00, \T3, \T2, \XMM1
2568 
2569         ######################
2570 
2571         vmovdqu         HashKey_7(arg2), \T5
2572         vpshufd         $0b01001110, \XMM2, \T2
2573         vpshufd         $0b01001110, \T5, \T3
2574         vpxor           \XMM2, \T2, \T2
2575         vpxor           \T5, \T3, \T3
2576 
2577         vpclmulqdq      $0x11, \T5, \XMM2, \T4
2578         vpxor           \T4, \T6, \T6
2579 
2580         vpclmulqdq      $0x00, \T5, \XMM2, \T4
2581         vpxor           \T4, \T7, \T7
2582 
2583         vpclmulqdq      $0x00, \T3, \T2, \T2
2584 
2585         vpxor           \T2, \XMM1, \XMM1
2586 
2587         ######################
2588 
2589         vmovdqu         HashKey_6(arg2), \T5
2590         vpshufd         $0b01001110, \XMM3, \T2
2591         vpshufd         $0b01001110, \T5, \T3
2592         vpxor           \XMM3, \T2, \T2
2593         vpxor           \T5, \T3, \T3
2594 
2595         vpclmulqdq      $0x11, \T5, \XMM3, \T4
2596         vpxor           \T4, \T6, \T6
2597 
2598         vpclmulqdq      $0x00, \T5, \XMM3, \T4
2599         vpxor           \T4, \T7, \T7
2600 
2601         vpclmulqdq      $0x00, \T3, \T2, \T2
2602 
2603         vpxor           \T2, \XMM1, \XMM1
2604 
2605         ######################
2606 
2607         vmovdqu         HashKey_5(arg2), \T5
2608         vpshufd         $0b01001110, \XMM4, \T2
2609         vpshufd         $0b01001110, \T5, \T3
2610         vpxor           \XMM4, \T2, \T2
2611         vpxor           \T5, \T3, \T3
2612 
2613         vpclmulqdq      $0x11, \T5, \XMM4, \T4
2614         vpxor           \T4, \T6, \T6
2615 
2616         vpclmulqdq      $0x00, \T5, \XMM4, \T4
2617         vpxor           \T4, \T7, \T7
2618 
2619         vpclmulqdq      $0x00, \T3, \T2, \T2
2620 
2621         vpxor           \T2, \XMM1, \XMM1
2622 
2623         ######################
2624 
2625         vmovdqu         HashKey_4(arg2), \T5
2626         vpshufd         $0b01001110, \XMM5, \T2
2627         vpshufd         $0b01001110, \T5, \T3
2628         vpxor           \XMM5, \T2, \T2
2629         vpxor           \T5, \T3, \T3
2630 
2631         vpclmulqdq      $0x11, \T5, \XMM5, \T4
2632         vpxor           \T4, \T6, \T6
2633 
2634         vpclmulqdq      $0x00, \T5, \XMM5, \T4
2635         vpxor           \T4, \T7, \T7
2636 
2637         vpclmulqdq      $0x00, \T3, \T2, \T2
2638 
2639         vpxor           \T2, \XMM1, \XMM1
2640 
2641         ######################
2642 
2643         vmovdqu         HashKey_3(arg2), \T5
2644         vpshufd         $0b01001110, \XMM6, \T2
2645         vpshufd         $0b01001110, \T5, \T3
2646         vpxor           \XMM6, \T2, \T2
2647         vpxor           \T5, \T3, \T3
2648 
2649         vpclmulqdq      $0x11, \T5, \XMM6, \T4
2650         vpxor           \T4, \T6, \T6
2651 
2652         vpclmulqdq      $0x00, \T5, \XMM6, \T4
2653         vpxor           \T4, \T7, \T7
2654 
2655         vpclmulqdq      $0x00, \T3, \T2, \T2
2656 
2657         vpxor           \T2, \XMM1, \XMM1
2658 
2659         ######################
2660 
2661         vmovdqu         HashKey_2(arg2), \T5
2662         vpshufd         $0b01001110, \XMM7, \T2
2663         vpshufd         $0b01001110, \T5, \T3
2664         vpxor           \XMM7, \T2, \T2
2665         vpxor           \T5, \T3, \T3
2666 
2667         vpclmulqdq      $0x11, \T5, \XMM7, \T4
2668         vpxor           \T4, \T6, \T6
2669 
2670         vpclmulqdq      $0x00, \T5, \XMM7, \T4
2671         vpxor           \T4, \T7, \T7
2672 
2673         vpclmulqdq      $0x00, \T3, \T2, \T2
2674 
2675         vpxor           \T2, \XMM1, \XMM1
2676 
2677         ######################
2678 
2679         vmovdqu         HashKey(arg2), \T5
2680         vpshufd         $0b01001110, \XMM8, \T2
2681         vpshufd         $0b01001110, \T5, \T3
2682         vpxor           \XMM8, \T2, \T2
2683         vpxor           \T5, \T3, \T3
2684 
2685         vpclmulqdq      $0x11, \T5, \XMM8, \T4
2686         vpxor           \T4, \T6, \T6
2687 
2688         vpclmulqdq      $0x00, \T5, \XMM8, \T4
2689         vpxor           \T4, \T7, \T7
2690 
2691         vpclmulqdq      $0x00, \T3, \T2, \T2
2692 
2693         vpxor           \T2, \XMM1, \XMM1
2694         vpxor           \T6, \XMM1, \XMM1
2695         vpxor           \T7, \XMM1, \T2
2696 
2697 
2698 
2699 
2700         vpslldq $8, \T2, \T4
2701         vpsrldq $8, \T2, \T2
2702 
2703         vpxor   \T4, \T7, \T7
2704         vpxor   \T2, \T6, \T6                      # <T6:T7> holds the result of the
2705 						   # accumulated carry-less multiplications
2706 
2707         #######################################################################
2708         #first phase of the reduction
2709         vmovdqa         POLY2(%rip), \T3
2710 
2711         vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq         $8, \T2, \T2               # shift-L T2 2 DWs
2713 
2714         vpxor           \T2, \T7, \T7              # first phase of the reduction complete
2715         #######################################################################
2716 
2717 
2718         #second phase of the reduction
2719         vpclmulqdq      $0x00, \T7, \T3, \T2
2720         vpsrldq         $4, \T2, \T2               # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2721 
2722         vpclmulqdq      $0x10, \T7, \T3, \T4
2723         vpslldq         $4, \T4, \T4               # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2724 
2725         vpxor           \T2, \T4, \T4              # second phase of the reduction complete
2726         #######################################################################
2727         vpxor           \T4, \T6, \T6              # the result is in T6
2728 .endm
2729 
2730 
2731 
2732 #############################################################
2733 #void   aesni_gcm_init_avx_gen4
2734 #        (gcm_data     *my_ctx_data,
2735 #         gcm_context_data *data,
2736 #        u8      *iv, /* Pre-counter block j0: 4 byte salt
2737 #			(from Security Association) concatenated with 8 byte
2738 #			Initialisation Vector (from IPSec ESP Payload)
2739 #			concatenated with 0x00000001. 16-byte aligned pointer. */
#        u8     *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
2741 #        const   u8 *aad, /* Additional Authentication Data (AAD)*/
2742 #        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2743 #############################################################
2744 SYM_FUNC_START(aesni_gcm_init_avx_gen4)
2745         FUNC_SAVE
2746         INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
2747         FUNC_RESTORE
2748         RET
2749 SYM_FUNC_END(aesni_gcm_init_avx_gen4)
2750 
2751 ###############################################################################
#void   aesni_gcm_enc_update_avx_gen4(
2753 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2754 #        gcm_context_data *data,
2755 #        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
2756 #        const   u8 *in, /* Plaintext input */
2757 #        u64     plaintext_len) /* Length of data in Bytes for encryption. */
2758 ###############################################################################
2759 SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
2760         FUNC_SAVE
2761         mov     keysize,%eax
2762         cmp     $32, %eax
2763         je      key_256_enc_update4
2764         cmp     $16, %eax
2765         je      key_128_enc_update4
2766         # must be 192
2767         GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
2768         FUNC_RESTORE
2769 	RET
2770 key_128_enc_update4:
2771         GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
2772         FUNC_RESTORE
2773 	RET
2774 key_256_enc_update4:
2775         GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
2776         FUNC_RESTORE
2777 	RET
2778 SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
2779 
2780 ###############################################################################
2781 #void   aesni_gcm_dec_update_avx_gen4(
2782 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2783 #        gcm_context_data *data,
2784 #        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
2785 #        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
2787 ###############################################################################
2788 SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
2789         FUNC_SAVE
2790         mov     keysize,%eax
2791         cmp     $32, %eax
2792         je      key_256_dec_update4
2793         cmp     $16, %eax
2794         je      key_128_dec_update4
2795         # must be 192
2796         GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
2797         FUNC_RESTORE
2798         RET
2799 key_128_dec_update4:
2800         GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
2801         FUNC_RESTORE
2802         RET
2803 key_256_dec_update4:
2804         GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
2805         FUNC_RESTORE
2806         RET
2807 SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
2808 
2809 ###############################################################################
2810 #void   aesni_gcm_finalize_avx_gen4(
2811 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2812 #        gcm_context_data *data,
2813 #        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
2815 #                              Valid values are 16 (most likely), 12 or 8. */
2816 ###############################################################################
2817 SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
2818         FUNC_SAVE
2819         mov	keysize,%eax
2820         cmp     $32, %eax
2821         je      key_256_finalize4
2822         cmp     $16, %eax
2823         je      key_128_finalize4
2824         # must be 192
2825         GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
2826         FUNC_RESTORE
2827         RET
2828 key_128_finalize4:
2829         GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
2830         FUNC_RESTORE
2831         RET
2832 key_256_finalize4:
2833         GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
2834         FUNC_RESTORE
2835         RET
2836 SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
2837