1 .text
2 
3 
4 .align	7	// totally strategic alignment
5 _vpaes_consts:
6 Lk_mc_forward:	//	mc_forward
7 .quad	0x0407060500030201, 0x0C0F0E0D080B0A09
8 .quad	0x080B0A0904070605, 0x000302010C0F0E0D
9 .quad	0x0C0F0E0D080B0A09, 0x0407060500030201
10 .quad	0x000302010C0F0E0D, 0x080B0A0904070605
11 Lk_mc_backward:	//	mc_backward
12 .quad	0x0605040702010003, 0x0E0D0C0F0A09080B
13 .quad	0x020100030E0D0C0F, 0x0A09080B06050407
14 .quad	0x0E0D0C0F0A09080B, 0x0605040702010003
15 .quad	0x0A09080B06050407, 0x020100030E0D0C0F
16 Lk_sr:	//	sr
17 .quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
18 .quad	0x030E09040F0A0500, 0x0B06010C07020D08
19 .quad	0x0F060D040B020900, 0x070E050C030A0108
20 .quad	0x0B0E0104070A0D00, 0x0306090C0F020508
21 
22 //
23 // "Hot" constants
24 //
25 Lk_inv:	//	inv, inva
26 .quad	0x0E05060F0D080180, 0x040703090A0B0C02
27 .quad	0x01040A060F0B0780, 0x030D0E0C02050809
28 Lk_ipt:	//	input transform (lo, hi)
29 .quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
30 .quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
31 Lk_sbo:	//	sbou, sbot
32 .quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
33 .quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
34 Lk_sb1:	//	sb1u, sb1t
35 .quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
36 .quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
37 Lk_sb2:	//	sb2u, sb2t
38 .quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
39 .quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
40 
41 //
42 //  Decryption stuff
43 //
44 Lk_dipt:	//	decryption input transform
45 .quad	0x0F505B040B545F00, 0x154A411E114E451A
46 .quad	0x86E383E660056500, 0x12771772F491F194
47 Lk_dsbo:	//	decryption sbox final output
48 .quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
49 .quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
50 Lk_dsb9:	//	decryption sbox output *9*u, *9*t
51 .quad	0x851C03539A86D600, 0xCAD51F504F994CC9
52 .quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
53 Lk_dsbd:	//	decryption sbox output *D*u, *D*t
54 .quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
55 .quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
56 Lk_dsbb:	//	decryption sbox output *B*u, *B*t
57 .quad	0xD022649296B44200, 0x602646F6B0F2D404
58 .quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
59 Lk_dsbe:	//	decryption sbox output *E*u, *E*t
60 .quad	0x46F2929626D4D000, 0x2242600464B4F6B0
61 .quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
62 
63 //
64 //  Key schedule constants
65 //
66 Lk_dksd:	//	decryption key schedule: invskew x*D
67 .quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
68 .quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
69 Lk_dksb:	//	decryption key schedule: invskew x*B
70 .quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
71 .quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
72 Lk_dkse:	//	decryption key schedule: invskew x*E + 0x63
73 .quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
74 .quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
75 Lk_dks9:	//	decryption key schedule: invskew x*9
76 .quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
77 .quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
78 
79 Lk_rcon:	//	rcon
80 .quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
81 
82 Lk_opt:	//	output transform
83 .quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
84 .quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
85 Lk_deskew:	//	deskew tables: inverts the sbox's "skew"
86 .quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
87 .quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
88 
89 .byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
90 .align	2
91 
92 .align	6
93 //
//  _vpaes_encrypt_preheat
//
//  Fills x10 with the address of the constant pool (keeps the code
//  position-independent) and v17-v27 with the encryption constants
//  consumed by _vpaes_encrypt_core below.
98 //
99 
100 .align	4
101 _vpaes_encrypt_preheat:
102 	adr	x10, Lk_inv
103 	movi	v17.16b, #0x0f
104 	ld1	{v18.2d,v19.2d}, [x10],#32	// Lk_inv
105 	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64	// Lk_ipt, Lk_sbo
106 	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10]		// Lk_sb1, Lk_sb2
107 	ret
108 
109 
110 //
111 //  _aes_encrypt_core
112 //
113 //  AES-encrypt %xmm0.
114 //
115 //  Inputs:
116 //     %xmm0 = input
117 //     %xmm9-%xmm15 as in _vpaes_preheat
118 //    (%rdx) = scheduled keys
119 //
120 //  Output in %xmm0
121 //  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
122 //  Preserves %xmm6 - %xmm8 so you get some local vectors
123 //
124 //
125 
126 .align	4
127 _vpaes_encrypt_core:
128 	mov	x9, x2
129 	ldr	w8, [x2,#240]			// pull rounds
130 	adr	x11, Lk_mc_forward+16
131 						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
132 	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
133 	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
134 	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
135 	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
136 						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
137 	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
138 	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
139 	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
140 	b	Lenc_entry
141 
142 .align	4
143 Lenc_loop:
144 	// middle of middle round
145 	add	x10, x11, #0x40
146 	tbl	v4.16b, {v25.16b}, v2.16b		// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
147 	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# Lk_mc_forward[]
148 	tbl	v0.16b, {v24.16b}, v3.16b		// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
149 	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
150 	tbl	v5.16b,	{v27.16b}, v2.16b		// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
151 	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
152 	tbl	v2.16b, {v26.16b}, v3.16b		// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
153 	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# Lk_mc_backward[]
154 	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
155 	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
156 	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
157 	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
158 	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
159 	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
160 	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
161 	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
162 	sub	w8, w8, #1			// nr--
163 
164 Lenc_entry:
165 	// top of round
166 	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
167 	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
168 	tbl	v5.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
169 	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
170 	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
171 	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
172 	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
173 	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
174 	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
175 	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
176 	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
177 	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
178 	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
179 	cbnz	w8, Lenc_loop
180 
181 	// middle of last round
182 	add	x10, x11, #0x80
183 						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
184 						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
185 	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
186 	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# Lk_sr[]
187 	tbl	v0.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
188 	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
189 	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
190 	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
191 	ret
192 
193 
194 .globl	_vpaes_encrypt
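// Single-block encryption: x0 = input block, x1 = output block,
// x2 = key schedule produced by vpaes_set_encrypt_key.  The C-level
// prototype is presumably the usual OpenSSL-style one (declared in the
// C sources, not in this file):
//   void vpaes_encrypt(const unsigned char *in, unsigned char *out,
//                      const AES_KEY *key);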
195 
196 .align	4
197 _vpaes_encrypt:
198 .long	0xd503233f			// paciasp
199 	stp	x29,x30,[sp,#-16]!
200 	add	x29,sp,#0
201 
202 	ld1	{v7.16b}, [x0]
203 	bl	_vpaes_encrypt_preheat
204 	bl	_vpaes_encrypt_core
205 	st1	{v0.16b}, [x1]
206 
207 	ldp	x29,x30,[sp],#16
208 .long	0xd50323bf			// autiasp
209 	ret
210 
211 
212 
213 .align	4
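// _vpaes_encrypt_2x: encrypts two blocks at once.
// v14-v15 input, v0-v1 output (clobbers v8-v13 on top of the registers
// used by the 1x core).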
214 _vpaes_encrypt_2x:
215 	mov	x9, x2
216 	ldr	w8, [x2,#240]			// pull rounds
217 	adr	x11, Lk_mc_forward+16
218 						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
219 	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
220 	and	v1.16b,  v14.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
221 	ushr	v0.16b,  v14.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0
222 	and	v9.16b,  v15.16b,  v17.16b
223 	ushr	v8.16b,  v15.16b,  #4
224 	tbl	v1.16b,  {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
225 	tbl	v9.16b,  {v20.16b}, v9.16b
226 						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
227 	tbl	v2.16b,  {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
228 	tbl	v10.16b, {v21.16b}, v8.16b
229 	eor	v0.16b,  v1.16b,   v16.16b	// vpxor	%xmm5,	%xmm1,	%xmm0
230 	eor	v8.16b,  v9.16b,   v16.16b
231 	eor	v0.16b,  v0.16b,   v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
232 	eor	v8.16b,  v8.16b,   v10.16b
233 	b	Lenc_2x_entry
234 
235 .align	4
236 Lenc_2x_loop:
237 	// middle of middle round
238 	add	x10, x11, #0x40
239 	tbl	v4.16b,  {v25.16b}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
240 	tbl	v12.16b, {v25.16b}, v10.16b
241 	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# Lk_mc_forward[]
242 	tbl	v0.16b,  {v24.16b}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
243 	tbl	v8.16b,  {v24.16b}, v11.16b
244 	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
245 	eor	v12.16b, v12.16b, v16.16b
246 	tbl	v5.16b,	 {v27.16b}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
247 	tbl	v13.16b, {v27.16b}, v10.16b
248 	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
249 	eor	v8.16b,  v8.16b,  v12.16b
250 	tbl	v2.16b,  {v26.16b}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
251 	tbl	v10.16b, {v26.16b}, v11.16b
252 	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# Lk_mc_backward[]
253 	tbl	v3.16b,  {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
254 	tbl	v11.16b, {v8.16b}, v1.16b
255 	eor	v2.16b,  v2.16b,  v5.16b	// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
256 	eor	v10.16b, v10.16b, v13.16b
257 	tbl	v0.16b,  {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
258 	tbl	v8.16b,  {v8.16b}, v4.16b
259 	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
260 	eor	v11.16b, v11.16b, v10.16b
261 	tbl	v4.16b,  {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
262 	tbl	v12.16b, {v11.16b},v1.16b
263 	eor	v0.16b,  v0.16b,  v3.16b	// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
264 	eor	v8.16b,  v8.16b,  v11.16b
265 	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
266 	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
267 	eor	v8.16b,  v8.16b,  v12.16b
268 	sub	w8, w8, #1			// nr--
269 
270 Lenc_2x_entry:
271 	// top of round
272 	and	v1.16b,  v0.16b, v17.16b	// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
273 	ushr	v0.16b,  v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
274 	and	v9.16b,  v8.16b, v17.16b
275 	ushr	v8.16b,  v8.16b, #4
276 	tbl	v5.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
277 	tbl	v13.16b, {v19.16b},v9.16b
278 	eor	v1.16b,  v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
279 	eor	v9.16b,  v9.16b,  v8.16b
280 	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
281 	tbl	v11.16b, {v18.16b},v8.16b
282 	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
283 	tbl	v12.16b, {v18.16b},v9.16b
284 	eor	v3.16b,  v3.16b,  v5.16b	// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
285 	eor	v11.16b, v11.16b, v13.16b
286 	eor	v4.16b,  v4.16b,  v5.16b	// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
287 	eor	v12.16b, v12.16b, v13.16b
288 	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
289 	tbl	v10.16b, {v18.16b},v11.16b
290 	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
291 	tbl	v11.16b, {v18.16b},v12.16b
292 	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
293 	eor	v10.16b, v10.16b, v9.16b
294 	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
295 	eor	v11.16b, v11.16b, v8.16b
296 	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
297 	cbnz	w8, Lenc_2x_loop
298 
299 	// middle of last round
300 	add	x10, x11, #0x80
301 						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
302 						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
303 	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
304 	tbl	v12.16b, {v22.16b}, v10.16b
305 	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# Lk_sr[]
306 	tbl	v0.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
307 	tbl	v8.16b,  {v23.16b}, v11.16b
308 	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
309 	eor	v12.16b, v12.16b, v16.16b
310 	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
311 	eor	v8.16b,  v8.16b,  v12.16b
312 	tbl	v0.16b,  {v0.16b},v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
313 	tbl	v1.16b,  {v8.16b},v1.16b
314 	ret
315 
316 
317 
318 .align	4
319 _vpaes_decrypt_preheat:
320 	adr	x10, Lk_inv
321 	movi	v17.16b, #0x0f
322 	adr	x11, Lk_dipt
323 	ld1	{v18.2d,v19.2d}, [x10],#32	// Lk_inv
324 	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64	// Lk_dipt, Lk_dsbo
325 	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64	// Lk_dsb9, Lk_dsbd
326 	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x11]		// Lk_dsbb, Lk_dsbe
327 	ret
328 
329 
330 //
331 //  Decryption core
332 //
333 //  Same API as encryption core.
334 //
335 
336 .align	4
337 _vpaes_decrypt_core:
338 	mov	x9, x2
339 	ldr	w8, [x2,#240]			// pull rounds
340 
341 						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
342 	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
343 	eor	x11, x11, #0x30			// xor		$0x30,	%r11
344 	adr	x10, Lk_sr
345 	and	x11, x11, #0x30			// and		$0x30,	%r11
346 	add	x11, x11, x10
347 	adr	x10, Lk_mc_forward+48
348 
349 	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
350 	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
351 	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
352 	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
353 	ld1	{v5.2d}, [x10]			// vmovdqa	Lk_mc_forward+48(%rip), %xmm5
354 						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
355 	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
356 	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4,	%xmm2,	%xmm2
357 	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
358 	b	Ldec_entry
359 
360 .align	4
361 Ldec_loop:
362 //
363 //  Inverse mix columns
364 //
365 						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
366 						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
367 	tbl	v4.16b, {v24.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
368 	tbl	v1.16b, {v25.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
369 	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4,	%xmm0,	%xmm0
370 						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
371 	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
372 						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
373 
374 	tbl	v4.16b, {v26.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
375 	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
376 	tbl	v1.16b, {v27.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
377 	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
378 						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
379 	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
380 						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
381 
382 	tbl	v4.16b, {v28.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
383 	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
384 	tbl	v1.16b, {v29.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
385 	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
386 						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
387 	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
388 						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
389 
390 	tbl	v4.16b, {v30.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
391 	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
392 	tbl	v1.16b, {v31.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
393 	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
394 	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
395 	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
396 	sub	w8, w8, #1			// sub		$1,%rax			# nr--
397 
398 Ldec_entry:
399 	// top of round
400 	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
401 	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
402 	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
403 	eor	v1.16b,	v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
404 	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
405 	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
406 	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
407 	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
408 	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
409 	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
410 	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
411 	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
412 	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
413 	cbnz	w8, Ldec_loop
414 
415 	// middle of last round
416 						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
417 	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
418 						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
419 	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# Lk_sr-Lk_dsbd=-0x160
420 	tbl	v1.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
421 	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
422 	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
423 	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
424 	ret
425 
426 
427 .globl	_vpaes_decrypt
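// Single-block decryption: same register convention as vpaes_encrypt
// above (x0 = in, x1 = out, x2 = key schedule, here one produced by
// vpaes_set_decrypt_key).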
428 
429 .align	4
430 _vpaes_decrypt:
431 .long	0xd503233f			// paciasp
432 	stp	x29,x30,[sp,#-16]!
433 	add	x29,sp,#0
434 
435 	ld1	{v7.16b}, [x0]
436 	bl	_vpaes_decrypt_preheat
437 	bl	_vpaes_decrypt_core
438 	st1	{v0.16b}, [x1]
439 
440 	ldp	x29,x30,[sp],#16
441 .long	0xd50323bf			// autiasp
442 	ret
443 
444 
445 // v14-v15 input, v0-v1 output
446 
447 .align	4
448 _vpaes_decrypt_2x:
449 	mov	x9, x2
450 	ldr	w8, [x2,#240]			// pull rounds
451 
452 						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
453 	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
454 	eor	x11, x11, #0x30			// xor		$0x30,	%r11
455 	adr	x10, Lk_sr
456 	and	x11, x11, #0x30			// and		$0x30,	%r11
457 	add	x11, x11, x10
458 	adr	x10, Lk_mc_forward+48
459 
460 	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
461 	and	v1.16b,  v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
462 	ushr	v0.16b,  v14.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
463 	and	v9.16b,  v15.16b, v17.16b
464 	ushr	v8.16b,  v15.16b, #4
465 	tbl	v2.16b,  {v20.16b},v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
466 	tbl	v10.16b, {v20.16b},v9.16b
467 	ld1	{v5.2d}, [x10]			// vmovdqa	Lk_mc_forward+48(%rip), %xmm5
468 						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
469 	tbl	v0.16b,  {v21.16b},v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
470 	tbl	v8.16b,  {v21.16b},v8.16b
471 	eor	v2.16b,  v2.16b,  v16.16b	// vpxor	%xmm4,	%xmm2,	%xmm2
472 	eor	v10.16b, v10.16b, v16.16b
473 	eor	v0.16b,  v0.16b,  v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
474 	eor	v8.16b,  v8.16b,  v10.16b
475 	b	Ldec_2x_entry
476 
477 .align	4
478 Ldec_2x_loop:
479 //
480 //  Inverse mix columns
481 //
482 						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
483 						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
484 	tbl	v4.16b,  {v24.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
485 	tbl	v12.16b, {v24.16b}, v10.16b
486 	tbl	v1.16b,  {v25.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
487 	tbl	v9.16b,  {v25.16b}, v11.16b
488 	eor	v0.16b,  v4.16b,  v16.16b	// vpxor	%xmm4,	%xmm0,	%xmm0
489 	eor	v8.16b,  v12.16b, v16.16b
490 						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
491 	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
492 	eor	v8.16b,  v8.16b,  v9.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
493 						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
494 
495 	tbl	v4.16b,  {v26.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
496 	tbl	v12.16b, {v26.16b}, v10.16b
497 	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
498 	tbl	v8.16b,  {v8.16b},v5.16b
499 	tbl	v1.16b,  {v27.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
500 	tbl	v9.16b,  {v27.16b}, v11.16b
501 	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
502 	eor	v8.16b,  v8.16b,  v12.16b
503 						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
504 	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
505 	eor	v8.16b,  v8.16b,  v9.16b
506 						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
507 
508 	tbl	v4.16b,  {v28.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
509 	tbl	v12.16b, {v28.16b}, v10.16b
510 	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
511 	tbl	v8.16b,  {v8.16b},v5.16b
512 	tbl	v1.16b,  {v29.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
513 	tbl	v9.16b,  {v29.16b}, v11.16b
514 	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
515 	eor	v8.16b,  v8.16b,  v12.16b
516 						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
517 	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
518 	eor	v8.16b,  v8.16b,  v9.16b
519 						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
520 
521 	tbl	v4.16b,  {v30.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
522 	tbl	v12.16b, {v30.16b}, v10.16b
523 	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
524 	tbl	v8.16b,  {v8.16b},v5.16b
525 	tbl	v1.16b,  {v31.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
526 	tbl	v9.16b,  {v31.16b}, v11.16b
527 	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
528 	eor	v8.16b,  v8.16b,  v12.16b
529 	ext	v5.16b,  v5.16b,  v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
530 	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
531 	eor	v8.16b,  v8.16b,  v9.16b
532 	sub	w8, w8, #1			// sub		$1,%rax			# nr--
533 
534 Ldec_2x_entry:
535 	// top of round
536 	and	v1.16b,  v0.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
537 	ushr	v0.16b,  v0.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
538 	and	v9.16b,  v8.16b,  v17.16b
539 	ushr	v8.16b,  v8.16b,  #4
540 	tbl	v2.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
541 	tbl	v10.16b, {v19.16b},v9.16b
542 	eor	v1.16b,	 v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
543 	eor	v9.16b,	 v9.16b,  v8.16b
544 	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
545 	tbl	v11.16b, {v18.16b},v8.16b
546 	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
547 	tbl	v12.16b, {v18.16b},v9.16b
548 	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
549 	eor	v11.16b, v11.16b, v10.16b
550 	eor	v4.16b,  v4.16b,  v2.16b	// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
551 	eor	v12.16b, v12.16b, v10.16b
552 	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
553 	tbl	v10.16b, {v18.16b},v11.16b
554 	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
555 	tbl	v11.16b, {v18.16b},v12.16b
556 	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
557 	eor	v10.16b, v10.16b, v9.16b
558 	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
559 	eor	v11.16b, v11.16b, v8.16b
560 	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
561 	cbnz	w8, Ldec_2x_loop
562 
563 	// middle of last round
564 						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
565 	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
566 	tbl	v12.16b, {v22.16b}, v10.16b
567 						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
568 	tbl	v1.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
569 	tbl	v9.16b,  {v23.16b}, v11.16b
570 	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# Lk_sr-Lk_dsbd=-0x160
571 	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
572 	eor	v12.16b, v12.16b, v16.16b
573 	eor	v0.16b,  v1.16b,  v4.16b	// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
574 	eor	v8.16b,  v9.16b,  v12.16b
575 	tbl	v0.16b,  {v0.16b},v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
576 	tbl	v1.16b,  {v8.16b},v2.16b
577 	ret
578 
579 ////////////////////////////////////////////////////////
580 //                                                    //
581 //                  AES key schedule                  //
582 //                                                    //
583 ////////////////////////////////////////////////////////
584 
585 .align	4
586 _vpaes_key_preheat:
587 	adr	x10, Lk_inv
588 	movi	v16.16b, #0x5b			// Lk_s63
589 	adr	x11, Lk_sb1
590 	movi	v17.16b, #0x0f			// Lk_s0F
591 	ld1	{v18.2d,v19.2d,v20.2d,v21.2d}, [x10]		// Lk_inv, Lk_ipt
592 	adr	x10, Lk_dksd
593 	ld1	{v22.2d,v23.2d}, [x11]		// Lk_sb1
594 	adr	x11, Lk_mc_forward
595 	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64	// Lk_dksd, Lk_dksb
596 	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64	// Lk_dkse, Lk_dks9
597 	ld1	{v8.2d}, [x10]			// Lk_rcon
598 	ld1	{v9.2d}, [x11]			// Lk_mc_forward[0]
599 	ret
600 
601 
602 
603 .align	4
604 _vpaes_schedule_core:
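	// Common key-schedule driver, as set up by the entry points below:
	//   x0 = user key, w1 = key size in bits, x2 = output schedule
	//   (for decryption it points at the last round-key slot and the
	//   schedule is written downward), w3 = 0 for encryption / 1 for
	//   decryption, x8 = initial Lk_sr offset.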
605 .long	0xd503233f			// paciasp
606 	stp	x29, x30, [sp,#-16]!
607 	add	x29,sp,#0
608 
609 	bl	_vpaes_key_preheat		// load the tables
610 
611 	ld1	{v0.16b}, [x0],#16		// vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
612 
613 	// input transform
614 	mov	v3.16b, v0.16b			// vmovdqa	%xmm0,	%xmm3
615 	bl	_vpaes_schedule_transform
616 	mov	v7.16b, v0.16b			// vmovdqa	%xmm0,	%xmm7
617 
618 	adr	x10, Lk_sr			// lea	Lk_sr(%rip),%r10
619 	add	x8, x8, x10
620 	cbnz	w3, Lschedule_am_decrypting
621 
622 	// encrypting, output zeroth round key after transform
623 	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)
624 	b	Lschedule_go
625 
626 Lschedule_am_decrypting:
627 	// decrypting, output zeroth round key after shiftrows
628 	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
629 	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb  %xmm1,	%xmm3,	%xmm3
630 	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
631 	eor	x8, x8, #0x30			// xor	$0x30, %r8
632 
633 Lschedule_go:
634 	cmp	w1, #192			// cmp	$192,	%esi
635 	b.hi	Lschedule_256
636 	b.eq	Lschedule_192
	// 128: fall through
638 
639 //
640 //  .schedule_128
641 //
642 //  128-bit specific part of key schedule.
643 //
644 //  This schedule is really simple, because all its parts
645 //  are accomplished by the subroutines.
646 //
647 Lschedule_128:
648 	mov	x0, #10			// mov	$10, %esi
649 
650 Loop_schedule_128:
651 	sub	x0, x0, #1			// dec	%esi
652 	bl	_vpaes_schedule_round
653 	cbz	x0, Lschedule_mangle_last
654 	bl	_vpaes_schedule_mangle		// write output
655 	b	Loop_schedule_128
656 
657 //
658 //  .aes_schedule_192
659 //
660 //  192-bit specific part of key schedule.
661 //
662 //  The main body of this schedule is the same as the 128-bit
663 //  schedule, but with more smearing.  The long, high side is
//  stored in v7 as before, and the short, low side is in
//  the high bits of v6.
666 //
667 //  This schedule is somewhat nastier, however, because each
668 //  round produces 192 bits of key material, or 1.5 round keys.
669 //  Therefore, on each cycle we do 2 rounds and produce 3 round
670 //  keys.
671 //
672 .align	4
673 Lschedule_192:
674 	sub	x0, x0, #8
675 	ld1	{v0.16b}, [x0]		// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
676 	bl	_vpaes_schedule_transform	// input transform
677 	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save short part
678 	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
679 	ins	v6.d[0], v4.d[0]		// vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
680 	mov	x0, #4			// mov	$4,	%esi
681 
682 Loop_schedule_192:
683 	sub	x0, x0, #1			// dec	%esi
684 	bl	_vpaes_schedule_round
685 	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	$8,%xmm6,%xmm0,%xmm0
686 	bl	_vpaes_schedule_mangle		// save key n
687 	bl	_vpaes_schedule_192_smear
688 	bl	_vpaes_schedule_mangle		// save key n+1
689 	bl	_vpaes_schedule_round
690 	cbz	x0, Lschedule_mangle_last
691 	bl	_vpaes_schedule_mangle		// save key n+2
692 	bl	_vpaes_schedule_192_smear
693 	b	Loop_schedule_192
694 
695 //
696 //  .aes_schedule_256
697 //
698 //  256-bit specific part of key schedule.
699 //
700 //  The structure here is very similar to the 128-bit
701 //  schedule, but with an additional "low side" in
//  v6.  The low side's rounds are the same as the
703 //  high side's, except no rcon and no rotation.
704 //
705 .align	4
706 Lschedule_256:
707 	ld1	{v0.16b}, [x0]		// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
708 	bl	_vpaes_schedule_transform	// input transform
709 	mov	x0, #7			// mov	$7, %esi
710 
711 Loop_schedule_256:
712 	sub	x0, x0, #1			// dec	%esi
713 	bl	_vpaes_schedule_mangle		// output low result
714 	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
715 
716 	// high round
717 	bl	_vpaes_schedule_round
718 	cbz	x0, Lschedule_mangle_last
719 	bl	_vpaes_schedule_mangle
720 
	// low round. swap v7 and v6
722 	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
723 	movi	v4.16b, #0
724 	mov	v5.16b, v7.16b			// vmovdqa	%xmm7,	%xmm5
725 	mov	v7.16b, v6.16b			// vmovdqa	%xmm6,	%xmm7
726 	bl	_vpaes_schedule_low_round
727 	mov	v7.16b, v5.16b			// vmovdqa	%xmm5,	%xmm7
728 
729 	b	Loop_schedule_256
730 
731 //
732 //  .aes_schedule_mangle_last
733 //
734 //  Mangler for last round of key schedule
//  Mangles v0
//    when encrypting, outputs out(v0) ^ 0x63
//    when decrypting, outputs unskew(v0)
738 //
739 //  Always called right before return... jumps to cleanup and exits
740 //
741 .align	4
742 Lschedule_mangle_last:
	// schedule last round key from v0
744 	adr	x11, Lk_deskew			// lea	Lk_deskew(%rip),%r11	# prepare to deskew
745 	cbnz	w3, Lschedule_mangle_last_dec
746 
747 	// encrypting
748 	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),%xmm1
749 	adr	x11, Lk_opt			// lea	Lk_opt(%rip),	%r11		# prepare to output transform
750 	add	x2, x2, #32			// add	$32,	%rdx
751 	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute
752 
753 Lschedule_mangle_last_dec:
754 	ld1	{v20.2d,v21.2d}, [x11]		// reload constants
755 	sub	x2, x2, #16			// add	$-16,	%rdx
756 	eor	v0.16b, v0.16b, v16.16b		// vpxor	Lk_s63(%rip),	%xmm0,	%xmm0
757 	bl	_vpaes_schedule_transform	// output transform
758 	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)		# save last key
759 
760 	// cleanup
761 	eor	v0.16b, v0.16b, v0.16b		// vpxor	%xmm0,	%xmm0,	%xmm0
762 	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1,	%xmm1,	%xmm1
763 	eor	v2.16b, v2.16b, v2.16b		// vpxor	%xmm2,	%xmm2,	%xmm2
764 	eor	v3.16b, v3.16b, v3.16b		// vpxor	%xmm3,	%xmm3,	%xmm3
765 	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4,	%xmm4
766 	eor	v5.16b, v5.16b, v5.16b		// vpxor	%xmm5,	%xmm5,	%xmm5
767 	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6,	%xmm6,	%xmm6
768 	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7,	%xmm7,	%xmm7
769 	ldp	x29, x30, [sp],#16
770 .long	0xd50323bf			// autiasp
771 	ret
772 
773 
774 //
775 //  .aes_schedule_192_smear
776 //
777 //  Smear the short, low side in the 192-bit key schedule.
778 //
779 //  Inputs:
//    v7: high side, b  a  x  y
//    v6:  low side, d  c  0  0
//    (a zero vector is generated internally in v1)
//
//  Outputs:
//    v6: b+c+d  b+c  0  0
//    v0: b+c+d  b+c  b  a
787 //
788 
789 .align	4
790 _vpaes_schedule_192_smear:
791 	movi	v1.16b, #0
792 	dup	v0.4s, v7.s[3]
793 	ins	v1.s[3], v6.s[2]	// vpshufd	$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
794 	ins	v0.s[0], v7.s[2]	// vpshufd	$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
795 	eor	v6.16b, v6.16b, v1.16b	// vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
796 	eor	v1.16b, v1.16b, v1.16b	// vpxor	%xmm1,	%xmm1,	%xmm1
797 	eor	v6.16b, v6.16b, v0.16b	// vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
798 	mov	v0.16b, v6.16b		// vmovdqa	%xmm6,	%xmm0
799 	ins	v6.d[0], v1.d[0]	// vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
800 	ret
801 
802 
803 //
804 //  .aes_schedule_round
805 //
//  Runs one main round of the key schedule on v0, v7
//
//  Specifically, runs subbytes on the high dword of v0
//  then rotates it by one byte and xors into the low dword of
//  v7.
//
//  Adds rcon from the low byte of v8, then rotates v8 for the
//  next rcon.
//
//  Smears the dwords of v7 by xoring the low into the
//  second low, result into third, result into highest.
//
//  Returns results in v7 = v0.
//  Clobbers v1-v4.
820 //
821 
822 .align	4
823 _vpaes_schedule_round:
	// extract rcon from v8
825 	movi	v4.16b, #0			// vpxor	%xmm4,	%xmm4,	%xmm4
826 	ext	v1.16b, v8.16b, v4.16b, #15	// vpalignr	$15,	%xmm8,	%xmm4,	%xmm1
827 	ext	v8.16b, v8.16b, v8.16b, #15	// vpalignr	$15,	%xmm8,	%xmm8,	%xmm8
828 	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
829 
830 	// rotate
831 	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
832 	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	$1,	%xmm0,	%xmm0,	%xmm0
833 
834 	// fall through...
835 
836 	// low round: same as high round, but no rotation and no rcon.
837 _vpaes_schedule_low_round:
	// smear v7
839 	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	$4,	%xmm7,	%xmm1
840 	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
841 	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	$8,	%xmm7,	%xmm4
842 
843 	// subbytes
844 	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1		# 0 = k
845 	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0		# 1 = i
846 	eor	v7.16b, v7.16b, v4.16b		// vpxor	%xmm4,	%xmm7,	%xmm7
847 	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
848 	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1		# 0 = j
849 	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
850 	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
851 	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
852 	eor	v7.16b, v7.16b, v16.16b		// vpxor	Lk_s63(%rip),	%xmm7,	%xmm7
853 	tbl	v3.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
854 	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
855 	tbl	v2.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
856 	eor	v3.16b, v3.16b, v1.16b		// vpxor	%xmm1,	%xmm3,	%xmm3		# 2 = io
857 	eor	v2.16b, v2.16b, v0.16b		// vpxor	%xmm0,	%xmm2,	%xmm2		# 3 = jo
858 	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
859 	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
860 	eor	v1.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm1		# 0 = sbox output
861 
862 	// add in smeared stuff
863 	eor	v0.16b, v1.16b, v7.16b		// vpxor	%xmm7,	%xmm1,	%xmm0
864 	eor	v7.16b, v1.16b, v7.16b		// vmovdqa	%xmm0,	%xmm7
865 	ret
866 
867 
868 //
869 //  .aes_schedule_transform
870 //
//  Linear-transform v0 according to the lo/hi table pair in v20/v21
//
//  Requires that v17 = 0x0F0F... as in preheat
//  Output in v0
//  Clobbers v1, v2
876 //
877 
878 .align	4
879 _vpaes_schedule_transform:
880 	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
881 	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
882 						// vmovdqa	(%r11),	%xmm2 	# lo
883 	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
884 						// vmovdqa	16(%r11),	%xmm1 # hi
885 	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
886 	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
887 	ret
888 
889 
890 //
891 //  .aes_schedule_mangle
892 //
//  Mangle v0 from (basis-transformed) standard version
894 //  to our version.
895 //
896 //  On encrypt,
897 //    xor with 0x63
898 //    multiply by circulant 0,1,1,1
899 //    apply shiftrows transform
900 //
901 //  On decrypt,
902 //    xor with 0x63
903 //    multiply by "inverse mixcolumns" circulant E,B,D,9
904 //    deskew
905 //    apply shiftrows transform
906 //
907 //
//  Writes out to (x2), and increments or decrements it
//  Keeps track of round number mod 4 in x8
//  Preserves v0
//  Clobbers v1-v4
912 //
913 
914 .align	4
915 _vpaes_schedule_mangle:
916 	mov	v4.16b, v0.16b			// vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
917 						// vmovdqa	.Lk_mc_forward(%rip),%xmm5
918 	cbnz	w3, Lschedule_mangle_dec
919 
920 	// encrypting
921 	eor	v4.16b, v0.16b, v16.16b		// vpxor	Lk_s63(%rip),	%xmm0,	%xmm4
922 	add	x2, x2, #16			// add	$16,	%rdx
923 	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm4
924 	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm1
925 	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5,	%xmm1,	%xmm3
926 	eor	v4.16b, v4.16b, v1.16b		// vpxor	%xmm1,	%xmm4,	%xmm4
927 	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
928 	eor	v3.16b, v3.16b, v4.16b		// vpxor	%xmm4,	%xmm3,	%xmm3
929 
930 	b	Lschedule_mangle_both
931 .align	4
932 Lschedule_mangle_dec:
933 	// inverse mix columns
934 						// lea	.Lk_dksd(%rip),%r11
935 	ushr	v1.16b, v4.16b, #4		// vpsrlb	$4,	%xmm4,	%xmm1	# 1 = hi
936 	and	v4.16b, v4.16b, v17.16b		// vpand	%xmm9,	%xmm4,	%xmm4	# 4 = lo
937 
938 						// vmovdqa	0x00(%r11),	%xmm2
939 	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
940 						// vmovdqa	0x10(%r11),	%xmm3
941 	tbl	v3.16b,	{v25.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
942 	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
943 	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
944 
945 						// vmovdqa	0x20(%r11),	%xmm2
946 	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
947 	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
948 						// vmovdqa	0x30(%r11),	%xmm3
949 	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
950 	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
951 	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
952 
953 						// vmovdqa	0x40(%r11),	%xmm2
954 	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
955 	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
956 						// vmovdqa	0x50(%r11),	%xmm3
957 	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
958 	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
959 
960 						// vmovdqa	0x60(%r11),	%xmm2
961 	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
962 	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
963 						// vmovdqa	0x70(%r11),	%xmm4
964 	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb	%xmm1,	%xmm4,	%xmm4
965 	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
966 	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
967 	eor	v3.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm3
968 
969 	sub	x2, x2, #16			// add	$-16,	%rdx
970 
971 Lschedule_mangle_both:
972 	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
973 	add	x8, x8, #64-16			// add	$-16,	%r8
974 	and	x8, x8, #~(1<<6)		// and	$0x30,	%r8
975 	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
976 	ret
977 
978 
979 .globl	_vpaes_set_encrypt_key
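// Key expansion for encryption: x0 = user key, w1 = key length in
// bits, x2 = AES_KEY to fill.  The rounds field is stored as
// nbits/32 + 5, i.e. 9, 11 or 13 for 128-, 192- and 256-bit keys
// (the count the cores use for their main loop, one less than the
// textbook round count).  Presumed C prototype, in the usual
// OpenSSL style:
//   int vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
//                             AES_KEY *key);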
980 
981 .align	4
982 _vpaes_set_encrypt_key:
983 .long	0xd503233f		// paciasp
984 	stp	x29,x30,[sp,#-16]!
985 	add	x29,sp,#0
986 	stp	d8,d9,[sp,#-16]!	// ABI spec says so
987 
988 	lsr	w9, w1, #5		// shr	$5,%eax
989 	add	w9, w9, #5		// $5,%eax
990 	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
991 
992 	mov	w3, #0		// mov	$0,%ecx
993 	mov	x8, #0x30		// mov	$0x30,%r8d
994 	bl	_vpaes_schedule_core
995 	eor	x0, x0, x0
996 
997 	ldp	d8,d9,[sp],#16
998 	ldp	x29,x30,[sp],#16
999 .long	0xd50323bf		// autiasp
1000 	ret
1001 
1002 
1003 .globl	_vpaes_set_decrypt_key
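// Key expansion for decryption: same arguments as above.  x2 is
// advanced to the last round-key slot (16 + 16*rounds bytes in)
// because the decryption schedule is written back-to-front.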
1004 
1005 .align	4
1006 _vpaes_set_decrypt_key:
1007 .long	0xd503233f		// paciasp
1008 	stp	x29,x30,[sp,#-16]!
1009 	add	x29,sp,#0
1010 	stp	d8,d9,[sp,#-16]!	// ABI spec says so
1011 
1012 	lsr	w9, w1, #5		// shr	$5,%eax
1013 	add	w9, w9, #5		// $5,%eax
1014 	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
1015 	lsl	w9, w9, #4		// shl	$4,%eax
1016 	add	x2, x2, #16		// lea	16(%rdx,%rax),%rdx
1017 	add	x2, x2, x9
1018 
1019 	mov	w3, #1		// mov	$1,%ecx
1020 	lsr	w8, w1, #1		// shr	$1,%r8d
1021 	and	x8, x8, #32		// and	$32,%r8d
1022 	eor	x8, x8, #32		// xor	$32,%r8d	# nbits==192?0:32
1023 	bl	_vpaes_schedule_core
1024 
1025 	ldp	d8,d9,[sp],#16
1026 	ldp	x29,x30,[sp],#16
1027 .long	0xd50323bf		// autiasp
1028 	ret
1029 
1030 .globl	_vpaes_cbc_encrypt
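// CBC entry point: x0 = in, x1 = out, x2 = length in bytes (expected
// to be a multiple of 16; zero returns immediately), x3 = key
// schedule, x4 = ivec (updated on return), w5 = enc flag (zero takes
// the decrypt path below).  Presumed C prototype, in the usual
// OpenSSL style:
//   void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
//                          size_t length, const AES_KEY *key,
//                          unsigned char *ivec, int enc);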
1031 
1032 .align	4
1033 _vpaes_cbc_encrypt:
1034 	cbz	x2, Lcbc_abort
1035 	cmp	w5, #0			// check direction
1036 	b.eq	vpaes_cbc_decrypt
1037 
1038 .long	0xd503233f		// paciasp
1039 	stp	x29,x30,[sp,#-16]!
1040 	add	x29,sp,#0
1041 
1042 	mov	x17, x2		// reassign
1043 	mov	x2,  x3		// reassign
1044 
1045 	ld1	{v0.16b}, [x4]	// load ivec
1046 	bl	_vpaes_encrypt_preheat
1047 	b	Lcbc_enc_loop
1048 
1049 .align	4
1050 Lcbc_enc_loop:
1051 	ld1	{v7.16b}, [x0],#16	// load input
1052 	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
1053 	bl	_vpaes_encrypt_core
1054 	st1	{v0.16b}, [x1],#16	// save output
1055 	subs	x17, x17, #16
1056 	b.hi	Lcbc_enc_loop
1057 
1058 	st1	{v0.16b}, [x4]	// write ivec
1059 
1060 	ldp	x29,x30,[sp],#16
1061 .long	0xd50323bf		// autiasp
1062 Lcbc_abort:
1063 	ret
1064 
1065 
1066 
1067 .align	4
1068 vpaes_cbc_decrypt:
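	// Internal CBC-decrypt path: the running IV lives in v6; an odd
	// leading block (length & 16) goes through _vpaes_decrypt_core,
	// then the loop handles two blocks per iteration with
	// _vpaes_decrypt_2x.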
1069 .long	0xd503233f		// paciasp
1070 	stp	x29,x30,[sp,#-16]!
1071 	add	x29,sp,#0
1072 	stp	d8,d9,[sp,#-16]!	// ABI spec says so
1073 	stp	d10,d11,[sp,#-16]!
1074 	stp	d12,d13,[sp,#-16]!
1075 	stp	d14,d15,[sp,#-16]!
1076 
1077 	mov	x17, x2		// reassign
1078 	mov	x2,  x3		// reassign
1079 	ld1	{v6.16b}, [x4]	// load ivec
1080 	bl	_vpaes_decrypt_preheat
1081 	tst	x17, #16
1082 	b.eq	Lcbc_dec_loop2x
1083 
1084 	ld1	{v7.16b}, [x0], #16	// load input
1085 	bl	_vpaes_decrypt_core
1086 	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
1087 	orr	v6.16b, v7.16b, v7.16b	// next ivec value
1088 	st1	{v0.16b}, [x1], #16
1089 	subs	x17, x17, #16
1090 	b.ls	Lcbc_dec_done
1091 
1092 .align	4
1093 Lcbc_dec_loop2x:
1094 	ld1	{v14.16b,v15.16b}, [x0], #32
1095 	bl	_vpaes_decrypt_2x
1096 	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
1097 	eor	v1.16b, v1.16b, v14.16b
1098 	orr	v6.16b, v15.16b, v15.16b
1099 	st1	{v0.16b,v1.16b}, [x1], #32
1100 	subs	x17, x17, #32
1101 	b.hi	Lcbc_dec_loop2x
1102 
1103 Lcbc_dec_done:
1104 	st1	{v6.16b}, [x4]
1105 
1106 	ldp	d14,d15,[sp],#16
1107 	ldp	d12,d13,[sp],#16
1108 	ldp	d10,d11,[sp],#16
1109 	ldp	d8,d9,[sp],#16
1110 	ldp	x29,x30,[sp],#16
1111 .long	0xd50323bf		// autiasp
1112 	ret
1113 
1114 .globl	_vpaes_ecb_encrypt
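// ECB encryption of whole blocks: x0 = in, x1 = out, x2 = length in
// bytes, x3 = key schedule.  An odd leading block is handled with the
// 1x core, then two blocks per iteration with _vpaes_encrypt_2x.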
1115 
1116 .align	4
1117 _vpaes_ecb_encrypt:
1118 .long	0xd503233f		// paciasp
1119 	stp	x29,x30,[sp,#-16]!
1120 	add	x29,sp,#0
1121 	stp	d8,d9,[sp,#-16]!	// ABI spec says so
1122 	stp	d10,d11,[sp,#-16]!
1123 	stp	d12,d13,[sp,#-16]!
1124 	stp	d14,d15,[sp,#-16]!
1125 
1126 	mov	x17, x2
1127 	mov	x2,  x3
1128 	bl	_vpaes_encrypt_preheat
1129 	tst	x17, #16
1130 	b.eq	Lecb_enc_loop
1131 
1132 	ld1	{v7.16b}, [x0],#16
1133 	bl	_vpaes_encrypt_core
1134 	st1	{v0.16b}, [x1],#16
1135 	subs	x17, x17, #16
1136 	b.ls	Lecb_enc_done
1137 
1138 .align	4
1139 Lecb_enc_loop:
1140 	ld1	{v14.16b,v15.16b}, [x0], #32
1141 	bl	_vpaes_encrypt_2x
1142 	st1	{v0.16b,v1.16b}, [x1], #32
1143 	subs	x17, x17, #32
1144 	b.hi	Lecb_enc_loop
1145 
1146 Lecb_enc_done:
1147 	ldp	d14,d15,[sp],#16
1148 	ldp	d12,d13,[sp],#16
1149 	ldp	d10,d11,[sp],#16
1150 	ldp	d8,d9,[sp],#16
1151 	ldp	x29,x30,[sp],#16
1152 .long	0xd50323bf		// autiasp
1153 	ret
1154 
1155 
1156 .globl	_vpaes_ecb_decrypt
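// ECB decryption: same register convention as vpaes_ecb_encrypt above,
// with x3 pointing at a decryption key schedule.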
1157 
1158 .align	4
1159 _vpaes_ecb_decrypt:
1160 .long	0xd503233f		// paciasp
1161 	stp	x29,x30,[sp,#-16]!
1162 	add	x29,sp,#0
1163 	stp	d8,d9,[sp,#-16]!	// ABI spec says so
1164 	stp	d10,d11,[sp,#-16]!
1165 	stp	d12,d13,[sp,#-16]!
1166 	stp	d14,d15,[sp,#-16]!
1167 
1168 	mov	x17, x2
1169 	mov	x2,  x3
1170 	bl	_vpaes_decrypt_preheat
1171 	tst	x17, #16
1172 	b.eq	Lecb_dec_loop
1173 
1174 	ld1	{v7.16b}, [x0],#16
	bl	_vpaes_decrypt_core
1176 	st1	{v0.16b}, [x1],#16
1177 	subs	x17, x17, #16
1178 	b.ls	Lecb_dec_done
1179 
1180 .align	4
1181 Lecb_dec_loop:
1182 	ld1	{v14.16b,v15.16b}, [x0], #32
1183 	bl	_vpaes_decrypt_2x
1184 	st1	{v0.16b,v1.16b}, [x1], #32
1185 	subs	x17, x17, #32
1186 	b.hi	Lecb_dec_loop
1187 
1188 Lecb_dec_done:
1189 	ldp	d14,d15,[sp],#16
1190 	ldp	d12,d13,[sp],#16
1191 	ldp	d10,d11,[sp],#16
1192 	ldp	d8,d9,[sp],#16
1193 	ldp	x29,x30,[sp],#16
1194 .long	0xd50323bf		// autiasp
1195 	ret
1196 
1197