1 #include "arm_arch.h"
2 
3 #if __ARM_MAX_ARCH__>=7
4 
5 .text
//----------------------------------------------------------------------
// _gcm_init_v8 -- precompute the GHASH key table from the hash key H.
// NOTE(review): the C prototype is presumably
//   void gcm_init_v8(u128 Htable[16], const u64 H[2]);
// confirm against the C declaration, it is not visible in this file.
//
// In:   x0 = output table; six 16-byte entries (96 bytes) are written
//       x1 = pointer to the 16-byte hash key H
// Out:  table at x0, laid out as
//         [0] twisted H
//         [1] (H.lo^H.hi) in d[0], (H^2.lo^H^2.hi) in d[1]
//         [2] H^2
//         [3] H^3
//         [4] (H^3.lo^H^3.hi) in d[0], (H^4.lo^H^4.hi) in d[1]
//         [5] H^4
//       Entries [1] and [4] are the Karatsuba "middle term" operands,
//       precomputed so the multiply routines need not derive them.
// Clobbers: x0 (advanced by 48), v0-v7, v16-v22.
//           No stack use; no flag-setting instruction is executed.
//----------------------------------------------------------------------
6 .globl	_gcm_init_v8
7 
8 .align	4
9 _gcm_init_v8:
10 	ld1	{v17.2d},[x1]		//load input H
11 	movi	v19.16b,#0xe1
12 	shl	v19.2d,v19.2d,#57		//0xc2.0 (each 64-bit lane = 0xc200000000000000)
13 	ext	v3.16b,v17.16b,v17.16b,#8	//swap the two 64-bit halves of H
14 	ushr	v18.2d,v19.2d,#63	//each lane = 1
15 	dup	v17.4s,v17.s[1]		//replicate the 32-bit word holding the carry-out bit
16 	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
17 	ushr	v18.2d,v3.2d,#63	//top bit of each half, about to be shifted out
18 	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
19 	and	v18.16b,v18.16b,v16.16b
20 	shl	v3.2d,v3.2d,#1
21 	ext	v18.16b,v18.16b,v18.16b,#8	//move the inter-half carry to the other lane
22 	and	v16.16b,v16.16b,v17.16b	//t0 if carry-out was set, else 0
23 	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
24 	eor	v20.16b,v3.16b,v16.16b		//twisted H
25 	st1	{v20.2d},[x0],#16		//store Htable[0]
26 
27 	//calculate H^2
28 	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
29 	pmull	v0.1q,v20.1d,v20.1d	//lo·lo
30 	eor	v16.16b,v16.16b,v20.16b	//v16 = H.lo^H.hi in both halves
31 	pmull2	v2.1q,v20.2d,v20.2d	//hi·hi
32 	pmull	v1.1q,v16.1d,v16.1d	//(lo^hi)·(lo^hi)
33 
34 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
35 	eor	v18.16b,v0.16b,v2.16b
36 	eor	v1.16b,v1.16b,v17.16b
37 	eor	v1.16b,v1.16b,v18.16b
38 	pmull	v18.1q,v0.1d,v19.1d		//1st phase
39 
40 	ins	v2.d[0],v1.d[1]
41 	ins	v1.d[1],v0.d[0]
42 	eor	v0.16b,v1.16b,v18.16b
43 
44 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
45 	pmull	v0.1q,v0.1d,v19.1d
46 	eor	v18.16b,v18.16b,v2.16b
47 	eor	v22.16b,v0.16b,v18.16b	//v22 = H^2
48 
49 	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
50 	eor	v17.16b,v17.16b,v22.16b	//v17 = H^2.lo^H^2.hi in both halves
51 	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
52 	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
53 	//calculate H^3 and H^4
	//Two independent multiply+reduce chains are interleaved below:
	//  v0/v1/v2 (+v18) : H   * H^2 -> H^3
	//  v5/v6/v7 (+v4)  : H^2 * H^2 -> H^4
54 	pmull	v0.1q,v20.1d, v22.1d
55 	pmull	v5.1q,v22.1d,v22.1d
56 	pmull2	v2.1q,v20.2d, v22.2d
57 	pmull2	v7.1q,v22.2d,v22.2d
58 	pmull	v1.1q,v16.1d,v17.1d
59 	pmull	v6.1q,v17.1d,v17.1d
60 
61 	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
62 	ext	v17.16b,v5.16b,v7.16b,#8
63 	eor	v18.16b,v0.16b,v2.16b
64 	eor	v1.16b,v1.16b,v16.16b
65 	eor	v4.16b,v5.16b,v7.16b
66 	eor	v6.16b,v6.16b,v17.16b
67 	eor	v1.16b,v1.16b,v18.16b
68 	pmull	v18.1q,v0.1d,v19.1d		//1st phase
69 	eor	v6.16b,v6.16b,v4.16b
70 	pmull	v4.1q,v5.1d,v19.1d
71 
72 	ins	v2.d[0],v1.d[1]
73 	ins	v7.d[0],v6.d[1]
74 	ins	v1.d[1],v0.d[0]
75 	ins	v6.d[1],v5.d[0]
76 	eor	v0.16b,v1.16b,v18.16b
77 	eor	v5.16b,v6.16b,v4.16b
78 
79 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
80 	ext	v4.16b,v5.16b,v5.16b,#8
81 	pmull	v0.1q,v0.1d,v19.1d
82 	pmull	v5.1q,v5.1d,v19.1d
83 	eor	v18.16b,v18.16b,v2.16b
84 	eor	v4.16b,v4.16b,v7.16b
85 	eor	v20.16b, v0.16b,v18.16b		//H^3
86 	eor	v22.16b,v5.16b,v4.16b		//H^4
87 
88 	ext	v16.16b,v20.16b, v20.16b,#8		//Karatsuba pre-processing
89 	ext	v17.16b,v22.16b,v22.16b,#8
90 	eor	v16.16b,v16.16b,v20.16b
91 	eor	v17.16b,v17.16b,v22.16b
92 	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
93 	st1	{v20.2d,v21.2d,v22.2d},[x0]		//store Htable[3..5]
94 	ret
95 
//----------------------------------------------------------------------
// _gcm_gmult_v8 -- multiply the GHASH accumulator Xi by H, in place.
// NOTE(review): the C prototype is presumably
//   void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
// confirm against the C declaration, it is not visible in this file.
//
// In:   x0 = Xi, 16 bytes, read and then overwritten with the product
//       x1 = key table; only the first 32 bytes are read
//            (twisted H and the packed Karatsuba operand)
// Out:  Xi at x0 updated
// Clobbers: v0-v3, v17-v21. x0/x1 are not modified; no stack use;
//           no flag-setting instruction is executed.
//----------------------------------------------------------------------
96 .globl	_gcm_gmult_v8
97 
98 .align	4
99 _gcm_gmult_v8:
100 	ld1	{v17.2d},[x0]		//load Xi
101 	movi	v19.16b,#0xe1
102 	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
103 	shl	v19.2d,v19.2d,#57	//compose 0xc2.0 reduction constant
104 #ifndef __AARCH64EB__
105 	rev64	v17.16b,v17.16b	//little-endian build: byte-reverse each 64-bit half
106 #endif
107 	ext	v3.16b,v17.16b,v17.16b,#8	//swap halves; v3 is the multiplicand
108 
109 	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
110 	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
111 	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
112 	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
113 
114 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
115 	eor	v18.16b,v0.16b,v2.16b
116 	eor	v1.16b,v1.16b,v17.16b
117 	eor	v1.16b,v1.16b,v18.16b
118 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
119 
120 	ins	v2.d[0],v1.d[1]
121 	ins	v1.d[1],v0.d[0]
122 	eor	v0.16b,v1.16b,v18.16b
123 
124 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
125 	pmull	v0.1q,v0.1d,v19.1d
126 	eor	v18.16b,v18.16b,v2.16b
127 	eor	v0.16b,v0.16b,v18.16b
128 
129 #ifndef __AARCH64EB__
130 	rev64	v0.16b,v0.16b	//undo the byte reversal done on load
131 #endif
132 	ext	v0.16b,v0.16b,v0.16b,#8	//undo the half swap done on load
133 	st1	{v0.2d},[x0]		//write out Xi
134 
135 	ret
136 
//----------------------------------------------------------------------
// _gcm_ghash_v8 -- fold a run of 16-byte blocks into the GHASH
// accumulator Xi.
// NOTE(review): the C prototype is presumably
//   void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16],
//                     const u8 *inp, size_t len);
// confirm against the C declaration, it is not visible in this file.
//
// In:   x0 = Xi, 16 bytes, read and then overwritten
//       x1 = key table (this path reads entries [0..2]: H, packed, H^2)
//       x2 = input pointer
//       x3 = input length in bytes
//            NOTE(review): the first block is loaded unconditionally and
//            the tail logic only distinguishes multiples of 16, so len is
//            assumed to be a non-zero multiple of 16 -- confirm at callers.
// Out:  Xi at x0 updated
// Flow: len >= 64 branches to Lgcm_ghash_v8_4x (four blocks per
//       iteration). Otherwise Loop_mod2x_v8 consumes two blocks per
//       iteration, computing (Xi^I[i])·H^2 ^ I[i+1]·H, and Lodd_tail_v8
//       handles a final single block.
// Clobbers: x1, x2, x3, x12, condition flags, v0-v7, v16-v22
//           (the 4x path additionally uses v23-v31). No stack use.
//----------------------------------------------------------------------
137 .globl	_gcm_ghash_v8
138 
139 .align	4
140 _gcm_ghash_v8:
141 	cmp	x3,#64
142 	b.hs	Lgcm_ghash_v8_4x	//64 bytes or more: use the 4-blocks-per-iteration path
143 	ld1	{v0.2d},[x0]		//load [rotated] Xi
144 						//"[rotated]" means that
145 						//loaded value would have
146 						//to be rotated in order to
147 						//make it appear as in
148 						//algorithm specification
149 	subs	x3,x3,#32		//see if x3 is 32 or larger
150 	mov	x12,#16		//x12 is used as post-
151 						//increment for input pointer;
152 						//as loop is modulo-scheduled
153 						//x12 is zeroed just in time
154 						//to preclude overstepping
155 						//inp[len], which means that
156 						//last block[s] are actually
157 						//loaded twice, but last
158 						//copy is not processed
159 	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
160 	movi	v19.16b,#0xe1
161 	ld1	{v22.2d},[x1]
162 	csel	x12,xzr,x12,eq			//is it time to zero x12?
163 	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
164 	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
165 	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
166 #ifndef __AARCH64EB__
167 	rev64	v16.16b,v16.16b
168 	rev64	v0.16b,v0.16b
169 #endif
170 	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
171 	b.lo	Lodd_tail_v8		//x3 was less than 32
172 	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
173 #ifndef __AARCH64EB__
174 	rev64	v17.16b,v17.16b
175 #endif
176 	ext	v7.16b,v17.16b,v17.16b,#8
177 	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
178 	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
179 	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
180 	pmull2	v6.1q,v20.2d,v7.2d
181 	b	Loop_mod2x_v8
182 
	//Loop state on entry to each iteration:
	//  v3      = Xi ^ I[i]                  (to be multiplied by H^2)
	//  v4, v6  = lo and hi halves of H·I[i+1], already started
	//  v17     = Karatsuba-folded I[i+1]    (for the middle term)
	//  x3      = remaining length minus 32; flags from "subs" below are
	//            still live at the closing b.hs (nothing in between
	//            sets flags)
183 .align	4
184 Loop_mod2x_v8:
185 	ext	v18.16b,v3.16b,v3.16b,#8
186 	subs	x3,x3,#32		//is there more data?
187 	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
188 	csel	x12,xzr,x12,lo			//is it time to zero x12?
189 
190 	pmull	v5.1q,v21.1d,v17.1d	//middle term of H·I[i+1]
191 	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
192 	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
193 	eor	v0.16b,v0.16b,v4.16b		//accumulate
194 	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
195 	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]
196 
197 	eor	v2.16b,v2.16b,v6.16b
198 	csel	x12,xzr,x12,eq			//is it time to zero x12?
199 	eor	v1.16b,v1.16b,v5.16b
200 
201 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
202 	eor	v18.16b,v0.16b,v2.16b
203 	eor	v1.16b,v1.16b,v17.16b
204 	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
205 #ifndef __AARCH64EB__
206 	rev64	v16.16b,v16.16b
207 #endif
208 	eor	v1.16b,v1.16b,v18.16b
209 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
210 
211 #ifndef __AARCH64EB__
212 	rev64	v17.16b,v17.16b
213 #endif
214 	ins	v2.d[0],v1.d[1]
215 	ins	v1.d[1],v0.d[0]
216 	ext	v7.16b,v17.16b,v17.16b,#8
217 	ext	v3.16b,v16.16b,v16.16b,#8
218 	eor	v0.16b,v1.16b,v18.16b
219 	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
220 	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early
221 
222 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
223 	pmull	v0.1q,v0.1d,v19.1d
224 	eor	v3.16b,v3.16b,v18.16b
225 	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
226 	eor	v3.16b,v3.16b,v0.16b
227 	pmull2	v6.1q,v20.2d,v7.2d
228 	b.hs	Loop_mod2x_v8		//there was at least 32 more bytes
229 
	//Loop exit: the next iteration's v3 was built speculatively above.
	//Undo that so v0 holds the fully reduced Xi and v3 holds only the
	//rotated pending block, as Lodd_tail_v8 expects.
230 	eor	v2.16b,v2.16b,v18.16b
231 	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
232 	adds	x3,x3,#32		//re-construct x3
233 	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
234 	b.eq	Ldone_v8		//is x3 zero?
	//Single remaining block: Xi = (Xi ^ I)·H
	//On entry v0 = Xi, v16 = byte-reversed block, v3 = v16 with halves swapped.
235 Lodd_tail_v8:
236 	ext	v18.16b,v0.16b,v0.16b,#8
237 	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
238 	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi
239 
240 	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
241 	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
242 	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
243 	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
244 
245 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
246 	eor	v18.16b,v0.16b,v2.16b
247 	eor	v1.16b,v1.16b,v17.16b
248 	eor	v1.16b,v1.16b,v18.16b
249 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
250 
251 	ins	v2.d[0],v1.d[1]
252 	ins	v1.d[1],v0.d[0]
253 	eor	v0.16b,v1.16b,v18.16b
254 
255 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
256 	pmull	v0.1q,v0.1d,v19.1d
257 	eor	v18.16b,v18.16b,v2.16b
258 	eor	v0.16b,v0.16b,v18.16b
259 
260 Ldone_v8:
261 #ifndef __AARCH64EB__
262 	rev64	v0.16b,v0.16b	//undo the byte reversal done on load
263 #endif
264 	ext	v0.16b,v0.16b,v0.16b,#8	//undo the rotation done on load
265 	st1	{v0.2d},[x0]		//write out Xi
266 
267 	ret
268 
269 
//----------------------------------------------------------------------
// gcm_ghash_v8_4x -- GHASH over long inputs, four blocks per iteration.
// Not exported (no .globl); in this file it is reached only by the
// branch from _gcm_ghash_v8 taken when len >= 64.
//
// In:   x0 = Xi, 16 bytes, read and then overwritten
//       x1 = key table; all six entries written by _gcm_init_v8 are read
//       x2 = input pointer
//       x3 = input length in bytes, at least 64 on entry
//            NOTE(review): the tail dispatch only distinguishes 0/16/32/48
//            leftover bytes, so len is assumed to be a multiple of 16.
// Out:  Xi at x0 updated
//
// Per group of four blocks I0..I3 the update is
//   Xi = (Xi^I0)·H^4 ^ I1·H^3 ^ I2·H^2 ^ I3·H
// The three products that do not depend on Xi are computed one group
// ahead and kept unreduced in
//   v29 = sum of low halves, v30 = sum of Karatsuba middle terms,
//   v31 = sum of high halves
// so only the H^4 product sits on the Xi dependency chain.
//
// Key registers: v20=H, v22=H^2, v26=H^3, v28=H^4;
//   v21 = packed (H, H^2) Karatsuba operands, v27 = packed (H^3, H^4);
//   v19 = 0xc2.0 reduction constant.
// Clobbers: x1, x2, x3, condition flags, v0-v7, v16-v31. No stack use;
//           the callee-saved v8-v15 are not touched.
//----------------------------------------------------------------------
270 .align	4
271 gcm_ghash_v8_4x:
272 Lgcm_ghash_v8_4x:
273 	ld1	{v0.2d},[x0]		//load [rotated] Xi
274 	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
275 	movi	v19.16b,#0xe1
276 	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
277 	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
278 
279 	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64	//load first group I0..I3
280 #ifndef __AARCH64EB__
281 	rev64	v0.16b,v0.16b
282 	rev64	v5.16b,v5.16b
283 	rev64	v6.16b,v6.16b
284 	rev64	v7.16b,v7.16b
285 	rev64	v4.16b,v4.16b
286 #endif
287 	ext	v25.16b,v7.16b,v7.16b,#8
288 	ext	v24.16b,v6.16b,v6.16b,#8
289 	ext	v23.16b,v5.16b,v5.16b,#8
290 
291 	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
292 	eor	v7.16b,v7.16b,v25.16b
293 	pmull2	v31.1q,v20.2d,v25.2d
294 	pmull	v30.1q,v21.1d,v7.1d
295 
296 	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
297 	eor	v6.16b,v6.16b,v24.16b
298 	pmull2	v24.1q,v22.2d,v24.2d
299 	pmull2	v6.1q,v21.2d,v6.2d
300 
301 	eor	v29.16b,v29.16b,v16.16b
302 	eor	v31.16b,v31.16b,v24.16b
303 	eor	v30.16b,v30.16b,v6.16b
304 
305 	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
306 	eor	v5.16b,v5.16b,v23.16b
307 	pmull2	v23.1q,v26.2d,v23.2d
308 	pmull	v5.1q,v27.1d,v5.1d
309 
310 	eor	v29.16b,v29.16b,v7.16b
311 	eor	v31.16b,v31.16b,v23.16b
312 	eor	v30.16b,v30.16b,v5.16b
313 
314 	subs	x3,x3,#128
315 	b.lo	Ltail4x		//fewer than 128 bytes: no second full group to preload
316 
317 	b	Loop4x
318 
	//Each iteration finishes the group already in flight (H^4 product,
	//accumulate v29-v31, reduce into v0) while loading the next four
	//blocks and starting their H, H^2 and H^3 products.
319 .align	4
320 Loop4x:
321 	eor	v16.16b,v4.16b,v0.16b	//Xi ^ I0 of the group in flight
322 	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64	//load next group
323 	ext	v3.16b,v16.16b,v16.16b,#8
324 #ifndef __AARCH64EB__
325 	rev64	v5.16b,v5.16b
326 	rev64	v6.16b,v6.16b
327 	rev64	v7.16b,v7.16b
328 	rev64	v4.16b,v4.16b
329 #endif
330 
331 	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
332 	eor	v16.16b,v16.16b,v3.16b
333 	pmull2	v2.1q,v28.2d,v3.2d
334 	ext	v25.16b,v7.16b,v7.16b,#8
335 	pmull2	v1.1q,v27.2d,v16.2d
336 
337 	eor	v0.16b,v0.16b,v29.16b
338 	eor	v2.16b,v2.16b,v31.16b
339 	ext	v24.16b,v6.16b,v6.16b,#8
340 	eor	v1.16b,v1.16b,v30.16b
341 	ext	v23.16b,v5.16b,v5.16b,#8
342 
343 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
344 	eor	v18.16b,v0.16b,v2.16b
345 	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
346 	eor	v7.16b,v7.16b,v25.16b
347 	eor	v1.16b,v1.16b,v17.16b
348 	pmull2	v31.1q,v20.2d,v25.2d
349 	eor	v1.16b,v1.16b,v18.16b
350 	pmull	v30.1q,v21.1d,v7.1d
351 
352 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
353 	ins	v2.d[0],v1.d[1]
354 	ins	v1.d[1],v0.d[0]
355 	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
356 	eor	v6.16b,v6.16b,v24.16b
357 	pmull2	v24.1q,v22.2d,v24.2d
358 	eor	v0.16b,v1.16b,v18.16b
359 	pmull2	v6.1q,v21.2d,v6.2d
360 
361 	eor	v29.16b,v29.16b,v16.16b
362 	eor	v31.16b,v31.16b,v24.16b
363 	eor	v30.16b,v30.16b,v6.16b
364 
365 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
366 	pmull	v0.1q,v0.1d,v19.1d
367 	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
368 	eor	v5.16b,v5.16b,v23.16b
369 	eor	v18.16b,v18.16b,v2.16b
370 	pmull2	v23.1q,v26.2d,v23.2d
371 	pmull	v5.1q,v27.1d,v5.1d
372 
373 	eor	v0.16b,v0.16b,v18.16b
374 	eor	v29.16b,v29.16b,v7.16b
375 	eor	v31.16b,v31.16b,v23.16b
376 	ext	v0.16b,v0.16b,v0.16b,#8
377 	eor	v30.16b,v30.16b,v5.16b
378 
379 	subs	x3,x3,#64
380 	b.hs	Loop4x		//at least another 64 bytes beyond the group just loaded
381 
	//Last full group: v4 and v29-v31 are already prepared, nothing more
	//is preloaded. The product is left unreduced in v0/v1/v2 so the
	//shared reduction at Ldone4x (or at the head of a tail path) finishes it.
382 Ltail4x:
383 	eor	v16.16b,v4.16b,v0.16b
384 	ext	v3.16b,v16.16b,v16.16b,#8
385 
386 	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
387 	eor	v16.16b,v16.16b,v3.16b
388 	pmull2	v2.1q,v28.2d,v3.2d
389 	pmull2	v1.1q,v27.2d,v16.2d
390 
391 	eor	v0.16b,v0.16b,v29.16b
392 	eor	v2.16b,v2.16b,v31.16b
393 	eor	v1.16b,v1.16b,v30.16b
394 
395 	adds	x3,x3,#64		//x3 = bytes left after the last full group
396 	b.eq	Ldone4x
397 
398 	cmp	x3,#32
399 	b.lo	Lone		//below 32 bytes left: one block
400 	b.eq	Ltwo		//exactly 32 bytes left: two blocks
	//Three blocks left: reduce the pending product into Xi, then
	//Xi = (Xi^I0)·H^3 ^ I1·H^2 ^ I2·H (left unreduced for Ldone4x).
401 Lthree:
402 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
403 	eor	v18.16b,v0.16b,v2.16b
404 	eor	v1.16b,v1.16b,v17.16b
405 	ld1	{v4.2d,v5.2d,v6.2d},[x2]
406 	eor	v1.16b,v1.16b,v18.16b
407 #ifndef	__AARCH64EB__
408 	rev64	v5.16b,v5.16b
409 	rev64	v6.16b,v6.16b
410 	rev64	v4.16b,v4.16b
411 #endif
412 
413 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
414 	ins	v2.d[0],v1.d[1]
415 	ins	v1.d[1],v0.d[0]
416 	ext	v24.16b,v6.16b,v6.16b,#8
417 	ext	v23.16b,v5.16b,v5.16b,#8
418 	eor	v0.16b,v1.16b,v18.16b
419 
420 	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
421 	eor	v6.16b,v6.16b,v24.16b
422 
423 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
424 	pmull	v0.1q,v0.1d,v19.1d
425 	eor	v18.16b,v18.16b,v2.16b
426 	pmull2	v31.1q,v20.2d,v24.2d
427 	pmull	v30.1q,v21.1d,v6.1d
428 	eor	v0.16b,v0.16b,v18.16b
429 	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
430 	eor	v5.16b,v5.16b,v23.16b
431 	ext	v0.16b,v0.16b,v0.16b,#8
432 
433 	pmull2	v23.1q,v22.2d,v23.2d
434 	eor	v16.16b,v4.16b,v0.16b
435 	pmull2	v5.1q,v21.2d,v5.2d
436 	ext	v3.16b,v16.16b,v16.16b,#8
437 
438 	eor	v29.16b,v29.16b,v7.16b
439 	eor	v31.16b,v31.16b,v23.16b
440 	eor	v30.16b,v30.16b,v5.16b
441 
442 	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
443 	eor	v16.16b,v16.16b,v3.16b
444 	pmull2	v2.1q,v26.2d,v3.2d
445 	pmull	v1.1q,v27.1d,v16.1d
446 
447 	eor	v0.16b,v0.16b,v29.16b
448 	eor	v2.16b,v2.16b,v31.16b
449 	eor	v1.16b,v1.16b,v30.16b
450 	b	Ldone4x
451 
	//Two blocks left: reduce the pending product into Xi, then
	//Xi = (Xi^I0)·H^2 ^ I1·H (left unreduced for Ldone4x).
452 .align	4
453 Ltwo:
454 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
455 	eor	v18.16b,v0.16b,v2.16b
456 	eor	v1.16b,v1.16b,v17.16b
457 	ld1	{v4.2d,v5.2d},[x2]
458 	eor	v1.16b,v1.16b,v18.16b
459 #ifndef	__AARCH64EB__
460 	rev64	v5.16b,v5.16b
461 	rev64	v4.16b,v4.16b
462 #endif
463 
464 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
465 	ins	v2.d[0],v1.d[1]
466 	ins	v1.d[1],v0.d[0]
467 	ext	v23.16b,v5.16b,v5.16b,#8
468 	eor	v0.16b,v1.16b,v18.16b
469 
470 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
471 	pmull	v0.1q,v0.1d,v19.1d
472 	eor	v18.16b,v18.16b,v2.16b
473 	eor	v0.16b,v0.16b,v18.16b
474 	ext	v0.16b,v0.16b,v0.16b,#8
475 
476 	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
477 	eor	v5.16b,v5.16b,v23.16b
478 
479 	eor	v16.16b,v4.16b,v0.16b
480 	ext	v3.16b,v16.16b,v16.16b,#8
481 
482 	pmull2	v31.1q,v20.2d,v23.2d
483 	pmull	v30.1q,v21.1d,v5.1d
484 
485 	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
486 	eor	v16.16b,v16.16b,v3.16b
487 	pmull2	v2.1q,v22.2d,v3.2d
488 	pmull2	v1.1q,v21.2d,v16.2d
489 
490 	eor	v0.16b,v0.16b,v29.16b
491 	eor	v2.16b,v2.16b,v31.16b
492 	eor	v1.16b,v1.16b,v30.16b
493 	b	Ldone4x
494 
	//One block left: reduce the pending product into Xi, then
	//Xi = (Xi^I0)·H (left unreduced; falls through into Ldone4x).
495 .align	4
496 Lone:
497 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
498 	eor	v18.16b,v0.16b,v2.16b
499 	eor	v1.16b,v1.16b,v17.16b
500 	ld1	{v4.2d},[x2]
501 	eor	v1.16b,v1.16b,v18.16b
502 #ifndef	__AARCH64EB__
503 	rev64	v4.16b,v4.16b
504 #endif
505 
506 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
507 	ins	v2.d[0],v1.d[1]
508 	ins	v1.d[1],v0.d[0]
509 	eor	v0.16b,v1.16b,v18.16b
510 
511 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
512 	pmull	v0.1q,v0.1d,v19.1d
513 	eor	v18.16b,v18.16b,v2.16b
514 	eor	v0.16b,v0.16b,v18.16b
515 	ext	v0.16b,v0.16b,v0.16b,#8
516 
517 	eor	v16.16b,v4.16b,v0.16b
518 	ext	v3.16b,v16.16b,v16.16b,#8
519 
520 	pmull	v0.1q,v20.1d,v3.1d	//H·(Xi+Ii)
521 	eor	v16.16b,v16.16b,v3.16b
522 	pmull2	v2.1q,v20.2d,v3.2d
523 	pmull	v1.1q,v21.1d,v16.1d
524 
	//Common exit: final Karatsuba combine and two-phase reduction of the
	//product held in v0 (lo), v1 (mid), v2 (hi), then store Xi.
525 Ldone4x:
526 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
527 	eor	v18.16b,v0.16b,v2.16b
528 	eor	v1.16b,v1.16b,v17.16b
529 	eor	v1.16b,v1.16b,v18.16b
530 
531 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
532 	ins	v2.d[0],v1.d[1]
533 	ins	v1.d[1],v0.d[0]
534 	eor	v0.16b,v1.16b,v18.16b
535 
536 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
537 	pmull	v0.1q,v0.1d,v19.1d
538 	eor	v18.16b,v18.16b,v2.16b
539 	eor	v0.16b,v0.16b,v18.16b
540 	ext	v0.16b,v0.16b,v0.16b,#8	//undo the rotation done on load
541 
542 #ifndef __AARCH64EB__
543 	rev64	v0.16b,v0.16b	//undo the byte reversal done on load
544 #endif
545 	st1	{v0.2d},[x0]		//write out Xi
546 
547 	ret
548 
//NUL-terminated ASCII identification string embedded in the object:
//"GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
549 .byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
550 .align	2
551 .align	2
552 #endif
553