/* SPDX-License-Identifier: GPL-2.0-or-later */
#
# Accelerated poly1305 implementation for ppc64le.
#
# Copyright 2023- IBM Corp. All rights reserved
#
#===================================================================================
# Written by Danny Tsen <dtsen@us.ibm.com>
#
# Poly1305 - this version mainly uses vector/VSX and scalar instructions
#  - 26-bit limbs
#  - Handles multiple 64-byte blocks.
#
# Block size 16 bytes
# key = (r, s)
# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF
# p = 2^130 - 5
# a += m
# a = (a * r) % p
# a += s
#
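# As a reference for the key handling above, a minimal C sketch of the clamp
# (illustrative only, not part of this file; assumes a little-endian host and
# the usual key layout r = key[0..15], s = key[16..31]):
#
#	#include <stdint.h>
#	#include <string.h>
#
#	static void poly1305_clamp_r(const uint8_t key[32], uint64_t r[2])
#	{
#		memcpy(&r[0], key, 8);		/* low 64 bits of r   */
#		memcpy(&r[1], key + 8, 8);	/* high 64 bits of r  */
#		r[0] &= 0x0ffffffc0fffffffULL;	/* 0x0FFFFFFC0FFFFFFF */
#		r[1] &= 0x0ffffffc0ffffffcULL;	/* 0x0FFFFFFC0FFFFFFC */
#	}
#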
# Performance is improved by breaking the polynomial down into a sum of products:
#     h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
#
#  07/22/21 - this revision is based on the above sum of products.  Set up r^4, r^3, r^2, r and s3, s2, s1, s0
#             in 9 vectors for the multiplications.
#
# setup r^4, r^3, r^2, r vectors
#    vs    [r^1, r^3, r^2, r^4]
#    vs0 = [r0,.....]
#    vs1 = [r1,.....]
#    vs2 = [r2,.....]
#    vs3 = [r3,.....]
#    vs4 = [r4,.....]
#    vs5 = [r4*5,...]
#    vs6 = [r3*5,...]
#    vs7 = [r2*5,...]
#    vs8 = [r1*5,...]
#
#  Each word in a vector holds one "r/s" member of the [a * r/s] products.
#
# r0, r4*5, r3*5, r2*5, r1*5;
# r1, r0,   r4*5, r3*5, r2*5;
# r2, r1,   r0,   r4*5, r3*5;
# r3, r2,   r1,   r0,   r4*5;
# r4, r3,   r2,   r1,   r0  ;
#
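# For one pass over four blocks this decomposition is just Horner's rule
# unrolled once: starting from an accumulator h,
#
#   h' = ((((h + m1) * r + m2) * r + m3) * r + m4) * r
#      = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
#
# so the four blocks can be multiplied by r^4, r^3, r^2 and r independently
# and the products summed, which is what the lane layout above supports.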
#
# poly1305_p10le_4blocks(uint8_t *k, uint8_t *m, uint32_t mlen)
#  k = 32-byte key
#  r3 = k (r, s)
#  r4 = m
#  r5 = mlen
#
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/asm-compat.h>
#include <linux/linkage.h>

.machine "any"

.text

.macro	SAVE_GPR GPR OFFSET FRAME
	std	\GPR,\OFFSET(\FRAME)
.endm

.macro	SAVE_VRS VRS OFFSET FRAME
	li	16, \OFFSET
	stvx	\VRS, 16, \FRAME
.endm

.macro	SAVE_VSX VSX OFFSET FRAME
	li	16, \OFFSET
	stxvx	\VSX, 16, \FRAME
.endm

.macro	RESTORE_GPR GPR OFFSET FRAME
	ld	\GPR,\OFFSET(\FRAME)
.endm

.macro	RESTORE_VRS VRS OFFSET FRAME
	li	16, \OFFSET
	lvx	\VRS, 16, \FRAME
.endm

.macro	RESTORE_VSX VSX OFFSET FRAME
	li	16, \OFFSET
	lxvx	\VSX, 16, \FRAME
.endm

.macro SAVE_REGS
	mflr 0
	std 0, 16(1)
	stdu 1,-752(1)

	SAVE_GPR 14, 112, 1
	SAVE_GPR 15, 120, 1
	SAVE_GPR 16, 128, 1
	SAVE_GPR 17, 136, 1
	SAVE_GPR 18, 144, 1
	SAVE_GPR 19, 152, 1
	SAVE_GPR 20, 160, 1
	SAVE_GPR 21, 168, 1
	SAVE_GPR 22, 176, 1
	SAVE_GPR 23, 184, 1
	SAVE_GPR 24, 192, 1
	SAVE_GPR 25, 200, 1
	SAVE_GPR 26, 208, 1
	SAVE_GPR 27, 216, 1
	SAVE_GPR 28, 224, 1
	SAVE_GPR 29, 232, 1
	SAVE_GPR 30, 240, 1
	SAVE_GPR 31, 248, 1

	addi	9, 1, 256
	SAVE_VRS 20, 0, 9
	SAVE_VRS 21, 16, 9
	SAVE_VRS 22, 32, 9
	SAVE_VRS 23, 48, 9
	SAVE_VRS 24, 64, 9
	SAVE_VRS 25, 80, 9
	SAVE_VRS 26, 96, 9
	SAVE_VRS 27, 112, 9
	SAVE_VRS 28, 128, 9
	SAVE_VRS 29, 144, 9
	SAVE_VRS 30, 160, 9
	SAVE_VRS 31, 176, 9

	SAVE_VSX 14, 192, 9
	SAVE_VSX 15, 208, 9
	SAVE_VSX 16, 224, 9
	SAVE_VSX 17, 240, 9
	SAVE_VSX 18, 256, 9
	SAVE_VSX 19, 272, 9
	SAVE_VSX 20, 288, 9
	SAVE_VSX 21, 304, 9
	SAVE_VSX 22, 320, 9
	SAVE_VSX 23, 336, 9
	SAVE_VSX 24, 352, 9
	SAVE_VSX 25, 368, 9
	SAVE_VSX 26, 384, 9
	SAVE_VSX 27, 400, 9
	SAVE_VSX 28, 416, 9
	SAVE_VSX 29, 432, 9
	SAVE_VSX 30, 448, 9
	SAVE_VSX 31, 464, 9
.endm # SAVE_REGS

.macro RESTORE_REGS
	addi	9, 1, 256
	RESTORE_VRS 20, 0, 9
	RESTORE_VRS 21, 16, 9
	RESTORE_VRS 22, 32, 9
	RESTORE_VRS 23, 48, 9
	RESTORE_VRS 24, 64, 9
	RESTORE_VRS 25, 80, 9
	RESTORE_VRS 26, 96, 9
	RESTORE_VRS 27, 112, 9
	RESTORE_VRS 28, 128, 9
	RESTORE_VRS 29, 144, 9
	RESTORE_VRS 30, 160, 9
	RESTORE_VRS 31, 176, 9

	RESTORE_VSX 14, 192, 9
	RESTORE_VSX 15, 208, 9
	RESTORE_VSX 16, 224, 9
	RESTORE_VSX 17, 240, 9
	RESTORE_VSX 18, 256, 9
	RESTORE_VSX 19, 272, 9
	RESTORE_VSX 20, 288, 9
	RESTORE_VSX 21, 304, 9
	RESTORE_VSX 22, 320, 9
	RESTORE_VSX 23, 336, 9
	RESTORE_VSX 24, 352, 9
	RESTORE_VSX 25, 368, 9
	RESTORE_VSX 26, 384, 9
	RESTORE_VSX 27, 400, 9
	RESTORE_VSX 28, 416, 9
	RESTORE_VSX 29, 432, 9
	RESTORE_VSX 30, 448, 9
	RESTORE_VSX 31, 464, 9

	RESTORE_GPR 14, 112, 1
	RESTORE_GPR 15, 120, 1
	RESTORE_GPR 16, 128, 1
	RESTORE_GPR 17, 136, 1
	RESTORE_GPR 18, 144, 1
	RESTORE_GPR 19, 152, 1
	RESTORE_GPR 20, 160, 1
	RESTORE_GPR 21, 168, 1
	RESTORE_GPR 22, 176, 1
	RESTORE_GPR 23, 184, 1
	RESTORE_GPR 24, 192, 1
	RESTORE_GPR 25, 200, 1
	RESTORE_GPR 26, 208, 1
	RESTORE_GPR 27, 216, 1
	RESTORE_GPR 28, 224, 1
	RESTORE_GPR 29, 232, 1
	RESTORE_GPR 30, 240, 1
	RESTORE_GPR 31, 248, 1

	addi    1, 1, 752
	ld 0, 16(1)
	mtlr 0
.endm # RESTORE_REGS

#
# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5;
# p[1] = a0*r1 + a1*r0   + a2*r4*5 + a3*r3*5 + a4*r2*5;
# p[2] = a0*r2 + a1*r1   + a2*r0   + a3*r4*5 + a4*r3*5;
# p[3] = a0*r3 + a1*r2   + a2*r1   + a3*r0   + a4*r4*5;
# p[4] = a0*r4 + a1*r3   + a2*r2   + a3*r1   + a4*r0  ;
#
#    [r^2, r^3, r^1, r^4]
#    [m3,  m2,  m4,  m1]
#
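# As a plain C reference for the p[0..4] formulas above (a sketch only; a[]
# and r[] are assumed to hold 26-bit limbs, so every 64-bit product and the
# five-term sums below fit without overflow; the names are illustrative):
#
#	#include <stdint.h>
#
#	static void poly1305_mul_limbs(const uint64_t a[5], const uint64_t r[5],
#				       uint64_t p[5])
#	{
#		uint64_t s1 = r[1] * 5, s2 = r[2] * 5;
#		uint64_t s3 = r[3] * 5, s4 = r[4] * 5;
#
#		p[0] = a[0]*r[0] + a[1]*s4   + a[2]*s3   + a[3]*s2   + a[4]*s1;
#		p[1] = a[0]*r[1] + a[1]*r[0] + a[2]*s4   + a[3]*s3   + a[4]*s2;
#		p[2] = a[0]*r[2] + a[1]*r[1] + a[2]*r[0] + a[3]*s4   + a[4]*s3;
#		p[3] = a[0]*r[3] + a[1]*r[2] + a[2]*r[1] + a[3]*r[0] + a[4]*s4;
#		p[4] = a[0]*r[4] + a[1]*r[3] + a[2]*r[2] + a[3]*r[1] + a[4]*r[0];
#	}
#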
# multiply odd and even words
.macro mul_odd
	vmulouw	14, 4, 26
	vmulouw	10, 5, 3
	vmulouw	11, 6, 2
	vmulouw	12, 7, 1
	vmulouw	13, 8, 0
	vmulouw	15, 4, 27
	vaddudm	14, 14, 10
	vaddudm	14, 14, 11
	vmulouw	10, 5, 26
	vmulouw	11, 6, 3
	vaddudm	14, 14, 12
	vaddudm	14, 14, 13	# x0
	vaddudm	15, 15, 10
	vaddudm	15, 15, 11
	vmulouw	12, 7, 2
	vmulouw	13, 8, 1
	vaddudm	15, 15, 12
	vaddudm	15, 15, 13	# x1
	vmulouw	16, 4, 28
	vmulouw	10, 5, 27
	vmulouw	11, 6, 26
	vaddudm	16, 16, 10
	vaddudm	16, 16, 11
	vmulouw	12, 7, 3
	vmulouw	13, 8, 2
	vaddudm	16, 16, 12
	vaddudm	16, 16, 13	# x2
	vmulouw	17, 4, 29
	vmulouw	10, 5, 28
	vmulouw	11, 6, 27
	vaddudm	17, 17, 10
	vaddudm	17, 17, 11
	vmulouw	12, 7, 26
	vmulouw	13, 8, 3
	vaddudm	17, 17, 12
	vaddudm	17, 17, 13	# x3
	vmulouw	18, 4, 30
	vmulouw	10, 5, 29
	vmulouw	11, 6, 28
	vaddudm	18, 18, 10
	vaddudm	18, 18, 11
	vmulouw	12, 7, 27
	vmulouw	13, 8, 26
	vaddudm	18, 18, 12
	vaddudm	18, 18, 13	# x4
.endm

.macro mul_even
	vmuleuw	9, 4, 26
	vmuleuw	10, 5, 3
	vmuleuw	11, 6, 2
	vmuleuw	12, 7, 1
	vmuleuw	13, 8, 0
	vaddudm	14, 14, 9
	vaddudm	14, 14, 10
	vaddudm	14, 14, 11
	vaddudm	14, 14, 12
	vaddudm	14, 14, 13	# x0

	vmuleuw	9, 4, 27
	vmuleuw	10, 5, 26
	vmuleuw	11, 6, 3
	vmuleuw	12, 7, 2
	vmuleuw	13, 8, 1
	vaddudm	15, 15, 9
	vaddudm	15, 15, 10
	vaddudm	15, 15, 11
	vaddudm	15, 15, 12
	vaddudm	15, 15, 13	# x1

	vmuleuw	9, 4, 28
	vmuleuw	10, 5, 27
	vmuleuw	11, 6, 26
	vmuleuw	12, 7, 3
	vmuleuw	13, 8, 2
	vaddudm	16, 16, 9
	vaddudm	16, 16, 10
	vaddudm	16, 16, 11
	vaddudm	16, 16, 12
	vaddudm	16, 16, 13	# x2

	vmuleuw	9, 4, 29
	vmuleuw	10, 5, 28
	vmuleuw	11, 6, 27
	vmuleuw	12, 7, 26
	vmuleuw	13, 8, 3
	vaddudm	17, 17, 9
	vaddudm	17, 17, 10
	vaddudm	17, 17, 11
	vaddudm	17, 17, 12
	vaddudm	17, 17, 13	# x3

	vmuleuw	9, 4, 30
	vmuleuw	10, 5, 29
	vmuleuw	11, 6, 28
	vmuleuw	12, 7, 27
	vmuleuw	13, 8, 26
	vaddudm	18, 18, 9
	vaddudm	18, 18, 10
	vaddudm	18, 18, 11
	vaddudm	18, 18, 12
	vaddudm	18, 18, 13	# x4
.endm

#
# poly1305_setup_r
#
# setup r^4, r^3, r^2, r vectors
#    [r, r^3, r^2, r^4]
#    vs0 = [r0,...]
#    vs1 = [r1,...]
#    vs2 = [r2,...]
#    vs3 = [r3,...]
#    vs4 = [r4,...]
#    vs5 = [r4*5,...]
#    vs6 = [r3*5,...]
#    vs7 = [r2*5,...]
#    vs8 = [r1*5,...]
#
# r0, r4*5, r3*5, r2*5, r1*5;
# r1, r0,   r4*5, r3*5, r2*5;
# r2, r1,   r0,   r4*5, r3*5;
# r3, r2,   r1,   r0,   r4*5;
# r4, r3,   r2,   r1,   r0  ;
#
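# Conceptually the powers are produced with the same multiply-and-reduce used
# for the message (do_mul below):
#
#   r^2         = r * r              (first do_mul call)
#   [r^3 | r^4] = [r | r^2] * r^2    (second do_mul call, both lanes at once)
#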
.macro poly1305_setup_r

	# save r
	xxlor	26, 58, 58
	xxlor	27, 59, 59
	xxlor	28, 60, 60
	xxlor	29, 61, 61
	xxlor	30, 62, 62

	xxlxor	31, 31, 31

#    [r, r^3, r^2, r^4]
	# compute r^2
	vmr	4, 26
	vmr	5, 27
	vmr	6, 28
	vmr	7, 29
	vmr	8, 30
	bl	do_mul		# r^2 r^1
	xxpermdi 58, 58, 36, 0x3		# r0
	xxpermdi 59, 59, 37, 0x3		# r1
	xxpermdi 60, 60, 38, 0x3		# r2
	xxpermdi 61, 61, 39, 0x3		# r3
	xxpermdi 62, 62, 40, 0x3		# r4
	xxpermdi 36, 36, 36, 0x3
	xxpermdi 37, 37, 37, 0x3
	xxpermdi 38, 38, 38, 0x3
	xxpermdi 39, 39, 39, 0x3
	xxpermdi 40, 40, 40, 0x3
	vspltisb 13, 2
	vsld	9, 27, 13
	vsld	10, 28, 13
	vsld	11, 29, 13
	vsld	12, 30, 13
	vaddudm	0, 9, 27
	vaddudm	1, 10, 28
	vaddudm	2, 11, 29
	vaddudm	3, 12, 30

	bl	do_mul		# r^4 r^3
	vmrgow	26, 26, 4
	vmrgow	27, 27, 5
	vmrgow	28, 28, 6
	vmrgow	29, 29, 7
	vmrgow	30, 30, 8
	vspltisb 13, 2
	vsld	9, 27, 13
	vsld	10, 28, 13
	vsld	11, 29, 13
	vsld	12, 30, 13
	vaddudm	0, 9, 27
	vaddudm	1, 10, 28
	vaddudm	2, 11, 29
	vaddudm	3, 12, 30

	# r^2 r^4
	xxlor	0, 58, 58
	xxlor	1, 59, 59
	xxlor	2, 60, 60
	xxlor	3, 61, 61
	xxlor	4, 62, 62
	xxlor	5, 32, 32
	xxlor	6, 33, 33
	xxlor	7, 34, 34
	xxlor	8, 35, 35

	vspltw	9, 26, 3
	vspltw	10, 26, 2
	vmrgow	26, 10, 9
	vspltw	9, 27, 3
	vspltw	10, 27, 2
	vmrgow	27, 10, 9
	vspltw	9, 28, 3
	vspltw	10, 28, 2
	vmrgow	28, 10, 9
	vspltw	9, 29, 3
	vspltw	10, 29, 2
	vmrgow	29, 10, 9
	vspltw	9, 30, 3
	vspltw	10, 30, 2
	vmrgow	30, 10, 9

	vsld	9, 27, 13
	vsld	10, 28, 13
	vsld	11, 29, 13
	vsld	12, 30, 13
	vaddudm	0, 9, 27
	vaddudm	1, 10, 28
	vaddudm	2, 11, 29
	vaddudm	3, 12, 30
.endm

SYM_FUNC_START_LOCAL(do_mul)
	mul_odd

	# do reduction ( h %= p )
	# carry reduction
	vspltisb 9, 2
	vsrd	10, 14, 31
	vsrd	11, 17, 31
	vand	7, 17, 25
	vand	4, 14, 25
	vaddudm	18, 18, 11
	vsrd	12, 18, 31
	vaddudm	15, 15, 10

	vsrd	11, 15, 31
	vand	8, 18, 25
	vand	5, 15, 25
	vaddudm	4, 4, 12
	vsld	10, 12, 9
	vaddudm	6, 16, 11

	vsrd	13, 6, 31
	vand	6, 6, 25
	vaddudm	4, 4, 10
	vsrd	10, 4, 31
	vaddudm	7, 7, 13

	vsrd	11, 7, 31
	vand	7, 7, 25
	vand	4, 4, 25
	vaddudm	5, 5, 10
	vaddudm	8, 8, 11
	blr
SYM_FUNC_END(do_mul)
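#
# The carry step at the end of do_mul corresponds to this C sketch (a
# standard partial reduction; the assembly interleaves the same carries in a
# different order for scheduling; names are illustrative only):
#
#	#include <stdint.h>
#
#	static void poly1305_carry_reduce26(uint64_t h[5])
#	{
#		uint64_t c;
#
#		c = h[0] >> 26; h[0] &= 0x3ffffff; h[1] += c;
#		c = h[1] >> 26; h[1] &= 0x3ffffff; h[2] += c;
#		c = h[2] >> 26; h[2] &= 0x3ffffff; h[3] += c;
#		c = h[3] >> 26; h[3] &= 0x3ffffff; h[4] += c;
#		c = h[4] >> 26; h[4] &= 0x3ffffff; h[0] += c * 5; /* 2^130 == 5 mod p */
#		c = h[0] >> 26; h[0] &= 0x3ffffff; h[1] += c;
#	}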

#
# init key
#
.macro do_poly1305_init
	addis	10, 2, rmask@toc@ha
	addi	10, 10, rmask@toc@l

	ld	11, 0(10)
	ld	12, 8(10)

	li	14, 16
	li	15, 32
	addis	10, 2, cnum@toc@ha
	addi	10, 10, cnum@toc@l
	lvx	25, 0, 10	# v25 - mask
	lvx	31, 14, 10	# v31 = 1a
	lvx	19, 15, 10	# v19 = 1 << 24
	lxv	24, 48(10)	# vs24
	lxv	25, 64(10)	# vs25

	# initialize
	# load key from r3 to vectors
	ld	9, 24(3)
	ld	10, 32(3)
	and.	9, 9, 11
	and.	10, 10, 12

	# break 26 bits
	extrdi	14, 9, 26, 38
	extrdi	15, 9, 26, 12
	extrdi	16, 9, 12, 0
	mtvsrdd	58, 0, 14
	insrdi	16, 10, 14, 38
	mtvsrdd	59, 0, 15
	extrdi	17, 10, 26, 24
	mtvsrdd	60, 0, 16
	extrdi	18, 10, 24, 0
	mtvsrdd	61, 0, 17
	mtvsrdd	62, 0, 18

	# r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5
	li	9, 5
	mtvsrdd	36, 0, 9
	vmulouw	0, 27, 4		# v0 = rr0
	vmulouw	1, 28, 4		# v1 = rr1
	vmulouw	2, 29, 4		# v2 = rr2
	vmulouw	3, 30, 4		# v3 = rr3
.endm
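#
# The extrdi/insrdi sequence above is equivalent to this C split of the two
# clamped 64-bit halves of r into five 26-bit limbs (sketch only):
#
#	#include <stdint.h>
#
#	static void poly1305_to_limbs26(uint64_t lo, uint64_t hi, uint64_t l[5])
#	{
#		l[0] = lo & 0x3ffffff;
#		l[1] = (lo >> 26) & 0x3ffffff;
#		l[2] = ((lo >> 52) | (hi << 12)) & 0x3ffffff;
#		l[3] = (hi >> 14) & 0x3ffffff;
#		l[4] = hi >> 40;
#	}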

#
# poly1305_p10le_4blocks(uint8_t *k, uint8_t *m, uint32_t mlen)
#  k = 32-byte key
#  r3 = k (r, s)
#  r4 = m
#  r5 = mlen
#
SYM_FUNC_START(poly1305_p10le_4blocks)
.align 5
	cmpdi	5, 64
	blt	Out_no_poly1305

	SAVE_REGS

	do_poly1305_init

	li	21, 0	# counter to message

	poly1305_setup_r

	# load previous H state
	# break/convert the state to 26 bits
	ld	9, 0(3)
	ld	10, 8(3)
	ld	19, 16(3)
	sldi	19, 19, 24
	mtvsrdd	41, 0, 19
	extrdi	14, 9, 26, 38
	extrdi	15, 9, 26, 12
	extrdi	16, 9, 12, 0
	mtvsrdd	36, 0, 14
	insrdi	16, 10, 14, 38
	mtvsrdd	37, 0, 15
	extrdi	17, 10, 26, 24
	mtvsrdd	38, 0, 16
	extrdi	18, 10, 24, 0
	mtvsrdd	39, 0, 17
	mtvsrdd	40, 0, 18
	vor	8, 8, 9

	# input m1 m2
	add	20, 4, 21
	xxlor	49, 24, 24
	xxlor	50, 25, 25
	lxvw4x	43, 0, 20
	addi	17, 20, 16
	lxvw4x	44, 0, 17
	vperm	14, 11, 12, 17
	vperm	15, 11, 12, 18
	vand	9, 14, 25	# a0
	vsrd	10, 14, 31	# >> 26
	vsrd	11, 10, 31	# 12 bits left
	vand	10, 10, 25	# a1
	vspltisb 13, 12
	vand	16, 15, 25
	vsld	12, 16, 13
	vor	11, 11, 12
	vand	11, 11, 25	# a2
	vspltisb 13, 14
	vsrd	12, 15, 13	# >> 14
	vsrd	13, 12, 31	# >> 26, a4
	vand	12, 12, 25	# a3

	vaddudm	20, 4, 9
	vaddudm	21, 5, 10
	vaddudm	22, 6, 11
	vaddudm	23, 7, 12
	vaddudm	24, 8, 13

	# m3 m4
	addi	17, 17, 16
	lxvw4x	43, 0, 17
	addi	17, 17, 16
	lxvw4x	44, 0, 17
	vperm	14, 11, 12, 17
	vperm	15, 11, 12, 18
	vand	9, 14, 25	# a0
	vsrd	10, 14, 31	# >> 26
	vsrd	11, 10, 31	# 12 bits left
	vand	10, 10, 25	# a1
	vspltisb 13, 12
	vand	16, 15, 25
	vsld	12, 16, 13
	vspltisb 13, 14
	vor	11, 11, 12
	vand	11, 11, 25	# a2
	vsrd	12, 15, 13	# >> 14
	vsrd	13, 12, 31	# >> 26, a4
	vand	12, 12, 25	# a3

	# Smash 4 message blocks into 5 vectors of [m4,  m2,  m3,  m1]
	vmrgow	4, 9, 20
	vmrgow	5, 10, 21
	vmrgow	6, 11, 22
	vmrgow	7, 12, 23
	vmrgow	8, 13, 24
	vaddudm	8, 8, 19

	addi	5, 5, -64	# len -= 64
	addi	21, 21, 64	# offset += 64

	li      9, 64
	divdu   31, 5, 9

	cmpdi	31, 0
	ble	Skip_block_loop

	mtctr	31

# h4 =   m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
# Rewrite the polynomial sum of products as follows:
# h1 = (h0 + m1) * r^2,	h2 = (h0 + m2) * r^2
# h3 = (h1 + m3) * r^2,	h4 = (h2 + m4) * r^2  --> (h0 + m1) * r^4 + m3 * r^2,  (h0 + m2) * r^4 + m4 * r^2
#  .... Repeat
# h5 = (h3 + m5) * r^2,	h6 = (h4 + m6) * r^2  -->
# h7 = (h5 + m7) * r^2,	h8 = (h6 + m8) * r^1  --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r
#
loop_4blocks:

	# Multiply odd words and even words
	mul_odd
	mul_even
	# carry reduction
	vspltisb 9, 2
	vsrd	10, 14, 31
	vsrd	11, 17, 31
	vand	7, 17, 25
	vand	4, 14, 25
	vaddudm	18, 18, 11
	vsrd	12, 18, 31
	vaddudm	15, 15, 10

	vsrd	11, 15, 31
	vand	8, 18, 25
	vand	5, 15, 25
	vaddudm	4, 4, 12
	vsld	10, 12, 9
	vaddudm	6, 16, 11

	vsrd	13, 6, 31
	vand	6, 6, 25
	vaddudm	4, 4, 10
	vsrd	10, 4, 31
	vaddudm	7, 7, 13

	vsrd	11, 7, 31
	vand	7, 7, 25
	vand	4, 4, 25
	vaddudm	5, 5, 10
	vaddudm	8, 8, 11

	# input m1  m2  m3  m4
	add	20, 4, 21
	xxlor	49, 24, 24
	xxlor	50, 25, 25
	lxvw4x	43, 0, 20
	addi	17, 20, 16
	lxvw4x	44, 0, 17
	vperm	14, 11, 12, 17
	vperm	15, 11, 12, 18
	addi	17, 17, 16
	lxvw4x	43, 0, 17
	addi	17, 17, 16
	lxvw4x	44, 0, 17
	vperm	17, 11, 12, 17
	vperm	18, 11, 12, 18

	vand	20, 14, 25	# a0
	vand	9, 17, 25	# a0
	vsrd	21, 14, 31	# >> 26
	vsrd	22, 21, 31	# 12 bits left
	vsrd	10, 17, 31	# >> 26
	vsrd	11, 10, 31	# 12 bits left

	vand	21, 21, 25	# a1
	vand	10, 10, 25	# a1

	vspltisb 13, 12
	vand	16, 15, 25
	vsld	23, 16, 13
	vor	22, 22, 23
	vand	22, 22, 25	# a2
	vand	16, 18, 25
	vsld	12, 16, 13
	vor	11, 11, 12
	vand	11, 11, 25	# a2
	vspltisb 13, 14
	vsrd	23, 15, 13	# >> 14
	vsrd	24, 23, 31	# >> 26, a4
	vand	23, 23, 25	# a3
	vsrd	12, 18, 13	# >> 14
	vsrd	13, 12, 31	# >> 26, a4
	vand	12, 12, 25	# a3

	vaddudm	4, 4, 20
	vaddudm	5, 5, 21
	vaddudm	6, 6, 22
	vaddudm	7, 7, 23
	vaddudm	8, 8, 24

	# Smash 4 message blocks into 5 vectors of [m4,  m2,  m3,  m1]
	vmrgow	4, 9, 4
	vmrgow	5, 10, 5
	vmrgow	6, 11, 6
	vmrgow	7, 12, 7
	vmrgow	8, 13, 8
	vaddudm	8, 8, 19

	addi	5, 5, -64	# len -= 64
	addi	21, 21, 64	# offset += 64

	bdnz	loop_4blocks

Skip_block_loop:
	xxlor	58, 0, 0
	xxlor	59, 1, 1
	xxlor	60, 2, 2
	xxlor	61, 3, 3
	xxlor	62, 4, 4
	xxlor	32, 5, 5
	xxlor	33, 6, 6
	xxlor	34, 7, 7
	xxlor	35, 8, 8

	# Multiply odd words and even words
	mul_odd
	mul_even

	# Sum the products.
	xxpermdi 41, 31, 46, 0
	xxpermdi 42, 31, 47, 0
	vaddudm	4, 14, 9
	xxpermdi 36, 31, 36, 3
	vaddudm	5, 15, 10
	xxpermdi 37, 31, 37, 3
	xxpermdi 43, 31, 48, 0
	vaddudm	6, 16, 11
	xxpermdi 38, 31, 38, 3
	xxpermdi 44, 31, 49, 0
	vaddudm	7, 17, 12
	xxpermdi 39, 31, 39, 3
	xxpermdi 45, 31, 50, 0
	vaddudm	8, 18, 13
	xxpermdi 40, 31, 40, 3

	# carry reduction
	vspltisb 9, 2
	vsrd	10, 4, 31
	vsrd	11, 7, 31
	vand	7, 7, 25
	vand	4, 4, 25
	vaddudm	8, 8, 11
	vsrd	12, 8, 31
	vaddudm	5, 5, 10

	vsrd	11, 5, 31
	vand	8, 8, 25
	vand	5, 5, 25
	vaddudm	4, 4, 12
	vsld	10, 12, 9
	vaddudm	6, 6, 11

	vsrd	13, 6, 31
	vand	6, 6, 25
	vaddudm	4, 4, 10
	vsrd	10, 4, 31
	vaddudm	7, 7, 13

	vsrd	11, 7, 31
	vand	7, 7, 25
	vand	4, 4, 25
	vaddudm	5, 5, 10
	vsrd	10, 5, 31
	vand	5, 5, 25
	vaddudm	6, 6, 10
	vaddudm	8, 8, 11

	b	do_final_update

do_final_update:
	# combine 26 bit limbs
	# v4, v5, v6, v7 and v8 are 26 bit vectors
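	# The vsld/vor sequence below repacks the limbs as in this C sketch
	# (illustrative only):
	#
	#	h0 = l0 | (l1 << 26) | (l2 << 52);
	#	h1 = (l2 >> 12) | (l3 << 14) | (l4 << 40);
	#	h2 = l4 >> 24;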
	vsld	5, 5, 31
	vor	20, 4, 5
	vspltisb 11, 12
	vsrd	12, 6, 11
	vsld	6, 6, 31
	vsld	6, 6, 31
	vor	20, 20, 6
	vspltisb 11, 14
	vsld	7, 7, 11
	vor	21, 7, 12
	mfvsrld	16, 40		# save the top limb for h2
	vsld	8, 8, 11
	vsld	8, 8, 31
	vor	21, 21, 8
	mfvsrld	17, 52
	mfvsrld	19, 53
	srdi	16, 16, 24

	std	17, 0(3)
	std	19, 8(3)
	stw	16, 16(3)

Out_loop:
	li	3, 0

	RESTORE_REGS

	blr

Out_no_poly1305:
	li	3, 0
	blr
SYM_FUNC_END(poly1305_p10le_4blocks)

#
# =======================================================================
# The following functions implement poly1305 using 64 x 64 bit multiplications.
#
SYM_FUNC_START_LOCAL(Poly1305_init_64)
	#  mask 0x0FFFFFFC0FFFFFFC
	#  mask 0x0FFFFFFC0FFFFFFF
	addis	10, 2, rmask@toc@ha
	addi	10, 10, rmask@toc@l
	ld	11, 0(10)
	ld	12, 8(10)

	# initialize
	# load key from r3
	ld	9, 24(3)
	ld	10, 32(3)
	and.	9, 9, 11	# clamp mask r0
	and.	10, 10, 12	# clamp mask r1

	srdi	21, 10, 2
	add	19, 21, 10	# s1: r19 = r1 + (r1 >> 2), i.e. (r1 >> 2) * 5

	# setup r and s
	li	25, 0
	mtvsrdd 32+0, 9, 19	# r0, s1
	mtvsrdd 32+1, 10, 9	# r1, r0
	mtvsrdd 32+2, 19, 25	# s1
	mtvsrdd 32+3, 9, 25	# r0

	blr
SYM_FUNC_END(Poly1305_init_64)

# Poly1305_mult
# v6 = (h0, h1), v8 = h2
# v0 = (r0, s1), v1 = (r1, r0), v2 = s1, v3 = r0
#
# Output: v7, v10, v11
#
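# The three vmsumudm sums below compute, in C terms (reference sketch only,
# with h = (h0, h1, h2), r = (r0, r1) and s1 = r1 + (r1 >> 2)):
#
#	#include <stdint.h>
#
#	typedef unsigned __int128 u128;
#
#	static void poly1305_mult_ref(const uint64_t h[3], uint64_t r0,
#				      uint64_t r1, uint64_t s1, u128 d[3])
#	{
#		d[0] = (u128)h[0] * r0 + (u128)h[1] * s1;
#		d[1] = (u128)h[0] * r1 + (u128)h[1] * r0 + (u128)h[2] * s1;
#		d[2] = (u128)h[2] * r0;
#	}
#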
SYM_FUNC_START_LOCAL(Poly1305_mult)
	#
	#	d0 = h0 * r0 + h1 * s1
	vmsumudm	7, 6, 0, 9		# h0 * r0, h1 * s1

	#	d1 = h0 * r1 + h1 * r0 + h2 * s1
	vmsumudm	11, 6, 1, 9		# h0 * r1, h1 * r0
	vmsumudm	10, 8, 2, 11		# d1 += h2 * s1

	#	d2 = h2 * r0
	vmsumudm	11, 8, 3, 9		# d2 = h2 * r0
	blr
SYM_FUNC_END(Poly1305_mult)

#
# carry reduction
# h %= p
#
# Input: v7, v10, v11
# Output: r27, r28, r29
#
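# A C reference for the reduction below (sketch only; it propagates the final
# carry one limb further than the assembly, which relies on the bounds of
# in-range inputs):
#
#	#include <stdint.h>
#
#	typedef unsigned __int128 u128;
#
#	static void poly1305_reduce_ref(u128 d0, u128 d1, u128 d2, uint64_t h[3])
#	{
#		uint64_t h0, h1, h2, c;
#		u128 t;
#
#		h0 = (uint64_t)d0;
#		t  = d1 + (uint64_t)(d0 >> 64);
#		h1 = (uint64_t)t;
#		h2 = (uint64_t)d2 + (uint64_t)(d1 >> 64) + (uint64_t)(t >> 64);
#
#		/* fold the bits above 2^130 back in: 2^130 == 5 mod p */
#		c  = (h2 >> 2) * 5;
#		h2 &= 3;
#		t  = (u128)h0 + c;
#		h0 = (uint64_t)t;
#		t  = (u128)h1 + (uint64_t)(t >> 64);
#		h1 = (uint64_t)t;
#		h2 += (uint64_t)(t >> 64);
#
#		h[0] = h0; h[1] = h1; h[2] = h2;
#	}
#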
SYM_FUNC_START_LOCAL(Carry_reduction)
	mfvsrld	27, 32+7
	mfvsrld	28, 32+10
	mfvsrld	29, 32+11
	mfvsrd	20, 32+7	# h0.h
	mfvsrd	21, 32+10	# h1.h

	addc	28, 28, 20
	adde	29, 29, 21
	srdi	22, 29, 0x2
	sldi	23, 22, 0x2
	add	23, 23, 22	# (h2 >> 2) * 5
	addc	27, 27, 23	# h0
	addze	28, 28		# h1
	andi.	29, 29, 0x3	# h2
	blr
SYM_FUNC_END(Carry_reduction)

#
# poly1305 multiplication
# h *= r, h %= p
#	d0 = h0 * r0 + h1 * s1
#	d1 = h0 * r1 + h1 * r0 + h2 * s1
#	d2 = h2 * r0
#
#
# unsigned int poly1305_64s(unsigned char *state, const byte *src, size_t len, highbit)
#   - no highbit if final leftover block (highbit = 0)
#
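# Reference outline of the block loop below (sketch only; on a little-endian
# host, using the hypothetical poly1305_mult_ref()/poly1305_reduce_ref()
# helpers sketched above):
#
#	#include <stddef.h>
#	#include <stdint.h>
#	#include <string.h>
#
#	static void poly1305_blocks_ref(uint64_t h[3], const uint8_t *src,
#					size_t len, uint64_t r0, uint64_t r1,
#					uint64_t s1, uint64_t highbit)
#	{
#		while (len >= 16) {
#			uint64_t m0, m1;
#			unsigned __int128 d[3], t;
#
#			memcpy(&m0, src, 8);
#			memcpy(&m1, src + 8, 8);
#
#			/* h += m, plus the 2^128 pad bit for full blocks */
#			t = (unsigned __int128)h[0] + m0;
#			h[0] = (uint64_t)t;
#			t = (unsigned __int128)h[1] + m1 + (uint64_t)(t >> 64);
#			h[1] = (uint64_t)t;
#			h[2] += highbit + (uint64_t)(t >> 64);
#
#			poly1305_mult_ref(h, r0, r1, s1, d);	/* h *= r */
#			poly1305_reduce_ref(d[0], d[1], d[2], h);
#
#			src += 16;
#			len -= 16;
#		}
#	}
#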
SYM_FUNC_START(poly1305_64s)
	cmpdi	5, 0
	ble	Out_no_poly1305_64

	mflr 0
	std 0, 16(1)
	stdu 1,-400(1)

	SAVE_GPR 14, 112, 1
	SAVE_GPR 15, 120, 1
	SAVE_GPR 16, 128, 1
	SAVE_GPR 17, 136, 1
	SAVE_GPR 18, 144, 1
	SAVE_GPR 19, 152, 1
	SAVE_GPR 20, 160, 1
	SAVE_GPR 21, 168, 1
	SAVE_GPR 22, 176, 1
	SAVE_GPR 23, 184, 1
	SAVE_GPR 24, 192, 1
	SAVE_GPR 25, 200, 1
	SAVE_GPR 26, 208, 1
	SAVE_GPR 27, 216, 1
	SAVE_GPR 28, 224, 1
	SAVE_GPR 29, 232, 1
	SAVE_GPR 30, 240, 1
	SAVE_GPR 31, 248, 1

	# Init poly1305
	bl Poly1305_init_64

	li 25, 0			# offset to inp and outp

	add 11, 25, 4

	# load h
	# h0, h1, h2
	ld	27, 0(3)
	ld	28, 8(3)
	lwz	29, 16(3)

	li	30, 16
	divdu	31, 5, 30

	mtctr	31

	mr	24, 6		# highbit

Loop_block_64:
	vxor	9, 9, 9

	ld	20, 0(11)
	ld	21, 8(11)
	addi	11, 11, 16

	addc	27, 27, 20
	adde	28, 28, 21
	adde	29, 29, 24

	li	22, 0
	mtvsrdd	32+6, 27, 28	# h0, h1
	mtvsrdd	32+8, 29, 22	# h2

	bl	Poly1305_mult

	bl	Carry_reduction

	bdnz	Loop_block_64

	std	27, 0(3)
	std	28, 8(3)
	stw	29, 16(3)

	li	3, 0

	RESTORE_GPR 14, 112, 1
	RESTORE_GPR 15, 120, 1
	RESTORE_GPR 16, 128, 1
	RESTORE_GPR 17, 136, 1
	RESTORE_GPR 18, 144, 1
	RESTORE_GPR 19, 152, 1
	RESTORE_GPR 20, 160, 1
	RESTORE_GPR 21, 168, 1
	RESTORE_GPR 22, 176, 1
	RESTORE_GPR 23, 184, 1
	RESTORE_GPR 24, 192, 1
	RESTORE_GPR 25, 200, 1
	RESTORE_GPR 26, 208, 1
	RESTORE_GPR 27, 216, 1
	RESTORE_GPR 28, 224, 1
	RESTORE_GPR 29, 232, 1
	RESTORE_GPR 30, 240, 1
	RESTORE_GPR 31, 248, 1

	addi    1, 1, 400
	ld 0, 16(1)
	mtlr 0

	blr

Out_no_poly1305_64:
	li	3, 0
	blr
SYM_FUNC_END(poly1305_64s)

#
# Input: r3 = h, r4 = s, r5 = mac
# mac = h + s
#
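# C reference for the final step below (sketch only, little-endian host):
# fully reduce h mod 2^130 - 5 by checking whether h + 5 carries past bit
# 129, then add s and keep the low 128 bits.
#
#	#include <stdint.h>
#	#include <string.h>
#
#	typedef unsigned __int128 u128;
#
#	static void poly1305_emit_ref(const uint64_t h[3], const uint64_t s[2],
#				      uint8_t mac[16])
#	{
#		uint64_t h0 = h[0], h1 = h[1], h2 = h[2], g0, g1, g2;
#		u128 t;
#
#		t  = (u128)h0 + 5;
#		g0 = (uint64_t)t;
#		t  = (u128)h1 + (uint64_t)(t >> 64);
#		g1 = (uint64_t)t;
#		g2 = h2 + (uint64_t)(t >> 64);
#		if (g2 >> 2) {		/* h >= p, so use h - p = h + 5 - 2^130 */
#			h0 = g0;
#			h1 = g1;
#		}
#
#		t  = (u128)h0 + s[0];	/* mac = (h + s) mod 2^128 */
#		h0 = (uint64_t)t;
#		h1 = h1 + s[1] + (uint64_t)(t >> 64);
#
#		memcpy(mac, &h0, 8);
#		memcpy(mac + 8, &h1, 8);
#	}
#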
SYM_FUNC_START(poly1305_emit_64)
	ld	10, 0(3)
	ld	11, 8(3)
	ld	12, 16(3)

	# compare modulus
	# h + 5 + (-p)
	mr	6, 10
	mr	7, 11
	mr	8, 12
	addic.	6, 6, 5
	addze	7, 7
	addze	8, 8
	srdi	9, 8, 2		# overflow?
	cmpdi	9, 0
	beq	Skip_h64
	mr	10, 6
	mr	11, 7
	mr	12, 8

Skip_h64:
	ld	6, 0(4)
	ld	7, 8(4)
	addc	10, 10, 6
	adde	11, 11, 7
	addze	12, 12

	std	10, 0(5)
	std	11, 8(5)
	blr
SYM_FUNC_END(poly1305_emit_64)

SYM_DATA_START_LOCAL(RMASK)
.align 5
rmask:
.byte	0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f
cnum:
.long	0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000
.long	0x1a, 0x00, 0x1a, 0x00
.long	0x01000000, 0x01000000, 0x01000000, 0x01000000
.long	0x00010203, 0x04050607, 0x10111213, 0x14151617
.long	0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f
SYM_DATA_END(RMASK)