/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Core of the accelerated CRC algorithm.
 * Define the constants and CRC_FUNCTION_NAME in your file, then include
 * this file.
 *
 * Calculate the checksum of data that is 16 byte aligned and a multiple of
 * 16 bytes.
 *
 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
 * chunks in order to mask the latency of the vpmsum instructions. If we
 * have more than 32 kB of data to checksum we repeat this step multiple
 * times, passing in the previous 1024 bits.
 *
 * The next step is to reduce the 1024 bits to 64 bits. This step adds
 * 32 bits of 0s to the end - this matches what a CRC does. We just
 * calculate constants that land the data in these 32 bits.
 *
 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
 * for n = CRC using POWER8 instructions. We use x = 32.
 *
 * https://en.wikipedia.org/wiki/Barrett_reduction
 *
 * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
 */

#include <asm/ppc_asm.h>
#include <asm/ppc-opcode.h>

#define MAX_SIZE	32768

	.text

#if defined(__BIG_ENDIAN__) && defined(REFLECT)
#define BYTESWAP_DATA
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define BYTESWAP_DATA
#else
#undef BYTESWAP_DATA
#endif
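/*
 * The bit-reflected variant consumes its input least significant bit
 * first, which matches little-endian byte order; the non-reflected
 * variant wants big-endian order. When the host byte order does not
 * match, the data is byte swapped with vperm as it is loaded.
 */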

#define off16		r25
#define off32		r26
#define off48		r27
#define off64		r28
#define off80		r29
#define off96		r30
#define off112		r31

#define const1		v24
#define const2		v25

#define byteswap	v26
#define	mask_32bit	v27
#define	mask_64bit	v28
#define zeroes		v29

#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm	A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif

/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
FUNC_START(CRC_FUNCTION_NAME)
	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)
	std	r26,-48(r1)
	std	r25,-56(r1)

	li	off16,16
	li	off32,32
	li	off48,48
	li	off64,64
	li	off80,80
	li	off96,96
	li	off112,112
	li	r0,0

	/* Enough room for saving 10 non-volatile VMX registers */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16
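	/*
	 * The GPRs above and the VMX registers below are saved in the space
	 * under the stack pointer without creating a stack frame;
	 * 56 + 10*16 = 216 bytes, which should fit within the protected
	 * area the 64-bit ELF ABI leaves below r1.
	 */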

	stvx	v20,0,r6
	stvx	v21,off16,r6
	stvx	v22,off32,r6
	stvx	v23,off48,r6
	stvx	v24,off64,r6
	stvx	v25,off80,r6
	stvx	v26,off96,r6
	stvx	v27,off112,r6
	stvx	v28,0,r7
	stvx	v29,off16,r7

	mr	r10,r3

	vxor	zeroes,zeroes,zeroes
	vspltisw v0,-1

	vsldoi	mask_32bit,zeroes,v0,4
	vsldoi	mask_64bit,zeroes,v0,8
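	/*
	 * mask_32bit/mask_64bit select the least significant 32/64 bits of
	 * a vector register; they are built by shifting an all-ones vector
	 * into the bottom 4 or 8 bytes.
	 */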

	/* Get the initial value into v8 */
	vxor	v8,v8,v8
	MTVRD(v8, R3)
#ifdef REFLECT
	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
#else
	vsldoi	v8,v8,zeroes,4	/* shift into top 32 bits */
#endif
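	/*
	 * The initial CRC is positioned so that xoring it into the first
	 * 16 bytes of data (below) lines it up with the start of the
	 * message: the low 32 bits for the bit-reflected variant, the
	 * high 32 bits otherwise.
	 */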

#ifdef BYTESWAP_DATA
	addis	r3,r2,.byteswap_constant@toc@ha
	addi	r3,r3,.byteswap_constant@toc@l

	lvx	byteswap,0,r3
	addi	r3,r3,16
#endif

	cmpdi	r5,256
	blt	.Lshort

	rldicr	r6,r5,0,56

	/* Checksum in blocks of MAX_SIZE */
1:	lis	r7,MAX_SIZE@h
	ori	r7,r7,MAX_SIZE@l
	mr	r9,r7
	cmpd	r6,r7
	bgt	2f
	mr	r7,r6
2:	subf	r6,r7,r6

	/* Our main loop does 128 bytes at a time */
	srdi	r7,r7,7

	/*
	 * Work out the offset into the constants table to start at. Each
	 * 16-byte constant is used against 128 bytes of input data, so the
	 * full table is MAX_SIZE / 8 bytes long (128 / 16 = 8).
	 */
	sldi	r8,r7,4
	srdi	r9,r9,3
	subf	r8,r8,r9
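	/*
	 * Example, with MAX_SIZE = 32768: the table holds 32768 / 128 = 256
	 * constants, i.e. 4096 bytes. A 1 kB block (8 chunks) starts at
	 * offset 4096 - 8*16 = 3968, so the final 128-byte chunk is always
	 * paired with the last constant in the table.
	 */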

	/* We reduce our final 128 bytes in a separate step */
	addi	r7,r7,-1
	mtctr	r7

	addis	r3,r2,.constants@toc@ha
	addi	r3,r3,.constants@toc@l

	/* Find the start of our constants */
	add	r3,r3,r8

	/* zero v0-v7 which will contain our checksums */
	vxor	v0,v0,v0
	vxor	v1,v1,v1
	vxor	v2,v2,v2
	vxor	v3,v3,v3
	vxor	v4,v4,v4
	vxor	v5,v5,v5
	vxor	v6,v6,v6
	vxor	v7,v7,v7

	lvx	const1,0,r3

	/*
	 * If we are looping back to consume more data we use the values
	 * already in v16-v23.
	 */
	cmpdi	r0,1
	beq	2f

	/* First warm up pass */
	lvx	v16,0,r4
	lvx	v17,off16,r4
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)
	lvx	v18,off32,r4
	lvx	v19,off48,r4
	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)
	lvx	v20,off64,r4
	lvx	v21,off80,r4
	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)
	lvx	v22,off96,r4
	lvx	v23,off112,r4
	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)
	addi	r4,r4,8*16

	/* xor in initial value */
	vxor	v16,v16,v8

2:	bdz	.Lfirst_warm_up_done

	addi	r3,r3,16
	lvx	const2,0,r3

	/* Second warm up pass */
	VPMSUMD(v8,v16,const1)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	VPMSUMD(v9,v17,const1)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	VPMSUMD(v10,v18,const1)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	VPMSUMD(v11,v19,const1)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	ori	r2,r2,0

	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdz	.Lfirst_cool_down

	/*
	 * main loop. We modulo schedule it such that it takes three iterations
	 * to complete - first iteration load, second iteration vpmsum, third
	 * iteration xor.
	 */
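	/*
	 * Note: the "ori r2,r2,0" instructions sprinkled through the warm
	 * up passes and the main loop are no-ops; they appear to be there
	 * purely as scheduling/dispatch-grouping hints for POWER8 and have
	 * no architectural effect.
	 */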
	.balign	16
4:	lvx	const1,0,r3
	addi	r3,r3,16
	ori	r2,r2,0

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const2)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const2)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const2)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const2)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	lvx	const2,0,r3
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdnz	4b

.Lfirst_cool_down:
	/* First cool down pass */
	lvx	const1,0,r3
	addi	r3,r3,16

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const1)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const1)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const1)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const1)
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	ori	r2,r2,0

.Lsecond_cool_down:
	/* Second cool down pass */
	vxor	v0,v0,v8
	vxor	v1,v1,v9
	vxor	v2,v2,v10
	vxor	v3,v3,v11
	vxor	v4,v4,v12
	vxor	v5,v5,v13
	vxor	v6,v6,v14
	vxor	v7,v7,v15
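	/*
	 * v0-v7 now hold the full 8 x 128 = 1024 bit running remainder for
	 * this block.
	 */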

#ifdef REFLECT
	/*
	 * vpmsumd produces a 96 bit result in the least significant bits
	 * of the register. Since we are bit reflected we have to shift it
	 * left 32 bits so it occupies the least significant bits in the
	 * bit reflected domain.
	 */
	vsldoi	v0,v0,zeroes,4
	vsldoi	v1,v1,zeroes,4
	vsldoi	v2,v2,zeroes,4
	vsldoi	v3,v3,zeroes,4
	vsldoi	v4,v4,zeroes,4
	vsldoi	v5,v5,zeroes,4
	vsldoi	v6,v6,zeroes,4
	vsldoi	v7,v7,zeroes,4
#endif

	/* xor with last 1024 bits */
	lvx	v8,0,r4
	lvx	v9,off16,r4
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)
	lvx	v10,off32,r4
	lvx	v11,off48,r4
	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)
	lvx	v12,off64,r4
	lvx	v13,off80,r4
	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)
	lvx	v14,off96,r4
	lvx	v15,off112,r4
	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)

	addi	r4,r4,8*16

	vxor	v16,v0,v8
	vxor	v17,v1,v9
	vxor	v18,v2,v10
	vxor	v19,v3,v11
	vxor	v20,v4,v12
	vxor	v21,v5,v13
	vxor	v22,v6,v14
	vxor	v23,v7,v15

	li	r0,1
	cmpdi	r6,0
	addi	r6,r6,128
	bne	1b
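	/*
	 * No more MAX_SIZE blocks: fall through with the 1024-bit remainder
	 * in v16-v23 (r0 = 1 would have told the next iteration to reuse
	 * them instead of reloading).
	 */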

	/* Work out how many bytes we have left */
	andi.	r5,r5,127

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,128
	add	r3,r3,r6

	/* How many 16 byte chunks are in the tail */
	srdi	r7,r5,4
	mtctr	r7

	/*
	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
	 * 32 bits to include the trailing 32 bits of zeros
	 */
	lvx	v0,0,r3
	lvx	v1,off16,r3
	lvx	v2,off32,r3
	lvx	v3,off48,r3
	lvx	v4,off64,r3
	lvx	v5,off80,r3
	lvx	v6,off96,r3
	lvx	v7,off112,r3
	addi	r3,r3,8*16

	VPMSUMW(v0,v16,v0)
	VPMSUMW(v1,v17,v1)
	VPMSUMW(v2,v18,v2)
	VPMSUMW(v3,v19,v3)
	VPMSUMW(v4,v20,v4)
	VPMSUMW(v5,v21,v5)
	VPMSUMW(v6,v22,v6)
	VPMSUMW(v7,v23,v7)
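	/*
	 * Each vpmsumw forms, per 64-bit half, the xor of two 32 x 32 -> 64
	 * bit carry-less products, folding every 32-bit word of the 1024-bit
	 * remainder against a constant chosen for its distance from the end
	 * of the message.
	 */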

	/* Now reduce the tail (0 - 112 bytes) */
	cmpdi	r7,0
	beq	1f

	lvx	v16,0,r4
	lvx	v17,0,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off16,r4
	lvx	v17,off16,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off32,r4
	lvx	v17,off32,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off48,r4
	lvx	v17,off48,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off64,r4
	lvx	v17,off64,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off80,r4
	lvx	v17,off80,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off96,r4
	lvx	v17,off96,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16

	/* Now xor all the parallel chunks together */
1:	vxor	v0,v0,v1
	vxor	v2,v2,v3
	vxor	v4,v4,v5
	vxor	v6,v6,v7

	vxor	v0,v0,v2
	vxor	v4,v4,v6

	vxor	v0,v0,v4
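	/*
	 * v0 now holds the folded result; its two 64-bit halves are xored
	 * together at the start of the Barrett reduction below.
	 */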

.Lbarrett_reduction:
	/* Barrett constants */
	addis	r3,r2,.barrett_constants@toc@ha
	addi	r3,r3,.barrett_constants@toc@l

	lvx	const1,0,r3
	lvx	const2,off16,r3

	vsldoi	v1,v0,v0,8
	vxor	v0,v0,v1		/* xor two 64 bit results together */

#ifdef REFLECT
	/* shift left one bit */
	vspltisb v1,1
	vsl	v0,v0,v1
#endif
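	/*
	 * Reflected carry-less products come out one bit short (a 64 x 64
	 * product only occupies 127 bits), which is apparently why the
	 * reflected path shifts the accumulated value left by one bit here
	 * before the reduction.
	 */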

	vand	v0,v0,mask_64bit
#ifndef REFLECT
	/*
	 * Now for the Barrett reduction algorithm. The idea is to calculate q,
	 * the multiple of our polynomial that we need to subtract. By
	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
	 * result back down 2x bits, we round down to the nearest multiple.
	 */
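	/*
	 * Illustrative sketch (m is the precomputed Barrett constant in
	 * const1, n is the CRC polynomial in const2; all multiplies are
	 * carry-less and subtraction is xor):
	 *
	 *	q   = floor((a * m) / 2^64)
	 *	crc = a - q * n
	 */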
	VPMSUMD(v1,v0,const1)	/* ma */
	vsldoi	v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Get the result into r3. We need to shift it left 8 bytes:
	 * V0 [ 0 1 2 X ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,8	/* shift result into top 64 bits */
#else
	/*
	 * The reflected version of Barrett reduction. Instead of bit
	 * reflecting our data (which is expensive to do), we bit reflect our
	 * constants and our algorithm, which means the intermediate data in
	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
	 * the algorithm because we don't carry in mod 2 arithmetic.
	 */
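	/*
	 * Illustrative sketch of the reflected variant (m' and n' are the
	 * bit-reflected constants in const1 and const2):
	 *
	 *	q   = low32(low32(a) * m')
	 *	crc = a xor (q * n')
	 */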
	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
	VPMSUMD(v1,v1,const1)		/* ma */
	vand	v1,v1,mask_32bit	/* bottom 32 bits of ma */
	VPMSUMD(v1,v1,const2)		/* qn */
	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Since we are bit reflected, the result (ie the low 32 bits) is in
	 * the high 32 bits. We just need to shift it left 4 bytes
	 * V0 [ 0 1 X 3 ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,4		/* shift result into top 64 bits */
#endif

	/* Get it into r3 */
	MFVRD(R3, v0)

.Lout:
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	lvx	v20,0,r6
	lvx	v21,off16,r6
	lvx	v22,off32,r6
	lvx	v23,off48,r6
	lvx	v24,off64,r6
	lvx	v25,off80,r6
	lvx	v26,off96,r6
	lvx	v27,off112,r6
	lvx	v28,0,r7
	lvx	v29,off16,r7

	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	ld	r26,-48(r1)
	ld	r25,-56(r1)

	blr
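
	/*
	 * Reached when the counter expires at the first check, i.e. this
	 * block contains only two 128-byte chunks: fold v16-v23 once and
	 * join the common cool down path.
	 */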
.Lfirst_warm_up_done:
	lvx	const1,0,r3
	addi	r3,r3,16

	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	b	.Lsecond_cool_down

.Lshort:
	cmpdi	r5,0
	beq	.Lzero

	addis	r3,r2,.short_constants@toc@ha
	addi	r3,r3,.short_constants@toc@l

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,256
	add	r3,r3,r6

	/* How many 16 byte chunks? */
	srdi	r7,r5,4
	mtctr	r7

	vxor	v19,v19,v19
	vxor	v20,v20,v20
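	/*
	 * v19 and v20 accumulate the folded 16-byte chunks (alternating
	 * between the two) and are combined into v0 before the Barrett step.
	 */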

	lvx	v0,0,r4
	lvx	v16,0,r3
	VPERM(v0,v0,v16,byteswap)
	vxor	v0,v0,v8	/* xor in initial value */
	VPMSUMW(v0,v0,v16)
	bdz	.Lv0

	lvx	v1,off16,r4
	lvx	v17,off16,r3
	VPERM(v1,v1,v17,byteswap)
	VPMSUMW(v1,v1,v17)
	bdz	.Lv1

	lvx	v2,off32,r4
	lvx	v16,off32,r3
	VPERM(v2,v2,v16,byteswap)
	VPMSUMW(v2,v2,v16)
	bdz	.Lv2

	lvx	v3,off48,r4
	lvx	v17,off48,r3
	VPERM(v3,v3,v17,byteswap)
	VPMSUMW(v3,v3,v17)
	bdz	.Lv3

	lvx	v4,off64,r4
	lvx	v16,off64,r3
	VPERM(v4,v4,v16,byteswap)
	VPMSUMW(v4,v4,v16)
	bdz	.Lv4

	lvx	v5,off80,r4
	lvx	v17,off80,r3
	VPERM(v5,v5,v17,byteswap)
	VPMSUMW(v5,v5,v17)
	bdz	.Lv5

	lvx	v6,off96,r4
	lvx	v16,off96,r3
	VPERM(v6,v6,v16,byteswap)
	VPMSUMW(v6,v6,v16)
	bdz	.Lv6

	lvx	v7,off112,r4
	lvx	v17,off112,r3
	VPERM(v7,v7,v17,byteswap)
	VPMSUMW(v7,v7,v17)
	bdz	.Lv7

	addi	r3,r3,128
	addi	r4,r4,128

	lvx	v8,0,r4
	lvx	v16,0,r3
	VPERM(v8,v8,v16,byteswap)
	VPMSUMW(v8,v8,v16)
	bdz	.Lv8

	lvx	v9,off16,r4
	lvx	v17,off16,r3
	VPERM(v9,v9,v17,byteswap)
	VPMSUMW(v9,v9,v17)
	bdz	.Lv9

	lvx	v10,off32,r4
	lvx	v16,off32,r3
	VPERM(v10,v10,v16,byteswap)
	VPMSUMW(v10,v10,v16)
	bdz	.Lv10

	lvx	v11,off48,r4
	lvx	v17,off48,r3
	VPERM(v11,v11,v17,byteswap)
	VPMSUMW(v11,v11,v17)
	bdz	.Lv11

	lvx	v12,off64,r4
	lvx	v16,off64,r3
	VPERM(v12,v12,v16,byteswap)
	VPMSUMW(v12,v12,v16)
	bdz	.Lv12

	lvx	v13,off80,r4
	lvx	v17,off80,r3
	VPERM(v13,v13,v17,byteswap)
	VPMSUMW(v13,v13,v17)
	bdz	.Lv13

	lvx	v14,off96,r4
	lvx	v16,off96,r3
	VPERM(v14,v14,v16,byteswap)
	VPMSUMW(v14,v14,v16)
	bdz	.Lv14

	lvx	v15,off112,r4
	lvx	v17,off112,r3
	VPERM(v15,v15,v17,byteswap)
	VPMSUMW(v15,v15,v17)

.Lv15:	vxor	v19,v19,v15
.Lv14:	vxor	v20,v20,v14
.Lv13:	vxor	v19,v19,v13
.Lv12:	vxor	v20,v20,v12
.Lv11:	vxor	v19,v19,v11
.Lv10:	vxor	v20,v20,v10
.Lv9:	vxor	v19,v19,v9
.Lv8:	vxor	v20,v20,v8
.Lv7:	vxor	v19,v19,v7
.Lv6:	vxor	v20,v20,v6
.Lv5:	vxor	v19,v19,v5
.Lv4:	vxor	v20,v20,v4
.Lv3:	vxor	v19,v19,v3
.Lv2:	vxor	v20,v20,v2
.Lv1:	vxor	v19,v19,v1
.Lv0:	vxor	v20,v20,v0

	vxor	v0,v19,v20

	b	.Lbarrett_reduction

.Lzero:
	mr	r3,r10
	b	.Lout

FUNC_END(CRC_FUNCTION_NAME)