/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
 */
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/ppc-opcode.h>

#define off8	r6
#define off16	r7
#define off24	r8

#define rA	r9
#define rB	r10
#define rC	r11
#define rD	r27
#define rE	r28
#define rF	r29
#define rG	r30
#define rH	r31

#ifdef __LITTLE_ENDIAN__
#define LH	lhbrx
#define LW	lwbrx
#define LD	ldbrx
#define LVS	lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH	lhzx
#define LW	lwzx
#define LD	ldx
#define LVS	lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#endif
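/*
 * Why the byte-reversed forms on little-endian (illustrative note with a
 * small C sketch that is not part of the build): an unsigned compare of
 * two doublewords orders them like a byte-wise memcmp only when the byte
 * at the lowest address ends up in the most significant position of the
 * register.  The LD/LW/LH macros give exactly that on both endiannesses,
 * and VPERM swaps its source operands on LE for the same reason.
 *
 *	#include <stdint.h>
 *
 *	// Hypothetical helper: load 8 bytes so that p[0] is the MSB, which
 *	// is what LD (ldx on BE, ldbrx on LE) leaves in rA/rB.
 *	static uint64_t load_be64(const unsigned char *p)
 *	{
 *		uint64_t v = 0;
 *		for (int i = 0; i < 8; i++)
 *			v = (v << 8) | p[i];
 *		return v;
 *	}
 *
 *	// With that ordering, sign(memcmp(a, b, 8)) matches the result of
 *	// the single unsigned compare of load_be64(a) against load_be64(b).
 */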

#define VMX_THRESH 4096
#define ENTER_VMX_OPS	\
	mflr    r0;	\
	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std     r0,16(r1); \
	stdu    r1,-STACKFRAMESIZE(r1); \
	bl      enter_vmx_ops; \
	cmpwi   cr1,r3,0; \
	ld      r0,STACKFRAMESIZE+16(r1); \
	ld      r3,STK_REG(R31)(r1); \
	ld      r4,STK_REG(R30)(r1); \
	ld      r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr    r0

#define EXIT_VMX_OPS \
	mflr    r0; \
	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std     r0,16(r1); \
	stdu    r1,-STACKFRAMESIZE(r1); \
	bl      exit_vmx_ops; \
	ld      r0,STACKFRAMESIZE+16(r1); \
	ld      r3,STK_REG(R31)(r1); \
	ld      r4,STK_REG(R30)(r1); \
	ld      r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr    r0

/*
 * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned
 * to a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 *    ^                                  ^                                 ^
 * 0xbbbb10                          0xbbbb20                         0xbbbb30
 *                                 ^
 *                                _vaddr
 *
 *
 * _vmask is the mask generated by LVS
 * _v1st_qw is the 1st aligned QW of the current addr, which is already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of the current _vaddr, to be loaded.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *   for example: 0x0123456789abcdef for big endian.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
        lvx     _v2nd_qw,_vaddr,off16; \
        VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
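
/*
 * Illustrative C sketch of what the LVS + lvx + VPERM combination achieves
 * (hypothetical helper, not part of this file): given an address p that is
 * not 16-byte aligned, build the 16 bytes starting at p out of the two
 * aligned quadwords that straddle it.
 *
 *	#include <stdint.h>
 *
 *	static void ld_cross16b(const unsigned char *p, unsigned char out[16])
 *	{
 *		const unsigned char *q = (const unsigned char *)
 *					 ((uintptr_t)p & ~(uintptr_t)15);
 *		unsigned int off = (uintptr_t)p & 15;
 *
 *		for (int i = 0; i < 16; i++)	// bytes off .. off+15 of q[0..31]
 *			out[i] = q[off + i];
 *	}
 *
 * The vector version reads q[0..15] and q[16..31] with lvx (which ignores
 * the low 4 address bits) and lets vperm, driven by the LVS mask, pick the
 * 16 wanted bytes in a single instruction.
 */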

/*
 * There are two categories for memcmp:
 * 1) src/dst have the same offset from an 8-byte boundary. The handlers
 * are named .Lsameoffset_xxxx.
 * 2) src/dst have different offsets from an 8-byte boundary. The handlers
 * are named .Ldiffoffset_xxxx.
 */
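/*
 * For reference, a minimal byte-by-byte C sketch of the contract this
 * routine implements (illustrative only; the assembly only guarantees the
 * sign of the result, not the exact byte difference on every path):
 *
 *	#include <stddef.h>
 *
 *	static int memcmp_ref(const void *s1, const void *s2, size_t n)
 *	{
 *		const unsigned char *p1 = s1, *p2 = s2;
 *
 *		while (n--) {
 *			int d = *p1++ - *p2++;
 *			if (d)
 *				return d;
 *		}
 *		return 0;
 *	}
 */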
_GLOBAL_TOC(memcmp)
	cmpdi	cr1,r5,0

	/* Use the short loop if the src/dst addresses do not have the
	 * same offset from an 8-byte alignment boundary.
	 */
	xor	r6,r3,r4
	andi.	r6,r6,7

	/* Fall back to the short loop if comparing aligned addrs
	 * with fewer than 8 bytes.
	 */
	cmpdi   cr6,r5,7

	beq	cr1,.Lzero
	bgt	cr6,.Lno_short

.Lshort:
	mtctr	r5
1:	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero

	addi	r3,r3,4
	addi	r4,r4,4

	bdnz	1b

.Lzero:
	li	r3,0
	blr

.Lno_short:
	dcbt	0,r3
	dcbt	0,r4
	bne	.Ldiffoffset_8bytes_make_align_start


.Lsameoffset_8bytes_make_align_start:
	/* Attempt to compare the bytes that are not 8-byte aligned so that
	 * the rest of the comparison can run on an 8-byte aligned basis.
	 */
	andi.   r6,r3,7

	/* Try to compare the first double word, which is not 8-byte aligned:
	 * load the first double word at (src & ~7UL) and shift left by the
	 * appropriate number of bits before the comparison.
	 */
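	/* Sketch of the trick in C (illustrative; load_be64() is the
	 * hypothetical helper sketched near the top of this file): with the
	 * big-endian-ordered loads used here, the bytes that precede src in
	 * its aligned doubleword sit in the most significant positions, so a
	 * left shift by 8 * (src & 7) bits discards exactly those bytes.
	 * src and dst share the same low 3 bits on this path, so both
	 * operands are shifted by the same amount:
	 *
	 *	unsigned int k = (uintptr_t)src & 7;
	 *	uint64_t a = load_be64((const unsigned char *)
	 *			       ((uintptr_t)src & ~7UL)) << (8 * k);
	 *	uint64_t b = load_be64((const unsigned char *)
	 *			       ((uintptr_t)dst & ~7UL)) << (8 * k);
	 *	if (a != b)
	 *		return a > b ? 1 : -1;
	 */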
	rlwinm  r6,r3,3,26,28
	beq     .Lsameoffset_8bytes_aligned
	clrrdi	r3,r3,3
	clrrdi	r4,r4,3
	LD	rA,0,r3
	LD	rB,0,r4
	sld	rA,rA,r6
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight
	subfic  r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero

.Lsameoffset_8bytes_aligned:
	/* Now we are 8-byte aligned.
	 * Use the .Llong loop if 32 or more bytes remain to be compared.
	 */
	cmpdi   cr6,r5,31
	bgt	cr6,.Llong

.Lcmp_lt32bytes:
	/* compare 1 ~ 31 bytes; at least the r3 addr is 8-byte aligned now */
	cmpdi   cr5,r5,7
	srdi    r0,r5,3
	ble	cr5,.Lcmp_rest_lt8bytes

	/* handle 8 ~ 31 bytes */
	clrldi  r5,r5,61
	mtctr   r0
2:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	2b

	cmpwi   r5,0
	beq	.Lzero

.Lcmp_rest_lt8bytes:
	/*
	 * Here we have less than 8 bytes to compare. At least s1 is aligned to
	 * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
	 * page boundary, otherwise we might read past the end of the buffer and
	 * trigger a page fault. We use 4K as the conservative minimum page
	 * size. If we detect that case we go to the byte-by-byte loop.
	 *
	 * Otherwise the next double word is loaded from s1 and s2, and shifted
	 * right to compare the appropriate bits.
	 */
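	/* Sketch of the check in C (illustrative; load_be64() is the
	 * hypothetical helper sketched near the top of this file): with
	 * 1 <= n <= 7, an 8-byte load from s2 may touch up to s2 + 7, which
	 * is only safe when (s2 & 0xfff) <= 0xff8, i.e. the doubleword stays
	 * within the current (at least 4K) page:
	 *
	 *	if (((uintptr_t)s2 & 0xfff) > 0xff8)
	 *		goto byte_loop;				// .Lshort
	 *	uint64_t a = load_be64(s1) >> (8 * (8 - n));	// keep first n bytes
	 *	uint64_t b = load_be64(s2) >> (8 * (8 - n));
	 *	if (a != b)
	 *		return a > b ? 1 : -1;
	 *	return 0;
	 */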
	clrldi	r6,r4,(64-12)	// r6 = r4 & 0xfff
	cmpdi	r6,0xff8
	bgt	.Lshort

	subfic  r6,r5,8
	slwi	r6,r6,3
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero

.Lnon_zero:
	mr	r3,rC
	blr

.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Try the vmx loop if the length is 4K or greater */
	cmpldi  cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
	/* At least the s1 addr is 8-byte aligned */
	li	off8,8
	li	off16,16
	li	off24,24

	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)

	srdi	r0,r5,5
	mtctr	r0
	andi.	r5,r5,31

	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32

	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

.Lsecond32:
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

.Ltail:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)

	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort

.Lfirst32:
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail

.LcmpAB:
	li	r3,1
	bgt	cr0,.Lout
	li	r3,-1
	b	.Lout

.LcmpCD:
	li	r3,1
	bgt	cr1,.Lout
	li	r3,-1
	b	.Lout

.LcmpEF:
	li	r3,1
	bgt	cr6,.Lout
	li	r3,-1
	b	.Lout

.LcmpGH:
	li	r3,1
	bgt	cr7,.Lout
	li	r3,-1

.Lout:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	blr

.LcmpAB_lightweight:   /* skip restoring the non-volatile GPRs */
	li	r3,1
	bgtlr
	li	r3,-1
	blr

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Entered with src/dst addrs that have the same offset from an
	 * 8-byte alignment boundary.
	 *
	 * There is an optimization based on the following fact: memcmp()
	 * tends to fail early, within the first 32 bytes.
	 * Before applying VMX instructions, which incur a 32x128-bit
	 * VMX register load/restore penalty, we compare the first 32 bytes
	 * so that we can catch ~80% of the failing cases.
	 */
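	/* Sketch of the pre-check in C (illustrative; load_be64() is the
	 * hypothetical helper sketched near the top of this file): four
	 * scalar doubleword compares before paying for enter_vmx_ops():
	 *
	 *	for (int i = 0; i < 4; i++) {	// .Lsameoffset_prechk_32B_loop
	 *		uint64_t a = load_be64(s1), b = load_be64(s2);
	 *
	 *		s1 += 8; s2 += 8; n -= 8;
	 *		if (a != b)
	 *			return a > b ? 1 : -1;	// .LcmpAB_lightweight
	 *	}
	 *	// only now enter VMX; if that is refused, fall back to the
	 *	// scalar .Llong_novmx_cmp loop
	 */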

	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne     cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq     cr1,.Llong_novmx_cmp

3:
	/* Need to check whether r4 has the same offset as r3
	 * from a 16-byte boundary.
	 */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* len is no less than 4KB. Need to further align to 16 bytes.
	 */
	andi.	rA,r3,8
	LD	rA,0,r3
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f
	/* save and restore cr0 */
	mfocrf  r5,128
	EXIT_VMX_OPS
	mtocrf  128,r5
	b	.LcmpAB_lightweight

4:
	/* compare 32 bytes in each loop iteration */
	srdi	r0,r5,5
	mtctr	r0
	clrldi  r5,r5,59
	li	off16,16

.balign 16
5:
	lvx 	v0,0,r3
	lvx 	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f
	lvx 	v0,off16,r3
	lvx 	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* the difference is within the last 16 bytes */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
	/* now try to align s1 to 8 bytes */
	rlwinm  r6,r3,3,26,28
	beq     .Ldiffoffset_align_s1_8bytes

	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4  /* unaligned load */
	sld	rA,rA,r6
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight

	subfic  r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	add	r4,r4,r6

	beq	.Lzero
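
	/* Sketch of the step above in C (illustrative; load_be64() is the
	 * hypothetical helper sketched near the top of this file): clear the
	 * bytes that precede s1 in its aligned doubleword, and shift the
	 * unaligned load from s2 down so the same 8 - k bytes line up:
	 *
	 *	unsigned int k = (uintptr_t)s1 & 7;	// leading bytes to skip
	 *	uint64_t a = load_be64((const unsigned char *)
	 *			       ((uintptr_t)s1 & ~7UL));
	 *	uint64_t b = load_be64(s2);		// unaligned load
	 *
	 *	a = (a << (8 * k)) >> (8 * k);		// drop the k leading bytes
	 *	b >>= 8 * k;				// keep the first 8 - k bytes
	 *	if (a != b)
	 *		return a > b ? 1 : -1;
	 *	s1 += 8 - k; s2 += 8 - k; n -= 8 - k;
	 */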

.Ldiffoffset_align_s1_8bytes:
	/* now s1 is 8-byte aligned. */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* only do vmx ops when the size is 4K bytes or greater */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif


	cmpdi   cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
	b	.Llong_novmx_cmp
#else
	b	.Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* perform a 32-byte pre-check before
	 * enabling VMX operations.
	 */
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne     cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq     cr1,.Ldiffoffset_novmx_cmp

.Ldiffoffset_vmx_cmp_start:
	/* First try to align r3 to 16 bytes */
	andi.   r6,r3,0xf
	li	off16,16
	beq     .Ldiffoffset_vmx_s1_16bytes_align

	LVS	v3,0,r3
	LVS	v4,0,r4

	lvx     v5,0,r3
	lvx     v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	subfic  r6,r6,16
	subf    r5,r6,r5
	add     r3,r3,r6
	add     r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
	/* now s1 is 16-byte aligned */
	lvx     v6,0,r4
	LVS	v4,0,r4
	srdi	r6,r5,5  /* loop over 32 bytes each iteration */
	clrldi  r5,r5,59
	mtctr	r6

.balign	16
.Ldiffoffset_vmx_32bytesloop:
	/* the first qw of r4 was saved in v6 */
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	EXIT_VMX_OPS
	/* in any case, the diff will appear within the next 16 bytes */
	li	r5,16
	b	.Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)