162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-or-later */
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Author: Anton Blanchard <anton@au.ibm.com>
462306a36Sopenharmony_ci * Copyright 2015 IBM Corporation.
562306a36Sopenharmony_ci */
662306a36Sopenharmony_ci#include <linux/export.h>
762306a36Sopenharmony_ci#include <asm/ppc_asm.h>
862306a36Sopenharmony_ci#include <asm/ppc-opcode.h>
962306a36Sopenharmony_ci
/*
 * Index registers for the indexed (X-form) loads. They only hold 8/16/24
 * after the code has loaded them with li; r6 is also used as plain scratch
 * in other parts of this file.
 */
1062306a36Sopenharmony_ci#define off8	r6
1162306a36Sopenharmony_ci#define off16	r7
1262306a36Sopenharmony_ci#define off24	r8
1362306a36Sopenharmony_ci
/*
 * Data registers for the compare loops. rD..rH alias r27..r31, which the
 * .Llong path stores below r1 before using them and reloads before it
 * returns; rA..rC (r9..r11) are used without being saved.
 */
1462306a36Sopenharmony_ci#define rA	r9
1562306a36Sopenharmony_ci#define rB	r10
1662306a36Sopenharmony_ci#define rC	r11
1762306a36Sopenharmony_ci#define rD	r27
1862306a36Sopenharmony_ci#define rE	r28
1962306a36Sopenharmony_ci#define rF	r29
2062306a36Sopenharmony_ci#define rG	r30
2162306a36Sopenharmony_ci#define rH	r31
2262306a36Sopenharmony_ci
/*
 * Endian-dependent load mnemonics.
 *
 * Little-endian builds use the byte-reversed indexed loads so that the byte
 * at the lowest address ends up in the most significant position of the
 * register; an unsigned register compare (cmpld) then orders two buffers the
 * same way a byte-by-byte compare would. Big-endian builds get that layout
 * from the ordinary indexed loads.
 *
 * LVS picks the "load vector for shift" variant (lvsr on little endian, lvsl
 * on big endian) and VPERM swaps its two source vectors on little endian.
 */
2362306a36Sopenharmony_ci#ifdef __LITTLE_ENDIAN__
2462306a36Sopenharmony_ci#define LH	lhbrx
2562306a36Sopenharmony_ci#define LW	lwbrx
2662306a36Sopenharmony_ci#define LD	ldbrx
2762306a36Sopenharmony_ci#define LVS	lvsr
2862306a36Sopenharmony_ci#define VPERM(_VRT,_VRA,_VRB,_VRC) \
2962306a36Sopenharmony_ci	vperm _VRT,_VRB,_VRA,_VRC
3062306a36Sopenharmony_ci#else
3162306a36Sopenharmony_ci#define LH	lhzx
3262306a36Sopenharmony_ci#define LW	lwzx
3362306a36Sopenharmony_ci#define LD	ldx
3462306a36Sopenharmony_ci#define LVS	lvsl
3562306a36Sopenharmony_ci#define VPERM(_VRT,_VRA,_VRB,_VRC) \
3662306a36Sopenharmony_ci	vperm _VRT,_VRA,_VRB,_VRC
3762306a36Sopenharmony_ci#endif
3862306a36Sopenharmony_ci
/*
 * VMX_THRESH: remaining length (r5) at or above which the vector compare
 * paths are attempted.
 */
3962306a36Sopenharmony_ci#define VMX_THRESH 4096
/*
 * ENTER_VMX_OPS: call enter_vmx_ops() while preserving r3, r4, r5 and LR.
 *
 * The three argument registers are written into slots of the stack frame
 * that the following stdu creates, LR is stored at 16(r1) of the current
 * frame, and everything is reloaded once the call returns, after which the
 * frame is popped again.
 *
 * Result: cr1 holds the comparison of the call's return value (r3) with 0;
 * the users of this macro branch on "cr1 eq" to their non-VMX fallback.
 * r0 and cr1 are overwritten by the macro itself, in addition to whatever
 * the called function changes.
 */
4062306a36Sopenharmony_ci#define ENTER_VMX_OPS	\
4162306a36Sopenharmony_ci	mflr    r0;	\
4262306a36Sopenharmony_ci	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
4362306a36Sopenharmony_ci	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
4462306a36Sopenharmony_ci	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
4562306a36Sopenharmony_ci	std     r0,16(r1); \
4662306a36Sopenharmony_ci	stdu    r1,-STACKFRAMESIZE(r1); \
4762306a36Sopenharmony_ci	bl      CFUNC(enter_vmx_ops); \
4862306a36Sopenharmony_ci	cmpwi   cr1,r3,0; \
4962306a36Sopenharmony_ci	ld      r0,STACKFRAMESIZE+16(r1); \
5062306a36Sopenharmony_ci	ld      r3,STK_REG(R31)(r1); \
5162306a36Sopenharmony_ci	ld      r4,STK_REG(R30)(r1); \
5262306a36Sopenharmony_ci	ld      r5,STK_REG(R29)(r1); \
5362306a36Sopenharmony_ci	addi	r1,r1,STACKFRAMESIZE; \
5462306a36Sopenharmony_ci	mtlr    r0
5562306a36Sopenharmony_ci
/*
 * EXIT_VMX_OPS: call exit_vmx_ops() while preserving r3, r4, r5 and LR.
 *
 * Same save/call/restore sequence as ENTER_VMX_OPS, except that the return
 * value of the call is not examined. r0 is overwritten by the macro itself;
 * condition register fields are not saved here, so a caller that needs one
 * across the call has to carry it in r3/r4/r5.
 */
5662306a36Sopenharmony_ci#define EXIT_VMX_OPS \
5762306a36Sopenharmony_ci	mflr    r0; \
5862306a36Sopenharmony_ci	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
5962306a36Sopenharmony_ci	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
6062306a36Sopenharmony_ci	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
6162306a36Sopenharmony_ci	std     r0,16(r1); \
6262306a36Sopenharmony_ci	stdu    r1,-STACKFRAMESIZE(r1); \
6362306a36Sopenharmony_ci	bl      CFUNC(exit_vmx_ops); \
6462306a36Sopenharmony_ci	ld      r0,STACKFRAMESIZE+16(r1); \
6562306a36Sopenharmony_ci	ld      r3,STK_REG(R31)(r1); \
6662306a36Sopenharmony_ci	ld      r4,STK_REG(R30)(r1); \
6762306a36Sopenharmony_ci	ld      r5,STK_REG(R29)(r1); \
6862306a36Sopenharmony_ci	addi	r1,r1,STACKFRAMESIZE; \
6962306a36Sopenharmony_ci	mtlr    r0
7062306a36Sopenharmony_ci
7162306a36Sopenharmony_ci/*
7262306a36Sopenharmony_ci * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned to
7362306a36Sopenharmony_ci * a 16 bytes boundary, and permutes the result with the 1st 16 bytes.
7462306a36Sopenharmony_ci
7562306a36Sopenharmony_ci *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
7662306a36Sopenharmony_ci *    ^                                  ^                                 ^
7762306a36Sopenharmony_ci * 0xbbbb10                          0xbbbb20                          0xbbbb30
7862306a36Sopenharmony_ci *                                 ^
7962306a36Sopenharmony_ci *                                _vaddr
8062306a36Sopenharmony_ci *
8162306a36Sopenharmony_ci *
8262306a36Sopenharmony_ci * _vmask is the mask generated by LVS
8362306a36Sopenharmony_ci * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
8462306a36Sopenharmony_ci *   for example: 0xyyyyyyyyyyyyy012 for big endian
8562306a36Sopenharmony_ci * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
8662306a36Sopenharmony_ci *   for example: 0x3456789abcdefzzz for big endian
8762306a36Sopenharmony_ci * The permute result is saved in _v_res.
8862306a36Sopenharmony_ci *   for example: 0x0123456789abcdef for big endian.
8962306a36Sopenharmony_ci */
/*
 * The macro indexes the second load with off16, so that register must
 * already contain 16 wherever the macro is expanded. _v2nd_qw is written by
 * the macro and is left holding the second aligned quadword.
 */
9062306a36Sopenharmony_ci#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
9162306a36Sopenharmony_ci        lvx     _v2nd_qw,_vaddr,off16; \
9262306a36Sopenharmony_ci        VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
9362306a36Sopenharmony_ci
9462306a36Sopenharmony_ci/*
9562306a36Sopenharmony_ci * There are 2 categories for memcmp:
9662306a36Sopenharmony_ci * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
9762306a36Sopenharmony_ci * are named like .Lsameoffset_xxxx
9862306a36Sopenharmony_ci * 2) src/dst has different offset to the 8 bytes boundary. The handlers
9962306a36Sopenharmony_ci * are named like .Ldiffoffset_xxxx
10062306a36Sopenharmony_ci */
/*
 * memcmp entry point.
 *
 * In:   r3 = s1, r4 = s2 (addresses of the two buffers), r5 = byte count
 * Out:  r3 = 0 when all r5 bytes match; otherwise a value whose sign tells
 *       which buffer holds the larger byte at the first difference
 *       (positive when it is s1). The byte loop returns the byte
 *       difference itself, the double word paths return 1 or -1.
 *
 * The three tests below are issued back to back into separate CR fields and
 * consumed afterwards:
 *   cr1: length == 0
 *   cr0: (s1 ^ s2) & 7 != 0, i.e. the offsets within a double word differ
 *   cr6: length > 7
 */
10162306a36Sopenharmony_ci_GLOBAL_TOC(memcmp)
10262306a36Sopenharmony_ci	cmpdi	cr1,r5,0
10362306a36Sopenharmony_ci
10462306a36Sopenharmony_ci	/* Use the short loop if the src/dst addresses are not
10562306a36Sopenharmony_ci	 * with the same offset of 8 bytes align boundary.
10662306a36Sopenharmony_ci	 */
10762306a36Sopenharmony_ci	xor	r6,r3,r4
10862306a36Sopenharmony_ci	andi.	r6,r6,7
10962306a36Sopenharmony_ci
11062306a36Sopenharmony_ci	/* Fall back to short loop if compare at aligned addrs
11162306a36Sopenharmony_ci	 * with less than 8 bytes.
11262306a36Sopenharmony_ci	 */
11362306a36Sopenharmony_ci	cmpdi   cr6,r5,7
11462306a36Sopenharmony_ci
11562306a36Sopenharmony_ci	beq	cr1,.Lzero
11662306a36Sopenharmony_ci	bgt	cr6,.Lno_short
11762306a36Sopenharmony_ci
	/*
	 * Byte-by-byte compare of r5 bytes, unrolled four times. CTR counts
	 * the bytes still to be compared: each bdz leaves through .Lzero
	 * once the count is used up, and the closing bdnz accounts for the
	 * fourth byte of the group. rC = byte(s1) - byte(s2) is the return
	 * value taken at .Lnon_zero.
	 */
11862306a36Sopenharmony_ci.Lshort:
11962306a36Sopenharmony_ci	mtctr	r5
12062306a36Sopenharmony_ci1:	lbz	rA,0(r3)
12162306a36Sopenharmony_ci	lbz	rB,0(r4)
12262306a36Sopenharmony_ci	subf.	rC,rB,rA
12362306a36Sopenharmony_ci	bne	.Lnon_zero
12462306a36Sopenharmony_ci	bdz	.Lzero
12562306a36Sopenharmony_ci
12662306a36Sopenharmony_ci	lbz	rA,1(r3)
12762306a36Sopenharmony_ci	lbz	rB,1(r4)
12862306a36Sopenharmony_ci	subf.	rC,rB,rA
12962306a36Sopenharmony_ci	bne	.Lnon_zero
13062306a36Sopenharmony_ci	bdz	.Lzero
13162306a36Sopenharmony_ci
13262306a36Sopenharmony_ci	lbz	rA,2(r3)
13362306a36Sopenharmony_ci	lbz	rB,2(r4)
13462306a36Sopenharmony_ci	subf.	rC,rB,rA
13562306a36Sopenharmony_ci	bne	.Lnon_zero
13662306a36Sopenharmony_ci	bdz	.Lzero
13762306a36Sopenharmony_ci
13862306a36Sopenharmony_ci	lbz	rA,3(r3)
13962306a36Sopenharmony_ci	lbz	rB,3(r4)
14062306a36Sopenharmony_ci	subf.	rC,rB,rA
14162306a36Sopenharmony_ci	bne	.Lnon_zero
14262306a36Sopenharmony_ci
14362306a36Sopenharmony_ci	addi	r3,r3,4
14462306a36Sopenharmony_ci	addi	r4,r4,4
14562306a36Sopenharmony_ci
14662306a36Sopenharmony_ci	bdnz	1b
14762306a36Sopenharmony_ci
	/* Common "buffers are equal" exit: return 0. */
14862306a36Sopenharmony_ci.Lzero:
14962306a36Sopenharmony_ci	li	r3,0
15062306a36Sopenharmony_ci	blr
15162306a36Sopenharmony_ci
	/*
	 * At least 8 bytes to compare. Touch the first cache block of both
	 * buffers, then split on cr0, which still holds the
	 * "(s1 ^ s2) & 7" test made at function entry.
	 */
15262306a36Sopenharmony_ci.Lno_short:
15362306a36Sopenharmony_ci	dcbt	0,r3
15462306a36Sopenharmony_ci	dcbt	0,r4
15562306a36Sopenharmony_ci	bne	.Ldiffoffset_8bytes_make_align_start
15662306a36Sopenharmony_ci
15762306a36Sopenharmony_ci
15862306a36Sopenharmony_ci.Lsameoffset_8bytes_make_align_start:
15962306a36Sopenharmony_ci	/* attempt to compare bytes not aligned with 8 bytes so that
16062306a36Sopenharmony_ci	 * rest comparison can run based on 8 bytes alignment.
16162306a36Sopenharmony_ci	 */
	/* cr0.eq <=> s1 (and therefore s2) is already 8-byte aligned */
16262306a36Sopenharmony_ci	andi.   r6,r3,7
16362306a36Sopenharmony_ci
16462306a36Sopenharmony_ci	/* Try to compare the first double word which is not 8 bytes aligned:
16562306a36Sopenharmony_ci	 * load the first double word at (src & ~7UL) and shift left appropriate
16662306a36Sopenharmony_ci	 * bits before comparison.
16762306a36Sopenharmony_ci	 */
	/*
	 * r6 = (s1 & 7) * 8, the number of bits that precede the start of
	 * the buffers. rlwinm leaves cr0 untouched, so the beq below still
	 * acts on the andi. above.
	 */
16862306a36Sopenharmony_ci	rlwinm  r6,r3,3,26,28
16962306a36Sopenharmony_ci	beq     .Lsameoffset_8bytes_aligned
17062306a36Sopenharmony_ci	clrrdi	r3,r3,3
17162306a36Sopenharmony_ci	clrrdi	r4,r4,3
17262306a36Sopenharmony_ci	LD	rA,0,r3
17362306a36Sopenharmony_ci	LD	rB,0,r4
	/* shift the bytes in front of the buffers out of both operands */
17462306a36Sopenharmony_ci	sld	rA,rA,r6
17562306a36Sopenharmony_ci	sld	rB,rB,r6
17662306a36Sopenharmony_ci	cmpld	cr0,rA,rB
	/* r6: bit count back to a byte offset (does not affect cr0) */
17762306a36Sopenharmony_ci	srwi	r6,r6,3
17862306a36Sopenharmony_ci	bne	cr0,.LcmpAB_lightweight
	/* 8 - offset bytes were compared: take them off the length */
17962306a36Sopenharmony_ci	subfic  r6,r6,8
18062306a36Sopenharmony_ci	subf.	r5,r6,r5
18162306a36Sopenharmony_ci	addi	r3,r3,8
18262306a36Sopenharmony_ci	addi	r4,r4,8
18362306a36Sopenharmony_ci	beq	.Lzero
18462306a36Sopenharmony_ci
18562306a36Sopenharmony_ci.Lsameoffset_8bytes_aligned:
18662306a36Sopenharmony_ci	/* now we are aligned with 8 bytes.
18762306a36Sopenharmony_ci	 * Use .Llong loop if left cmp bytes are equal or greater than 32B.
18862306a36Sopenharmony_ci	 */
18962306a36Sopenharmony_ci	cmpdi   cr6,r5,31
19062306a36Sopenharmony_ci	bgt	cr6,.Llong
19162306a36Sopenharmony_ci
19262306a36Sopenharmony_ci.Lcmp_lt32bytes:
19362306a36Sopenharmony_ci	/* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */
	/* r0 = number of whole double words in the remaining length */
19462306a36Sopenharmony_ci	cmpdi   cr5,r5,7
19562306a36Sopenharmony_ci	srdi    r0,r5,3
19662306a36Sopenharmony_ci	ble	cr5,.Lcmp_rest_lt8bytes
19762306a36Sopenharmony_ci
19862306a36Sopenharmony_ci	/* handle 8 ~ 31 bytes */
	/* r5 = length & 7: bytes left once the whole double words are done */
19962306a36Sopenharmony_ci	clrldi  r5,r5,61
20062306a36Sopenharmony_ci	mtctr   r0
20162306a36Sopenharmony_ci2:
20262306a36Sopenharmony_ci	LD	rA,0,r3
20362306a36Sopenharmony_ci	LD	rB,0,r4
20462306a36Sopenharmony_ci	cmpld	cr0,rA,rB
20562306a36Sopenharmony_ci	addi	r3,r3,8
20662306a36Sopenharmony_ci	addi	r4,r4,8
20762306a36Sopenharmony_ci	bne	cr0,.LcmpAB_lightweight
20862306a36Sopenharmony_ci	bdnz	2b
20962306a36Sopenharmony_ci
21062306a36Sopenharmony_ci	cmpwi   r5,0
21162306a36Sopenharmony_ci	beq	.Lzero
21262306a36Sopenharmony_ci
21362306a36Sopenharmony_ci.Lcmp_rest_lt8bytes:
21462306a36Sopenharmony_ci	/*
21562306a36Sopenharmony_ci	 * Here we have less than 8 bytes to compare. At least s1 is aligned to
21662306a36Sopenharmony_ci	 * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
21762306a36Sopenharmony_ci	 * page boundary, otherwise we might read past the end of the buffer and
21862306a36Sopenharmony_ci	 * trigger a page fault. We use 4K as the conservative minimum page
21962306a36Sopenharmony_ci	 * size. If we detect that case we go to the byte-by-byte loop.
22062306a36Sopenharmony_ci	 *
22162306a36Sopenharmony_ci	 * Otherwise the next double word is loaded from s1 and s2, and shifted
22262306a36Sopenharmony_ci	 * right to compare the appropriate bits.
22362306a36Sopenharmony_ci	 */
22462306a36Sopenharmony_ci	clrldi	r6,r4,(64-12)	// r6 = r4 & 0xfff
22562306a36Sopenharmony_ci	cmpdi	r6,0xff8
22662306a36Sopenharmony_ci	bgt	.Lshort
22762306a36Sopenharmony_ci
	/* r6 = (8 - r5) * 8: bits that lie beyond the requested length */
22862306a36Sopenharmony_ci	subfic  r6,r5,8
22962306a36Sopenharmony_ci	slwi	r6,r6,3
23062306a36Sopenharmony_ci	LD	rA,0,r3
23162306a36Sopenharmony_ci	LD	rB,0,r4
23262306a36Sopenharmony_ci	srd	rA,rA,r6
23362306a36Sopenharmony_ci	srd	rB,rB,r6
23462306a36Sopenharmony_ci	cmpld	cr0,rA,rB
23562306a36Sopenharmony_ci	bne	cr0,.LcmpAB_lightweight
23662306a36Sopenharmony_ci	b	.Lzero
23762306a36Sopenharmony_ci
	/* Exit of the byte loop: return the byte difference left in rC. */
23862306a36Sopenharmony_ci.Lnon_zero:
23962306a36Sopenharmony_ci	mr	r3,rC
24062306a36Sopenharmony_ci	blr
24162306a36Sopenharmony_ci
	/*
	 * Compare in chunks of 32 bytes (four double words per buffer).
	 *
	 * The loop is software pipelined: the loads of one chunk are
	 * interleaved with the compares and branches of the chunk loaded
	 * before it. The four pair results live in separate CR fields:
	 *   cr0: rA/rB   cr1: rC/rD   cr6: rE/rF   cr7: rG/rH
	 * and each .LcmpXX handler reads the field that belongs to its pair.
	 */
24262306a36Sopenharmony_ci.Llong:
24362306a36Sopenharmony_ci#ifdef CONFIG_ALTIVEC
24462306a36Sopenharmony_ciBEGIN_FTR_SECTION
24562306a36Sopenharmony_ci	/* Try to use vmx loop if length is equal or greater than 4K */
24662306a36Sopenharmony_ci	cmpldi  cr6,r5,VMX_THRESH
24762306a36Sopenharmony_ci	bge	cr6,.Lsameoffset_vmx_cmp
24862306a36Sopenharmony_ciEND_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
24962306a36Sopenharmony_ci
25062306a36Sopenharmony_ci.Llong_novmx_cmp:
25162306a36Sopenharmony_ci#endif
25262306a36Sopenharmony_ci	/* At least s1 addr is aligned with 8 bytes */
25362306a36Sopenharmony_ci	li	off8,8
25462306a36Sopenharmony_ci	li	off16,16
25562306a36Sopenharmony_ci	li	off24,24
25662306a36Sopenharmony_ci
	/*
	 * r27..r31 (rD..rH) are stored below r1 without moving r1; they are
	 * reloaded at .Ltail and .Lout.
	 */
25762306a36Sopenharmony_ci	std	r31,-8(r1)
25862306a36Sopenharmony_ci	std	r30,-16(r1)
25962306a36Sopenharmony_ci	std	r29,-24(r1)
26062306a36Sopenharmony_ci	std	r28,-32(r1)
26162306a36Sopenharmony_ci	std	r27,-40(r1)
26262306a36Sopenharmony_ci
	/* CTR = number of 32-byte chunks, r5 = bytes left over after them */
26362306a36Sopenharmony_ci	srdi	r0,r5,5
26462306a36Sopenharmony_ci	mtctr	r0
26562306a36Sopenharmony_ci	andi.	r5,r5,31
26662306a36Sopenharmony_ci
	/* load the first chunk */
26762306a36Sopenharmony_ci	LD	rA,0,r3
26862306a36Sopenharmony_ci	LD	rB,0,r4
26962306a36Sopenharmony_ci
27062306a36Sopenharmony_ci	LD	rC,off8,r3
27162306a36Sopenharmony_ci	LD	rD,off8,r4
27262306a36Sopenharmony_ci
27362306a36Sopenharmony_ci	LD	rE,off16,r3
27462306a36Sopenharmony_ci	LD	rF,off16,r4
27562306a36Sopenharmony_ci
27662306a36Sopenharmony_ci	LD	rG,off24,r3
27762306a36Sopenharmony_ci	LD	rH,off24,r4
27862306a36Sopenharmony_ci	cmpld	cr0,rA,rB
27962306a36Sopenharmony_ci
28062306a36Sopenharmony_ci	addi	r3,r3,32
28162306a36Sopenharmony_ci	addi	r4,r4,32
28262306a36Sopenharmony_ci
	/* only one chunk in total: finish its compares at .Lfirst32 */
28362306a36Sopenharmony_ci	bdz	.Lfirst32
28462306a36Sopenharmony_ci
	/*
	 * Load the second chunk while comparing the first one. Each cmpld
	 * is issued before the register pair it reads is reloaded.
	 */
28562306a36Sopenharmony_ci	LD	rA,0,r3
28662306a36Sopenharmony_ci	LD	rB,0,r4
28762306a36Sopenharmony_ci	cmpld	cr1,rC,rD
28862306a36Sopenharmony_ci
28962306a36Sopenharmony_ci	LD	rC,off8,r3
29062306a36Sopenharmony_ci	LD	rD,off8,r4
29162306a36Sopenharmony_ci	cmpld	cr6,rE,rF
29262306a36Sopenharmony_ci
29362306a36Sopenharmony_ci	LD	rE,off16,r3
29462306a36Sopenharmony_ci	LD	rF,off16,r4
29562306a36Sopenharmony_ci	cmpld	cr7,rG,rH
29662306a36Sopenharmony_ci	bne	cr0,.LcmpAB
29762306a36Sopenharmony_ci
29862306a36Sopenharmony_ci	LD	rG,off24,r3
29962306a36Sopenharmony_ci	LD	rH,off24,r4
30062306a36Sopenharmony_ci	cmpld	cr0,rA,rB
30162306a36Sopenharmony_ci	bne	cr1,.LcmpCD
30262306a36Sopenharmony_ci
30362306a36Sopenharmony_ci	addi	r3,r3,32
30462306a36Sopenharmony_ci	addi	r4,r4,32
30562306a36Sopenharmony_ci
30662306a36Sopenharmony_ci	bdz	.Lsecond32
30762306a36Sopenharmony_ci
30862306a36Sopenharmony_ci	.balign	16
30962306a36Sopenharmony_ci
	/*
	 * Steady state: on entry cr6, cr7 and cr0 hold compare results that
	 * have not been branched on yet, and rC/rD still have to be
	 * compared.
	 */
31062306a36Sopenharmony_ci1:	LD	rA,0,r3
31162306a36Sopenharmony_ci	LD	rB,0,r4
31262306a36Sopenharmony_ci	cmpld	cr1,rC,rD
31362306a36Sopenharmony_ci	bne	cr6,.LcmpEF
31462306a36Sopenharmony_ci
31562306a36Sopenharmony_ci	LD	rC,off8,r3
31662306a36Sopenharmony_ci	LD	rD,off8,r4
31762306a36Sopenharmony_ci	cmpld	cr6,rE,rF
31862306a36Sopenharmony_ci	bne	cr7,.LcmpGH
31962306a36Sopenharmony_ci
32062306a36Sopenharmony_ci	LD	rE,off16,r3
32162306a36Sopenharmony_ci	LD	rF,off16,r4
32262306a36Sopenharmony_ci	cmpld	cr7,rG,rH
32362306a36Sopenharmony_ci	bne	cr0,.LcmpAB
32462306a36Sopenharmony_ci
32562306a36Sopenharmony_ci	LD	rG,off24,r3
32662306a36Sopenharmony_ci	LD	rH,off24,r4
32762306a36Sopenharmony_ci	cmpld	cr0,rA,rB
32862306a36Sopenharmony_ci	bne	cr1,.LcmpCD
32962306a36Sopenharmony_ci
33062306a36Sopenharmony_ci	addi	r3,r3,32
33162306a36Sopenharmony_ci	addi	r4,r4,32
33262306a36Sopenharmony_ci
33362306a36Sopenharmony_ci	bdnz	1b
33462306a36Sopenharmony_ci
	/*
	 * No more chunks to load: issue the compares that are still missing
	 * and branch on every result that has not been tested yet.
	 */
33562306a36Sopenharmony_ci.Lsecond32:
33662306a36Sopenharmony_ci	cmpld	cr1,rC,rD
33762306a36Sopenharmony_ci	bne	cr6,.LcmpEF
33862306a36Sopenharmony_ci
33962306a36Sopenharmony_ci	cmpld	cr6,rE,rF
34062306a36Sopenharmony_ci	bne	cr7,.LcmpGH
34162306a36Sopenharmony_ci
34262306a36Sopenharmony_ci	cmpld	cr7,rG,rH
34362306a36Sopenharmony_ci	bne	cr0,.LcmpAB
34462306a36Sopenharmony_ci
34562306a36Sopenharmony_ci	bne	cr1,.LcmpCD
34662306a36Sopenharmony_ci	bne	cr6,.LcmpEF
34762306a36Sopenharmony_ci	bne	cr7,.LcmpGH
34862306a36Sopenharmony_ci
	/*
	 * All whole chunks matched. Reload r27..r31, then let the byte loop
	 * handle the r5 bytes that did not fill a chunk.
	 */
34962306a36Sopenharmony_ci.Ltail:
35062306a36Sopenharmony_ci	ld	r31,-8(r1)
35162306a36Sopenharmony_ci	ld	r30,-16(r1)
35262306a36Sopenharmony_ci	ld	r29,-24(r1)
35362306a36Sopenharmony_ci	ld	r28,-32(r1)
35462306a36Sopenharmony_ci	ld	r27,-40(r1)
35562306a36Sopenharmony_ci
35662306a36Sopenharmony_ci	cmpdi	r5,0
35762306a36Sopenharmony_ci	beq	.Lzero
35862306a36Sopenharmony_ci	b	.Lshort
35962306a36Sopenharmony_ci
	/* Single-chunk case: cr0 is already set, compare the other pairs. */
36062306a36Sopenharmony_ci.Lfirst32:
36162306a36Sopenharmony_ci	cmpld	cr1,rC,rD
36262306a36Sopenharmony_ci	cmpld	cr6,rE,rF
36362306a36Sopenharmony_ci	cmpld	cr7,rG,rH
36462306a36Sopenharmony_ci
36562306a36Sopenharmony_ci	bne	cr0,.LcmpAB
36662306a36Sopenharmony_ci	bne	cr1,.LcmpCD
36762306a36Sopenharmony_ci	bne	cr6,.LcmpEF
36862306a36Sopenharmony_ci	bne	cr7,.LcmpGH
36962306a36Sopenharmony_ci
37062306a36Sopenharmony_ci	b	.Ltail
37162306a36Sopenharmony_ci
	/*
	 * Result handlers for the .Llong loop. Each one turns the CR field
	 * of the pair that differed into the return value: 1 when the s1
	 * double word compared greater (unsigned), -1 otherwise. All of them
	 * leave through .Lout, which reloads r27..r31 from below r1.
	 */
37262306a36Sopenharmony_ci.LcmpAB:
37362306a36Sopenharmony_ci	li	r3,1
37462306a36Sopenharmony_ci	bgt	cr0,.Lout
37562306a36Sopenharmony_ci	li	r3,-1
37662306a36Sopenharmony_ci	b	.Lout
37762306a36Sopenharmony_ci
37862306a36Sopenharmony_ci.LcmpCD:
37962306a36Sopenharmony_ci	li	r3,1
38062306a36Sopenharmony_ci	bgt	cr1,.Lout
38162306a36Sopenharmony_ci	li	r3,-1
38262306a36Sopenharmony_ci	b	.Lout
38362306a36Sopenharmony_ci
38462306a36Sopenharmony_ci.LcmpEF:
38562306a36Sopenharmony_ci	li	r3,1
38662306a36Sopenharmony_ci	bgt	cr6,.Lout
38762306a36Sopenharmony_ci	li	r3,-1
38862306a36Sopenharmony_ci	b	.Lout
38962306a36Sopenharmony_ci
	/* last handler: falls through into .Lout */
39062306a36Sopenharmony_ci.LcmpGH:
39162306a36Sopenharmony_ci	li	r3,1
39262306a36Sopenharmony_ci	bgt	cr7,.Lout
39362306a36Sopenharmony_ci	li	r3,-1
39462306a36Sopenharmony_ci
39562306a36Sopenharmony_ci.Lout:
39662306a36Sopenharmony_ci	ld	r31,-8(r1)
39762306a36Sopenharmony_ci	ld	r30,-16(r1)
39862306a36Sopenharmony_ci	ld	r29,-24(r1)
39962306a36Sopenharmony_ci	ld	r28,-32(r1)
40062306a36Sopenharmony_ci	ld	r27,-40(r1)
40162306a36Sopenharmony_ci	blr
40262306a36Sopenharmony_ci
	/*
	 * Same 1 / -1 result taken from cr0, for the paths that never saved
	 * r27..r31: it returns directly without reloading them.
	 */
40362306a36Sopenharmony_ci.LcmpAB_lightweight:   /* skip NV GPRS restore */
40462306a36Sopenharmony_ci	li	r3,1
40562306a36Sopenharmony_ci	bgtlr
40662306a36Sopenharmony_ci	li	r3,-1
40762306a36Sopenharmony_ci	blr
40862306a36Sopenharmony_ci
40962306a36Sopenharmony_ci#ifdef CONFIG_ALTIVEC
41062306a36Sopenharmony_ci.Lsameoffset_vmx_cmp:
41162306a36Sopenharmony_ci	/* Enter with src/dst addrs has the same offset with 8 bytes
41262306a36Sopenharmony_ci	 * align boundary.
41362306a36Sopenharmony_ci	 *
41462306a36Sopenharmony_ci	 * There is an optimization based on following fact: memcmp()
41562306a36Sopenharmony_ci	 * is prone to fail early, within the first 32 bytes.
41662306a36Sopenharmony_ci	 * Before applying VMX instructions which will lead to 32x128bits
41762306a36Sopenharmony_ci	 * VMX regs load/restore penalty, we compare the first 32 bytes
41862306a36Sopenharmony_ci	 * so that we can catch the ~80% fail cases.
41962306a36Sopenharmony_ci	 */
42062306a36Sopenharmony_ci
	/* four double words = 32 bytes; r5 shrinks by 8 per matching pair */
42162306a36Sopenharmony_ci	li	r0,4
42262306a36Sopenharmony_ci	mtctr	r0
42362306a36Sopenharmony_ci.Lsameoffset_prechk_32B_loop:
42462306a36Sopenharmony_ci	LD	rA,0,r3
42562306a36Sopenharmony_ci	LD	rB,0,r4
42662306a36Sopenharmony_ci	cmpld	cr0,rA,rB
42762306a36Sopenharmony_ci	addi	r3,r3,8
42862306a36Sopenharmony_ci	addi	r4,r4,8
42962306a36Sopenharmony_ci	bne     cr0,.LcmpAB_lightweight
43062306a36Sopenharmony_ci	addi	r5,r5,-8
43162306a36Sopenharmony_ci	bdnz	.Lsameoffset_prechk_32B_loop
43262306a36Sopenharmony_ci
	/* cr1.eq after the macro: fall back to the scalar 32-byte loop */
43362306a36Sopenharmony_ci	ENTER_VMX_OPS
43462306a36Sopenharmony_ci	beq     cr1,.Llong_novmx_cmp
43562306a36Sopenharmony_ci
43662306a36Sopenharmony_ci3:
43762306a36Sopenharmony_ci	/* need to check whether r4 has the same offset with r3
43862306a36Sopenharmony_ci	 * for 16 bytes boundary.
43962306a36Sopenharmony_ci	 */
44062306a36Sopenharmony_ci	xor	r0,r3,r4
44162306a36Sopenharmony_ci	andi.	r0,r0,0xf
44262306a36Sopenharmony_ci	bne	.Ldiffoffset_vmx_cmp_start
44362306a36Sopenharmony_ci
44462306a36Sopenharmony_ci	/* len is no less than 4KB. Need to align with 16 bytes further.
44562306a36Sopenharmony_ci	 */
	/*
	 * The andi. is only there for cr0 (is bit 3 of s1 set?); rA is
	 * overwritten by the load that follows, and the load does not
	 * change cr0, so the beq still tests the andi. result.
	 */
44662306a36Sopenharmony_ci	andi.	rA,r3,8
44762306a36Sopenharmony_ci	LD	rA,0,r3
44862306a36Sopenharmony_ci	beq	4f
44962306a36Sopenharmony_ci	LD	rB,0,r4
45062306a36Sopenharmony_ci	cmpld	cr0,rA,rB
45162306a36Sopenharmony_ci	addi	r3,r3,8
45262306a36Sopenharmony_ci	addi	r4,r4,8
45362306a36Sopenharmony_ci	addi	r5,r5,-8
45462306a36Sopenharmony_ci
45562306a36Sopenharmony_ci	beq	cr0,4f
45662306a36Sopenharmony_ci	/* save and restore cr0 */
	/*
	 * cr0 is carried across the call in r5, which EXIT_VMX_OPS saves
	 * and reloads; the length in r5 is no longer needed because this
	 * path returns through .LcmpAB_lightweight.
	 */
45762306a36Sopenharmony_ci	mfocrf  r5,128
45862306a36Sopenharmony_ci	EXIT_VMX_OPS
45962306a36Sopenharmony_ci	mtocrf  128,r5
46062306a36Sopenharmony_ci	b	.LcmpAB_lightweight
46162306a36Sopenharmony_ci
46262306a36Sopenharmony_ci4:
46362306a36Sopenharmony_ci	/* compare 32 bytes for each loop */
	/* CTR = number of 32-byte chunks, r5 = length & 31 */
46462306a36Sopenharmony_ci	srdi	r0,r5,5
46562306a36Sopenharmony_ci	mtctr	r0
46662306a36Sopenharmony_ci	clrldi  r5,r5,59
46762306a36Sopenharmony_ci	li	off16,16
46862306a36Sopenharmony_ci
46962306a36Sopenharmony_ci.balign 16
	/*
	 * Two 16-byte vector compares per pass. The record-form compare
	 * sets the first bit of cr6 (tested here as "lt") only when every
	 * element compared equal, so "bnl cr6" leaves the loop as soon as a
	 * quadword differs. r3/r4 are advanced only after both halves
	 * matched.
	 */
47062306a36Sopenharmony_ci5:
47162306a36Sopenharmony_ci	lvx 	v0,0,r3
47262306a36Sopenharmony_ci	lvx 	v1,0,r4
47362306a36Sopenharmony_ci	VCMPEQUD_RC(v0,v0,v1)
47462306a36Sopenharmony_ci	bnl	cr6,7f
47562306a36Sopenharmony_ci	lvx 	v0,off16,r3
47662306a36Sopenharmony_ci	lvx 	v1,off16,r4
47762306a36Sopenharmony_ci	VCMPEQUD_RC(v0,v0,v1)
47862306a36Sopenharmony_ci	bnl	cr6,6f
47962306a36Sopenharmony_ci	addi	r3,r3,32
48062306a36Sopenharmony_ci	addi	r4,r4,32
48162306a36Sopenharmony_ci	bdnz	5b
48262306a36Sopenharmony_ci
	/* all chunks matched: leave VMX and finish the last r5 (< 32) bytes */
48362306a36Sopenharmony_ci	EXIT_VMX_OPS
48462306a36Sopenharmony_ci	cmpdi	r5,0
48562306a36Sopenharmony_ci	beq	.Lzero
48662306a36Sopenharmony_ci	b	.Lcmp_lt32bytes
48762306a36Sopenharmony_ci
	/* difference is in the second quadword of the chunk: step onto it */
48862306a36Sopenharmony_ci6:
48962306a36Sopenharmony_ci	addi	r3,r3,16
49062306a36Sopenharmony_ci	addi	r4,r4,16
49162306a36Sopenharmony_ci
49262306a36Sopenharmony_ci7:
49362306a36Sopenharmony_ci	/* diff the last 16 bytes */
	/*
	 * r3/r4 point at the quadword that differed; compare it again as
	 * two double words to obtain the ordering for the return value.
	 */
49462306a36Sopenharmony_ci	EXIT_VMX_OPS
49562306a36Sopenharmony_ci	LD	rA,0,r3
49662306a36Sopenharmony_ci	LD	rB,0,r4
49762306a36Sopenharmony_ci	cmpld	cr0,rA,rB
49862306a36Sopenharmony_ci	li	off8,8
49962306a36Sopenharmony_ci	bne	cr0,.LcmpAB_lightweight
50062306a36Sopenharmony_ci
50162306a36Sopenharmony_ci	LD	rA,off8,r3
50262306a36Sopenharmony_ci	LD	rB,off8,r4
50362306a36Sopenharmony_ci	cmpld	cr0,rA,rB
50462306a36Sopenharmony_ci	bne	cr0,.LcmpAB_lightweight
50562306a36Sopenharmony_ci	b	.Lzero
50662306a36Sopenharmony_ci#endif
50762306a36Sopenharmony_ci
.Ldiffoffset_8bytes_make_align_start:
	/*
	 * s1 (r3) and s2 (r4) sit at different offsets within a double word
	 * and r5 is at least 8 (shorter requests never reach .Lno_short).
	 * Compare the bytes in front of the next 8-byte boundary of s1 so
	 * that every later load from s1 is aligned.
	 *
	 * CR0 has to be recomputed here. On arrival it still holds the
	 * "offsets differ" result from function entry, and rlwinm is not a
	 * record-form instruction, so it does not update CR0. Without the
	 * andi. the beq below can never be taken and an s1 that is already
	 * aligned always runs through the realignment compare.
	 */
	andi.	r6,r3,7			/* CR0.eq <=> s1 already 8-byte aligned */
	rlwinm  r6,r3,3,26,28		/* r6 = (s1 & 7) * 8: bits to discard */
	beq     .Ldiffoffset_align_s1_8bytes

	clrrdi	r3,r3,3			/* round s1 down to its double word */
	LD	rA,0,r3
	LD	rB,0,r4  /* unaligned load */
	sld	rA,rA,r6		/* drop the bytes in front of s1 ... */
	srd	rA,rA,r6		/* ... and right-justify what is left */
	srd	rB,rB,r6		/* keep the same number of leading s2 bytes */
	cmpld	cr0,rA,rB
	srwi	r6,r6,3			/* bit count back to a byte offset */
	bne	cr0,.LcmpAB_lightweight

	subfic  r6,r6,8			/* r6 = bytes just compared (8 - offset) */
	subf.	r5,r6,r5		/* take them off the length */
	addi	r3,r3,8			/* s1: next aligned double word */
	add	r4,r4,r6		/* s2: advance by the compared bytes only */

	beq	.Lzero			/* nothing left to compare */
52962306a36Sopenharmony_ci
53062306a36Sopenharmony_ci.Ldiffoffset_align_s1_8bytes:
53162306a36Sopenharmony_ci	/* now s1 is aligned with 8 bytes. */
53262306a36Sopenharmony_ci#ifdef CONFIG_ALTIVEC
53362306a36Sopenharmony_ciBEGIN_FTR_SECTION
53462306a36Sopenharmony_ci	/* only do vmx ops when the size equal or greater than 4K bytes */
53562306a36Sopenharmony_ci	cmpdi	cr5,r5,VMX_THRESH
53662306a36Sopenharmony_ci	bge	cr5,.Ldiffoffset_vmx_cmp
53762306a36Sopenharmony_ciEND_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
53862306a36Sopenharmony_ci
53962306a36Sopenharmony_ci.Ldiffoffset_novmx_cmp:
54062306a36Sopenharmony_ci#endif
54162306a36Sopenharmony_ci
54262306a36Sopenharmony_ci
	/*
	 * Scalar continuation, shared with the same-offset code: 31 bytes
	 * or fewer go to .Lcmp_lt32bytes, anything longer goes to the
	 * 32-byte loop. With CONFIG_ALTIVEC the jump targets
	 * .Llong_novmx_cmp, which lies after the VMX threshold test at
	 * .Llong, so that test is not repeated.
	 */
54362306a36Sopenharmony_ci	cmpdi   cr5,r5,31
54462306a36Sopenharmony_ci	ble	cr5,.Lcmp_lt32bytes
54562306a36Sopenharmony_ci
54662306a36Sopenharmony_ci#ifdef CONFIG_ALTIVEC
54762306a36Sopenharmony_ci	b	.Llong_novmx_cmp
54862306a36Sopenharmony_ci#else
54962306a36Sopenharmony_ci	b	.Llong
55062306a36Sopenharmony_ci#endif
55162306a36Sopenharmony_ci
55262306a36Sopenharmony_ci#ifdef CONFIG_ALTIVEC
55362306a36Sopenharmony_ci.Ldiffoffset_vmx_cmp:
55462306a36Sopenharmony_ci	/* perform a 32 bytes pre-checking before
55562306a36Sopenharmony_ci	 * enable VMX operations.
55662306a36Sopenharmony_ci	 */
	/* four double words = 32 bytes; r5 shrinks by 8 per matching pair */
55762306a36Sopenharmony_ci	li	r0,4
55862306a36Sopenharmony_ci	mtctr	r0
55962306a36Sopenharmony_ci.Ldiffoffset_prechk_32B_loop:
56062306a36Sopenharmony_ci	LD	rA,0,r3
56162306a36Sopenharmony_ci	LD	rB,0,r4
56262306a36Sopenharmony_ci	cmpld	cr0,rA,rB
56362306a36Sopenharmony_ci	addi	r3,r3,8
56462306a36Sopenharmony_ci	addi	r4,r4,8
56562306a36Sopenharmony_ci	bne     cr0,.LcmpAB_lightweight
56662306a36Sopenharmony_ci	addi	r5,r5,-8
56762306a36Sopenharmony_ci	bdnz	.Ldiffoffset_prechk_32B_loop
56862306a36Sopenharmony_ci
	/* cr1.eq after the macro: continue with the scalar code instead */
56962306a36Sopenharmony_ci	ENTER_VMX_OPS
57062306a36Sopenharmony_ci	beq     cr1,.Ldiffoffset_novmx_cmp
57162306a36Sopenharmony_ci
	/*
	 * Also entered from the same-offset VMX code when s1 and s2 turn
	 * out to have different offsets within 16 bytes.
	 */
57262306a36Sopenharmony_ci.Ldiffoffset_vmx_cmp_start:
57362306a36Sopenharmony_ci	/* Firstly try to align r3 with 16 bytes */
	/* r6 = s1 & 15; off16 is needed by LD_VSR_CROSS16B below */
57462306a36Sopenharmony_ci	andi.   r6,r3,0xf
57562306a36Sopenharmony_ci	li	off16,16
57662306a36Sopenharmony_ci	beq     .Ldiffoffset_vmx_s1_16bytes_align
57762306a36Sopenharmony_ci
	/*
	 * Build one unaligned 16-byte value per buffer: v3/v4 are the
	 * permute masks derived from the two addresses, v5/v6 the first
	 * aligned quadwords, and LD_VSR_CROSS16B loads the following
	 * quadwords (v7/v8) and merges them into v9 (s1) and v10 (s2).
	 */
57862306a36Sopenharmony_ci	LVS	v3,0,r3
57962306a36Sopenharmony_ci	LVS	v4,0,r4
58062306a36Sopenharmony_ci
58162306a36Sopenharmony_ci	lvx     v5,0,r3
58262306a36Sopenharmony_ci	lvx     v6,0,r4
58362306a36Sopenharmony_ci	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
58462306a36Sopenharmony_ci	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
58562306a36Sopenharmony_ci
	/* "bnl cr6": not every byte of v9 and v10 compared equal */
58662306a36Sopenharmony_ci	VCMPEQUB_RC(v7,v9,v10)
58762306a36Sopenharmony_ci	bnl	cr6,.Ldiffoffset_vmx_diff_found
58862306a36Sopenharmony_ci
	/* step both pointers by 16 - (s1 & 15) so that s1 becomes aligned */
58962306a36Sopenharmony_ci	subfic  r6,r6,16
59062306a36Sopenharmony_ci	subf    r5,r6,r5
59162306a36Sopenharmony_ci	add     r3,r3,r6
59262306a36Sopenharmony_ci	add     r4,r4,r6
59362306a36Sopenharmony_ci
59462306a36Sopenharmony_ci.Ldiffoffset_vmx_s1_16bytes_align:
59562306a36Sopenharmony_ci	/* now s1 is aligned with 16 bytes */
	/*
	 * v6 = first aligned quadword of s2 and v4 = permute mask for the
	 * current s2 address; CTR = number of 32-byte chunks,
	 * r5 = length & 31.
	 */
59662306a36Sopenharmony_ci	lvx     v6,0,r4
59762306a36Sopenharmony_ci	LVS	v4,0,r4
59862306a36Sopenharmony_ci	srdi	r6,r5,5  /* loop for 32 bytes each */
59962306a36Sopenharmony_ci	clrldi  r5,r5,59
60062306a36Sopenharmony_ci	mtctr	r6
60162306a36Sopenharmony_ci
60262306a36Sopenharmony_ci.balign	16
60362306a36Sopenharmony_ci.Ldiffoffset_vmx_32bytesloop:
60462306a36Sopenharmony_ci	/* the first qw of r4 was saved in v6 */
	/*
	 * Each half compares one aligned quadword of s1 (v9) with the
	 * permuted s2 data (v10). "vor v6,v8,v8" copies the s2 quadword
	 * just loaded into v6 so that it serves as the first quadword of
	 * the next step. r3/r4 are advanced only after a match, so on a
	 * mismatch they still address the 16 bytes that differ.
	 */
60562306a36Sopenharmony_ci	lvx	v9,0,r3
60662306a36Sopenharmony_ci	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
60762306a36Sopenharmony_ci	VCMPEQUB_RC(v7,v9,v10)
60862306a36Sopenharmony_ci	vor	v6,v8,v8
60962306a36Sopenharmony_ci	bnl	cr6,.Ldiffoffset_vmx_diff_found
61062306a36Sopenharmony_ci
61162306a36Sopenharmony_ci	addi	r3,r3,16
61262306a36Sopenharmony_ci	addi	r4,r4,16
61362306a36Sopenharmony_ci
61462306a36Sopenharmony_ci	lvx	v9,0,r3
61562306a36Sopenharmony_ci	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
61662306a36Sopenharmony_ci	VCMPEQUB_RC(v7,v9,v10)
61762306a36Sopenharmony_ci	vor	v6,v8,v8
61862306a36Sopenharmony_ci	bnl	cr6,.Ldiffoffset_vmx_diff_found
61962306a36Sopenharmony_ci
62062306a36Sopenharmony_ci	addi	r3,r3,16
62162306a36Sopenharmony_ci	addi	r4,r4,16
62262306a36Sopenharmony_ci
62362306a36Sopenharmony_ci	bdnz	.Ldiffoffset_vmx_32bytesloop
62462306a36Sopenharmony_ci
	/* all chunks matched: leave VMX and finish the last r5 (< 32) bytes */
62562306a36Sopenharmony_ci	EXIT_VMX_OPS
62662306a36Sopenharmony_ci
62762306a36Sopenharmony_ci	cmpdi	r5,0
62862306a36Sopenharmony_ci	beq	.Lzero
62962306a36Sopenharmony_ci	b	.Lcmp_lt32bytes
63062306a36Sopenharmony_ci
	/*
	 * A vector compare reported a difference. Let the scalar code
	 * compare the 16 bytes at r3/r4 again to work out the ordering.
	 */
63162306a36Sopenharmony_ci.Ldiffoffset_vmx_diff_found:
63262306a36Sopenharmony_ci	EXIT_VMX_OPS
63362306a36Sopenharmony_ci	/* anyway, the diff will appear in next 16 bytes */
63462306a36Sopenharmony_ci	li	r5,16
63562306a36Sopenharmony_ci	b	.Lcmp_lt32bytes
63662306a36Sopenharmony_ci
63762306a36Sopenharmony_ci#endif
63862306a36Sopenharmony_ciEXPORT_SYMBOL(memcmp)
639