/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
 */
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/ppc-opcode.h>

/*
 * Index registers that hold the constants 8, 16 and 24 for the indexed
 * loads. They are only valid after the corresponding "li offN,N"; r6 in
 * particular is also used as a plain scratch register elsewhere in this file.
 */
#define off8	r6
#define off16	r7
#define off24	r8

/*
 * Data registers for the compare loops.
 * rA..rC live in r9..r11. rD..rH live in r27..r31, which this file saves
 * before use and restores afterwards (see .Llong_novmx_cmp, .Ltail and
 * .Lout), so only code bracketed by that save/restore may touch them.
 */
#define rA	r9
#define rB	r10
#define rC	r11
#define rD	r27
#define rE	r28
#define rF	r29
#define rG	r30
#define rH	r31

/*
 * Endian-dependent load helpers.
 *
 * On little-endian the byte-reversing indexed loads are selected, so that in
 * both configurations the byte at the lowest address ends up in the most
 * significant position of the register. The code below relies on that: it
 * decides the result with an unsigned register compare (cmpld) and trims
 * leading/trailing bytes with sld/srd.
 *
 * LVS/VPERM pick the permute-control generator and the vperm operand order
 * used by LD_VSR_CROSS16B; on little-endian lvsr is used and the two source
 * vectors of vperm are swapped.
 */
#ifdef __LITTLE_ENDIAN__
#define LH	lhbrx
#define LW	lwbrx
#define LD	ldbrx
#define LVS	lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH	lhzx
#define LW	lwzx
#define LD	ldx
#define LVS	lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#endif

/*
 * Remaining length (in bytes) at or above which the VMX compare paths are
 * attempted (see .Llong and .Ldiffoffset_align_s1_8bytes).
 */
#define VMX_THRESH 4096

/*
 * ENTER_VMX_OPS: call enter_vmx_ops() while keeping r3, r4, r5 and LR intact.
 *
 * r3/r4/r5 are stored into the STK_REG(R31)/STK_REG(R30)/STK_REG(R29) slots of
 * the frame that is about to be pushed, LR is stored at 16(r1) of the current
 * frame, and a STACKFRAMESIZE frame is pushed around the call and popped
 * again afterwards.
 *
 * Result: cr1 holds the comparison of enter_vmx_ops()'s return value with 0.
 * The users in this file treat "cr1 eq" (return value 0) as "do not use VMX"
 * and branch to a non-VMX path.
 *
 * Clobbers r0 and cr1, plus whatever enter_vmx_ops() itself clobbers.
 */
#define ENTER_VMX_OPS	\
	mflr    r0;	\
	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std     r0,16(r1); \
	stdu    r1,-STACKFRAMESIZE(r1); \
	bl      enter_vmx_ops; \
	cmpwi   cr1,r3,0; \
	ld      r0,STACKFRAMESIZE+16(r1); \
	ld      r3,STK_REG(R31)(r1); \
	ld      r4,STK_REG(R30)(r1); \
	ld      r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr    r0

/*
 * EXIT_VMX_OPS: call exit_vmx_ops() while keeping r3, r4, r5 and LR intact.
 *
 * Uses the same save/restore scheme as ENTER_VMX_OPS. The return value of
 * exit_vmx_ops() is discarded because r3 is reloaded from the stack.
 *
 * Clobbers r0 plus whatever exit_vmx_ops() itself clobbers. Users in this
 * file that need a condition field to survive the call save it explicitly
 * (mfocrf/mtocrf) or recompute it afterwards.
 */
#define EXIT_VMX_OPS \
	mflr    r0; \
	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std     r0,16(r1); \
	stdu    r1,-STACKFRAMESIZE(r1); \
	bl      exit_vmx_ops; \
	ld      r0,STACKFRAMESIZE+16(r1); \
	ld      r3,STK_REG(R31)(r1); \
	ld      r4,STK_REG(R30)(r1); \
	ld      r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr    r0

/*
 * LD_VSR_CROSS16B load the 2nd 16 bytes for _vaddr which is unaligned with
 * 16 bytes boundary and permute the result with the 1st 16 bytes.
 *
 *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 *    ^                                  ^                                 ^
 * 0xbbbb10                          0xbbbb20                          0xbbbb30
 *                                 ^
 *                                _vaddr
 *
 *
 * _vmask is the mask generated by LVS
 * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *   for example: 0x0123456789abcdef for big endian.
 *
 * Implicit input: off16 must already hold 16 (the macro loads from
 * _vaddr + off16). _v2nd_qw is overwritten with the quadword that was loaded.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
        lvx     _v2nd_qw,_vaddr,off16; \
        VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)

/*
 * memcmp: compare two byte ranges.
 *
 * In:   r3 = s1, r4 = s2, r5 = number of bytes to compare
 * Out:  r3 = 0 if all bytes are equal. Otherwise the sign of r3 tells which
 *       range has the larger first differing byte (positive: s1): the byte
 *       loop (.Lshort) returns the byte difference, the doubleword and VMX
 *       paths return 1 or -1.
 *
 * There are 2 categories for memcmp:
 * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
 * are named like .Lsameoffset_xxxx
 * 2) src/dst has different offset to the 8 bytes boundary. The handlers
 * are named like .Ldiffoffset_xxxx
 */
_GLOBAL_TOC(memcmp)
	cmpdi	cr1,r5,0		/* cr1: is the length zero? */

	/* cr0.eq <=> s1 and s2 have the same offset within an 8 byte
	 * doubleword. cr0 is not tested here; it is consumed by the bne at
	 * .Lno_short to choose the same-offset or different-offset handlers.
	 */
	xor	r6,r3,r4
	andi.	r6,r6,7

	/* Lengths of 7 bytes or less are always handled by the byte loop
	 * at .Lshort, which directly follows this entry sequence.
	 */
	cmpdi   cr6,r5,7

	beq	cr1,.Lzero
	bgt	cr6,.Lno_short

.Lshort:
	/* Byte-at-a-time compare of r5 bytes at r3 (s1) and r4 (s2).
	 * CTR counts the bytes still to compare, so r5 must be non-zero on
	 * entry; the branches to .Lshort in this file are preceded by a
	 * zero-length check. The loop body is unrolled four times.
	 * On the first mismatch rC = (s1 byte) - (s2 byte) is returned through
	 * .Lnon_zero; when CTR runs out, control reaches .Lzero.
	 */
	mtctr	r5
1:	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA		/* rC = rA - rB, cr0 = rC vs 0 */
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero

	addi	r3,r3,4
	addi	r4,r4,4

	/* Counts the fourth byte; falls through to .Lzero when done. */
	bdnz	1b

.Lzero:
	/* All compared bytes were equal. */
	li	r3,0
	blr

.Lno_short:
	/* Length is at least 8 bytes. Touch the first cache block of both
	 * buffers, then dispatch on cr0 as set by the "andi." in the memcmp
	 * entry sequence: "ne" means s1 and s2 have different offsets within
	 * an 8 byte doubleword. Otherwise fall through to the same-offset
	 * handlers.
	 */
	dcbt	0,r3
	dcbt	0,r4
	bne	.Ldiffoffset_8bytes_make_align_start


.Lsameoffset_8bytes_make_align_start:
	/* attempt to compare bytes not aligned with 8 bytes so that
	 * rest comparison can run based on 8 bytes alignment.
	 */
	andi.   r6,r3,7			/* cr0.eq <=> s1 already 8 byte aligned */

	/* Try to compare the first double word which is not 8 bytes aligned:
	 * load the first double word at (src & ~7UL) and shift left appropriate
	 * bits before comparison.
	 */
	rlwinm  r6,r3,3,26,28		/* r6 = (r3 & 7) * 8, a bit count; cr0 untouched */
	beq     .Lsameoffset_8bytes_aligned
	clrrdi	r3,r3,3			/* round both pointers down to 8 bytes */
	clrrdi	r4,r4,3
	LD	rA,0,r3
	LD	rB,0,r4
	sld	rA,rA,r6		/* shift out the bytes in front of s1/s2 */
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3			/* r6 = misalignment in bytes */
	bne	cr0,.LcmpAB_lightweight
	subfic  r6,r6,8			/* r6 = number of bytes just compared */
	subf.	r5,r6,r5		/* r5 = bytes left */
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero

.Lsameoffset_8bytes_aligned:
	/* now we are aligned with 8 bytes.
	 * Use .Llong loop if left cmp bytes are equal or greater than 32B.
	 * Otherwise fall through to .Lcmp_lt32bytes.
	 */
	cmpdi   cr6,r5,31
	bgt	cr6,.Llong

.Lcmp_lt32bytes:
	/* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now.
	 * r0 = number of whole doublewords, r5 is reduced to the 0..7 bytes
	 * that remain after them.
	 */
	cmpdi   cr5,r5,7
	srdi    r0,r5,3
	ble	cr5,.Lcmp_rest_lt8bytes

	/* handle 8 ~ 31 bytes */
	clrldi  r5,r5,61		/* r5 = r5 & 7 */
	mtctr   r0
2:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	2b

	/* Nothing left after the whole doublewords: buffers are equal.
	 * Otherwise fall through with 1..7 bytes in r5.
	 */
	cmpwi   r5,0
	beq	.Lzero

.Lcmp_rest_lt8bytes:
	/*
	 * Here we have less than 8 bytes to compare. At least s1 is aligned to
	 * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
	 * page boundary, otherwise we might read past the end of the buffer and
	 * trigger a page fault. We use 4K as the conservative minimum page
	 * size. If we detect that case we go to the byte-by-byte loop.
	 *
	 * Otherwise the next double word is loaded from s1 and s2, and shifted
	 * right to compare the appropriate bits.
	 */
	clrldi	r6,r4,(64-12)	// r6 = r4 & 0xfff
	cmpdi	r6,0xff8
	bgt	.Lshort

	subfic  r6,r5,8			/* r6 = 8 - r5: bytes to discard */
	slwi	r6,r6,3			/* ... converted to a bit count */
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6		/* drop the bytes beyond the r5 wanted ones */
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero

.Lnon_zero:
	/* Exit of the byte loop: return the byte difference computed into
	 * rC by .Lshort.
	 */
	mr	r3,rC
	blr

.Llong:
	/* At least 32 bytes to compare; s1 and s2 share the same offset
	 * within a doubleword on this entry.
	 */
#ifdef CONFIG_ALTIVEC
	/* NOTE(review): the feature section is presumably patched out on CPUs
	 * that lack CPU_FTR_ARCH_207S - confirm against the feature-fixup
	 * macros.
	 */
BEGIN_FTR_SECTION
	/* Try to use vmx loop if length is equal or greater than 4K */
	cmpldi  cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
	/* Doubleword compare loop, 32 bytes per iteration, software
	 * pipelined: the loads of one 32 byte chunk are interleaved with the
	 * compares of the previous chunk. Each chunk is four doubleword pairs
	 * (rA/rB, rC/rD, rE/rF, rG/rH) whose compare results live in cr0,
	 * cr1, cr6 and cr7 respectively.
	 *
	 * In:  r3 = s1, r4 = s2, r5 = length (at least 32 at every branch to
	 *      this label in this file).
	 * CTR = number of 32 byte chunks, r5 = the 0..31 bytes left after
	 * them (handled at .Ltail).
	 */
	/* At least s1 addr is aligned with 8 bytes */
	li	off8,8
	li	off16,16
	li	off24,24

	/* rD..rH are r27..r31: save them below the stack pointer without
	 * allocating a frame. They are reloaded at .Ltail and .Lout.
	 */
	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)

	srdi	r0,r5,5
	mtctr	r0
	andi.	r5,r5,31		/* cr0 result unused; overwritten by the cmpld below */

	/* Prologue: load chunk 1 and start its first compare. */
	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32		/* only one chunk: finish it there */

	/* Load chunk 2 while comparing/testing chunk 1. */
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

	/* Steady state. On entry to each iteration cr6/cr7 hold the E/F and
	 * G/H results of the chunk before the previous one still to be
	 * tested, cr0 holds A/B of the previous chunk, and rC..rH hold the
	 * previous chunk's remaining doublewords.
	 */
1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

.Lsecond32:
	/* Drain the pipeline: test the pending E/F and G/H results of the
	 * previous chunk (cr6, cr7), then compare and test what is left of
	 * the last loaded chunk. Falls through to .Ltail if all are equal.
	 */
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

.Ltail:
	/* All 32 byte chunks were equal. Restore r27..r31 saved at
	 * .Llong_novmx_cmp, then compare the r5 (0..31) remaining bytes with
	 * the byte loop.
	 */
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)

	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort

.Lfirst32:
	/* Reached from .Llong_novmx_cmp when there is exactly one 32 byte
	 * chunk: cr0 already holds the rA/rB result, the other three pairs
	 * are loaded but not yet compared.
	 */
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	/* Test in address order so the first differing doubleword wins. */
	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail

3728c2ecf20Sopenharmony_ci.LcmpAB:
3738c2ecf20Sopenharmony_ci	li	r3,1
3748c2ecf20Sopenharmony_ci	bgt	cr0,.Lout
3758c2ecf20Sopenharmony_ci	li	r3,-1
3768c2ecf20Sopenharmony_ci	b	.Lout
3778c2ecf20Sopenharmony_ci
3788c2ecf20Sopenharmony_ci.LcmpCD:
3798c2ecf20Sopenharmony_ci	li	r3,1
3808c2ecf20Sopenharmony_ci	bgt	cr1,.Lout
3818c2ecf20Sopenharmony_ci	li	r3,-1
3828c2ecf20Sopenharmony_ci	b	.Lout
3838c2ecf20Sopenharmony_ci
3848c2ecf20Sopenharmony_ci.LcmpEF:
3858c2ecf20Sopenharmony_ci	li	r3,1
3868c2ecf20Sopenharmony_ci	bgt	cr6,.Lout
3878c2ecf20Sopenharmony_ci	li	r3,-1
3888c2ecf20Sopenharmony_ci	b	.Lout
3898c2ecf20Sopenharmony_ci
3908c2ecf20Sopenharmony_ci.LcmpGH:
3918c2ecf20Sopenharmony_ci	li	r3,1
3928c2ecf20Sopenharmony_ci	bgt	cr7,.Lout
3938c2ecf20Sopenharmony_ci	li	r3,-1
3948c2ecf20Sopenharmony_ci
3958c2ecf20Sopenharmony_ci.Lout:
3968c2ecf20Sopenharmony_ci	ld	r31,-8(r1)
3978c2ecf20Sopenharmony_ci	ld	r30,-16(r1)
3988c2ecf20Sopenharmony_ci	ld	r29,-24(r1)
3998c2ecf20Sopenharmony_ci	ld	r28,-32(r1)
4008c2ecf20Sopenharmony_ci	ld	r27,-40(r1)
4018c2ecf20Sopenharmony_ci	blr
4028c2ecf20Sopenharmony_ci
4038c2ecf20Sopenharmony_ci.LcmpAB_lightweight:   /* skip NV GPRS restore */
4048c2ecf20Sopenharmony_ci	li	r3,1
4058c2ecf20Sopenharmony_ci	bgtlr
4068c2ecf20Sopenharmony_ci	li	r3,-1
4078c2ecf20Sopenharmony_ci	blr
4088c2ecf20Sopenharmony_ci
#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Enter with src/dst addrs has the same offset with 8 bytes
	 * align boundary.
	 *
	 * There is an optimization based on following fact: memcmp()
	 * prones to fail early at the first 32 bytes.
	 * Before applying VMX instructions which will lead to 32x128bits
	 * VMX regs load/restore penalty, we compare the first 32 bytes
	 * so that we can catch the ~80% fail cases.
	 */

	/* Four doubleword compares; r3/r4 advance and r5 shrinks by 8 for
	 * every doubleword that matched.
	 */
	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne     cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

	/* cr1 eq after ENTER_VMX_OPS: enter_vmx_ops() returned 0, so use
	 * the doubleword loop instead.
	 */
	ENTER_VMX_OPS
	beq     cr1,.Llong_novmx_cmp

3:
	/* need to check whether r4 has the same offset with r3
	 * for 16 bytes boundary.
	 */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* len is no less than 4KB. Need to align with 16 bytes further.
	 */
	andi.	rA,r3,8			/* cr0.eq <=> r3 already 16 byte aligned */
	LD	rA,0,r3			/* loaded early; only used if not aligned */
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f
	/* save and restore cr0 */
	/* r5 is free to carry cr0 across the call: the length is no longer
	 * needed on this exit path.
	 */
	mfocrf  r5,128
	EXIT_VMX_OPS
	mtocrf  128,r5
	b	.LcmpAB_lightweight

4:
	/* compare 32 bytes for each loop */
	srdi	r0,r5,5
	mtctr	r0
	clrldi  r5,r5,59		/* r5 = r5 & 31, left for .Lcmp_lt32bytes */
	li	off16,16

.balign 16
5:
	/* NOTE(review): VCMPEQUD_RC comes from <asm/ppc-opcode.h>; it is
	 * presumably the record form of vcmpequd, whose cr6 "lt" bit is set
	 * only when every element compared equal - so "bnl cr6" is taken when
	 * the 16 bytes differ. Confirm against the header.
	 */
	lvx 	v0,0,r3
	lvx 	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f
	lvx 	v0,off16,r3
	lvx 	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	/* All 32 byte chunks matched; finish the 0..31 byte remainder. */
	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	/* The mismatch was in the second quadword of the chunk. */
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* diff the last 16 bytes */
	/* r3/r4 point at the 16 bytes flagged by the vector compare; redo
	 * them as two doubleword compares to produce the return value.
	 */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
	/* s1 and s2 have different offsets within a doubleword; the length
	 * in r5 is at least 8.
	 *
	 * now try to align s1 with 8 bytes
	 *
	 * cr0 must be recomputed here: on entry it still holds the result of
	 * the xor/andi. in the memcmp entry sequence, which is always "ne" on
	 * this path (that is what the bne at .Lno_short tested), and rlwinm
	 * does not update cr0. Test the alignment of s1 explicitly, as
	 * .Lsameoffset_8bytes_make_align_start does, so that an already
	 * aligned s1 skips the partial-doubleword step.
	 */
	andi.	r6,r3,7			/* cr0.eq <=> s1 already 8 byte aligned */
	rlwinm  r6,r3,3,26,28		/* r6 = (r3 & 7) * 8, a bit count; cr0 untouched */
	beq     .Ldiffoffset_align_s1_8bytes

	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4  /* unaligned load */
	sld	rA,rA,r6		/* clear the bytes in front of s1 ... */
	srd	rA,rA,r6		/* ... and right-justify the valid ones */
	srd	rB,rB,r6		/* keep the same number of leading s2 bytes */
	cmpld	cr0,rA,rB
	srwi	r6,r6,3			/* r6 = misalignment of s1 in bytes */
	bne	cr0,.LcmpAB_lightweight

	subfic  r6,r6,8			/* r6 = number of bytes just compared */
	subf.	r5,r6,r5		/* r5 = bytes left */
	addi	r3,r3,8			/* s1 is now 8 byte aligned */
	add	r4,r4,r6

	beq	.Lzero

.Ldiffoffset_align_s1_8bytes:
	/* now s1 is aligned with 8 bytes. */
#ifdef CONFIG_ALTIVEC
	/* NOTE(review): the feature section is presumably patched out on CPUs
	 * that lack CPU_FTR_ARCH_207S - confirm against the feature-fixup
	 * macros.
	 */
BEGIN_FTR_SECTION
	/* only do vmx ops when the size equal or greater than 4K bytes */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif

	/* No VMX: fewer than 32 bytes go to .Lcmp_lt32bytes, everything else
	 * to the 32 byte doubleword loop (s2 is then loaded unaligned).
	 */

	cmpdi   cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
	b	.Llong_novmx_cmp
#else
	b	.Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* perform a 32 bytes pre-checking before
	 * enable VMX operations.
	 */
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne     cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	/* cr1 eq after ENTER_VMX_OPS: enter_vmx_ops() returned 0, so fall
	 * back to the non-VMX path.
	 */
	ENTER_VMX_OPS
	beq     cr1,.Ldiffoffset_novmx_cmp

.Ldiffoffset_vmx_cmp_start:
	/* Also entered from .Lsameoffset_vmx_cmp when s1 and s2 differ in
	 * their offset within 16 bytes. VMX is already enabled here, so every
	 * exit below goes through EXIT_VMX_OPS.
	 */
	/* Firstly try to align r3 with 16 bytes */
	andi.   r6,r3,0xf		/* r6 = offset of s1 within 16 bytes */
	li	off16,16		/* required by LD_VSR_CROSS16B */
	beq     .Ldiffoffset_vmx_s1_16bytes_align

	/* s1 is not 16 byte aligned: assemble the 16 bytes starting at r3
	 * (into v9) and at r4 (into v10) from their two enclosing aligned
	 * quadwords and compare them.
	 */
	LVS	v3,0,r3
	LVS	v4,0,r4

	lvx     v5,0,r3
	lvx     v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	/* NOTE(review): VCMPEQUB_RC comes from <asm/ppc-opcode.h>; it is
	 * presumably the record form of vcmpequb, whose cr6 "lt" bit is set
	 * only when every byte compared equal - so "bnl cr6" is taken when
	 * the 16 bytes differ. Confirm against the header.
	 */
	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	/* Step both pointers forward by 16 - r6 bytes so that r3 becomes
	 * 16 byte aligned, and shrink the length accordingly.
	 */
	subfic  r6,r6,16
	subf    r5,r6,r5
	add     r3,r3,r6
	add     r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
	/* now s1 is aligned with 16 bytes */
	lvx     v6,0,r4			/* first aligned quadword covering s2 */
	LVS	v4,0,r4			/* permute control for s2's misalignment */
	srdi	r6,r5,5  /* loop for 32 bytes each */
	clrldi  r5,r5,59		/* r5 = r5 & 31, left for .Lcmp_lt32bytes */
	mtctr	r6

.balign	16
.Ldiffoffset_vmx_32bytesloop:
	/* the first qw of r4 was saved in v6 */
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8		/* v6 = v8: reuse the 2nd quadword as the next 1st */
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	/* All 32 byte chunks matched; finish the 0..31 byte remainder. */
	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	/* r3/r4 still point at the start of the 16 bytes that differed;
	 * recompare them with doubleword loads to produce the return value.
	 */
	EXIT_VMX_OPS
	/* anyway, the diff will appear in next 16 bytes */
	li	r5,16
	b	.Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)
