/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
 */
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/ppc-opcode.h>

/* Index registers holding the constant byte offsets 8/16/24 for X-form loads. */
#define off8	r6
#define off16	r7
#define off24	r8

/*
 * Data registers.  rA..rC are used on every path.  rD..rH (r27..r31) are
 * only used by the 32-bytes-per-iteration loop (.Llong_novmx_cmp), which
 * saves them below the stack pointer on entry and reloads them on exit.
 */
#define rA	r9
#define rB	r10
#define rC	r11
#define rD	r27
#define rE	r28
#define rF	r29
#define rG	r30
#define rH	r31

/*
 * Multi-byte loads are done so that the byte at the lowest address ends up
 * in the most significant position of the register (byte-reversed loads on
 * little endian).  An unsigned register compare (cmpld) then orders two
 * buffers the same way a byte-by-byte comparison would.
 */
#ifdef __LITTLE_ENDIAN__
#define LH	lhbrx
#define LW	lwbrx
#define LD	ldbrx
#define LVS	lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH	lhzx
#define LW	lwzx
#define LD	ldx
#define LVS	lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#endif

/* Minimum remaining length (bytes) for which the VMX loops are attempted. */
#define VMX_THRESH 4096

/*
 * ENTER_VMX_OPS: call enter_vmx_ops() while preserving r3, r4, r5 and LR.
 *
 * r3-r5 are stored into slots of the frame that is about to be created,
 * LR is stored at 16(r1) of the current frame, then a STACKFRAMESIZE frame
 * is pushed around the call and popped afterwards.
 *
 * Result: cr1 holds the comparison of the call's return value with 0.
 * The callers branch to their non-VMX path on "beq cr1".
 * Clobbers: r0, plus whatever enter_vmx_ops() itself clobbers.
 */
#define ENTER_VMX_OPS	\
	mflr	r0;	\
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	enter_vmx_ops; \
	cmpwi	cr1,r3,0; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

/*
 * EXIT_VMX_OPS: call exit_vmx_ops() while preserving r3, r4, r5 and LR.
 *
 * Same save/restore sequence as ENTER_VMX_OPS, but the return value of the
 * call is ignored.  Condition registers are not preserved by this macro; a
 * caller that needs cr0 across it must save and restore it itself.
 * Clobbers: r0, plus whatever exit_vmx_ops() itself clobbers.
 */
#define EXIT_VMX_OPS \
	mflr	r0; \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	exit_vmx_ops; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

/*
 * LD_VSR_CROSS16B loads the 2nd aligned 16 bytes for _vaddr, which is not
 * aligned to a 16 bytes boundary, and permutes it with the 1st 16 bytes.
 *
 *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 *    ^                                  ^                                 ^
 * 0xbbbb10                          0xbbbb20                          0xbbbb30
 *                                 ^
 *                                _vaddr
 *
 * _vmask is the mask generated by LVS.
 * _v1st_qw is the 1st aligned QW of the current addr, which is already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of the current _vaddr, loaded by this macro.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *   for example: 0x0123456789abcdef for big endian.
 *
 * The caller must have set off16 (r7) to 16 before using this macro.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
	lvx	_v2nd_qw,_vaddr,off16; \
	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)

/*
 * memcmp
 *
 * In:	r3 = s1, r4 = s2, r5 = number of bytes to compare
 * Out:	r3 = 0 if the buffers are equal.
 *	     Byte loop (.Lshort): the difference of the first mismatching
 *	     bytes (s1 byte - s2 byte).
 *	     Doubleword/VMX paths: 1 if s1 > s2, -1 if s1 < s2.
 *
 * There are 2 categories for memcmp:
 * 1) src/dst have the same offset to the 8 bytes boundary. The handlers
 *    are named like .Lsameoffset_xxxx
 * 2) src/dst have different offsets to the 8 bytes boundary. The handlers
 *    are named like .Ldiffoffset_xxxx
 */
_GLOBAL_TOC(memcmp)
	cmpdi	cr1,r5,0

	/*
	 * cr0 records whether s1 and s2 have different offsets within an
	 * 8 bytes block.  It is consumed by the bne at .Lno_short.
	 */
	xor	r6,r3,r4
	andi.	r6,r6,7

	/* Use the byte loop for lengths of 7 bytes or less. */
	cmpdi	cr6,r5,7

	beq	cr1,.Lzero
	bgt	cr6,.Lno_short

.Lshort:
	/*
	 * Byte-by-byte compare, unrolled 4 times.  CTR counts the remaining
	 * bytes, so it is decremented once per byte compared.
	 */
	mtctr	r5
1:	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero

	addi	r3,r3,4
	addi	r4,r4,4

	bdnz	1b

.Lzero:
	li	r3,0
	blr

.Lno_short:
	dcbt	0,r3
	dcbt	0,r4
	/* cr0 is still the result of the offset check at entry. */
	bne	.Ldiffoffset_8bytes_make_align_start


.Lsameoffset_8bytes_make_align_start:
	/* Attempt to compare the bytes that are not aligned to 8 bytes so
	 * that the rest of the comparison can run on 8 bytes alignment.
	 */
	andi.	r6,r3,7

	/* Try to compare the first double word which is not 8 bytes aligned:
	 * load the first double word at (src & ~7UL) and shift left the
	 * appropriate number of bits before the comparison.
	 * r6 = (src & 7) * 8; cr0 is still the result of the andi. above.
	 */
	rlwinm	r6,r3,3,26,28
	beq	.Lsameoffset_8bytes_aligned
	clrrdi	r3,r3,3
	clrrdi	r4,r4,3
	LD	rA,0,r3
	LD	rB,0,r4
	sld	rA,rA,r6
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight
	/* r6 = number of bytes just compared (8 - offset) */
	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero

.Lsameoffset_8bytes_aligned:
	/* Now we are aligned to 8 bytes.
	 * Use the .Llong loop if the remaining length is 32 bytes or more.
	 */
	cmpdi	cr6,r5,31
	bgt	cr6,.Llong

.Lcmp_lt32bytes:
	/* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */
	cmpdi	cr5,r5,7
	srdi	r0,r5,3
	ble	cr5,.Lcmp_rest_lt8bytes

	/* handle 8 ~ 31 bytes: r0 = double words, r5 = trailing bytes */
	clrldi	r5,r5,61
	mtctr	r0
2:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	2b

	cmpwi	r5,0
	beq	.Lzero

.Lcmp_rest_lt8bytes:
	/*
	 * Here we have less than 8 bytes to compare. At least s1 is aligned to
	 * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
	 * page boundary, otherwise we might read past the end of the buffer and
	 * trigger a page fault. We use 4K as the conservative minimum page
	 * size. If we detect that case we go to the byte-by-byte loop.
	 *
	 * Otherwise the next double word is loaded from s1 and s2, and shifted
	 * right to compare the appropriate bits.
	 */
	clrldi	r6,r4,(64-12)	/* r6 = r4 & 0xfff */
	cmpdi	r6,0xff8
	bgt	.Lshort

	/* r6 = (8 - remaining bytes) * 8 = number of bits to discard */
	subfic	r6,r5,8
	slwi	r6,r6,3
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero

.Lnon_zero:
	/* Byte loop mismatch: return the byte difference computed in rC. */
	mr	r3,rC
	blr

.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Try to use vmx loop if length is equal or greater than 4K */
	cmpldi	cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
	/*
	 * 32 bytes per iteration, software pipelined: the loads of one group
	 * of 4 double words overlap with the compares of the previous group,
	 * whose results are kept in cr0/cr1/cr6/cr7.
	 *
	 * At least s1 addr is aligned with 8 bytes.
	 */
	li	off8,8
	li	off16,16
	li	off24,24

	/* r27-r31 back rD..rH; save them below the stack pointer. */
	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)

	/* CTR = number of 32 bytes groups, r5 = trailing bytes */
	srdi	r0,r5,5
	mtctr	r0
	andi.	r5,r5,31

	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32

	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

.Lsecond32:
	/* Drain the pipeline: check the results still in flight. */
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

.Ltail:
	/* All full groups matched: restore r27-r31, then finish the tail. */
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)

	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort

.Lfirst32:
	/* Only one 32 bytes group: nothing was overlapped, compare it all. */
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail

	/*
	 * Mismatch exits of the 32 bytes loop: turn the unsigned compare
	 * result of the mismatching pair into 1 / -1, then restore r27-r31.
	 */
.LcmpAB:
	li	r3,1
	bgt	cr0,.Lout
	li	r3,-1
	b	.Lout

.LcmpCD:
	li	r3,1
	bgt	cr1,.Lout
	li	r3,-1
	b	.Lout

.LcmpEF:
	li	r3,1
	bgt	cr6,.Lout
	li	r3,-1
	b	.Lout

.LcmpGH:
	li	r3,1
	bgt	cr7,.Lout
	li	r3,-1

.Lout:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	blr

.LcmpAB_lightweight:   /* skip NV GPRS restore */
	/* cr0 holds the unsigned compare of the mismatching rA/rB pair. */
	li	r3,1
	bgtlr
	li	r3,-1
	blr

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Enter with src/dst addrs having the same offset to the 8 bytes
	 * align boundary.
	 *
	 * There is an optimization based on the following fact: memcmp()
	 * tends to fail early, within the first 32 bytes.
	 * Before applying VMX instructions which will lead to 32x128bits
	 * VMX regs load/restore penalty, we compare the first 32 bytes
	 * so that we can catch the ~80% fail cases.
	 */

	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

	ENTER_VMX_OPS
	/* cr1 "eq" means the call returned 0: use the non-VMX loop. */
	beq	cr1,.Llong_novmx_cmp

3:
	/* need to check whether r4 has the same offset with r3
	 * for 16 bytes boundary.
	 */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* len is no less than 4KB. Need to align with 16 bytes further.
	 */
	andi.	rA,r3,8
	LD	rA,0,r3
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f
	/* Keep cr0 across EXIT_VMX_OPS; the macro preserves r5. */
	mfocrf	r5,128
	EXIT_VMX_OPS
	mtocrf	128,r5
	b	.LcmpAB_lightweight

4:
	/* compare 32 bytes for each loop: CTR = groups, r5 = trailing bytes */
	srdi	r0,r5,5
	mtctr	r0
	clrldi	r5,r5,59
	li	off16,16

.balign 16
5:
	lvx	v0,0,r3
	lvx	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f
	lvx	v0,off16,r3
	lvx	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	/* The mismatch is in the second 16 bytes of the group. */
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* diff the last 16 bytes */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
	/*
	 * Now try to align s1 with 8 bytes.
	 *
	 * cr0 must be re-evaluated against s1's own alignment here: on entry
	 * it still holds the "offsets differ" result that brought us to this
	 * label, and rlwinm (non-record form) does not update it.  Without
	 * the andi. the beq below could never be taken.
	 */
	andi.	r6,r3,7
	rlwinm	r6,r3,3,26,28
	beq	.Ldiffoffset_align_s1_8bytes

	/*
	 * Compare the (8 - offset) leading bytes: s1 is loaded from its
	 * aligned-down address and has its leading bytes cleared, s2 is
	 * loaded unaligned and shifted to line up with it.
	 */
	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4  /* unaligned load */
	sld	rA,rA,r6
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight

	/* r6 = number of bytes just compared (8 - offset) */
	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	add	r4,r4,r6

	beq	.Lzero

.Ldiffoffset_align_s1_8bytes:
	/* now s1 is aligned with 8 bytes. */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* only do vmx ops when the size equal or greater than 4K bytes */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif


	cmpdi	cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
	b	.Llong_novmx_cmp
#else
	b	.Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* perform a 32 bytes pre-checking before
	 * enable VMX operations.
	 */
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	ENTER_VMX_OPS
	/* cr1 "eq" means the call returned 0: use the non-VMX path. */
	beq	cr1,.Ldiffoffset_novmx_cmp

.Ldiffoffset_vmx_cmp_start:
	/* Firstly try to align r3 with 16 bytes */
	andi.	r6,r3,0xf
	li	off16,16
	beq	.Ldiffoffset_vmx_s1_16bytes_align

	LVS	v3,0,r3
	LVS	v4,0,r4

	lvx	v5,0,r3
	lvx	v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	/* Advance both pointers by (16 - offset) so that r3 is 16B aligned. */
	subfic	r6,r6,16
	subf	r5,r6,r5
	add	r3,r3,r6
	add	r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
	/* now s1 is aligned with 16 bytes */
	lvx	v6,0,r4
	LVS	v4,0,r4
	srdi	r6,r5,5  /* loop for 32 bytes each */
	clrldi	r5,r5,59
	mtctr	r6

.balign	16
.Ldiffoffset_vmx_32bytesloop:
	/* the first qw of r4 was saved in v6 */
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	/* The 2nd qw just loaded becomes the 1st qw of the next step. */
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	EXIT_VMX_OPS
	/*
	 * The diff is somewhere in the next 16 bytes: let the scalar code
	 * locate it and produce the return value.
	 */
	li	r5,16
	b	.Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)