/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
 */
#include <linux/export.h>
#include <asm/ppc_asm.h>
#include <asm/ppc-opcode.h>

/* Index registers holding the constant byte offsets 8/16/24. */
#define off8	r6
#define off16	r7
#define off24	r8

/*
 * Data registers.  rA..rC are r9..r11; rD..rH are r27..r31, which the
 * 32-byte integer loop saves to and restores from negative offsets of r1
 * around their use.
 */
#define rA	r9
#define rB	r10
#define rC	r11
#define rD	r27
#define rE	r28
#define rF	r29
#define rG	r30
#define rH	r31

/*
 * On little endian the byte-reversing loads are used so that the first
 * byte in memory always ends up in the most significant position, which
 * lets an unsigned register compare (cmpld) order the buffers bytewise.
 * LVS/VPERM select the matching permute-control flavour for each endian.
 */
#ifdef __LITTLE_ENDIAN__
#define LH	lhbrx
#define LW	lwbrx
#define LD	ldbrx
#define LVS	lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH	lhzx
#define LW	lwzx
#define LD	ldx
#define LVS	lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#endif

/* Minimum length (bytes) before the VMX loops are attempted. */
#define VMX_THRESH 4096

/*
 * ENTER_VMX_OPS: call enter_vmx_ops() while preserving r3, r4, r5 and LR.
 *
 * r3/r4/r5 are stored into the slots of the frame that the following stdu
 * allocates, LR is stored at 16(r1) of the caller frame.  After the call,
 * cr1 holds the comparison of the return value with 0; callers branch to
 * the non-VMX path on "beq cr1".  r0 is clobbered, as is anything the
 * called C function is allowed to clobber.
 */
#define ENTER_VMX_OPS	\
	mflr	r0;	\
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	CFUNC(enter_vmx_ops); \
	cmpwi	cr1,r3,0; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

/*
 * EXIT_VMX_OPS: call exit_vmx_ops() while preserving r3, r4, r5 and LR.
 *
 * Same save/restore scheme as ENTER_VMX_OPS; the return value of
 * exit_vmx_ops() is not examined.  r0 is clobbered, as is anything the
 * called C function is allowed to clobber (so rA/rB and condition fields
 * must be recomputed or explicitly preserved by the caller).
 */
#define EXIT_VMX_OPS \
	mflr	r0; \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	CFUNC(exit_vmx_ops); \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

/*
 * LD_VSR_CROSS16B load the 2nd 16 bytes for _vaddr which is unaligned with
 * 16 bytes boundary and permute the result with the 1st 16 bytes.
 *
 * | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 * ^                                 ^                                 ^
 * 0xbbbb10                          0xbbbb20                          0xbbbb30
 *                             ^
 *                             _vaddr
 *
 *
 * _vmask is the mask generated by LVS
 * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *   for example: 0x0123456789abcdef for big endian.
 *
 * Note: the macro uses off16 as the index register, so off16 must hold 16
 * when it is expanded.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
	lvx	_v2nd_qw,_vaddr,off16; \
	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)

/*
 * memcmp
 *
 * In:   r3 = s1, r4 = s2, r5 = number of bytes to compare.
 * Out:  r3 = 0 if the buffers are equal over r5 bytes.  Otherwise the
 *       byte loop returns (s1 byte - s2 byte) of the first mismatch, and
 *       the doubleword/VMX paths return 1 when s1 compares above s2
 *       (unsigned, first byte most significant) and -1 when below.
 * Uses: r0, r6-r11, ctr, cr0/cr1/cr5/cr6/cr7; r27-r31 are saved to and
 *       reloaded from -8(r1)..-40(r1) around the 32-byte integer loop
 *       (no stack frame is allocated for that); v0-v10 in the VMX loops.
 *
 * There are 2 categories for memcmp:
 * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
 * are named like .Lsameoffset_xxxx
 * 2) src/dst has different offset to the 8 bytes boundary. The handlers
 * are named like .Ldiffoffset_xxxx
 */
_GLOBAL_TOC(memcmp)
	cmpdi	cr1,r5,0

	/* Use the short loop if the src/dst addresses are not
	 * with the same offset of 8 bytes align boundary.
	 */
	xor	r6,r3,r4
	andi.	r6,r6,7		/* cr0: eq <=> same offset; used at .Lno_short */

	/* Fall back to short loop if compare at aligned addrs
	 * with less than 8 bytes.
	 */
	cmpdi	cr6,r5,7

	beq	cr1,.Lzero
	bgt	cr6,.Lno_short

.Lshort:
	/* Byte-by-byte compare of r5 bytes, unrolled four times. */
	mtctr	r5
1:	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA	/* rC = s1 byte - s2 byte */
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero

	addi	r3,r3,4
	addi	r4,r4,4

	bdnz	1b

.Lzero:
	li	r3,0
	blr

.Lno_short:
	dcbt	0,r3
	dcbt	0,r4
	/* cr0 still holds the (s1 ^ s2) & 7 test from the entry sequence */
	bne	.Ldiffoffset_8bytes_make_align_start


.Lsameoffset_8bytes_make_align_start:
	/* attempt to compare bytes not aligned with 8 bytes so that
	 * rest comparison can run based on 8 bytes alignment.
	 */
	andi.	r6,r3,7

	/* Try to compare the first double word which is not 8 bytes aligned:
	 * load the first double word at (src & ~7UL) and shift left appropriate
	 * bits before comparison.
	 */
	rlwinm	r6,r3,3,26,28	/* r6 = (s1 & 7) * 8 = bits to discard */
	beq	.Lsameoffset_8bytes_aligned
	clrrdi	r3,r3,3
	clrrdi	r4,r4,3
	LD	rA,0,r3
	LD	rB,0,r4
	sld	rA,rA,r6
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3		/* back to a byte count */
	bne	cr0,.LcmpAB_lightweight
	subfic	r6,r6,8		/* r6 = bytes just compared */
	subf.	r5,r6,r5
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero

.Lsameoffset_8bytes_aligned:
	/* now we are aligned with 8 bytes.
	 * Use .Llong loop if left cmp bytes are equal or greater than 32B.
	 */
	cmpdi	cr6,r5,31
	bgt	cr6,.Llong

.Lcmp_lt32bytes:
	/* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */
	cmpdi	cr5,r5,7
	srdi	r0,r5,3		/* r0 = number of whole doublewords */
	ble	cr5,.Lcmp_rest_lt8bytes

	/* handle 8 ~ 31 bytes */
	clrldi	r5,r5,61	/* r5 = remaining bytes after the doublewords */
	mtctr	r0
2:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	2b

	cmpwi	r5,0
	beq	.Lzero

.Lcmp_rest_lt8bytes:
	/*
	 * Here we have less than 8 bytes to compare. At least s1 is aligned to
	 * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
	 * page boundary, otherwise we might read past the end of the buffer and
	 * trigger a page fault. We use 4K as the conservative minimum page
	 * size. If we detect that case we go to the byte-by-byte loop.
	 *
	 * Otherwise the next double word is loaded from s1 and s2, and shifted
	 * right to compare the appropriate bits.
	 */
	clrldi	r6,r4,(64-12)	// r6 = r4 & 0xfff
	cmpdi	r6,0xff8
	bgt	.Lshort

	subfic	r6,r5,8
	slwi	r6,r6,3		/* r6 = (8 - remaining bytes) * 8 bits to drop */
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero

.Lnon_zero:
	/* Byte loop mismatch: return the byte difference computed in rC. */
	mr	r3,rC
	blr

.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Try to use vmx loop if length is equal or greater than 4K */
	cmpldi	cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
	/* At least s1 addr is aligned with 8 bytes */
	li	off8,8
	li	off16,16
	li	off24,24

	/* rD..rH live in r27..r31: save them below r1 before use. */
	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)

	srdi	r0,r5,5		/* ctr = number of 32-byte chunks */
	mtctr	r0
	andi.	r5,r5,31	/* r5 = tail bytes handled at .Ltail */

	/*
	 * Software-pipelined loop: loads for the next chunk are issued while
	 * the compares of the previous chunk (in cr0/cr1/cr6/cr7) are still
	 * being evaluated.
	 */
	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32

	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

.Lsecond32:
	/* Drain the pipeline: evaluate the compares still outstanding. */
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

.Ltail:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)

	/* Finish the remaining (< 32) bytes with the byte loop. */
	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort

.Lfirst32:
	/* Only one 32-byte chunk was loaded: compare it and go to the tail. */
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail

	/*
	 * Mismatch exits for the 32-byte loop: turn the unsigned compare held
	 * in the given CR field into 1 / -1 and restore r27-r31 via .Lout.
	 */
.LcmpAB:
	li	r3,1
	bgt	cr0,.Lout
	li	r3,-1
	b	.Lout

.LcmpCD:
	li	r3,1
	bgt	cr1,.Lout
	li	r3,-1
	b	.Lout

.LcmpEF:
	li	r3,1
	bgt	cr6,.Lout
	li	r3,-1
	b	.Lout

.LcmpGH:
	li	r3,1
	bgt	cr7,.Lout
	li	r3,-1

.Lout:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	blr

.LcmpAB_lightweight:   /* skip NV GPRS restore */
	/* cr0 holds the rA/rB compare; r27-r31 were never touched here. */
	li	r3,1
	bgtlr
	li	r3,-1
	blr

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Enter with src/dst addrs has the same offset with 8 bytes
	 * align boundary.
	 *
	 * There is an optimization based on following fact: memcmp()
	 * is prone to fail early at the first 32 bytes.
	 * Before applying VMX instructions which will lead to 32x128bits
	 * VMX regs load/restore penalty, we compare the first 32 bytes
	 * so that we can catch the ~80% fail cases.
	 */

	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Llong_novmx_cmp	/* enter_vmx_ops() returned 0 */

3:
	/* need to check whether r4 has the same offset with r3
	 * for 16 bytes boundary.
	 */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* len is no less than 4KB. Need to align with 16 bytes further.
	 */
	andi.	rA,r3,8		/* cr0: eq <=> already 16-byte aligned */
	LD	rA,0,r3
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f
	/* save and restore cr0 */
	mfocrf	r5,128		/* r5 is preserved by EXIT_VMX_OPS */
	EXIT_VMX_OPS
	mtocrf	128,r5
	b	.LcmpAB_lightweight

4:
	/* compare 32 bytes for each loop */
	srdi	r0,r5,5
	mtctr	r0
	clrldi	r5,r5,59	/* r5 = len & 31, finished after the loop */
	li	off16,16

.balign 16
5:
	lvx	v0,0,r3
	lvx	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f		/* not all elements equal: diff at r3/r4 */
	lvx	v0,off16,r3
	lvx	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f		/* diff is in the second 16 bytes */
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* diff the last 16 bytes */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
	/* now try to align s1 with 8 bytes */
	/*
	 * rlwinm below is not a record form, so cr0 has to be set explicitly
	 * here.  Without this, the beq would test the stale (s1 ^ s2) & 7
	 * result from the entry sequence, which is always "ne" on this path,
	 * and the already-aligned shortcut could never be taken.
	 */
	andi.	r6,r3,7
	rlwinm	r6,r3,3,26,28	/* r6 = (s1 & 7) * 8 */
	beq	.Ldiffoffset_align_s1_8bytes

	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4  /* unaligned load */
	sld	rA,rA,r6	/* drop the bytes in front of s1 ... */
	srd	rA,rA,r6	/* ... and right-justify the rest */
	srd	rB,rB,r6	/* keep the same number of leading s2 bytes */
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight

	subfic	r6,r6,8		/* r6 = bytes just compared */
	subf.	r5,r6,r5
	addi	r3,r3,8
	add	r4,r4,r6

	beq	.Lzero

.Ldiffoffset_align_s1_8bytes:
	/* now s1 is aligned with 8 bytes. */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* only do vmx ops when the size equal or greater than 4K bytes */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif


	cmpdi	cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
	b	.Llong_novmx_cmp
#else
	b	.Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* perform a 32 bytes pre-checking before
	 * enable VMX operations.
	 */
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Ldiffoffset_novmx_cmp	/* enter_vmx_ops() returned 0 */

.Ldiffoffset_vmx_cmp_start:
	/* Firstly try to align r3 with 16 bytes */
	andi.	r6,r3,0xf
	li	off16,16
	beq	.Ldiffoffset_vmx_s1_16bytes_align

	LVS	v3,0,r3
	LVS	v4,0,r4

	lvx	v5,0,r3
	lvx	v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	subfic	r6,r6,16	/* r6 = bytes needed to 16-byte align s1 */
	subf	r5,r6,r5
	add	r3,r3,r6
	add	r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
	/* now s1 is aligned with 16 bytes */
	lvx	v6,0,r4
	LVS	v4,0,r4
	srdi	r6,r5,5  /* loop for 32 bytes each */
	clrldi	r5,r5,59
	mtctr	r6

.balign	16
.Ldiffoffset_vmx_32bytesloop:
	/* the first qw of r4 was saved in v6 */
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8	/* 2nd QW becomes the 1st QW of the next step */
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	EXIT_VMX_OPS
	/* anyway, the diff will appear in next 16 bytes */
	li	r5,16
	b	.Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)