/* SPDX-License-Identifier: GPL-2.0 */
/*
 *
 * Optimized version of the standard memcpy() function
 *
 * Inputs:
 *	in0:	destination address
 *	in1:	source address
 *	in2:	number of bytes to copy
 * Output:
 *	r8:	destination address (copy of in0)
 *
 * Three code paths are used:
 *	- dst, src and len all multiples of 8 and len >= 16: a software-pipelined
 *	  ld8/st8 loop (falls through from the entry code);
 *	- len < 16: a byte-at-a-time loop (.memcpy_short);
 *	- everything else: align dst, then copy 8 bytes per iteration using
 *	  shrp to merge misaligned source words (.memcpy_long).
 *
 * ar.lc, ar.pfs and the predicate registers are saved on entry and restored
 * before each return.  r2, r3 and r16-r23 are used as scratch; the long path
 * additionally writes b6.
 *
 * Copyright (C) 2000-2001 Hewlett-Packard Co
 *	Stephane Eranian <eranian@hpl.hp.com>
 *	David Mosberger-Tang <davidm@hpl.hp.com>
 */
#include <asm/asmmacro.h>
#include <asm/export.h>

GLOBAL_ENTRY(memcpy)

#	define MEM_LAT	21		/* latency to memory */

#	define dst	r2
#	define src	r3
#	define retval	r8
#	define saved_pfs r9
#	define saved_lc	r10
#	define saved_pr	r11
#	define cnt	r16
#	define src2	r17
#	define t0	r18
#	define t1	r19
#	define t2	r20
#	define t3	r21
#	define t4	r22
#	define src_end	r23

#	define N	(MEM_LAT + 4)	/* number of pipeline stages (loaded into ar.ec) */
#	define Nrot	((N + 7) & ~7)	/* N rounded up to a multiple of 8 */

	/*
	 * First, check if everything (src, dst, len) is a multiple of eight.  If
	 * so, we handle everything with no taken branches (other than the loop
	 * itself) and a small icache footprint.  Otherwise, we jump off to
	 * the more general copy routine handling arbitrary
	 * sizes/alignment etc.
	 */
	.prologue
	.save ar.pfs, saved_pfs
	alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot
	.save ar.lc, saved_lc
	mov saved_lc=ar.lc
	or t0=in0,in1
	;;

	or t0=t0,in2		// t0 = dst | src | len; low 3 bits tested below
	.save pr, saved_pr
	mov saved_pr=pr

	.body

	cmp.eq p6,p0=in2,r0	// zero length?
	mov retval=in0		// return dst
(p6)	br.ret.spnt.many rp	// zero length, return immediately
	;;

	mov dst=in0		// copy because of rotation
	shr.u cnt=in2,3		// number of 8-byte words to copy
	mov pr.rot=1<<16	// only p16 (= p[0]) set among the rotating predicates
	;;

	adds cnt=-1,cnt		// br.ctop is repeat/until
	cmp.gtu p7,p0=16,in2	// copying less than 16 bytes?
	mov ar.ec=N
	;;

	and t0=0x7,t0		// t0 = (dst | src | len) & 7
	mov ar.lc=cnt
	;;
	cmp.ne p6,p0=t0,r0	// p6 = something is not a multiple of 8

	mov src=in1		// copy because of rotation
(p7)	br.cond.spnt.few .memcpy_short
(p6)	br.cond.spnt.few .memcpy_long
	;;
	nop.m	0
	;;
	nop.m	0
	nop.i	0
	;;
	nop.m	0
	;;
	.rotr val[N]
	.rotp p[N]
	.align 32
	// Aligned word loop: stage 0 loads a word, stage N-1 stores it.
1: { .mib
(p[0])	ld8 val[0]=[src],8
	nop.i 0
	brp.loop.imp 1b, 2f
}
2: { .mfb
(p[N-1])st8 [dst]=val[N-1],8
	nop.f 0
	br.ctop.dptk.few 1b
}
	;;
	mov ar.lc=saved_lc
	mov pr=saved_pr,-1
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

	/*
	 * Small (<16 bytes) unaligned copying is done via a simple byte-at-a-time
	 * copy loop.  This performs relatively poorly on Itanium, but it doesn't
	 * get used very often (gcc inlines small copies) and due to atomicity
	 * issues, we want to avoid read-modify-write of entire words.
	 */
	.align 32
.memcpy_short:
	adds cnt=-1,in2		// br.ctop is repeat/until
	mov ar.ec=MEM_LAT
	brp.loop.imp 1f, 2f
	;;
	mov ar.lc=cnt
	;;
	nop.m	0
	;;
	nop.m	0
	nop.i	0
	;;
	nop.m	0
	;;
	nop.m	0
	;;
	/*
	 * It is faster to put a stop bit in the loop here because it makes
	 * the pipeline shorter (and latency is what matters on short copies).
	 */
	.align 32
1: { .mib
(p[0])	ld1 val[0]=[src],1
	nop.i 0
	brp.loop.imp 1b, 2f
} ;;
2: { .mfb
(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1
	nop.f 0
	br.ctop.dptk.few 1b
} ;;
	mov ar.lc=saved_lc
	mov pr=saved_pr,-1
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

	/*
	 * Large (>= 16 bytes) copying is done in a fancy way.  Latency isn't
	 * an overriding concern here, but throughput is.  We first do
	 * sub-word copying until the destination is aligned, then we check
	 * if the source is also aligned.  If so, we do a simple load/store-loop
	 * until there are less than 8 bytes left over and then we do the tail,
	 * by storing the last few bytes using sub-word copying.  If the source
	 * is not aligned, we branch off to the non-congruent loop.
	 *
	 *   stage:   op:
	 *         0  ld
	 *	   :
	 * MEM_LAT+3  shrp
	 * MEM_LAT+4  st
	 *
	 * On Itanium, the pipeline itself runs without stalls.  However,  br.ctop
	 * seems to introduce an unavoidable bubble in the pipeline so the overall
	 * latency is 2 cycles/iteration.  This gives us a _copy_ throughput
	 * of 4 byte/cycle.  Still not bad.
	 */
#	undef N
#	undef Nrot
#	define N	(MEM_LAT + 5)		/* number of stages */
#	define Nrot	((N+1 + 2 + 7) & ~7)	/* number of rotating regs */

/*
 * The entry address of a COPY() variant is computed below as
 * .memcpy_loops + ((src & 7) << LOG_LOOP_SIZE).
 * NOTE(review): this relies on every COPY() expansion occupying exactly
 * (1 << LOG_LOOP_SIZE) bytes -- confirm against the assembled output.
 */
#define LOG_LOOP_SIZE	6

.memcpy_long:
	alloc t3=ar.pfs,3,Nrot,0,Nrot	// resize register frame
	and t0=-8,src			// t0 = src & ~7
	and t2=7,src			// t2 = src & 7
	;;
	ld8 t0=[t0]			// t0 = 1st source word
	adds src2=7,src			// src2 = (src + 7)
	sub t4=r0,dst			// t4 = -dst
	;;
	and src2=-8,src2		// src2 = (src + 7) & ~7
	shl t2=t2,3			// t2 = 8*(src & 7)
	shl t4=t4,3			// t4 = 8*(-dst); bits 3..5 hold (-dst) & 7
	;;
	ld8 t1=[src2]			// t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
	sub t3=64,t2			// t3 = 64-8*(src & 7)
	shr.u t0=t0,t2
	;;
	add src_end=src,in2
	shl t1=t1,t3
	mov pr=t4,0x38			// (p5,p4,p3) = (-dst) & 7 = # of bytes needed to align dst
	;;
	or t0=t0,t1			// t0 = first 8 source bytes, starting at src
	mov cnt=r0			// cnt = # of bytes stored while aligning dst
	adds src_end=-1,src_end
	;;
(p3)	st1 [dst]=t0,1
(p3)	shr.u t0=t0,8
(p3)	adds cnt=1,cnt
	;;
(p4)	st2 [dst]=t0,2
(p4)	shr.u t0=t0,16
(p4)	adds cnt=2,cnt
	;;
(p5)	st4 [dst]=t0,4
(p5)	adds cnt=4,cnt
	and src_end=-8,src_end	// src_end = last word of source buffer
	;;

	// At this point, dst is aligned to 8 bytes and there are at least 16-7=9 bytes left to copy:

1:{	add src=cnt,src			// make src point to remainder of source buffer
	sub cnt=in2,cnt			// cnt = number of bytes left to copy
	mov t4=ip
  }	;;
	and src2=-8,src			// align source pointer
	adds t4=.memcpy_loops-1b,t4	// t4 = address of .memcpy_loops
	mov ar.ec=N

	and t0=7,src			// t0 = src & 7
	shr.u t2=cnt,3			// t2 = number of 8-byte words left to copy
	shl cnt=cnt,3			// move bits 0-2 to 3-5
	;;

	.rotr val[N+1], w[2]
	.rotp p[N]

	cmp.ne p6,p0=t0,r0		// is src aligned, too?
	shl t0=t0,LOG_LOOP_SIZE		// t0 = (src & 7) << LOG_LOOP_SIZE = offset of the COPY() variant to use
	adds t2=-1,t2			// br.ctop is repeat/until
	;;
	add t4=t0,t4			// t4 = entry point of the selected COPY() variant
	mov pr=cnt,0x38			// set (p5,p4,p3) to # of last-word bytes to copy
	mov ar.lc=t2
	;;
	nop.m	0
	;;
	nop.m	0
	nop.i	0
	;;
	nop.m	0
	;;
(p6)	ld8 val[1]=[src2],8		// prime the pump...
	mov b6=t4
	br.sptk.few b6
	;;

.memcpy_tail:
	// At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
	// less than 8) and t0 contains the last few bytes of the src buffer:
(p5)	st4 [dst]=t0,4
(p5)	shr.u t0=t0,32
	mov ar.lc=saved_lc
	;;
(p4)	st2 [dst]=t0,2
(p4)	shr.u t0=t0,16
	mov ar.pfs=saved_pfs
	;;
(p3)	st1 [dst]=t0
	mov pr=saved_pr,-1
	br.ret.sptk.many rp

///////////////////////////////////////////////////////
	.align 64

/*
 * COPY(shift,index): one word-copy loop per source misalignment.  "shift" is
 * the bit count passed to shrp; "index" selects which neighbouring val[]
 * register supplies the second shrp operand.  After the loop, the last source
 * word is loaded from src_end and merged into t0 for .memcpy_tail.
 */
#define COPY(shift,index)									\
 1: { .mib											\
	(p[0])		ld8 val[0]=[src2],8;							\
	(p[MEM_LAT+3])	shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift;			\
			brp.loop.imp 1b, 2f							\
    };												\
 2: { .mfb											\
	(p[MEM_LAT+4])	st8 [dst]=w[1],8;							\
			nop.f 0;								\
			br.ctop.dptk.few 1b;							\
    };												\
			;;									\
			ld8 val[N-1]=[src_end];	/* load last word (may be same as val[N]) */	\
			;;									\
			shrp t0=val[N-1],val[N-index],shift;					\
			br .memcpy_tail
.memcpy_loops:
	COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */
	COPY(8, 0)
	COPY(16, 0)
	COPY(24, 0)
	COPY(32, 0)
	COPY(40, 0)
	COPY(48, 0)
	COPY(56, 0)

END(memcpy)
EXPORT_SYMBOL(memcpy)