/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
 */

/*
 * Description
 *
 *   library function for memcpy where length bytes are copied from
 *   ptr_in to ptr_out. ptr_out is returned unchanged.
 *   Allows any combination of alignment on input and output pointers
 *   and length from 0 to 2^32-1
 *
 * Restrictions
 *   The arrays should not overlap, the program will produce undefined output
 *   if they do.
 *   A zero length, or ptr_in == ptr_out, returns immediately.
 *   For blocks of fewer than 24 bytes a byte by byte copy is performed. When
 *   both pointers and the length are multiples of 8 bytes, a simple dword
 *   copy is performed for lengths below 96 bytes.
 * History
 *
 *   DJH  5/15/09 Initial version 1.0
 *   DJH  6/ 1/09 Version 1.1 modified ABI to include R16-R19
 *   DJH  7/12/09 Version 1.2 optimized codesize down to 760 was 840
 *   DJH 10/14/09 Version 1.3 added special loop for aligned case, was
 *                            overreading bloated codesize back up to 892
 *   DJH  4/20/10 Version 1.4 fixed Ldword_loop_epilog loop to prevent loads
 *                            occurring if only 1 left outstanding, fixes bug
 *                            # 3888, corrected for all alignments. Peeled off
 *                            1 32byte chunk from kernel loop and extended 8byte
 *                            loop at end to solve all combinations and prevent
 *                            over read.  Fixed Ldword_loop_prolog to prevent
 *                            overread for blocks less than 48bytes. Reduced
 *                            codesize to 752 bytes
 *   DJH  4/21/10 version 1.5 1.4 fix broke code for input block ends not
 *                            aligned to dword boundaries,underwriting by 1
 *                            byte, added detection for this and fixed. A
 *                            little bloat.
 *   DJH  4/23/10 version 1.6 corrected stack error, R20 was not being restored
 *                            always, fixed the error of R20 being modified
 *                            before it was being saved
 * Natural c model
 * ===============
 * void * memcpy(char * ptr_out, char * ptr_in, int length) {
 *   int i;
 *   if(length) for(i=0; i < length; i++) { ptr_out[i] = ptr_in[i]; }
 *   return(ptr_out);
 * }
 *
 * Optimized memcpy function
 * =========================
 * void * memcpy(char * ptr_out, char * ptr_in, int len) {
 *   int i, prolog, kernel, epilog, epilogdws, mask;
 *   u8 offset;
 *   s64 data0, dataF8, data70;
 *
 *   s64 * ptr8_in;
 *   s64 * ptr8_out;
 *   s32 * ptr4;
 *   s16 * ptr2;
 *
 *   offset = ((int) ptr_in) & 7;
 *   ptr8_in = (s64 *) &ptr_in[-offset];   //read in the aligned pointers
 *
 *   data70 = *ptr8_in++;
 *   dataF8 = *ptr8_in++;
 *
 *   data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *
 *   prolog = 32 - ((int) ptr_out);
 *   mask  = 0x7fffffff >> HEXAGON_R_cl0_R(len);
 *   prolog = prolog & mask;
 *   kernel = len - prolog;
 *   epilog = kernel & 0x1F;
 *   kernel = kernel>>5;
 *
 *   if (prolog & 1) { ptr_out[0] = (u8) data0; data0 >>= 8; ptr_out += 1;}
 *   ptr2 = (s16 *) &ptr_out[0];
 *   if (prolog & 2) { ptr2[0] = (u16) data0;  data0 >>= 16; ptr_out += 2;}
 *   ptr4 = (s32 *) &ptr_out[0];
 *   if (prolog & 4) { ptr4[0] = (u32) data0;  data0 >>= 32; ptr_out += 4;}
 *
 *   offset = offset + (prolog & 7);
 *   if (offset >= 8) {
 *     data70 = dataF8;
 *     dataF8 = *ptr8_in++;
 *   }
 *   offset = offset & 0x7;
 *
 *   prolog = prolog >> 3;
 *   if (prolog) for (i=0; i < prolog; i++) {
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = dataF8;
 *       dataF8 = *ptr8_in++;
 *   }
 *   if(kernel) { kernel -= 1; epilog += 32; }
 *   if(kernel) for(i=0; i < kernel; i++) {
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = *ptr8_in++;
 *
 *       data0 = HEXAGON_P_valignb_PPp(data70, dataF8, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       dataF8 = *ptr8_in++;
 *
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = *ptr8_in++;
 *
 *       data0 = HEXAGON_P_valignb_PPp(data70, dataF8, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       dataF8 = *ptr8_in++;
 *   }
 *   epilogdws = epilog >> 3;
 *   if (epilogdws) for (i=0; i < epilogdws; i++) {
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = dataF8;
 *       dataF8 = *ptr8_in++;
 *   }
 *   data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *
 *   ptr4 = (s32 *) &ptr_out[0];
 *   if (epilog & 4) { ptr4[0] = (u32) data0; data0 >>= 32; ptr_out += 4;}
 *   ptr2 = (s16 *) &ptr_out[0];
 *   if (epilog & 2) { ptr2[0] = (u16) data0; data0 >>= 16; ptr_out += 2;}
 *   if (epilog & 1) { *ptr_out++ = (u8) data0; }
 *
 *   return(ptr_out - len);
 * }
 *
 * Codesize : 784 bytes
 */


/*
 * Register map.  Several names deliberately share one physical register
 * because their live ranges do not overlap (e.g. mask/shift/rest on R8,
 * prolog/epilogdws on R15, offset/len8 on R9).
 */
#define ptr_out		R0	/*  destination pointer  */
#define ptr_in		R1	/*  source pointer  */
#define len		R2	/*  length of copy in bytes  */

#define data70		R13:12	/*  lo 8 bytes of non-aligned transfer  */
#define dataF8		R11:10	/*  hi 8 bytes of non-aligned transfer  */
#define ldata0		R7:6	/*  even 8 bytes chunks  */
#define ldata1		R25:24	/*  odd 8 bytes chunks  */
#define data1		R7	/*  upper 32-bit word of ldata0 (R7:6)  */
#define data0		R6	/*  lower 32-bit word of ldata0 (R7:6)  */

#define ifbyte		p0	/*  if transfer has bytes in epilog/prolog  */
#define ifhword		p0	/*  if transfer has shorts in epilog/prolog  */
#define ifword		p0	/*  if transfer has words in epilog/prolog  */
#define noprolog	p0	/*  no prolog, xfer starts at 32byte  */
#define nokernel	p1	/*  no 32byte multiple block in the transfer  */
#define noepilog	p0	/*  no epilog, xfer ends on 32byte boundary  */
#define align		p2	/*  alignment of input rel to 8byte boundary  */
#define kernel1		p0	/*  kernel count > 1 (more chunks follow)  */

#define dalign		R25	/*  rel alignment of input to output data  */
#define star3		R16	/*  number bytes in prolog - dwords  */
#define rest		R8	/*  length - prolog bytes  */
#define back		R7	/*  nr bytes > dword boundary in src block  */
#define epilog		R3	/*  bytes in epilog  */
#define inc		R15:14	/*  inc kernel by -1 and prefetch ptr by 32  */
#define kernel		R4	/*  number of 32byte chunks in kernel  */
#define ptr_in_p_128	R5	/*  pointer for prefetch of input data  */
#define mask		R8	/*  mask used to determine prolog size  */
#define shift		R8	/*  used to work a shifter to extract bytes  */
#define shift2		R5	/*  in epilog to workshifter to extract bytes */
#define prolog		R15	/*  bytes in prolog  */
#define epilogdws	R15	/*  number dwords in epilog  */
#define shiftb		R14	/*  used to extract bytes  */
#define offset		R9	/*  same as align in reg  */
#define ptr_out_p_32	R17	/*  pointer to output dczero  */
#define align888	R14	/*  if simple dword loop can be used  */
#define len8		R9	/*  number of dwords in length  */
#define over		R20	/*  nr of bytes > last inp buf dword boundary */

#define ptr_in_p_128kernel	R5:4	/*  packed fetch pointer & kernel cnt */

	.section .text
	.p2align 4
	.global memcpy
	.type memcpy, @function
memcpy:
/*
 * Entry: classify the request.
 *   p1 -> len == 0 or ptr_in == ptr_out : return at once
 *   p2 -> ptr_in, ptr_out and len are all multiples of 8 and len <= 95 :
 *         simple dword loop (.Ldwordaligned)
 *   p0 -> len > 23; otherwise use the byte loop (.Lbytes23orless)
 * The three early paths run before allocframe, so r31 still holds the
 * caller's return address there.
 */
{
	p2 = cmp.eq(len, #0);		/* zero length */
	align888 = or(ptr_in, ptr_out);	/* gather low bits for %8 test */
	p0 = cmp.gtu(len, #23);		/* false -> byte copy path */
	p1 = cmp.eq(ptr_in, ptr_out);	/* attempt to overwrite self */
}
{
	p1 = or(p2, p1);
	p3 = cmp.gtu(len, #95);		/* too long for simple dword loop */
	align888 = or(align888, len);	/* fold len into the %8 test */
	len8 = lsr(len, #3);		/* dword count for simple loop */
}
{
	dcfetch(ptr_in);		/* zero/ptrin=ptrout causes fetch */
	p2 = bitsclr(align888, #7);	/* in, out and len all %8 == 0 */
	if(p1) jumpr r31;		/* nothing to copy */
}
{
	p2 = and(p2,!p3);			/* %8 == 0 and len <= 95 */
	/* the last dword is stored without post-increment, so pre-adjust len */
	if (p2.new) len = add(len, #-8);
	if (p2.new) jump:NT .Ldwordaligned;
}
{
	if(!p0) jump .Lbytes23orless;	/* len <= 23 */
	mask.l = #LO(0x7fffffff);
	/* all bytes before line multiples of data */
	prolog = sub(#0, ptr_out);
}
/*
 * General path.  A 24 byte frame holds r17:16, r25:24 and r21:20.  r31 is
 * redirected to .Lmemcpy_return so that every "jumpr r31" below lands on
 * the register restore sequence; deallocframe then recovers the caller's
 * return address.
 */
{
	/* push frame, reserve 24 bytes for the three register pairs */
	allocframe(#24);
	mask.h = #HI(0x7fffffff);
	ptr_in_p_128 = add(ptr_in, #32);
	back = cl0(len);
}
{
	memd(sp+#0) = R17:16;		/* save r16,r17 on stack */
	r31.l = #LO(.Lmemcpy_return);	/* set up final return pointer */
	prolog &= lsr(mask, back);
	offset = and(ptr_in, #7);
}
{
	memd(sp+#8) = R25:24;		/* save r25,r24 on stack */
	dalign = sub(ptr_out, ptr_in);
	r31.h = #HI(.Lmemcpy_return);	/* set up final return pointer */
}
{
	/* see if the end of the input buffer is dword aligned */
	over = add(len, ptr_in);
	back = add(len, offset);
	memd(sp+#16) = R21:20;		/* save r20,r21 on stack */
}
{
	noprolog = bitsclr(prolog, #7);
	prolog = and(prolog, #31);
	dcfetch(ptr_in_p_128);
	ptr_in_p_128 = add(ptr_in_p_128, #32);
}
{
	kernel = sub(len, prolog);
	shift = asl(prolog, #3);
	star3 = and(prolog, #7);
	ptr_in = and(ptr_in, #-8);
}
{
	prolog = lsr(prolog, #3);
	epilog = and(kernel, #31);
	ptr_out_p_32 = add(ptr_out, prolog);
	over = and(over, #7);
}
{
	p3 = cmp.gtu(back, #8);
	kernel = lsr(kernel, #5);
	dcfetch(ptr_in_p_128);
	ptr_in_p_128 = add(ptr_in_p_128, #32);
}
{
	p1 = cmp.eq(prolog, #0);
	/* sp1loop0 below spends one extra pass priming the pipeline */
	if(!p1.new) prolog = add(prolog, #1);
	dcfetch(ptr_in_p_128);	/* reserve the line 64bytes on */
	ptr_in_p_128 = add(ptr_in_p_128, #32);
}
{
	nokernel = cmp.eq(kernel,#0);
	dcfetch(ptr_in_p_128);	/* reserve the line 64bytes on */
	ptr_in_p_128 = add(ptr_in_p_128, #32);
	shiftb = and(shift, #8);
}
{
	dcfetch(ptr_in_p_128);	/* reserve the line 64bytes on */
	ptr_in_p_128 = add(ptr_in_p_128, #32);
	if(nokernel) jump .Lskip64;
	p2 = cmp.eq(kernel, #1);	/* skip ovr if kernel == 0 */
}
{
	dczeroa(ptr_out_p_32);
	/* don't advance pointer */
	if(!p2) ptr_out_p_32 = add(ptr_out_p_32, #32);
}
{
	dalign = and(dalign, #31);
	dczeroa(ptr_out_p_32);
}
.Lskip64:
{
	data70 = memd(ptr_in++#16);
	/* second dword is only read when the data extends past the first */
	if(p3) dataF8 = memd(ptr_in+#8);
	if(noprolog) jump .Lnoprolog32;
	align = offset;
}
/* upto initial 7 bytes */
{
	ldata0 = valignb(dataF8, data70, align);
	ifbyte = tstbit(shift,#3);
	offset = add(offset, star3);
}
{
	if(ifbyte) memb(ptr_out++#1) = data0;
	ldata0 = lsr(ldata0, shiftb);
	shiftb = and(shift, #16);
	ifhword = tstbit(shift,#4);
}
{
	if(ifhword) memh(ptr_out++#2) = data0;
	ldata0 = lsr(ldata0, shiftb);
	ifword = tstbit(shift,#5);
	p2 = cmp.gtu(offset, #7);
}
{
	if(ifword) memw(ptr_out++#4) = data0;
	if(p2) data70 = dataF8;
	if(p2) dataF8 = memd(ptr_in++#8);	/* another 8 bytes */
	align = offset;
}
/* dword part of the prolog: brings ptr_out up to a 32 byte boundary */
.Lnoprolog32:
{
	p3 = sp1loop0(.Ldword_loop_prolog, prolog)
	rest = sub(len, star3);	/* whats left after the loop */
	p0 = cmp.gt(over, #0);
}
	if(p0) rest = add(rest, #16);
.Ldword_loop_prolog:
{
	/* p3 is false on the first pass: nothing has been aligned yet */
	if(p3) memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(dataF8, data70, align);
	p0 = cmp.gt(rest, #16);
}
{
	data70 = dataF8;
	/* guarded load: do not read past the end of the source */
	if(p0) dataF8 = memd(ptr_in++#8);
	rest = add(rest, #-8);
}:endloop0
.Lkernel:
{
	/* kernel is at least 32bytes */
	p3 = cmp.gtu(kernel, #0);
	/* last itn. remove edge effects */
	if(p3.new) kernel = add(kernel, #-1);
	/* dealt with in last dword loop */
	if(p3.new) epilog = add(epilog, #32);
}
{
	nokernel = cmp.eq(kernel, #0);		/* after adjustment, recheck */
	if(nokernel.new) jump:NT .Lepilog;	/* likely not taken */
	inc = combine(#32, #-1);		/* R15 = 32, R14 = -1 */
	p3 = cmp.gtu(dalign, #24);
}
/*
 * NOTE(review): dalign > 24 selects .Lodd_alignment, whose loop label is
 * "00to24", while dalign <= 24 falls into the loop labelled "25to31".  The
 * label names look swapped relative to the test; confirm before renaming.
 */
{
	if(p3) jump .Lodd_alignment;
}
{
	loop0(.Loword_loop_25to31, kernel);
	kernel1 = cmp.gtu(kernel, #1);
	rest = kernel;			/* remember initial count */
}
	.falign
.Loword_loop_25to31:
{
	dcfetch(ptr_in_p_128);	/* prefetch 4 lines ahead */
	if(kernel1) ptr_out_p_32 = add(ptr_out_p_32, #32);
}
{
	dczeroa(ptr_out_p_32);	/* reserve the next 32bytes in cache */
	p3 = cmp.eq(kernel, rest);	/* true only on the first iteration */
}
{
	/* kernel -= 1, ptr_in_p_128 += 32 */
	ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc);
	/* kill write on first iteration */
	if(!p3) memd(ptr_out++#8) = ldata1;
	ldata1 = valignb(dataF8, data70, align);
	data70 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(data70, dataF8, align);
	dataF8 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata1;
	ldata1 = valignb(dataF8, data70, align);
	data70 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(data70, dataF8, align);
	dataF8 = memd(ptr_in++#8);
	kernel1 = cmp.gtu(kernel, #1);
}:endloop0
{
	/* flush the odd chunk still pending from the last iteration */
	memd(ptr_out++#8) = ldata1;
	jump .Lepilog;
}
.Lodd_alignment:
{
	loop0(.Loword_loop_00to24, kernel);
	kernel1 = cmp.gtu(kernel, #1);
	rest = add(kernel, #-1);
}
	.falign
.Loword_loop_00to24:
{
	dcfetch(ptr_in_p_128);	/* prefetch 4 lines ahead */
	/* kernel -= 1, ptr_in_p_128 += 32 */
	ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc);
	if(kernel1) ptr_out_p_32 = add(ptr_out_p_32, #32);
}
{
	dczeroa(ptr_out_p_32);	/* reserve the next 32bytes in cache */
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(dataF8, data70, align);
	data70 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(data70, dataF8, align);
	dataF8 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(dataF8, data70, align);
	data70 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(data70, dataF8, align);
	dataF8 = memd(ptr_in++#8);
	kernel1 = cmp.gtu(kernel, #1);
}:endloop0
/* epilog: remaining dwords, then a final word/halfword/byte as needed */
.Lepilog:
{
	noepilog = cmp.eq(epilog,#0);
	epilogdws = lsr(epilog, #3);
	kernel = and(epilog, #7);	/* kernel reused: trailing byte count */
}
{
	if(noepilog) jumpr r31;
	if(noepilog) ptr_out = sub(ptr_out, len);	/* return dest pointer */
	p3 = cmp.eq(epilogdws, #0);
	shift2 = asl(epilog, #3);
}
{
	shiftb = and(shift2, #32);
	ifword = tstbit(epilog,#2);
	if(p3) jump .Lepilog60;
	if(!p3) epilog = add(epilog, #-16);
}
{
	loop0(.Ldword_loop_epilog, epilogdws);
	/* stop criteria is lsbs unless = 0 then its 8 */
	p3 = cmp.eq(kernel, #0);
	if(p3.new) kernel= #8;
	p1 = cmp.gt(over, #0);
}
	/* if not aligned to end of buffer execute 1 more iteration */
	if(p1) kernel= #0;
.Ldword_loop_epilog:
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(dataF8, data70, align);
	p3 = cmp.gt(epilog, kernel);
}
{
	data70 = dataF8;
	/* guarded load: do not read past the end of the source */
	if(p3) dataF8 = memd(ptr_in++#8);
	epilog = add(epilog, #-8);
}:endloop0
/* copy last 7 bytes */
.Lepilog60:
{
	if(ifword) memw(ptr_out++#4) = data0;
	ldata0 = lsr(ldata0, shiftb);
	ifhword = tstbit(epilog,#1);
	shiftb = and(shift2, #16);
}
{
	if(ifhword) memh(ptr_out++#2) = data0;
	ldata0 = lsr(ldata0, shiftb);
	ifbyte = tstbit(epilog,#0);
	/* final byte is stored without post-increment, so compensate in len */
	if(ifbyte.new) len = add(len, #-1);
}
{
	if(ifbyte) memb(ptr_out) = data0;
	ptr_out = sub(ptr_out, len);	/* return dest pointer */
	jumpr r31;
}
/*
 * do byte copy for small n
 * Reached before allocframe, so jumpr r31 returns straight to the caller.
 * The last byte is stored after the loop without post-increment, hence
 * len is reduced by one before it is subtracted from ptr_out.
 */
.Lbytes23orless:
{
	p3 = sp1loop0(.Lbyte_copy, len);
	len = add(len, #-1);
}
.Lbyte_copy:
{
	data0 = memb(ptr_in++#1);
	/* p3 is false on the first pass: no byte has been loaded yet */
	if(p3) memb(ptr_out++#1) = data0;
}:endloop0
{
	memb(ptr_out) = data0;
	ptr_out = sub(ptr_out, len);	/* return dest pointer */
	jumpr r31;
}
/*
 * do dword copies for aligned in, out and length
 * Reached before allocframe, so jumpr r31 returns straight to the caller.
 * len was already reduced by 8 at the branch to account for the final
 * store below, which does not post-increment ptr_out.
 */
.Ldwordaligned:
{
	p3 = sp1loop0(.Ldword_copy, len8);
}
.Ldword_copy:
{
	/* p3 is false on the first pass: no dword has been loaded yet */
	if(p3) memd(ptr_out++#8) = ldata0;
	ldata0 = memd(ptr_in++#8);
}:endloop0
{
	memd(ptr_out) = ldata0;
	ptr_out = sub(ptr_out, len);	/* return dest pointer */
	jumpr r31;	/* return to function caller */
}
/* common exit of the general path: undo the frame set up after the checks */
.Lmemcpy_return:
	r21:20 = memd(sp+#16);		/* restore r20+r21 */
{
	r25:24 = memd(sp+#8);		/* restore r24+r25 */
	r17:16 = memd(sp+#0);		/* restore r16+r17 */
}
	deallocframe;	/* restore caller's r31 and release the frame */
	jumpr r31