/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
 */

/*
 * Description
 *
 *   library function for memcpy where length bytes are copied from
 *   ptr_in to ptr_out. ptr_out is returned unchanged.
 *   Allows any combination of alignment on input and output pointers
 *   and length from 0 to 2^32-1
 *
 * Restrictions
 *   The arrays should not overlap, the program will produce undefined output
 *   if they do.
 *   For blocks of 23 bytes or fewer (that do not qualify for the dword copy
 *   below) a byte by byte copy is performed. When both pointers and the
 *   length are multiples of 8 bytes, a plain dword copy is performed for
 *   lengths up to 95 bytes (the code tests len > 23 and len > 95).
 * History
 *
 *   DJH  5/15/09 Initial version 1.0
 *   DJH  6/ 1/09 Version 1.1 modified ABI to include R16-R19
 *   DJH  7/12/09 Version 1.2 optimized codesize down to 760 was 840
 *   DJH 10/14/09 Version 1.3 added special loop for aligned case, was
 *                            overreading bloated codesize back up to 892
 *   DJH  4/20/10 Version 1.4 fixed Ldword_loop_epilog loop to prevent loads
 *                            occurring if only 1 left outstanding, fixes bug
 *                            # 3888, corrected for all alignments. Peeled off
 *                            1 32byte chunk from kernel loop and extended 8byte
 *                            loop at end to solve all combinations and prevent
 *                            over read.  Fixed Ldword_loop_prolog to prevent
 *                            overread for blocks less than 48bytes. Reduced
 *                            codesize to 752 bytes
 *   DJH  4/21/10 version 1.5 1.4 fix broke code for input block ends not
 *                            aligned to dword boundaries,underwriting by 1
 *                            byte, added detection for this and fixed. A
 *                            little bloat.
 *   DJH  4/23/10 version 1.6 corrected stack error, R20 was not being restored
 *                            always, fixed the error of R20 being modified
 *                            before it was being saved
 * Natural c model
 * ===============
 * void * memcpy(char * ptr_out, char * ptr_in, int length) {
 *   int i;
 *   if(length) for(i=0; i < length; i++) { ptr_out[i] = ptr_in[i]; }
 *   return(ptr_out);
 * }
 *
 * Optimized memcpy function
 * =========================
 * void * memcpy(char * ptr_out, char * ptr_in, int len) {
 *   int i, prolog, kernel, epilog, epilogdws, mask;
 *   u8 offset;
 *   s64 data0, dataF8, data70;
 *
 *   s64 * ptr8_in;
 *   s64 * ptr8_out;
 *   s32 * ptr4;
 *   s16 * ptr2;
 *
 *   offset = ((int) ptr_in) & 7;
 *   ptr8_in = (s64 *) &ptr_in[-offset];   //read in the aligned pointers
 *
 *   data70 = *ptr8_in++;
 *   dataF8 = *ptr8_in++;
 *
 *   data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *
 *   prolog = 32 - ((int) ptr_out);
 *   mask  = 0x7fffffff >> HEXAGON_R_cl0_R(len);
 *   prolog = prolog & mask;
 *   kernel = len - prolog;
 *   epilog = kernel & 0x1F;
 *   kernel = kernel>>5;
 *
 *   if (prolog & 1) { ptr_out[0] = (u8) data0; data0 >>= 8; ptr_out += 1;}
 *   ptr2 = (s16 *) &ptr_out[0];
 *   if (prolog & 2) { ptr2[0] = (u16) data0;  data0 >>= 16; ptr_out += 2;}
 *   ptr4 = (s32 *) &ptr_out[0];
 *   if (prolog & 4) { ptr4[0] = (u32) data0;  data0 >>= 32; ptr_out += 4;}
 *
 *   offset = offset + (prolog & 7);
 *   if (offset >= 8) {
 *     data70 = dataF8;
 *     dataF8 = *ptr8_in++;
 *   }
 *   offset = offset & 0x7;
 *
 *   prolog = prolog >> 3;
 *   if (prolog) for (i=0; i < prolog; i++) {
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = dataF8;
 *       dataF8 = *ptr8_in++;
 *   }
 *   if(kernel) { kernel -= 1; epilog += 32; }
 *   if(kernel) for(i=0; i < kernel; i++) {
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = *ptr8_in++;
 *
 *       data0 = HEXAGON_P_valignb_PPp(data70, dataF8, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       dataF8 = *ptr8_in++;
 *
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = *ptr8_in++;
 *
 *       data0 = HEXAGON_P_valignb_PPp(data70, dataF8, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       dataF8 = *ptr8_in++;
 *   }
 *   epilogdws = epilog >> 3;
 *   if (epilogdws) for (i=0; i < epilogdws; i++) {
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = dataF8;
 *       dataF8 = *ptr8_in++;
 *   }
 *   data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *
 *   ptr4 = (s32 *) &ptr_out[0];
 *   if (epilog & 4) { ptr4[0] = (u32) data0; data0 >>= 32; ptr_out += 4;}
 *   ptr2 = (s16 *) &ptr_out[0];
 *   if (epilog & 2) { ptr2[0] = (u16) data0; data0 >>= 16; ptr_out += 2;}
 *   if (epilog & 1) { *ptr_out++ = (u8) data0; }
 *
 *   return(ptr_out - len);
 * }
 *
 * Codesize : 784 bytes
 */


/*
 * Register aliases.  Several aliases deliberately name the same physical
 * register (e.g. mask/shift/rest are all R8, prolog/epilogdws are both R15,
 * dalign is R25, the upper half of ldata1); each is only live in a different
 * phase of the routine.
 */
#define ptr_out		R0	/* destination pointer */
#define ptr_in		R1	/* source pointer */
#define len		R2	/* length of copy in bytes */

#define data70		R13:12	/* lo 8 bytes of non-aligned transfer */
#define dataF8		R11:10	/* hi 8 bytes of non-aligned transfer */
#define ldata0		R7:6	/* even 8 bytes chunks */
#define ldata1		R25:24	/* odd 8 bytes chunks */
#define data1		R7	/* upper 32 bits of ldata0 (R7:6) */
#define data0		R6	/* lower 32 bits of ldata0 (R7:6) */

#define ifbyte		p0	/* if transfer has bytes in epilog/prolog */
#define ifhword		p0	/* if transfer has shorts in epilog/prolog */
#define ifword		p0	/* if transfer has words in epilog/prolog */
#define noprolog	p0	/* no prolog, xfer starts at 32byte */
#define nokernel	p1	/* no 32byte multiple block in the transfer */
#define noepilog	p0	/* no epilog, xfer ends on 32byte boundary */
#define align		p2	/* alignment of input rel to 8byte boundary */
#define kernel1		p0	/* kernel count == 1 */

#define dalign		R25	/* rel alignment of input to output data */
#define star3		R16	/* number bytes in prolog - dwords */
#define rest		R8	/* length - prolog bytes */
#define back		R7	/* nr bytes > dword boundary in src block */
#define epilog		R3	/* bytes in epilog */
#define inc		R15:14	/* inc kernel by -1 and prefetch ptr by 32 */
#define kernel		R4	/* number of 32byte chunks in kernel */
#define ptr_in_p_128	R5	/* pointer for prefetch of input data */
#define mask		R8	/* mask used to determine prolog size */
#define shift		R8	/* used to work a shifter to extract bytes */
#define shift2		R5	/* in epilog to workshifter to extract bytes */
#define prolog		R15	/* bytes in prolog */
#define epilogdws	R15	/* number dwords in epilog */
#define shiftb		R14	/* used to extract bytes */
#define offset		R9	/* same as align in reg */
#define ptr_out_p_32	R17	/* pointer to output dczero */
#define align888	R14	/* if simple dword loop can be used */
#define len8		R9	/* number of dwords in length */
#define over		R20	/* nr of bytes > last inp buf dword boundary */

#define ptr_in_p_128kernel	R5:4	/* packed fetch pointer & kernel cnt */

/*
 * memcpy entry.
 *
 * In:   R0 = ptr_out (destination), R1 = ptr_in (source), R2 = len (bytes)
 * Out:  R0 = destination pointer as passed in (every exit either leaves R0
 *       untouched or subtracts the advanced byte count back off it)
 *
 * Dispatch, in the order the checks are made:
 *   1. len == 0 or ptr_in == ptr_out: return at once.
 *   2. (ptr_in | ptr_out | len) has its low 3 bits clear and len <= 95:
 *      .Ldwordaligned, a plain 8-byte load/store loop.
 *   3. len <= 23: .Lbytes23orless, a byte loop.
 *   4. otherwise: the general path (prolog up to a 32-byte destination
 *      boundary, 32-byte kernel loop, epilog).
 *
 * Only the general path builds a stack frame.  It also repoints r31 at
 * .Lmemcpy_return, so every "jumpr r31" on that path lands on the code that
 * restores r16-r17, r20-r21 and r24-r25 and tears the frame down before the
 * real return.
 *
 * NOTE(review): the short tags in the entry packets ("=0", "%1, <24",
 * "%8 < 97") appear to mark which of the dispatch cases above each
 * instruction serves; they are kept from the original source.
 */
	.section .text
	.p2align 4
	.global memcpy
	.type memcpy, @function
memcpy:
{
	p2 = cmp.eq(len, #0);			/* =0 */
	align888 = or(ptr_in, ptr_out);		/* %8 < 97 */
	p0 = cmp.gtu(len, #23);			/* %1, <24 */
	p1 = cmp.eq(ptr_in, ptr_out);		/* attempt to overwrite self */
}
{
	p1 = or(p2, p1);
	p3 = cmp.gtu(len, #95);			/* %8 < 97 */
	align888 = or(align888, len);		/* %8 < 97 */
	len8 = lsr(len, #3);			/* %8 < 97 */
}
{
	dcfetch(ptr_in);			/* zero/ptrin=ptrout causes fetch */
	p2 = bitsclr(align888, #7);		/* %8 < 97 */
	if(p1) jumpr r31;			/* =0 */
}
{
	p2 = and(p2,!p3);			/* %8 < 97 */
	if (p2.new) len = add(len, #-8);	/* %8 < 97 */
	if (p2.new) jump:NT .Ldwordaligned;	/* %8 < 97 */
}
{
	if(!p0) jump .Lbytes23orless;		/* %1, <24 */
	mask.l = #LO(0x7fffffff);
	/* all bytes before line multiples of data */
	prolog = sub(#0, ptr_out);
}
{
	/*
	 * push r31:30 and reserve 24 bytes of frame; the three memd stores
	 * that follow fill it with r17:16, r25:24 and r21:20
	 */
	allocframe(#24);
	mask.h = #HI(0x7fffffff);
	ptr_in_p_128 = add(ptr_in, #32);
	back = cl0(len);
}
{
	memd(sp+#0) = R17:16;			/* save r16,r17 on stack */
	r31.l = #LO(.Lmemcpy_return);		/* set up final return pointer */
	prolog &= lsr(mask, back);
	offset = and(ptr_in, #7);
}
{
	memd(sp+#8) = R25:24;			/* save r25,r24 on stack */
	dalign = sub(ptr_out, ptr_in);
	r31.h = #HI(.Lmemcpy_return);		/* set up final return pointer */
}
{
	/* see if the input buffer end is aligned */
	over = add(len, ptr_in);
	back = add(len, offset);
	/*
	 * over is r20: instructions in a packet read their sources before
	 * the packet's results are written, so the caller's r20 is what
	 * gets stored here (see history, version 1.6)
	 */
	memd(sp+#16) = R21:20;			/* save r20,r21 on stack */
}
{
	noprolog = bitsclr(prolog, #7);
	prolog = and(prolog, #31);
	dcfetch(ptr_in_p_128);
	ptr_in_p_128 = add(ptr_in_p_128, #32);
}
{
	kernel = sub(len, prolog);
	shift = asl(prolog, #3);
	star3 = and(prolog, #7);
	ptr_in = and(ptr_in, #-8);
}
{
	prolog = lsr(prolog, #3);
	epilog = and(kernel, #31);
	ptr_out_p_32 = add(ptr_out, prolog);
	over = and(over, #7);
}
{
	p3 = cmp.gtu(back, #8);
	kernel = lsr(kernel, #5);
	dcfetch(ptr_in_p_128);
	ptr_in_p_128 = add(ptr_in_p_128, #32);
}
{
	p1 = cmp.eq(prolog, #0);
	if(!p1.new) prolog = add(prolog, #1);
	dcfetch(ptr_in_p_128);			/* reserve the line 64bytes on */
	ptr_in_p_128 = add(ptr_in_p_128, #32);
}
{
	nokernel = cmp.eq(kernel,#0);
	dcfetch(ptr_in_p_128);			/* reserve the line 64bytes on */
	ptr_in_p_128 = add(ptr_in_p_128, #32);
	shiftb = and(shift, #8);
}
{
	dcfetch(ptr_in_p_128);			/* reserve the line 64bytes on */
	ptr_in_p_128 = add(ptr_in_p_128, #32);
	if(nokernel) jump .Lskip64;
	p2 = cmp.eq(kernel, #1);		/* skip ovr if kernel == 0 */
}
{
	dczeroa(ptr_out_p_32);
	/* don't advance pointer */
	if(!p2) ptr_out_p_32 = add(ptr_out_p_32, #32);
}
{
	dalign = and(dalign, #31);
	dczeroa(ptr_out_p_32);
}
.Lskip64:
{
	data70 = memd(ptr_in++#16);
	if(p3) dataF8 = memd(ptr_in+#8);
	if(noprolog) jump .Lnoprolog32;
	align = offset;
}
/* upto initial 7 bytes */
{
	ldata0 = valignb(dataF8, data70, align);
	ifbyte = tstbit(shift,#3);
	offset = add(offset, star3);
}
{
	if(ifbyte) memb(ptr_out++#1) = data0;
	ldata0 = lsr(ldata0, shiftb);
	shiftb = and(shift, #16);
	ifhword = tstbit(shift,#4);
}
{
	if(ifhword) memh(ptr_out++#2) = data0;
	ldata0 = lsr(ldata0, shiftb);
	ifword = tstbit(shift,#5);
	p2 = cmp.gtu(offset, #7);
}
{
	if(ifword) memw(ptr_out++#4) = data0;
	if(p2) data70 = dataF8;
	if(p2) dataF8 = memd(ptr_in++#8);	/* another 8 bytes */
	align = offset;
}
.Lnoprolog32:
{
	/* sp1loop0 leaves p3 false for the first pass, killing that store */
	p3 = sp1loop0(.Ldword_loop_prolog, prolog)
	rest = sub(len, star3);			/* whats left after the loop */
	p0 = cmp.gt(over, #0);
}
	if(p0) rest = add(rest, #16);
.Ldword_loop_prolog:
{
	if(p3) memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(dataF8, data70, align);
	p0 = cmp.gt(rest, #16);
}
{
	data70 = dataF8;
	if(p0) dataF8 = memd(ptr_in++#8);
	rest = add(rest, #-8);
}:endloop0
.Lkernel:
{
	/* kernel is at least 32bytes */
	p3 = cmp.gtu(kernel, #0);
	/* last itn. remove edge effects */
	if(p3.new) kernel = add(kernel, #-1);
	/* dealt with in last dword loop */
	if(p3.new) epilog = add(epilog, #32);
}
{
	nokernel = cmp.eq(kernel, #0);		/* after adjustment, recheck */
	if(nokernel.new) jump:NT .Lepilog;	/* likely not taken */
	inc = combine(#32, #-1);
	p3 = cmp.gtu(dalign, #24);
}
{
	if(p3) jump .Lodd_alignment;
}
{
	loop0(.Loword_loop_25to31, kernel);
	kernel1 = cmp.gtu(kernel, #1);
	rest = kernel;
}
	.falign
.Loword_loop_25to31:
{
	dcfetch(ptr_in_p_128);			/* prefetch 4 lines ahead */
	if(kernel1) ptr_out_p_32 = add(ptr_out_p_32, #32);
}
{
	dczeroa(ptr_out_p_32);			/* reserve the next 32bytes in cache */
	p3 = cmp.eq(kernel, rest);
}
{
	/* kernel -= 1 */
	ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc);
	/* kill write on first iteration */
	if(!p3) memd(ptr_out++#8) = ldata1;
	ldata1 = valignb(dataF8, data70, align);
	data70 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(data70, dataF8, align);
	dataF8 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata1;
	ldata1 = valignb(dataF8, data70, align);
	data70 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(data70, dataF8, align);
	dataF8 = memd(ptr_in++#8);
	kernel1 = cmp.gtu(kernel, #1);
}:endloop0
{
	memd(ptr_out++#8) = ldata1;
	jump .Lepilog;
}
.Lodd_alignment:
{
	loop0(.Loword_loop_00to24, kernel);
	kernel1 = cmp.gtu(kernel, #1);
	rest = add(kernel, #-1);
}
	.falign
.Loword_loop_00to24:
{
	dcfetch(ptr_in_p_128);			/* prefetch 4 lines ahead */
	ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc);
	if(kernel1) ptr_out_p_32 = add(ptr_out_p_32, #32);
}
{
	dczeroa(ptr_out_p_32);			/* reserve the next 32bytes in cache */
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(dataF8, data70, align);
	data70 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(data70, dataF8, align);
	dataF8 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(dataF8, data70, align);
	data70 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(data70, dataF8, align);
	dataF8 = memd(ptr_in++#8);
	kernel1 = cmp.gtu(kernel, #1);
}:endloop0
.Lepilog:
{
	noepilog = cmp.eq(epilog,#0);
	epilogdws = lsr(epilog, #3);
	kernel = and(epilog, #7);
}
{
	/* r31 holds .Lmemcpy_return here, so this exits via the restore code */
	if(noepilog) jumpr r31;
	if(noepilog) ptr_out = sub(ptr_out, len);
	p3 = cmp.eq(epilogdws, #0);
	shift2 = asl(epilog, #3);
}
{
	shiftb = and(shift2, #32);
	ifword = tstbit(epilog,#2);
	if(p3) jump .Lepilog60;
	if(!p3) epilog = add(epilog, #-16);
}
{
	loop0(.Ldword_loop_epilog, epilogdws);
	/* stop criteria is lsbs unless = 0 then its 8 */
	p3 = cmp.eq(kernel, #0);
	if(p3.new) kernel= #8;
	p1 = cmp.gt(over, #0);
}
	/* if not aligned to end of buffer execute 1 more iteration */
	if(p1) kernel= #0;
.Ldword_loop_epilog:
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(dataF8, data70, align);
	p3 = cmp.gt(epilog, kernel);
}
{
	data70 = dataF8;
	if(p3) dataF8 = memd(ptr_in++#8);
	epilog = add(epilog, #-8);
}:endloop0
/* copy last 7 bytes */
.Lepilog60:
{
	if(ifword) memw(ptr_out++#4) = data0;
	ldata0 = lsr(ldata0, shiftb);
	ifhword = tstbit(epilog,#1);
	shiftb = and(shift2, #16);
}
{
	if(ifhword) memh(ptr_out++#2) = data0;
	ldata0 = lsr(ldata0, shiftb);
	ifbyte = tstbit(epilog,#0);
	/* the final byte store below does not post-increment ptr_out */
	if(ifbyte.new) len = add(len, #-1);
}
{
	if(ifbyte) memb(ptr_out) = data0;
	ptr_out = sub(ptr_out, len);		/* return dest pointer */
	jumpr r31;
}
/* do byte copy for small n */
.Lbytes23orless:
{
	/* sp1loop0 leaves p3 false for the first pass, killing that store */
	p3 = sp1loop0(.Lbyte_copy, len);
	len = add(len, #-1);
}
.Lbyte_copy:
{
	data0 = memb(ptr_in++#1);
	if(p3) memb(ptr_out++#1) = data0;
}:endloop0
{
	memb(ptr_out) = data0;
	ptr_out = sub(ptr_out, len);
	jumpr r31;
}
/* do dword copies for aligned in, out and length */
.Ldwordaligned:
{
	/* sp1loop0 leaves p3 false for the first pass, killing that store */
	p3 = sp1loop0(.Ldword_copy, len8);
}
.Ldword_copy:
{
	if(p3) memd(ptr_out++#8) = ldata0;
	ldata0 = memd(ptr_in++#8);
}:endloop0
{
	memd(ptr_out) = ldata0;
	ptr_out = sub(ptr_out, len);
	jumpr r31;				/* return to function caller */
}
/*
 * Common exit for the general path: reached through the r31 value planted
 * right after allocframe.  Reloads the saved register pairs, then
 * deallocframe brings back the caller's r31:30 for the real return.
 */
.Lmemcpy_return:
	r21:20 = memd(sp+#16);			/* restore r20+r21 */
{
	r25:24 = memd(sp+#8);			/* restore r24+r25 */
	r17:16 = memd(sp+#0);			/* restore r16+r17 */
}
	deallocframe;				/* restore r31:30 and release the frame */
	jumpr r31