/* SPDX-License-Identifier: GPL-2.0 */
/*
 *
 * Optimized version of the copy_user() routine.
 * It is used to copy data across the kernel/user boundary.
 *
 * The source and destination are always on opposite side of
 * the boundary. When reading from user space we must catch
 * faults on loads. When writing to user space we must catch
 * errors on stores. Note that because of the nature of the copy
 * we don't need to worry about overlapping regions.
 *
 *
 * Inputs:
 *	in0	address of destination buffer	(dst, see #define below)
 *	in1	address of source buffer	(src, see #define below)
 *	in2	number of bytes to copy
 *
 * Outputs:
 *	ret0	0 in case of success. The number of bytes NOT copied in
 *		case of error.
 *
 * Code paths (selected at the top of __copy_user):
 *	- len == 0:			return immediately with ret0 = 0
 *	- len <= COPY_BREAK:		software pipelined byte-by-byte loop
 *	- len >  COPY_BREAK, (src ^ dst) & 7 == 0:
 *					.long_copy_user (head alignment,
 *					16 bytes/iteration loop, .dotail)
 *	- len >  COPY_BREAK, (src ^ dst) & 7 != 0:
 *					.diff_align_copy_user (byte head,
 *					shrp-based 8-byte loop, byte tail)
 *
 * Fault handling:
 *	- a faulting load ends up in one of the .failure_in* handlers, which
 *	  set ret0 = endsrc - src1 and store zeros to the rest of the
 *	  destination;
 *	- a faulting store ends up in .failure_out, which sets
 *	  ret0 = enddst - dst1.
 *	The EX(label, insn) wrapper comes from <asm/asmmacro.h>; as used in
 *	this file its first argument is the label to continue at when the
 *	wrapped load/store faults.
 *
 * Copyright (C) 2000-2001 Hewlett-Packard Co
 *	Stephane Eranian <eranian@hpl.hp.com>
 *
 * Fixme:
 *	- handle the case where we have more than 16 bytes and the alignment
 *	  are different.
 *	- more benchmarking
 *	- fix extraneous stop bit introduced by the EX() macro.
 */

#include <linux/export.h>
#include <asm/asmmacro.h>

//
// Tuneable parameters
//
#define COPY_BREAK	16	// we do byte copy below (must be >=16)
#define PIPE_DEPTH	21	// pipe depth

#define EPI		p[PIPE_DEPTH-1]	// predicate of the last pipeline stage

//
// arguments
//
#define dst		in0
#define src		in1
#define len		in2

//
// local registers
//
#define t1		r2	// rshift in bytes
#define t2		r3	// lshift in bytes
#define rshift		r14	// right shift in bits
#define lshift		r15	// left shift in bits
#define word1		r16
#define word2		r17
#define cnt		r18
#define len2		r19
#define saved_lc	r20
#define saved_pr	r21
#define tmp		r22
#define val		r23
#define src1		r24
#define dst1		r25
#define src2		r26
#define dst2		r27
#define len1		r28
#define enddst		r29
#define endsrc		r30
#define saved_pfs	r31

GLOBAL_ENTRY(__copy_user)
	.prologue
	.save ar.pfs, saved_pfs
	// 3 inputs, no outputs; the locals (2*PIPE_DEPTH rounded up to a
	// multiple of 8) are all rotating and hold val1[] and val2[].
	alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7)

	.rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH]
	.rotp p[PIPE_DEPTH]

	adds len2=-1,len	// br.ctop is repeat/until
	mov ret0=r0

	;;			// RAW of cfm when len=0
	cmp.eq p8,p0=r0,len	// check for zero length
	.save ar.lc, saved_lc
	mov saved_lc=ar.lc	// preserve ar.lc (slow)
(p8)	br.ret.spnt.many rp	// empty memcpy()
	;;
	add enddst=dst,len	// first byte after end of destination
	add endsrc=src,len	// first byte after end of source
	.save pr, saved_pr
	mov saved_pr=pr		// preserve predicates

	.body

	mov dst1=dst		// copy because of rotation
	mov ar.ec=PIPE_DEPTH
	mov pr.rot=1<<16	// p16=true all others are false

	mov src1=src		// copy because of rotation
	mov ar.lc=len2		// initialize lc for small count
	cmp.lt p10,p7=COPY_BREAK,len	// if len > COPY_BREAK then long copy

	xor tmp=src,dst		// same alignment test prepare
(p10)	br.cond.dptk .long_copy_user
	;;			// RAW pr.rot/p16 ?
	//
	// Now we do the byte by byte loop with software pipeline
	//
	// p7 is necessarily false by now
1:
	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
	br.ctop.dptk.few 1b
	;;
	mov ar.lc=saved_lc
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.pfs=saved_pfs		// restore ar.ec
	br.ret.sptk.many rp		// end of short memcpy

	//
	// Not 8-byte aligned
	//
.diff_align_copy_user:
	// At this point we know we have more than 16 bytes to copy
	// and also that src and dest do _not_ have the same alignment.
	and src2=0x7,src1				// src offset
	and dst2=0x7,dst1				// dst offset
	;;
	// The basic idea is that we copy byte-by-byte at the head so
	// that we can reach 8-byte alignment for both src1 and dst1.
	// Then copy the body using software pipelined 8-byte copy,
	// shifting the two back-to-back words right and left, then copy
	// the tail by copying byte-by-byte.
	//
	// Fault handling. If the byte-by-byte at the head fails on the
	// load, then restart and finish the pipeline by copying zeros
	// to the dst1. Then copy zeros for the rest of dst1.
	// If 8-byte software pipeline fails on the load, do the same as
	// failure_in3 does. If the byte-by-byte at the tail fails, it is
	// handled simply by failure_in_pipe1.
	//
	// The case p14 represents the source has more bytes in
	// the first word (by the shifted part), whereas the p15 needs to
	// copy some bytes from the 2nd word of the source that has the
	// tail of the 1st of the destination.
	//

	//
	// Optimization. If dst1 is 8-byte aligned (quite common), we don't need
	// to copy the head to dst1, to start 8-byte copy software pipeline.
	// We know src1 is not 8-byte aligned in this case.
	//
	cmp.eq p14,p15=r0,dst2
(p15)	br.cond.spnt 1f
	;;
	sub t1=8,src2
	mov t2=src2
	;;
	shl rshift=t2,3
	sub len1=len,t1					// set len1
	;;
	sub lshift=64,rshift
	;;
	br.cond.spnt .word_copy_user
	;;
1:
	cmp.leu	p14,p15=src2,dst2
	sub t1=dst2,src2
	;;
	.pred.rel "mutex", p14, p15
(p14)	sub word1=8,src2				// (8 - src offset)
(p15)	sub t1=r0,t1					// absolute value
(p15)	sub word1=8,dst2				// (8 - dst offset)
	;;
	// For the case p14, we don't need to copy the shifted part to
	// the 1st word of destination.
	sub t2=8,t1
(p14)	sub word1=word1,t1
	;;
	sub len1=len,word1				// resulting len
(p15)	shl rshift=t1,3					// in bits
(p14)	shl rshift=t2,3
	;;
(p14)	sub len1=len1,t1
	adds cnt=-1,word1
	;;
	sub lshift=64,rshift
	mov ar.ec=PIPE_DEPTH
	mov pr.rot=1<<16	// p16=true all others are false
	mov ar.lc=cnt
	;;
2:
	EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
	br.ctop.dptk.few 2b
	;;
	clrrrb
	;;
.word_copy_user:
	cmp.gtu p9,p0=16,len1
(p9)	br.cond.spnt 4f			// if (16 > len1) skip 8-byte copy
	;;
	shr.u cnt=len1,3		// number of 64-bit words
	;;
	adds cnt=-1,cnt
	;;
	.pred.rel "mutex", p14, p15
(p14)	sub src1=src1,t2
(p15)	sub src1=src1,t1
	//
	// Now both src1 and dst1 point to an 8-byte aligned address. And
	// we have more than 8 bytes to copy.
	//
	mov ar.lc=cnt
	mov ar.ec=PIPE_DEPTH
	mov pr.rot=1<<16	// p16=true all others are false
	;;
3:
	//
	// The pipeline consists of 3 stages:
	// 1 (p16):	Load a word from src1
	// 2 (EPI_1):	Shift right pair, saving to tmp
	// 3 (EPI):	Store tmp to dst1
	//
	// To make it simple, use at least 2 (p16) loops to set up val1[n]
	// because we need 2 back-to-back val1[] to get tmp.
	// Note that this implies EPI_1 must be p18 or greater.
	//

	//
	// SWITCH(pred, shift): set pred when rshift == shift.
	// CASE(pred, shift):   branch to .copy_user_bit<shift> when pred is set.
	// BODY(rshift):        the pipelined loop for one literal shift count.
	//                      Local label 1 is the normal loop; a faulting
	//                      ld8 continues at local label 3, inside the
	//                      loop at local label 2, which feeds zeros into
	//                      the pipeline instead of loading and finally
	//                      branches to .failure_in2.
	// (No comments inside the macro bodies: they are joined into a single
	// line by the backslash continuations.)
	//
#define EPI_1		p[PIPE_DEPTH-2]
#define SWITCH(pred, shift)	cmp.eq pred,p0=shift,rshift
#define CASE(pred, shift)	\
	(pred)	br.cond.spnt .copy_user_bit##shift
#define BODY(rshift)						\
.copy_user_bit##rshift:						\
1:								\
	EX(.failure_out,(EPI) st8 [dst1]=tmp,8);		\
(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift;	\
	EX(3f,(p16) ld8 val1[1]=[src1],8);			\
(p16)	mov val1[0]=r0;						\
	br.ctop.dptk 1b;					\
	;;							\
	br.cond.sptk.many .diff_align_do_tail;			\
2:								\
(EPI)	st8 [dst1]=tmp,8;					\
(EPI_1)	shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift;	\
3:								\
(p16)	mov val1[1]=r0;						\
(p16)	mov val1[0]=r0;						\
	br.ctop.dptk 2b;					\
	;;							\
	br.cond.sptk.many .failure_in2

	//
	// Since the instruction 'shrp' requires a fixed 128-bit value
	// specifying the bits to shift, we need to provide 7 cases
	// below.
	//
	SWITCH(p6, 8)
	SWITCH(p7, 16)
	SWITCH(p8, 24)
	SWITCH(p9, 32)
	SWITCH(p10, 40)
	SWITCH(p11, 48)
	SWITCH(p12, 56)
	;;
	CASE(p6, 8)
	CASE(p7, 16)
	CASE(p8, 24)
	CASE(p9, 32)
	CASE(p10, 40)
	CASE(p11, 48)
	CASE(p12, 56)
	;;
	BODY(8)
	BODY(16)
	BODY(24)
	BODY(32)
	BODY(40)
	BODY(48)
	BODY(56)
	;;
.diff_align_do_tail:
	.pred.rel "mutex", p14, p15
(p14)	sub src1=src1,t1
(p14)	adds dst1=-8,dst1
(p15)	sub dst1=dst1,t1
	;;
4:
	// Tail correction.
	//
	// The problem with this pipelined loop is that the last word is not
	// loaded and thus part of the last word written is not correct.
	// To fix that, we simply copy the tail byte by byte.

	sub len1=endsrc,src1,1
	clrrrb
	;;
	mov ar.ec=PIPE_DEPTH
	mov pr.rot=1<<16	// p16=true all others are false
	mov ar.lc=len1
	;;
5:
	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
	br.ctop.dptk.few 5b
	;;
	mov ar.lc=saved_lc
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

	//
	// Beginning of long memcpy (i.e. > 16 bytes)
	//
.long_copy_user:
	tbit.nz p6,p7=src1,0	// odd alignment
	and tmp=7,tmp
	;;
	cmp.eq p10,p8=r0,tmp
	mov len1=len		// copy because of rotation
(p8)	br.cond.dpnt .diff_align_copy_user
	;;
	// At this point we know we have more than 16 bytes to copy
	// and also that both src and dest have the same alignment
	// which may not be the one we want. So for now we must move
	// forward slowly until we reach 16byte alignment: no need to
	// worry about reaching the end of buffer.
	//
	EX(.failure_in1,(p6) ld1 val1[0]=[src1],1)	// 1-byte aligned
(p6)	adds len1=-1,len1;;
	tbit.nz p7,p0=src1,1
	;;
	EX(.failure_in1,(p7) ld2 val1[1]=[src1],2)	// 2-byte aligned
(p7)	adds len1=-2,len1;;
	tbit.nz p8,p0=src1,2
	;;
	//
	// Stop bit not required after ld4 because if we fail on ld4
	// we have never executed the ld1, therefore st1 is not executed.
	//
	EX(.failure_in1,(p8) ld4 val2[0]=[src1],4)	// 4-byte aligned
	;;
	EX(.failure_out,(p6) st1 [dst1]=val1[0],1)
	tbit.nz p9,p0=src1,3
	;;
	//
	// Stop bit not required after ld8 because if we fail on ld8
	// we have never executed the ld2, therefore st2 is not executed.
	//
	EX(.failure_in1,(p9) ld8 val2[1]=[src1],8)	// 8-byte aligned
	EX(.failure_out,(p7) st2 [dst1]=val1[1],2)
(p8)	adds len1=-4,len1
	;;
	EX(.failure_out, (p8) st4 [dst1]=val2[0],4)
(p9)	adds len1=-8,len1;;
	shr.u cnt=len1,4		// number of 128-bit (2x64bit) words
	;;
	EX(.failure_out, (p9) st8 [dst1]=val2[1],8)
	tbit.nz p6,p0=len1,3
	cmp.eq p7,p0=r0,cnt
	adds tmp=-1,cnt			// br.ctop is repeat/until
(p7)	br.cond.dpnt .dotail		// we have less than 16 bytes left
	;;
	adds src2=8,src1
	adds dst2=8,dst1
	mov ar.lc=tmp
	;;
	//
	// 16bytes/iteration
	//
2:
	EX(.failure_in3,(p16) ld8 val1[0]=[src1],16)
(p16)	ld8 val2[0]=[src2],16

	EX(.failure_out, (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16)
(EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
	br.ctop.dptk 2b
	;;			// RAW on src1 when fall through from loop
	//
	// Tail correction based on len only
	//
	// No matter where we come from (loop or test) the src1 pointer
	// is 16 byte aligned AND we have less than 16 bytes to copy.
	//
.dotail:
	EX(.failure_in1,(p6) ld8 val1[0]=[src1],8)	// at least 8 bytes
	tbit.nz p7,p0=len1,2
	;;
	EX(.failure_in1,(p7) ld4 val1[1]=[src1],4)	// at least 4 bytes
	tbit.nz p8,p0=len1,1
	;;
	EX(.failure_in1,(p8) ld2 val2[0]=[src1],2)	// at least 2 bytes
	tbit.nz p9,p0=len1,0
	;;
	EX(.failure_out, (p6) st8 [dst1]=val1[0],8)
	;;
	EX(.failure_in1,(p9) ld1 val2[1]=[src1])	// only 1 byte left
	mov ar.lc=saved_lc
	;;
	EX(.failure_out,(p7) st4 [dst1]=val1[1],4)
	mov pr=saved_pr,0xffffffffffff0000
	;;
	EX(.failure_out, (p8) st2 [dst1]=val2[0],2)
	mov ar.pfs=saved_pfs
	;;
	EX(.failure_out, (p9) st1 [dst1]=val2[1])
	br.ret.sptk.many rp


	//
	// Here we handle the case where the byte by byte copy fails
	// on the load.
	// Several factors make the zeroing of the rest of the buffer kind of
	// tricky:
	//	- the pipeline: loads/stores are not in sync (pipeline)
	//
	//	  In the same loop iteration, the dst1 pointer does not directly
	//	  reflect where the faulty load was.
	//
	//	- pipeline effect
	//	  When you get a fault on load, you may have valid data from
	//	  previous loads not yet stored, still in transit. Such data
	//	  must be stored normally before moving onto zeroing the rest.
	//
	//	- single/multi dispersal independence.
	//
	// solution:
	//	- we don't disrupt the pipeline, i.e. data in transit in
	//	  the software pipeline will be eventually moved to memory.
	//	  We simply replace the load with a simple mov and keep the
	//	  pipeline going. We can't really do this inline because
	//	  p16 is always reset to 1 when lc > 0.
	//
.failure_in_pipe1:
	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
1:
(p16)	mov val1[0]=r0
(EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
	br.ctop.dptk 1b
	;;
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.lc=saved_lc
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

	//
	// This is the case where the byte by byte copy fails on the load
	// when we copy the head. We need to finish the pipeline and copy
	// zeros for the rest of the destination. Since this happens
	// at the top we still need to fill the body and tail.
.failure_in_pipe2:
	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
2:
(p16)	mov val1[0]=r0
(EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
	br.ctop.dptk 2b
	;;
	sub len=enddst,dst1,1		// precompute len
	br.cond.dptk.many .failure_in1bis
	;;

	//
	// Here we handle the head & tail part when we check for alignment.
	// The following code handles only the load failures. The
	// main difficulty comes from the fact that loads/stores are
	// scheduled. So when you fail on a load, the stores corresponding
	// to previous successful loads must be executed.
	//
	// However some simplifications are possible given the way
	// things work.
	//
	// 1) HEAD
	// Theory of operation:
	//
	//  Page A   | Page B
	//  ---------|-----
	//          1|8 x
	//        1 2|8 x
	//          4|8 x
	//        1 4|8 x
	//        2 4|8 x
	//      1 2 4|8 x
	//           |1
	//           |2 x
	//           |4 x
	//
	// page_size >= 4k (2^12).  (x means 4, 2, 1)
	// Here we suppose Page A exists and Page B does not.
	//
	// As we move towards eight byte alignment we may encounter faults.
	// The numbers on each page show the size of the load (current alignment).
	//
	// Key point:
	//	- if you fail on 1, 2, 4 then you have never executed any smaller
	//	  size loads, e.g. failing ld4 means no ld1 nor ld2 executed
	//	  before.
	//
	// This allows us to simplify the cleanup code, because basically you
	// only have to worry about "pending" stores in the case of a failing
	// ld8(). Given the way the code is written today, this means only
	// worry about st2, st4. There we can use the information encapsulated
	// into the predicates.
	//
	// Other key point:
	//	- if you fail on the ld8 in the head, it means you went straight
	//	  to it, i.e. 8byte alignment within a nonexistent page.
	// Again this comes from the fact that if you crossed just for the ld8 then
	// you are 8byte aligned but also 16byte align, therefore you would
	// either go for the 16byte copy loop OR the ld8 in the tail part.
	// The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible
	// because it would mean you had 15bytes to copy in which case you
	// would have defaulted to the byte by byte copy.
	//
	//
	// 2) TAIL
	// Here we know we have less than 16 bytes AND we are either 8 or 16 byte
	// aligned.
	//
	// Key point:
	// This means that we either:
	//		- are right on a page boundary
	//	OR
	//		- are at more than 16 bytes from a page boundary with
	//		  at most 15 bytes to copy: no chance of crossing.
	//
	// This allows us to assume that if we fail on a load we haven't possibly
	// executed any of the previous (tail) ones, so we don't need to do
	// any stores. For instance, if we fail on ld2, this means we had
	// 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4.
	//
	// This means that we are in a situation similar to a fault in the
	// head part. That's nice!
	//
.failure_in1:
	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
	sub len=endsrc,src1,1
	//
	// we know that ret0 can never be zero at this point
	// because we failed while trying to do a load, i.e. there is still
	// some work to do.
	// The failure_in1bis and length problem is taken care of at the
	// calling side.
	//
	;;
.failure_in1bis:		// from (.failure_in3)
	mov ar.lc=len		// Continue with a stupid byte store.
	;;
5:
	st1 [dst1]=r0,1
	br.cloop.dptk 5b
	;;
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.lc=saved_lc
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

	//
	// Here we simply restart the loop but instead
	// of doing loads we fill the pipeline with zeroes
	// We can't simply store r0 because we may have valid
	// data in transit in the pipeline.
	// ar.lc and ar.ec are setup correctly at this point
	//
	// we MUST use src1/endsrc here and not dst1/enddst because
	// of the pipeline effect.
	//
.failure_in3:
	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
	;;
2:
(p16)	mov val1[0]=r0
(p16)	mov val2[0]=r0
(EPI)	st8 [dst1]=val1[PIPE_DEPTH-1],16
(EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
	br.ctop.dptk 2b
	;;
	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
	sub len=enddst,dst1,1		// precompute len
(p6)	br.cond.dptk .failure_in1bis
	;;
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.lc=saved_lc
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

	//
	// Reached from the zero-feeding loop of BODY() once the 8-byte
	// shifted-copy pipeline has drained after a load fault: report the
	// bytes not copied and zero whatever is left of the destination.
	//
.failure_in2:
	sub ret0=endsrc,src1
	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
	sub len=enddst,dst1,1		// precompute len
(p6)	br.cond.dptk .failure_in1bis
	;;
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.lc=saved_lc
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

	//
	// handling of failures on stores: that's the easy part
	//
.failure_out:
	sub ret0=enddst,dst1
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.lc=saved_lc

	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp
END(__copy_user)
EXPORT_SYMBOL(__copy_user)