/* SPDX-License-Identifier: GPL-2.0 */
/*
 * This routine clears to zero a linear memory buffer in user space.
 *
 * Inputs:
 *	in0:	address of buffer
 *	in1:	length of buffer in bytes
 * Outputs:
 *	r8:	number of bytes that didn't get cleared due to a fault
 *
 * Registers written besides the output: r16-r21, p6, p7.
 * ar.lc is used as a loop counter and is restored from saved_lc on every
 * exit taken after it has been modified.
 *
 * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
 *	Stephane Eranian <eranian@hpl.hp.com>
 */

#include <linux/export.h>
#include <asm/asmmacro.h>

//
// arguments
//
#define buf		r32
#define len		r33

//
// local registers
//
#define cnt		r16
#define buf2		r17
#define saved_lc	r18
#define saved_pfs	r19
#define tmp		r20
#define len2		r21

//
// Theory of operations:
//	- we check whether or not the buffer is small, i.e., less than 17
//	  in which case we do the byte by byte loop.
//
//	- Otherwise we go progressively from 1 byte store to 8byte store in
//	  the head part, the body is a 16byte store loop and we finish with the
//	  tail for the last 15 bytes.
//	  The good point about this breakdown is that the long buffer handling
//	  contains only 2 branches.
//
//	The reason for not using shifting & masking for both the head and the
//	tail is to stay semantically correct. This routine is not supposed
//	to write bytes outside of the buffer. While most of the time this would
//	be ok, we can't tolerate a mistake. A classical example is the case
//	of multithreaded code where the extra bytes touched are actually owned
//	by another thread which runs concurrently to ours. Another, less likely,
//	example is with device drivers where reading an I/O mapped location may
//	have side effects (same thing for writing).
//

GLOBAL_ENTRY(__do_clear_user)
	.prologue
	.save ar.pfs, saved_pfs
	alloc	saved_pfs=ar.pfs,2,0,0,0
	cmp.eq p6,p0=r0,len		// check for zero length
	.save ar.lc, saved_lc
	mov saved_lc=ar.lc		// preserve ar.lc (slow)
	.body
	;;				// avoid WAW on CFM
	adds tmp=-1,len			// br.cloop is repeat/until
	mov ret0=len			// return value is length at this point
(p6)	br.ret.spnt.many rp		// len == 0: ar.lc has not been touched yet
	;;
	cmp.lt p6,p0=16,len		// if len > 16 then long memset
	//
	// This write precedes the branch inside the same instruction group,
	// so it also takes effect when we go to .long_do_clear: from here on
	// every exit has to put saved_lc back into ar.lc.
	//
	mov ar.lc=tmp			// initialize lc for small count
(p6)	br.cond.dptk .long_do_clear
	;;				// WAR on ar.lc
	//
	// worst case 16 iterations, avg 8 iterations
	//
	// We could have played with the predicates to use the extra
	// M slot for 2 stores/iteration but the cost of the initialization
	// of the various counters compared to how long the loop is supposed
	// to last on average does not make this solution viable.
	//
1:
	EX( .Lexit1, st1 [buf]=r0,1 )
	adds len=-1,len			// countdown length using len
	br.cloop.dptk 1b
	;;				// avoid RAW on ar.lc
	//
	// .Lexit1: comes from byte by byte loop
	//	    len contains bytes left
.Lexit1:
	mov ret0=len			// faster than using ar.lc
	mov ar.lc=saved_lc
	br.ret.sptk.many rp		// end of short clear_user


	//
	// At this point we know we have more than 16 bytes to clear
	// so we focus on alignment (no branches required)
	//
	// The use of len/len2 for countdown of the number of bytes left
	// instead of ret0 is due to the fact that the exception code
	// changes the values of r8.
	//
	// In each step the length update follows its store inside the same
	// instruction group, so when the store faults the update has not
	// happened and len still counts the byte(s) that store would have
	// cleared.
	//
.long_do_clear:
	tbit.nz p6,p0=buf,0		// odd alignment (for long_do_clear)
	;;
	EX( .Lexit3, (p6) st1 [buf]=r0,1 )	// 1-byte aligned
(p6)	adds len=-1,len;;		// sync because buf is modified
	tbit.nz p6,p0=buf,1
	;;
	EX( .Lexit3, (p6) st2 [buf]=r0,2 )	// 2-byte aligned
(p6)	adds len=-2,len;;
	tbit.nz p6,p0=buf,2
	;;
	EX( .Lexit3, (p6) st4 [buf]=r0,4 )	// 4-byte aligned
(p6)	adds len=-4,len;;
	tbit.nz p6,p0=buf,3
	;;
	EX( .Lexit3, (p6) st8 [buf]=r0,8 )	// 8-byte aligned
(p6)	adds len=-8,len;;
	shr.u cnt=len,4		// number of 128-bit (2x64bit) words
	;;
	cmp.eq p6,p0=r0,cnt
	adds tmp=-1,cnt
(p6)	br.cond.dpnt .dotail		// we have less than 16 bytes left
	;;
	adds buf2=8,buf			// setup second base pointer
	mov ar.lc=tmp
	;;

	//
	// 16bytes/iteration core loop
	//
	// The second store can never generate a fault because
	// we come into the loop only when we are 16-byte aligned.
	// This means that if we cross a page then it will always be
	// in the first store and never in the second.
	//
	//
	// We need to keep track of the remaining length. A possible (optimistic)
	// way would be to use ar.lc and derive how many bytes were left by
	// doing : left= 16*ar.lc + 16. This would avoid the addition at
	// every iteration.
	// However we need to keep the synchronization point. A template
	// M;;MB does not exist and thus we can keep the addition at no
	// extra cycle cost (use a nop slot anyway). It also simplifies the
	// (unlikely) error recovery code
	//

2:	EX(.Lexit3, st8 [buf]=r0,16 )
	;;				// needed to get len correct when error
	st8 [buf2]=r0,16
	adds len=-16,len
	br.cloop.dptk 2b
	;;				// avoid RAW on ar.lc (restored at .dotail)
	//
	// tail correction based on len only
	//
	// The tail is entered either by falling out of the core loop or
	// directly from the head when fewer than 16 bytes remained. ar.lc is
	// modified on both routes, so it is restored here, where the two
	// routes meet, rather than after the loop only.
	//
	// The size bits are tested on len, which is not written in the tail.
	// len2 is the running count of bytes left: each step subtracts its
	// size under the same predicate as, and after, its store. A store that
	// faults therefore leaves len2 equal to the bytes not yet cleared,
	// whichever of the earlier steps were skipped.
	// p6 and p7 alternate so that no group both reads and writes the
	// same predicate.
	//
.dotail:
	mov ar.lc=saved_lc		// done with counted loops on this path
	mov len2=len			// len2 = bytes left, updated step by step
	tbit.nz p6,p0=len,3
	;;
	EX( .Lexit2, (p6) st8 [buf]=r0,8 )	// at least 8 bytes
(p6)	adds len2=-8,len2
	tbit.nz p7,p0=len,2
	;;
	EX( .Lexit2, (p7) st4 [buf]=r0,4 )	// at least 4 bytes
(p7)	adds len2=-4,len2
	tbit.nz p6,p0=len,1
	;;
	EX( .Lexit2, (p6) st2 [buf]=r0,2 )	// at least 2 bytes
(p6)	adds len2=-2,len2
	tbit.nz p7,p0=len,0
	;;
	EX( .Lexit2, (p7) st1 [buf]=r0 )	// only 1 byte left
	mov ret0=r0				// success
	br.ret.sptk.many rp			// end of most likely path

	//
	// Outlined error handling code
	//

	//
	// .Lexit2: comes from the tail
	//	    len2 contains bytes left; move it into len and share
	//	    the common exit below.
	//
.Lexit2:
	mov len=len2
	;;
	//
	// .Lexit3: comes from head or core loop, need restore lc
	//	    len contains bytes left
	//
.Lexit3:
	mov ret0=len
	mov ar.lc=saved_lc
	br.ret.sptk.many rp
END(__do_clear_user)
EXPORT_SYMBOL(__do_clear_user)