/* SPDX-License-Identifier: GPL-2.0 */
/*
 * This routine clears to zero a linear memory buffer in user space.
 *
 * Inputs:
 *	in0:	address of buffer
 *	in1:	length of buffer in bytes
 * Outputs:
 *	r8:	number of bytes that didn't get cleared due to a fault
 *
 * ar.lc is saved on entry and restored on every exit path that modified it.
 *
 * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
 *	Stephane Eranian <eranian@hpl.hp.com>
 */

#include <asm/asmmacro.h>
#include <asm/export.h>

//
// arguments
//
#define buf		r32
#define len		r33

//
// local registers
//
#define cnt		r16
#define buf2		r17
#define saved_lc	r18
#define saved_pfs	r19
#define tmp		r20
#define len2		r21
#define len3		r22

//
// Theory of operations:
//	- we check whether or not the buffer is small, i.e., less than 17
//	  in which case we do the byte by byte loop.
//
//	- Otherwise we go progressively from 1 byte store to 8byte store in
//	  the head part, the body is a 16byte store loop and we finish with the
//	  tail for the last 15 bytes.
//	  The good point about this breakdown is that the long buffer handling
//	  contains only 2 branches.
//
//	The reason for not using shifting & masking for both the head and the
//	tail is to stay semantically correct. This routine is not supposed
//	to write bytes outside of the buffer. While most of the time this would
//	be ok, we can't tolerate a mistake. A classical example is the case
//	of multithreaded code where the extra bytes touched are actually owned
//	by another thread which runs concurrently to ours. Another, less likely,
//	example is with device drivers where reading an I/O mapped location may
//	have side effects (same thing for writing).
//

GLOBAL_ENTRY(__do_clear_user)
	.prologue
	.save ar.pfs, saved_pfs
	alloc	saved_pfs=ar.pfs,2,0,0,0
	cmp.eq p6,p0=r0,len		// check for zero length
	.save ar.lc, saved_lc
	mov saved_lc=ar.lc		// preserve ar.lc (slow)
	.body
	;;				// avoid WAW on CFM
	adds tmp=-1,len			// br.cloop is repeat/until
	mov ret0=len			// return value is length at this point
(p6)	br.ret.spnt.many rp
	;;
	cmp.lt p6,p0=16,len		// if len > 16 then long memset
	mov ar.lc=tmp			// initialize lc for small count; this is
					// issued ahead of the branch below, so
					// ar.lc is modified on the long path too
(p6)	br.cond.dptk .long_do_clear
	;;				// WAR on ar.lc
	//
	// worst case 16 iterations, avg 8 iterations
	//
	// We could have played with the predicates to use the extra
	// M slot for 2 stores/iteration but the cost of the initialization
	// of the various counters compared to how long the loop is supposed
	// to last on average does not make this solution viable.
	//
1:
	EX( .Lexit1, st1 [buf]=r0,1 )
	adds len=-1,len			// countdown length using len
	br.cloop.dptk 1b
	;;				// avoid RAW on ar.lc
	//
	// .Lexit1: comes from byte by byte loop
	//	    len contains bytes left
.Lexit1:
	mov ret0=len			// faster than using ar.lc
	mov ar.lc=saved_lc
	br.ret.sptk.many rp		// end of short clear_user


	//
	// At this point we know we have more than 16 bytes to clear
	// so we focus on alignment (no branches required)
	//
	// The use of len/len2 for countdown of the number of bytes left
	// instead of ret0 is due to the fact that the exception code
	// changes the values of r8.
	//
.long_do_clear:
	tbit.nz p6,p0=buf,0		// odd alignment (for long_do_clear)
	;;
	EX( .Lexit3, (p6) st1 [buf]=r0,1 )	// 1-byte aligned
(p6)	adds len=-1,len;;		// sync because buf is modified
	tbit.nz p6,p0=buf,1
	;;
	EX( .Lexit3, (p6) st2 [buf]=r0,2 )	// 2-byte aligned
(p6)	adds len=-2,len;;
	tbit.nz p6,p0=buf,2
	;;
	EX( .Lexit3, (p6) st4 [buf]=r0,4 )	// 4-byte aligned
(p6)	adds len=-4,len;;
	tbit.nz p6,p0=buf,3
	;;
	EX( .Lexit3, (p6) st8 [buf]=r0,8 )	// 8-byte aligned
(p6)	adds len=-8,len;;
	shr.u cnt=len,4		// number of 128-bit (2x64bit) words
	;;
	cmp.eq p6,p0=r0,cnt
	adds tmp=-1,cnt
(p6)	br.cond.dpnt .dotail		// we have less than 16 bytes left
	;;
	adds buf2=8,buf			// setup second base pointer
	mov ar.lc=tmp
	;;

	//
	// 16bytes/iteration core loop
	//
	// The second store can never generate a fault because
	// we come into the loop only when we are 16-byte aligned.
	// This means that if we cross a page then it will always be
	// in the first store and never in the second.
	//
	//
	// We need to keep track of the remaining length. A possible (optimistic)
	// way would be to use ar.lc and derive how many bytes were left by
	// doing : left= 16*ar.lc + 16. this would avoid the addition at
	// every iteration.
	// However we need to keep the synchronization point. A template
	// M;;MB does not exist and thus we can keep the addition at no
	// extra cycle cost (use a nop slot anyway). It also simplifies the
	// (unlikely) error recovery code
	//

2:	EX(.Lexit3, st8 [buf]=r0,16 )
	;;				// needed to get len correct when error
	st8 [buf2]=r0,16
	adds len=-16,len
	br.cloop.dptk 2b
	;;
	//
	// tail correction based on len only
	//
	// We alternate the use of len3,len2 to allow parallelism and correct
	// error handling. We also reuse p6/p7 to return correct value.
	// The addition of len2/len3 does not cost anything more compared to
	// the regular memset as we had empty slots.
	//
	// The tail is entered either by falling out of the core loop or by
	// the direct branch taken when fewer than 16 bytes remain after the
	// head. ar.lc has been modified in both cases (it was loaded with
	// len-1 on entry), so the restore must sit after the label: the
	// successful return at the end of the tail does not touch ar.lc.
	//
.dotail:
	mov ar.lc=saved_lc		// restore ar.lc on every path into the tail
	mov len2=len			// for parallelization of error handling
	mov len3=len
	tbit.nz p6,p0=len,3
	;;
	EX( .Lexit2, (p6) st8 [buf]=r0,8 )	// at least 8 bytes
(p6)	adds len3=-8,len2
	tbit.nz p7,p6=len,2
	;;
	EX( .Lexit2, (p7) st4 [buf]=r0,4 )	// at least 4 bytes
(p7)	adds len2=-4,len3
	tbit.nz p6,p7=len,1
	;;
	EX( .Lexit2, (p6) st2 [buf]=r0,2 )	// at least 2 bytes
(p6)	adds len3=-2,len2
	tbit.nz p7,p6=len,0
	;;
	EX( .Lexit2, (p7) st1 [buf]=r0 )	// only 1 byte left
	mov ret0=r0				// success
	br.ret.sptk.many rp			// end of most likely path

	//
	// Outlined error handling code
	//

	//
	// .Lexit2: comes from the tail
	//	if p6 -> coming from st8 or st2 : len2 contains what's left
	//	if p7 -> coming from st4 or st1 : len3 contains what's left
	// Falls through to .Lexit3, which restores ar.lc again (harmless).
.Lexit2:
	.pred.rel "mutex", p6, p7
(p6)	mov len=len2
(p7)	mov len=len3
	;;
	//
	// .Lexit3: comes from the head or the core loop
	//	    len contains bytes left
	// ar.lc is restored unconditionally since it was modified on entry.
	//
.Lexit3:
	mov ret0=len
	mov ar.lc=saved_lc
	br.ret.sptk.many rp
END(__do_clear_user)
EXPORT_SYMBOL(__do_clear_user)