/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-clear_user.S
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Zero user space, handling exceptions as we go.
 *
 * We have to make sure that $0 is always up-to-date and contains the
 * right "bytes left to zero" value (and that it is updated only _after_
 * a successful store).  There is also some rather minor exception setup
 * stuff.
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 * Determining actual stalls (other than slotting) doesn't appear to be easy to do.
 * From perusing the source code context where this routine is called, it is
 * a fair assumption that significant fractions of entire pages are zeroed, so
 * it's going to be worth the effort to hand-unroll a big loop, and use wh64.
 * ASSUMPTION:
 *	The believed purpose of only updating $0 after a store is that a signal
 *	may come along during the execution of this chunk of code, and we don't
 *	want to leave a hole (and we also want to avoid repeating lots of work)
 */

#include <asm/export.h>

/*
 * Allow an exception for an insn; exit if we get one.
 *
 * EX() emits the wrapped instruction at local label 99 and adds one entry
 * to the __ex_table section.  The entry consists of the location of the
 * instruction relative to the entry itself (99b - .) followed by an lda
 * whose displacement is the distance from the instruction to $exception.
 */
#define EX(x,y...)			\
	99: x,##y;			\
	.section __ex_table,"a";	\
	.long 99b - .;			\
	lda $31, $exception-99b($31);	\
	.previous

	.set noat
	.set noreorder
	.align 4

	.globl __clear_user
	.ent __clear_user
	.frame	$30, 0, $26
	.prologue 0

/*
 * __clear_user
 *
 * In:
 *	$16	destination address
 *	$17	number of bytes to zero
 *	$26	return address
 * Out:
 *	$0	number of bytes left to zero (0 when everything was cleared)
 * Clobbers:
 *	$1, $2, $3, $4, $5, $16
 *
 * Every potentially faulting access is wrapped in EX(); an exception
 * transfers control to $exception, which returns with the current $0.
 */
				# Pipeline info : Slotting & Comments
__clear_user:
	and	$17, $17, $0	# $0 = byte count (running "bytes left" value)
	and	$16, 7, $4	# .. E  .. ..	: find dest head misalignment
	beq	$0, $zerolength # U  .. .. ..	:  U L U L

	addq	$0, $4, $1	# .. .. .. E	: bias counter
	and	$1, 7, $2	# .. .. E  ..	: number of misaligned bytes in tail
/*
 * Note - we never actually use $2, so this is a moot computation
 * and we can rewrite this later...
 */
	srl	$1, 3, $1	# .. E  .. ..	: number of quadwords to clear
	beq	$4, $headalign	# U  .. .. ..	: U L U L

/*
 * Head is not aligned.  Write (8 - $4) bytes to head of destination
 * This means $16 is known to be misaligned
 */
	EX( ldq_u $5, 0($16) )	# .. .. .. L	: load dst word to mask back in
	beq	$1, $onebyte	# .. .. U  ..	: sub-word store?
	mskql	$5, $16, $5	# .. U  .. ..	: take care of misaligned head
	addq	$16, 8, $16	# E  .. .. ..	: L U U L

	EX( stq_u $5, -8($16) )	# .. .. .. L	:
	subq	$1, 1, $1	# .. .. E  ..	:
	addq	$0, $4, $0	# .. E  .. ..	: bytes left -= 8 - misalignment
	subq	$0, 8, $0	# E  .. .. ..	: U L U L

	.align	4
/*
 * (The .align directive ought to be a moot point)
 * values upon initial entry to the loop
 * $1 is number of quadwords to clear (zero is a valid value)
 * $2 is number of trailing bytes (0..7) ($2 never used...)
 * $16 is known to be aligned 0mod8
 */
$headalign:
	subq	$1, 16, $4	# .. .. .. E	: If < 16, we can not use the huge loop
	and	$16, 0x3f, $2	# .. .. E  ..	: Forward work for huge loop
	subq	$2, 0x40, $3	# .. E  .. ..	: bias counter (huge loop)
	blt	$4, $trailquad	# U  .. .. ..	: U L U L

/*
 * We know that we're going to do at least 16 quads, which means we are
 * going to be able to use the large block clear loop at least once.
 * Figure out how many quads we need to clear before we are 0mod64 aligned
 * so we can use the wh64 instruction.
 *
 * NOTE(review): $3 = ($16 & 0x3f) - 0x40 with $16 aligned 0mod8, so $3 is
 * always in the range -64..-8 and the beq below is never taken.  A
 * destination that is already 0mod64 aligned therefore still makes eight
 * passes through $alignmod64 before reaching $bigalign.  The result is
 * correct either way; left untouched to keep the original algorithm.
 */

	nop			# .. .. .. E
	nop			# .. .. E  ..
	nop			# .. E  .. ..
	beq	$3, $bigalign	# U  .. .. ..	: U L U L : Aligned 0mod64

$alignmod64:
	EX( stq_u $31, 0($16) )	# .. .. .. L
	addq	$3, 8, $3	# .. .. E  ..
	subq	$0, 8, $0	# .. E  .. ..
	nop			# E  .. .. ..	: U L U L

	nop			# .. .. .. E
	subq	$1, 1, $1	# .. .. E  ..
	addq	$16, 8, $16	# .. E  .. ..
	blt	$3, $alignmod64	# U  .. .. ..	: U L U L

$bigalign:
/*
 * $0 is the number of bytes left
 * $1 is the number of quads left
 * $16 is aligned 0mod64
 * we know that we'll be taking a minimum of one trip through
 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
 * We are _not_ going to update $0 after every single store.  That
 * would be silly, because there will be cross-cluster dependencies
 * no matter how the code is scheduled.  By doing it in slightly
 * staggered fashion, we can still do this loop in 5 fetches
 * The worse case will be doing two extra quads in some future execution,
 * in the event of an interrupted clear.
 * Assumes the wh64 needs to be for 2 trips through the loop in the future
 * The wh64 is issued on for the starting destination address for trip +2
 * through the loop, and if there are less than two trips left, the target
 * address will be for the current trip.
 */
	nop			# E :
	nop			# E :
	nop			# E :
	bis	$16,$16,$3	# E : U L U L : Initial wh64 address is dest
	/* This might actually help for the current trip... */

$do_wh64:
	wh64	($3)		# .. .. .. L1	: memory subsystem hint
	subq	$1, 16, $4	# .. .. E  ..	: Forward calculation - repeat the loop?
	EX( stq_u $31, 0($16) )	# .. L  .. ..
	subq	$0, 8, $0	# E  .. .. ..	: U L U L

	addq	$16, 128, $3	# E : Target address of wh64
	EX( stq_u $31, 8($16) )	# L :
	EX( stq_u $31, 16($16) )	# L :
	subq	$0, 16, $0	# E : U L L U

	nop			# E :
	EX( stq_u $31, 24($16) )	# L :
	EX( stq_u $31, 32($16) )	# L :
	subq	$0, 168, $5	# E : U L L U : two trips through the loop left?
	/* 168 = 192 - 24, since we've already completed some stores */

	subq	$0, 16, $0	# E :
	EX( stq_u $31, 40($16) )	# L :
	EX( stq_u $31, 48($16) )	# L :
	cmovlt	$5, $16, $3	# E : U L L U : Latency 2, extra mapping cycle

	subq	$1, 8, $1	# E :
	subq	$0, 16, $0	# E :
	EX( stq_u $31, 56($16) )	# L :
	nop			# E : U L U L

	nop			# E :
	subq	$0, 8, $0	# E :
	addq	$16, 64, $16	# E :
	bge	$4, $do_wh64	# U : U L U L

$trailquad:
	# zero to 16 quadwords left to store, plus any trailing bytes
	# $1 is the number of quadwords left to go.
	#
	nop			# .. .. .. E
	nop			# .. .. E  ..
	nop			# .. E  .. ..
	beq	$1, $trailbytes	# U  .. .. ..	: U L U L : Only 0..7 bytes to go

$onequad:
	EX( stq_u $31, 0($16) )	# .. .. .. L
	subq	$1, 1, $1	# .. .. E  ..
	subq	$0, 8, $0	# .. E  .. ..
	nop			# E  .. .. ..	: U L U L

	nop			# .. .. .. E
	nop			# .. .. E  ..
	addq	$16, 8, $16	# .. E  .. ..
	bgt	$1, $onequad	# U  .. .. ..	: U L U L

	# All whole quadwords are done; 0..7 bytes are left to go.
$trailbytes:
	nop			# .. .. .. E
	nop			# .. .. E  ..
	nop			# .. E  .. ..
	beq	$0, $zerolength	# U  .. .. ..	: U L U L

	# $0 contains the number of bytes left to zero (1..7 here),
	# so we will use $0 as the loop counter.
	# $0 > 0 is guaranteed by the previous context: either the beq
	# just above fell through, or we came from the misaligned-head
	# path, which is only reached with a nonzero byte count.
$onebyte:
	EX( stb $31, 0($16) )	# .. .. .. L
	subq	$0, 1, $0	# .. .. E  ..	:
	addq	$16, 1, $16	# .. E  .. ..	:
	bgt	$0, $onebyte	# U  .. .. ..	: U L U L

$zerolength:
$exception:			# Destination for exception recovery(?)
	nop			# .. .. .. E	:
	nop			# .. .. E  ..	:
	nop			# .. E  .. ..	:
	ret	$31, ($26), 1	# L0 .. .. ..	: L U L U
	.end __clear_user
	EXPORT_SYMBOL(__clear_user)