/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-copy_page.S
 *
 * Copy an entire page.
 */

/* The following comparison of this routine vs the normal copy_page.S
   was written by an unnamed ev6 hardware designer and forwarded to me
   via Steven Hobbs <hobbs@steven.zko.dec.com>.

   First Problem: STQ overflows.
   -----------------------------

	It would be nice if EV6 handled every resource overflow efficiently,
	but for some it doesn't, including store queue overflows.  Such an
	overflow causes a trap and a restart of the pipe.

	To get around this we sometimes use (to borrow a term from a VSSAD
	researcher) "aeration".  The idea is to slow the rate at which the
	processor receives valid instructions by inserting nops in the fetch
	path.  In doing so, you can prevent the overflow and actually make
	the code run faster.  You can, of course, take advantage of the fact
	that the processor can fetch at most 4 aligned instructions per cycle.

	I inserted enough nops to force it to take 10 cycles to fetch the
	loop code.  In theory, EV6 should be able to execute this loop in
	9 cycles but I was not able to get it to run that fast -- the initial
	conditions were such that I could not reach this optimum rate on
	(chaotic) EV6.  I wrote the code such that everything would issue
	in order.

   Second Problem: Dcache index matches.
   -------------------------------------

	If you are going to use this routine on random aligned pages, there
	is a 25% chance that the pages will be at the same dcache indices.
	This results in many nasty memory traps without care.

	The solution is to schedule the prefetches to avoid the memory
	conflicts.  I schedule the wh64 prefetches farther ahead of the
	read prefetches to avoid this problem.

   Third Problem: Needs more prefetching.
   --------------------------------------

	In order to improve the code I added deeper prefetching to take the
	most advantage of EV6's bandwidth.

	I also prefetched the read stream.  Note that adding the read prefetch
	forced me to add another cycle to the inner-most kernel - up to 11
	from the original 8 cycles per iteration.  We could improve performance
	further by unrolling the loop and doing multiple prefetches per cycle.

	I think that the code below will be very robust and fast code for the
	purposes of copying aligned pages.  It is slower when both source and
	destination pages are in the dcache, but it is my guess that this is
	less important than the dcache miss case.
*/

#include <asm/export.h>

/*
 * copy_page: copy one page in 64-byte cache-line steps.
 *
 * Register usage, as established by the code below:
 *	$16	destination pointer (base of every stq; advanced by 64 per line)
 *	$17	source pointer (base of every ldq; advanced by 64 per line)
 *	$18	remaining-iteration counter for the current loop
 *	$19	write-hint pointer, kept 10 lines (640 bytes) ahead of $16
 *	$0-$7	the eight quadwords of the line being copied
 *	$1, $2	also used as scratch addresses for the initial wh64 hints
 *
 * Size: the main loop runs 118 times and the cleanup loop 10 times, so
 * 128 lines * 64 bytes = 8192 bytes are copied.
 *
 * Loads whose destination is $31 (ldl $31,...) are the read-stream
 * prefetches; wh64 is the write hint for the destination stream.
 *
 * NOTE: the grouping into quads of instructions and the unop/nop padding
 * are deliberate fetch pacing ("aeration"); keep the instruction order
 * and the padding exactly as they are.
 */
	.text
	.align 4
	.global copy_page
	.ent copy_page
copy_page:
	.prologue 0

	/* Prefetch 5 read cachelines; write-hint 10 cache lines.  */
	wh64	($16)
	ldl	$31,0($17)
	ldl	$31,64($17)
	lda	$1,1*64($16)

	wh64	($1)
	ldl	$31,128($17)
	ldl	$31,192($17)
	lda	$1,2*64($16)

	wh64	($1)
	ldl	$31,256($17)
	lda	$18,118			/* main-loop trip count: 128 lines - 10 */
	lda	$1,3*64($16)

	wh64	($1)
	nop
	lda	$1,4*64($16)
	lda	$2,5*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,6*64($16)
	lda	$2,7*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,8*64($16)
	lda	$2,9*64($16)

	wh64	($1)
	wh64	($2)
	lda	$19,10*64($16)		/* first line not yet write-hinted */
	nop

	/* Main prefetching/write-hinting loop.  */
1:	ldq	$0,0($17)
	ldq	$1,8($17)
	unop
	unop

	unop
	unop
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	unop
	unop

	unop
	unop
	ldq	$6,48($17)
	ldq	$7,56($17)

	ldl	$31,320($17)		/* read prefetch, 5 lines ahead */
	unop
	unop
	unop

	/* This gives the extra cycle of aeration above the minimum.  */
	unop
	unop
	unop
	unop

	wh64	($19)			/* write hint, 10 lines ahead of $16 */
	unop
	unop
	unop

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	unop

	unop
	stq	$2,16($16)
	addq	$17,64,$17
	stq	$3,24($16)

	stq	$4,32($16)
	stq	$5,40($16)
	addq	$19,64,$19
	unop

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 1b

	/* Prefetch the final 5 cache lines of the read stream.  */
	lda	$18,10			/* cleanup-loop trip count */
	ldl	$31,320($17)
	ldl	$31,384($17)
	ldl	$31,448($17)

	ldl	$31,512($17)
	ldl	$31,576($17)
	nop
	nop

	/* Non-prefetching, non-write-hinting cleanup loop for the
	   final 10 cache lines.  These lines were already write-hinted
	   by the main loop (the last wh64 there targets line 127), and
	   any further hint or prefetch would fall beyond the 128 lines
	   being copied.  */
2:	ldq	$0,0($17)
	ldq	$1,8($17)
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	ldq	$6,48($17)
	ldq	$7,56($17)

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	addq	$17,64,$17

	stq	$2,16($16)
	stq	$3,24($16)
	stq	$4,32($16)
	stq	$5,40($16)

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 2b

	ret
	nop
	unop
	nop

	.end copy_page
	EXPORT_SYMBOL(copy_page)