/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-copy_page.S
 *
 * Copy an entire page.
 */

/* The following comparison of this routine vs the normal copy_page.S
   was written by an unnamed ev6 hardware designer and forwarded to me
   via Steven Hobbs <hobbs@steven.zko.dec.com>.

   First Problem: STQ overflows.
   -----------------------------

	It would be nice if EV6 handled every resource overflow efficiently,
	but for some it doesn't.  Including store queue overflows.  It causes
	a trap and a restart of the pipe.

	To get around this we sometimes use (to borrow a term from a VSSAD
	researcher) "aeration".  The idea is to slow the rate at which the
	processor receives valid instructions by inserting nops in the fetch
	path.  In doing so, you can prevent the overflow and actually make
	the code run faster.  You can, of course, take advantage of the fact
	that the processor can fetch at most 4 aligned instructions per cycle.

	I inserted enough nops to force it to take 10 cycles to fetch the
	loop code.  In theory, EV6 should be able to execute this loop in
	9 cycles but I was not able to get it to run that fast -- the initial
	conditions were such that I could not reach this optimum rate on
	(chaotic) EV6.  I wrote the code such that everything would issue
	in order.

   Second Problem: Dcache index matches.
   -------------------------------------

	If you are going to use this routine on random aligned pages, there
	is a 25% chance that the pages will be at the same dcache indices.
	This results in many nasty memory traps without care.

	The solution is to schedule the prefetches to avoid the memory
	conflicts.  I schedule the wh64 prefetches farther ahead of the
	read prefetches to avoid this problem.

   Third Problem: Needs more prefetching.
   --------------------------------------

	In order to improve the code I added deeper prefetching to take the
	most advantage of EV6's bandwidth.

	I also prefetched the read stream.  Note that adding the read prefetch
	forced me to add another cycle to the inner-most kernel - up to 11
	from the original 8 cycles per iteration.  We could improve performance
	further by unrolling the loop and doing multiple prefetches per cycle.

	I think that the code below will be very robust and fast code for the
	purposes of copying aligned pages.  It is slower when both source and
	destination pages are in the dcache, but it is my guess that this is
	less important than the dcache miss case.  */

#include <linux/export.h>

/*
 * copy_page - copy 128 cache lines of 64 bytes (128 * 64 = 8192 bytes).
 *
 * Register usage, as visible in the code below:
 *   $16     destination address; every stq and the first wh64 are based
 *           on it, and it is advanced by 64 after each copied line.
 *   $17     source address; every ldq/ldl is based on it, and it is
 *           advanced by 64 after each copied line.
 *   $0-$7   the eight quadwords of the line currently being copied.
 *   $1, $2  additionally used as scratch addresses for the initial
 *           burst of write hints.
 *   $18     loop counter: 118 for the main loop, then 10 for cleanup.
 *   $19     address of the next destination line to write-hint; starts
 *           10 lines past $16 and moves by 64 per main-loop iteration.
 *
 * The "ldl $31,..." instructions are the read prefetches and the wh64
 * instructions are the write hints that the comments below refer to.
 *
 * The blank-line-separated groups of four instructions, and the exact
 * placement of every nop/unop, are deliberate (see the "aeration"
 * discussion above).  Do not reorder, merge or drop instructions
 * without re-measuring.
 */
	.text
	.align 4
	.global copy_page
	.ent copy_page
copy_page:
	.prologue 0

	/* Prefetch 5 read cachelines; write-hint 10 cache lines.  */
	wh64	($16)
	ldl	$31,0($17)
	ldl	$31,64($17)
	lda	$1,1*64($16)

	wh64	($1)
	ldl	$31,128($17)
	ldl	$31,192($17)
	lda	$1,2*64($16)

	wh64	($1)
	ldl	$31,256($17)
	lda	$18,118			/* main loop handles 118 of the 128 lines */
	lda	$1,3*64($16)

	wh64	($1)
	nop
	lda	$1,4*64($16)
	lda	$2,5*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,6*64($16)
	lda	$2,7*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,8*64($16)
	lda	$2,9*64($16)

	wh64	($1)
	wh64	($2)
	lda	$19,10*64($16)		/* next line to write-hint: 10 ahead of $16 */
	nop

	/* Main prefetching/write-hinting loop.  Each iteration copies one
	   64-byte line, prefetches the source line 5 lines (320 bytes)
	   ahead of $17 and write-hints the destination line at $19.  */
1:	ldq	$0,0($17)
	ldq	$1,8($17)
	unop
	unop

	unop
	unop
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	unop
	unop

	unop
	unop
	ldq	$6,48($17)
	ldq	$7,56($17)

	ldl	$31,320($17)
	unop
	unop
	unop

	/* This gives the extra cycle of aeration above the minimum.  */
	unop
	unop
	unop
	unop

	wh64	($19)
	unop
	unop
	unop

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	unop

	unop
	stq	$2,16($16)
	addq	$17,64,$17
	stq	$3,24($16)

	stq	$4,32($16)
	stq	$5,40($16)
	addq	$19,64,$19
	unop

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 1b

	/* Prefetch the final 5 cache lines of the read stream.  */
	lda	$18,10			/* 10 lines remain: 118 + 10 = 128 */
	ldl	$31,320($17)
	ldl	$31,384($17)
	ldl	$31,448($17)

	ldl	$31,512($17)
	ldl	$31,576($17)
	nop
	nop

	/* Non-prefetching, non-write-hinting cleanup loop for the
	   final 10 cache lines.  */
2:	ldq	$0,0($17)
	ldq	$1,8($17)
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	ldq	$6,48($17)
	ldq	$7,56($17)

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	addq	$17,64,$17

	stq	$2,16($16)
	stq	$3,24($16)
	stq	$4,32($16)
	stq	$5,40($16)

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 2b

	/* ret plus three no-ops fills out the last group of four.  */
	ret
	nop
	unop
	nop

	.end copy_page
	EXPORT_SYMBOL(copy_page)