/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-memcpy.S
 * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Reasonably optimized memcpy() routine for the Alpha 21264
 *
 *	- once the destination is on an 8-byte boundary it is written with
 *	  aligned quadword stores (stq)
 *	- a source whose alignment differs from the destination is read with
 *	  ldq_u and the quadwords are merged with extql/extqh
 *	- head and tail bytes are moved one at a time with ldbu/stb
 *	- when source and destination share the same alignment and at least
 *	  128 bytes remain, a 64-byte unrolled loop with wh64 hints is used
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 *
 * Register usage:
 *	$16	- dest (argument).  Advanced as the destination cursor on the
 *		  same-alignment path; reused as a data temp in $mis_quad
 *	$17	- src (argument).  Advanced as the source cursor
 *	$18	- count (argument).  Tested as a signed quadword, so a
 *		  count <= 0 copies nothing
 *	$0	- return value: the original dest
 *	$26	- return address
 *	$1,$2	- scratch
 *	$3-$6	- data in flight
 *	$4	- additionally the destination cursor on the $misaligned path
 *	$7	- address handed to the next wh64 hint
 *
 * Instructions are grouped in blocks of four separated by blank lines;
 * the nops pad those groups and must not be removed casually.
 */
#include <asm/export.h>
	.set noreorder
	.set noat

	.align	4
	.globl memcpy
	.ent memcpy
memcpy:
	.frame $30,0,$26,0
	.prologue 0

	mov	$16, $0			# E : copy dest to return
	ble	$18, $nomoredata	# U : done with the copy?
	xor	$16, $17, $1		# E : are source and dest alignments the same?
	and	$1, 7, $1		# E : are they the same mod 8?

	bne	$1, $misaligned		# U : Nope - gotta do this the slow way
	/* source and dest are same mod 8 address */
	and	$16, 7, $1		# E : Are both 0mod8?
	beq	$1, $both_0mod8		# U : Yes
	nop				# E :

	/*
	 * source and dest are same misalignment.  move a byte at a time
	 * until a 0mod8 alignment for both is reached.
	 * At least one byte more to move
	 */

$head_align:
	ldbu	$1, 0($17)		# L : grab a byte
	subq	$18, 1, $18		# E : count--
	addq	$17, 1, $17		# E : src++
	stb	$1, 0($16)		# L :
	addq	$16, 1, $16		# E : dest++
	and	$16, 7, $1		# E : Are we at 0mod8 yet?
	ble	$18, $nomoredata	# U : done with the copy?
	bne	$1, $head_align		# U :

$both_0mod8:
	cmple	$18, 127, $1		# E : Can we unroll the loop?
	bne	$1, $no_unroll		# U :
	and	$16, 63, $1		# E : get mod64 alignment
	beq	$1, $do_unroll		# U : no single quads to fiddle

	/* Move single quadwords until dest is on a 64-byte boundary */
$single_head_quad:
	ldq	$1, 0($17)		# L : get 8 bytes
	subq	$18, 8, $18		# E : count -= 8
	addq	$17, 8, $17		# E : src += 8
	nop				# E :

	stq	$1, 0($16)		# L : store
	addq	$16, 8, $16		# E : dest += 8
	and	$16, 63, $1		# E : get mod64 alignment
	bne	$1, $single_head_quad	# U : still not fully aligned

$do_unroll:
	addq	$16, 64, $7		# E : Initial (+1 trip) wh64 address
	cmple	$18, 127, $1		# E : Can we go through the unrolled loop?
	bne	$1, $tail_quads		# U : Nope
	nop				# E :

	/*
	 * Unrolled loop: 64 bytes per trip, moved as two 32-byte halves.
	 * wh64 is a memory subsystem hint that the 64 bytes at ($7) are
	 * about to be over-written.  $7 normally runs ahead of dest; when
	 * fewer than two more trips remain it is replaced by the fallback
	 * address dest + 64 (computed from dest at the top of the trip).
	 */
$unroll_body:
	wh64	($7)			# L1 : hint: 64 bytes at ($7) will be over-written
	ldq	$6, 0($17)		# L0 : bytes 0..7
	nop				# E :
	nop				# E :

	ldq	$4, 8($17)		# L : bytes 8..15
	ldq	$5, 16($17)		# L : bytes 16..23
	addq	$7, 64, $7		# E : Update next wh64 address
	nop				# E :

	ldq	$3, 24($17)		# L : bytes 24..31
	addq	$16, 64, $1		# E : fallback value for wh64
	nop				# E :
	nop				# E :

	addq	$17, 32, $17		# E : src += 32 bytes
	stq	$6, 0($16)		# L : bytes 0..7
	nop				# E :
	nop				# E :

	stq	$4, 8($16)		# L : bytes 8..15
	stq	$5, 16($16)		# L : bytes 16..23
	subq	$18, 192, $2		# E : At least two more trips to go?
	nop				# E :

	stq	$3, 24($16)		# L : bytes 24..31
	addq	$16, 32, $16		# E : dest += 32 bytes
	nop				# E :
	nop				# E :

	/* Second 32-byte half of this trip */
	ldq	$6, 0($17)		# L : bytes 0..7
	ldq	$4, 8($17)		# L : bytes 8..15
	cmovlt	$2, $1, $7		# E : Latency 2, extra map slot - use fallback wh64 address if < 2 more trips
	nop				# E :

	ldq	$5, 16($17)		# L : bytes 16..23
	ldq	$3, 24($17)		# L : bytes 24..31
	addq	$16, 32, $16		# E : dest += 32
	subq	$18, 64, $18		# E : count -= 64

	addq	$17, 32, $17		# E : src += 32
	stq	$6, -32($16)		# L : bytes 0..7
	stq	$4, -24($16)		# L : bytes 8..15
	cmple	$18, 63, $1		# E : At least one more trip?

	stq	$5, -16($16)		# L : bytes 16..23
	stq	$3, -8($16)		# L : bytes 24..31
	nop				# E :
	beq	$1, $unroll_body	# U : yes, 64 or more bytes remain

	/*
	 * Quadword-at-a-time copy.  count is biased by -8 while in this
	 * loop so that its sign answers "is a full quadword left?".
	 */
$tail_quads:
$no_unroll:
	.align 4
	subq	$18, 8, $18		# E : At least a quad left?
	blt	$18, $less_than_8	# U : Nope
	nop				# E :
	nop				# E :

$move_a_quad:
	ldq	$1, 0($17)		# L : fetch 8
	subq	$18, 8, $18		# E : count -= 8
	addq	$17, 8, $17		# E : src += 8
	nop				# E :

	stq	$1, 0($16)		# L : store 8
	addq	$16, 8, $16		# E : dest += 8
	bge	$18, $move_a_quad	# U :
	nop				# E :

$less_than_8:
	.align 4
	addq	$18, 8, $18		# E : add back for trailing bytes
	ble	$18, $nomoredata	# U : All-done
	nop				# E :
	nop				# E :

	/* Trailing bytes */
$tail_bytes:
	subq	$18, 1, $18		# E : count--
	ldbu	$1, 0($17)		# L : fetch a byte
	addq	$17, 1, $17		# E : src++
	nop				# E :

	stb	$1, 0($16)		# L : store a byte
	addq	$16, 1, $16		# E : dest++
	bgt	$18, $tail_bytes	# U : more to be done?
	nop				# E :

	/* branching to exit takes 3 extra cycles, so replicate exit here */
	ret	$31, ($26), 1		# L0 :
	nop				# E :
	nop				# E :
	nop				# E :

	/*
	 * Source and dest differ mod 8.  From here on the destination
	 * cursor lives in $4; $0 still holds the value to return.
	 */
$misaligned:
	mov	$0, $4			# E : dest temp
	and	$0, 7, $1		# E : dest alignment mod8
	beq	$1, $dest_0mod8		# U : dest is already 0mod8
	nop

	/* Move single bytes until dest is 0mod8 or count runs out */
$aligndest:
	ble	$18, $nomoredata	# U :
	ldbu	$1, 0($17)		# L : fetch a byte
	subq	$18, 1, $18		# E : count--
	addq	$17, 1, $17		# E : src++

	stb	$1, 0($4)		# L : store it
	addq	$4, 1, $4		# E : dest++
	and	$4, 7, $1		# E : dest 0mod8 yet?
	bne	$1, $aligndest		# U : go until we are aligned.

	/* Source has unknown alignment, but dest is known to be 0mod8 */
$dest_0mod8:
	subq	$18, 8, $18		# E : At least a quad left?
	blt	$18, $misalign_tail	# U : Nope
	ldq_u	$3, 0($17)		# L : seed (rotating load) of 8 bytes
	nop				# E :

	/*
	 * $3 carries the previously loaded source quadword into each trip.
	 * $16 is free to use as a temp here: dest is tracked in $4.
	 */
$mis_quad:
	ldq_u	$16, 8($17)		# L : Fetch next 8
	extql	$3, $17, $3		# U : masking
	extqh	$16, $17, $1		# U : masking
	bis	$3, $1, $1		# E : merged bytes to store

	subq	$18, 8, $18		# E : count -= 8
	addq	$17, 8, $17		# E : src += 8
	stq	$1, 0($4)		# L : store 8 (aligned)
	mov	$16, $3			# E : "rotate" source data

	addq	$4, 8, $4		# E : dest += 8
	bge	$18, $mis_quad		# U : More quads to move
	nop
	nop

$misalign_tail:
	addq	$18, 8, $18		# E : account for tail stuff
	ble	$18, $nomoredata	# U :
	nop
	nop

$misalign_byte:
	ldbu	$1, 0($17)		# L : fetch 1
	subq	$18, 1, $18		# E : count--
	addq	$17, 1, $17		# E : src++
	nop				# E :

	stb	$1, 0($4)		# L : store
	addq	$4, 1, $4		# E : dest++
	bgt	$18, $misalign_byte	# U : more to go?
	nop


$nomoredata:
	ret	$31, ($26), 1		# L0 :
	nop				# E :
	nop				# E :
	nop				# E :

	.end memcpy
	EXPORT_SYMBOL(memcpy)

/* For backwards module compatibility.  */
__memcpy = memcpy
.globl __memcpy