/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-memcpy.S
 * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Reasonably optimized memcpy() routine for the Alpha 21264
 *
 *	- when source and destination share the same alignment mod 8, bulk
 *	  data is moved with aligned quadword loads/stores (ldq/stq), with a
 *	  64-byte unrolled loop that issues wh64 write hints
 *	- when the alignments differ, the destination is brought to 0mod8 and
 *	  each stored quadword is merged from two ldq_u loads with extql/extqh
 *	- leading and trailing bytes are moved one at a time with ldbu/stb
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 *
 * Register usage:
 *	$16	- in: destination pointer; advanced on the co-aligned path,
 *		  reused as a data temporary in $mis_quad
 *	$17	- in: source pointer, advanced as data is consumed
 *	$18	- in: byte count, treated as signed (<= 0 copies nothing);
 *		  biased by -8 while the quadword loops run
 *	$0	- out: the original destination pointer
 *	$1,$2	- scratch
 *	$3-$6	- data in flight
 *	$4	- working destination pointer on the $misaligned path
 *	$7	- address for the next wh64 hint in the unrolled loop
 *	$26	- return address
 *
 * The copy always proceeds from low to high addresses.
 */
#include <linux/export.h>
	.set noreorder
	.set noat

	.align	4
	.globl memcpy
	.ent memcpy
memcpy:
	.frame $30,0,$26,0
	.prologue 0

	mov	$16, $0			# E : copy dest to return
	ble	$18, $nomoredata	# U : done with the copy?
	xor	$16, $17, $1		# E : are source and dest alignments the same?
	and	$1, 7, $1		# E : are they the same mod 8?

	bne	$1, $misaligned		# U : Nope - gotta do this the slow way
	/* source and dest are same mod 8 address */
	and	$16, 7, $1		# E : Are both 0mod8?
	beq	$1, $both_0mod8		# U : Yes
	nop				# E :

	/*
	 * source and dest are same misalignment.  move a byte at a time
	 * until a 0mod8 alignment for both is reached.
	 * At least one byte more to move
	 */

$head_align:
	ldbu	$1, 0($17)		# L : grab a byte
	subq	$18, 1, $18		# E : count--
	addq	$17, 1, $17		# E : src++
	stb	$1, 0($16)		# L :
	addq	$16, 1, $16		# E : dest++
	and	$16, 7, $1		# E : Are we at 0mod8 yet?
	ble	$18, $nomoredata	# U : done with the copy?
	bne	$1, $head_align		# U :

	/*
	 * Both pointers are 0mod8 here.  The unrolled loop is only worth
	 * setting up for 128 bytes or more; smaller counts go straight to
	 * the single-quadword loop.
	 */
$both_0mod8:
	cmple	$18, 127, $1		# E : Can we unroll the loop?
	bne	$1, $no_unroll		# U :
	and	$16, 63, $1		# E : get mod64 alignment
	beq	$1, $do_unroll		# U : no single quads to fiddle

	/* Move single quadwords until dest is 0mod64. */
$single_head_quad:
	ldq	$1, 0($17)		# L : get 8 bytes
	subq	$18, 8, $18		# E : count -= 8
	addq	$17, 8, $17		# E : src += 8
	nop				# E :

	stq	$1, 0($16)		# L : store
	addq	$16, 8, $16		# E : dest += 8
	and	$16, 63, $1		# E : get mod64 alignment
	bne	$1, $single_head_quad	# U : still not fully aligned

	/*
	 * dest is 0mod64.  The count is re-checked because the alignment
	 * loop above may have consumed enough to drop it below 128.
	 */
$do_unroll:
	addq	$16, 64, $7		# E : Initial (+1 trip) wh64 address
	cmple	$18, 127, $1		# E : Can we go through the unrolled loop?
	bne	$1, $tail_quads		# U : Nope
	nop				# E :

	/*
	 * Each trip moves 64 bytes as two groups of four quadwords.
	 * $7 normally points one 64-byte block ahead of the block being
	 * stored.  When fewer than 192 bytes remain at the top of a trip,
	 * $7 is replaced by the fallback in $1 (dest + 64, taken before dest
	 * is advanced), which is the block the following trip itself stores.
	 */
$unroll_body:
	wh64	($7)			# L1 : memory subsystem hint: 64 bytes at
					# ($7) are about to be over-written
	ldq	$6, 0($17)		# L0 : bytes 0..7
	nop				# E :
	nop				# E :

	ldq	$4, 8($17)		# L : bytes 8..15
	ldq	$5, 16($17)		# L : bytes 16..23
	addq	$7, 64, $7		# E : Update next wh64 address
	nop				# E :

	ldq	$3, 24($17)		# L : bytes 24..31
	addq	$16, 64, $1		# E : fallback value for wh64
	nop				# E :
	nop				# E :

	addq	$17, 32, $17		# E : src += 32 bytes
	stq	$6, 0($16)		# L : bytes 0..7
	nop				# E :
	nop				# E :

	stq	$4, 8($16)		# L : bytes 8..15
	stq	$5, 16($16)		# L : bytes 16..23
	subq	$18, 192, $2		# E : At least two more trips to go?
	nop				# E :

	stq	$3, 24($16)		# L : bytes 24..31
	addq	$16, 32, $16		# E : dest += 32 bytes
	nop				# E :
	nop				# E :

	ldq	$6, 0($17)		# L : bytes 0..7
	ldq	$4, 8($17)		# L : bytes 8..15
	cmovlt	$2, $1, $7		# E : Latency 2, extra map slot - Use
					# fallback wh64 address if < 2 more trips
	nop				# E :

	ldq	$5, 16($17)		# L : bytes 16..23
	ldq	$3, 24($17)		# L : bytes 24..31
	addq	$16, 32, $16		# E : dest += 32
	subq	$18, 64, $18		# E : count -= 64

	addq	$17, 32, $17		# E : src += 32
	stq	$6, -32($16)		# L : bytes 0..7
	stq	$4, -24($16)		# L : bytes 8..15
	cmple	$18, 63, $1		# E : At least one more trip?

	stq	$5, -16($16)		# L : bytes 16..23
	stq	$3, -8($16)		# L : bytes 24..31
	nop				# E :
	beq	$1, $unroll_body

	/*
	 * Remaining whole quadwords.  The count is biased by -8 so that the
	 * loop condition is a plain sign test; $less_than_8 removes the bias.
	 */
$tail_quads:
$no_unroll:
	.align 4
	subq	$18, 8, $18		# E : At least a quad left?
	blt	$18, $less_than_8	# U : Nope
	nop				# E :
	nop				# E :

$move_a_quad:
	ldq	$1, 0($17)		# L : fetch 8
	subq	$18, 8, $18		# E : count -= 8
	addq	$17, 8, $17		# E : src += 8
	nop				# E :

	stq	$1, 0($16)		# L : store 8
	addq	$16, 8, $16		# E : dest += 8
	bge	$18, $move_a_quad	# U :
	nop				# E :

$less_than_8:
	.align 4
	addq	$18, 8, $18		# E : add back for trailing bytes
	ble	$18, $nomoredata	# U : All-done
	nop				# E :
	nop				# E :

	/* Trailing bytes */
$tail_bytes:
	subq	$18, 1, $18		# E : count--
	ldbu	$1, 0($17)		# L : fetch a byte
	addq	$17, 1, $17		# E : src++
	nop				# E :

	stb	$1, 0($16)		# L : store a byte
	addq	$16, 1, $16		# E : dest++
	bgt	$18, $tail_bytes	# U : more to be done?
	nop				# E :

	/* branching to exit takes 3 extra cycles, so replicate exit here */
	ret	$31, ($26), 1		# L0 :
	nop				# E :
	nop				# E :
	nop				# E :

	/*
	 * Source and destination alignments differ mod 8.  $4 becomes the
	 * working destination pointer (taken from $0, the saved dest); $16
	 * is left free for use as a data temporary in $mis_quad.
	 */
$misaligned:
	mov	$0, $4			# E : dest temp
	and	$0, 7, $1		# E : dest alignment mod8
	beq	$1, $dest_0mod8		# U : life doesnt totally suck
	nop

	/* Move single bytes until the destination is 0mod8. */
$aligndest:
	ble	$18, $nomoredata	# U :
	ldbu	$1, 0($17)		# L : fetch a byte
	subq	$18, 1, $18		# E : count--
	addq	$17, 1, $17		# E : src++

	stb	$1, 0($4)		# L : store it
	addq	$4, 1, $4		# E : dest++
	and	$4, 7, $1		# E : dest 0mod8 yet?
	bne	$1, $aligndest		# U : go until we are aligned.

	/* Source has unknown alignment, but dest is known to be 0mod8 */
$dest_0mod8:
	subq	$18, 8, $18		# E : At least a quad left?
	blt	$18, $misalign_tail	# U : Nope
	ldq_u	$3, 0($17)		# L : seed (rotating load) of 8 bytes
	nop				# E :

	/*
	 * $3 holds the aligned quadword containing the byte at $17.  Each
	 * trip loads the following aligned quadword into $16, merges the
	 * two with extql/extqh into the 8 bytes starting at $17, stores
	 * them to the aligned destination, and carries $16 forward as the
	 * next trip's $3.
	 */
$mis_quad:
	ldq_u	$16, 8($17)		# L : Fetch next 8
	extql	$3, $17, $3		# U : masking
	extqh	$16, $17, $1		# U : masking
	bis	$3, $1, $1		# E : merged bytes to store

	subq	$18, 8, $18		# E : count -= 8
	addq	$17, 8, $17		# E : src += 8
	stq	$1, 0($4)		# L : store 8 (aligned)
	mov	$16, $3			# E : "rotate" source data

	addq	$4, 8, $4		# E : dest += 8
	bge	$18, $mis_quad		# U : More quads to move
	nop
	nop

$misalign_tail:
	addq	$18, 8, $18		# E : account for tail stuff
	ble	$18, $nomoredata	# U :
	nop
	nop

$misalign_byte:
	ldbu	$1, 0($17)		# L : fetch 1
	subq	$18, 1, $18		# E : count--
	addq	$17, 1, $17		# E : src++
	nop				# E :

	stb	$1, 0($4)		# L : store
	addq	$4, 1, $4		# E : dest++
	bgt	$18, $misalign_byte	# U : more to go?
	nop


$nomoredata:
	ret	$31, ($26), 1		# L0 :
	nop				# E :
	nop				# E :
	nop				# E :

	.end memcpy
	EXPORT_SYMBOL(memcpy)

/* For backwards module compatibility.  */
__memcpy = memcpy
.globl __memcpy