/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-memset.S
 *
 * This is an efficient (and relatively small) implementation of the C library
 * "memset()" function for the 21264 implementation of Alpha.
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * The algorithm for the leading and trailing quadwords remains the same,
 * however the loop has been unrolled to enable better memory throughput,
 * and the code has been replicated for each of the entry points: __memset
 * and __memset16 to permit better scheduling to eliminate the stalling
 * encountered during the mask replication.
 * A future enhancement might be to put in a byte store loop for really
 * small (say < 32 bytes) memset()s.  Whether or not that change would be
 * a win in the kernel would depend upon the contextual usage.
 * WARNING: Maintaining this is going to be more work than the above version,
 * as fixes will need to be made in multiple places.  The performance gain
 * is worth it.
 *
 * Structure shared by all three entry points below (the bodies are
 * deliberate copies of each other; labels carry a _b / none / _w suffix):
 *   1. count <= 0 (signed test)		-> return immediately
 *   2. [dest, dest+count) within one quad	-> single read-modify-write
 *   3. otherwise: patch the leading partial quad (if dest is misaligned),
 *      store whole quads (unrolled by 8 with wh64 when >= 16 quads remain
 *      after the head, else one quad per iteration), then patch the
 *      trailing partial quad (if any).
 * Any fix made to one copy must be made to the other two as well.
 */
#include <linux/export.h>
	.set noat
	.set noreorder
.text
	.globl memset
	.globl __memset
	.globl ___memset
	.globl __memset16
	.globl __constant_c_memset

	/*
	 * ___memset (also exported as memset and __memset)
	 *
	 * In:	$16 = destination address
	 *	$17 = fill value; only the low byte is used, it is replicated
	 *	      into all eight bytes of a quadword below
	 *	$18 = byte count (tested as a signed value; <= 0 stores nothing)
	 * Out:	$0  = original destination address
	 * Scratch: $1-$7, and $16-$18 are modified
	 */
	.ent ___memset
.align 5
___memset:
	.frame $30,0,$26,0
	.prologue 0

	/*
	 * Serious stalling happens.  The only way to mitigate this is to
	 * undertake a major re-write to interleave the constant materialization
	 * with other parts of the fall-through code.  This is important, even
	 * though it makes maintenance tougher.
	 * Do this later.
	 */
	and $17,255,$1		# E : 00000000000000ch
	insbl $17,1,$2		# U : 000000000000ch00
	bis $16,$16,$0		# E : return value
	ble $18,end_b		# U : zero length requested?

	addq $18,$16,$6		# E : end address: dest + count (one past last byte)
	bis	$1,$2,$17	# E : 000000000000chch
	insbl	$1,2,$3		# U : 0000000000ch0000
	insbl	$1,3,$4		# U : 00000000ch000000

	or	$3,$4,$3	# E : 00000000chch0000
	inswl	$17,4,$5	# U : 0000chch00000000
	xor	$16,$6,$1	# E : will complete write be within one quadword?
	inswl	$17,6,$2	# U : chch000000000000

	or	$17,$3,$17	# E : 00000000chchchch
	or	$2,$5,$2	# E : chchchch00000000
	bic	$1,7,$1		# E : fit within a single quadword?
	and	$16,7,$3	# E : Target addr misalignment

	or	$17,$2,$17	# E : chchchchchchchch
	beq	$1,within_quad_b # U :
	nop			# E :
	beq	$3,aligned_b	# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned_b:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_b	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	End address (original dest + count, one past the last byte)
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if >= 128 bytes (16 quads)
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop_b	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 *
	 * NOTE: $16 is quad aligned here, so $1 = ($16 & 0x3f) - 0x40 lies
	 * in [-64, -8] and the beq below is never taken.  A destination that
	 * is already 0mod64 therefore makes eight trips through the alignment
	 * loop, which also guarantees $4 holds a valid wh64 address on entry
	 * to the unrolled loop.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign_b	# U :

$alignmod64_b:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64_b # U :

$bigalign_b:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through the loop
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued for the starting destination address for trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64_b:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64_b	# U :

	nop
	nop
	nop
	beq	$3, no_quad_b	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_b:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop_b		# U : more?

no_quad_b:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end_b		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_quad_b:
	/*
	 * Whole request lies inside one quadword: merge the new bytes over
	 * the old quad starting at $16, then restore the old bytes from the
	 * end address ($6) upwards.
	 */
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U :
	mskqh $1,$6,$2		# U :
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end_b:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end ___memset
	EXPORT_SYMBOL(___memset)

	/*
	 * This is the original body of code, prior to replication and
	 * rescheduling.  Leave it here, as there may be calls to this
	 * entry point.
	 *
	 * __constant_c_memset
	 *
	 * In:	$16 = destination address
	 *	$17 = full 64-bit pattern, stored as-is (no replication is
	 *	      done here)
	 *	$18 = byte count (tested as a signed value; <= 0 stores nothing)
	 * Out:	$0  = original destination address
	 * Scratch: $1-$7, and $16 and $18 are modified
	 */
.align 4
	.ent __constant_c_memset
__constant_c_memset:
	.frame $30,0,$26,0
	.prologue 0

	addq $18,$16,$6		# E : end address: dest + count (one past last byte)
	bis $16,$16,$0		# E : return value
	xor $16,$6,$1		# E : will complete write be within one quadword?
	ble $18,end		# U : zero length requested?

	bic $1,7,$1		# E : fit within a single quadword
	beq $1,within_one_quad	# U :
	and $16,7,$3		# E : Target addr misalignment
	beq $3,aligned		# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad		# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	End address (original dest + count, one past the last byte)
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if >= 128 bytes (16 quads)
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 *
	 * NOTE: $16 is quad aligned here, so $1 = ($16 & 0x3f) - 0x40 lies
	 * in [-64, -8] and the beq below is never taken.  A destination that
	 * is already 0mod64 therefore makes eight trips through the alignment
	 * loop, which also guarantees $4 holds a valid wh64 address on entry
	 * to the unrolled loop.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign	# U :

$alignmod64:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64	# U :

$bigalign:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through the loop
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued for the starting destination address for trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64	# U :

	nop
	nop
	nop
	beq	$3, no_quad	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop		# U : more?

no_quad:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_one_quad:
	/*
	 * Whole request lies inside one quadword: merge the new bytes over
	 * the old quad starting at $16, then restore the old bytes from the
	 * end address ($6) upwards.
	 */
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U :
	mskqh $1,$6,$2		# U :
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end __constant_c_memset
	EXPORT_SYMBOL(__constant_c_memset)

	/*
	 * This is a replicant of the __constant_c_memset code, rescheduled
	 * to mask stalls.  Note that entry point names also had to change
	 *
	 * __memset16
	 *
	 * In:	$16 = destination address
	 *	$17 = fill value; the low 16 bits are replicated into all four
	 *	      words of a quadword below
	 *	$18 = count in bytes (tested as a signed value; <= 0 stores
	 *	      nothing)
	 * Out:	$0  = original destination address
	 * Scratch: $1-$7, and $16-$18 are modified
	 */
	.align 5
	.ent __memset16

__memset16:
	.frame $30,0,$26,0
	.prologue 0

	inswl $17,0,$5		# U : 000000000000c1c2
	inswl $17,2,$2		# U : 00000000c1c20000
	bis $16,$16,$0		# E : return value
	addq	$18,$16,$6	# E : end address: dest + count (one past last byte)

	ble $18, end_w		# U : zero length requested?
	inswl	$17,4,$3	# U : 0000c1c200000000
	inswl	$17,6,$4	# U : c1c2000000000000
	xor	$16,$6,$1	# E : will complete write be within one quadword?

	or	$2,$5,$2	# E : 00000000c1c2c1c2
	or	$3,$4,$17	# E : c1c2c1c200000000
	bic	$1,7,$1		# E : fit within a single quadword
	and	$16,7,$3	# E : Target addr misalignment

	or	$17,$2,$17	# E : c1c2c1c2c1c2c1c2
	beq $1,within_quad_w	# U :
	nop
	beq $3,aligned_w	# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned_w:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_w	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	End address (original dest + count, one past the last byte)
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if >= 128 bytes (16 quads)
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop_w	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 *
	 * NOTE: $16 is quad aligned here, so $1 = ($16 & 0x3f) - 0x40 lies
	 * in [-64, -8] and the beq below is never taken.  A destination that
	 * is already 0mod64 therefore makes eight trips through the alignment
	 * loop, which also guarantees $4 holds a valid wh64 address on entry
	 * to the unrolled loop.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign_w	# U :

$alignmod64_w:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64_w	# U :

$bigalign_w:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through the loop
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued for the starting destination address for trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64_w:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64_w	# U :

	nop
	nop
	nop
	beq	$3, no_quad_w	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_w:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop_w		# U : more?

no_quad_w:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end_w		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_quad_w:
	/*
	 * Whole request lies inside one quadword: merge the new bytes over
	 * the old quad starting at $16, then restore the old bytes from the
	 * end address ($6) upwards.
	 */
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U :
	mskqh $1,$6,$2		# U :
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end_w:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :

	.end __memset16
	EXPORT_SYMBOL(__memset16)

/* memset and __memset are plain aliases for the byte-fill entry point. */
memset = ___memset
__memset = ___memset
	EXPORT_SYMBOL(memset)
	EXPORT_SYMBOL(__memset)