/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-memset.S
 *
 * This is an efficient (and relatively small) implementation of the C library
 * "memset()" function for the 21264 implementation of Alpha.
 *
 * 21264 version  contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * The algorithm for the leading and trailing quadwords remains the same,
 * however the loop has been unrolled to enable better memory throughput,
 * and the code has been replicated for each of the entry points: __memset
 * and __memset16 to permit better scheduling to eliminate the stalling
 * encountered during the mask replication.
 * A future enhancement might be to put in a byte store loop for really
 * small (say < 32 bytes) memset()s.  Whether or not that change would be
 * a win in the kernel would depend upon the contextual usage.
 * WARNING: Maintaining this is going to be more work than the above version,
 * as fixes will need to be made in multiple places.  The performance gain
 * is worth it.
 *
 * All three entry points share the same register usage:
 *	$16	destination address (advanced to the next quadword boundary
 *		when the destination starts out misaligned)
 *	$17	fill pattern (see each entry point for how it is prepared)
 *	$18	byte count on entry, later the number of trailing bytes
 *	$26	return address
 *	$0	return value: the destination address as passed in
 *	$1-$7	scratch
 * A byte count that is zero or negative (signed compare) stores nothing.
 */
#include <asm/export.h>
	.set noat
	.set noreorder
.text
	.globl memset
	.globl __memset
	.globl ___memset
	.globl __memset16
	.globl __constant_c_memset

	/*
	 * ___memset (aliased below as memset and __memset)
	 *
	 * Only the low byte of $17 is used; it is replicated into all
	 * eight bytes of a quadword before any store is issued.
	 */
	.ent ___memset
.align 5
___memset:
	.frame $30,0,$26,0
	.prologue 0

	/*
	 * Serious stalling happens.  The only way to mitigate this is to
	 * undertake a major re-write to interleave the constant materialization
	 * with other parts of the fall-through code.  This is important, even
	 * though it makes maintenance tougher.
	 * Do this later.
	 */
	and $17,255,$1		# E : 00000000000000ch
	insbl $17,1,$2		# U : 000000000000ch00
	bis $16,$16,$0		# E : return value
	ble $18,end_b		# U : zero length requested?

	addq $18,$16,$6		# E : max address to write to
	bis	$1,$2,$17	# E : 000000000000chch
	insbl $1,2,$3		# U : 0000000000ch0000
	insbl $1,3,$4		# U : 00000000ch000000

	or $3,$4,$3		# E : 00000000chch0000
	inswl $17,4,$5		# U : 0000chch00000000
	xor $16,$6,$1		# E : will complete write be within one quadword?
	inswl $17,6,$2		# U : chch000000000000

	or $17,$3,$17		# E : 00000000chchchch
	or $2,$5,$2		# E : chchchch00000000
	bic $1,7,$1		# E : fit within a single quadword?
	and $16,7,$3		# E : Target addr misalignment

	or $17,$2,$17		# E : chchchchchchchch
	beq $1,within_quad_b	# U :
	nop			# E :
	beq $3,aligned_b	# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned_b:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_b	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The max address to write to (one past the last byte)
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if >= 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop_b	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign_b	# U :

$alignmod64_b:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64_b	# U :

$bigalign_b:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through
	 * the unrolled loop.
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued for the starting destination address for trip +2
	 * through the loop, and if there are fewer than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64_b:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64_b	# U :

	nop
	nop
	nop
	beq	$3, no_quad_b	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_b:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop_b		# U : more?

no_quad_b:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end_b		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_quad_b:
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U :
	mskqh $1,$6,$2		# U :
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end_b:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end ___memset
	EXPORT_SYMBOL(___memset)

	/*
	 * This is the original body of code, prior to replication and
	 * rescheduling.  Leave it here, as there may be calls to this
	 * entry point.
	 *
	 * __constant_c_memset stores $17 exactly as passed in: no byte
	 * replication is done here, so the full 64-bit pattern has to be
	 * in $17 on entry.
	 */
.align 4
	.ent __constant_c_memset
__constant_c_memset:
	.frame $30,0,$26,0
	.prologue 0

	addq $18,$16,$6		# E : max address to write to
	bis $16,$16,$0		# E : return value
	xor $16,$6,$1		# E : will complete write be within one quadword?
	ble $18,end		# U : zero length requested?

	bic $1,7,$1		# E : fit within a single quadword
	beq $1,within_one_quad	# U :
	and $16,7,$3		# E : Target addr misalignment
	beq $3,aligned		# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad		# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The max address to write to (one past the last byte)
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if >= 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign	# U :

$alignmod64:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64	# U :

$bigalign:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through
	 * the unrolled loop.
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued for the starting destination address for trip +2
	 * through the loop, and if there are fewer than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64	# U :

	nop
	nop
	nop
	beq	$3, no_quad	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop		# U : more?

no_quad:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_one_quad:
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U :
	mskqh $1,$6,$2		# U :
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end __constant_c_memset
	EXPORT_SYMBOL(__constant_c_memset)

	/*
	 * This is a replicant of the __constant_c_memset code, rescheduled
	 * to mask stalls.  Note that entry point names also had to change
	 *
	 * __memset16 uses only the low 16 bits of $17, replicated into all
	 * four words of a quadword.  $18 is still a count of bytes, not of
	 * 16-bit items.
	 */
	.align 5
	.ent __memset16

__memset16:
	.frame $30,0,$26,0
	.prologue 0

	inswl $17,0,$5		# U : 000000000000c1c2
	inswl $17,2,$2		# U : 00000000c1c20000
	bis $16,$16,$0		# E : return value
	addq	$18,$16,$6	# E : max address to write to

	ble $18, end_w		# U : zero length requested?
	inswl	$17,4,$3	# U : 0000c1c200000000
	inswl	$17,6,$4	# U : c1c2000000000000
	xor	$16,$6,$1	# E : will complete write be within one quadword?

	or	$2,$5,$2	# E : 00000000c1c2c1c2
	or	$3,$4,$17	# E : c1c2c1c200000000
	bic	$1,7,$1		# E : fit within a single quadword
	and	$16,7,$3	# E : Target addr misalignment

	or	$17,$2,$17	# E : c1c2c1c2c1c2c1c2
	beq $1,within_quad_w	# U :
	nop
	beq $3,aligned_w	# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned_w:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_w	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The max address to write to (one past the last byte)
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if >= 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop_w	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign_w	# U :

$alignmod64_w:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64_w	# U :

$bigalign_w:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through
	 * the unrolled loop.
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued for the starting destination address for trip +2
	 * through the loop, and if there are fewer than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64_w:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64_w	# U :

	nop
	nop
	nop
	beq	$3, no_quad_w	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_w:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop_w		# U : more?

no_quad_w:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end_w		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_quad_w:
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U :
	mskqh $1,$6,$2		# U :
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end_w:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :

	.end __memset16
	EXPORT_SYMBOL(__memset16)

memset = ___memset
__memset = ___memset
	EXPORT_SYMBOL(memset)
	EXPORT_SYMBOL(__memset)