/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev67-strrchr.S
 * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Finds the last occurrence of a character in a 0-terminated string.
 * Optimized for the Alpha architecture:
 *
 *	- memory accessed as aligned quadwords only
 *	- uses cmpbge to compare 8 bytes in parallel
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 */

#include <linux/export.h>
#include <asm/regdef.h>

	.set noreorder
	.set noat

/*
 * char *strrchr(const char *s, int c)
 *
 * In:   a0 = s
 *       a1 = c (only the low byte is used)
 * Out:  v0 = address of the last byte equal to c found up to and
 *            including the terminator, or 0 when there is none
 * Uses: t0-t6 and t8 as scratch; a1 is overwritten with the search
 *       byte replicated into all eight byte lanes.  No stack frame.
 *
 * Register roles while scanning:
 *       v0 = address of the aligned quadword being examined
 *       t0 = current quadword           t1 = per-byte "== 0" mask
 *       t2 = quadword XOR pattern       t3 = per-byte "== c" mask
 *       t6 = aligned address of the last quadword holding a match
 *       t8 = "== c" mask of that quadword
 *
 * Instructions are grouped four at a time to match the scheduling
 * notes in the trailing comments; the nops are part of that grouping
 * and must stay with their cmov instructions.
 */
	.align 4
	.ent strrchr
	.globl strrchr
strrchr:
	.frame sp, 0, ra
	.prologue 0

	/* Replicate c into every byte of a1 while the first load is in flight */
	and	a1, 0xff, t2	# E : 00000000000000ch
	insbl	a1, 1, t4	# U : 000000000000ch00
	insbl	a1, 2, t5	# U : 0000000000ch0000
	ldq_u   t0, 0(a0)	# L : load first quadword Latency=3

	mov	zero, t6	# E : t6 is last match aligned addr
	or	t2, t4, a1	# E : 000000000000chch
	sll	t5, 8, t3	# U : 00000000ch000000
	mov	zero, t8	# E : t8 is last match byte compare mask

	andnot  a0, 7, v0	# E : align source addr
	or	t5, t3, t3	# E : 00000000chch0000
	sll	a1, 32, t2	# U : 0000chch00000000
	sll	a1, 48, t4	# U : chch000000000000

	or	t4, a1, a1	# E : chch00000000chch
	or	t2, t3, t2	# E : 0000chchchch0000
	or	a1, t2, a1	# E : chchchchchchchch
	lda	t5, -1		# E : build garbage mask

	/*
	 * The first quadword may start before s: bytes below the
	 * unaligned start are "garbage" and are removed from both the
	 * null mask and the character mask.
	 */
	cmpbge  zero, t0, t1	# E : bits set iff byte == zero
	mskqh	t5, a0, t4	# E : Complete garbage mask
	xor	t0, a1, t2	# E : make bytes == c zero
	cmpbge	zero, t4, t4	# E : bits set iff byte is garbage

	cmpbge  zero, t2, t3	# E : bits set iff byte == c
	andnot	t1, t4, t1	# E : clear garbage from null test
	andnot	t3, t4, t3	# E : clear garbage from char test
	bne	t1, $eos	# U : did we already hit the terminator?

	/*
	 * Character search main loop.  On entry t3 holds the match mask
	 * of the quadword at v0; it is recorded in t6/t8 before v0 is
	 * advanced to the quadword that was just loaded.
	 */
$loop:
	ldq	t0, 8(v0)	# L : load next quadword
	cmovne	t3, v0, t6	# E : save previous comparisons match
	nop			#   : Latency=2, extra map slot (keep nop with cmov)
	nop

	cmovne	t3, t3, t8	# E : Latency=2, extra map slot
	nop			#   : keep with cmovne
	addq	v0, 8, v0	# E : step to the quadword just loaded
	xor	t0, a1, t2	# E : make bytes == c zero

	cmpbge	zero, t0, t1	# E : bits set iff byte == zero
	cmpbge	zero, t2, t3	# E : bits set iff byte == c
	beq	t1, $loop	# U : no null seen in this quadword, keep looping
	nop

	/* Mask out character matches after terminator */
$eos:
	negq	t1, t4		# E : isolate first null byte match
	and	t1, t4, t4	# E : t4 = lowest set bit of the null mask
	subq	t4, 1, t5	# E : build a mask of the bytes up to...
	or	t4, t5, t4	# E : ... and including the null

	and	t3, t4, t3	# E : mask out char matches after null
	cmovne	t3, t3, t8	# E : save it, if match found Latency=2, extra map slot
	nop			#   : Keep with cmovne
	nop

	cmovne	t3, v0, t6	# E : and remember which quadword it was in
	nop			#   : Keep with cmovne
	/*
	 * Locate the address of the last matched character.  t8 is an
	 * 8-bit mask, so 0x3f - ctlz(t8) is the index of its highest
	 * set bit, i.e. the byte offset of the last match inside the
	 * quadword at t6.  With no match t6 and t8 are both still 0 and
	 * the count is forced to 0x3f, so the result is 0.
	 */
	ctlz	t8, t2		# U0 : Latency=3 (0x40 for t8=0)
	nop

	cmoveq	t8, 0x3f, t2	# E : Compensate for case when no match is seen
	nop			# E : hide the cmov latency (2) behind ctlz latency
	lda	t5, 0x3f($31)	# E : t5 = 63
	subq	t5, t2, t5	# E : Normalize leading zero count

	addq	t6, t5, v0	# E : and add to quadword address
	ret			# L0 : Latency=3
	nop
	nop

	.end strrchr
	EXPORT_SYMBOL(strrchr)