/*
 * strrchr - find last position of a character in a string.
 *
 * Copyright (c) 2014-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

#include "../asmdefs.h"

/* Arguments and results.  */
#define srcin		x0	/* In: start of the string.  */
#define chrin		w1	/* In: character to look for.  */

#define result		x0	/* Out: address of last match, or 0.  */

/* General-purpose scratch registers.  */
#define src		x2	/* Cursor; post-incremented by 32 per load.  */
#define tmp1		x3
#define wtmp2		w4
#define tmp3		x5
#define src_match	x6	/* Value of src just after the hunk that held
				   the most recent character match.  */
#define src_offset	x7	/* Character syndrome of that hunk; zero means
				   no match has been seen yet.  */
#define const_m1	x8
#define tmp4		x9
#define nul_match	x10	/* NUL syndrome / NUL-present indicator.  */
#define chr_match	x11	/* Character syndrome of the current hunk.  */

/* Vector registers.  */
#define vrepchr		v0	/* chrin replicated into all 16 bytes.  */
#define vdata1		v1	/* Bytes 0-15 of the current hunk.  */
#define vdata2		v2	/* Bytes 16-31 of the current hunk.  */
#define vhas_nul1	v3
#define vhas_nul2	v4
#define vhas_chr1	v5
#define vhas_chr2	v6
#define vrepmask_0	v7	/* 0x80200802 in every 32-bit lane.  */
#define vrepmask_c	v16	/* 0x40100401 in every 32-bit lane.  */
#define vend1		v17
#define vend2		v18	/* Not referenced in this file.  */

/* Core algorithm.

   For each 32-byte hunk we calculate a 64-bit syndrome value, with
   two bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems).  For each tuple, bit 0 is set iff
   the relevant byte matched the requested character; bit 1 is set
   iff the relevant byte matched the NUL end of string (we trigger
   off bit0 for the special case of looking for NUL).  Since the bits
   in the syndrome reflect exactly the order in which things occur
   in the original string, the lowest set NUL bit identifies the byte
   that terminates the string.

   In this routine the character and NUL bits are kept in two separate
   64-bit values (chr_match and nul_match).  At the end of the string
   the expression (nul_match - 1) ^ nul_match selects every syndrome bit
   up to and including the first NUL, and a count-leading-zeros on the
   surviving character bits locates the last match.  */

/* char *__strrchr_aarch64 (const char *s, int c)

   In:	x0 = s, the string to search.
	w1 = c; only its low byte takes part in the comparison, because
	     it is replicated with a byte-wide DUP.
   Out:	x0 = address of the last byte in s equal to that low byte, the
	     terminating NUL being searchable like any other byte, or 0
	     when no byte matches.

   Registers written by the visible body: x0, x2-x11, v0-v7, v16, v17
   and the condition flags.  The body contains no calls and never
   references sp.  PTR_ARG comes from asmdefs.h; presumably it
   normalises the pointer argument in x0 -- confirm against that header.

   Memory is read one 32-byte-aligned hunk at a time, so bytes that
   share a hunk with the string but lie before s or after the
   terminator are loaded as well; their syndrome bits are masked off
   before they can influence the result.  */

ENTRY (__strrchr_aarch64)
	PTR_ARG (0)
	/* Magic constant 0x40100401 to allow us to identify which lane
	   matches the requested byte.  Magic constant 0x80200802 used
	   similarly for NUL termination.  Within each group of four bytes
	   the character mask contributes bits 0, 2, 4, 6 and the NUL mask
	   bits 1, 3, 5, 7, so pairwise adds can merge them without
	   carries.  */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
	dup	vrepmask_c.4s, wtmp2
	mov	src_offset, #0		/* No match recorded yet.  */
	ands	tmp1, srcin, #31	/* tmp1 = misalignment of srcin.  */
	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
	b.eq	L(aligned)

	/* Input string is not 32-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	neg	tmp1, tmp1
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b	// 256->128
	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	addp	vend1.16b, vhas_nul1.16b, vhas_chr1.16b		// 128->64
	mov	nul_match, vend1.d[0]	/* Low half: NUL syndrome.  */
	lsl	tmp1, tmp1, #1		/* Two syndrome bits per byte.  */
	mov	const_m1, #~0
	/* Register shifts use the amount modulo 64, so shifting by
	   -2 * misalignment leaves exactly the low 2 * misalignment
	   bits set: one pair per padding byte.  */
	lsr	tmp3, const_m1, tmp1
	mov	chr_match, vend1.d[1]	/* High half: character syndrome.  */

	bic	nul_match, nul_match, tmp3	// Mask padding bits.
	bic	chr_match, chr_match, tmp3	// Mask padding bits.
	cbnz	nul_match, L(tail)	/* String ends in the first hunk.  */

	.p2align 4
L(loop):
	/* Remember the newest hunk that contained a match.  src has
	   already been advanced past that hunk.  */
	cmp	chr_match, #0
	csel	src_match, src, src_match, ne
	csel	src_offset, chr_match, src_offset, ne
L(aligned):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* A pairwise minimum is zero wherever either byte of a pair is
	   NUL; this is enough to detect the end of the string, the exact
	   position is only worked out once it is known to be needed.  */
	uminp	vend1.16b, vdata1.16b, vdata2.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
	cmeq	vend1.16b, vend1.16b, #0
	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	addp	vend1.16b, vend1.16b, vhas_chr1.16b		// 128->64
	mov	nul_match, vend1.d[0]	/* Non-zero iff the hunk has a NUL.  */
	mov	chr_match, vend1.d[1]	/* Character syndrome.  */
	cbz	nul_match, L(loop)

	/* The hunk contains the terminator: now build the positional NUL
	   syndrome for it.  */
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
	mov	nul_match, vhas_nul1.d[0]

L(tail):
	/* Work out exactly where the string ends.  tmp4 gets every bit up
	   to and including the lowest set NUL bit, which also covers the
	   character bit of the terminator itself.  */
	sub	tmp4, nul_match, #1
	eor	tmp4, tmp4, nul_match
	ands	chr_match, chr_match, tmp4
	/* And pick the values corresponding to the last match.  */
	csel	src_match, src, src_match, ne
	csel	src_offset, chr_match, src_offset, ne

	/* Count down from the top of the syndrome to find the last match.  */
	clz	tmp3, src_offset
	/* Src_match points beyond the word containing the match, so we can
	   simply subtract half the bit-offset into the syndrome.  Because
	   we are counting down, we need to go back one more character.  */
	add	tmp3, tmp3, #2
	sub	result, src_match, tmp3, lsr #1
	/* But if the syndrome shows no match was found, then return NULL.
	   This also discards the value computed above from src_match, which
	   is never initialised when no hunk matched.  */
	cmp	src_offset, #0
	csel	result, result, xzr, ne

	ret

END (__strrchr_aarch64)