1bbbf1280Sopenharmony_ci/* 2bbbf1280Sopenharmony_ci * memchr - find a character in a memory zone 3bbbf1280Sopenharmony_ci * 4bbbf1280Sopenharmony_ci * Copyright (c) 2014-2020, Arm Limited. 5bbbf1280Sopenharmony_ci * SPDX-License-Identifier: MIT 6bbbf1280Sopenharmony_ci */ 7bbbf1280Sopenharmony_ci 8bbbf1280Sopenharmony_ci/* Assumptions: 9bbbf1280Sopenharmony_ci * 10bbbf1280Sopenharmony_ci * ARMv8-a, AArch64 11bbbf1280Sopenharmony_ci * Neon Available. 12bbbf1280Sopenharmony_ci */ 13bbbf1280Sopenharmony_ci 14bbbf1280Sopenharmony_ci#include "../asmdefs.h" 15bbbf1280Sopenharmony_ci 16bbbf1280Sopenharmony_ci/* Arguments and results. */ 17bbbf1280Sopenharmony_ci#define srcin x0 18bbbf1280Sopenharmony_ci#define chrin w1 19bbbf1280Sopenharmony_ci#define cntin x2 20bbbf1280Sopenharmony_ci 21bbbf1280Sopenharmony_ci#define result x0 22bbbf1280Sopenharmony_ci 23bbbf1280Sopenharmony_ci#define src x3 24bbbf1280Sopenharmony_ci#define tmp x4 25bbbf1280Sopenharmony_ci#define wtmp2 w5 26bbbf1280Sopenharmony_ci#define synd x6 27bbbf1280Sopenharmony_ci#define soff x9 28bbbf1280Sopenharmony_ci#define cntrem x10 29bbbf1280Sopenharmony_ci 30bbbf1280Sopenharmony_ci#define vrepchr v0 31bbbf1280Sopenharmony_ci#define vdata1 v1 32bbbf1280Sopenharmony_ci#define vdata2 v2 33bbbf1280Sopenharmony_ci#define vhas_chr1 v3 34bbbf1280Sopenharmony_ci#define vhas_chr2 v4 35bbbf1280Sopenharmony_ci#define vrepmask v5 36bbbf1280Sopenharmony_ci#define vend v6 37bbbf1280Sopenharmony_ci 38bbbf1280Sopenharmony_ci/* 39bbbf1280Sopenharmony_ci * Core algorithm: 40bbbf1280Sopenharmony_ci * 41bbbf1280Sopenharmony_ci * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits 42bbbf1280Sopenharmony_ci * per byte. For each tuple, bit 0 is set if the relevant byte matched the 43bbbf1280Sopenharmony_ci * requested character and bit 1 is not used (faster than using a 32bit 44bbbf1280Sopenharmony_ci * syndrome). Since the bits in the syndrome reflect exactly the order in which 45bbbf1280Sopenharmony_ci * things occur in the original string, counting trailing zeros allows to 46bbbf1280Sopenharmony_ci * identify exactly which byte has matched. 47bbbf1280Sopenharmony_ci */ 48bbbf1280Sopenharmony_ci 49bbbf1280Sopenharmony_ciENTRY (__memchr_aarch64) 50bbbf1280Sopenharmony_ci PTR_ARG (0) 51bbbf1280Sopenharmony_ci SIZE_ARG (2) 52bbbf1280Sopenharmony_ci /* Do not dereference srcin if no bytes to compare. */ 53bbbf1280Sopenharmony_ci cbz cntin, L(zero_length) 54bbbf1280Sopenharmony_ci /* 55bbbf1280Sopenharmony_ci * Magic constant 0x40100401 allows us to identify which lane matches 56bbbf1280Sopenharmony_ci * the requested byte. 57bbbf1280Sopenharmony_ci */ 58bbbf1280Sopenharmony_ci mov wtmp2, #0x0401 59bbbf1280Sopenharmony_ci movk wtmp2, #0x4010, lsl #16 60bbbf1280Sopenharmony_ci dup vrepchr.16b, chrin 61bbbf1280Sopenharmony_ci /* Work with aligned 32-byte chunks */ 62bbbf1280Sopenharmony_ci bic src, srcin, #31 63bbbf1280Sopenharmony_ci dup vrepmask.4s, wtmp2 64bbbf1280Sopenharmony_ci ands soff, srcin, #31 65bbbf1280Sopenharmony_ci and cntrem, cntin, #31 66bbbf1280Sopenharmony_ci b.eq L(loop) 67bbbf1280Sopenharmony_ci 68bbbf1280Sopenharmony_ci /* 69bbbf1280Sopenharmony_ci * Input string is not 32-byte aligned. We calculate the syndrome 70bbbf1280Sopenharmony_ci * value for the aligned 32 bytes block containing the first bytes 71bbbf1280Sopenharmony_ci * and mask the irrelevant part. 72bbbf1280Sopenharmony_ci */ 73bbbf1280Sopenharmony_ci 74bbbf1280Sopenharmony_ci ld1 {vdata1.16b, vdata2.16b}, [src], #32 75bbbf1280Sopenharmony_ci sub tmp, soff, #32 76bbbf1280Sopenharmony_ci adds cntin, cntin, tmp 77bbbf1280Sopenharmony_ci cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b 78bbbf1280Sopenharmony_ci cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b 79bbbf1280Sopenharmony_ci and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b 80bbbf1280Sopenharmony_ci and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b 81bbbf1280Sopenharmony_ci addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ 82bbbf1280Sopenharmony_ci addp vend.16b, vend.16b, vend.16b /* 128->64 */ 83bbbf1280Sopenharmony_ci mov synd, vend.d[0] 84bbbf1280Sopenharmony_ci /* Clear the soff*2 lower bits */ 85bbbf1280Sopenharmony_ci lsl tmp, soff, #1 86bbbf1280Sopenharmony_ci lsr synd, synd, tmp 87bbbf1280Sopenharmony_ci lsl synd, synd, tmp 88bbbf1280Sopenharmony_ci /* The first block can also be the last */ 89bbbf1280Sopenharmony_ci b.ls L(masklast) 90bbbf1280Sopenharmony_ci /* Have we found something already? */ 91bbbf1280Sopenharmony_ci cbnz synd, L(tail) 92bbbf1280Sopenharmony_ci 93bbbf1280Sopenharmony_ciL(loop): 94bbbf1280Sopenharmony_ci ld1 {vdata1.16b, vdata2.16b}, [src], #32 95bbbf1280Sopenharmony_ci subs cntin, cntin, #32 96bbbf1280Sopenharmony_ci cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b 97bbbf1280Sopenharmony_ci cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b 98bbbf1280Sopenharmony_ci /* If we're out of data we finish regardless of the result */ 99bbbf1280Sopenharmony_ci b.ls L(end) 100bbbf1280Sopenharmony_ci /* Use a fast check for the termination condition */ 101bbbf1280Sopenharmony_ci orr vend.16b, vhas_chr1.16b, vhas_chr2.16b 102bbbf1280Sopenharmony_ci addp vend.2d, vend.2d, vend.2d 103bbbf1280Sopenharmony_ci mov synd, vend.d[0] 104bbbf1280Sopenharmony_ci /* We're not out of data, loop if we haven't found the character */ 105bbbf1280Sopenharmony_ci cbz synd, L(loop) 106bbbf1280Sopenharmony_ci 107bbbf1280Sopenharmony_ciL(end): 108bbbf1280Sopenharmony_ci /* Termination condition found, let's calculate the syndrome value */ 109bbbf1280Sopenharmony_ci and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b 110bbbf1280Sopenharmony_ci and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b 111bbbf1280Sopenharmony_ci addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ 112bbbf1280Sopenharmony_ci addp vend.16b, vend.16b, vend.16b /* 128->64 */ 113bbbf1280Sopenharmony_ci mov synd, vend.d[0] 114bbbf1280Sopenharmony_ci /* Only do the clear for the last possible block */ 115bbbf1280Sopenharmony_ci b.hs L(tail) 116bbbf1280Sopenharmony_ci 117bbbf1280Sopenharmony_ciL(masklast): 118bbbf1280Sopenharmony_ci /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */ 119bbbf1280Sopenharmony_ci add tmp, cntrem, soff 120bbbf1280Sopenharmony_ci and tmp, tmp, #31 121bbbf1280Sopenharmony_ci sub tmp, tmp, #32 122bbbf1280Sopenharmony_ci neg tmp, tmp, lsl #1 123bbbf1280Sopenharmony_ci lsl synd, synd, tmp 124bbbf1280Sopenharmony_ci lsr synd, synd, tmp 125bbbf1280Sopenharmony_ci 126bbbf1280Sopenharmony_ciL(tail): 127bbbf1280Sopenharmony_ci /* Count the trailing zeros using bit reversing */ 128bbbf1280Sopenharmony_ci rbit synd, synd 129bbbf1280Sopenharmony_ci /* Compensate the last post-increment */ 130bbbf1280Sopenharmony_ci sub src, src, #32 131bbbf1280Sopenharmony_ci /* Check that we have found a character */ 132bbbf1280Sopenharmony_ci cmp synd, #0 133bbbf1280Sopenharmony_ci /* And count the leading zeros */ 134bbbf1280Sopenharmony_ci clz synd, synd 135bbbf1280Sopenharmony_ci /* Compute the potential result */ 136bbbf1280Sopenharmony_ci add result, src, synd, lsr #1 137bbbf1280Sopenharmony_ci /* Select result or NULL */ 138bbbf1280Sopenharmony_ci csel result, xzr, result, eq 139bbbf1280Sopenharmony_ci ret 140bbbf1280Sopenharmony_ci 141bbbf1280Sopenharmony_ciL(zero_length): 142bbbf1280Sopenharmony_ci mov result, #0 143bbbf1280Sopenharmony_ci ret 144bbbf1280Sopenharmony_ci 145bbbf1280Sopenharmony_ciEND (__memchr_aarch64) 146bbbf1280Sopenharmony_ci 147