/*
 * memchr - find a character in a memory zone
 *
 * Copyright (c) 2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "../asmdefs.h"

/* Arguments and return value.  */
#define srcin		x0	/* in:  start of the memory zone */
#define chrin		w1	/* in:  character to find (low byte is used) */
#define cntin		x2	/* in:  number of bytes to examine */
#define result		x0	/* out: address of first match, or 0 */

/* Scratch general-purpose registers.  */
#define src		x3	/* 16-byte aligned address of current chunk */
#define cntrem		x4	/* bytes of the zone still to be examined */
#define synd		x5	/* 64-bit syndrome / match bit index */
#define shift		x6	/* 4 * srcin, used to drop leading nibbles */
#define tmp		x7
#define wtmp		w7

/* Scratch vector registers.  */
#define vrepchr		v0	/* chrin replicated into all 16 bytes */
#define qdata		q1	/* current 16-byte chunk (128-bit view) */
#define vdata		v1	/* current 16-byte chunk (vector view) */
#define vhas_chr	v2	/* per-byte 0xff/0x00 comparison result */
#define vrepmask	v3	/* 0x0f in even bytes, 0xf0 in odd bytes */
#define vend		v4	/* pairwise-reduced comparison result */
#define dend		d4	/* low 64 bits of vend */

/*
   Core algorithm:

   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
   requested character. Bits 4-7 must be zero. Bits 4-7 are set likewise for
   odd bytes so that adjacent bytes can be merged. Since the bits in the
   syndrome reflect the order in which things occur in the original memory,
   counting trailing zeros identifies exactly which byte matched.

   Unlike the string routines that share this scheme, no test for NUL is
   made: only equality with the requested character sets syndrome bits.  */

/*
   C equivalent:  void *memchr (const void *srcin, int chrin, size_t cntin)

   In:       x0 = srcin, w1 = chrin, x2 = cntin
   Out:      x0 = address of the first byte equal to chrin within
	     [srcin, srcin + cntin), or 0 if there is none.
   Clobbers: x3-x7, v0-v4, condition flags.  No stack is used.

   Every load is a 16-byte load from a 16-byte aligned address, so a load
   never crosses a 16-byte boundary.  Bytes of a chunk that lie outside
   [srcin, srcin + cntin) may be read, but they are either shifted out of
   the syndrome (before srcin) or rejected by the final bound check (after
   the end of the zone).

   NOTE(review): PTR_ARG and SIZE_ARG come from asmdefs.h and are presumably
   argument-sanitising helpers for ILP32 builds - confirm in that header.  */

ENTRY (__memchr_aarch64_mte)
	PTR_ARG (0)
	SIZE_ARG (2)
	bic	src, srcin, 15			/* src = srcin rounded down to 16 */
	cbz	cntin, L(nomatch)		/* empty zone: no memory access */
	ld1	{vdata.16b}, [src]		/* chunk containing the first byte */
	dup	vrepchr.16b, chrin
	mov	wtmp, 0xf00f
	dup	vrepmask.8h, wtmp
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	/* The register shift below only uses the low 6 bits of its count, so
	   this is effectively 4 * (srcin & 15): one nibble per leading byte.  */
	lsl	shift, srcin, 2
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	lsr	synd, synd, shift		/* drop bytes that precede srcin */
	cbz	synd, L(start_loop)

	/* Match in the first chunk.  rbit + clz counts trailing zeros, giving
	   4 * (offset of the match from srcin).  The match only counts if
	   that offset is below cntin.  */
	rbit	synd, synd
	clz	synd, synd
	add	result, srcin, synd, lsr 2
	cmp	cntin, synd, lsr 2
	csel	result, result, xzr, hi
	ret

L(start_loop):
	/* tmp = 16 - (srcin - src) = bytes of the zone covered by the first
	   chunk.  If cntin does not reach beyond them we are done.  */
	sub	tmp, src, srcin
	add	tmp, tmp, 16
	subs	cntrem, cntin, tmp
	b.ls	L(nomatch)

	/* Make sure that it won't overread by a 16-byte chunk */
	/* Bit 4 of (cntrem + 15) is the parity of the number of 16-byte chunks
	   still to be read.  The loop handles two chunks per iteration and
	   only tests the count after the second one, so an odd number of
	   chunks must enter at the second half.  */
	add	tmp, cntrem, 15
	tbnz	tmp, 4, L(loop32_2)

	.p2align 4
L(loop32):
	ldr	qdata, [src, 16]!		/* src += 16, then load */
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	/* Cheap any-match test: the folded value is non-zero iff some byte
	   compared equal.  The exact position is worked out at L(end).  */
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)

L(loop32_2):
	ldr	qdata, [src, 16]!		/* src += 16, then load */
	subs	cntrem, cntrem, 32		/* account for both chunks */
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	b.ls	L(end)				/* that was the last chunk */
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)
L(end):
	/* Build the exact nibble-per-byte syndrome for the chunk at src.  */
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	/* cntrem = (srcin + cntin) - src = bytes of the zone left from src.  */
	add	tmp, srcin, cntin
	sub	cntrem, tmp, src
	/* The loop loads with ldr q, whose byte order follows the data
	   endianness: on big-endian the first byte already sits in the most
	   significant nibble, so clz alone finds it.  (The first chunk is
	   loaded with ld1 by byte element and therefore always needs rbit.)  */
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	synd, synd
	/* If nothing matched in the last chunk, clz yields 64, i.e. offset 16,
	   which is never below the (at most 16) remaining bytes, so the
	   select below returns 0.  */
	cmp	cntrem, synd, lsr 2
	add	result, src, synd, lsr 2
	csel	result, result, xzr, hi
	ret

L(nomatch):
	mov	result, 0
	ret

END (__memchr_aarch64_mte)