/*
 * strchrnul - find a character or nul in a string
 *
 * Copyright (c) 2014-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

#include "../asmdefs.h"

/* Arguments and results.  */
#define srcin		x0
#define chrin		w1

#define result		x0

/* Locals and temporaries.  */
#define src		x2
#define tmp1		x3
#define wtmp2		w4
#define tmp3		x5

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_nul1	v3
#define vhas_nul2	v4
#define vhas_chr1	v5
#define vhas_chr2	v6
#define vrepmask	v7
#define vend1		v16

/* Core algorithm.

   For each 32-byte hunk we calculate a 64-bit syndrome value, with
   two bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems).  For each tuple, bit 0 is set iff
   the relevant byte matched the requested character or nul.  Since the
   bits in the syndrome reflect exactly the order in which things occur
   in the original string a count_trailing_zeros() operation will
   identify exactly which byte is causing the termination.  */

/* __strchrnul_aarch64

   In:     srcin (x0) = address of the first byte of the string.
           chrin (w1) = character to search for; only the low byte is
                        used, because it is replicated with dup .16b.
   Out:    result (x0) = address of the first byte at or after srcin
                        that equals the character or is nul.
   Writes: x2, x3, w4, x5, v0-v7, v16 and the condition flags (ands).
   Memory: read only.  Data is always loaded as whole 32-byte aligned
           hunks, so bytes before srcin and bytes after the terminating
           byte inside the same hunk are read but never reported.
   No stack is used and no other function is called.  */

ENTRY (__strchrnul_aarch64)
	/* PTR_ARG comes from asmdefs.h; presumably it normalises pointer
	   argument 0 for ILP32 builds -- confirm in that header.  */
	PTR_ARG (0)
	/* Magic constant 0x40100401 to allow us to identify which lane
	   matches the termination condition.  Each byte of the constant
	   holds a different even bit (0x01, 0x04, 0x10, 0x40), so the two
	   pairwise adds below pack four source bytes into one syndrome
	   byte without any carry between them.  */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
	dup	vrepmask.4s, wtmp2
	ands	tmp1, srcin, #31	/* tmp1 = number of padding bytes.  */
	b.eq	L(loop)

	/* Input string is not 32-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	neg	tmp1, tmp1
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* vhas_chr is 0xff on a match and 0x00 otherwise, so the unsigned
	   test (vhas_chr >= data) is true for a match or for data == 0.
	   vhas_nul therefore flags "character or nul" in one compare.  */
	cmhs	vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
	cmhs	vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
	and	vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
	/* tmp1 = -2 * padding.  The register shift below only uses the
	   amount modulo 64, i.e. 64 - 2 * padding.  */
	lsl	tmp1, tmp1, #1
	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	mov	tmp3, #~0
	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
	/* tmp1 = mask with the low 2 * padding bits set.  */
	lsr	tmp1, tmp3, tmp1

	mov	tmp3, vend1.d[0]
	bic	tmp1, tmp3, tmp1	// Mask padding bits.
	cbnz	tmp1, L(tail)
	/* No hit in the first hunk: fall through into the aligned loop
	   (src has already been advanced by the post-indexed load).  */

	.p2align 4
L(loop):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	cmhs	vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
	cmhs	vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
	/* Cheap "any byte flagged?" test: OR the two halves, reduce to
	   64 bits with a pairwise max, and check that for zero.  The exact
	   syndrome is only built once the loop has ended.  */
	orr	vend1.16b, vhas_nul1.16b, vhas_nul2.16b
	umaxp	vend1.16b, vend1.16b, vend1.16b
	mov	tmp1, vend1.d[0]
	cbz	tmp1, L(loop)

	/* Termination condition found.  Now need to establish exactly why
	   we terminated.  */
	and	vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64

	mov	tmp1, vend1.d[0]
L(tail):
	/* Here tmp1 holds a non-zero syndrome for the hunk at src - 32.  */
	/* Count the trailing zeros, by bit reversing...  */
	rbit	tmp1, tmp1
	/* Re-bias source.  */
	sub	src, src, #32
	clz	tmp1, tmp1	/* ... and counting the leading zeros.  */
	/* tmp1 is twice the offset into the fragment.  */
	add	result, src, tmp1, lsr #1
	ret

END (__strchrnul_aarch64)
