/*
 * memchr - find a character in a memory zone
 *
 * Copyright (c) 2014-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

#include "../asmdefs.h"

/* C equivalent:  void *memchr (const void *srcin, int chrin, size_t cntin)
 *
 * AAPCS64: arguments in x0/w1/x2, result in x0.  Only volatile registers
 * (x0-x10, v0-v6) are used, so no stack frame or callee-save spills are
 * needed; the routine is a leaf and returns via ret.  */

/* Arguments and results.  */
#define srcin x0        /* in:  base pointer to search.  */
#define chrin w1        /* in:  character to find (low 8 bits used).  */
#define cntin x2        /* in:  number of bytes to search.  */

#define result x0       /* out: pointer to match, or NULL.  */

/* Scratch registers.  */
#define src x3          /* 32-byte-aligned cursor into the buffer.  */
#define tmp x4          /* transient arithmetic.  */
#define wtmp2 w5        /* holds the 0x40100401 magic constant.  */
#define synd x6         /* 64-bit syndrome: 2 bits per input byte.  */
#define soff x9         /* srcin & 31: misalignment of the start.  */
#define cntrem x10      /* cntin & 31: bytes beyond the last full block.  */

/* NEON registers (all caller-saved under AAPCS64).  */
#define vrepchr v0      /* chrin replicated into all 16 lanes.  */
#define vdata1 v1       /* first 16 bytes of the current 32-byte block.  */
#define vdata2 v2       /* second 16 bytes of the current block.  */
#define vhas_chr1 v3    /* per-byte 0x00/0xff match mask for vdata1.  */
#define vhas_chr2 v4    /* per-byte 0x00/0xff match mask for vdata2.  */
#define vrepmask v5     /* 0x40100401 replicated: lane-numbering mask.  */
#define vend v6         /* pairwise-add accumulator for the syndrome.  */

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
 * requested character and bit 1 is not used (faster than using a 32bit
 * syndrome). Since the bits in the syndrome reflect exactly the order in which
 * things occur in the original string, counting trailing zeros allows to
 * identify exactly which byte has matched.
 */

ENTRY (__memchr_aarch64)
	PTR_ARG (0)
	SIZE_ARG (2)
	/* Do not dereference srcin if no bytes to compare.  */
	cbz	cntin, L(zero_length)
	/*
	 * Magic constant 0x40100401 allows us to identify which lane matches
	 * the requested byte.
	 */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	/* Work with aligned 32-byte chunks */
	bic	src, srcin, #31
	dup	vrepmask.4s, wtmp2
	/* ands sets Z iff srcin is already 32-byte aligned.  */
	ands	soff, srcin, #31
	and	cntrem, cntin, #31
	b.eq	L(loop)

	/*
	 * Input string is not 32-byte aligned. We calculate the syndrome
	 * value for the aligned 32 bytes block containing the first bytes
	 * and mask the irrelevant part.
	 */

	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	/* cntin += soff - 32: fold the bytes consumed by this first aligned
	   block into the remaining count.  The flags from adds stay live
	   across the NEON ops below (they do not touch NZCV) and feed the
	   b.ls at the end of this section.  */
	sub	tmp, soff, #32
	adds	cntin, cntin, tmp
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* AND with 0x40100401 gives each matching byte a unique bit;
	   two addp passes then compress 256 bits of match mask into the
	   64-bit syndrome with 2 bits per input byte, in input order.  */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Clear the soff*2 lower bits: those correspond to the bytes of the
	   aligned block that lie before srcin and must not report a match.  */
	lsl	tmp, soff, #1
	lsr	synd, synd, tmp
	lsl	synd, synd, tmp
	/* The first block can also be the last (cntin <= 0 after adds).  */
	b.ls	L(masklast)
	/* Have we found something already? */
	cbnz	synd, L(tail)

L(loop):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	/* subs flags remain valid at L(end): the intervening NEON
	   instructions do not write NZCV.  */
	subs	cntin, cntin, #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* If we're out of data we finish regardless of the result */
	b.ls	L(end)
	/* Use a fast check for the termination condition: a plain OR of the
	   match masks is cheaper than building the full syndrome and is
	   enough to detect "any match in this block".  */
	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
	addp	vend.2d, vend.2d, vend.2d
	mov	synd, vend.d[0]
	/* We're not out of data, loop if we haven't found the character */
	cbz	synd, L(loop)

L(end):
	/* Termination condition found, let's calculate the syndrome value */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Only do the clear for the last possible block: hs (from the subs
	   above) means a full 32 valid bytes remained, so no masking is
	   needed; otherwise fall through and mask the bytes past the end.  */
	b.hs	L(tail)

L(masklast):
	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits: bits for
	   bytes past the end of the buffer within this final block.  */
	add	tmp, cntrem, soff
	and	tmp, tmp, #31
	sub	tmp, tmp, #32
	neg	tmp, tmp, lsl #1
	lsl	synd, synd, tmp
	lsr	synd, synd, tmp

L(tail):
	/* Count the trailing zeros using bit reversing (rbit + clz == ctz;
	   AArch64 has no direct count-trailing-zeros instruction).  */
	rbit	synd, synd
	/* Compensate the last post-increment */
	sub	src, src, #32
	/* Check that we have found a character (sets Z for the csel below;
	   clz does not alter the flags).  */
	cmp	synd, #0
	/* And count the leading zeros */
	clz	synd, synd
	/* Compute the potential result: syndrome bit index / 2 is the byte
	   offset within the block.  */
	add	result, src, synd, lsr #1
	/* Select result or NULL */
	csel	result, xzr, result, eq
	ret

L(zero_length):
	mov	result, #0
	ret

END (__memchr_aarch64)