/*
 * memchr - find a character in a memory zone
 *
 * Copyright (c) 2014-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon available.
 */

#include "../asmdefs.h"

/* Arguments and results.  */
#define srcin		x0
#define chrin		w1
#define cntin		x2

#define result		x0

#define src		x3
#define	tmp		x4
#define wtmp2		w5
#define synd		x6
#define soff		x9
#define cntrem		x10

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_chr1	v3
#define vhas_chr2	v4
#define vrepmask	v5
#define vend		v6

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
 * requested character and bit 1 is not used (faster than using a 32-bit
 * syndrome). Since the bits in the syndrome reflect exactly the order in which
 * things occur in the original string, counting trailing zeros allows us to
 * identify exactly which byte has matched.
 */
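
/*
 * As a rough illustration, a C sketch of the same idea (this is only a
 * model of what the Neon code below computes; "chunk" and "c" are
 * hypothetical names, and ctz stands for count-trailing-zeros):
 *
 *	uint64_t syndrome = 0;
 *	for (int i = 0; i < 32; i++)
 *		if (chunk[i] == c)
 *			syndrome |= 1ULL << (2 * i);
 *	// if nonzero, the first match is at chunk + (ctz (syndrome) >> 1)
 */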

ENTRY (__memchr_aarch64)
	PTR_ARG (0)
	SIZE_ARG (2)
	/* Do not dereference srcin if no bytes to compare.  */
	cbz	cntin, L(zero_length)
	/*
	 * Magic constant 0x40100401 allows us to identify which lane matches
	 * the requested byte.
	 */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
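	/*
	 * Each byte of the repeated 0x40100401 word (0x01, 0x04, 0x10, 0x40)
	 * carries a distinct bit, so the pairwise additions below fold the
	 * 32 comparison bytes losslessly into a 64-bit syndrome.
	 */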
	dup	vrepchr.16b, chrin
	/* Work with aligned 32-byte chunks */
	bic	src, srcin, #31
	dup	vrepmask.4s, wtmp2
	ands	soff, srcin, #31
	and	cntrem, cntin, #31
	b.eq	L(loop)

	/*
	 * Input string is not 32-byte aligned. We calculate the syndrome
	 * value for the aligned 32-byte block containing the first bytes
	 * and mask the irrelevant part.
	 */

	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	sub	tmp, soff, #32
	adds	cntin, cntin, tmp
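	/* cntin is now the count of bytes remaining past this first block;
	   the flags from adds record whether that count is <= 0.  */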
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Clear the soff*2 lower bits */
	lsl	tmp, soff, #1
	lsr	synd, synd, tmp
	lsl	synd, synd, tmp
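	/* Any match bits before srcin within the aligned block are now zero.  */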
	/* The first block can also be the last */
	b.ls	L(masklast)
	/* Have we found something already? */
	cbnz	synd, L(tail)

L(loop):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	subs	cntin, cntin, #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* If we're out of data we finish regardless of the result */
	b.ls	L(end)
	/* Use a fast check for the termination condition */
	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
	addp	vend.2d, vend.2d, vend.2d
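	/* The orr/addp fold is cheaper than the full masked syndrome; a
	   nonzero value just says some byte in the chunk matched.  */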
	mov	synd, vend.d[0]
	/* We're not out of data, loop if we haven't found the character */
	cbz	synd, L(loop)

L(end):
	/* Termination condition found, let's calculate the syndrome value */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
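	/* The flags are still those of the subs in the loop: HS means the
	   32-byte block just processed was entirely in bounds, so no
	   upper-bit masking is needed.  */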
	/* Only do the clear for the last possible block */
	b.hs	L(tail)

L(masklast):
	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
	add	tmp, cntrem, soff
	and	tmp, tmp, #31
	sub	tmp, tmp, #32
	neg	tmp, tmp, lsl #1
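	/* tmp is now in [2, 64]; variable shifts use the amount modulo 64,
	   so a value of 64 (buffer ends exactly on a 32-byte boundary)
	   clears nothing, as required.  */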
	lsl	synd, synd, tmp
	lsr	synd, synd, tmp

L(tail):
	/* Count the trailing zeros using bit reversing */
	rbit	synd, synd
	/* Compensate for the last post-increment */
	sub	src, src, #32
	/* Check that we have found a character */
	cmp	synd, #0
	/* And count the leading zeros */
	clz	synd, synd
	/* Compute the potential result: with two syndrome bits per byte,
	   halving the bit index of the first match gives its byte offset */
	add	result, src, synd, lsr #1
	/* Select result or NULL */
	csel	result, xzr, result, eq
	ret

L(zero_length):
	mov	result, #0
	ret

END (__memchr_aarch64)
