1bbbf1280Sopenharmony_ci/*
2bbbf1280Sopenharmony_ci * memchr - find a character in a memory zone
3bbbf1280Sopenharmony_ci *
4bbbf1280Sopenharmony_ci * Copyright (c) 2014-2020, Arm Limited.
5bbbf1280Sopenharmony_ci * SPDX-License-Identifier: MIT
6bbbf1280Sopenharmony_ci */
7bbbf1280Sopenharmony_ci
8bbbf1280Sopenharmony_ci/* Assumptions:
9bbbf1280Sopenharmony_ci *
10bbbf1280Sopenharmony_ci * ARMv8-a, AArch64
11bbbf1280Sopenharmony_ci * Neon Available.
12bbbf1280Sopenharmony_ci */
13bbbf1280Sopenharmony_ci
14bbbf1280Sopenharmony_ci#include "../asmdefs.h"
15bbbf1280Sopenharmony_ci
16bbbf1280Sopenharmony_ci/* Arguments and results.  */
17bbbf1280Sopenharmony_ci#define srcin		x0
18bbbf1280Sopenharmony_ci#define chrin		w1
19bbbf1280Sopenharmony_ci#define cntin		x2
20bbbf1280Sopenharmony_ci
21bbbf1280Sopenharmony_ci#define result		x0
22bbbf1280Sopenharmony_ci
23bbbf1280Sopenharmony_ci#define src		x3
24bbbf1280Sopenharmony_ci#define	tmp		x4
25bbbf1280Sopenharmony_ci#define wtmp2		w5
26bbbf1280Sopenharmony_ci#define synd		x6
27bbbf1280Sopenharmony_ci#define soff		x9
28bbbf1280Sopenharmony_ci#define cntrem		x10
29bbbf1280Sopenharmony_ci
30bbbf1280Sopenharmony_ci#define vrepchr		v0
31bbbf1280Sopenharmony_ci#define vdata1		v1
32bbbf1280Sopenharmony_ci#define vdata2		v2
33bbbf1280Sopenharmony_ci#define vhas_chr1	v3
34bbbf1280Sopenharmony_ci#define vhas_chr2	v4
35bbbf1280Sopenharmony_ci#define vrepmask	v5
36bbbf1280Sopenharmony_ci#define vend		v6
37bbbf1280Sopenharmony_ci
38bbbf1280Sopenharmony_ci/*
39bbbf1280Sopenharmony_ci * Core algorithm:
40bbbf1280Sopenharmony_ci *
41bbbf1280Sopenharmony_ci * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
42bbbf1280Sopenharmony_ci * per byte. For each tuple, bit 0 is set if the relevant byte matched the
43bbbf1280Sopenharmony_ci * requested character and bit 1 is not used (faster than using a 32bit
44bbbf1280Sopenharmony_ci * syndrome). Since the bits in the syndrome reflect exactly the order in which
45bbbf1280Sopenharmony_ci * things occur in the original string, counting trailing zeros allows to
46bbbf1280Sopenharmony_ci * identify exactly which byte has matched.
47bbbf1280Sopenharmony_ci */
48bbbf1280Sopenharmony_ci
49bbbf1280Sopenharmony_ciENTRY (__memchr_aarch64)
50bbbf1280Sopenharmony_ci	PTR_ARG (0)
51bbbf1280Sopenharmony_ci	SIZE_ARG (2)
52bbbf1280Sopenharmony_ci	/* Do not dereference srcin if no bytes to compare.  */
53bbbf1280Sopenharmony_ci	cbz	cntin, L(zero_length)
54bbbf1280Sopenharmony_ci	/*
55bbbf1280Sopenharmony_ci	 * Magic constant 0x40100401 allows us to identify which lane matches
56bbbf1280Sopenharmony_ci	 * the requested byte.
57bbbf1280Sopenharmony_ci	 */
58bbbf1280Sopenharmony_ci	mov	wtmp2, #0x0401
59bbbf1280Sopenharmony_ci	movk	wtmp2, #0x4010, lsl #16
60bbbf1280Sopenharmony_ci	dup	vrepchr.16b, chrin
61bbbf1280Sopenharmony_ci	/* Work with aligned 32-byte chunks */
62bbbf1280Sopenharmony_ci	bic	src, srcin, #31
63bbbf1280Sopenharmony_ci	dup	vrepmask.4s, wtmp2
64bbbf1280Sopenharmony_ci	ands	soff, srcin, #31
65bbbf1280Sopenharmony_ci	and	cntrem, cntin, #31
66bbbf1280Sopenharmony_ci	b.eq	L(loop)
67bbbf1280Sopenharmony_ci
68bbbf1280Sopenharmony_ci	/*
69bbbf1280Sopenharmony_ci	 * Input string is not 32-byte aligned. We calculate the syndrome
70bbbf1280Sopenharmony_ci	 * value for the aligned 32 bytes block containing the first bytes
71bbbf1280Sopenharmony_ci	 * and mask the irrelevant part.
72bbbf1280Sopenharmony_ci	 */
73bbbf1280Sopenharmony_ci
74bbbf1280Sopenharmony_ci	ld1	{vdata1.16b, vdata2.16b}, [src], #32
75bbbf1280Sopenharmony_ci	sub	tmp, soff, #32
76bbbf1280Sopenharmony_ci	adds	cntin, cntin, tmp
77bbbf1280Sopenharmony_ci	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
78bbbf1280Sopenharmony_ci	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
79bbbf1280Sopenharmony_ci	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
80bbbf1280Sopenharmony_ci	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
81bbbf1280Sopenharmony_ci	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
82bbbf1280Sopenharmony_ci	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
83bbbf1280Sopenharmony_ci	mov	synd, vend.d[0]
84bbbf1280Sopenharmony_ci	/* Clear the soff*2 lower bits */
85bbbf1280Sopenharmony_ci	lsl	tmp, soff, #1
86bbbf1280Sopenharmony_ci	lsr	synd, synd, tmp
87bbbf1280Sopenharmony_ci	lsl	synd, synd, tmp
88bbbf1280Sopenharmony_ci	/* The first block can also be the last */
89bbbf1280Sopenharmony_ci	b.ls	L(masklast)
90bbbf1280Sopenharmony_ci	/* Have we found something already? */
91bbbf1280Sopenharmony_ci	cbnz	synd, L(tail)
92bbbf1280Sopenharmony_ci
93bbbf1280Sopenharmony_ciL(loop):
94bbbf1280Sopenharmony_ci	ld1	{vdata1.16b, vdata2.16b}, [src], #32
95bbbf1280Sopenharmony_ci	subs	cntin, cntin, #32
96bbbf1280Sopenharmony_ci	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
97bbbf1280Sopenharmony_ci	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
98bbbf1280Sopenharmony_ci	/* If we're out of data we finish regardless of the result */
99bbbf1280Sopenharmony_ci	b.ls	L(end)
100bbbf1280Sopenharmony_ci	/* Use a fast check for the termination condition */
101bbbf1280Sopenharmony_ci	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
102bbbf1280Sopenharmony_ci	addp	vend.2d, vend.2d, vend.2d
103bbbf1280Sopenharmony_ci	mov	synd, vend.d[0]
104bbbf1280Sopenharmony_ci	/* We're not out of data, loop if we haven't found the character */
105bbbf1280Sopenharmony_ci	cbz	synd, L(loop)
106bbbf1280Sopenharmony_ci
107bbbf1280Sopenharmony_ciL(end):
108bbbf1280Sopenharmony_ci	/* Termination condition found, let's calculate the syndrome value */
109bbbf1280Sopenharmony_ci	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
110bbbf1280Sopenharmony_ci	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
111bbbf1280Sopenharmony_ci	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
112bbbf1280Sopenharmony_ci	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
113bbbf1280Sopenharmony_ci	mov	synd, vend.d[0]
114bbbf1280Sopenharmony_ci	/* Only do the clear for the last possible block */
115bbbf1280Sopenharmony_ci	b.hs	L(tail)
116bbbf1280Sopenharmony_ci
117bbbf1280Sopenharmony_ciL(masklast):
118bbbf1280Sopenharmony_ci	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
119bbbf1280Sopenharmony_ci	add	tmp, cntrem, soff
120bbbf1280Sopenharmony_ci	and	tmp, tmp, #31
121bbbf1280Sopenharmony_ci	sub	tmp, tmp, #32
122bbbf1280Sopenharmony_ci	neg	tmp, tmp, lsl #1
123bbbf1280Sopenharmony_ci	lsl	synd, synd, tmp
124bbbf1280Sopenharmony_ci	lsr	synd, synd, tmp
125bbbf1280Sopenharmony_ci
126bbbf1280Sopenharmony_ciL(tail):
127bbbf1280Sopenharmony_ci	/* Count the trailing zeros using bit reversing */
128bbbf1280Sopenharmony_ci	rbit	synd, synd
129bbbf1280Sopenharmony_ci	/* Compensate the last post-increment */
130bbbf1280Sopenharmony_ci	sub	src, src, #32
131bbbf1280Sopenharmony_ci	/* Check that we have found a character */
132bbbf1280Sopenharmony_ci	cmp	synd, #0
133bbbf1280Sopenharmony_ci	/* And count the leading zeros */
134bbbf1280Sopenharmony_ci	clz	synd, synd
135bbbf1280Sopenharmony_ci	/* Compute the potential result */
136bbbf1280Sopenharmony_ci	add	result, src, synd, lsr #1
137bbbf1280Sopenharmony_ci	/* Select result or NULL */
138bbbf1280Sopenharmony_ci	csel	result, xzr, result, eq
139bbbf1280Sopenharmony_ci	ret
140bbbf1280Sopenharmony_ci
141bbbf1280Sopenharmony_ciL(zero_length):
142bbbf1280Sopenharmony_ci	mov	result, #0
143bbbf1280Sopenharmony_ci	ret
144bbbf1280Sopenharmony_ci
145bbbf1280Sopenharmony_ciEND (__memchr_aarch64)
146bbbf1280Sopenharmony_ci
147