1bbbf1280Sopenharmony_ci/*
2bbbf1280Sopenharmony_ci * strrchr - find last position of a character in a string.
3bbbf1280Sopenharmony_ci *
4bbbf1280Sopenharmony_ci * Copyright (c) 2014-2020, Arm Limited.
5bbbf1280Sopenharmony_ci * SPDX-License-Identifier: MIT
6bbbf1280Sopenharmony_ci */
7bbbf1280Sopenharmony_ci
8bbbf1280Sopenharmony_ci/* Assumptions:
9bbbf1280Sopenharmony_ci *
10bbbf1280Sopenharmony_ci * ARMv8-a, AArch64
11bbbf1280Sopenharmony_ci * Neon Available.
12bbbf1280Sopenharmony_ci */
13bbbf1280Sopenharmony_ci
14bbbf1280Sopenharmony_ci#include "../asmdefs.h"
15bbbf1280Sopenharmony_ci
/* Register aliases.  Everything named here lies in the AAPCS64
   caller-saved set (x0-x17, v0-v7, v16-v31).  */

/* Arguments and results.  */
#define srcin		x0	/* In: start of the string.  */
#define chrin		w1	/* In: character to search for.  */

#define result		x0	/* Out: last match, or NULL.  */

/* Scalar working registers.  */
#define src		x2	/* Next 32-byte hunk to load.  */
#define tmp1		x3	/* Misalignment of srcin, then shift count.  */
#define wtmp2		w4	/* Staging for the lane-mask constant.  */
#define tmp3		x5	/* Padding mask, later the clz result.  */
#define src_match	x6	/* Value of src just past the hunk holding
				   the most recent match.  */
#define src_offset	x7	/* Character syndrome of that hunk; zero
				   while no match has been seen.  */
#define const_m1	x8	/* All ones.  */
#define tmp4		x9	/* Syndrome bits up to the first NUL.  */
#define nul_match	x10	/* NUL syndrome of the current hunk.  */
#define chr_match	x11	/* Character syndrome of the current hunk.  */

/* Vector working registers.  */
#define vrepchr		v0	/* chrin replicated into every byte.  */
#define vdata1		v1	/* Bytes 0-15 of the current hunk.  */
#define vdata2		v2	/* Bytes 16-31 of the current hunk.  */
#define vhas_nul1	v3	/* Per-byte "is NUL" for vdata1.  */
#define vhas_nul2	v4	/* Per-byte "is NUL" for vdata2.  */
#define vhas_chr1	v5	/* Per-byte "is chrin" for vdata1.  */
#define vhas_chr2	v6	/* Per-byte "is chrin" for vdata2.  */
#define vrepmask_0	v7	/* 0x80200802 in each word: NUL lane mask.  */
#define vrepmask_c	v16	/* 0x40100401 in each word: match lane mask.  */
#define vend1		v17	/* Narrowed syndromes: NUL in d[0],
				   character in d[1].  */
44bbbf1280Sopenharmony_ci
/* Core algorithm.

   For each 32-byte hunk we calculate 64-bit syndrome values, with
   two bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems).  In the character syndrome, bit 0 of
   a tuple is set iff the relevant byte matched the requested
   character; in the NUL syndrome, bit 1 of a tuple is set iff the
   relevant byte matched the NUL end of string.  Since the bits in
   the syndromes reflect exactly the order in which things occur in
   the original string, the lowest set bit of the NUL syndrome
   identifies exactly which byte terminates the string.  All
   character-syndrome bits up to and including that bit are kept, so
   a terminator that is itself the requested character still matches
   through its bit 0 (the special case of looking for NUL).  A
   count_leading_zeros() operation on what remains then identifies
   the last matching byte.  */
56bbbf1280Sopenharmony_ci
/* char *strrchr (const char *s, int c)

   In:	 x0 (srcin)  = s, a NUL-terminated string.
	 w1 (chrin)  = c; only its low byte takes part in the compare.
   Out:	 x0 (result) = address of the last byte equal to c that lies at
	 or before the terminating NUL (the terminator itself when c is
	 NUL), or NULL when there is no such byte.
   Clobbers: x2-x11, v0-v7, v16, v17 and the condition flags.
   Stack: not used.

   The string is read in 32-byte aligned hunks, so bytes that share a
   hunk with the string but lie before its start or after its
   terminator are loaded as well; they never influence the result.  */
ENTRY (__strrchr_aarch64)
	/* PTR_ARG comes from asmdefs.h, which is not shown here;
	   presumably it fixes up pointer argument 0 for the target ABI
	   - confirm against that header.  */
	PTR_ARG (0)
	/* Magic constant 0x40100401 to allow us to identify which lane
	   matches the requested byte.  Magic constant 0x80200802 used
	   similarly for NUL termination.  Within each group of four
	   bytes they select bit 0 (character) or bit 1 (NUL) of every
	   byte's two-bit syndrome slot, so the pairwise adds below can
	   never carry from one slot into the next.  */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
	dup	vrepmask_c.4s, wtmp2
	mov	src_offset, #0		/* No match recorded so far.  */
	ands	tmp1, srcin, #31	/* Bytes of padding before srcin.  */
	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
	b.eq	L(aligned)

	/* Input string is not 32-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	neg	tmp1, tmp1
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b	// 256->128
	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	addp	vend1.16b, vhas_nul1.16b, vhas_chr1.16b		// 128->64
	mov	nul_match, vend1.d[0]
	/* A register shift count is taken modulo 64, so shifting all
	   ones right by -2 * padding leaves exactly the low 2 * padding
	   bits set: the syndrome slots of the bytes before srcin.  */
	lsl	tmp1, tmp1, #1
	mov	const_m1, #~0
	lsr	tmp3, const_m1, tmp1
	mov	chr_match, vend1.d[1]

	bic	nul_match, nul_match, tmp3	// Mask padding bits.
	bic	chr_match, chr_match, tmp3	// Mask padding bits.
	cbnz	nul_match, L(tail)

	.p2align 4
L(loop):
	/* The hunk just examined holds no NUL.  If it held a match,
	   remember it: src has already been advanced past that hunk.  */
	cmp	chr_match, #0
	csel	src_match, src, src_match, ne
	csel	src_offset, chr_match, src_offset, ne
L(aligned):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* A pairwise minimum is zero iff one byte of the pair is NUL.
	   Inside the loop that is all we need: nul_match only says
	   whether the hunk contains a NUL, not where.  */
	uminp	vend1.16b, vdata1.16b, vdata2.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
	cmeq	vend1.16b, vend1.16b, #0
	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	addp	vend1.16b, vend1.16b, vhas_chr1.16b		// 128->64
	mov	nul_match, vend1.d[0]
	mov	chr_match, vend1.d[1]
	cbz	nul_match, L(loop)

	/* This hunk contains the terminator.  Build the exact NUL
	   syndrome (bit 1 of each byte's slot) that L(tail) expects.  */
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
	mov	nul_match, vhas_nul1.d[0]

L(tail):
	/* Work out exactly where the string ends.  x ^ (x - 1) keeps
	   every bit up to and including the lowest set bit of x, i.e.
	   the slots of all bytes up to and including the first NUL.  */
	sub	tmp4, nul_match, #1
	eor	tmp4, tmp4, nul_match
	ands	chr_match, chr_match, tmp4
	/* And pick the values corresponding to the last match.  */
	csel	src_match, src, src_match, ne
	csel	src_offset, chr_match, src_offset, ne

	/* Count down from the top of the syndrome to find the last match.  */
	clz	tmp3, src_offset
	/* Src_match points beyond the word containing the match, so we can
	   simply subtract half the bit-offset into the syndrome.  Because
	   we are counting down, we need to go back one more character.
	   With the top set bit at 2 * j this gives (63 - 2 * j + 2) / 2
	   = 32 - j, so the result is the hunk base plus j.  */
	add	tmp3, tmp3, #2
	sub	result, src_match, tmp3, lsr #1
	/* But if the syndrome shows no match was found, then return NULL.
	   In that case src_match was never written and the value just
	   computed from it is discarded here.  */
	cmp	src_offset, #0
	csel	result, result, xzr, ne

	ret

END (__strrchr_aarch64)
149bbbf1280Sopenharmony_ci
150