/*
 * strrchr - find last position of a character in a string.
 *
 * Copyright (c) 2014-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon available.
 */

#include "../asmdefs.h"

/* Arguments and results.  */
#define srcin		x0
#define chrin		w1

#define result		x0

#define src		x2
#define tmp1		x3
#define wtmp2		w4
#define tmp3		x5
#define src_match	x6
#define src_offset	x7
#define const_m1	x8
#define tmp4		x9
#define nul_match	x10
#define chr_match	x11

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_nul1	v3
#define vhas_nul2	v4
#define vhas_chr1	v5
#define vhas_chr2	v6
#define vrepmask_0	v7
#define vrepmask_c	v16
#define vend1		v17
#define vend2		v18

/* Core algorithm.

   For each 32-byte hunk we calculate a 64-bit syndrome value, with
   two bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems).  For each tuple, bit 0 is set iff
   the relevant byte matched the requested character; bit 1 is set
   iff the relevant byte matched the NUL end of string (we trigger
   off bit 0 for the special case of looking for NUL).  Since the
   bits in the syndrome reflect exactly the order in which things
   occur in the original string, a count_trailing_zeros() operation
   will identify exactly which byte is causing the termination, and
   why.  */
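
/* As a rough C sketch of the per-hunk decode (hunk_end and the other
   names here are illustrative only, not part of this file), the code
   below keeps two parallel 64-bit syndromes, with character matches
   on the even bits and NUL matches on the odd bits:

	uint64_t chr_syn = ...;	// bit 2*i set:     byte i matched chrin
	uint64_t nul_syn = ...;	// bit 2*i + 1 set: byte i was NUL
	// Keep only matches at or before the terminating NUL.
	chr_syn &= nul_syn ^ (nul_syn - 1);
	// The highest surviving bit marks the last match in the hunk.
	if (chr_syn)
		last = hunk_end - ((__builtin_clzll (chr_syn) + 2) >> 1);
 */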

ENTRY (__strrchr_aarch64)
	PTR_ARG (0)
	/* Magic constant 0x40100401 to allow us to identify which lane
	   matches the requested byte.  Magic constant 0x80200802 used
	   similarly for NUL termination.  */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
	dup	vrepmask_c.4s, wtmp2
	mov	src_offset, #0
	ands	tmp1, srcin, #31
	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
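	/* Each 4-byte group of vrepmask_c holds the bytes 0x01, 0x04,
	   0x10, 0x40: one distinct bit per byte.  Doubling it yields
	   0x80200802, the NUL mask, whose bits interleave with these, so
	   character and NUL matches occupy even and odd syndrome bits
	   respectively.  */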
	b.eq	L(aligned)

	/* Input string is not 32-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	neg	tmp1, tmp1
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b	// 256->128
	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	addp	vend1.16b, vhas_nul1.16b, vhas_chr1.16b		// 128->64
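	/* Because the masks leave one distinct bit per byte, the pairwise
	   adds are lossless: two addp steps fold the 32 compare bytes
	   into a 64-bit syndrome in each half of vend1, NUL in d[0] and
	   the character in d[1].  */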
	mov	nul_match, vend1.d[0]
	lsl	tmp1, tmp1, #1
	mov	const_m1, #~0
	lsr	tmp3, const_m1, tmp1
	mov	chr_match, vend1.d[1]

	bic	nul_match, nul_match, tmp3	// Mask padding bits.
	bic	chr_match, chr_match, tmp3	// Mask padding bits.
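	/* tmp1 holds -2 * (srcin & 31) and the shift uses it modulo 64,
	   so tmp3 ends up with the low 2 * (srcin & 31) bits set: exactly
	   the syndrome bits of the padding bytes before srcin, which the
	   bics just cleared.  */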
	cbnz	nul_match, L(tail)

	.p2align 4
L(loop):
	cmp	chr_match, #0
	csel	src_match, src, src_match, ne
	csel	src_offset, chr_match, src_offset, ne
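	/* If the last hunk examined contained a character match, record
	   its end address and syndrome; later hunks overwrite earlier
	   ones, so the pair always describes the rightmost match seen so
	   far.  */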
L(aligned):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	uminp	vend1.16b, vdata1.16b, vdata2.16b
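	/* uminp folds the two data vectors with one pairwise unsigned
	   minimum: a zero byte anywhere in the 32 bytes yields a zero in
	   vend1, so the single cmeq against zero below detects NUL
	   without building the full NUL syndrome inside the hot loop.  */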
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
	cmeq	vend1.16b, vend1.16b, 0
	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	addp	vend1.16b, vend1.16b, vhas_chr1.16b		// 128->64
	mov	nul_match, vend1.d[0]
	mov	chr_match, vend1.d[1]
	cbz	nul_match, L(loop)

	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
	mov	nul_match, vhas_nul1.d[0]
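	/* The loop above only proved a NUL is present; here, off the hot
	   path, the exact per-byte NUL syndrome is rebuilt with the same
	   masked-compare-and-addp scheme as the unaligned prologue.  */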

L(tail):
	/* Work out exactly where the string ends.  */
	sub	tmp4, nul_match, #1
	eor	tmp4, tmp4, nul_match
	ands	chr_match, chr_match, tmp4
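	/* nul_match ^ (nul_match - 1) sets every bit up to and including
	   the lowest NUL bit, so the ands drops character matches beyond
	   the terminator and sets the flags for the csels below.  */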
	/* And pick the values corresponding to the last match.  */
	csel	src_match, src, src_match, ne
	csel	src_offset, chr_match, src_offset, ne

	/* Count down from the top of the syndrome to find the last match.  */
	clz	tmp3, src_offset
	/* src_match points just past the hunk containing the match, so we
	   can simply subtract half the bit-offset into the syndrome.
	   Because we are counting down, we need to go back one more
	   character.  */
	add	tmp3, tmp3, #2
	sub	result, src_match, tmp3, lsr #1
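	/* Concretely: a match at byte i of the hunk sets syndrome bit
	   2*i, so clz = 63 - 2*i and (clz + 2) >> 1 = 32 - i, i.e.
	   result = src_match - (32 - i) = start of hunk + i.  */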
	/* But if the syndrome shows no match was found, then return NULL.  */
	cmp	src_offset, #0
	csel	result, result, xzr, ne

	ret

END (__strrchr_aarch64)