/*
 * memrchr - find the last occurrence of a character in a memory zone.
 *
 * Copyright (c) 2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

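/* For reference, the C-level contract this routine implements is that of
   memrchr: return a pointer to the last occurrence of (unsigned char) c in
   the first n bytes of s, or NULL if it is absent.  A plain byte-by-byte
   sketch of that contract (illustration only, not part of the build;
   memrchr_ref is a made-up name):

	#include <stddef.h>

	static void *
	memrchr_ref (const void *s, int c, size_t n)
	{
	  const unsigned char *p = s;
	  unsigned char ch = c;
	  while (n--)			// scan backwards from the last byte
	    if (p[n] == ch)
	      return (void *) (p + n);
	  return NULL;
	}
 */
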
#include "../asmdefs.h"

#define srcin		x0
#define chrin		w1
#define cntin		x2
#define result		x0

#define src		x3
#define cntrem		x4
#define synd		x5
#define shift		x6
#define tmp		x7
#define wtmp		w7
#define end		x8
#define endm1		x9

#define vrepchr		v0
#define qdata		q1
#define vdata		v1
#define vhas_chr	v2
#define vrepmask	v3
#define vend		v4
#define dend		d4

/*
   Core algorithm:

   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
   requested character. Bits 4-7 must be zero. Bits 4-7 are set likewise for
   odd bytes so that adjacent bytes can be merged. Since the bits in the
   syndrome reflect the order in which things occur in the original buffer,
   counting leading zeros identifies the last byte that matched.  */

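/* A rough scalar model of the per-chunk syndrome scheme described above,
   for illustration only (chunk_memrchr_ref is a hypothetical helper, not
   part of the build; it uses the GCC/Clang __builtin_clzll builtin):

	#include <stdint.h>

	// Return a pointer to the last byte in a 16-byte chunk equal to c,
	// or 0 if the chunk contains no match.
	static const unsigned char *
	chunk_memrchr_ref (const unsigned char *chunk, unsigned char c)
	{
	  uint64_t synd = 0;
	  for (int i = 0; i < 16; i++)
	    if (chunk[i] == c)
	      synd |= (uint64_t) 0xf << (4 * i);	// 4 bits per byte, in order
	  if (synd == 0)
	    return 0;
	  // The highest set nibble marks the last match, so counting leading
	  // zeros and dividing by 4 gives its distance from byte 15.
	  int last = 15 - (__builtin_clzll (synd) >> 2);
	  return chunk + last;
	}
 */
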
ENTRY (__memrchr_aarch64)
	PTR_ARG (0)
	add	end, srcin, cntin	/* end = one past the last byte */
	sub	endm1, end, 1		/* address of the last byte */
	bic	src, endm1, 15		/* aligned chunk containing it */
	cbz	cntin, L(nomatch)
	ld1	{vdata.16b}, [src]
	dup	vrepchr.16b, chrin
	mov	wtmp, 0xf00f
	dup	vrepmask.8h, wtmp
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	neg	shift, end, lsl 2
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
	fmov	synd, dend
	lsl	synd, synd, shift	/* discard bits past the buffer end */
	cbz	synd, L(start_loop)

	clz	synd, synd
	sub	result, endm1, synd, lsr 2
	cmp	cntin, synd, lsr 2
	csel	result, result, xzr, hi	/* 0 if the match precedes srcin */
	ret

L(start_loop):
	sub	tmp, end, src		/* bytes covered by the first chunk */
	subs	cntrem, cntin, tmp	/* bytes remaining below src */
	b.ls	L(nomatch)

	/* Pick the loop entry point so the loop cannot over-read by a whole
	   16-byte chunk: if an odd number of chunks remain, enter at
	   L(loop32_2), where the remaining count is checked.  */
	add	tmp, cntrem, 15
	tbnz	tmp, 4, L(loop32_2)

	.p2align 4
L(loop32):
	ldr	qdata, [src, -16]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)

L(loop32_2):
	ldr	qdata, [src, -16]!
	subs	cntrem, cntrem, 32
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	b.ls	L(end)
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)
L(end):
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend

	add	tmp, src, 15		/* address of the chunk's last byte */
#ifdef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	synd, synd
	sub	tmp, tmp, synd, lsr 2	/* address of the last match */
	cmp	tmp, srcin
	csel	result, tmp, xzr, hs	/* 0 if the match precedes srcin */
	ret

L(nomatch):
	mov	result, 0
	ret

END (__memrchr_aarch64)
