/*
 * memchr - find a character in a memory zone
 *
 * Copyright (c) 2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "../asmdefs.h"

#define srcin		x0
#define chrin		w1
#define cntin		x2
#define result		x0

#define src		x3
#define cntrem		x4
#define synd		x5
#define shift		x6
#define tmp		x7
#define wtmp		w7

#define vrepchr		v0
#define qdata		q1
#define vdata		v1
#define vhas_chr	v2
#define vrepmask	v3
#define vend		v4
#define dend		d4

/*
   Core algorithm:

   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
   requested character; bits 4-7 must be zero. Bits 4-7 are set likewise for
   odd bytes so that adjacent bytes can be merged into a single syndrome byte.
   Since the bits in the syndrome reflect the order in which bytes occur in the
   original buffer, counting trailing zeros identifies exactly which byte
   matched.  */

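/*
   A minimal C model of the syndrome trick for one 16-byte chunk (illustrative
   only; it is not part of the build and the helper name is hypothetical):

	#include <stdint.h>

	// Nibble i of the syndrome is 0xf if chunk[i] equals c, else 0.
	static uint64_t
	chunk_syndrome (const unsigned char *chunk, unsigned char c)
	{
	  uint64_t synd = 0;
	  for (int i = 0; i < 16; i++)
	    if (chunk[i] == c)
	      synd |= (uint64_t) 0xf << (4 * i);
	  return synd;
	}

	// Offset of the first match within the chunk; the assembly uses
	// rbit + clz, which amounts to a count of trailing zeros:
	//   offset = __builtin_ctzll (synd) / 4;
*/
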
ENTRY (__memchr_aarch64_mte)
	PTR_ARG (0)
	SIZE_ARG (2)
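	/* Work on the 16-byte aligned chunk containing the start of the
	   buffer; the shift below removes bytes before srcin from the
	   syndrome.  */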
	bic	src, srcin, 15		/* Align src down to 16 bytes.  */
	cbz	cntin, L(nomatch)
	ld1	{vdata.16b}, [src]
	dup	vrepchr.16b, chrin
	mov	wtmp, 0xf00f		/* Mask 0x0f for even bytes, 0xf0 for odd.  */
	dup	vrepmask.8h, wtmp
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	lsl	shift, srcin, 2		/* Bits to discard: 4 per byte below srcin.  */
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
	fmov	synd, dend
	lsr	synd, synd, shift	/* Discard bytes before the buffer start.  */
	cbz	synd, L(start_loop)

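	/* A match was found in the first chunk.  rbit + clz counts the
	   trailing zeros of the syndrome; dividing by 4 gives the offset of
	   the first matching byte from srcin.  Return NULL if that offset is
	   not below cntin.  */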
	rbit	synd, synd
	clz	synd, synd
	add	result, srcin, synd, lsr 2
	cmp	cntin, synd, lsr 2
	csel	result, result, xzr, hi
	ret

L(start_loop):
	sub	tmp, src, srcin
	add	tmp, tmp, 16		/* Bytes examined in the first chunk.  */
	subs	cntrem, cntin, tmp	/* Bytes left after the first chunk.  */
	b.ls	L(nomatch)		/* Buffer ends within the first chunk.  */

	/* Enter the loop at its second half when an odd number of 16-byte
	   chunks remains, so the 32-byte loop never reads a chunk beyond the
	   buffer.  */
	add	tmp, cntrem, 15
	tbnz	tmp, 4, L(loop32_2)
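	/* Main loop: examine 32 bytes per iteration.  umaxp reduces each
	   16-byte comparison to 64 bits so a single scalar test detects a
	   match.  */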
	.p2align 4
L(loop32):
	ldr	qdata, [src, 16]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)

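	/* Second half of the loop: also counts down the remaining bytes and
	   exits to L(end) once the count reaches zero or goes negative.  */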
L(loop32_2):
	ldr	qdata, [src, 16]!
	subs	cntrem, cntrem, 32
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	b.ls	L(end)			/* Out of bytes: handle the tail.  */
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)
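	/* Tail: rebuild the 4-bit-per-byte syndrome for the last chunk
	   examined and check whether the first match, if any, lies within the
	   buffer; otherwise return NULL.  */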
L(end):
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	add	tmp, srcin, cntin
	sub	cntrem, tmp, src	/* Valid bytes at or after src.  */
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	synd, synd
	cmp	cntrem, synd, lsr 2
	add	result, src, synd, lsr 2
	csel	result, result, xzr, hi	/* NULL if the match is past the end.  */
	ret

L(nomatch):
	mov	result, 0
	ret

END (__memchr_aarch64_mte)