/*
 * memchr - find a character in a memory zone
 *
 * Copyright (c) 2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "../asmdefs.h"

/* Arguments and return value.  result shares x0 with srcin, so srcin must
   not be read after result has been written.  */
#define srcin		x0
#define chrin		w1
#define cntin		x2
#define result		x0

/* General-purpose scratch registers.
   src    - 16-byte-aligned address of the chunk currently being examined
   cntrem - byte count used as the loop terminator / final bounds check
   synd   - 64-bit syndrome (four bits per byte of the chunk)
   shift  - 4 * srcin, used to discard syndrome bits that precede srcin
   tmp    - short-lived temporary (wtmp is its 32-bit view)  */
#define src		x3
#define cntrem		x4
#define synd		x5
#define shift		x6
#define tmp		x7
#define wtmp		w7

/* SIMD scratch registers.
   vrepchr  - chrin replicated into all 16 byte lanes
   vdata    - the 16 bytes just loaded (qdata is the same register)
   vhas_chr - per-byte compare result (0xff on match, 0x00 otherwise)
   vrepmask - 0xf00f in every halfword: 0x0f for even lanes, 0xf0 for odd
   vend     - 128->64 bit reduction of vhas_chr (dend is its low 64 bits)  */
#define vrepchr		v0
#define qdata		q1
#define vdata		v1
#define vhas_chr	v2
#define vrepmask	v3
#define vend		v4
#define dend		d4

/*
   Core algorithm:

   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
   requested character. Bits 4-7 must be zero. Bits 4-7 are set likewise for
   odd bytes so that adjacent bytes can be merged with a pairwise add. Since
   the bits in the syndrome reflect the order in which things occur in the
   original buffer, counting trailing zeros (rbit followed by clz) identifies
   exactly which byte matched: the count divided by four is the byte offset
   within the chunk.  */

/* void *__memchr_aarch64_mte (const void *s, int c, size_t n)

   In:   x0 (srcin) = s, start of the region to search
	 w1 (chrin) = c; only the low byte is used (dup to .16b lanes)
	 x2 (cntin) = n, number of bytes to search
   Out:  x0 (result) = address of the first byte equal to c in [s, s + n),
	 or 0 if there is none (this includes n == 0)
   Uses: x3-x7, v0-v4 and the condition flags as scratch.  The body below
	 does not reference sp.

   Every load is a 16-byte access to a 16-byte-aligned address.  The first
   load is skipped when n == 0, and the loop is entered so that the last
   chunk is always loaded by the half that checks the remaining count.

   ENTRY, END, L, PTR_ARG and SIZE_ARG are provided by asmdefs.h.  */

ENTRY (__memchr_aarch64_mte)
	/* NOTE(review): PTR_ARG/SIZE_ARG presumably normalise the pointer and
	   size arguments for ILP32 builds - confirm against asmdefs.h.  */
	PTR_ARG (0)
	SIZE_ARG (2)

	/* First chunk: the aligned 16 bytes that contain srcin.  The zero
	   count check sits before the load so n == 0 never touches memory.  */
	bic	src, srcin, 15
	cbz	cntin, L(nomatch)
	ld1	{vdata.16b}, [src]
	dup	vrepchr.16b, chrin
	mov	wtmp, 0xf00f
	dup	vrepmask.8h, wtmp
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	/* shift = 4 * srcin.  The register form of lsr uses the amount modulo
	   64, i.e. four bits for every chunk byte that lies before srcin.  */
	lsl	shift, srcin, 2
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
	fmov	synd, dend
	/* Discard matches located before srcin.  */
	lsr	synd, synd, shift
	cbz	synd, L(start_loop)

	/* Match in the first chunk.  After the shift, rbit + clz yields
	   4 * (offset of the match from srcin); the match is only returned
	   when that offset is below cntin.  ld1 {.16b} fills the lanes in
	   memory order on either endianness, so unlike L(end) this path has
	   no __AARCH64EB__ special case.  */
	rbit	synd, synd
	clz	synd, synd
	add	result, srcin, synd, lsr 2
	cmp	cntin, synd, lsr 2
	csel	result, result, xzr, hi
	ret

L(start_loop):
	/* tmp = 16 - (srcin - src): bytes of the first chunk at or after
	   srcin.  cntrem = bytes left to search once that chunk is consumed;
	   if cntin <= tmp the buffer ended inside the first chunk.  */
	sub	tmp, src, srcin
	add	tmp, tmp, 16
	subs	cntrem, cntin, tmp
	b.ls	L(nomatch)

	/* Make sure that it won't overread by a 16-byte chunk.
	   Bit 4 of (cntrem + 15) is the parity of the number of chunks still
	   to be read.  Only L(loop32_2) checks the count, so with an odd
	   number of chunks we enter there, which makes the final chunk always
	   land in the count-checking half of the loop.  */
	add	tmp, cntrem, 15
	tbnz	tmp, 4, L(loop32_2)

	.p2align 4
L(loop32):
	/* First half: pre-indexed load advances src by 16.  umaxp folds the
	   compare result to 64 bits purely as an "any byte matched" test; the
	   exact position is worked out at L(end).  */
	ldr	qdata, [src, 16]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)

L(loop32_2):
	/* Second half: also accounts for the 32 bytes of one full iteration.
	   The flags set by subs are consumed by b.ls after cmeq (which leaves
	   the flags alone), so vhas_chr is valid when we leave on count.  */
	ldr	qdata, [src, 16]!
	subs	cntrem, cntrem, 32
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	b.ls	L(end)
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)
L(end):
	/* Build the nibble syndrome for the chunk at src and recompute
	   cntrem exactly as (srcin + cntin) - src, the number of bytes of
	   this chunk that are still inside the buffer.  srcin is read here
	   before result (same register) is written below.  */
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	add	tmp, srcin, cntin
	sub	cntrem, tmp, src
	/* ldr q loads the chunk as a single 128-bit value, so on big-endian
	   the first byte in memory ends up in the most significant lane and
	   the syndrome is already ordered for clz; only little-endian needs
	   the bit reversal.  */
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	/* synd = 4 * (offset of the first match in this chunk).  With no
	   match clz returns 64, giving offset 16; that path is only reached
	   on the last chunk, where cntrem <= 16, so the compare below
	   rejects it.  */
	clz	synd, synd
	cmp	cntrem, synd, lsr 2
	add	result, src, synd, lsr 2
	csel	result, result, xzr, hi
	ret

L(nomatch):
	mov	result, 0
	ret

END (__memchr_aarch64_mte)