/*
 * strchrnul - find a character or nul in a string
 *
 * Copyright (c) 2014-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

#include "../asmdefs.h"

/* Interface, as used by the code below:
 *
 *   x0 (srcin)   address of the string to scan.
 *   w1 (chrin)   character to look for; only the low byte is used,
 *		  because it is replicated with a byte-wide DUP.
 *   x0 (result)  address of the first byte that equals the character,
 *		  or of the first nul byte if that comes earlier.
 *
 * The routine makes no calls and never touches the stack.  Apart from
 * whatever PTR_ARG expands to (defined in asmdefs.h, not visible here),
 * it writes x0, x2-x5, v0-v7, v16 and the condition flags.
 *
 * Data is always fetched as whole, 32-byte aligned hunks, so bytes that
 * lie before the start of the string or after the terminating byte but
 * inside the same hunk are read as well.
 */

/* Arguments and results.  */
#define srcin		x0
#define chrin		w1

#define result		x0

/* Locals and temporaries.  */
#define src		x2
#define tmp1		x3
#define wtmp2		w4
#define tmp3		x5

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_nul1	v3
#define vhas_nul2	v4
#define vhas_chr1	v5
#define vhas_chr2	v6
#define vrepmask	v7
#define vend1		v16

/* Core algorithm.

   For each 32-byte hunk we calculate a 64-bit syndrome value, with
   two bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems).  For each tuple, bit 0 is set iff
   the relevant byte matched the requested character or nul; bit 1 is
   never set.  Since the bits in the syndrome reflect exactly the order
   in which things occur in the original string a count_trailing_zeros()
   operation will identify exactly which byte is causing the
   termination.  */

ENTRY (__strchrnul_aarch64)
	/* PTR_ARG comes from asmdefs.h; presumably it normalises pointer
	   argument 0 on ILP32 targets -- confirm against that header.  */
	PTR_ARG (0)
	/* Magic constant 0x40100401 to allow us to identify which lane
	   matches the termination condition.  Replicated per 32-bit
	   element, its bytes are 0x01, 0x04, 0x10, 0x40, so byte i of a
	   vector is given the single bit 2 * (i % 4).  */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
	dup	vrepmask.4s, wtmp2
	ands	tmp1, srcin, #31	/* tmp1 = misalignment (0..31).  */
	b.eq	L(loop)

	/* Input string is not 32-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	neg	tmp1, tmp1
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* Unsigned has_chr >= data holds when has_chr is 0xff (a match)
	   or when data is 0 (a nul), so one compare covers both stop
	   conditions.  */
	cmhs	vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
	cmhs	vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
	and	vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
	lsl	tmp1, tmp1, #1		/* tmp1 = -2 * misalignment.  */
	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	mov	tmp3, #~0
	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
	/* A register-specified shift uses the count modulo 64, so this
	   shifts right by 64 - 2 * misalignment and leaves exactly the
	   low 2 * misalignment bits set: the syndrome bits that belong
	   to the bytes in front of srcin.  */
	lsr	tmp1, tmp3, tmp1

	mov	tmp3, vend1.d[0]
	bic	tmp1, tmp3, tmp1	// Mask padding bits.
	cbnz	tmp1, L(tail)

	.p2align 4
L(loop):
	/* Main loop: only decide whether this hunk contains a stop byte
	   at all; the exact position is worked out after the loop.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	cmhs	vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
	cmhs	vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
	orr	vend1.16b, vhas_nul1.16b, vhas_nul2.16b
	umaxp	vend1.16b, vend1.16b, vend1.16b		// 128->64, non-zero iff any byte hit
	mov	tmp1, vend1.d[0]
	cbz	tmp1, L(loop)

	/* Termination condition found.  Now need to establish exactly why
	   we terminated.  Build the same 2-bits-per-byte syndrome as in
	   the unaligned prologue from the masks of the last hunk.  */
	and	vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b		// 256->128
	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64

	mov	tmp1, vend1.d[0]
L(tail):
	/* Here tmp1 holds a non-zero syndrome for the hunk that was just
	   loaded, and src already points 32 bytes past that hunk.  */
	/* Count the trailing zeros, by bit reversing...  */
	rbit	tmp1, tmp1
	/* Re-bias source.  */
	sub	src, src, #32
	clz	tmp1, tmp1	/* ... and counting the leading zeros.  */
	/* tmp1 is twice the offset into the fragment.  */
	add	result, src, tmp1, lsr #1
	ret

END (__strchrnul_aarch64)