1bbbf1280Sopenharmony_ci/*
2bbbf1280Sopenharmony_ci * strlen - calculate the length of a string.
3bbbf1280Sopenharmony_ci *
4bbbf1280Sopenharmony_ci * Copyright (c) 2020, Arm Limited.
5bbbf1280Sopenharmony_ci * SPDX-License-Identifier: MIT
6bbbf1280Sopenharmony_ci */
7bbbf1280Sopenharmony_ci
8bbbf1280Sopenharmony_ci/* Assumptions:
9bbbf1280Sopenharmony_ci *
10bbbf1280Sopenharmony_ci * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
11bbbf1280Sopenharmony_ci * Not MTE compatible.
12bbbf1280Sopenharmony_ci */
13bbbf1280Sopenharmony_ci
14bbbf1280Sopenharmony_ci#include "../asmdefs.h"
15bbbf1280Sopenharmony_ci
16bbbf1280Sopenharmony_ci#define srcin	x0
17bbbf1280Sopenharmony_ci#define len	x0
18bbbf1280Sopenharmony_ci
19bbbf1280Sopenharmony_ci#define src	x1
20bbbf1280Sopenharmony_ci#define data1	x2
21bbbf1280Sopenharmony_ci#define data2	x3
22bbbf1280Sopenharmony_ci#define has_nul1 x4
23bbbf1280Sopenharmony_ci#define has_nul2 x5
24bbbf1280Sopenharmony_ci#define tmp1	x4
25bbbf1280Sopenharmony_ci#define tmp2	x5
26bbbf1280Sopenharmony_ci#define tmp3	x6
27bbbf1280Sopenharmony_ci#define tmp4	x7
28bbbf1280Sopenharmony_ci#define zeroones x8
29bbbf1280Sopenharmony_ci
30bbbf1280Sopenharmony_ci#define maskv	v0
31bbbf1280Sopenharmony_ci#define maskd	d0
32bbbf1280Sopenharmony_ci#define dataq1	q1
33bbbf1280Sopenharmony_ci#define dataq2	q2
34bbbf1280Sopenharmony_ci#define datav1	v1
35bbbf1280Sopenharmony_ci#define datav2	v2
36bbbf1280Sopenharmony_ci#define tmp	x2
37bbbf1280Sopenharmony_ci#define tmpw	w2
38bbbf1280Sopenharmony_ci#define synd	x3
39bbbf1280Sopenharmony_ci#define shift	x4
40bbbf1280Sopenharmony_ci
41bbbf1280Sopenharmony_ci/* For the first 32 bytes, NUL detection works on the principle that
42bbbf1280Sopenharmony_ci   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
43bbbf1280Sopenharmony_ci   byte is zero, and can be done in parallel across the entire word.  */
44bbbf1280Sopenharmony_ci
45bbbf1280Sopenharmony_ci#define REP8_01 0x0101010101010101
46bbbf1280Sopenharmony_ci#define REP8_7f 0x7f7f7f7f7f7f7f7f
47bbbf1280Sopenharmony_ci
48bbbf1280Sopenharmony_ci/* To test the page crossing code path more thoroughly, compile with
49bbbf1280Sopenharmony_ci   -DTEST_PAGE_CROSS - this will force all calls through the slower
50bbbf1280Sopenharmony_ci   entry path.  This option is not intended for production use.  */
51bbbf1280Sopenharmony_ci
52bbbf1280Sopenharmony_ci#ifdef TEST_PAGE_CROSS
53bbbf1280Sopenharmony_ci# define MIN_PAGE_SIZE 32
54bbbf1280Sopenharmony_ci#else
55bbbf1280Sopenharmony_ci# define MIN_PAGE_SIZE 4096
56bbbf1280Sopenharmony_ci#endif
57bbbf1280Sopenharmony_ci
58bbbf1280Sopenharmony_ci/* Core algorithm:
59bbbf1280Sopenharmony_ci
60bbbf1280Sopenharmony_ci   Since strings are short on average, we check the first 32 bytes of the
61bbbf1280Sopenharmony_ci   string for a NUL character without aligning the string.  In order to use
62bbbf1280Sopenharmony_ci   unaligned loads safely we must do a page cross check first.
63bbbf1280Sopenharmony_ci
64bbbf1280Sopenharmony_ci   If there is a NUL byte we calculate the length from the 2 8-byte words
65bbbf1280Sopenharmony_ci   using conditional select to reduce branch mispredictions (it is unlikely
66bbbf1280Sopenharmony_ci   strlen will be repeatedly called on strings with the same length).
67bbbf1280Sopenharmony_ci
68bbbf1280Sopenharmony_ci   If the string is longer than 32 bytes, align src so we don't need further
69bbbf1280Sopenharmony_ci   page cross checks, and process 32 bytes per iteration using a fast SIMD
70bbbf1280Sopenharmony_ci   loop.
71bbbf1280Sopenharmony_ci
72bbbf1280Sopenharmony_ci   If the page cross check fails, we read 32 bytes from an aligned address,
73bbbf1280Sopenharmony_ci   and ignore any characters before the string.  If it contains a NUL
74bbbf1280Sopenharmony_ci   character, return the length, if not, continue in the main loop.  */
75bbbf1280Sopenharmony_ci
76bbbf1280Sopenharmony_ciENTRY (__strlen_aarch64)
77bbbf1280Sopenharmony_ci	PTR_ARG (0)
78bbbf1280Sopenharmony_ci	and	tmp1, srcin, MIN_PAGE_SIZE - 1
79bbbf1280Sopenharmony_ci	cmp	tmp1, MIN_PAGE_SIZE - 32
80bbbf1280Sopenharmony_ci	b.hi	L(page_cross)
81bbbf1280Sopenharmony_ci
82bbbf1280Sopenharmony_ci	/* Look for a NUL byte in the first 16 bytes.  */
83bbbf1280Sopenharmony_ci	ldp	data1, data2, [srcin]
84bbbf1280Sopenharmony_ci	mov	zeroones, REP8_01
85bbbf1280Sopenharmony_ci
86bbbf1280Sopenharmony_ci#ifdef __AARCH64EB__
87bbbf1280Sopenharmony_ci	/* For big-endian, carry propagation (if the final byte in the
88bbbf1280Sopenharmony_ci	   string is 0x01) means we cannot use has_nul1/2 directly.
89bbbf1280Sopenharmony_ci	   Since we expect strings to be small and early-exit,
90bbbf1280Sopenharmony_ci	   byte-swap the data now so has_null1/2 will be correct.  */
91bbbf1280Sopenharmony_ci	rev	data1, data1
92bbbf1280Sopenharmony_ci	rev	data2, data2
93bbbf1280Sopenharmony_ci#endif
94bbbf1280Sopenharmony_ci	sub	tmp1, data1, zeroones
95bbbf1280Sopenharmony_ci	orr	tmp2, data1, REP8_7f
96bbbf1280Sopenharmony_ci	sub	tmp3, data2, zeroones
97bbbf1280Sopenharmony_ci	orr	tmp4, data2, REP8_7f
98bbbf1280Sopenharmony_ci	bics	has_nul1, tmp1, tmp2
99bbbf1280Sopenharmony_ci	bic	has_nul2, tmp3, tmp4
100bbbf1280Sopenharmony_ci	ccmp	has_nul2, 0, 0, eq
101bbbf1280Sopenharmony_ci	b.eq	L(bytes16_31)
102bbbf1280Sopenharmony_ci
103bbbf1280Sopenharmony_ci	/* Find the exact offset of the first NUL byte in the first 16 bytes
104bbbf1280Sopenharmony_ci	   from the string start.  Enter with C = has_nul1 == 0.  */
105bbbf1280Sopenharmony_ci	csel	has_nul1, has_nul1, has_nul2, cc
106bbbf1280Sopenharmony_ci	mov	len, 8
107bbbf1280Sopenharmony_ci	rev	has_nul1, has_nul1
108bbbf1280Sopenharmony_ci	csel	len, xzr, len, cc
109bbbf1280Sopenharmony_ci	clz	tmp1, has_nul1
110bbbf1280Sopenharmony_ci	add	len, len, tmp1, lsr 3
111bbbf1280Sopenharmony_ci	ret
112bbbf1280Sopenharmony_ci
113bbbf1280Sopenharmony_ci	.p2align 3
114bbbf1280Sopenharmony_ci	/* Look for a NUL byte at offset 16..31 in the string.  */
115bbbf1280Sopenharmony_ciL(bytes16_31):
116bbbf1280Sopenharmony_ci	ldp	data1, data2, [srcin, 16]
117bbbf1280Sopenharmony_ci#ifdef __AARCH64EB__
118bbbf1280Sopenharmony_ci	rev	data1, data1
119bbbf1280Sopenharmony_ci	rev	data2, data2
120bbbf1280Sopenharmony_ci#endif
121bbbf1280Sopenharmony_ci	sub	tmp1, data1, zeroones
122bbbf1280Sopenharmony_ci	orr	tmp2, data1, REP8_7f
123bbbf1280Sopenharmony_ci	sub	tmp3, data2, zeroones
124bbbf1280Sopenharmony_ci	orr	tmp4, data2, REP8_7f
125bbbf1280Sopenharmony_ci	bics	has_nul1, tmp1, tmp2
126bbbf1280Sopenharmony_ci	bic	has_nul2, tmp3, tmp4
127bbbf1280Sopenharmony_ci	ccmp	has_nul2, 0, 0, eq
128bbbf1280Sopenharmony_ci	b.eq	L(loop_entry)
129bbbf1280Sopenharmony_ci
130bbbf1280Sopenharmony_ci	/* Find the exact offset of the first NUL byte at offset 16..31 from
131bbbf1280Sopenharmony_ci	   the string start.  Enter with C = has_nul1 == 0.  */
132bbbf1280Sopenharmony_ci	csel	has_nul1, has_nul1, has_nul2, cc
133bbbf1280Sopenharmony_ci	mov	len, 24
134bbbf1280Sopenharmony_ci	rev	has_nul1, has_nul1
135bbbf1280Sopenharmony_ci	mov	tmp3, 16
136bbbf1280Sopenharmony_ci	clz	tmp1, has_nul1
137bbbf1280Sopenharmony_ci	csel	len, tmp3, len, cc
138bbbf1280Sopenharmony_ci	add	len, len, tmp1, lsr 3
139bbbf1280Sopenharmony_ci	ret
140bbbf1280Sopenharmony_ci
141bbbf1280Sopenharmony_ciL(loop_entry):
142bbbf1280Sopenharmony_ci	bic	src, srcin, 31
143bbbf1280Sopenharmony_ci
144bbbf1280Sopenharmony_ci	.p2align 5
145bbbf1280Sopenharmony_ciL(loop):
146bbbf1280Sopenharmony_ci	ldp	dataq1, dataq2, [src, 32]!
147bbbf1280Sopenharmony_ci	uminp	maskv.16b, datav1.16b, datav2.16b
148bbbf1280Sopenharmony_ci	uminp	maskv.16b, maskv.16b, maskv.16b
149bbbf1280Sopenharmony_ci	cmeq	maskv.8b, maskv.8b, 0
150bbbf1280Sopenharmony_ci	fmov	synd, maskd
151bbbf1280Sopenharmony_ci	cbz	synd, L(loop)
152bbbf1280Sopenharmony_ci
153bbbf1280Sopenharmony_ci	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
154bbbf1280Sopenharmony_ci	cmeq	maskv.16b, datav1.16b, 0
155bbbf1280Sopenharmony_ci	sub	len, src, srcin
156bbbf1280Sopenharmony_ci	tst	synd, 0xffffffff
157bbbf1280Sopenharmony_ci	b.ne	1f
158bbbf1280Sopenharmony_ci	cmeq	maskv.16b, datav2.16b, 0
159bbbf1280Sopenharmony_ci	add	len, len, 16
160bbbf1280Sopenharmony_ci1:
161bbbf1280Sopenharmony_ci	/* Generate a bitmask and compute correct byte offset.  */
162bbbf1280Sopenharmony_ci#ifdef __AARCH64EB__
163bbbf1280Sopenharmony_ci	bic	maskv.8h, 0xf0
164bbbf1280Sopenharmony_ci#else
165bbbf1280Sopenharmony_ci	bic	maskv.8h, 0x0f, lsl 8
166bbbf1280Sopenharmony_ci#endif
167bbbf1280Sopenharmony_ci	umaxp	maskv.16b, maskv.16b, maskv.16b
168bbbf1280Sopenharmony_ci	fmov	synd, maskd
169bbbf1280Sopenharmony_ci#ifndef __AARCH64EB__
170bbbf1280Sopenharmony_ci	rbit	synd, synd
171bbbf1280Sopenharmony_ci#endif
172bbbf1280Sopenharmony_ci	clz	tmp, synd
173bbbf1280Sopenharmony_ci	add	len, len, tmp, lsr 2
174bbbf1280Sopenharmony_ci	ret
175bbbf1280Sopenharmony_ci
176bbbf1280Sopenharmony_ci        .p2align 4
177bbbf1280Sopenharmony_ci
178bbbf1280Sopenharmony_ciL(page_cross):
179bbbf1280Sopenharmony_ci	bic	src, srcin, 31
180bbbf1280Sopenharmony_ci	mov	tmpw, 0x0c03
181bbbf1280Sopenharmony_ci	movk	tmpw, 0xc030, lsl 16
182bbbf1280Sopenharmony_ci	ld1	{datav1.16b, datav2.16b}, [src]
183bbbf1280Sopenharmony_ci	dup	maskv.4s, tmpw
184bbbf1280Sopenharmony_ci	cmeq	datav1.16b, datav1.16b, 0
185bbbf1280Sopenharmony_ci	cmeq	datav2.16b, datav2.16b, 0
186bbbf1280Sopenharmony_ci	and	datav1.16b, datav1.16b, maskv.16b
187bbbf1280Sopenharmony_ci	and	datav2.16b, datav2.16b, maskv.16b
188bbbf1280Sopenharmony_ci	addp	maskv.16b, datav1.16b, datav2.16b
189bbbf1280Sopenharmony_ci	addp	maskv.16b, maskv.16b, maskv.16b
190bbbf1280Sopenharmony_ci	fmov	synd, maskd
191bbbf1280Sopenharmony_ci	lsl	shift, srcin, 1
192bbbf1280Sopenharmony_ci	lsr	synd, synd, shift
193bbbf1280Sopenharmony_ci	cbz	synd, L(loop)
194bbbf1280Sopenharmony_ci
195bbbf1280Sopenharmony_ci	rbit	synd, synd
196bbbf1280Sopenharmony_ci	clz	len, synd
197bbbf1280Sopenharmony_ci	lsr	len, len, 1
198bbbf1280Sopenharmony_ci	ret
199bbbf1280Sopenharmony_ci
200bbbf1280Sopenharmony_ciEND (__strlen_aarch64)
201