1 /*
2  * strlen - calculate the length of a string.
3  *
4  * Copyright (c) 2020, Arm Limited.
5  * SPDX-License-Identifier: MIT
6  */
7 
8 /* Assumptions:
9  *
10  * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
11  * Not MTE compatible.
12  */
13 
14 #include "../asmdefs.h"
15 
/* Register aliases for the scalar first-32-bytes path.  Note the
   deliberate overlaps: srcin/len share x0 (the length overwrites the
   argument on return), and tmp1/tmp2 reuse the has_nul1/has_nul2
   registers (x4/x5) -- the temporaries are dead by the time the
   has_nul values are computed.  */
#define srcin	x0
#define len	x0

#define src	x1
#define data1	x2
#define data2	x3
#define has_nul1 x4
#define has_nul2 x5
#define tmp1	x4
#define tmp2	x5
#define tmp3	x6
#define tmp4	x7
#define zeroones x8

/* Registers for the SIMD main loop and the page-cross path.  maskd is
   the low-64-bit (d-register) view of maskv, and dataq1/dataq2 are the
   128-bit (q-register) views of datav1/datav2.  tmp/tmpw and synd
   reuse x2/x3, which are free once the scalar path is left.  */
#define maskv	v0
#define maskd	d0
#define dataq1	q1
#define dataq2	q2
#define datav1	v1
#define datav2	v2
#define tmp	x2
#define tmpw	w2
#define synd	x3
#define shift	x4

/* For the first 32 bytes, NUL detection works on the principle that
   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
   byte is zero, and can be done in parallel across the entire word.  */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* To test the page crossing code path more thoroughly, compile with
   -DTEST_PAGE_CROSS - this will force all calls through the slower
   entry path.  This option is not intended for production use.  */

#ifdef TEST_PAGE_CROSS
# define MIN_PAGE_SIZE 32
#else
# define MIN_PAGE_SIZE 4096
#endif
57 
58 /* Core algorithm:
59 
60    Since strings are short on average, we check the first 32 bytes of the
61    string for a NUL character without aligning the string.  In order to use
62    unaligned loads safely we must do a page cross check first.
63 
64    If there is a NUL byte we calculate the length from the 2 8-byte words
65    using conditional select to reduce branch mispredictions (it is unlikely
66    strlen will be repeatedly called on strings with the same length).
67 
68    If the string is longer than 32 bytes, align src so we don't need further
69    page cross checks, and process 32 bytes per iteration using a fast SIMD
70    loop.
71 
72    If the page cross check fails, we read 32 bytes from an aligned address,
73    and ignore any characters before the string.  If it contains a NUL
74    character, return the length, if not, continue in the main loop.  */
75 
ENTRY (__strlen_aarch64)
	PTR_ARG (0)
	/* The unaligned 32-byte read below is only safe if it cannot run
	   into an unmapped page; if srcin is within 32 bytes of the end
	   of a page, take the aligned page-cross path instead.  */
	and	tmp1, srcin, MIN_PAGE_SIZE - 1
	cmp	tmp1, MIN_PAGE_SIZE - 32
	b.hi	L(page_cross)

	/* Look for a NUL byte in the first 16 bytes.  */
	ldp	data1, data2, [srcin]
	mov	zeroones, REP8_01

#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.
	   Since we expect strings to be small and early-exit,
	   byte-swap the data now so has_nul1/2 will be correct.  */
	rev	data1, data1
	rev	data2, data2
#endif
	/* Per-word NUL test: (X - 0x01..01) & ~(X | 0x7f..7f) has the
	   high bit of a byte set iff that byte of X is zero.  */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	/* After this: Z = no NUL in either word; C = has_nul1 == 0
	   (BICS clears C, and the conditional compare against 0 sets C
	   only when it executes, i.e. when has_nul1 == 0).  */
	ccmp	has_nul2, 0, 0, eq
	b.eq	L(bytes16_31)

	/* Find the exact offset of the first NUL byte in the first 16 bytes
	   from the string start.  Enter with C = has_nul1 == 0.  */
	csel	has_nul1, has_nul1, has_nul2, cc	/* Word holding the NUL.  */
	mov	len, 8
	rev	has_nul1, has_nul1
	csel	len, xzr, len, cc	/* Base offset of that word: 0 or 8.  */
	clz	tmp1, has_nul1		/* 8 * index of the NUL marker byte.  */
	add	len, len, tmp1, lsr 3
	ret

	.p2align 3
	/* Look for a NUL byte at offset 16..31 in the string.  */
L(bytes16_31):
	ldp	data1, data2, [srcin, 16]
#ifdef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	/* Same per-word NUL test and flag setup as above.  */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	b.eq	L(loop_entry)		/* No NUL in the first 32 bytes.  */

	/* Find the exact offset of the first NUL byte at offset 16..31 from
	   the string start.  Enter with C = has_nul1 == 0.  */
	csel	has_nul1, has_nul1, has_nul2, cc
	mov	len, 24
	rev	has_nul1, has_nul1
	mov	tmp3, 16
	clz	tmp1, has_nul1
	csel	len, tmp3, len, cc	/* Base offset of the word: 16 or 24.  */
	add	len, len, tmp1, lsr 3
	ret

L(loop_entry):
	/* Round src down to a 32-byte boundary so the loop's loads are
	   aligned and can never cross a page.  */
	bic	src, srcin, 31

	.p2align 5
L(loop):
	/* Scan 32 bytes per iteration.  The pairwise-minimum reduction of
	   the two vectors contains a zero byte iff any of the 32 bytes
	   is zero.  */
	ldp	dataq1, dataq2, [src, 32]!
	uminp	maskv.16b, datav1.16b, datav2.16b
	uminp	maskv.16b, maskv.16b, maskv.16b
	cmeq	maskv.8b, maskv.8b, 0
	fmov	synd, maskd
	cbz	synd, L(loop)

	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
	cmeq	maskv.16b, datav1.16b, 0
	sub	len, src, srcin
	tst	synd, 0xffffffff
	b.ne	1f
	/* NUL is in the second 16 bytes; recompute the mask from datav2.  */
	cmeq	maskv.16b, datav2.16b, 0
	add	len, len, 16
1:
	/* Generate a bitmask and compute correct byte offset.  The BIC
	   trims each 0xff compare result so the pairwise-max reduction
	   packs all 16 byte results into the 64-bit syndrome, leaving
	   4 syndrome bits per input byte.  */
#ifdef __AARCH64EB__
	bic	maskv.8h, 0xf0
#else
	bic	maskv.8h, 0x0f, lsl 8
#endif
	umaxp	maskv.16b, maskv.16b, maskv.16b
	fmov	synd, maskd
#ifndef __AARCH64EB__
	rbit	synd, synd		/* Count from the first byte in memory.  */
#endif
	clz	tmp, synd
	add	len, len, tmp, lsr 2	/* 4 syndrome bits per byte.  */
	ret

	.p2align 4

L(page_cross):
	/* The unaligned read would cross a page: load 32 bytes from the
	   enclosing 32-byte aligned address (same page) and ignore any
	   bytes that precede the string start.  */
	bic	src, srcin, 31
	mov	tmpw, 0x0c03
	movk	tmpw, 0xc030, lsl 16
	ld1	{datav1.16b, datav2.16b}, [src]
	/* maskv repeats 0xc0300c03: ANDing the 0x00/0xff compare results
	   with it gives each byte a distinct 2-bit field, so the two
	   carry-free ADDP reductions pack all 32 byte-compares into one
	   64-bit syndrome, 2 bits per byte, in memory order.  */
	dup	maskv.4s, tmpw
	cmeq	datav1.16b, datav1.16b, 0
	cmeq	datav2.16b, datav2.16b, 0
	and	datav1.16b, datav1.16b, maskv.16b
	and	datav2.16b, datav2.16b, maskv.16b
	addp	maskv.16b, datav1.16b, datav2.16b
	addp	maskv.16b, maskv.16b, maskv.16b
	fmov	synd, maskd
	/* Drop syndrome bits for bytes before srcin.  Shift amounts are
	   taken modulo 64, so only (srcin & 31) * 2 is applied.  */
	lsl	shift, srcin, 1
	lsr	synd, synd, shift
	cbz	synd, L(loop)		/* No NUL at/after srcin in this block.  */

	rbit	synd, synd
	clz	len, synd
	lsr	len, len, 1		/* 2 syndrome bits per byte.  */
	ret

END (__strlen_aarch64)
201