1 /*
2  * strncmp - compare two strings, examining at most a given number of bytes
3  *
4  * Copyright (c) 2013-2021, Arm Limited.
5  * SPDX-License-Identifier: MIT
6  */
7 
8 /* Assumptions:
9  *
10  * ARMv8-a, AArch64
11  */
12 
13 #include "../asmdefs.h"
14 
/* 64-bit constants holding the same byte value in each of their eight
   bytes.  REP8_01 and REP8_7f feed the dword-at-a-time NUL detection
   ((X - REP8_01) & ~(X | REP8_7f)); REP8_80 is not referenced by the
   code visible in this file.  */
15 #define REP8_01 0x0101010101010101
16 #define REP8_7f 0x7f7f7f7f7f7f7f7f
17 #define REP8_80 0x8080808080808080
18 
19 /* Parameters and result.  result shares x0 with src1, so it is only
   written on the way out, once src1 is no longer needed.  */
20 #define src1		x0
21 #define src2		x1
22 #define limit		x2
23 #define result		x0
24 
25 /* Internal variables.  All live in x3-x15.
   data1/data2	current dword (or zero-extended byte) from src1/src2;
		data1w/data2w are the 32-bit views used by ldrb and ccmp.
   has_nul	non-zero when data1 contains a NUL byte.
   diff		data1 ^ data2, non-zero when the dwords differ.
   syndrome	diff | has_nul, locates the first byte of interest.
   tmp1-tmp3	short-lived temporaries.
   zeroones	holds REP8_01.
   pos		bit position taken from the syndrome with clz.
   limit_wd	limit expressed as a count of dwords.
   mask		bit mask covering the bytes beyond the limit.
   endloop	loop exit condition of the aligned loop.
   count	byte counter; an alias of mask (both are x14), so the two
		must not be live at the same time.  */
26 #define data1		x3
27 #define data1w		w3
28 #define data2		x4
29 #define data2w		w4
30 #define has_nul		x5
31 #define diff		x6
32 #define syndrome	x7
33 #define tmp1		x8
34 #define tmp2		x9
35 #define tmp3		x10
36 #define zeroones	x11
37 #define pos		x12
38 #define limit_wd	x13
39 #define mask		x14
40 #define endloop		x15
41 #define count		mask
42 
/* __strncmp_aarch64: compare the strings at src1 and src2, examining at
   most limit bytes.

   In:   x0 = src1, x1 = src2, x2 = limit.
   Out:  x0 = 0 if limit is 0 or no difference is found before the limit
	 or the first NUL in src1; otherwise a value whose sign orders the
	 first differing bytes, taken as unsigned chars.  Most paths return
	 the byte difference; the big-endian path for a dword without a
	 NUL returns -1, 0 or 1.
   Uses: x3-x15 and the condition flags as scratch.

   PTR_ARG and SIZE_ARG come from asmdefs.h, which is not visible here;
   presumably they normalise the pointer and size argument registers --
   confirm there.

   Paths:
   - src1 and src2 agree in their low 3 address bits: compare a dword at
     a time in L(loop_aligned), entering through L(mutual_align) when the
     pointers are not on an 8-byte boundary.
   - otherwise, limit < 16: compare bytewise in L(byte_loop).
   - otherwise: compare bytewise until src1 is 8-byte aligned, then a
     dword at a time in L(loop_misaligned), dropping back to bytes when
     an 8-byte load from src2 could cross a 4 KiB boundary.  */
43 ENTRY (__strncmp_aarch64)
44 	PTR_ARG (0)
45 	PTR_ARG (1)
46 	SIZE_ARG (2)
47 	cbz	limit, L(ret0)		/* Nothing to compare.  */
48 	eor	tmp1, src1, src2
49 	mov	zeroones, #REP8_01
50 	tst	tmp1, #7		/* Same offset within a dword?  */
51 	and	count, src1, #7		/* Offset of src1 within its dword.  */
	/* The and above leaves the flags alone, so b.ne acts on the tst.  */
52 	b.ne	L(misaligned8)
53 	cbnz	count, L(mutual_align)
54 	/* Calculate the number of full and partial words -1.  */
55 	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
56 	lsr	limit_wd, limit_wd, #3	/* Convert to Dwords.  */
57 
58 	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
59 	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
60 	   can be done in parallel across the entire word.  */
61 	.p2align 4
62 L(loop_aligned):
63 	ldr	data1, [src1], #8
64 	ldr	data2, [src2], #8
65 L(start_realigned):
	/* The subs sets mi once the dword holding the limit has been
	   consumed.  sub, orr and eor leave the flags alone, so csinv
	   still sees that result: endloop is diff while dwords remain
	   and all-ones otherwise.  */
66 	subs	limit_wd, limit_wd, #1
67 	sub	tmp1, data1, zeroones
68 	orr	tmp2, data1, #REP8_7f
69 	eor	diff, data1, data2	/* Non-zero if differences found.  */
70 	csinv	endloop, diff, xzr, pl	/* Last Dword or differences.  */
71 	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	/* eq survives only if there is no NUL and endloop is zero.  */
72 	ccmp	endloop, #0, #0, eq
73 	b.eq	L(loop_aligned)
74 	/* End of main loop */
75 
76 	/* Not reached the limit, must have found the end or a diff.  */
77 	tbz	limit_wd, #63, L(not_limit)
78 
79 	/* Limit % 8 == 0 => all bytes significant.  */
80 	ands	limit, limit, #7
81 	b.eq	L(not_limit)
82 
	/* Build a mask of the bytes that lie beyond the limit in this
	   dword and clear them in both data words.  */
83 	lsl	limit, limit, #3	/* Bytes -> bits.  */
84 	mov	mask, #~0
85 #ifdef __AARCH64EB__
86 	lsr	mask, mask, limit
87 #else
88 	lsl	mask, mask, limit
89 #endif
90 	bic	data1, data1, mask
91 	bic	data2, data2, mask
92 
93 	/* Mark the bytes beyond the limit in has_nul so that the syndrome
	   stops the comparison there, as if a NUL had been found.  */
94 	orr	has_nul, has_nul, mask
95 
	/* Common exit for the dword paths: data1, data2, diff and has_nul
	   describe the dword in which the comparison ends.  */
96 L(not_limit):
97 	orr	syndrome, diff, has_nul
98 
99 #ifndef	__AARCH64EB__
	/* Byte-reverse so that the first string byte becomes the most
	   significant one.  */
100 	rev	syndrome, syndrome
101 	rev	data1, data1
102 	/* The MS-non-zero bit of the syndrome marks either the first bit
103 	   that is different, or the top bit of the first zero byte.
104 	   Shifting left now will bring the critical information into the
105 	   top bits.  */
106 	clz	pos, syndrome
107 	rev	data2, data2
108 	lsl	data1, data1, pos
109 	lsl	data2, data2, pos
110 	/* But we need to zero-extend (char is unsigned) the value and then
111 	   perform a signed 32-bit subtraction.  */
112 	lsr	data1, data1, #56
113 	sub	result, data1, data2, lsr #56
114 	ret
115 #else
116 	/* For big-endian we cannot use the trick with the syndrome value
117 	   as carry-propagation can corrupt the upper bits if the trailing
118 	   bytes in the string contain 0x01.  */
119 	/* However, if there is no NUL byte in the dword, we can generate
120 	   the result directly.  We can't just subtract the bytes as the
121 	   MSB might be significant.  */
122 	cbnz	has_nul, 1f
123 	cmp	data1, data2
124 	cset	result, ne
125 	cneg	result, result, lo
126 	ret
127 1:
128 	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
129 	rev	tmp3, data1
130 	sub	tmp1, tmp3, zeroones
131 	orr	tmp2, tmp3, #REP8_7f
132 	bic	has_nul, tmp1, tmp2
133 	rev	has_nul, has_nul
134 	orr	syndrome, diff, has_nul
135 	clz	pos, syndrome
136 	/* The MS-non-zero bit of the syndrome marks either the first bit
137 	   that is different, or the top bit of the first zero byte.
138 	   Shifting left now will bring the critical information into the
139 	   top bits.  */
140 	lsl	data1, data1, pos
141 	lsl	data2, data2, pos
142 	/* But we need to zero-extend (char is unsigned) the value and then
143 	   perform a signed 32-bit subtraction.  */
144 	lsr	data1, data1, #56
145 	sub	result, data1, data2, lsr #56
146 	ret
147 #endif
148 
149 L(mutual_align):
150 	/* Sources are mutually aligned, but are not currently at an
151 	   alignment boundary.  Round down the addresses and then mask off
152 	   the bytes that precede the start point.
153 	   We also need to adjust the limit calculations, but without
154 	   overflowing if the limit is near ULONG_MAX.  */
155 	bic	src1, src1, #7
156 	bic	src2, src2, #7
157 	ldr	data1, [src1], #8
158 	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
159 	ldr	data2, [src2], #8
160 	mov	tmp2, #~0
161 	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
162 #ifdef __AARCH64EB__
163 	/* Big-endian.  Early bytes are at MSB.  */
164 	lsl	tmp2, tmp2, tmp3	/* Shift by tmp3 & 63 = 64 - 8 * count.  */
165 #else
166 	/* Little-endian.  Early bytes are at LSB.  */
167 	lsr	tmp2, tmp2, tmp3	/* Shift by tmp3 & 63 = 64 - 8 * count.  */
168 #endif
	/* limit_wd becomes (limit - 1 + count) / 8, built from the split
	   quotient and remainder so that the sum cannot overflow.  */
169 	and	tmp3, limit_wd, #7
170 	lsr	limit_wd, limit_wd, #3
171 	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant.  */
172 	add	limit, limit, count
173 	add	tmp3, tmp3, count
	/* Force the bytes that precede the start point to 0xff in both
	   words: they then compare equal and cannot look like a NUL.  */
174 	orr	data1, data1, tmp2
175 	orr	data2, data2, tmp2
176 	add	limit_wd, limit_wd, tmp3, lsr #3
177 	b	L(start_realigned)
178 
179 	.p2align 4
180 	/* Don't bother with dwords for fewer than 16 bytes.  */
181 L(misaligned8):
182 	cmp	limit, #16
183 	b.hs	L(try_misaligned_words)
184 
185 L(byte_loop):
186 	/* Perhaps we can do better than this.  */
187 	ldrb	data1w, [src1], #1
188 	ldrb	data2w, [src2], #1
	/* hi after the subs means more bytes remain within the limit.  The
	   first ccmp then leaves cs only if data1 is not NUL, and the
	   second leaves eq only if the two bytes also match.  */
189 	subs	limit, limit, #1
190 	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
191 	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
192 	b.eq	L(byte_loop)
	/* Reached only after ldrb loads, so data1 and data2 hold
	   zero-extended bytes and the subtraction is the byte difference.  */
193 L(done):
194 	sub	result, data1, data2
195 	ret
196 	/* Align the SRC1 to a dword by doing a bytewise compare and then do
197 	   the dword loop.  */
198 L(try_misaligned_words):
199 	lsr	limit_wd, limit, #3
200 	cbz	count, L(do_misaligned)	/* src1 is already aligned.  */
201 
	/* count = bytes needed to bring src1 to an 8-byte boundary.  They
	   are deducted from limit up front, so the byte loop below does
	   not need to check the limit.  */
202 	neg	count, count
203 	and	count, count, #7
204 	sub	limit, limit, count
205 	lsr	limit_wd, limit, #3
206 
	/* Compare count bytes one at a time, leaving through L(done) on a
	   NUL in src1 or on a difference.  Also entered from
	   L(loop_misaligned) with count = 8; those 8 bytes are accounted
	   for by the limit_wd decrement in L(do_misaligned).  */
207 L(page_end_loop):
208 	ldrb	data1w, [src1], #1
209 	ldrb	data2w, [src2], #1
210 	cmp	data1w, #1		/* cs iff data1 is not NUL.  */
211 	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
212 	b.ne	L(done)
213 	subs	count, count, #1
214 	b.hi	L(page_end_loop)
215 
216 L(do_misaligned):
217 	/* Prepare ourselves for the next page crossing.  Unlike the aligned
218 	   loop, we fetch 1 less dword because we risk crossing bounds on
219 	   SRC2.  */
220 	mov	count, #8
221 	subs	limit_wd, limit_wd, #1
222 	b.lo	L(done_loop)		/* No full dword left.  */
223 L(loop_misaligned):
	/* If (src2 & 0xff8) == 0xff8, src2 lies within the last 8 bytes
	   before a 4 KiB boundary and an 8-byte load could cross it;
	   compare those 8 bytes bytewise instead.  */
224 	and	tmp2, src2, #0xff8
225 	eor	tmp2, tmp2, #0xff8
226 	cbz	tmp2, L(page_end_loop)
227 
228 	ldr	data1, [src1], #8
229 	ldr	data2, [src2], #8
230 	sub	tmp1, data1, zeroones
231 	orr	tmp2, data1, #REP8_7f
232 	eor	diff, data1, data2	/* Non-zero if differences found.  */
233 	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
234 	ccmp	diff, #0, #0, eq
235 	b.ne	L(not_limit)
236 	subs	limit_wd, limit_wd, #1
237 	b.pl	L(loop_misaligned)
238 
239 L(done_loop):
240 	/* All full dwords matched without a NUL; only the limit % 8
	   trailing bytes, if any, remain to be compared.  */
241 	and	limit, limit, #7
242 	cbz	limit, L(not_limit)
243 	/* Read the last word: the 8 bytes that end exactly at the limit.
	   This re-reads bytes already compared, but never reads beyond
	   the limit.  */
244 	sub	src1, src1, 8
245 	sub	src2, src2, 8
246 	ldr	data1, [src1, limit]
247 	ldr	data2, [src2, limit]
248 	sub	tmp1, data1, zeroones
249 	orr	tmp2, data1, #REP8_7f
250 	eor	diff, data1, data2	/* Non-zero if differences found.  */
251 	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
252 	ccmp	diff, #0, #0, eq
253 	b.ne	L(not_limit)
254 
	/* Strings are equal within the limit.  */
255 L(ret0):
256 	mov	result, #0
257 	ret
258 
259 END ( __strncmp_aarch64)
260 
261