1bbbf1280Sopenharmony_ci/*
2bbbf1280Sopenharmony_ci * strncmp - compare two strings
3bbbf1280Sopenharmony_ci *
4bbbf1280Sopenharmony_ci * Copyright (c) 2013-2021, Arm Limited.
5bbbf1280Sopenharmony_ci * SPDX-License-Identifier: MIT
6bbbf1280Sopenharmony_ci */
7bbbf1280Sopenharmony_ci
8bbbf1280Sopenharmony_ci/* Assumptions:
9bbbf1280Sopenharmony_ci *
10bbbf1280Sopenharmony_ci * ARMv8-a, AArch64
11bbbf1280Sopenharmony_ci */
12bbbf1280Sopenharmony_ci
13bbbf1280Sopenharmony_ci#include "../asmdefs.h"
14bbbf1280Sopenharmony_ci
15bbbf1280Sopenharmony_ci#define REP8_01 0x0101010101010101
16bbbf1280Sopenharmony_ci#define REP8_7f 0x7f7f7f7f7f7f7f7f
17bbbf1280Sopenharmony_ci
18bbbf1280Sopenharmony_ci/* Parameters and result.  */
19bbbf1280Sopenharmony_ci#define src1		x0
20bbbf1280Sopenharmony_ci#define src2		x1
21bbbf1280Sopenharmony_ci#define limit		x2
22bbbf1280Sopenharmony_ci#define result		x0
23bbbf1280Sopenharmony_ci
24bbbf1280Sopenharmony_ci/* Internal variables.  */
25bbbf1280Sopenharmony_ci#define data1		x3
26bbbf1280Sopenharmony_ci#define data1w		w3
27bbbf1280Sopenharmony_ci#define data2		x4
28bbbf1280Sopenharmony_ci#define data2w		w4
29bbbf1280Sopenharmony_ci#define has_nul		x5
30bbbf1280Sopenharmony_ci#define diff		x6
31bbbf1280Sopenharmony_ci#define syndrome	x7
32bbbf1280Sopenharmony_ci#define tmp1		x8
33bbbf1280Sopenharmony_ci#define tmp2		x9
34bbbf1280Sopenharmony_ci#define tmp3		x10
35bbbf1280Sopenharmony_ci#define zeroones	x11
36bbbf1280Sopenharmony_ci#define pos		x12
37bbbf1280Sopenharmony_ci#define mask		x13
38bbbf1280Sopenharmony_ci#define endloop		x14
39bbbf1280Sopenharmony_ci#define count		mask
40bbbf1280Sopenharmony_ci#define offset		pos
41bbbf1280Sopenharmony_ci#define neg_offset	x15
42bbbf1280Sopenharmony_ci
43bbbf1280Sopenharmony_ci/* Define endian dependent shift operations.
44bbbf1280Sopenharmony_ci   On big-endian early bytes are at MSB and on little-endian LSB.
45bbbf1280Sopenharmony_ci   LS_FW means shifting towards early bytes.
46bbbf1280Sopenharmony_ci   LS_BK means shifting towards later bytes.
47bbbf1280Sopenharmony_ci   */
48bbbf1280Sopenharmony_ci#ifdef __AARCH64EB__
49bbbf1280Sopenharmony_ci#define LS_FW lsl
50bbbf1280Sopenharmony_ci#define LS_BK lsr
51bbbf1280Sopenharmony_ci#else
52bbbf1280Sopenharmony_ci#define LS_FW lsr
53bbbf1280Sopenharmony_ci#define LS_BK lsl
54bbbf1280Sopenharmony_ci#endif
55bbbf1280Sopenharmony_ci
56bbbf1280Sopenharmony_ciENTRY (__strncmp_aarch64_mte)
57bbbf1280Sopenharmony_ci	PTR_ARG (0)
58bbbf1280Sopenharmony_ci	PTR_ARG (1)
59bbbf1280Sopenharmony_ci	SIZE_ARG (2)
60bbbf1280Sopenharmony_ci	cbz	limit, L(ret0)
61bbbf1280Sopenharmony_ci	eor	tmp1, src1, src2
62bbbf1280Sopenharmony_ci	mov	zeroones, #REP8_01
63bbbf1280Sopenharmony_ci	tst	tmp1, #7
64bbbf1280Sopenharmony_ci	and	count, src1, #7
65bbbf1280Sopenharmony_ci	b.ne	L(misaligned8)
66bbbf1280Sopenharmony_ci	cbnz	count, L(mutual_align)
67bbbf1280Sopenharmony_ci
68bbbf1280Sopenharmony_ci	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
69bbbf1280Sopenharmony_ci	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
70bbbf1280Sopenharmony_ci	   can be done in parallel across the entire word.  */
71bbbf1280Sopenharmony_ci	.p2align 4
72bbbf1280Sopenharmony_ciL(loop_aligned):
73bbbf1280Sopenharmony_ci	ldr	data1, [src1], #8
74bbbf1280Sopenharmony_ci	ldr	data2, [src2], #8
75bbbf1280Sopenharmony_ciL(start_realigned):
76bbbf1280Sopenharmony_ci	subs	limit, limit, #8
77bbbf1280Sopenharmony_ci	sub	tmp1, data1, zeroones
78bbbf1280Sopenharmony_ci	orr	tmp2, data1, #REP8_7f
79bbbf1280Sopenharmony_ci	eor	diff, data1, data2	/* Non-zero if differences found.  */
80bbbf1280Sopenharmony_ci	csinv	endloop, diff, xzr, hi	/* Last Dword or differences.  */
81bbbf1280Sopenharmony_ci	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
82bbbf1280Sopenharmony_ci	ccmp	endloop, #0, #0, eq
83bbbf1280Sopenharmony_ci	b.eq	L(loop_aligned)
84bbbf1280Sopenharmony_ci	/* End of main loop */
85bbbf1280Sopenharmony_ci
86bbbf1280Sopenharmony_ciL(full_check):
87bbbf1280Sopenharmony_ci#ifndef __AARCH64EB__
88bbbf1280Sopenharmony_ci	orr	syndrome, diff, has_nul
89bbbf1280Sopenharmony_ci	add	limit, limit, 8	/* Rewind limit to before last subs. */
90bbbf1280Sopenharmony_ciL(syndrome_check):
91bbbf1280Sopenharmony_ci	/* Limit was reached. Check if the NUL byte or the difference
92bbbf1280Sopenharmony_ci	   is before the limit. */
93bbbf1280Sopenharmony_ci	rev	syndrome, syndrome
94bbbf1280Sopenharmony_ci	rev	data1, data1
95bbbf1280Sopenharmony_ci	clz	pos, syndrome
96bbbf1280Sopenharmony_ci	rev	data2, data2
97bbbf1280Sopenharmony_ci	lsl	data1, data1, pos
98bbbf1280Sopenharmony_ci	cmp	limit, pos, lsr #3
99bbbf1280Sopenharmony_ci	lsl	data2, data2, pos
100bbbf1280Sopenharmony_ci	/* But we need to zero-extend (char is unsigned) the value and then
101bbbf1280Sopenharmony_ci	   perform a signed 32-bit subtraction.  */
102bbbf1280Sopenharmony_ci	lsr	data1, data1, #56
103bbbf1280Sopenharmony_ci	sub	result, data1, data2, lsr #56
104bbbf1280Sopenharmony_ci	csel result, result, xzr, hi
105bbbf1280Sopenharmony_ci	ret
106bbbf1280Sopenharmony_ci#else
107bbbf1280Sopenharmony_ci	/* Not reached the limit, must have found the end or a diff.  */
108bbbf1280Sopenharmony_ci	tbz	limit, #63, L(not_limit)
109bbbf1280Sopenharmony_ci	add	tmp1, limit, 8
110bbbf1280Sopenharmony_ci	cbz	limit, L(not_limit)
111bbbf1280Sopenharmony_ci
112bbbf1280Sopenharmony_ci	lsl	limit, tmp1, #3	/* Bits -> bytes.  */
113bbbf1280Sopenharmony_ci	mov	mask, #~0
114bbbf1280Sopenharmony_ci	lsr	mask, mask, limit
115bbbf1280Sopenharmony_ci	bic	data1, data1, mask
116bbbf1280Sopenharmony_ci	bic	data2, data2, mask
117bbbf1280Sopenharmony_ci
118bbbf1280Sopenharmony_ci	/* Make sure that the NUL byte is marked in the syndrome.  */
119bbbf1280Sopenharmony_ci	orr	has_nul, has_nul, mask
120bbbf1280Sopenharmony_ci
121bbbf1280Sopenharmony_ciL(not_limit):
122bbbf1280Sopenharmony_ci	/* For big-endian we cannot use the trick with the syndrome value
123bbbf1280Sopenharmony_ci	   as carry-propagation can corrupt the upper bits if the trailing
124bbbf1280Sopenharmony_ci	   bytes in the string contain 0x01.  */
125bbbf1280Sopenharmony_ci	/* However, if there is no NUL byte in the dword, we can generate
126bbbf1280Sopenharmony_ci	   the result directly.  We can't just subtract the bytes as the
127bbbf1280Sopenharmony_ci	   MSB might be significant.  */
128bbbf1280Sopenharmony_ci	cbnz	has_nul, 1f
129bbbf1280Sopenharmony_ci	cmp	data1, data2
130bbbf1280Sopenharmony_ci	cset	result, ne
131bbbf1280Sopenharmony_ci	cneg	result, result, lo
132bbbf1280Sopenharmony_ci	ret
133bbbf1280Sopenharmony_ci1:
134bbbf1280Sopenharmony_ci	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
135bbbf1280Sopenharmony_ci	rev	tmp3, data1
136bbbf1280Sopenharmony_ci	sub	tmp1, tmp3, zeroones
137bbbf1280Sopenharmony_ci	orr	tmp2, tmp3, #REP8_7f
138bbbf1280Sopenharmony_ci	bic	has_nul, tmp1, tmp2
139bbbf1280Sopenharmony_ci	rev	has_nul, has_nul
140bbbf1280Sopenharmony_ci	orr	syndrome, diff, has_nul
141bbbf1280Sopenharmony_ci	clz	pos, syndrome
142bbbf1280Sopenharmony_ci	/* The most-significant-non-zero bit of the syndrome marks either the
143bbbf1280Sopenharmony_ci	   first bit that is different, or the top bit of the first zero byte.
144bbbf1280Sopenharmony_ci	   Shifting left now will bring the critical information into the
145bbbf1280Sopenharmony_ci	   top bits.  */
146bbbf1280Sopenharmony_ciL(end_quick):
147bbbf1280Sopenharmony_ci	lsl	data1, data1, pos
148bbbf1280Sopenharmony_ci	lsl	data2, data2, pos
149bbbf1280Sopenharmony_ci	/* But we need to zero-extend (char is unsigned) the value and then
150bbbf1280Sopenharmony_ci	   perform a signed 32-bit subtraction.  */
151bbbf1280Sopenharmony_ci	lsr	data1, data1, #56
152bbbf1280Sopenharmony_ci	sub	result, data1, data2, lsr #56
153bbbf1280Sopenharmony_ci	ret
154bbbf1280Sopenharmony_ci#endif
155bbbf1280Sopenharmony_ci
156bbbf1280Sopenharmony_ciL(mutual_align):
157bbbf1280Sopenharmony_ci	/* Sources are mutually aligned, but are not currently at an
158bbbf1280Sopenharmony_ci	   alignment boundary.  Round down the addresses and then mask off
159bbbf1280Sopenharmony_ci	   the bytes that precede the start point.
160bbbf1280Sopenharmony_ci	   We also need to adjust the limit calculations, but without
161bbbf1280Sopenharmony_ci	   overflowing if the limit is near ULONG_MAX.  */
162bbbf1280Sopenharmony_ci	bic	src1, src1, #7
163bbbf1280Sopenharmony_ci	bic	src2, src2, #7
164bbbf1280Sopenharmony_ci	ldr	data1, [src1], #8
165bbbf1280Sopenharmony_ci	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
166bbbf1280Sopenharmony_ci	ldr	data2, [src2], #8
167bbbf1280Sopenharmony_ci	mov	tmp2, #~0
168bbbf1280Sopenharmony_ci	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
169bbbf1280Sopenharmony_ci	/* Adjust the limit and ensure it doesn't overflow.  */
170bbbf1280Sopenharmony_ci	adds	limit, limit, count
171bbbf1280Sopenharmony_ci	csinv	limit, limit, xzr, lo
172bbbf1280Sopenharmony_ci	orr	data1, data1, tmp2
173bbbf1280Sopenharmony_ci	orr	data2, data2, tmp2
174bbbf1280Sopenharmony_ci	b	L(start_realigned)
175bbbf1280Sopenharmony_ci
176bbbf1280Sopenharmony_ci	.p2align 4
177bbbf1280Sopenharmony_ci	/* Don't bother with dwords for up to 16 bytes.  */
178bbbf1280Sopenharmony_ciL(misaligned8):
179bbbf1280Sopenharmony_ci	cmp	limit, #16
180bbbf1280Sopenharmony_ci	b.hs	L(try_misaligned_words)
181bbbf1280Sopenharmony_ci
182bbbf1280Sopenharmony_ciL(byte_loop):
183bbbf1280Sopenharmony_ci	/* Perhaps we can do better than this.  */
184bbbf1280Sopenharmony_ci	ldrb	data1w, [src1], #1
185bbbf1280Sopenharmony_ci	ldrb	data2w, [src2], #1
186bbbf1280Sopenharmony_ci	subs	limit, limit, #1
187bbbf1280Sopenharmony_ci	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
188bbbf1280Sopenharmony_ci	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
189bbbf1280Sopenharmony_ci	b.eq	L(byte_loop)
190bbbf1280Sopenharmony_ciL(done):
191bbbf1280Sopenharmony_ci	sub	result, data1, data2
192bbbf1280Sopenharmony_ci	ret
193bbbf1280Sopenharmony_ci	/* Align the SRC1 to a dword by doing a bytewise compare and then do
194bbbf1280Sopenharmony_ci	   the dword loop.  */
195bbbf1280Sopenharmony_ciL(try_misaligned_words):
196bbbf1280Sopenharmony_ci	cbz	count, L(src1_aligned)
197bbbf1280Sopenharmony_ci
198bbbf1280Sopenharmony_ci	neg	count, count
199bbbf1280Sopenharmony_ci	and	count, count, #7
200bbbf1280Sopenharmony_ci	sub	limit, limit, count
201bbbf1280Sopenharmony_ci
202bbbf1280Sopenharmony_ciL(page_end_loop):
203bbbf1280Sopenharmony_ci	ldrb	data1w, [src1], #1
204bbbf1280Sopenharmony_ci	ldrb	data2w, [src2], #1
205bbbf1280Sopenharmony_ci	cmp	data1w, #1
206bbbf1280Sopenharmony_ci	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
207bbbf1280Sopenharmony_ci	b.ne	L(done)
208bbbf1280Sopenharmony_ci	subs	count, count, #1
209bbbf1280Sopenharmony_ci	b.hi	L(page_end_loop)
210bbbf1280Sopenharmony_ci
211bbbf1280Sopenharmony_ci	/* The following diagram explains the comparison of misaligned strings.
212bbbf1280Sopenharmony_ci	   The bytes are shown in natural order. For little-endian, it is
213bbbf1280Sopenharmony_ci	   reversed in the registers. The "x" bytes are before the string.
214bbbf1280Sopenharmony_ci	   The "|" separates data that is loaded at one time.
215bbbf1280Sopenharmony_ci	   src1     | a a a a a a a a | b b b c c c c c | . . .
216bbbf1280Sopenharmony_ci	   src2     | x x x x x a a a   a a a a a b b b | c c c c c . . .
217bbbf1280Sopenharmony_ci
218bbbf1280Sopenharmony_ci	   After shifting in each step, the data looks like this:
219bbbf1280Sopenharmony_ci	                STEP_A              STEP_B              STEP_C
220bbbf1280Sopenharmony_ci	   data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
221bbbf1280Sopenharmony_ci	   data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c
222bbbf1280Sopenharmony_ci
223bbbf1280Sopenharmony_ci	   The bytes with "0" are eliminated from the syndrome via mask.
224bbbf1280Sopenharmony_ci
225bbbf1280Sopenharmony_ci	   Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
226bbbf1280Sopenharmony_ci	   time from SRC2. The comparison happens in 3 steps. After each step
227bbbf1280Sopenharmony_ci	   the loop can exit, or read from SRC1 or SRC2. */
228bbbf1280Sopenharmony_ciL(src1_aligned):
229bbbf1280Sopenharmony_ci	/* Calculate offset from 8 byte alignment to string start in bits. No
230bbbf1280Sopenharmony_ci	   need to mask offset since shifts are ignoring upper bits. */
231bbbf1280Sopenharmony_ci	lsl	offset, src2, #3
232bbbf1280Sopenharmony_ci	bic	src2, src2, #0xf
233bbbf1280Sopenharmony_ci	mov	mask, -1
234bbbf1280Sopenharmony_ci	neg	neg_offset, offset
235bbbf1280Sopenharmony_ci	ldr	data1, [src1], #8
236bbbf1280Sopenharmony_ci	ldp	tmp1, tmp2, [src2], #16
237bbbf1280Sopenharmony_ci	LS_BK	mask, mask, neg_offset
238bbbf1280Sopenharmony_ci	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later. */
239bbbf1280Sopenharmony_ci	/* Skip the first compare if data in tmp1 is irrelevant. */
240bbbf1280Sopenharmony_ci	tbnz	offset, 6, L(misaligned_mid_loop)
241bbbf1280Sopenharmony_ci
242bbbf1280Sopenharmony_ciL(loop_misaligned):
243bbbf1280Sopenharmony_ci	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
244bbbf1280Sopenharmony_ci	LS_FW	data2, tmp1, offset
245bbbf1280Sopenharmony_ci	LS_BK	tmp1, tmp2, neg_offset
246bbbf1280Sopenharmony_ci	subs	limit, limit, #8
247bbbf1280Sopenharmony_ci	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.*/
248bbbf1280Sopenharmony_ci	sub	has_nul, data1, zeroones
249bbbf1280Sopenharmony_ci	eor	diff, data1, data2	/* Non-zero if differences found.  */
250bbbf1280Sopenharmony_ci	orr	tmp3, data1, #REP8_7f
251bbbf1280Sopenharmony_ci	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones. */
252bbbf1280Sopenharmony_ci	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1. */
253bbbf1280Sopenharmony_ci	orr	tmp3, endloop, has_nul
254bbbf1280Sopenharmony_ci	cbnz	tmp3, L(full_check)
255bbbf1280Sopenharmony_ci
256bbbf1280Sopenharmony_ci	ldr	data1, [src1], #8
257bbbf1280Sopenharmony_ciL(misaligned_mid_loop):
258bbbf1280Sopenharmony_ci	/* STEP_B: Compare first part of data1 to second part of tmp2. */
259bbbf1280Sopenharmony_ci	LS_FW	data2, tmp2, offset
260bbbf1280Sopenharmony_ci#ifdef __AARCH64EB__
261bbbf1280Sopenharmony_ci	/* For big-endian we do a byte reverse to avoid carry-propagation
262bbbf1280Sopenharmony_ci	problem described above. This way we can reuse the has_nul in the
263bbbf1280Sopenharmony_ci	next step and also use syndrome value trick at the end. */
264bbbf1280Sopenharmony_ci	rev	tmp3, data1
265bbbf1280Sopenharmony_ci	#define data1_fixed tmp3
266bbbf1280Sopenharmony_ci#else
267bbbf1280Sopenharmony_ci	#define data1_fixed data1
268bbbf1280Sopenharmony_ci#endif
269bbbf1280Sopenharmony_ci	sub	has_nul, data1_fixed, zeroones
270bbbf1280Sopenharmony_ci	orr	tmp3, data1_fixed, #REP8_7f
271bbbf1280Sopenharmony_ci	eor	diff, data2, data1	/* Non-zero if differences found.  */
272bbbf1280Sopenharmony_ci	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator.  */
273bbbf1280Sopenharmony_ci#ifdef __AARCH64EB__
274bbbf1280Sopenharmony_ci	rev	has_nul, has_nul
275bbbf1280Sopenharmony_ci#endif
276bbbf1280Sopenharmony_ci	cmp	limit, neg_offset, lsr #3
277bbbf1280Sopenharmony_ci	orr	syndrome, diff, has_nul
278bbbf1280Sopenharmony_ci	bic	syndrome, syndrome, mask	/* Ignore later bytes. */
279bbbf1280Sopenharmony_ci	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
280bbbf1280Sopenharmony_ci	cbnz	tmp3, L(syndrome_check)
281bbbf1280Sopenharmony_ci
282bbbf1280Sopenharmony_ci	/* STEP_C: Compare second part of data1 to first part of tmp1. */
283bbbf1280Sopenharmony_ci	ldp	tmp1, tmp2, [src2], #16
284bbbf1280Sopenharmony_ci	cmp	limit, #8
285bbbf1280Sopenharmony_ci	LS_BK	data2, tmp1, neg_offset
286bbbf1280Sopenharmony_ci	eor	diff, data2, data1	/* Non-zero if differences found.  */
287bbbf1280Sopenharmony_ci	orr	syndrome, diff, has_nul
288bbbf1280Sopenharmony_ci	and	syndrome, syndrome, mask	/* Ignore earlier bytes. */
289bbbf1280Sopenharmony_ci	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
290bbbf1280Sopenharmony_ci	cbnz	tmp3, L(syndrome_check)
291bbbf1280Sopenharmony_ci
292bbbf1280Sopenharmony_ci	ldr	data1, [src1], #8
293bbbf1280Sopenharmony_ci	sub	limit, limit, #8
294bbbf1280Sopenharmony_ci	b	L(loop_misaligned)
295bbbf1280Sopenharmony_ci
296bbbf1280Sopenharmony_ci#ifdef	__AARCH64EB__
297bbbf1280Sopenharmony_ciL(syndrome_check):
298bbbf1280Sopenharmony_ci	clz	pos, syndrome
299bbbf1280Sopenharmony_ci	cmp	pos, limit, lsl #3
300bbbf1280Sopenharmony_ci	b.lo	L(end_quick)
301bbbf1280Sopenharmony_ci#endif
302bbbf1280Sopenharmony_ci
303bbbf1280Sopenharmony_ciL(ret0):
304bbbf1280Sopenharmony_ci	mov	result, #0
305bbbf1280Sopenharmony_ci	ret
306bbbf1280Sopenharmony_ciEND(__strncmp_aarch64_mte)
307bbbf1280Sopenharmony_ci
308