/*
 * strncmp - compare two strings
 *
 * Copyright (c) 2013-2021, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 */

#include "../asmdefs.h"

/* Byte-replicated 64-bit constants used by the parallel NUL-byte test
   ((X - REP8_01) & ~(X | REP8_7f) is non-zero iff some byte of X is zero).
   REP8_80 is defined for completeness but is not referenced in this file.  */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

/* Parameters and result.  `result` shares x0 with `src1`, so src1 is dead
   once the result has been written.  */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		x0

/* Internal variables.  Only x3-x15 are used as scratch.  */
#define data1		x3	/* Current dword (or byte) from src1.  */
#define data1w		w3	/* 32-bit view of data1 for byte loads.  */
#define data2		x4	/* Current dword (or byte) from src2.  */
#define data2w		w4	/* 32-bit view of data2 for byte loads.  */
#define has_nul		x5	/* Non-zero if data1 contains a NUL byte.  */
#define diff		x6	/* data1 ^ data2.  */
#define syndrome	x7	/* diff | has_nul.  */
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define zeroones	x11	/* Holds REP8_01.  */
#define pos		x12	/* Bit position of the first syndrome bit.  */
#define limit_wd	x13	/* Remaining dword count.  */
#define mask		x14	/* Mask for bytes beyond the limit.  */
#define endloop		x15	/* Non-zero when the aligned loop must stop.  */
/* `count` aliases `mask` (both x14): count is used only while aligning or
   stepping bytewise, mask only when trimming the final dword.  */
#define count		mask

/* __strncmp_aarch64 - strncmp: compare at most `limit` bytes of two strings.

   In:      x0 = src1, x1 = src2, x2 = limit (maximum bytes to compare).
   Out:     x0 = 0 if the strings are equal up to the first NUL or up to
	    `limit` bytes; otherwise a value whose sign is that of the
	    difference between the first pair of differing bytes, taken
	    as unsigned chars.
   Scratch: x3-x15 and the condition flags.  No stack is used.

   Strategy:
   - src1 and src2 share the same offset within a dword: compare aligned
     dwords (L(loop_aligned)), rounding the pointers down first if needed
     (L(mutual_align)).
   - Otherwise, for limit < 16 compare bytewise (L(byte_loop)); for larger
     limits align src1 bytewise and compare dwords with unaligned loads
     from src2 (L(loop_misaligned)), dropping back to bytes whenever a
     src2 load would start in the last 8 bytes of a 4 KiB-aligned region.

   PTR_ARG/SIZE_ARG/ENTRY/END/L come from asmdefs.h (presumably argument
   normalisation for ILP32 and symbol/local-label boilerplate -- confirm
   against that header).  */
ENTRY (__strncmp_aarch64)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)
	cbz	limit, L(ret0)		/* Zero length always compares equal.  */
	eor	tmp1, src1, src2
	mov	zeroones, #REP8_01
	tst	tmp1, #7		/* Do the low 3 address bits differ?  */
	and	count, src1, #7		/* count = src1 offset within its dword.  */
	b.ne	L(misaligned8)
	cbnz	count, L(mutual_align)
	/* Calculate the number of full and partial words -1.  */
	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
	lsr	limit_wd, limit_wd, #3	/* Convert to Dwords.  */

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
	.p2align 4
L(loop_aligned):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned):
	subs	limit_wd, limit_wd, #1	/* Goes negative on the last dword.  */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	/* endloop = diff while dwords remain, all-ones once the count has
	   gone negative, so the loop always exits on the last dword.  */
	csinv	endloop, diff, xzr, pl	/* Last Dword or differences.  */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	/* Z is set only if there was no NUL and endloop is zero.  */
	ccmp	endloop, #0, #0, eq
	b.eq	L(loop_aligned)
	/* End of main loop */

	/* If the dword count has not gone negative we stopped because of a
	   NUL or a difference, so the whole dword is within the limit.  */
	tbz	limit_wd, #63, L(not_limit)

	/* Limit % 8 == 0 => all bytes significant.  */
	ands	limit, limit, #7
	b.eq	L(not_limit)

	lsl	limit, limit, #3	/* Bytes -> bits.  */
	mov	mask, #~0
#ifdef __AARCH64EB__
	lsr	mask, mask, limit
#else
	lsl	mask, mask, limit
#endif
	/* mask now covers the bytes beyond the limit: clear them in both
	   operands so that they cannot produce a difference.  */
	bic	data1, data1, mask
	bic	data2, data2, mask

	/* Make sure that the NUL byte is marked in the syndrome.  */
	orr	has_nul, has_nul, mask

L(not_limit):
	/* Compute the result from the first byte that is a NUL or differs.  */
	orr	syndrome, diff, has_nul

#ifndef	__AARCH64EB__
	rev	syndrome, syndrome
	rev	data1, data1
	/* The MS-non-zero bit of the syndrome marks either the first bit
	   that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
	clz	pos, syndrome
	rev	data2, data2
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#else
	/* For big-endian we cannot use the trick with the syndrome value
	   as carry-propagation can corrupt the upper bits if the trailing
	   bytes in the string contain 0x01.  */
	/* However, if there is no NUL byte in the dword, we can generate
	   the result directly.  We can't just subtract the bytes as the
	   MSB might be significant.  */
	cbnz	has_nul, 1f
	cmp	data1, data2
	cset	result, ne		/* 0 if equal, else 1 ...  */
	cneg	result, result, lo	/* ... negated if data1 < data2.  */
	ret
1:
	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
	rev	tmp3, data1
	sub	tmp1, tmp3, zeroones
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/* The MS-non-zero bit of the syndrome marks either the first bit
	   that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#endif

L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.
	   We also need to adjust the limit calculations, but without
	   overflowing if the limit is near ULONG_MAX.  */
	bic	src1, src1, #7
	bic	src2, src2, #7
	ldr	data1, [src1], #8
	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
#ifdef __AARCH64EB__
	/* Big-endian.  Early bytes are at MSB.  */
	lsl	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
#else
	/* Little-endian.  Early bytes are at LSB.  */
	lsr	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
#endif
	and	tmp3, limit_wd, #7
	lsr	limit_wd, limit_wd, #3
	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant.  */
	add	limit, limit, count
	add	tmp3, tmp3, count
	/* Force the bytes before the start point to 0xff in both operands:
	   they then compare equal and cannot look like a NUL.  */
	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	/* Account for the extra dword if the leading bytes push the end of
	   the limit into the next dword.  */
	add	limit_wd, limit_wd, tmp3, lsr #3
	b	L(start_realigned)

	.p2align 4
	/* Don't bother with dwords for limits below 16 bytes.  */
L(misaligned8):
	cmp	limit, #16
	b.hs	L(try_misaligned_words)

L(byte_loop):
	/* Perhaps we can do better than this.  */
	/* Compare `limit` (non-zero) bytes one at a time.  The loop continues
	   only while bytes remain, data1 is not NUL and the bytes match.  */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1
	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.eq	L(byte_loop)
L(done):
	/* Byte loads zero-extend, so this is the unsigned char difference.  */
	sub	result, data1, data2
	ret
	/* Align the SRC1 to a dword by doing a bytewise compare and then do
	   the dword loop.  */
L(try_misaligned_words):
	lsr	limit_wd, limit, #3
	cbz	count, L(do_misaligned)

	neg	count, count
	and	count, count, #7	/* Bytes needed to align src1.  */
	sub	limit, limit, count
	lsr	limit_wd, limit, #3

L(page_end_loop):
	/* Compare `count` bytes one at a time.  The bytes are already
	   accounted for in limit/limit_wd, so only NUL/mismatch is tested.  */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	cmp	data1w, #1
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.ne	L(done)
	subs	count, count, #1
	b.hi	L(page_end_loop)

L(do_misaligned):
	/* Prepare ourselves for the next page crossing.  Unlike the aligned
	   loop, we fetch 1 less dword because we risk crossing bounds on
	   SRC2.  */
	mov	count, #8		/* Byte count for a later page_end_loop.  */
	subs	limit_wd, limit_wd, #1
	b.lo	L(done_loop)
L(loop_misaligned):
	/* If src2 lies in the last 8-byte slot of a 4 KiB-aligned region, an
	   8-byte load would run into the next one: compare those 8 bytes
	   one at a time instead.  */
	and	tmp2, src2, #0xff8
	eor	tmp2, tmp2, #0xff8
	cbz	tmp2, L(page_end_loop)

	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	ccmp	diff, #0, #0, eq
	b.ne	L(not_limit)
	subs	limit_wd, limit_wd, #1
	b.pl	L(loop_misaligned)

L(done_loop):
	/* Every full dword within the limit compared equal and contained no
	   NUL.  Only limit % 8 trailing bytes (if any) remain.  Compare
	   them bytewise: an 8-byte load ending at the limit could read
	   src2 bytes that lie beyond a NUL and in the following page.  */
	and	limit, limit, #7
	cbnz	limit, L(byte_loop)
	/* Nothing left: the strings are equal over the whole limit.  */

L(ret0):
	mov	result, #0
	ret

END (__strncmp_aarch64)