/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2013-2021, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/98e4d6a5c13c8e54/string/aarch64/strlen.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/mte-def.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
 */

/* Turn a bare identifier into a .L-prefixed (non-exported) label name.  */
#define L(label) .L ## label

/* Arguments and results.  srcin and len deliberately share x0: the
   incoming pointer is overwritten by the result only once it is no
   longer needed.  */
#define srcin		x0
#define len		x0

/* Locals and temporaries.  Note the aliasing: has_nul1/tmp1 are both x4
   and has_nul2/tmp2 are both x5, so writing a syndrome destroys the
   corresponding temporary.  */
#define src		x1
#define data1		x2
#define data2		x3
#define has_nul1	x4
#define has_nul2	x5
#define tmp1		x4
#define tmp2		x5
#define tmp3		x6
#define tmp4		x7
#define zeroones	x8

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word. A faster check
	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
	   false hits for characters 129..255.	*/

/* One byte value replicated into all eight bytes of a 64-bit word.  */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

/*
 * When KASAN_HW_TAGS is in use, memory is checked at MTE_GRANULE_SIZE
 * (16-byte) granularity, and we must ensure that no access straddles this
 * alignment boundary.
 */
#ifdef CONFIG_KASAN_HW_TAGS
#define MIN_PAGE_SIZE MTE_GRANULE_SIZE
#else
#define MIN_PAGE_SIZE 4096
#endif

/*
 * Since strings are short on average, we check the first 16 bytes
 * of the string for a NUL character.  In order to do an unaligned ldp
 * safely we have to do a page cross check first.  If there is a NUL
 * byte we calculate the length from the 2 8-byte words using
 * conditional select to reduce branch mispredictions (it is unlikely
 * strlen will be repeatedly called on strings with the same length).
 *
 * If the string is longer than 16 bytes, we align src so we don't need
 * further page cross checks, and process 32 bytes per iteration
 * using the fast NUL check.  If we encounter non-ASCII characters,
 * we fall back to a second loop using the full NUL check.
 *
 * If the page cross check fails, we read 16 bytes from an aligned
 * address, remove any characters before the string, and continue
 * in the main loop using aligned loads.  Since strings crossing a
 * page in the first 16 bytes are rare (probability of
 * 16/4096 ~= 0.4% when MIN_PAGE_SIZE is 4096), this case does not
 * need to be optimized.  (When MIN_PAGE_SIZE is the 16-byte MTE
 * granule, every string that is not 16-byte aligned takes this path.)
 *
 * AArch64 systems have a minimum page size of 4k.  We don't bother
 * checking for larger page sizes - the cost of setting up the correct
 * page size is just not worth the extra gain from a small reduction in
 * the cases taking the slow path.  Note that we only care about
 * whether the first fetch, which may be misaligned, crosses a page
 * boundary.
 */

/*
 * __pi_strlen - length of a NUL-terminated string.
 *
 * In:   x0 (srcin) = address of the first byte of the string.
 * Out:  x0 (len)   = number of bytes preceding the first NUL byte.
 * Uses: x1-x8 as scratch and the condition flags; the stack is not touched.
 *
 * Also published as strlen through a weak alias.
 */
SYM_FUNC_START(__pi_strlen)
	/* tmp1 = offset of srcin inside its MIN_PAGE_SIZE-sized block.  */
	and	tmp1, srcin, MIN_PAGE_SIZE - 1
	mov	zeroones, REP8_01
	/* An unaligned 16-byte load from srcin would run past the block
	   boundary when that offset exceeds MIN_PAGE_SIZE - 16.  */
	cmp	tmp1, MIN_PAGE_SIZE - 16
	b.gt	L(page_cross)
	ldp	data1, data2, [srcin]
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.
	   Since we expect strings to be small and early-exit,
	   byte-swap the data now so has_nul1/2 will be correct.  */
	rev	data1, data1
	rev	data2, data2
#endif
	/* Accurate check on both words:
	   has_nulN = (dataN - REP8_01) & ~(dataN | REP8_7f).  */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	/* If has_nul1 == 0, compare has_nul2 with 0 (which also sets C);
	   otherwise force NZCV to 0.  Z is then set only when neither
	   word contains a NUL.  */
	ccmp	has_nul2, 0, 0, eq
	beq	L(main_loop_entry)

	/* Enter with C = has_nul1 == 0.  So "cc" (carry clear) selects
	   the first word and a base length of 0, otherwise the second
	   word and a base length of 8.  */
	csel	has_nul1, has_nul1, has_nul2, cc
	mov	len, 8
	/* Byte-reverse the syndrome so the marker of the first NUL is the
	   most significant one; clz / 8 is then its byte index.  */
	rev	has_nul1, has_nul1
	clz	tmp1, has_nul1
	csel	len, xzr, len, cc
	add	len, len, tmp1, lsr 3
	ret

	/* The inner loop processes 32 bytes per iteration and uses the fast
	   NUL check.  If we encounter non-ASCII characters, use a second
	   loop with the accurate NUL check.  */
	.p2align 4
L(main_loop_entry):
	/* Round srcin down to 16 bytes and bias by -16 so that the
	   pre-indexed load below first fetches from (srcin & ~15) + 16.  */
	bic	src, srcin, 15
	sub	src, src, 16
L(main_loop):
	/* Pre-index: src += 32, then load 16 bytes from src.  */
	ldp	data1, data2, [src, 32]!
L(page_cross_entry):
	/* Fast check: any bit 7 set in (data - REP8_01) for either word?
	   (zeroones lsl 7 is REP8_80.)  */
	sub	tmp1, data1, zeroones
	sub	tmp3, data2, zeroones
	orr	tmp2, tmp1, tmp3
	tst	tmp2, zeroones, lsl 7
	bne	1f
	/* Second 16-byte half of this iteration; src is not advanced.  */
	ldp	data1, data2, [src, 16]
	sub	tmp1, data1, zeroones
	sub	tmp3, data2, zeroones
	orr	tmp2, tmp1, tmp3
	tst	tmp2, zeroones, lsl 7
	beq	L(main_loop)
	/* The hit was in the second half: make src point at it.  */
	add	src, src, 16
1:
	/* The fast check failed, so do the slower, accurate NUL check.
	   tmp1/tmp3 still hold data1/data2 - REP8_01 for the 16 bytes
	   at src.  */
	orr	tmp2, data1, REP8_7f
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	beq	L(nonascii_loop)

	/* Enter with C = has_nul1 == 0.  */
L(tail):
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.  The
	   easiest way to get the correct byte is to byte-swap the data
	   and calculate the syndrome a second time.  */
	csel	data1, data1, data2, cc
	rev	data1, data1
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	bic	has_nul1, tmp1, tmp2
#else
	csel	has_nul1, has_nul1, has_nul2, cc
#endif
	/* len = bytes from srcin to the 16-byte chunk at src, plus 8 if
	   the NUL is in the second word, plus the byte index of the NUL
	   within the selected word.  */
	sub	len, src, srcin
	rev	has_nul1, has_nul1
	add	tmp2, len, 8
	clz	tmp1, has_nul1
	csel	len, len, tmp2, cc
	add	len, len, tmp1, lsr 3
	ret

	/* Same walk as the main loop, 16 bytes per load and two loads per
	   iteration, but using only the accurate NUL check.  */
L(nonascii_loop):
	ldp	data1, data2, [src, 16]!
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	bne	L(tail)
	ldp	data1, data2, [src, 16]!
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	beq	L(nonascii_loop)
	b	L(tail)

	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
	   srcin to 0x7f, so we ignore any NUL bytes before the string.
	   Then continue in the aligned loop.  */
L(page_cross):
	bic	src, srcin, 15
	ldp	data1, data2, [src]
	/* tmp1 = srcin * 8; the register shifts below use only its low
	   six bits, i.e. the bit offset of srcin within an 8-byte word.  */
	lsl	tmp1, srcin, 3
	mov	tmp4, -1
#ifdef __AARCH64EB__
	/* Big-endian.	Early bytes are at MSB.	 */
	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
#else
	/* Little-endian.  Early bytes are at LSB.  */
	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
#endif
	/* ~tmp1 becomes 0x7f in each byte before the string and 0x00 in
	   every other byte; OR it into the data.  */
	orr	tmp1, tmp1, REP8_80
	orn	data1, data1, tmp1
	orn	tmp2, data2, tmp1
	/* Bit 3 of srcin clear: the string starts in the first word, so
	   use the masked data1 and the untouched data2.  Bit 3 set: it
	   starts in the second word, so make data1 all-ones and use the
	   masked copy of data2.  */
	tst	srcin, 8
	csel	data1, data1, tmp4, eq
	csel	data2, data2, tmp2, eq
	b	L(page_cross_entry)
SYM_FUNC_END(__pi_strlen)
SYM_FUNC_ALIAS_WEAK(strlen, __pi_strlen)
EXPORT_SYMBOL_NOKASAN(strlen)