/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2013-2021, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/98e4d6a5c13c8e54/string/aarch64/strlen.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/mte-def.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
 */

#define L(label) .L ## label

/* Arguments and results.  */
#define srcin		x0
#define len		x0

/*
 * Locals and temporaries.
 *
 * Note that has_nul1/has_nul2 deliberately share registers with tmp1/tmp2:
 * the syndrome words overwrite the temporaries they were computed from.
 */
#define src		x1
#define data1		x2
#define data2		x3
#define has_nul1	x4
#define has_nul2	x5
#define tmp1		x4
#define tmp2		x5
#define tmp3		x6
#define tmp4		x7
#define zeroones	x8

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  A faster check
	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
	   false hits for characters 129..255.  */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

/*
 * When KASAN_HW_TAGS is in use, memory is checked at MTE_GRANULE_SIZE
 * (16-byte) granularity, and we must ensure that no access straddles this
 * alignment boundary.
 *
 * With a 16-byte MIN_PAGE_SIZE the entry check below compares
 * (srcin & 15) against 0, so every srcin that is not 16-byte aligned is
 * routed through L(page_cross) and only aligned 16-byte loads are issued.
 */
#ifdef CONFIG_KASAN_HW_TAGS
#define MIN_PAGE_SIZE MTE_GRANULE_SIZE
#else
#define MIN_PAGE_SIZE 4096
#endif

	/* Since strings are short on average, we check the first 16 bytes
	   of the string for a NUL character.  In order to do an unaligned ldp
	   safely we have to do a page cross check first.  If there is a NUL
	   byte we calculate the length from the 2 8-byte words using
	   conditional select to reduce branch mispredictions (it is unlikely
	   strlen will be repeatedly called on strings with the same length).

	   If the string is longer than 16 bytes, we align src so don't need
	   further page cross checks, and process 32 bytes per iteration
	   using the fast NUL check.  If we encounter non-ASCII characters,
	   fallback to a second loop using the full NUL check.

	   If the page cross check fails, we read 16 bytes from an aligned
	   address, remove any characters before the string, and continue
	   in the main loop using aligned loads.  Since strings crossing a
	   page in the first 16 bytes are rare (probability of
	   16/MIN_PAGE_SIZE ~= 0.4% for the 4096-byte MIN_PAGE_SIZE), this
	   case does not need to be optimized.

	   AArch64 systems have a minimum page size of 4k.  We don't bother
	   checking for larger page sizes - the cost of setting up the correct
	   page size is just not worth the extra gain from a small reduction in
	   the cases taking the slow path.  Note that we only care about
	   whether the first fetch, which may be misaligned, crosses a page
	   boundary.  */

/*
 * __pi_strlen / strlen
 *
 * In:    x0 (srcin) = address of the first byte of the string
 * Out:   x0 (len)   = number of bytes preceding the first NUL byte
 * Clobb: x1-x8 and the condition flags
 *
 * Leaf routine: no stack accesses and no calls are made.  Loads are done
 * 16 bytes at a time, so bytes beyond the terminating NUL may be read; the
 * page-cross check and the 16-byte alignment of src keep each such load
 * inside a single MIN_PAGE_SIZE-aligned region.
 */
SYM_FUNC_START(__pi_strlen)
	/* Would an unaligned 16-byte load at srcin straddle MIN_PAGE_SIZE? */
	and	tmp1, srcin, MIN_PAGE_SIZE - 1
	mov	zeroones, REP8_01
	cmp	tmp1, MIN_PAGE_SIZE - 16
	b.gt	L(page_cross)
	ldp	data1, data2, [srcin]
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.
	   Since we expect strings to be small and early-exit,
	   byte-swap the data now so has_nul1/2 will be correct.  */
	rev	data1, data1
	rev	data2, data2
#endif
	/* Accurate check: has_nulN = (dataN - 0x01..) & ~(dataN | 0x7f..). */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	/*
	 * bics leaves Z = (has_nul1 == 0) and C = 0.  If Z is set, ccmp
	 * compares has_nul2 with 0 (which sets C); otherwise it loads
	 * NZCV = 0.  Result: Z set iff both syndromes are zero, and C set
	 * iff has_nul1 == 0.
	 */
	ccmp	has_nul2, 0, 0, eq
	beq	L(main_loop_entry)

	/* Enter with C = has_nul1 == 0.  */
	csel	has_nul1, has_nul1, has_nul2, cc
	mov	len, 8
	/* Byte-reverse so clz counts from the first string byte. */
	rev	has_nul1, has_nul1
	clz	tmp1, has_nul1
	/* Base offset: 0 if the NUL was in data1, 8 if it was in data2. */
	csel	len, xzr, len, cc
	add	len, len, tmp1, lsr 3
	ret

	/* The inner loop processes 32 bytes per iteration and uses the fast
	   NUL check.  If we encounter non-ASCII characters, use a second
	   loop with the accurate NUL check.  */
	.p2align 4
L(main_loop_entry):
	/*
	 * Bias src to (srcin & ~15) - 16 so that the pre-indexed load below
	 * (src += 32) first fetches from (srcin & ~15) + 16.
	 */
	bic	src, srcin, 15
	sub	src, src, 16
L(main_loop):
	ldp	data1, data2, [src, 32]!
L(page_cross_entry):
	/* Fast check on the 16 bytes at [src]. */
	sub	tmp1, data1, zeroones
	sub	tmp3, data2, zeroones
	orr	tmp2, tmp1, tmp3
	tst	tmp2, zeroones, lsl 7
	bne	1f
	/* Fast check on the 16 bytes at [src + 16] (no writeback). */
	ldp	data1, data2, [src, 16]
	sub	tmp1, data1, zeroones
	sub	tmp3, data2, zeroones
	orr	tmp2, tmp1, tmp3
	tst	tmp2, zeroones, lsl 7
	beq	L(main_loop)
	/* Hit in the second half: make src point at the data just loaded. */
	add	src, src, 16
1:
	/* The fast check failed, so do the slower, accurate NUL check.  */
	orr	tmp2, data1, REP8_7f
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	/* Both syndromes zero: the fast check gave a false hit. */
	beq	L(nonascii_loop)

	/* Enter with C = has_nul1 == 0.  */
L(tail):
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.  The
	   easiest way to get the correct byte is to byte-swap the data
	   and calculate the syndrome a second time.  */
	csel	data1, data1, data2, cc
	rev	data1, data1
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	bic	has_nul1, tmp1, tmp2
#else
	csel	has_nul1, has_nul1, has_nul2, cc
#endif
	/* len = offset of data1 from srcin (+8 when the NUL is in data2). */
	sub	len, src, srcin
	rev	has_nul1, has_nul1
	add	tmp2, len, 8
	clz	tmp1, has_nul1
	csel	len, len, tmp2, cc
	add	len, len, tmp1, lsr 3
	ret

	/* Accurate-check loop, 16 bytes per load, unrolled twice.  */
L(nonascii_loop):
	ldp	data1, data2, [src, 16]!
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	bne	L(tail)
	ldp	data1, data2, [src, 16]!
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	beq	L(nonascii_loop)
	b	L(tail)

	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
	   srcin to 0x7f, so we ignore any NUL bytes before the string.
	   Then continue in the aligned loop.  */
L(page_cross):
	bic	src, srcin, 15
	ldp	data1, data2, [src]
	/* tmp1 = bit offset of srcin within its 8-byte word (low 6 bits). */
	lsl	tmp1, srcin, 3
	mov	tmp4, -1
#ifdef __AARCH64EB__
	/* Big-endian.  Early bytes are at MSB.  */
	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
#else
	/* Little-endian.  Early bytes are at LSB.  */
	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
#endif
	/*
	 * tmp1 is now 0xff for bytes at or after srcin and, after the orr,
	 * 0x80 for the bytes before it, so ~tmp1 ORs 0x7f into exactly the
	 * preceding bytes.
	 */
	orr	tmp1, tmp1, REP8_80
	orn	data1, data1, tmp1
	orn	tmp2, data2, tmp1
	/*
	 * srcin bit 3 clear: the string starts in data1, data2 is untouched.
	 * srcin bit 3 set: the whole of data1 precedes the string (force it
	 * to all-ones) and the masked word is data2.
	 */
	tst	srcin, 8
	csel	data1, data1, tmp4, eq
	csel	data2, data2, tmp2, eq
	b	L(page_cross_entry)
SYM_FUNC_END(__pi_strlen)
SYM_FUNC_ALIAS_WEAK(strlen, __pi_strlen)
EXPORT_SYMBOL_NOKASAN(strlen)