1bbbf1280Sopenharmony_ci/* 2bbbf1280Sopenharmony_ci * strlen - calculate the length of a string. 3bbbf1280Sopenharmony_ci * 4bbbf1280Sopenharmony_ci * Copyright (c) 2020, Arm Limited. 5bbbf1280Sopenharmony_ci * SPDX-License-Identifier: MIT 6bbbf1280Sopenharmony_ci */ 7bbbf1280Sopenharmony_ci 8bbbf1280Sopenharmony_ci/* Assumptions: 9bbbf1280Sopenharmony_ci * 10bbbf1280Sopenharmony_ci * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. 11bbbf1280Sopenharmony_ci * Not MTE compatible. 12bbbf1280Sopenharmony_ci */ 13bbbf1280Sopenharmony_ci 14bbbf1280Sopenharmony_ci#include "../asmdefs.h" 15bbbf1280Sopenharmony_ci 16bbbf1280Sopenharmony_ci#define srcin x0 17bbbf1280Sopenharmony_ci#define len x0 18bbbf1280Sopenharmony_ci 19bbbf1280Sopenharmony_ci#define src x1 20bbbf1280Sopenharmony_ci#define data1 x2 21bbbf1280Sopenharmony_ci#define data2 x3 22bbbf1280Sopenharmony_ci#define has_nul1 x4 23bbbf1280Sopenharmony_ci#define has_nul2 x5 24bbbf1280Sopenharmony_ci#define tmp1 x4 25bbbf1280Sopenharmony_ci#define tmp2 x5 26bbbf1280Sopenharmony_ci#define tmp3 x6 27bbbf1280Sopenharmony_ci#define tmp4 x7 28bbbf1280Sopenharmony_ci#define zeroones x8 29bbbf1280Sopenharmony_ci 30bbbf1280Sopenharmony_ci#define maskv v0 31bbbf1280Sopenharmony_ci#define maskd d0 32bbbf1280Sopenharmony_ci#define dataq1 q1 33bbbf1280Sopenharmony_ci#define dataq2 q2 34bbbf1280Sopenharmony_ci#define datav1 v1 35bbbf1280Sopenharmony_ci#define datav2 v2 36bbbf1280Sopenharmony_ci#define tmp x2 37bbbf1280Sopenharmony_ci#define tmpw w2 38bbbf1280Sopenharmony_ci#define synd x3 39bbbf1280Sopenharmony_ci#define shift x4 40bbbf1280Sopenharmony_ci 41bbbf1280Sopenharmony_ci/* For the first 32 bytes, NUL detection works on the principle that 42bbbf1280Sopenharmony_ci (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a 43bbbf1280Sopenharmony_ci byte is zero, and can be done in parallel across the entire word. */ 44bbbf1280Sopenharmony_ci 45bbbf1280Sopenharmony_ci#define REP8_01 0x0101010101010101 46bbbf1280Sopenharmony_ci#define REP8_7f 0x7f7f7f7f7f7f7f7f 47bbbf1280Sopenharmony_ci 48bbbf1280Sopenharmony_ci/* To test the page crossing code path more thoroughly, compile with 49bbbf1280Sopenharmony_ci -DTEST_PAGE_CROSS - this will force all calls through the slower 50bbbf1280Sopenharmony_ci entry path. This option is not intended for production use. */ 51bbbf1280Sopenharmony_ci 52bbbf1280Sopenharmony_ci#ifdef TEST_PAGE_CROSS 53bbbf1280Sopenharmony_ci# define MIN_PAGE_SIZE 32 54bbbf1280Sopenharmony_ci#else 55bbbf1280Sopenharmony_ci# define MIN_PAGE_SIZE 4096 56bbbf1280Sopenharmony_ci#endif 57bbbf1280Sopenharmony_ci 58bbbf1280Sopenharmony_ci/* Core algorithm: 59bbbf1280Sopenharmony_ci 60bbbf1280Sopenharmony_ci Since strings are short on average, we check the first 32 bytes of the 61bbbf1280Sopenharmony_ci string for a NUL character without aligning the string. In order to use 62bbbf1280Sopenharmony_ci unaligned loads safely we must do a page cross check first. 63bbbf1280Sopenharmony_ci 64bbbf1280Sopenharmony_ci If there is a NUL byte we calculate the length from the 2 8-byte words 65bbbf1280Sopenharmony_ci using conditional select to reduce branch mispredictions (it is unlikely 66bbbf1280Sopenharmony_ci strlen will be repeatedly called on strings with the same length). 67bbbf1280Sopenharmony_ci 68bbbf1280Sopenharmony_ci If the string is longer than 32 bytes, align src so we don't need further 69bbbf1280Sopenharmony_ci page cross checks, and process 32 bytes per iteration using a fast SIMD 70bbbf1280Sopenharmony_ci loop. 71bbbf1280Sopenharmony_ci 72bbbf1280Sopenharmony_ci If the page cross check fails, we read 32 bytes from an aligned address, 73bbbf1280Sopenharmony_ci and ignore any characters before the string. If it contains a NUL 74bbbf1280Sopenharmony_ci character, return the length, if not, continue in the main loop. */ 75bbbf1280Sopenharmony_ci 76bbbf1280Sopenharmony_ciENTRY (__strlen_aarch64) 77bbbf1280Sopenharmony_ci PTR_ARG (0) 78bbbf1280Sopenharmony_ci and tmp1, srcin, MIN_PAGE_SIZE - 1 79bbbf1280Sopenharmony_ci cmp tmp1, MIN_PAGE_SIZE - 32 80bbbf1280Sopenharmony_ci b.hi L(page_cross) 81bbbf1280Sopenharmony_ci 82bbbf1280Sopenharmony_ci /* Look for a NUL byte in the first 16 bytes. */ 83bbbf1280Sopenharmony_ci ldp data1, data2, [srcin] 84bbbf1280Sopenharmony_ci mov zeroones, REP8_01 85bbbf1280Sopenharmony_ci 86bbbf1280Sopenharmony_ci#ifdef __AARCH64EB__ 87bbbf1280Sopenharmony_ci /* For big-endian, carry propagation (if the final byte in the 88bbbf1280Sopenharmony_ci string is 0x01) means we cannot use has_nul1/2 directly. 89bbbf1280Sopenharmony_ci Since we expect strings to be small and early-exit, 90bbbf1280Sopenharmony_ci byte-swap the data now so has_null1/2 will be correct. */ 91bbbf1280Sopenharmony_ci rev data1, data1 92bbbf1280Sopenharmony_ci rev data2, data2 93bbbf1280Sopenharmony_ci#endif 94bbbf1280Sopenharmony_ci sub tmp1, data1, zeroones 95bbbf1280Sopenharmony_ci orr tmp2, data1, REP8_7f 96bbbf1280Sopenharmony_ci sub tmp3, data2, zeroones 97bbbf1280Sopenharmony_ci orr tmp4, data2, REP8_7f 98bbbf1280Sopenharmony_ci bics has_nul1, tmp1, tmp2 99bbbf1280Sopenharmony_ci bic has_nul2, tmp3, tmp4 100bbbf1280Sopenharmony_ci ccmp has_nul2, 0, 0, eq 101bbbf1280Sopenharmony_ci b.eq L(bytes16_31) 102bbbf1280Sopenharmony_ci 103bbbf1280Sopenharmony_ci /* Find the exact offset of the first NUL byte in the first 16 bytes 104bbbf1280Sopenharmony_ci from the string start. Enter with C = has_nul1 == 0. */ 105bbbf1280Sopenharmony_ci csel has_nul1, has_nul1, has_nul2, cc 106bbbf1280Sopenharmony_ci mov len, 8 107bbbf1280Sopenharmony_ci rev has_nul1, has_nul1 108bbbf1280Sopenharmony_ci csel len, xzr, len, cc 109bbbf1280Sopenharmony_ci clz tmp1, has_nul1 110bbbf1280Sopenharmony_ci add len, len, tmp1, lsr 3 111bbbf1280Sopenharmony_ci ret 112bbbf1280Sopenharmony_ci 113bbbf1280Sopenharmony_ci .p2align 3 114bbbf1280Sopenharmony_ci /* Look for a NUL byte at offset 16..31 in the string. */ 115bbbf1280Sopenharmony_ciL(bytes16_31): 116bbbf1280Sopenharmony_ci ldp data1, data2, [srcin, 16] 117bbbf1280Sopenharmony_ci#ifdef __AARCH64EB__ 118bbbf1280Sopenharmony_ci rev data1, data1 119bbbf1280Sopenharmony_ci rev data2, data2 120bbbf1280Sopenharmony_ci#endif 121bbbf1280Sopenharmony_ci sub tmp1, data1, zeroones 122bbbf1280Sopenharmony_ci orr tmp2, data1, REP8_7f 123bbbf1280Sopenharmony_ci sub tmp3, data2, zeroones 124bbbf1280Sopenharmony_ci orr tmp4, data2, REP8_7f 125bbbf1280Sopenharmony_ci bics has_nul1, tmp1, tmp2 126bbbf1280Sopenharmony_ci bic has_nul2, tmp3, tmp4 127bbbf1280Sopenharmony_ci ccmp has_nul2, 0, 0, eq 128bbbf1280Sopenharmony_ci b.eq L(loop_entry) 129bbbf1280Sopenharmony_ci 130bbbf1280Sopenharmony_ci /* Find the exact offset of the first NUL byte at offset 16..31 from 131bbbf1280Sopenharmony_ci the string start. Enter with C = has_nul1 == 0. */ 132bbbf1280Sopenharmony_ci csel has_nul1, has_nul1, has_nul2, cc 133bbbf1280Sopenharmony_ci mov len, 24 134bbbf1280Sopenharmony_ci rev has_nul1, has_nul1 135bbbf1280Sopenharmony_ci mov tmp3, 16 136bbbf1280Sopenharmony_ci clz tmp1, has_nul1 137bbbf1280Sopenharmony_ci csel len, tmp3, len, cc 138bbbf1280Sopenharmony_ci add len, len, tmp1, lsr 3 139bbbf1280Sopenharmony_ci ret 140bbbf1280Sopenharmony_ci 141bbbf1280Sopenharmony_ciL(loop_entry): 142bbbf1280Sopenharmony_ci bic src, srcin, 31 143bbbf1280Sopenharmony_ci 144bbbf1280Sopenharmony_ci .p2align 5 145bbbf1280Sopenharmony_ciL(loop): 146bbbf1280Sopenharmony_ci ldp dataq1, dataq2, [src, 32]! 147bbbf1280Sopenharmony_ci uminp maskv.16b, datav1.16b, datav2.16b 148bbbf1280Sopenharmony_ci uminp maskv.16b, maskv.16b, maskv.16b 149bbbf1280Sopenharmony_ci cmeq maskv.8b, maskv.8b, 0 150bbbf1280Sopenharmony_ci fmov synd, maskd 151bbbf1280Sopenharmony_ci cbz synd, L(loop) 152bbbf1280Sopenharmony_ci 153bbbf1280Sopenharmony_ci /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */ 154bbbf1280Sopenharmony_ci cmeq maskv.16b, datav1.16b, 0 155bbbf1280Sopenharmony_ci sub len, src, srcin 156bbbf1280Sopenharmony_ci tst synd, 0xffffffff 157bbbf1280Sopenharmony_ci b.ne 1f 158bbbf1280Sopenharmony_ci cmeq maskv.16b, datav2.16b, 0 159bbbf1280Sopenharmony_ci add len, len, 16 160bbbf1280Sopenharmony_ci1: 161bbbf1280Sopenharmony_ci /* Generate a bitmask and compute correct byte offset. */ 162bbbf1280Sopenharmony_ci#ifdef __AARCH64EB__ 163bbbf1280Sopenharmony_ci bic maskv.8h, 0xf0 164bbbf1280Sopenharmony_ci#else 165bbbf1280Sopenharmony_ci bic maskv.8h, 0x0f, lsl 8 166bbbf1280Sopenharmony_ci#endif 167bbbf1280Sopenharmony_ci umaxp maskv.16b, maskv.16b, maskv.16b 168bbbf1280Sopenharmony_ci fmov synd, maskd 169bbbf1280Sopenharmony_ci#ifndef __AARCH64EB__ 170bbbf1280Sopenharmony_ci rbit synd, synd 171bbbf1280Sopenharmony_ci#endif 172bbbf1280Sopenharmony_ci clz tmp, synd 173bbbf1280Sopenharmony_ci add len, len, tmp, lsr 2 174bbbf1280Sopenharmony_ci ret 175bbbf1280Sopenharmony_ci 176bbbf1280Sopenharmony_ci .p2align 4 177bbbf1280Sopenharmony_ci 178bbbf1280Sopenharmony_ciL(page_cross): 179bbbf1280Sopenharmony_ci bic src, srcin, 31 180bbbf1280Sopenharmony_ci mov tmpw, 0x0c03 181bbbf1280Sopenharmony_ci movk tmpw, 0xc030, lsl 16 182bbbf1280Sopenharmony_ci ld1 {datav1.16b, datav2.16b}, [src] 183bbbf1280Sopenharmony_ci dup maskv.4s, tmpw 184bbbf1280Sopenharmony_ci cmeq datav1.16b, datav1.16b, 0 185bbbf1280Sopenharmony_ci cmeq datav2.16b, datav2.16b, 0 186bbbf1280Sopenharmony_ci and datav1.16b, datav1.16b, maskv.16b 187bbbf1280Sopenharmony_ci and datav2.16b, datav2.16b, maskv.16b 188bbbf1280Sopenharmony_ci addp maskv.16b, datav1.16b, datav2.16b 189bbbf1280Sopenharmony_ci addp maskv.16b, maskv.16b, maskv.16b 190bbbf1280Sopenharmony_ci fmov synd, maskd 191bbbf1280Sopenharmony_ci lsl shift, srcin, 1 192bbbf1280Sopenharmony_ci lsr synd, synd, shift 193bbbf1280Sopenharmony_ci cbz synd, L(loop) 194bbbf1280Sopenharmony_ci 195bbbf1280Sopenharmony_ci rbit synd, synd 196bbbf1280Sopenharmony_ci clz len, synd 197bbbf1280Sopenharmony_ci lsr len, len, 1 198bbbf1280Sopenharmony_ci ret 199bbbf1280Sopenharmony_ci 200bbbf1280Sopenharmony_ciEND (__strlen_aarch64) 201