/*
 * strcpy/stpcpy - copy a string returning pointer to start/end.
 *
 * Copyright (c) 2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "../asmdefs.h"

/* Arguments and return value.  result aliases dstin (both x0), so the
   strcpy build returns the destination without touching x0.  */
#define dstin		x0
#define srcin		x1
#define result		x0

/* Scratch registers.  Several names share one register and are never live
   at the same time: len/synd are both x4, tmp/wtmp/shift are all x5.  */
#define src		x2
#define dst		x3
#define len		x4
#define synd		x4
#define tmp		x5
#define wtmp		w5
#define shift		x5
#define data1		x6
#define dataw1		w6
#define data2		x7
#define dataw2		w7

/* Vector registers.  dataq/vdata are two views of v0; vend/dend are two
   views of v3; dataq2 is the q view of v1 and therefore shares a register
   with vhas_nul.  */
#define dataq		q0
#define vdata		v0
#define vhas_nul	v1
#define vrepmask	v2
#define vend		v3
#define dend		d3
#define dataq2		q1

/* IFSTPCPY (insn) emits insn only in the stpcpy build; in the strcpy build
   it expands to nothing.  The variadic form re-joins the comma-separated
   operands that the preprocessor splits into separate macro arguments.  */
#ifdef BUILD_STPCPY
# define STRCPY __stpcpy_aarch64_mte
# define IFSTPCPY(X,...) X,__VA_ARGS__
#else
# define STRCPY __strcpy_aarch64_mte
# define IFSTPCPY(X,...)
#endif

/* Core algorithm:

   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
   per byte.  For even bytes, bits 0-3 are set if the relevant byte is NUL;
   bits 4-7 must be zero.  Bits 4-7 are set likewise for odd bytes so that
   adjacent bytes can be merged.  Since the bits in the syndrome reflect the
   order in which things occur in the original string, counting trailing
   zeros identifies exactly which byte is the NUL terminator.  */

/* char *STRCPY (char *dstin, const char *srcin)

   In:    x0 = dstin, x1 = srcin.
   Out:   x0 = dstin in the strcpy build; in the BUILD_STPCPY build
	  x0 = dstin + length of the string, i.e. the address of the NUL
	  terminator that was written.
   Uses:  x2-x7, v0-v3 and the condition flags as scratch.  No stack.

   Every load below is either a 16-byte aligned chunk or lies entirely
   within bytes already known to be part of the string (up to and including
   its NUL), and stores only ever cover string bytes.  Short strings and
   the final partial chunk are copied with two overlapping fixed-size
   accesses instead of a byte loop.

   ENTRY, END, L and PTR_ARG come from asmdefs.h and are not visible here;
   PTR_ARG presumably normalises pointer arguments for ILP32 - TODO confirm.  */

ENTRY (STRCPY)
	PTR_ARG (0)
	PTR_ARG (1)
	bic	src, srcin, 15		/* src = srcin rounded down to 16 bytes.  */
	mov	wtmp, 0xf00f
	ld1	{vdata.16b}, [src]	/* Aligned chunk containing srcin.  */
	dup	vrepmask.8h, wtmp	/* Per halfword 0xf00f: low/high nibble mask for adjacent bytes.  */
	cmeq	vhas_nul.16b, vdata.16b, 0	/* 0xff in every lane holding NUL.  */
	lsl	shift, srcin, 2		/* 4 syndrome bits per byte; lsr only uses shift mod 64.  */
	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b	/* Merge byte pairs: 128->64.  */
	fmov	synd, dend
	lsr	synd, synd, shift	/* Drop the bytes that precede srcin.  */
	cbnz	synd, L(tail)		/* NUL within the first chunk.  */

	/* No NUL in the first chunk: examine the next aligned chunk.  */
	ldr	dataq, [src, 16]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(start_loop)	/* Still no NUL: at least 17 bytes to copy.  */

	/* NUL found in the second chunk.  len = offset of the NUL from srcin,
	   which is in the range 1..31 here.  */
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	sub	tmp, src, srcin		/* Bytes of the string held in the first chunk (1..16).  */
	clz	len, synd
	add	len, tmp, len, lsr 2
	tbz	len, 4, L(less16)
	/* 16 <= len <= 31: copy the first 16 bytes and the 16 bytes that end
	   with the NUL; the two ranges may overlap.  */
	sub	tmp, len, 15
	ldr	dataq, [srcin]
	ldr	dataq2, [srcin, tmp]
	str	dataq, [dstin]
	str	dataq2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

	/* Align to 16 bytes only if that costs at most 8 bytes of padding.  */
	.p2align 4,,8
L(tail):
	/* NUL found in the first chunk; synd is already shifted so that bit 0
	   corresponds to srcin.  The first chunk was loaded with ld1, which
	   puts memory byte i in lane i; that is presumably why this rbit is
	   not guarded by __AARCH64EB__ like the other two - NOTE(review):
	   confirm for big-endian builds.  */
	rbit	synd, synd
	clz	len, synd
	lsr	len, len, 2		/* len = offset of the NUL from srcin (0..15).  */

	.p2align 4
L(less16):
	/* len < 16.  Also reached by fall-through from L(tail).  */
	tbz	len, 3, L(less8)
	/* 8 <= len <= 15: two overlapping 8-byte copies, the second one
	   ending with the NUL.  */
	sub	tmp, len, 7
	ldr	data1, [srcin]
	ldr	data2, [srcin, tmp]
	str	data1, [dstin]
	str	data2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

	.p2align 4
L(less8):
	subs	tmp, len, 3
	b.lo	L(less4)		/* len < 3.  */
	/* 3 <= len <= 7: two overlapping 4-byte copies, the second one
	   ending with the NUL.  */
	ldr	dataw1, [srcin]
	ldr	dataw2, [srcin, tmp]
	str	dataw1, [dstin]
	str	dataw2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

L(less4):
	/* len is 0, 1 or 2.  For 1 or 2 copy the first two bytes, then write
	   the terminator explicitly at dstin + len.  */
	cbz	len, L(zerobyte)
	ldrh	dataw1, [srcin]
	strh	dataw1, [dstin]
L(zerobyte):
	strb	wzr, [dstin, len]
	IFSTPCPY (add result, dstin, len)
	ret

	.p2align 4
L(start_loop):
	/* The first two aligned chunks hold no NUL, so the 16 bytes starting
	   at srcin are all string bytes: copy them unaligned.  dst is set so
	   that dst - dstin == src - srcin, keeping the destination in step
	   with the aligned source pointer.  */
	sub	len, src, srcin
	ldr	dataq2, [srcin]
	add	dst, dstin, len
	str	dataq2, [dstin]

	.p2align 5
L(loop):
	/* Invariant: dataq holds the NUL-free chunk at src, not yet stored.  */
	str	dataq, [dst], 16
	ldr	dataq, [src, 16]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b	/* Cheap any-NUL test.  */
	fmov	synd, dend
	cbz	synd, L(loop)

	/* The chunk at src contains the NUL: build the exact syndrome to find
	   its position.  */
	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
	fmov	synd, dend
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	len, synd
	lsr	len, len, 2		/* len = offset of the NUL within this chunk (0..15).  */
	/* Copy the 16 bytes that end with the NUL; tmp <= 0, so this overlaps
	   bytes that have already been copied.  */
	sub	tmp, len, 15
	ldr	dataq, [src, tmp]
	str	dataq, [dst, tmp]
	IFSTPCPY (add result, dst, len)
	ret

END (STRCPY)