/*
 * strcpy/stpcpy - copy a string returning pointer to start/end.
 *
 * Copyright (c) 2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "../asmdefs.h"

#define dstin		x0
#define srcin		x1
#define result		x0

#define src		x2
#define dst		x3
#define len		x4
#define synd		x4
#define tmp		x5
#define wtmp		w5
#define shift		x5
#define data1		x6
#define dataw1		w6
#define data2		x7
#define dataw2		w7

#define dataq		q0
#define vdata		v0
#define vhas_nul	v1
#define vrepmask	v2
#define vend		v3
#define dend		d3
#define dataq2		q1

#ifdef BUILD_STPCPY
# define STRCPY __stpcpy_aarch64_mte
# define IFSTPCPY(X,...) X,__VA_ARGS__
#else
# define STRCPY __strcpy_aarch64_mte
# define IFSTPCPY(X,...)
#endif
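
/* Both functions share one body: IFSTPCPY expands its arguments only when
   building stpcpy, which must return a pointer to the terminating NUL in
   the destination rather than a pointer to the start of the destination.  */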

/* Core algorithm:

   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
   per byte. For even bytes, bits 0-3 are set if the relevant byte is NUL and
   bits 4-7 must be zero; bits 4-7 are set likewise for odd bytes so that
   adjacent bytes can be merged. Since the bits in the syndrome reflect the
   order in which things occur in the original string, counting trailing
   zeros identifies exactly where the first NUL byte occurs.  */
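
/* As a rough C model of the syndrome computation above (a hypothetical
   sketch using NEON intrinsics, not part of this file and not how the code
   is built; the helper name nul_syndrome is invented for illustration):

     #include <arm_neon.h>
     #include <stdint.h>

     static uint64_t nul_syndrome (const uint8_t *chunk16)
     {
       uint8x16_t data = vld1q_u8 (chunk16);
       uint8x16_t nul = vceqzq_u8 (data);          // 0xff where byte == 0
       uint8x16_t mask = vreinterpretq_u8_u16 (vdupq_n_u16 (0xf00f));
       uint8x16_t bits = vandq_u8 (nul, mask);     // 4 syndrome bits per byte
       uint8x16_t fold = vpaddq_u8 (bits, bits);   // pairwise add: 128 -> 64
       return vgetq_lane_u64 (vreinterpretq_u64_u8 (fold), 0);
     }

   On a little-endian target, __builtin_ctzll (nul_syndrome (p)) >> 2 would
   then give the index of the first NUL byte in the chunk, provided the
   syndrome is nonzero.  */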

ENTRY (STRCPY)
	PTR_ARG (0)
	PTR_ARG (1)
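	/* Read the 16-byte aligned chunk containing the start of the string:
	   an aligned access never crosses into the next 16-byte granule, so
	   the load is MTE-safe.  Matches from bytes before the string start
	   are discarded by shifting the syndrome right by four bits for each
	   byte of misalignment (srcin * 4; lsr uses only the low six bits).  */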
	bic	src, srcin, 15
	mov	wtmp, 0xf00f
	ld1	{vdata.16b}, [src]
	dup	vrepmask.8h, wtmp
	cmeq	vhas_nul.16b, vdata.16b, 0
	lsl	shift, srcin, 2
	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	lsr	synd, synd, shift
	cbnz	synd, L(tail)

	ldr	dataq, [src, 16]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(start_loop)

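	/* The NUL is in the second chunk, so the whole string including the
	   terminator fits in at most 32 bytes.  Copy it with two loads and
	   stores that may overlap: one from the start of the string and one
	   ending exactly at the NUL, falling to the shorter paths below when
	   len < 16.  */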
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	sub	tmp, src, srcin
	clz	len, synd
	add	len, tmp, len, lsr 2
	tbz	len, 4, L(less16)
	sub	tmp, len, 15
	ldr	dataq, [srcin]
	ldr	dataq2, [srcin, tmp]
	str	dataq, [dstin]
	str	dataq2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

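	/* The terminator is in the first chunk.  The syndrome was already
	   shifted so that bit 0 corresponds to the first byte of the string;
	   rbit + clz count the trailing zeros, and dividing by four converts
	   the bit position into len, the string length.  */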
	.p2align 4,,8
L(tail):
	rbit	synd, synd
	clz	len, synd
	lsr	len, len, 2
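
	/* Copy strings of 8-15 bytes (excluding the terminator) with two
	   8-byte loads and stores that may overlap in the middle; the last
	   byte written by the second pair is the NUL itself.  */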
	.p2align 4
L(less16):
	tbz	len, 3, L(less8)
	sub	tmp, len, 7
	ldr	data1, [srcin]
	ldr	data2, [srcin, tmp]
	str	data1, [dstin]
	str	data2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

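	/* The same overlapping trick with 4-byte words handles strings of
	   3-7 bytes; anything shorter falls through to L(less4).  */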
	.p2align 4
L(less8):
	subs	tmp, len, 3
	b.lo	L(less4)
	ldr	dataw1, [srcin]
	ldr	dataw2, [srcin, tmp]
	str	dataw1, [dstin]
	str	dataw2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

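	/* Strings of 0-2 bytes: copy two bytes when the string is non-empty
	   (for a 1-byte string the second byte is the NUL itself), then
	   store the terminator at its exact position.  */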
L(less4):
	cbz	len, L(zerobyte)
	ldrh	dataw1, [srcin]
	strh	dataw1, [dstin]
L(zerobyte):
	strb	wzr, [dstin, len]
	IFSTPCPY (add result, dstin, len)
	ret

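/* All of the short cases above use the same overlapping-copy idea.  A
   hypothetical C sketch of the 8-15 byte case (illustration only, not part
   of this file; the helper name copy8_15 is invented, and memcpy stands in
   for the unaligned load/store pairs):

     #include <string.h>
     #include <stddef.h>

     // Copy a string whose length len (excluding the NUL) satisfies
     // 8 <= len <= 15, using two 8-byte moves that overlap in the middle.
     static void copy8_15 (char *dst, const char *src, size_t len)
     {
       memcpy (dst, src, 8);                      // bytes [0, 8)
       memcpy (dst + len - 7, src + len - 7, 8);  // bytes [len-7, len], incl. NUL
     }

   The last byte written by the second move is the terminator itself, so the
   whole string is copied with no byte loop and no branch on the exact size.

   Strings with no NUL in the first two source chunks reach L(start_loop)
   below: the first 16 bytes are copied unaligned, dst is advanced by the
   distance from srcin to the next source alignment boundary, and the loop
   then copies one aligned source chunk per iteration.  */
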
	.p2align 4
L(start_loop):
	sub	len, src, srcin
	ldr	dataq2, [srcin]
	add	dst, dstin, len
	str	dataq2, [dstin]

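	/* Main loop: store the previous chunk (known to be NUL-free) while
	   loading the next.  umaxp folds the 128-bit comparison result down
	   to 64 bits for a cheap zero test; the full 4-bits-per-byte
	   syndrome is only built once the loop exits.  */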
	.p2align 5
L(loop):
	str	dataq, [dst], 16
	ldr	dataq, [src, 16]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(loop)

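	/* A NUL was found: build the precise syndrome, locate the
	   terminator, and copy the final 16 bytes so that the store ends
	   exactly on the NUL, overlapping bytes already written by the
	   loop.  */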
	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
	fmov	synd, dend
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	len, synd
	lsr	len, len, 2
	sub	tmp, len, 15
	ldr	dataq, [src, tmp]
	str	dataq, [dst, tmp]
	IFSTPCPY (add result, dst, len)
	ret

END (STRCPY)