/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on the glibc cortex strings work originally authored by
 * Linaro and can be found at
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Move a buffer from src to dest (alignment handled by the hardware).
 * If dest is below src or the buffers do not overlap, tail-call memcpy;
 * otherwise copy in reverse order, starting from the end of the buffers.
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
tmp3	.req	x5
tmp3w	.req	w5
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

SYM_FUNC_START_ALIAS(__memmove)
SYM_FUNC_START_WEAK_PI(memmove)
	cmp	dstin, src
	b.lo	__memcpy
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	__memcpy		/* No overlap.  */

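	/*
	* Overlapping buffers with dest above src: copy backwards. Point
	* src and dst just past the end of each buffer and work downwards.
	*/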
	add	dst, dstin, count
	add	src, src, count
	cmp	count, #16
	b.lo	.Ltail15  /* Probably unaligned accesses. */

	ands	tmp2, src, #15     /* Bytes to reach alignment.  */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	* Copy the leading unaligned bytes first so that src becomes
	* 16-byte aligned. The cost of these extra instructions is
	* acceptable, and it ensures the following accesses use aligned
	* addresses.
	*/
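	/* tmp2 = src & 15; each tbz below tests one of its bits. */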
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:
	tbz	tmp2, #1, 2f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
2:
	tbz	tmp2, #2, 3f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64

	/*
	* Deal with small copies quickly by dropping straight into the
	* exit block.
	*/
.Ltail63:
	/*
	* Copy up to 48 bytes of data. At this point we only need the
	* bottom 6 bits of count to be accurate.
	*/
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
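	/*
	* 48 bytes left: copy 16 here and fall through 1: and 2: below.
	* 32 bytes enter at 1:, 16 bytes at 2:.
	*/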
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
1:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
2:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!

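	/*
	* Copy the remaining 0-15 bytes: 8, 4, 2, then 1, as the low bits
	* of count dictate.
	*/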
.Ltail15:
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 2f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
2:
	tbz	count, #1, 3f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
3:
	tbz	count, #0, .Lexitfunc
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]

.Lexitfunc:
	ret

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	* Less than 128 bytes to copy, so handle 64 bytes here and then jump
	* to the tail.
	*/
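	/*
	* Copy 64 bytes in four 16-byte chunks. Only the last ldp/stp pair
	* uses writeback, leaving src and dst 64 bytes lower for the tail.
	*/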
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	D_l, D_h, [src, #-64]!
	stp	D_l, D_h, [dst, #-64]!

	tst	count, #0x3f
	b.ne	.Ltail63
	ret

	/*
	* Critical loop. Start at a new cache line boundary. Assuming
	* 64 bytes per line this ensures the entire loop is in one line.
	*/
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Pre-load 64 bytes of data. */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
1:
	/*
	* Interleave the load of the next 64-byte block with the store of
	* the last block loaded.
	*/
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
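	/* Store the last 64 bytes loaded by the final loop iteration. */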
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!

	tst	count, #0x3f
	b.ne	.Ltail63
	ret
SYM_FUNC_END_PI(memmove)
EXPORT_SYMBOL(memmove)
SYM_FUNC_END_ALIAS(__memmove)
EXPORT_SYMBOL(__memmove)