/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2022 Michael T. Kloos <michael@michaelkloos.com>
 */

#include <linux/linkage.h>
#include <asm/asm.h>

SYM_FUNC_START(__memmove)
SYM_FUNC_START_WEAK(memmove)
	/*
	 * Returns
	 *   a0 - dest
	 *
	 * Parameters
	 *   a0 - Inclusive first byte of dest
	 *   a1 - Inclusive first byte of src
	 *   a2 - Length of copy n
	 *
	 * Because the return matches the parameter register a0,
	 * we will not clobber or modify that register.
	 *
	 * Note: This currently only works on little-endian.
	 * To port to big-endian, reverse the direction of shifts
	 * in the 2 misaligned fixup copy loops.
	 */
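	/*
	 * For orientation, the behaviour implemented below corresponds
	 * roughly to this C sketch (illustrative only, not the generic
	 * kernel implementation):
	 *
	 *	void *memmove(void *dest, const void *src, size_t n)
	 *	{
	 *		if (dest < src)
	 *			copy forward, starting at byte 0;
	 *		else if (dest > src)
	 *			copy backward, starting at byte n - 1;
	 *		return dest;
	 *	}
	 */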

	/* Return if nothing to do */
	beq a0, a1, return_from_memmove
	beqz a2, return_from_memmove

	/*
	 * Register Uses
	 *      Forward Copy: a1 - Index counter of src
	 *      Reverse Copy: a4 - Index counter of src
	 *      Forward Copy: t3 - Index counter of dest
	 *      Reverse Copy: t4 - Index counter of dest
	 *   Both Copy Modes: t5 - Inclusive first multibyte/aligned address of dest
	 *   Both Copy Modes: t6 - Non-Inclusive last multibyte/aligned address of dest
	 *   Both Copy Modes: t0 - Link / Temporary for load-store
	 *   Both Copy Modes: t1 - Temporary for load-store
	 *   Both Copy Modes: t2 - Temporary for load-store
	 *   Both Copy Modes: a5 - dest to src alignment offset
	 *   Both Copy Modes: a6 - Shift amount
	 *   Both Copy Modes: a7 - Inverse shift amount
	 *   Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
	 */

	/*
	 * Solve for some register values now.
	 * Byte copy does not need t5 or t6.
	 */
	mv   t3, a0
	add  t4, a0, a2
	add  a4, a1, a2

	/*
	 * Byte copy if copying less than (2 * SZREG) bytes. This can
	 * cause problems with the bulk copy implementation and is
	 * small enough not to bother.
	 */
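	/*
	 * In rough C terms the test below is (illustrative sketch;
	 * masking n with -(2 * SZREG) clears the low bits, so a zero
	 * result means n < 2 * SZREG):
	 *
	 *	if ((n & ~((2 * SZREG) - 1)) == 0)
	 *		goto byte_copy;
	 */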
	andi t0, a2, -(2 * SZREG)
	beqz t0, byte_copy

	/*
	 * Now solve for t5 and t6.
	 */
	andi t5, t3, -SZREG
	andi t6, t4, -SZREG
	/*
	 * If dest (register t3), rounded down to the nearest naturally
	 * aligned SZREG address, does not equal dest, then add SZREG
	 * to find the low bound of SZREG alignment in the dest memory
	 * region.  Note that this could overshoot the dest memory
	 * region if n is less than SZREG.  This is one reason why
	 * we always byte copy if n is less than SZREG.
	 * Otherwise, dest is already naturally aligned to SZREG.
	 */
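	/*
	 * Taken together with the andi above, the round-up is roughly
	 * (illustrative C sketch):
	 *
	 *	t5 = dest & ~(SZREG - 1);
	 *	if (t5 != dest)
	 *		t5 += SZREG;
	 */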
	beq  t5, t3, 1f
		addi t5, t5, SZREG
	1:

	/*
	 * If the dest and src are co-aligned to SZREG, then there is
	 * no need for the full rigmarole of a misaligned fixup copy.
	 * Instead, do a simpler co-aligned copy.
	 */
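	/*
	 * The co-alignment test below is roughly (illustrative sketch):
	 *
	 *	if ((((uintptr_t)dest ^ (uintptr_t)src) & (SZREG - 1)) == 0)
	 *		goto coaligned_copy;
	 */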
	xor  t0, a0, a1
	andi t1, t0, (SZREG - 1)
	beqz t1, coaligned_copy
	/* Fall through to misaligned fixup copy */

misaligned_fixup_copy:
	bltu a1, a0, misaligned_fixup_copy_reverse

misaligned_fixup_copy_forward:
	jal  t0, byte_copy_until_aligned_forward

	andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub  a5, a1, t3 /* Find the difference between src and dest */
	andi a1, a1, -SZREG /* Align the src pointer */
	addi a2, t6, SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * 2s complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN.  XLEN = SZREG * 8.
	 */
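	/*
	 * Worked example (illustrative, assuming RV64 so SZREG = 8):
	 * if src is 3 bytes past an SZREG boundary, a5 = 3 and
	 * a6 = 3 * 8 = 24, so a7 = ~24 + 65 = 64 - 24 = 40.  Each store
	 * below then combines (low word >> 24) with (high word << 40).
	 */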
	not  a7, a6
	addi a7, a7, (SZREG * 8 + 1)

	/*
	 * Fix Misalignment Copy Loop - Forward
	 * load_val0 = load_ptr[0];
	 * do {
	 * 	load_val1 = load_ptr[1];
	 * 	store_ptr += 2;
	 * 	store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 * 	if (store_ptr == {a2})
	 * 		break;
	 *
	 * 	load_val0 = load_ptr[2];
	 * 	load_ptr += 2;
	 * 	store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L t0, (0 * SZREG)(a1)
	1:
	REG_L t1, (1 * SZREG)(a1)
	addi  t3, t3, (2 * SZREG)
	srl   t0, t0, a6
	sll   t2, t1, a7
	or    t2, t0, t2
	REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)

	beq   t3, a2, 2f

	REG_L t0, (2 * SZREG)(a1)
	addi  a1, a1, (2 * SZREG)
	srl   t1, t1, a6
	sll   t2, t0, a7
	or    t2, t1, t2
	REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)

	bne   t3, t6, 1b
	2:
	mv    t3, t6 /* Fix the dest pointer in case the loop was broken */

	add  a1, t3, a5 /* Restore the src pointer */
	j byte_copy_forward /* Copy any remaining bytes */

misaligned_fixup_copy_reverse:
	jal  t0, byte_copy_until_aligned_reverse

	andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub  a5, a4, t4 /* Find the difference between src and dest */
	andi a4, a4, -SZREG /* Align the src pointer */
	addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * 2s complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN.  XLEN = SZREG * 8.
	 */
	not  a7, a6
	addi a7, a7, (SZREG * 8 + 1)

	/*
	 * Fix Misalignment Copy Loop - Reverse
	 * load_val1 = load_ptr[0];
	 * do {
	 * 	load_val0 = load_ptr[-1];
	 * 	store_ptr -= 2;
	 * 	store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 * 	if (store_ptr == {a2})
	 * 		break;
	 *
	 * 	load_val1 = load_ptr[-2];
	 * 	load_ptr -= 2;
	 * 	store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L t1, ( 0 * SZREG)(a4)
	1:
	REG_L t0, (-1 * SZREG)(a4)
	addi  t4, t4, (-2 * SZREG)
	sll   t1, t1, a7
	srl   t2, t0, a6
	or    t2, t1, t2
	REG_S t2, ( 1 * SZREG)(t4)

	beq   t4, a2, 2f

	REG_L t1, (-2 * SZREG)(a4)
	addi  a4, a4, (-2 * SZREG)
	sll   t0, t0, a7
	srl   t2, t1, a6
	or    t2, t0, t2
	REG_S t2, ( 0 * SZREG)(t4)

	bne   t4, t5, 1b
	2:
	mv    t4, t5 /* Fix the dest pointer in case the loop was broken */

	add  a4, t4, a5 /* Restore the src pointer */
	j byte_copy_reverse /* Copy any remaining bytes */

/*
 * Simple copy loops for SZREG co-aligned memory locations.
 * These also make calls to do byte copies for any unaligned
 * data at their terminations.
 */
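/*
 * In rough C terms the forward case below is (illustrative sketch,
 * treating t3, t6 and a1 as byte pointers):
 *
 *	do {
 *		*(unsigned long *)t3 = *(unsigned long *)a1;
 *		a1 += SZREG;
 *		t3 += SZREG;
 *	} while (t3 != t6);
 *
 * The reverse case mirrors this, walking both pointers downward.
 */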
coaligned_copy:
	bltu a1, a0, coaligned_copy_reverse

coaligned_copy_forward:
	jal t0, byte_copy_until_aligned_forward

	1:
	REG_L t1, ( 0 * SZREG)(a1)
	addi  a1, a1, SZREG
	addi  t3, t3, SZREG
	REG_S t1, (-1 * SZREG)(t3)
	bne   t3, t6, 1b

	j byte_copy_forward /* Copy any remaining bytes */

coaligned_copy_reverse:
	jal t0, byte_copy_until_aligned_reverse

	1:
	REG_L t1, (-1 * SZREG)(a4)
	addi  a4, a4, -SZREG
	addi  t4, t4, -SZREG
	REG_S t1, ( 0 * SZREG)(t4)
	bne   t4, t5, 1b

	j byte_copy_reverse /* Copy any remaining bytes */

/*
 * These are basically sub-functions within the function.  They
 * are used to byte copy until the dest pointer is in alignment,
 * at which point a bulk copy method can be used by the
 * calling code.  These work on the same registers as the bulk
 * copy loops.  Therefore, the register values can be picked
 * up from where they were left and we avoid code duplication
 * without any overhead except the call in and return jumps.
 */
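/*
 * The call pattern used here (shown for the forward helper):
 *
 *	jal  t0, byte_copy_until_aligned_forward   # caller, link in t0
 *	...
 *	jalr zero, 0x0(t0)                          # helper returns via t0
 *
 * Using t0 as an alternate link register leaves ra untouched, so the
 * helpers can be reached without spilling the real return address.
 */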
byte_copy_until_aligned_forward:
	beq  t3, t5, 2f
	1:
	lb   t1,  0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb   t1, -1(t3)
	bne  t3, t5, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

byte_copy_until_aligned_reverse:
	beq  t4, t6, 2f
	1:
	lb   t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb   t1,  0(t4)
	bne  t4, t6, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

/*
 * Simple byte copy loops.
 * These will byte copy until they reach the end of data to copy.
 * At that point, they will call to return from memmove.
 */
byte_copy:
	bltu a1, a0, byte_copy_reverse

byte_copy_forward:
	beq  t3, t4, 2f
	1:
	lb   t1,  0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb   t1, -1(t3)
	bne  t3, t4, 1b
	2:
	ret

byte_copy_reverse:
	beq  t4, t3, 2f
	1:
	lb   t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb   t1,  0(t4)
	bne  t4, t3, 1b
	2:

return_from_memmove:
	ret

SYM_FUNC_END(memmove)
SYM_FUNC_END(__memmove)
SYM_FUNC_ALIAS(__pi_memmove, __memmove)
SYM_FUNC_ALIAS(__pi___memmove, __memmove)