1 /*
2  * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
3  * xthal_memcpy and xthal_bcopy
4  *
5  * This file is subject to the terms and conditions of the GNU General Public
6  * License.  See the file "COPYING" in the main directory of this archive
7  * for more details.
8  *
9  * Copyright (C) 2002 - 2012 Tensilica Inc.
10  */
11 
12 #include <linux/linkage.h>
13 #include <asm/asmmacro.h>
14 #include <asm/core.h>
15 
16 /*
17  * void *memcpy(void *dst, const void *src, size_t len);
18  *
19  * This function is intended to do the same thing as the standard
20  * library function memcpy() for most cases.
21  * However, where the source and/or destination references
22  * an instruction RAM or ROM or a data RAM or ROM, that
23  * source and/or destination will always be accessed with
24  * 32-bit load and store instructions (as required for these
25  * types of devices).
26  *
27  * !!!!!!!  XTFIXME:
28  * !!!!!!!  Handling of IRAM/IROM has not yet
29  * !!!!!!!  been implemented.
30  *
31  * The (general case) algorithm is as follows:
32  *   If destination is unaligned, align it by conditionally
33  *     copying 1 and 2 bytes.
34  *   If source is aligned,
35  *     do 16 bytes with a loop, and then finish up with
36  *     8, 4, 2, and 1 byte copies conditional on the length;
37  *   else (if source is unaligned),
38  *     do the same, but use SRC to align the source data.
39  *   This code tries to use fall-through branches for the common
40  *     case of aligned source and destination and multiple
41  *     of 4 (or 8) length.
42  *
43  * Register use:
44  *	a0/ return address
45  *	a1/ stack pointer
46  *	a2/ return value
47  *	a3/ src
48  *	a4/ length
49  *	a5/ dst
50  *	a6/ tmp
51  *	a7/ tmp
52  *	a8/ tmp
53  *	a9/ tmp
54  *	a10/ tmp
55  *	a11/ tmp
56  */
57 
58 	.text
59 
60 /*
61  * Byte by byte copy
62  */
63 	.align	4
64 	.byte	0		# 1 mod 4 alignment for LOOPNEZ
65 				# (0 mod 4 alignment for LBEG)
66 .Lbytecopy:
67 #if XCHAL_HAVE_LOOPS
68 	loopnez	a4, .Lbytecopydone
69 #else /* !XCHAL_HAVE_LOOPS */
70 	beqz	a4, .Lbytecopydone
71 	add	a7, a3, a4	# a7 = end address for source
72 #endif /* !XCHAL_HAVE_LOOPS */
73 .Lnextbyte:
74 	l8ui	a6, a3, 0
75 	addi	a3, a3, 1
76 	s8i	a6, a5, 0
77 	addi	a5, a5, 1
78 #if !XCHAL_HAVE_LOOPS
79 	bne	a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
80 #endif /* !XCHAL_HAVE_LOOPS */
81 .Lbytecopydone:
82 	abi_ret_default
83 
84 /*
85  * Destination is unaligned
86  */
87 
88 	.align	4
89 .Ldst1mod2:	# dst is only byte aligned
90 	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte
91 
92 	# copy 1 byte
93 	l8ui	a6, a3,  0
94 	addi	a3, a3,  1
95 	addi	a4, a4, -1
96 	s8i	a6, a5,  0
97 	addi	a5, a5,  1
98 	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
99 					# return to main algorithm
100 .Ldst2mod4:	# dst 16-bit aligned
101 	# copy 2 bytes
102 	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
103 	l8ui	a6, a3,  0
104 	l8ui	a7, a3,  1
105 	addi	a3, a3,  2
106 	addi	a4, a4, -2
107 	s8i	a6, a5,  0
108 	s8i	a7, a5,  1
109 	addi	a5, a5,  2
110 	j	.Ldstaligned	# dst is now aligned, return to main algorithm
111 
112 ENTRY(__memcpy)
113 WEAK(memcpy)
114 
115 	abi_entry_default
116 	# a2/ dst, a3/ src, a4/ len
117 	mov	a5, a2		# copy dst so that a2 is return value
118 .Lcommon:
119 	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
120 	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
121 .Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
122 	srli	a7, a4, 4	# number of loop iterations with 16B
123 				# per iteration
124 	movi	a8, 3		# if source is not aligned,
125 	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
126 	/*
127 	 * Destination and source are word-aligned, use word copy.
128 	 */
129 	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
130 #if XCHAL_HAVE_LOOPS
131 	loopnez	a7, .Loop1done
132 #else /* !XCHAL_HAVE_LOOPS */
133 	beqz	a7, .Loop1done
134 	slli	a8, a7, 4
135 	add	a8, a8, a3	# a8 = end of last 16B source chunk
136 #endif /* !XCHAL_HAVE_LOOPS */
137 .Loop1:
138 	l32i	a6, a3,  0
139 	l32i	a7, a3,  4
140 	s32i	a6, a5,  0
141 	l32i	a6, a3,  8
142 	s32i	a7, a5,  4
143 	l32i	a7, a3, 12
144 	s32i	a6, a5,  8
145 	addi	a3, a3, 16
146 	s32i	a7, a5, 12
147 	addi	a5, a5, 16
148 #if !XCHAL_HAVE_LOOPS
149 	bne	a3, a8, .Loop1  # continue loop if a3:src != a8:src_end
150 #endif /* !XCHAL_HAVE_LOOPS */
151 .Loop1done:
152 	bbci.l	a4, 3, .L2
153 	# copy 8 bytes
154 	l32i	a6, a3,  0
155 	l32i	a7, a3,  4
156 	addi	a3, a3,  8
157 	s32i	a6, a5,  0
158 	s32i	a7, a5,  4
159 	addi	a5, a5,  8
160 .L2:
161 	bbsi.l	a4, 2, .L3
162 	bbsi.l	a4, 1, .L4
163 	bbsi.l	a4, 0, .L5
164 	abi_ret_default
165 .L3:
166 	# copy 4 bytes
167 	l32i	a6, a3,  0
168 	addi	a3, a3,  4
169 	s32i	a6, a5,  0
170 	addi	a5, a5,  4
171 	bbsi.l	a4, 1, .L4
172 	bbsi.l	a4, 0, .L5
173 	abi_ret_default
174 .L4:
175 	# copy 2 bytes
176 	l16ui	a6, a3,  0
177 	addi	a3, a3,  2
178 	s16i	a6, a5,  0
179 	addi	a5, a5,  2
180 	bbsi.l	a4, 0, .L5
181 	abi_ret_default
182 .L5:
183 	# copy 1 byte
184 	l8ui	a6, a3,  0
185 	s8i	a6, a5,  0
186 	abi_ret_default
187 
188 /*
189  * Destination is aligned, Source is unaligned
190  */
191 
192 	.align	4
193 .Lsrcunaligned:
194 	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
195 	# copy 16 bytes per iteration for word-aligned dst and unaligned src
196 	__ssa8	a3		# set shift amount from byte offset
197 
198 /* set to 1 when running on ISS (simulator) with the
199    lint or ferret client, or 0 to save a few cycles */
200 #define SIM_CHECKS_ALIGNMENT	1
201 #if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
202 	and	a11, a3, a8	# save unalignment offset for below
203 	sub	a3, a3, a11	# align a3
204 #endif
205 	l32i	a6, a3, 0	# load first word
206 #if XCHAL_HAVE_LOOPS
207 	loopnez	a7, .Loop2done
208 #else /* !XCHAL_HAVE_LOOPS */
209 	beqz	a7, .Loop2done
210 	slli	a10, a7, 4
211 	add	a10, a10, a3	# a10 = end of last 16B source chunk
212 #endif /* !XCHAL_HAVE_LOOPS */
213 .Loop2:
214 	l32i	a7, a3,  4
215 	l32i	a8, a3,  8
216 	__src_b	a6, a6, a7
217 	s32i	a6, a5,  0
218 	l32i	a9, a3, 12
219 	__src_b	a7, a7, a8
220 	s32i	a7, a5,  4
221 	l32i	a6, a3, 16
222 	__src_b	a8, a8, a9
223 	s32i	a8, a5,  8
224 	addi	a3, a3, 16
225 	__src_b	a9, a9, a6
226 	s32i	a9, a5, 12
227 	addi	a5, a5, 16
228 #if !XCHAL_HAVE_LOOPS
229 	bne	a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
230 #endif /* !XCHAL_HAVE_LOOPS */
231 .Loop2done:
232 	bbci.l	a4, 3, .L12
233 	# copy 8 bytes
234 	l32i	a7, a3,  4
235 	l32i	a8, a3,  8
236 	__src_b	a6, a6, a7
237 	s32i	a6, a5,  0
238 	addi	a3, a3,  8
239 	__src_b	a7, a7, a8
240 	s32i	a7, a5,  4
241 	addi	a5, a5,  8
242 	mov	a6, a8
243 .L12:
244 	bbci.l	a4, 2, .L13
245 	# copy 4 bytes
246 	l32i	a7, a3,  4
247 	addi	a3, a3,  4
248 	__src_b	a6, a6, a7
249 	s32i	a6, a5,  0
250 	addi	a5, a5,  4
251 	mov	a6, a7
252 .L13:
253 #if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
254 	add	a3, a3, a11	# readjust a3 with correct misalignment
255 #endif
256 	bbsi.l	a4, 1, .L14
257 	bbsi.l	a4, 0, .L15
258 .Ldone:	abi_ret_default
259 .L14:
260 	# copy 2 bytes
261 	l8ui	a6, a3,  0
262 	l8ui	a7, a3,  1
263 	addi	a3, a3,  2
264 	s8i	a6, a5,  0
265 	s8i	a7, a5,  1
266 	addi	a5, a5,  2
267 	bbsi.l	a4, 0, .L15
268 	abi_ret_default
269 .L15:
270 	# copy 1 byte
271 	l8ui	a6, a3,  0
272 	s8i	a6, a5,  0
273 	abi_ret_default
274 
275 ENDPROC(__memcpy)
276 
277 /*
278  * void bcopy(const void *src, void *dest, size_t n);
279  */
280 
281 ENTRY(bcopy)
282 
283 	abi_entry_default
284 	# a2=src, a3=dst, a4=len
285 	mov	a5, a3
286 	mov	a3, a2
287 	mov	a2, a5
288 	j	.Lmovecommon	# go to common code for memmove+bcopy
289 
290 ENDPROC(bcopy)
291 
292 /*
293  * void *memmove(void *dst, const void *src, size_t len);
294  *
295  * This function is intended to do the same thing as the standard
296  * library function memmove() for most cases.
297  * However, where the source and/or destination references
298  * an instruction RAM or ROM or a data RAM or ROM, that
299  * source and/or destination will always be accessed with
300  * 32-bit load and store instructions (as required for these
301  * types of devices).
302  *
303  * !!!!!!!  XTFIXME:
304  * !!!!!!!  Handling of IRAM/IROM has not yet
305  * !!!!!!!  been implemented.
306  *
307  * The (general case) algorithm is as follows:
308  *   If end of source doesn't overlap destination then use memcpy.
309  *   Otherwise do memcpy backwards.
310  *
311  * Register use:
312  *	a0/ return address
313  *	a1/ stack pointer
314  *	a2/ return value
315  *	a3/ src
316  *	a4/ length
317  *	a5/ dst
318  *	a6/ tmp
319  *	a7/ tmp
320  *	a8/ tmp
321  *	a9/ tmp
322  *	a10/ tmp
323  *	a11/ tmp
324  */
325 
326 /*
327  * Byte by byte copy
328  */
329 	.align	4
330 	.byte	0		# 1 mod 4 alignment for LOOPNEZ
331 				# (0 mod 4 alignment for LBEG)
332 .Lbackbytecopy:
333 #if XCHAL_HAVE_LOOPS
334 	loopnez	a4, .Lbackbytecopydone
335 #else /* !XCHAL_HAVE_LOOPS */
336 	beqz	a4, .Lbackbytecopydone
337 	sub	a7, a3, a4	# a7 = start address for source
338 #endif /* !XCHAL_HAVE_LOOPS */
339 .Lbacknextbyte:
340 	addi	a3, a3, -1
341 	l8ui	a6, a3, 0
342 	addi	a5, a5, -1
343 	s8i	a6, a5, 0
344 #if !XCHAL_HAVE_LOOPS
345 	bne	a3, a7, .Lbacknextbyte # continue loop if
346 				       # $a3:src != $a7:src_start
347 #endif /* !XCHAL_HAVE_LOOPS */
348 .Lbackbytecopydone:
349 	abi_ret_default
350 
351 /*
352  * Destination is unaligned
353  */
354 
355 	.align	4
356 .Lbackdst1mod2:	# dst is only byte aligned
357 	_bltui	a4, 7, .Lbackbytecopy	# do short copies byte by byte
358 
359 	# copy 1 byte
360 	addi	a3, a3, -1
361 	l8ui	a6, a3,  0
362 	addi	a5, a5, -1
363 	s8i	a6, a5,  0
364 	addi	a4, a4, -1
365 	_bbci.l	a5, 1, .Lbackdstaligned	# if dst is now aligned, then
366 					# return to main algorithm
367 .Lbackdst2mod4:	# dst 16-bit aligned
368 	# copy 2 bytes
369 	_bltui	a4, 6, .Lbackbytecopy	# do short copies byte by byte
370 	addi	a3, a3, -2
371 	l8ui	a6, a3,  0
372 	l8ui	a7, a3,  1
373 	addi	a5, a5, -2
374 	s8i	a6, a5,  0
375 	s8i	a7, a5,  1
376 	addi	a4, a4, -2
377 	j	.Lbackdstaligned	# dst is now aligned,
378 					# return to main algorithm
379 
380 ENTRY(__memmove)
381 WEAK(memmove)
382 
383 	abi_entry_default
384 	# a2/ dst, a3/ src, a4/ len
385 	mov	a5, a2		# copy dst so that a2 is return value
386 .Lmovecommon:
387 	sub	a6, a5, a3
388 	bgeu	a6, a4, .Lcommon
389 
390 	add	a5, a5, a4
391 	add	a3, a3, a4
392 
393 	_bbsi.l	a5, 0, .Lbackdst1mod2	# if dst is 1 mod 2
394 	_bbsi.l	a5, 1, .Lbackdst2mod4	# if dst is 2 mod 4
395 .Lbackdstaligned:	# return here from .Lbackdst?mod? once dst is aligned
396 	srli	a7, a4, 4	# number of loop iterations with 16B
397 				# per iteration
398 	movi	a8, 3		# if source is not aligned,
399 	_bany	a3, a8, .Lbacksrcunaligned	# then use shifting copy
400 	/*
401 	 * Destination and source are word-aligned, use word copy.
402 	 */
403 	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
404 #if XCHAL_HAVE_LOOPS
405 	loopnez	a7, .backLoop1done
406 #else /* !XCHAL_HAVE_LOOPS */
407 	beqz	a7, .backLoop1done
408 	slli	a8, a7, 4
409 	sub	a8, a3, a8	# a8 = start of first 16B source chunk
410 #endif /* !XCHAL_HAVE_LOOPS */
411 .backLoop1:
412 	addi	a3, a3, -16
413 	l32i	a7, a3, 12
414 	l32i	a6, a3,  8
415 	addi	a5, a5, -16
416 	s32i	a7, a5, 12
417 	l32i	a7, a3,  4
418 	s32i	a6, a5,  8
419 	l32i	a6, a3,  0
420 	s32i	a7, a5,  4
421 	s32i	a6, a5,  0
422 #if !XCHAL_HAVE_LOOPS
423 	bne	a3, a8, .backLoop1  # continue loop if a3:src != a8:src_start
424 #endif /* !XCHAL_HAVE_LOOPS */
425 .backLoop1done:
426 	bbci.l	a4, 3, .Lback2
427 	# copy 8 bytes
428 	addi	a3, a3, -8
429 	l32i	a6, a3,  0
430 	l32i	a7, a3,  4
431 	addi	a5, a5, -8
432 	s32i	a6, a5,  0
433 	s32i	a7, a5,  4
434 .Lback2:
435 	bbsi.l	a4, 2, .Lback3
436 	bbsi.l	a4, 1, .Lback4
437 	bbsi.l	a4, 0, .Lback5
438 	abi_ret_default
439 .Lback3:
440 	# copy 4 bytes
441 	addi	a3, a3, -4
442 	l32i	a6, a3,  0
443 	addi	a5, a5, -4
444 	s32i	a6, a5,  0
445 	bbsi.l	a4, 1, .Lback4
446 	bbsi.l	a4, 0, .Lback5
447 	abi_ret_default
448 .Lback4:
449 	# copy 2 bytes
450 	addi	a3, a3, -2
451 	l16ui	a6, a3,  0
452 	addi	a5, a5, -2
453 	s16i	a6, a5,  0
454 	bbsi.l	a4, 0, .Lback5
455 	abi_ret_default
456 .Lback5:
457 	# copy 1 byte
458 	addi	a3, a3, -1
459 	l8ui	a6, a3,  0
460 	addi	a5, a5, -1
461 	s8i	a6, a5,  0
462 	abi_ret_default
463 
464 /*
465  * Destination is aligned, Source is unaligned
466  */
467 
468 	.align	4
469 .Lbacksrcunaligned:
470 	_beqz	a4, .Lbackdone	# avoid loading anything for zero-length copies
471 	# copy 16 bytes per iteration for word-aligned dst and unaligned src
472 	__ssa8	a3		# set shift amount from byte offset
473 #define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS with
474 					 * the lint or ferret client, or 0
475 					 * to save a few cycles */
476 #if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
477 	and	a11, a3, a8	# save unalignment offset for below
478 	sub	a3, a3, a11	# align a3
479 #endif
480 	l32i	a6, a3, 0	# load first word
481 #if XCHAL_HAVE_LOOPS
482 	loopnez	a7, .backLoop2done
483 #else /* !XCHAL_HAVE_LOOPS */
484 	beqz	a7, .backLoop2done
485 	slli	a10, a7, 4
486 	sub	a10, a3, a10	# a10 = start of first 16B source chunk
487 #endif /* !XCHAL_HAVE_LOOPS */
488 .backLoop2:
489 	addi	a3, a3, -16
490 	l32i	a7, a3, 12
491 	l32i	a8, a3,  8
492 	addi	a5, a5, -16
493 	__src_b	a6, a7, a6
494 	s32i	a6, a5, 12
495 	l32i	a9, a3,  4
496 	__src_b	a7, a8, a7
497 	s32i	a7, a5,  8
498 	l32i	a6, a3,  0
499 	__src_b	a8, a9, a8
500 	s32i	a8, a5,  4
501 	__src_b	a9, a6, a9
502 	s32i	a9, a5,  0
503 #if !XCHAL_HAVE_LOOPS
504 	bne	a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start
505 #endif /* !XCHAL_HAVE_LOOPS */
506 .backLoop2done:
507 	bbci.l	a4, 3, .Lback12
508 	# copy 8 bytes
509 	addi	a3, a3, -8
510 	l32i	a7, a3,  4
511 	l32i	a8, a3,  0
512 	addi	a5, a5, -8
513 	__src_b	a6, a7, a6
514 	s32i	a6, a5,  4
515 	__src_b	a7, a8, a7
516 	s32i	a7, a5,  0
517 	mov	a6, a8
518 .Lback12:
519 	bbci.l	a4, 2, .Lback13
520 	# copy 4 bytes
521 	addi	a3, a3, -4
522 	l32i	a7, a3,  0
523 	addi	a5, a5, -4
524 	__src_b	a6, a7, a6
525 	s32i	a6, a5,  0
526 	mov	a6, a7
527 .Lback13:
528 #if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
529 	add	a3, a3, a11	# readjust a3 with correct misalignment
530 #endif
531 	bbsi.l	a4, 1, .Lback14
532 	bbsi.l	a4, 0, .Lback15
533 .Lbackdone:
534 	abi_ret_default
535 .Lback14:
536 	# copy 2 bytes
537 	addi	a3, a3, -2
538 	l8ui	a6, a3,  0
539 	l8ui	a7, a3,  1
540 	addi	a5, a5, -2
541 	s8i	a6, a5,  0
542 	s8i	a7, a5,  1
543 	bbsi.l	a4, 0, .Lback15
544 	abi_ret_default
545 .Lback15:
546 	# copy 1 byte
547 	addi	a3, a3, -1
548 	addi	a5, a5, -1
549 	l8ui	a6, a3,  0
550 	s8i	a6, a5,  0
551 	abi_ret_default
552 
553 ENDPROC(__memmove)
554