/*
 * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
 * xthal_memcpy and xthal_bcopy
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2012 Tensilica Inc.
 */

#include <linux/linkage.h>
#include <asm/asmmacro.h>
#include <asm/core.h>

/*
 * void *memcpy(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() in most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If the destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If the source is aligned,
 *     do 16 bytes with a loop, and then finish up with
 *     8, 4, 2, and 1 byte copies conditional on the length;
 *   else (if the source is unaligned),
 *     do the same, but use SRC to align the source data.
 *   This code tries to use fall-through branches for the common
 *     case of an aligned source and destination and a length
 *     that is a multiple of 4 (or 8).
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */
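
/*
 * For orientation only, a rough C sketch of the dispatch logic described
 * above (the goto labels mirror the local labels used in this file; this
 * is not part of any API):
 *
 *	void *memcpy(void *dst, const void *src, size_t len)
 *	{
 *		if ((unsigned long)dst & 1)
 *			goto dst1mod2;		// 1-byte head, may fall through
 *		if ((unsigned long)dst & 2)
 *			goto dst2mod4;		// 2-byte head
 *	dstaligned:
 *		if ((unsigned long)src & 3)
 *			goto srcunaligned;	// shifting (SRC) copy
 *		// 16-byte word-copy loop, then 8/4/2/1-byte tails keyed
 *		// off the low bits of len
 *	}
 */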

	.text

/*
 * Byte by byte copy
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbytecopydone
	add	a7, a3, a4	# a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	s8i	a6, a5, 0
	addi	a5, a5, 1
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
	abi_ret_default

/*
 * Destination is unaligned
 */

	.align	4
.Ldst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte

	# copy 1 byte
	l8ui	a6, a3,  0
	addi	a3, a3,  1
	addi	a4, a4, -1
	s8i	a6, a5,  0
	addi	a5, a5,  1
	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
					# return to main algorithm
.Ldst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	addi	a4, a4, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	j	.Ldstaligned	# dst is now aligned, return to main algorithm
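
/*
 * Note: the byte-copy cutoffs above (len < 7 at .Ldst1mod2, len < 6 at
 * .Ldst2mod4) appear chosen so that at least one aligned word remains
 * after the 1- to 3-byte alignment head; anything shorter is cheaper to
 * copy byte by byte.
 */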

ENTRY(__memcpy)
WEAK(memcpy)

	abi_entry_default
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lcommon:
	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
.Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop1done
	slli	a8, a7, 4
	add	a8, a8, a3	# a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	s32i	a6, a5,  0
	l32i	a6, a3,  8
	s32i	a7, a5,  4
	l32i	a7, a3, 12
	s32i	a6, a5,  8
	addi	a3, a3, 16
	s32i	a7, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .Loop1  # continue loop if a3:src != a8:src_end
#endif /* !XCHAL_HAVE_LOOPS */
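	/*
	 * Tail handling: len mod 16 remains after the loop above; bits
	 * 3..0 of a4 select the residual 8-, 4-, 2- and 1-byte copies
	 * below (e.g. len == 13 == 0b1101 takes the 8-, 4- and 1-byte
	 * paths).
	 */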
.Loop1done:
	bbci.l	a4, 3, .L2
	# copy 8 bytes
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a3, a3,  8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
	addi	a5, a5,  8
.L2:
	bbsi.l	a4, 2, .L3
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L3:
	# copy 4 bytes
	l32i	a6, a3,  0
	addi	a3, a3,  4
	s32i	a6, a5,  0
	addi	a5, a5,  4
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L4:
	# copy 2 bytes
	l16ui	a6, a3,  0
	addi	a3, a3,  2
	s16i	a6, a5,  0
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L5:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	abi_ret_default

/*
 * Destination is aligned, Source is unaligned
 */

	.align	4
.Lsrcunaligned:
	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	__ssa8	a3		# set shift amount from byte offset

/* set to 1 when running on ISS (simulator) with the
   lint or ferret client, or 0 to save a few cycles */
#define SIM_CHECKS_ALIGNMENT	1
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
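	/*
	 * Shifting-copy note: __ssa8 derives the shift amount from the
	 * low two bits of the source address, and each __src_b below
	 * funnel-shifts a pair of adjacent aligned source words into one
	 * aligned destination word (endianness is handled inside the
	 * macros). E.g. with src 1 mod 4 on a little-endian core, each
	 * stored word combines the three high-order bytes of one source
	 * word with the low-order byte of the next.
	 */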
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop2done
	slli	a10, a7, 4
	add	a10, a10, a3	# a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	l32i	a9, a3, 12
	__src_b	a7, a7, a8
	s32i	a7, a5,  4
	l32i	a6, a3, 16
	__src_b	a8, a8, a9
	s32i	a8, a5,  8
	addi	a3, a3, 16
	__src_b	a9, a9, a6
	s32i	a9, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
	bbci.l	a4, 3, .L12
	# copy 8 bytes
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a3, a3,  8
	__src_b	a7, a7, a8
	s32i	a7, a5,  4
	addi	a5, a5,  8
	mov	a6, a8
.L12:
	bbci.l	a4, 2, .L13
	# copy 4 bytes
	l32i	a7, a3,  4
	addi	a3, a3,  4
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a5, a5,  4
	mov	a6, a7
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .L14
	bbsi.l	a4, 0, .L15
.Ldone:	abi_ret_default
.L14:
	# copy 2 bytes
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L15
	abi_ret_default
.L15:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	abi_ret_default

ENDPROC(__memcpy)
EXPORT_SYMBOL(__memcpy)
EXPORT_SYMBOL(memcpy)

/*
 * void *memmove(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memmove() in most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If the destination does not start inside the source region
 *     (dst - src >= len as an unsigned compare, which also covers
 *     dst < src), use the forward memcpy code.
 *   Otherwise do the same copy backwards.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */
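
/*
 * For orientation only, a rough C sketch of the overlap test used below;
 * a single unsigned compare covers both directions because the pointer
 * subtraction wraps modulo 2^32:
 *
 *	void *memmove(void *dst, const void *src, size_t len)
 *	{
 *		if ((unsigned long)dst - (unsigned long)src >= len)
 *			return memcpy(dst, src, len);	// forward is safe
 *		// otherwise run the same copy backwards, starting
 *		// from dst + len and src + len
 *	}
 */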

/*
 * Byte by byte copy
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbackbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbackbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbackbytecopydone
	sub	a7, a3, a4	# a7 = start address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lbacknextbyte:
	addi	a3, a3, -1
	l8ui	a6, a3, 0
	addi	a5, a5, -1
	s8i	a6, a5, 0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lbacknextbyte # continue loop if
				       # $a3:src != $a7:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.Lbackbytecopydone:
	abi_ret_default

/*
 * Destination is unaligned
 */

	.align	4
.Lbackdst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbackbytecopy	# do short copies byte by byte

	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	addi	a4, a4, -1
	_bbci.l	a5, 1, .Lbackdstaligned	# if dst is now aligned, then
					# return to main algorithm
.Lbackdst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbackbytecopy	# do short copies byte by byte
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a4, a4, -2
	j	.Lbackdstaligned	# dst is now aligned,
					# return to main algorithm

ENTRY(__memmove)
WEAK(memmove)

	abi_entry_default
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lmovecommon:
	sub	a6, a5, a3
	bgeu	a6, a4, .Lcommon
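	/*
	 * a6 = dst - src is computed modulo 2^32: when dst precedes src
	 * the subtraction wraps to a large unsigned value, so the bgeu
	 * above also sends that (always forward-safe) case to .Lcommon.
	 */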

	add	a5, a5, a4
	add	a3, a3, a4

	_bbsi.l	a5, 0, .Lbackdst1mod2	# if dst is 1 mod 2
	_bbsi.l	a5, 1, .Lbackdst2mod4	# if dst is 2 mod 4
.Lbackdstaligned:	# return here from .Lbackdst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lbacksrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .LbackLoop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .LbackLoop1done
	slli	a8, a7, 4
	sub	a8, a3, a8	# a8 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop1:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a6, a3,  8
	addi	a5, a5, -16
	s32i	a7, a5, 12
	l32i	a7, a3,  4
	s32i	a6, a5,  8
	l32i	a6, a3,  0
	s32i	a7, a5,  4
	s32i	a6, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .LbackLoop1  # continue loop if a3:src != a8:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop1done:
	bbci.l	a4, 3, .Lback2
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a5, a5, -8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
.Lback2:
	bbsi.l	a4, 2, .Lback3
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback3:
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a6, a3,  0
	addi	a5, a5, -4
	s32i	a6, a5,  0
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback4:
	# copy 2 bytes
	addi	a3, a3, -2
	l16ui	a6, a3,  0
	addi	a5, a5, -2
	s16i	a6, a5,  0
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback5:
	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	abi_ret_default

/*
 * Destination is aligned, Source is unaligned
 */

	.align	4
.Lbacksrcunaligned:
	_beqz	a4, .Lbackdone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	__ssa8	a3		# set shift amount from byte offset
#define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS with
					 * the lint or ferret client, or 0
					 * to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
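	/*
	 * Backward shifting copy: the same funnel-shift scheme as in
	 * __memcpy above, but walking down through memory; the word
	 * carried across iterations (a6) is now the lowest-addressed
	 * word loaded so far rather than the highest.
	 */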
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .LbackLoop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .LbackLoop2done
	slli	a10, a7, 4
	sub	a10, a3, a10	# a10 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop2:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a8, a3,  8
	addi	a5, a5, -16
	__src_b	a6, a7, a6
	s32i	a6, a5, 12
	l32i	a9, a3,  4
	__src_b	a7, a8, a7
	s32i	a7, a5,  8
	l32i	a6, a3,  0
	__src_b	a8, a9, a8
	s32i	a8, a5,  4
	__src_b	a9, a6, a9
	s32i	a9, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .LbackLoop2 # continue loop if a3:src != a10:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop2done:
	bbci.l	a4, 3, .Lback12
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a7, a3,  4
	l32i	a8, a3,  0
	addi	a5, a5, -8
	__src_b	a6, a7, a6
	s32i	a6, a5,  4
	__src_b	a7, a8, a7
	s32i	a7, a5,  0
	mov	a6, a8
.Lback12:
	bbci.l	a4, 2, .Lback13
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a7, a3,  0
	addi	a5, a5, -4
	__src_b	a6, a7, a6
	s32i	a6, a5,  0
	mov	a6, a7
.Lback13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .Lback14
	bbsi.l	a4, 0, .Lback15
.Lbackdone:
	abi_ret_default
.Lback14:
	# copy 2 bytes
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	bbsi.l	a4, 0, .Lback15
	abi_ret_default
.Lback15:
	# copy 1 byte
	addi	a3, a3, -1
	addi	a5, a5, -1
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	abi_ret_default

ENDPROC(__memmove)
EXPORT_SYMBOL(__memmove)
EXPORT_SYMBOL(memmove)