18c2ecf20Sopenharmony_ci/*
28c2ecf20Sopenharmony_ci * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
38c2ecf20Sopenharmony_ci * xthal_memcpy and xthal_bcopy
48c2ecf20Sopenharmony_ci *
58c2ecf20Sopenharmony_ci * This file is subject to the terms and conditions of the GNU General Public
68c2ecf20Sopenharmony_ci * License.  See the file "COPYING" in the main directory of this archive
78c2ecf20Sopenharmony_ci * for more details.
88c2ecf20Sopenharmony_ci *
98c2ecf20Sopenharmony_ci * Copyright (C) 2002 - 2012 Tensilica Inc.
108c2ecf20Sopenharmony_ci */
118c2ecf20Sopenharmony_ci
128c2ecf20Sopenharmony_ci#include <linux/linkage.h>
138c2ecf20Sopenharmony_ci#include <asm/asmmacro.h>
148c2ecf20Sopenharmony_ci#include <asm/core.h>
158c2ecf20Sopenharmony_ci
168c2ecf20Sopenharmony_ci/*
178c2ecf20Sopenharmony_ci * void *memcpy(void *dst, const void *src, size_t len);
188c2ecf20Sopenharmony_ci *
198c2ecf20Sopenharmony_ci * This function is intended to do the same thing as the standard
208c2ecf20Sopenharmony_ci * library function memcpy() for most cases.
218c2ecf20Sopenharmony_ci * However, where the source and/or destination references
228c2ecf20Sopenharmony_ci * an instruction RAM or ROM or a data RAM or ROM, that
238c2ecf20Sopenharmony_ci * source and/or destination will always be accessed with
248c2ecf20Sopenharmony_ci * 32-bit load and store instructions (as required for these
258c2ecf20Sopenharmony_ci * types of devices).
268c2ecf20Sopenharmony_ci *
278c2ecf20Sopenharmony_ci * !!!!!!!  XTFIXME:
288c2ecf20Sopenharmony_ci * !!!!!!!  Handling of IRAM/IROM has not yet
298c2ecf20Sopenharmony_ci * !!!!!!!  been implemented.
308c2ecf20Sopenharmony_ci *
318c2ecf20Sopenharmony_ci * The (general case) algorithm is as follows:
328c2ecf20Sopenharmony_ci *   If destination is unaligned, align it by conditionally
338c2ecf20Sopenharmony_ci *     copying 1 and 2 bytes.
348c2ecf20Sopenharmony_ci *   If source is aligned,
358c2ecf20Sopenharmony_ci *     do 16 bytes with a loop, and then finish up with
368c2ecf20Sopenharmony_ci *     8, 4, 2, and 1 byte copies conditional on the length;
378c2ecf20Sopenharmony_ci *   else (if source is unaligned),
388c2ecf20Sopenharmony_ci *     do the same, but use SRC to align the source data.
398c2ecf20Sopenharmony_ci *   This code tries to use fall-through branches for the common
408c2ecf20Sopenharmony_ci *     case of aligned source and destination and multiple
418c2ecf20Sopenharmony_ci *     of 4 (or 8) length.
428c2ecf20Sopenharmony_ci *
438c2ecf20Sopenharmony_ci * Register use:
448c2ecf20Sopenharmony_ci *	a0/ return address
458c2ecf20Sopenharmony_ci *	a1/ stack pointer
468c2ecf20Sopenharmony_ci *	a2/ return value
478c2ecf20Sopenharmony_ci *	a3/ src
488c2ecf20Sopenharmony_ci *	a4/ length
498c2ecf20Sopenharmony_ci *	a5/ dst
508c2ecf20Sopenharmony_ci *	a6/ tmp
518c2ecf20Sopenharmony_ci *	a7/ tmp
528c2ecf20Sopenharmony_ci *	a8/ tmp
538c2ecf20Sopenharmony_ci *	a9/ tmp
548c2ecf20Sopenharmony_ci *	a10/ tmp
558c2ecf20Sopenharmony_ci *	a11/ tmp
568c2ecf20Sopenharmony_ci */
578c2ecf20Sopenharmony_ci
588c2ecf20Sopenharmony_ci	.text
598c2ecf20Sopenharmony_ci
608c2ecf20Sopenharmony_ci/*
618c2ecf20Sopenharmony_ci * Byte by byte copy
 *
 * Forward copy of a4 bytes, one byte per iteration.  Entered from
 * .Ldst1mod2 / .Ldst2mod4 when the copy is too short to align dst first.
 *   a3 = src, a5 = dst, a4 = byte count (a count of zero is handled)
 *   a6 = data byte
 *   a7 = source end address (only in the !XCHAL_HAVE_LOOPS variant)
 * The block ends in abi_ret_default (asm/asmmacro.h, presumably the
 * ABI-specific return sequence), so control does not go back to the
 * main algorithm from here.
628c2ecf20Sopenharmony_ci */
638c2ecf20Sopenharmony_ci	.align	4
648c2ecf20Sopenharmony_ci	.byte	0		# 1 mod 4 alignment for LOOPNEZ
658c2ecf20Sopenharmony_ci				# (0 mod 4 alignment for LBEG)
668c2ecf20Sopenharmony_ci.Lbytecopy:
678c2ecf20Sopenharmony_ci#if XCHAL_HAVE_LOOPS
688c2ecf20Sopenharmony_ci	loopnez	a4, .Lbytecopydone	# run the body a4 times, skip it when a4 == 0
698c2ecf20Sopenharmony_ci#else /* !XCHAL_HAVE_LOOPS */
708c2ecf20Sopenharmony_ci	beqz	a4, .Lbytecopydone	# nothing to copy
718c2ecf20Sopenharmony_ci	add	a7, a3, a4	# a7 = end address for source
728c2ecf20Sopenharmony_ci#endif /* !XCHAL_HAVE_LOOPS */
738c2ecf20Sopenharmony_ci.Lnextbyte:
748c2ecf20Sopenharmony_ci	l8ui	a6, a3, 0	# a6 = byte at src
758c2ecf20Sopenharmony_ci	addi	a3, a3, 1	# src++
768c2ecf20Sopenharmony_ci	s8i	a6, a5, 0	# store it at dst
778c2ecf20Sopenharmony_ci	addi	a5, a5, 1	# dst++
788c2ecf20Sopenharmony_ci#if !XCHAL_HAVE_LOOPS
798c2ecf20Sopenharmony_ci	bne	a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
808c2ecf20Sopenharmony_ci#endif /* !XCHAL_HAVE_LOOPS */
818c2ecf20Sopenharmony_ci.Lbytecopydone:
828c2ecf20Sopenharmony_ci	abi_ret_default
838c2ecf20Sopenharmony_ci
848c2ecf20Sopenharmony_ci/*
858c2ecf20Sopenharmony_ci * Destination is unaligned
 *
 * Out-of-line part of the forward copy, entered from .Lcommon with
 *   a3 = src, a5 = dst, a4 = byte count.
 * Copies 1 and/or 2 leading bytes so that dst becomes word aligned,
 * adjusting a3, a4 and a5, and then goes back to .Ldstaligned.
 * Copies shorter than the thresholds below are handed to .Lbytecopy
 * instead.  On the aligning paths a4 is still at least 4 when
 * .Ldstaligned is reached (7 - 1 - 2, or 6 - 2).
 * Only byte loads are used here: the alignment of src is not examined
 * before .Ldstaligned.
 * The leading underscore on _bltui / _bbci.l is the GNU as notation for
 * Xtensa that asks the assembler not to transform the instruction.
868c2ecf20Sopenharmony_ci */
878c2ecf20Sopenharmony_ci
888c2ecf20Sopenharmony_ci	.align	4
898c2ecf20Sopenharmony_ci.Ldst1mod2:	# dst is only byte aligned
908c2ecf20Sopenharmony_ci	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte
918c2ecf20Sopenharmony_ci
928c2ecf20Sopenharmony_ci	# copy 1 byte
938c2ecf20Sopenharmony_ci	l8ui	a6, a3,  0
948c2ecf20Sopenharmony_ci	addi	a3, a3,  1
958c2ecf20Sopenharmony_ci	addi	a4, a4, -1	# one byte less to copy
968c2ecf20Sopenharmony_ci	s8i	a6, a5,  0
978c2ecf20Sopenharmony_ci	addi	a5, a5,  1	# dst is now a multiple of 2
988c2ecf20Sopenharmony_ci	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
998c2ecf20Sopenharmony_ci					# return to main algorithm
1008c2ecf20Sopenharmony_ci.Ldst2mod4:	# dst 16-bit aligned
1018c2ecf20Sopenharmony_ci	# copy 2 bytes
1028c2ecf20Sopenharmony_ci	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
1038c2ecf20Sopenharmony_ci	l8ui	a6, a3,  0
1048c2ecf20Sopenharmony_ci	l8ui	a7, a3,  1
1058c2ecf20Sopenharmony_ci	addi	a3, a3,  2
1068c2ecf20Sopenharmony_ci	addi	a4, a4, -2	# two bytes less to copy
1078c2ecf20Sopenharmony_ci	s8i	a6, a5,  0
1088c2ecf20Sopenharmony_ci	s8i	a7, a5,  1
1098c2ecf20Sopenharmony_ci	addi	a5, a5,  2
1108c2ecf20Sopenharmony_ci	j	.Ldstaligned	# dst is now aligned, return to main algorithm
1118c2ecf20Sopenharmony_ci
1128c2ecf20Sopenharmony_ciENTRY(__memcpy)
1138c2ecf20Sopenharmony_ciWEAK(memcpy)
1148c2ecf20Sopenharmony_ci
/*
 * Entry point __memcpy; WEAK(memcpy) presumably makes memcpy a weak
 * symbol for the same address (macros from linux/linkage.h - confirm).
 *   In:  a2 = dst, a3 = src, a4 = len
 *   Out: a2 = dst.  a2 is never written in the forward copy code; a5 is
 *        the moving destination pointer.
 * On the word-copy paths a4 is not counted down: the 16-byte loop runs
 * a4 >> 4 times and the tail pieces are selected by bits 3..0 of a4.
 */
1158c2ecf20Sopenharmony_ci	abi_entry_default
1168c2ecf20Sopenharmony_ci	# a2/ dst, a3/ src, a4/ len
1178c2ecf20Sopenharmony_ci	mov	a5, a2		# copy dst so that a2 is return value
1188c2ecf20Sopenharmony_ci.Lcommon:
	/*
	 * Also entered from .Lmovecommon (memmove and bcopy) when a forward
	 * copy is safe.  Every path that gets here has a2 == a5 == dst, so
	 * testing the low bits of a2 tests the alignment of dst.
	 */
1198c2ecf20Sopenharmony_ci	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
1208c2ecf20Sopenharmony_ci	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
1218c2ecf20Sopenharmony_ci.Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
1228c2ecf20Sopenharmony_ci	srli	a7, a4, 4	# number of loop iterations with 16B
1238c2ecf20Sopenharmony_ci				# per iteration
1248c2ecf20Sopenharmony_ci	movi	a8, 3		# if source is not aligned,
1258c2ecf20Sopenharmony_ci	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
	/* a8 (= 3) is used again at .Lsrcunaligned as the offset mask. */
1268c2ecf20Sopenharmony_ci	/*
1278c2ecf20Sopenharmony_ci	 * Destination and source are word-aligned, use word copy.
1288c2ecf20Sopenharmony_ci	 */
1298c2ecf20Sopenharmony_ci	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
1308c2ecf20Sopenharmony_ci#if XCHAL_HAVE_LOOPS
1318c2ecf20Sopenharmony_ci	loopnez	a7, .Loop1done	# run the body a7 times, skip it when a7 == 0
1328c2ecf20Sopenharmony_ci#else /* !XCHAL_HAVE_LOOPS */
1338c2ecf20Sopenharmony_ci	beqz	a7, .Loop1done	# fewer than 16 bytes: tail only
1348c2ecf20Sopenharmony_ci	slli	a8, a7, 4	# a8 = bytes handled by the 16B loop
1358c2ecf20Sopenharmony_ci	add	a8, a8, a3	# a8 = end of last 16B source chunk
1368c2ecf20Sopenharmony_ci#endif /* !XCHAL_HAVE_LOOPS */
1378c2ecf20Sopenharmony_ci.Loop1:
	/* Four words per pass; loads and stores alternate between a6 and a7. */
1388c2ecf20Sopenharmony_ci	l32i	a6, a3,  0
1398c2ecf20Sopenharmony_ci	l32i	a7, a3,  4
1408c2ecf20Sopenharmony_ci	s32i	a6, a5,  0
1418c2ecf20Sopenharmony_ci	l32i	a6, a3,  8
1428c2ecf20Sopenharmony_ci	s32i	a7, a5,  4
1438c2ecf20Sopenharmony_ci	l32i	a7, a3, 12
1448c2ecf20Sopenharmony_ci	s32i	a6, a5,  8
1458c2ecf20Sopenharmony_ci	addi	a3, a3, 16
1468c2ecf20Sopenharmony_ci	s32i	a7, a5, 12
1478c2ecf20Sopenharmony_ci	addi	a5, a5, 16
1488c2ecf20Sopenharmony_ci#if !XCHAL_HAVE_LOOPS
1498c2ecf20Sopenharmony_ci	bne	a3, a8, .Loop1  # continue loop if a3:src != a8:src_end
1508c2ecf20Sopenharmony_ci#endif /* !XCHAL_HAVE_LOOPS */
1518c2ecf20Sopenharmony_ci.Loop1done:
	/* Tail: bits 3, 2, 1, 0 of a4 select an 8, 4, 2 and 1 byte piece. */
1528c2ecf20Sopenharmony_ci	bbci.l	a4, 3, .L2
1538c2ecf20Sopenharmony_ci	# copy 8 bytes
1548c2ecf20Sopenharmony_ci	l32i	a6, a3,  0
1558c2ecf20Sopenharmony_ci	l32i	a7, a3,  4
1568c2ecf20Sopenharmony_ci	addi	a3, a3,  8
1578c2ecf20Sopenharmony_ci	s32i	a6, a5,  0
1588c2ecf20Sopenharmony_ci	s32i	a7, a5,  4
1598c2ecf20Sopenharmony_ci	addi	a5, a5,  8
1608c2ecf20Sopenharmony_ci.L2:
1618c2ecf20Sopenharmony_ci	bbsi.l	a4, 2, .L3	# 4-byte piece pending
1628c2ecf20Sopenharmony_ci	bbsi.l	a4, 1, .L4	# 2-byte piece pending
1638c2ecf20Sopenharmony_ci	bbsi.l	a4, 0, .L5	# 1-byte piece pending
1648c2ecf20Sopenharmony_ci	abi_ret_default
1658c2ecf20Sopenharmony_ci.L3:
1668c2ecf20Sopenharmony_ci	# copy 4 bytes
1678c2ecf20Sopenharmony_ci	l32i	a6, a3,  0
1688c2ecf20Sopenharmony_ci	addi	a3, a3,  4
1698c2ecf20Sopenharmony_ci	s32i	a6, a5,  0
1708c2ecf20Sopenharmony_ci	addi	a5, a5,  4
1718c2ecf20Sopenharmony_ci	bbsi.l	a4, 1, .L4
1728c2ecf20Sopenharmony_ci	bbsi.l	a4, 0, .L5
1738c2ecf20Sopenharmony_ci	abi_ret_default
1748c2ecf20Sopenharmony_ci.L4:
1758c2ecf20Sopenharmony_ci	# copy 2 bytes
	/* src and dst are both still at least 2-byte aligned on this path. */
1768c2ecf20Sopenharmony_ci	l16ui	a6, a3,  0
1778c2ecf20Sopenharmony_ci	addi	a3, a3,  2
1788c2ecf20Sopenharmony_ci	s16i	a6, a5,  0
1798c2ecf20Sopenharmony_ci	addi	a5, a5,  2
1808c2ecf20Sopenharmony_ci	bbsi.l	a4, 0, .L5
1818c2ecf20Sopenharmony_ci	abi_ret_default
1828c2ecf20Sopenharmony_ci.L5:
1838c2ecf20Sopenharmony_ci	# copy 1 byte
1848c2ecf20Sopenharmony_ci	l8ui	a6, a3,  0
1858c2ecf20Sopenharmony_ci	s8i	a6, a5,  0
1868c2ecf20Sopenharmony_ci	abi_ret_default
1878c2ecf20Sopenharmony_ci
1888c2ecf20Sopenharmony_ci/*
1898c2ecf20Sopenharmony_ci * Destination is aligned, Source is unaligned
 *
 * Entered from .Ldstaligned with a7 = len >> 4 and a8 = 3.
 * a3 is rounded down to a word boundary and the removed offset is kept
 * in a11.  Each output word is then built from two neighbouring aligned
 * source words with __src_b (asm/asmmacro.h; presumably a funnel shift
 * by the amount that __ssa8 derives from the low bits of src - confirm
 * against the macro definitions).
 * Between pieces a6 holds the source word loaded last, which supplies
 * the first bytes of the next output word.
1908c2ecf20Sopenharmony_ci */
1918c2ecf20Sopenharmony_ci
1928c2ecf20Sopenharmony_ci	.align	4
1938c2ecf20Sopenharmony_ci.Lsrcunaligned:
1948c2ecf20Sopenharmony_ci	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
1958c2ecf20Sopenharmony_ci	# copy 16 bytes per iteration for word-aligned dst and unaligned src
1968c2ecf20Sopenharmony_ci	__ssa8	a3		# set shift amount from byte offset
1978c2ecf20Sopenharmony_ci
1988c2ecf20Sopenharmony_ci/* set to 1 when running on ISS (simulator) with the
1998c2ecf20Sopenharmony_ci   lint or ferret client, or 0 to save a few cycles */
2008c2ecf20Sopenharmony_ci#define SIM_CHECKS_ALIGNMENT	1
/* As SIM_CHECKS_ALIGNMENT is defined to 1, the condition below always holds. */
2018c2ecf20Sopenharmony_ci#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
2028c2ecf20Sopenharmony_ci	and	a11, a3, a8	# save unalignment offset for below
2038c2ecf20Sopenharmony_ci	sub	a3, a3, a11	# align a3
2048c2ecf20Sopenharmony_ci#endif
2058c2ecf20Sopenharmony_ci	l32i	a6, a3, 0	# load first word
2068c2ecf20Sopenharmony_ci#if XCHAL_HAVE_LOOPS
2078c2ecf20Sopenharmony_ci	loopnez	a7, .Loop2done	# run the body a7 times, skip it when a7 == 0
2088c2ecf20Sopenharmony_ci#else /* !XCHAL_HAVE_LOOPS */
2098c2ecf20Sopenharmony_ci	beqz	a7, .Loop2done	# fewer than 16 bytes: tail only
2108c2ecf20Sopenharmony_ci	slli	a10, a7, 4	# a10 = bytes handled by the 16B loop
2118c2ecf20Sopenharmony_ci	add	a10, a10, a3	# a10 = end of last 16B source chunk
2128c2ecf20Sopenharmony_ci#endif /* !XCHAL_HAVE_LOOPS */
2138c2ecf20Sopenharmony_ci.Loop2:
	/*
	 * a6 = word at a3 (already loaded); a7, a8, a9 = words at +4, +8, +12.
	 * a6 is reloaded from +16 for the last output word and is then
	 * already in place for the next pass.  a8 and a7 are overwritten
	 * here, which is why the end pointer lives in a10.
	 */
2148c2ecf20Sopenharmony_ci	l32i	a7, a3,  4
2158c2ecf20Sopenharmony_ci	l32i	a8, a3,  8
2168c2ecf20Sopenharmony_ci	__src_b	a6, a6, a7
2178c2ecf20Sopenharmony_ci	s32i	a6, a5,  0
2188c2ecf20Sopenharmony_ci	l32i	a9, a3, 12
2198c2ecf20Sopenharmony_ci	__src_b	a7, a7, a8
2208c2ecf20Sopenharmony_ci	s32i	a7, a5,  4
2218c2ecf20Sopenharmony_ci	l32i	a6, a3, 16
2228c2ecf20Sopenharmony_ci	__src_b	a8, a8, a9
2238c2ecf20Sopenharmony_ci	s32i	a8, a5,  8
2248c2ecf20Sopenharmony_ci	addi	a3, a3, 16
2258c2ecf20Sopenharmony_ci	__src_b	a9, a9, a6
2268c2ecf20Sopenharmony_ci	s32i	a9, a5, 12
2278c2ecf20Sopenharmony_ci	addi	a5, a5, 16
2288c2ecf20Sopenharmony_ci#if !XCHAL_HAVE_LOOPS
2298c2ecf20Sopenharmony_ci	bne	a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
2308c2ecf20Sopenharmony_ci#endif /* !XCHAL_HAVE_LOOPS */
2318c2ecf20Sopenharmony_ci.Loop2done:
2328c2ecf20Sopenharmony_ci	bbci.l	a4, 3, .L12	# skip unless bit 3 of len is set
2338c2ecf20Sopenharmony_ci	# copy 8 bytes
2348c2ecf20Sopenharmony_ci	l32i	a7, a3,  4
2358c2ecf20Sopenharmony_ci	l32i	a8, a3,  8
2368c2ecf20Sopenharmony_ci	__src_b	a6, a6, a7
2378c2ecf20Sopenharmony_ci	s32i	a6, a5,  0
2388c2ecf20Sopenharmony_ci	addi	a3, a3,  8
2398c2ecf20Sopenharmony_ci	__src_b	a7, a7, a8
2408c2ecf20Sopenharmony_ci	s32i	a7, a5,  4
2418c2ecf20Sopenharmony_ci	addi	a5, a5,  8
2428c2ecf20Sopenharmony_ci	mov	a6, a8		# a6 = last word loaded, for the next piece
2438c2ecf20Sopenharmony_ci.L12:
2448c2ecf20Sopenharmony_ci	bbci.l	a4, 2, .L13	# skip unless bit 2 of len is set
2458c2ecf20Sopenharmony_ci	# copy 4 bytes
2468c2ecf20Sopenharmony_ci	l32i	a7, a3,  4
2478c2ecf20Sopenharmony_ci	addi	a3, a3,  4
2488c2ecf20Sopenharmony_ci	__src_b	a6, a6, a7
2498c2ecf20Sopenharmony_ci	s32i	a6, a5,  0
2508c2ecf20Sopenharmony_ci	addi	a5, a5,  4
2518c2ecf20Sopenharmony_ci	mov	a6, a7		# a6 = last word loaded
2528c2ecf20Sopenharmony_ci.L13:
	/* The last 2 and 1 byte pieces are copied with plain byte accesses. */
2538c2ecf20Sopenharmony_ci#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
2548c2ecf20Sopenharmony_ci	add	a3, a3, a11	# readjust a3 with correct misalignment
2558c2ecf20Sopenharmony_ci#endif
2568c2ecf20Sopenharmony_ci	bbsi.l	a4, 1, .L14
2578c2ecf20Sopenharmony_ci	bbsi.l	a4, 0, .L15
2588c2ecf20Sopenharmony_ci.Ldone:	abi_ret_default
2598c2ecf20Sopenharmony_ci.L14:
2608c2ecf20Sopenharmony_ci	# copy 2 bytes
2618c2ecf20Sopenharmony_ci	l8ui	a6, a3,  0
2628c2ecf20Sopenharmony_ci	l8ui	a7, a3,  1
2638c2ecf20Sopenharmony_ci	addi	a3, a3,  2
2648c2ecf20Sopenharmony_ci	s8i	a6, a5,  0
2658c2ecf20Sopenharmony_ci	s8i	a7, a5,  1
2668c2ecf20Sopenharmony_ci	addi	a5, a5,  2
2678c2ecf20Sopenharmony_ci	bbsi.l	a4, 0, .L15
2688c2ecf20Sopenharmony_ci	abi_ret_default
2698c2ecf20Sopenharmony_ci.L15:
2708c2ecf20Sopenharmony_ci	# copy 1 byte
2718c2ecf20Sopenharmony_ci	l8ui	a6, a3,  0
2728c2ecf20Sopenharmony_ci	s8i	a6, a5,  0
2738c2ecf20Sopenharmony_ci	abi_ret_default
2748c2ecf20Sopenharmony_ci
2758c2ecf20Sopenharmony_ciENDPROC(__memcpy)
2768c2ecf20Sopenharmony_ci
2778c2ecf20Sopenharmony_ci/*
2788c2ecf20Sopenharmony_ci * void bcopy(const void *src, void *dest, size_t n);
2798c2ecf20Sopenharmony_ci */
2808c2ecf20Sopenharmony_ci
2818c2ecf20Sopenharmony_ciENTRY(bcopy)
2828c2ecf20Sopenharmony_ci
/*
 * bcopy takes its pointers in the order (src, dst).  Rearrange them
 * into the register layout used at .Lmovecommon
 *   a2 = a5 = dst, a3 = src, a4 = len
 * and continue in the memmove code, which also contains every
 * abi_ret_default for this path.  a2 has to hold dst as well because
 * .Lcommon tests the alignment bits of a2, not of a5.
 */
2838c2ecf20Sopenharmony_ci	abi_entry_default
2848c2ecf20Sopenharmony_ci	# a2=src, a3=dst, a4=len
2858c2ecf20Sopenharmony_ci	mov	a5, a3		# a5 = dst
2868c2ecf20Sopenharmony_ci	mov	a3, a2		# a3 = src
2878c2ecf20Sopenharmony_ci	mov	a2, a5		# a2 = dst
2888c2ecf20Sopenharmony_ci	j	.Lmovecommon	# go to common code for memmove+bcopy
2898c2ecf20Sopenharmony_ci
2908c2ecf20Sopenharmony_ciENDPROC(bcopy)
2918c2ecf20Sopenharmony_ci
2928c2ecf20Sopenharmony_ci/*
2938c2ecf20Sopenharmony_ci * void *memmove(void *dst, const void *src, size_t len);
2948c2ecf20Sopenharmony_ci *
2958c2ecf20Sopenharmony_ci * This function is intended to do the same thing as the standard
2968c2ecf20Sopenharmony_ci * library function memmove() for most cases.
2978c2ecf20Sopenharmony_ci * However, where the source and/or destination references
2988c2ecf20Sopenharmony_ci * an instruction RAM or ROM or a data RAM or ROM, that
2998c2ecf20Sopenharmony_ci * source and/or destination will always be accessed with
3008c2ecf20Sopenharmony_ci * 32-bit load and store instructions (as required for these
3018c2ecf20Sopenharmony_ci * types of devices).
3028c2ecf20Sopenharmony_ci *
3038c2ecf20Sopenharmony_ci * !!!!!!!  XTFIXME:
3048c2ecf20Sopenharmony_ci * !!!!!!!  Handling of IRAM/IROM has not yet
3058c2ecf20Sopenharmony_ci * !!!!!!!  been implemented.
3068c2ecf20Sopenharmony_ci *
3078c2ecf20Sopenharmony_ci * The (general case) algorithm is as follows:
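3088c2ecf20Sopenharmony_ci *   If (dst - src), taken as an unsigned value, is at least len, a forward
 *     copy cannot overwrite source bytes that are still unread, so the
 *     memcpy code is used.
3098c2ecf20Sopenharmony_ci *   Otherwise do the copy backwards, starting from the end of both buffers.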
3108c2ecf20Sopenharmony_ci *
3118c2ecf20Sopenharmony_ci * Register use:
3128c2ecf20Sopenharmony_ci *	a0/ return address
3138c2ecf20Sopenharmony_ci *	a1/ stack pointer
3148c2ecf20Sopenharmony_ci *	a2/ return value
3158c2ecf20Sopenharmony_ci *	a3/ src
3168c2ecf20Sopenharmony_ci *	a4/ length
3178c2ecf20Sopenharmony_ci *	a5/ dst
3188c2ecf20Sopenharmony_ci *	a6/ tmp
3198c2ecf20Sopenharmony_ci *	a7/ tmp
3208c2ecf20Sopenharmony_ci *	a8/ tmp
3218c2ecf20Sopenharmony_ci *	a9/ tmp
3228c2ecf20Sopenharmony_ci *	a10/ tmp
3238c2ecf20Sopenharmony_ci *	a11/ tmp
3248c2ecf20Sopenharmony_ci */
3258c2ecf20Sopenharmony_ci
3268c2ecf20Sopenharmony_ci/*
3278c2ecf20Sopenharmony_ci * Byte by byte copy
 *
 * Backward copy of a4 bytes, one byte per iteration.  Entered from
 * .Lbackdst1mod2 / .Lbackdst2mod4 when the copy is too short to align
 * the destination first.
 *   a3 = one past the last source byte still to be copied
 *   a5 = one past the last destination byte still to be written
 *   a4 = byte count (a count of zero is handled)
 *   a6 = data byte
 *   a7 = source start address (only in the !XCHAL_HAVE_LOOPS variant)
 * Both pointers are decremented before each access.
3288c2ecf20Sopenharmony_ci */
3298c2ecf20Sopenharmony_ci	.align	4
3308c2ecf20Sopenharmony_ci	.byte	0		# 1 mod 4 alignment for LOOPNEZ
3318c2ecf20Sopenharmony_ci				# (0 mod 4 alignment for LBEG)
3328c2ecf20Sopenharmony_ci.Lbackbytecopy:
3338c2ecf20Sopenharmony_ci#if XCHAL_HAVE_LOOPS
3348c2ecf20Sopenharmony_ci	loopnez	a4, .Lbackbytecopydone	# run the body a4 times, skip it when a4 == 0
3358c2ecf20Sopenharmony_ci#else /* !XCHAL_HAVE_LOOPS */
3368c2ecf20Sopenharmony_ci	beqz	a4, .Lbackbytecopydone	# nothing to copy
3378c2ecf20Sopenharmony_ci	sub	a7, a3, a4	# a7 = start address for source
3388c2ecf20Sopenharmony_ci#endif /* !XCHAL_HAVE_LOOPS */
3398c2ecf20Sopenharmony_ci.Lbacknextbyte:
3408c2ecf20Sopenharmony_ci	addi	a3, a3, -1	# step src back
3418c2ecf20Sopenharmony_ci	l8ui	a6, a3, 0	# a6 = byte at src
3428c2ecf20Sopenharmony_ci	addi	a5, a5, -1	# step dst back
3438c2ecf20Sopenharmony_ci	s8i	a6, a5, 0	# store it at dst
3448c2ecf20Sopenharmony_ci#if !XCHAL_HAVE_LOOPS
3458c2ecf20Sopenharmony_ci	bne	a3, a7, .Lbacknextbyte # continue loop if
3468c2ecf20Sopenharmony_ci				       # $a3:src != $a7:src_start
3478c2ecf20Sopenharmony_ci#endif /* !XCHAL_HAVE_LOOPS */
3488c2ecf20Sopenharmony_ci.Lbackbytecopydone:
3498c2ecf20Sopenharmony_ci	abi_ret_default
3508c2ecf20Sopenharmony_ci
3518c2ecf20Sopenharmony_ci/*
3528c2ecf20Sopenharmony_ci * Destination is unaligned
 *
 * Out-of-line part of the backward copy, entered from the memmove code
 * after a3 and a5 have been advanced by len, so both point one past the
 * last byte still to be copied.  "dst" in the comments below therefore
 * refers to that end pointer in a5.
 * Copies 1 and/or 2 trailing bytes so that a5 becomes word aligned,
 * adjusting a3, a4 and a5, and then goes back to .Lbackdstaligned.
 * Copies shorter than the thresholds below are handed to .Lbackbytecopy
 * instead.  On the aligning paths a4 is still at least 4 when
 * .Lbackdstaligned is reached (7 - 1 - 2, or 6 - 2).
3538c2ecf20Sopenharmony_ci */
3548c2ecf20Sopenharmony_ci
3558c2ecf20Sopenharmony_ci	.align	4
3568c2ecf20Sopenharmony_ci.Lbackdst1mod2:	# dst is only byte aligned
3578c2ecf20Sopenharmony_ci	_bltui	a4, 7, .Lbackbytecopy	# do short copies byte by byte
3588c2ecf20Sopenharmony_ci
3598c2ecf20Sopenharmony_ci	# copy 1 byte
3608c2ecf20Sopenharmony_ci	addi	a3, a3, -1
3618c2ecf20Sopenharmony_ci	l8ui	a6, a3,  0
3628c2ecf20Sopenharmony_ci	addi	a5, a5, -1	# a5 is now a multiple of 2
3638c2ecf20Sopenharmony_ci	s8i	a6, a5,  0
3648c2ecf20Sopenharmony_ci	addi	a4, a4, -1	# one byte less to copy
3658c2ecf20Sopenharmony_ci	_bbci.l	a5, 1, .Lbackdstaligned	# if dst is now aligned, then
3668c2ecf20Sopenharmony_ci					# return to main algorithm
3678c2ecf20Sopenharmony_ci.Lbackdst2mod4:	# dst 16-bit aligned
3688c2ecf20Sopenharmony_ci	# copy 2 bytes
3698c2ecf20Sopenharmony_ci	_bltui	a4, 6, .Lbackbytecopy	# do short copies byte by byte
3708c2ecf20Sopenharmony_ci	addi	a3, a3, -2
3718c2ecf20Sopenharmony_ci	l8ui	a6, a3,  0
3728c2ecf20Sopenharmony_ci	l8ui	a7, a3,  1
3738c2ecf20Sopenharmony_ci	addi	a5, a5, -2
3748c2ecf20Sopenharmony_ci	s8i	a6, a5,  0
3758c2ecf20Sopenharmony_ci	s8i	a7, a5,  1
3768c2ecf20Sopenharmony_ci	addi	a4, a4, -2	# two bytes less to copy
3778c2ecf20Sopenharmony_ci	j	.Lbackdstaligned	# dst is now aligned,
3788c2ecf20Sopenharmony_ci					# return to main algorithm
3798c2ecf20Sopenharmony_ci
3808c2ecf20Sopenharmony_ciENTRY(__memmove)
3818c2ecf20Sopenharmony_ciWEAK(memmove)
3828c2ecf20Sopenharmony_ci
/*
 * Entry point __memmove; WEAK(memmove) presumably makes memmove a weak
 * symbol for the same address (macros from linux/linkage.h - confirm).
 *   In:  a2 = dst, a3 = src, a4 = len
 *   Out: a2 = dst.  a2 is never written below; a5 is the moving
 *        destination pointer.
 * NOTE(review): the labels .backLoop1, .backLoop1done, .backLoop2 and
 * .backLoop2done do not start with ".L" like the other labels in this
 * file, so they are presumably emitted as ordinary symbols - consider
 * renaming them.
 */
3838c2ecf20Sopenharmony_ci	abi_entry_default
3848c2ecf20Sopenharmony_ci	# a2/ dst, a3/ src, a4/ len
3858c2ecf20Sopenharmony_ci	mov	a5, a2		# copy dst so that a2 is return value
3868c2ecf20Sopenharmony_ci.Lmovecommon:
	/*
	 * Also entered from bcopy with the same register layout.
	 * a6 = dst - src.  Compared as an unsigned value it is >= len when
	 * dst is at least len bytes above src, and also when dst is below
	 * src, where the subtraction wraps to a large value.  A forward
	 * copy then never overwrites source bytes it has not read yet, so
	 * the memcpy code at .Lcommon is used.
	 */
3878c2ecf20Sopenharmony_ci	sub	a6, a5, a3
3888c2ecf20Sopenharmony_ci	bgeu	a6, a4, .Lcommon
3898c2ecf20Sopenharmony_ci
	/*
	 * Backward copy: from here on a3 and a5 point one past the last
	 * byte still to be copied and are decremented before each access.
	 */
3908c2ecf20Sopenharmony_ci	add	a5, a5, a4
3918c2ecf20Sopenharmony_ci	add	a3, a3, a4
3928c2ecf20Sopenharmony_ci
3938c2ecf20Sopenharmony_ci	_bbsi.l	a5, 0, .Lbackdst1mod2	# if dst is 1 mod 2
3948c2ecf20Sopenharmony_ci	_bbsi.l	a5, 1, .Lbackdst2mod4	# if dst is 2 mod 4
3958c2ecf20Sopenharmony_ci.Lbackdstaligned:	# return here from .Lbackdst?mod? once dst is aligned
3968c2ecf20Sopenharmony_ci	srli	a7, a4, 4	# number of loop iterations with 16B
3978c2ecf20Sopenharmony_ci				# per iteration
3988c2ecf20Sopenharmony_ci	movi	a8, 3		# if source is not aligned,
3998c2ecf20Sopenharmony_ci	_bany	a3, a8, .Lbacksrcunaligned	# then use shifting copy
	/* a8 (= 3) is used again at .Lbacksrcunaligned as the offset mask. */
4008c2ecf20Sopenharmony_ci	/*
4018c2ecf20Sopenharmony_ci	 * Destination and source are word-aligned, use word copy.
4028c2ecf20Sopenharmony_ci	 */
4038c2ecf20Sopenharmony_ci	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
4048c2ecf20Sopenharmony_ci#if XCHAL_HAVE_LOOPS
4058c2ecf20Sopenharmony_ci	loopnez	a7, .backLoop1done	# run the body a7 times, skip it when a7 == 0
4068c2ecf20Sopenharmony_ci#else /* !XCHAL_HAVE_LOOPS */
4078c2ecf20Sopenharmony_ci	beqz	a7, .backLoop1done	# fewer than 16 bytes: tail only
4088c2ecf20Sopenharmony_ci	slli	a8, a7, 4	# a8 = bytes handled by the 16B loop
4098c2ecf20Sopenharmony_ci	sub	a8, a3, a8	# a8 = start of first 16B source chunk
4108c2ecf20Sopenharmony_ci#endif /* !XCHAL_HAVE_LOOPS */
4118c2ecf20Sopenharmony_ci.backLoop1:
	/* Four words per pass, highest address first, alternating a7 and a6. */
4128c2ecf20Sopenharmony_ci	addi	a3, a3, -16
4138c2ecf20Sopenharmony_ci	l32i	a7, a3, 12
4148c2ecf20Sopenharmony_ci	l32i	a6, a3,  8
4158c2ecf20Sopenharmony_ci	addi	a5, a5, -16
4168c2ecf20Sopenharmony_ci	s32i	a7, a5, 12
4178c2ecf20Sopenharmony_ci	l32i	a7, a3,  4
4188c2ecf20Sopenharmony_ci	s32i	a6, a5,  8
4198c2ecf20Sopenharmony_ci	l32i	a6, a3,  0
4208c2ecf20Sopenharmony_ci	s32i	a7, a5,  4
4218c2ecf20Sopenharmony_ci	s32i	a6, a5,  0
4228c2ecf20Sopenharmony_ci#if !XCHAL_HAVE_LOOPS
4238c2ecf20Sopenharmony_ci	bne	a3, a8, .backLoop1  # continue loop if a3:src != a8:src_start
4248c2ecf20Sopenharmony_ci#endif /* !XCHAL_HAVE_LOOPS */
4258c2ecf20Sopenharmony_ci.backLoop1done:
	/* Tail: bits 3, 2, 1, 0 of a4 select an 8, 4, 2 and 1 byte piece. */
4268c2ecf20Sopenharmony_ci	bbci.l	a4, 3, .Lback2
4278c2ecf20Sopenharmony_ci	# copy 8 bytes
4288c2ecf20Sopenharmony_ci	addi	a3, a3, -8
4298c2ecf20Sopenharmony_ci	l32i	a6, a3,  0
4308c2ecf20Sopenharmony_ci	l32i	a7, a3,  4
4318c2ecf20Sopenharmony_ci	addi	a5, a5, -8
4328c2ecf20Sopenharmony_ci	s32i	a6, a5,  0
4338c2ecf20Sopenharmony_ci	s32i	a7, a5,  4
4348c2ecf20Sopenharmony_ci.Lback2:
4358c2ecf20Sopenharmony_ci	bbsi.l	a4, 2, .Lback3	# 4-byte piece pending
4368c2ecf20Sopenharmony_ci	bbsi.l	a4, 1, .Lback4	# 2-byte piece pending
4378c2ecf20Sopenharmony_ci	bbsi.l	a4, 0, .Lback5	# 1-byte piece pending
4388c2ecf20Sopenharmony_ci	abi_ret_default
4398c2ecf20Sopenharmony_ci.Lback3:
4408c2ecf20Sopenharmony_ci	# copy 4 bytes
4418c2ecf20Sopenharmony_ci	addi	a3, a3, -4
4428c2ecf20Sopenharmony_ci	l32i	a6, a3,  0
4438c2ecf20Sopenharmony_ci	addi	a5, a5, -4
4448c2ecf20Sopenharmony_ci	s32i	a6, a5,  0
4458c2ecf20Sopenharmony_ci	bbsi.l	a4, 1, .Lback4
4468c2ecf20Sopenharmony_ci	bbsi.l	a4, 0, .Lback5
4478c2ecf20Sopenharmony_ci	abi_ret_default
4488c2ecf20Sopenharmony_ci.Lback4:
4498c2ecf20Sopenharmony_ci	# copy 2 bytes
4508c2ecf20Sopenharmony_ci	addi	a3, a3, -2
4518c2ecf20Sopenharmony_ci	l16ui	a6, a3,  0
4528c2ecf20Sopenharmony_ci	addi	a5, a5, -2
4538c2ecf20Sopenharmony_ci	s16i	a6, a5,  0
4548c2ecf20Sopenharmony_ci	bbsi.l	a4, 0, .Lback5
4558c2ecf20Sopenharmony_ci	abi_ret_default
4568c2ecf20Sopenharmony_ci.Lback5:
4578c2ecf20Sopenharmony_ci	# copy 1 byte
4588c2ecf20Sopenharmony_ci	addi	a3, a3, -1
4598c2ecf20Sopenharmony_ci	l8ui	a6, a3,  0
4608c2ecf20Sopenharmony_ci	addi	a5, a5, -1
4618c2ecf20Sopenharmony_ci	s8i	a6, a5,  0
4628c2ecf20Sopenharmony_ci	abi_ret_default
4638c2ecf20Sopenharmony_ci
4648c2ecf20Sopenharmony_ci/*
4658c2ecf20Sopenharmony_ci * Destination is aligned, Source is unaligned
 *
 * Entered from .Lbackdstaligned with a7 = len >> 4 and a8 = 3.
 * a3 (the source end pointer) is rounded down to a word boundary and
 * the removed offset is kept in a11.  Each output word is then built
 * from two neighbouring aligned source words with __src_b
 * (asm/asmmacro.h; presumably a funnel shift by the amount that __ssa8
 * derives from the low bits of the source address - confirm against the
 * macro definitions).
 * Between pieces a6 holds the source word loaded last, i.e. the one at
 * the lowest address read so far, which supplies the upper part of the
 * next output word.
4668c2ecf20Sopenharmony_ci */
4678c2ecf20Sopenharmony_ci
4688c2ecf20Sopenharmony_ci	.align	4
4698c2ecf20Sopenharmony_ci.Lbacksrcunaligned:
4708c2ecf20Sopenharmony_ci	_beqz	a4, .Lbackdone	# avoid loading anything for zero-length copies
4718c2ecf20Sopenharmony_ci	# copy 16 bytes per iteration for word-aligned dst and unaligned src
4728c2ecf20Sopenharmony_ci	__ssa8	a3		# set shift amount from byte offset
/* Same definition as in the memcpy section earlier in this file. */
4738c2ecf20Sopenharmony_ci#define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS with
4748c2ecf20Sopenharmony_ci					 * the lint or ferret client, or 0
4758c2ecf20Sopenharmony_ci					 * to save a few cycles */
4768c2ecf20Sopenharmony_ci#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
4778c2ecf20Sopenharmony_ci	and	a11, a3, a8	# save unalignment offset for below
4788c2ecf20Sopenharmony_ci	sub	a3, a3, a11	# align a3
4798c2ecf20Sopenharmony_ci#endif
4808c2ecf20Sopenharmony_ci	l32i	a6, a3, 0	# load first word
4818c2ecf20Sopenharmony_ci#if XCHAL_HAVE_LOOPS
4828c2ecf20Sopenharmony_ci	loopnez	a7, .backLoop2done	# run the body a7 times, skip it when a7 == 0
4838c2ecf20Sopenharmony_ci#else /* !XCHAL_HAVE_LOOPS */
4848c2ecf20Sopenharmony_ci	beqz	a7, .backLoop2done	# fewer than 16 bytes: tail only
4858c2ecf20Sopenharmony_ci	slli	a10, a7, 4	# a10 = bytes handled by the 16B loop
4868c2ecf20Sopenharmony_ci	sub	a10, a3, a10	# a10 = start of first 16B source chunk
4878c2ecf20Sopenharmony_ci#endif /* !XCHAL_HAVE_LOOPS */
4888c2ecf20Sopenharmony_ci.backLoop2:
	/*
	 * After the decrement a6 is the word at a3 + 16 (loaded earlier);
	 * a7, a8, a9 = words at +12, +8, +4.  a6 is reloaded from +0 for
	 * the lowest output word and is then already in place for the
	 * next pass.  a8 and a7 are overwritten here, which is why the
	 * start pointer lives in a10.
	 */
4898c2ecf20Sopenharmony_ci	addi	a3, a3, -16
4908c2ecf20Sopenharmony_ci	l32i	a7, a3, 12
4918c2ecf20Sopenharmony_ci	l32i	a8, a3,  8
4928c2ecf20Sopenharmony_ci	addi	a5, a5, -16
4938c2ecf20Sopenharmony_ci	__src_b	a6, a7, a6
4948c2ecf20Sopenharmony_ci	s32i	a6, a5, 12
4958c2ecf20Sopenharmony_ci	l32i	a9, a3,  4
4968c2ecf20Sopenharmony_ci	__src_b	a7, a8, a7
4978c2ecf20Sopenharmony_ci	s32i	a7, a5,  8
4988c2ecf20Sopenharmony_ci	l32i	a6, a3,  0
4998c2ecf20Sopenharmony_ci	__src_b	a8, a9, a8
5008c2ecf20Sopenharmony_ci	s32i	a8, a5,  4
5018c2ecf20Sopenharmony_ci	__src_b	a9, a6, a9
5028c2ecf20Sopenharmony_ci	s32i	a9, a5,  0
5038c2ecf20Sopenharmony_ci#if !XCHAL_HAVE_LOOPS
5048c2ecf20Sopenharmony_ci	bne	a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start
5058c2ecf20Sopenharmony_ci#endif /* !XCHAL_HAVE_LOOPS */
5068c2ecf20Sopenharmony_ci.backLoop2done:
5078c2ecf20Sopenharmony_ci	bbci.l	a4, 3, .Lback12	# skip unless bit 3 of len is set
5088c2ecf20Sopenharmony_ci	# copy 8 bytes
5098c2ecf20Sopenharmony_ci	addi	a3, a3, -8
5108c2ecf20Sopenharmony_ci	l32i	a7, a3,  4
5118c2ecf20Sopenharmony_ci	l32i	a8, a3,  0
5128c2ecf20Sopenharmony_ci	addi	a5, a5, -8
5138c2ecf20Sopenharmony_ci	__src_b	a6, a7, a6
5148c2ecf20Sopenharmony_ci	s32i	a6, a5,  4
5158c2ecf20Sopenharmony_ci	__src_b	a7, a8, a7
5168c2ecf20Sopenharmony_ci	s32i	a7, a5,  0
5178c2ecf20Sopenharmony_ci	mov	a6, a8		# a6 = last word loaded, for the next piece
5188c2ecf20Sopenharmony_ci.Lback12:
5198c2ecf20Sopenharmony_ci	bbci.l	a4, 2, .Lback13	# skip unless bit 2 of len is set
5208c2ecf20Sopenharmony_ci	# copy 4 bytes
5218c2ecf20Sopenharmony_ci	addi	a3, a3, -4
5228c2ecf20Sopenharmony_ci	l32i	a7, a3,  0
5238c2ecf20Sopenharmony_ci	addi	a5, a5, -4
5248c2ecf20Sopenharmony_ci	__src_b	a6, a7, a6
5258c2ecf20Sopenharmony_ci	s32i	a6, a5,  0
5268c2ecf20Sopenharmony_ci	mov	a6, a7		# a6 = last word loaded
5278c2ecf20Sopenharmony_ci.Lback13:
	/* The last 2 and 1 byte pieces are copied with plain byte accesses. */
5288c2ecf20Sopenharmony_ci#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
5298c2ecf20Sopenharmony_ci	add	a3, a3, a11	# readjust a3 with correct misalignment
5308c2ecf20Sopenharmony_ci#endif
5318c2ecf20Sopenharmony_ci	bbsi.l	a4, 1, .Lback14
5328c2ecf20Sopenharmony_ci	bbsi.l	a4, 0, .Lback15
5338c2ecf20Sopenharmony_ci.Lbackdone:
5348c2ecf20Sopenharmony_ci	abi_ret_default
5358c2ecf20Sopenharmony_ci.Lback14:
5368c2ecf20Sopenharmony_ci	# copy 2 bytes
5378c2ecf20Sopenharmony_ci	addi	a3, a3, -2
5388c2ecf20Sopenharmony_ci	l8ui	a6, a3,  0
5398c2ecf20Sopenharmony_ci	l8ui	a7, a3,  1
5408c2ecf20Sopenharmony_ci	addi	a5, a5, -2
5418c2ecf20Sopenharmony_ci	s8i	a6, a5,  0
5428c2ecf20Sopenharmony_ci	s8i	a7, a5,  1
5438c2ecf20Sopenharmony_ci	bbsi.l	a4, 0, .Lback15
5448c2ecf20Sopenharmony_ci	abi_ret_default
5458c2ecf20Sopenharmony_ci.Lback15:
5468c2ecf20Sopenharmony_ci	# copy 1 byte
5478c2ecf20Sopenharmony_ci	addi	a3, a3, -1
5488c2ecf20Sopenharmony_ci	addi	a5, a5, -1
5498c2ecf20Sopenharmony_ci	l8ui	a6, a3,  0
5508c2ecf20Sopenharmony_ci	s8i	a6, a5,  0
5518c2ecf20Sopenharmony_ci	abi_ret_default
5528c2ecf20Sopenharmony_ci
5538c2ecf20Sopenharmony_ciENDPROC(__memmove)
554