162306a36Sopenharmony_ci/*
262306a36Sopenharmony_ci * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
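362306a36Sopenharmony_ci * xthal_memcpy and xthal_bcopy; the entry points defined in this file are
 * __memcpy (weak symbol memcpy) and __memmove (weak symbol memmove)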
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * This file is subject to the terms and conditions of the GNU General Public
662306a36Sopenharmony_ci * License.  See the file "COPYING" in the main directory of this archive
762306a36Sopenharmony_ci * for more details.
862306a36Sopenharmony_ci *
962306a36Sopenharmony_ci * Copyright (C) 2002 - 2012 Tensilica Inc.
1062306a36Sopenharmony_ci */
1162306a36Sopenharmony_ci
1262306a36Sopenharmony_ci#include <linux/linkage.h>
1362306a36Sopenharmony_ci#include <asm/asmmacro.h>
1462306a36Sopenharmony_ci#include <asm/core.h>
1562306a36Sopenharmony_ci
1662306a36Sopenharmony_ci/*
1762306a36Sopenharmony_ci * void *memcpy(void *dst, const void *src, size_t len);
1862306a36Sopenharmony_ci *
1962306a36Sopenharmony_ci * This function is intended to do the same thing as the standard
2062306a36Sopenharmony_ci * library function memcpy() for most cases.
2162306a36Sopenharmony_ci * However, where the source and/or destination references
2262306a36Sopenharmony_ci * an instruction RAM or ROM or a data RAM or ROM, that
2362306a36Sopenharmony_ci * source and/or destination will always be accessed with
2462306a36Sopenharmony_ci * 32-bit load and store instructions (as required for these
2562306a36Sopenharmony_ci * types of devices).
2662306a36Sopenharmony_ci *
2762306a36Sopenharmony_ci * !!!!!!!  XTFIXME:
2862306a36Sopenharmony_ci * !!!!!!!  Handling of IRAM/IROM has not yet
2962306a36Sopenharmony_ci * !!!!!!!  been implemented.
3062306a36Sopenharmony_ci *
3162306a36Sopenharmony_ci * The (general case) algorithm is as follows:
3262306a36Sopenharmony_ci *   If destination is unaligned, align it by conditionally
3362306a36Sopenharmony_ci *     copying 1 and 2 bytes.
3462306a36Sopenharmony_ci *   If source is aligned,
3562306a36Sopenharmony_ci *     do 16 bytes with a loop, and then finish up with
3662306a36Sopenharmony_ci *     8, 4, 2, and 1 byte copies conditional on the length;
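3762306a36Sopenharmony_ci *   else (if source is unaligned),
3862306a36Sopenharmony_ci *     do the same, but load aligned words from the rounded-down source
 *     address and use SRC (through the __src_b macro) on each pair of
 *     adjacent words to realign the source data.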
3962306a36Sopenharmony_ci *   This code tries to use fall-through branches for the common
4062306a36Sopenharmony_ci *     case of aligned source and destination and multiple
4162306a36Sopenharmony_ci *     of 4 (or 8) length.
4262306a36Sopenharmony_ci *
4362306a36Sopenharmony_ci * Register use:
4462306a36Sopenharmony_ci *	a0/ return address
4562306a36Sopenharmony_ci *	a1/ stack pointer
4662306a36Sopenharmony_ci *	a2/ return value
4762306a36Sopenharmony_ci *	a3/ src
4862306a36Sopenharmony_ci *	a4/ length
4962306a36Sopenharmony_ci *	a5/ dst
5062306a36Sopenharmony_ci *	a6/ tmp
5162306a36Sopenharmony_ci *	a7/ tmp
5262306a36Sopenharmony_ci *	a8/ tmp
5362306a36Sopenharmony_ci *	a9/ tmp
5462306a36Sopenharmony_ci *	a10/ tmp
5562306a36Sopenharmony_ci *	a11/ tmp
5662306a36Sopenharmony_ci */
5762306a36Sopenharmony_ci
5862306a36Sopenharmony_ci	.text
5962306a36Sopenharmony_ci
6062306a36Sopenharmony_ci/*
6162306a36Sopenharmony_ci * Byte by byte copy
 *
 * Forward copy of a4 bytes from a3 (src) to a5 (dst), one byte at a time.
 * Branched to from .Ldst1mod2 and .Ldst2mod4 when the destination is not
 * word-aligned and the length is below their thresholds.
 * A length of zero copies nothing.
 * This path does not rejoin the main algorithm: it ends with
 * abi_ret_default.  a2 is not written here.
 * Clobbers a3, a5, a6 (and a7 when XCHAL_HAVE_LOOPS is 0).
6262306a36Sopenharmony_ci */
6362306a36Sopenharmony_ci	.align	4
6462306a36Sopenharmony_ci	.byte	0		# 1 mod 4 alignment for LOOPNEZ
6562306a36Sopenharmony_ci				# (0 mod 4 alignment for LBEG)
6662306a36Sopenharmony_ci.Lbytecopy:
6762306a36Sopenharmony_ci#if XCHAL_HAVE_LOOPS
6862306a36Sopenharmony_ci	loopnez	a4, .Lbytecopydone	# hardware loop, a4 iterations; skipped if a4 == 0
6962306a36Sopenharmony_ci#else /* !XCHAL_HAVE_LOOPS */
7062306a36Sopenharmony_ci	beqz	a4, .Lbytecopydone	# nothing to copy
7162306a36Sopenharmony_ci	add	a7, a3, a4	# a7 = end address for source
7262306a36Sopenharmony_ci#endif /* !XCHAL_HAVE_LOOPS */
7362306a36Sopenharmony_ci.Lnextbyte:
7462306a36Sopenharmony_ci	l8ui	a6, a3, 0	# a6 = next source byte
7562306a36Sopenharmony_ci	addi	a3, a3, 1
7662306a36Sopenharmony_ci	s8i	a6, a5, 0	# store it to the destination
7762306a36Sopenharmony_ci	addi	a5, a5, 1
7862306a36Sopenharmony_ci#if !XCHAL_HAVE_LOOPS
7962306a36Sopenharmony_ci	bne	a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
8062306a36Sopenharmony_ci#endif /* !XCHAL_HAVE_LOOPS */
8162306a36Sopenharmony_ci.Lbytecopydone:
8262306a36Sopenharmony_ci	abi_ret_default
8362306a36Sopenharmony_ci
8462306a36Sopenharmony_ci/*
8562306a36Sopenharmony_ci * Destination is unaligned
 *
 * Entered from .Lcommon with a3 = src, a4 = len, a5 = dst.
 * Copies one and/or two leading bytes (using byte loads and byte stores)
 * so that a5 becomes word-aligned, then continues at .Ldstaligned.
 * Copies shorter than the thresholds checked below are handed to
 * .Lbytecopy instead.  On every path that reaches .Ldstaligned from here
 * at least 4 bytes remain in a4 (7 - 1 - 2, or 6 - 2).
8662306a36Sopenharmony_ci */
8762306a36Sopenharmony_ci
8862306a36Sopenharmony_ci	.align	4
8962306a36Sopenharmony_ci.Ldst1mod2:	# dst is only byte aligned
9062306a36Sopenharmony_ci	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte
9162306a36Sopenharmony_ci
9262306a36Sopenharmony_ci	# copy 1 byte
9362306a36Sopenharmony_ci	l8ui	a6, a3,  0
9462306a36Sopenharmony_ci	addi	a3, a3,  1
9562306a36Sopenharmony_ci	addi	a4, a4, -1
9662306a36Sopenharmony_ci	s8i	a6, a5,  0
9762306a36Sopenharmony_ci	addi	a5, a5,  1
9862306a36Sopenharmony_ci	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
9962306a36Sopenharmony_ci					# return to main algorithm
					# (otherwise dst is now 2 mod 4:
					# fall through)
10062306a36Sopenharmony_ci.Ldst2mod4:	# dst 16-bit aligned
10162306a36Sopenharmony_ci	# copy 2 bytes
10262306a36Sopenharmony_ci	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
10362306a36Sopenharmony_ci	l8ui	a6, a3,  0
10462306a36Sopenharmony_ci	l8ui	a7, a3,  1
10562306a36Sopenharmony_ci	addi	a3, a3,  2
10662306a36Sopenharmony_ci	addi	a4, a4, -2
10762306a36Sopenharmony_ci	s8i	a6, a5,  0
10862306a36Sopenharmony_ci	s8i	a7, a5,  1
10962306a36Sopenharmony_ci	addi	a5, a5,  2
11062306a36Sopenharmony_ci	j	.Ldstaligned	# dst is now aligned, return to main algorithm
11162306a36Sopenharmony_ci
/*
 * memcpy entry point, plus the copy path used when both source and
 * destination are word-aligned.
 *
 * In:  a2 = dst, a3 = src, a4 = len.
 * Out: a2 is never written, so it still holds dst on return.
 * Working registers: a5 = current dst, a3 = current src, a6-a8 = temporaries.
 *
 * .Lcommon is also a branch target from __memmove (forward-copy case),
 * which arrives there with a5 == a2.
 * .Ldstaligned is re-entered from .Ldst1mod2/.Ldst2mod4 once a5 has been
 * brought to a word boundary.
 */
11262306a36Sopenharmony_ciENTRY(__memcpy)
11362306a36Sopenharmony_ciWEAK(memcpy)
11462306a36Sopenharmony_ci
11562306a36Sopenharmony_ci	abi_entry_default
11662306a36Sopenharmony_ci	# a2/ dst, a3/ src, a4/ len
11762306a36Sopenharmony_ci	mov	a5, a2		# copy dst so that a2 is return value
11862306a36Sopenharmony_ci.Lcommon:
11962306a36Sopenharmony_ci	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
12062306a36Sopenharmony_ci	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
12162306a36Sopenharmony_ci.Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
12262306a36Sopenharmony_ci	srli	a7, a4, 4	# number of loop iterations with 16B
12362306a36Sopenharmony_ci				# per iteration
12462306a36Sopenharmony_ci	movi	a8, 3		# if source is not aligned,
12562306a36Sopenharmony_ci	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
	# note: .Lsrcunaligned relies on a8 still being 3 (byte-offset mask)
12662306a36Sopenharmony_ci	/*
12762306a36Sopenharmony_ci	 * Destination and source are word-aligned, use word copy.
12862306a36Sopenharmony_ci	 */
12962306a36Sopenharmony_ci	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
13062306a36Sopenharmony_ci#if XCHAL_HAVE_LOOPS
13162306a36Sopenharmony_ci	loopnez	a7, .Loop1done
13262306a36Sopenharmony_ci#else /* !XCHAL_HAVE_LOOPS */
13362306a36Sopenharmony_ci	beqz	a7, .Loop1done
13462306a36Sopenharmony_ci	slli	a8, a7, 4
13562306a36Sopenharmony_ci	add	a8, a8, a3	# a8 = end of last 16B source chunk
13662306a36Sopenharmony_ci#endif /* !XCHAL_HAVE_LOOPS */
13762306a36Sopenharmony_ci.Loop1:
	# four word loads and four word stores, interleaved, alternating
	# between a6 and a7 as the data register
13862306a36Sopenharmony_ci	l32i	a6, a3,  0
13962306a36Sopenharmony_ci	l32i	a7, a3,  4
14062306a36Sopenharmony_ci	s32i	a6, a5,  0
14162306a36Sopenharmony_ci	l32i	a6, a3,  8
14262306a36Sopenharmony_ci	s32i	a7, a5,  4
14362306a36Sopenharmony_ci	l32i	a7, a3, 12
14462306a36Sopenharmony_ci	s32i	a6, a5,  8
14562306a36Sopenharmony_ci	addi	a3, a3, 16
14662306a36Sopenharmony_ci	s32i	a7, a5, 12
14762306a36Sopenharmony_ci	addi	a5, a5, 16
14862306a36Sopenharmony_ci#if !XCHAL_HAVE_LOOPS
14962306a36Sopenharmony_ci	bne	a3, a8, .Loop1  # continue loop if a3:src != a8:src_end
15062306a36Sopenharmony_ci#endif /* !XCHAL_HAVE_LOOPS */
15162306a36Sopenharmony_ci.Loop1done:
	# The remaining 0..15 bytes are selected by bits 3..0 of the length;
	# a4 itself is not modified from here on, only its bits are tested.
15262306a36Sopenharmony_ci	bbci.l	a4, 3, .L2
15362306a36Sopenharmony_ci	# copy 8 bytes
15462306a36Sopenharmony_ci	l32i	a6, a3,  0
15562306a36Sopenharmony_ci	l32i	a7, a3,  4
15662306a36Sopenharmony_ci	addi	a3, a3,  8
15762306a36Sopenharmony_ci	s32i	a6, a5,  0
15862306a36Sopenharmony_ci	s32i	a7, a5,  4
15962306a36Sopenharmony_ci	addi	a5, a5,  8
16062306a36Sopenharmony_ci.L2:
16162306a36Sopenharmony_ci	bbsi.l	a4, 2, .L3
16262306a36Sopenharmony_ci	bbsi.l	a4, 1, .L4
16362306a36Sopenharmony_ci	bbsi.l	a4, 0, .L5
16462306a36Sopenharmony_ci	abi_ret_default		# length was a multiple of 8: done
16562306a36Sopenharmony_ci.L3:
16662306a36Sopenharmony_ci	# copy 4 bytes
16762306a36Sopenharmony_ci	l32i	a6, a3,  0
16862306a36Sopenharmony_ci	addi	a3, a3,  4
16962306a36Sopenharmony_ci	s32i	a6, a5,  0
17062306a36Sopenharmony_ci	addi	a5, a5,  4
17162306a36Sopenharmony_ci	bbsi.l	a4, 1, .L4
17262306a36Sopenharmony_ci	bbsi.l	a4, 0, .L5
17362306a36Sopenharmony_ci	abi_ret_default
17462306a36Sopenharmony_ci.L4:
17562306a36Sopenharmony_ci	# copy 2 bytes
	# (a3 and a5 started word-aligned on this path and have only advanced
	# by multiples of 4, so the 16-bit accesses are aligned)
17662306a36Sopenharmony_ci	l16ui	a6, a3,  0
17762306a36Sopenharmony_ci	addi	a3, a3,  2
17862306a36Sopenharmony_ci	s16i	a6, a5,  0
17962306a36Sopenharmony_ci	addi	a5, a5,  2
18062306a36Sopenharmony_ci	bbsi.l	a4, 0, .L5
18162306a36Sopenharmony_ci	abi_ret_default
18262306a36Sopenharmony_ci.L5:
18362306a36Sopenharmony_ci	# copy 1 byte
18462306a36Sopenharmony_ci	l8ui	a6, a3,  0
18562306a36Sopenharmony_ci	s8i	a6, a5,  0
18662306a36Sopenharmony_ci	abi_ret_default
18762306a36Sopenharmony_ci
18862306a36Sopenharmony_ci/*
18962306a36Sopenharmony_ci * Destination is aligned, Source is unaligned
 *
 * Entered from .Ldstaligned with:
 *   a3 = src (low two bits non-zero), a5 = dst (word-aligned),
 *   a4 = remaining length, a7 = a4 / 16, a8 = 3.
 * Source data is read as aligned words from the rounded-down source
 * address; each destination word is built from two adjacent source words
 * with __src_b, using the shift amount set up by __ssa8 (both macros are
 * defined outside this file).
 * a6 always carries the most recently loaded source word that has not
 * been fully consumed yet; it is handed from the loop to the 8-byte and
 * 4-byte tail steps.
 * a11 holds the byte offset that was removed from a3; it is added back at
 * .L13 so the last 2 and 1 bytes can be copied with byte accesses from
 * the true source address.
19062306a36Sopenharmony_ci */
19162306a36Sopenharmony_ci
19262306a36Sopenharmony_ci	.align	4
19362306a36Sopenharmony_ci.Lsrcunaligned:
19462306a36Sopenharmony_ci	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
19562306a36Sopenharmony_ci	# copy 16 bytes per iteration for word-aligned dst and unaligned src
19662306a36Sopenharmony_ci	__ssa8	a3		# set shift amount from byte offset
19762306a36Sopenharmony_ci
19862306a36Sopenharmony_ci/* set to 1 when running on ISS (simulator) with the
19962306a36Sopenharmony_ci   lint or ferret client, or 0 to save a few cycles */
/* As written, the value is 1, so the two conditional sections below are
   always assembled regardless of XCHAL_UNALIGNED_LOAD_EXCEPTION. */
20062306a36Sopenharmony_ci#define SIM_CHECKS_ALIGNMENT	1
20162306a36Sopenharmony_ci#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
20262306a36Sopenharmony_ci	and	a11, a3, a8	# save unalignment offset for below
20362306a36Sopenharmony_ci	sub	a3, a3, a11	# align a3
20462306a36Sopenharmony_ci#endif
20562306a36Sopenharmony_ci	l32i	a6, a3, 0	# load first word
20662306a36Sopenharmony_ci#if XCHAL_HAVE_LOOPS
20762306a36Sopenharmony_ci	loopnez	a7, .Loop2done
20862306a36Sopenharmony_ci#else /* !XCHAL_HAVE_LOOPS */
20962306a36Sopenharmony_ci	beqz	a7, .Loop2done
21062306a36Sopenharmony_ci	slli	a10, a7, 4
21162306a36Sopenharmony_ci	add	a10, a10, a3	# a10 = end of last 16B source chunk
21262306a36Sopenharmony_ci#endif /* !XCHAL_HAVE_LOOPS */
21362306a36Sopenharmony_ci.Loop2:
	# Per iteration: load the words at a3+4 .. a3+16, combine each
	# neighbouring pair into one destination word, store four words.
	# The word loaded from a3+16 stays in a6 for the next iteration
	# (or for the tail code).
21462306a36Sopenharmony_ci	l32i	a7, a3,  4
21562306a36Sopenharmony_ci	l32i	a8, a3,  8
21662306a36Sopenharmony_ci	__src_b	a6, a6, a7
21762306a36Sopenharmony_ci	s32i	a6, a5,  0
21862306a36Sopenharmony_ci	l32i	a9, a3, 12
21962306a36Sopenharmony_ci	__src_b	a7, a7, a8
22062306a36Sopenharmony_ci	s32i	a7, a5,  4
22162306a36Sopenharmony_ci	l32i	a6, a3, 16
22262306a36Sopenharmony_ci	__src_b	a8, a8, a9
22362306a36Sopenharmony_ci	s32i	a8, a5,  8
22462306a36Sopenharmony_ci	addi	a3, a3, 16
22562306a36Sopenharmony_ci	__src_b	a9, a9, a6
22662306a36Sopenharmony_ci	s32i	a9, a5, 12
22762306a36Sopenharmony_ci	addi	a5, a5, 16
22862306a36Sopenharmony_ci#if !XCHAL_HAVE_LOOPS
22962306a36Sopenharmony_ci	bne	a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
23062306a36Sopenharmony_ci#endif /* !XCHAL_HAVE_LOOPS */
23162306a36Sopenharmony_ci.Loop2done:
23262306a36Sopenharmony_ci	bbci.l	a4, 3, .L12
23362306a36Sopenharmony_ci	# copy 8 bytes
23462306a36Sopenharmony_ci	l32i	a7, a3,  4
23562306a36Sopenharmony_ci	l32i	a8, a3,  8
23662306a36Sopenharmony_ci	__src_b	a6, a6, a7
23762306a36Sopenharmony_ci	s32i	a6, a5,  0
23862306a36Sopenharmony_ci	addi	a3, a3,  8
23962306a36Sopenharmony_ci	__src_b	a7, a7, a8
24062306a36Sopenharmony_ci	s32i	a7, a5,  4
24162306a36Sopenharmony_ci	addi	a5, a5,  8
24262306a36Sopenharmony_ci	mov	a6, a8		# keep the last loaded word in a6 for .L12
24362306a36Sopenharmony_ci.L12:
24462306a36Sopenharmony_ci	bbci.l	a4, 2, .L13
24562306a36Sopenharmony_ci	# copy 4 bytes
24662306a36Sopenharmony_ci	l32i	a7, a3,  4
24762306a36Sopenharmony_ci	addi	a3, a3,  4
24862306a36Sopenharmony_ci	__src_b	a6, a6, a7
24962306a36Sopenharmony_ci	s32i	a6, a5,  0
25062306a36Sopenharmony_ci	addi	a5, a5,  4
25162306a36Sopenharmony_ci	mov	a6, a7
25262306a36Sopenharmony_ci.L13:
25362306a36Sopenharmony_ci#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
25462306a36Sopenharmony_ci	add	a3, a3, a11	# readjust a3 with correct misalignment
25562306a36Sopenharmony_ci#endif
	# a3 is again the true address of the next source byte; the last
	# 0..3 bytes are copied with byte loads and stores
25662306a36Sopenharmony_ci	bbsi.l	a4, 1, .L14
25762306a36Sopenharmony_ci	bbsi.l	a4, 0, .L15
25862306a36Sopenharmony_ci.Ldone:	abi_ret_default
25962306a36Sopenharmony_ci.L14:
26062306a36Sopenharmony_ci	# copy 2 bytes
26162306a36Sopenharmony_ci	l8ui	a6, a3,  0
26262306a36Sopenharmony_ci	l8ui	a7, a3,  1
26362306a36Sopenharmony_ci	addi	a3, a3,  2
26462306a36Sopenharmony_ci	s8i	a6, a5,  0
26562306a36Sopenharmony_ci	s8i	a7, a5,  1
26662306a36Sopenharmony_ci	addi	a5, a5,  2
26762306a36Sopenharmony_ci	bbsi.l	a4, 0, .L15
26862306a36Sopenharmony_ci	abi_ret_default
26962306a36Sopenharmony_ci.L15:
27062306a36Sopenharmony_ci	# copy 1 byte
27162306a36Sopenharmony_ci	l8ui	a6, a3,  0
27262306a36Sopenharmony_ci	s8i	a6, a5,  0
27362306a36Sopenharmony_ci	abi_ret_default
27462306a36Sopenharmony_ci
27562306a36Sopenharmony_ciENDPROC(__memcpy)
27662306a36Sopenharmony_ciEXPORT_SYMBOL(__memcpy)
27762306a36Sopenharmony_ciEXPORT_SYMBOL(memcpy)
27862306a36Sopenharmony_ci
27962306a36Sopenharmony_ci/*
28062306a36Sopenharmony_ci * void *memmove(void *dst, const void *src, size_t len);
28162306a36Sopenharmony_ci *
28262306a36Sopenharmony_ci * This function is intended to do the same thing as the standard
28362306a36Sopenharmony_ci * library function memmove() for most cases.
28462306a36Sopenharmony_ci * However, where the source and/or destination references
28562306a36Sopenharmony_ci * an instruction RAM or ROM or a data RAM or ROM, that
28662306a36Sopenharmony_ci * source and/or destination will always be accessed with
28762306a36Sopenharmony_ci * 32-bit load and store instructions (as required for these
28862306a36Sopenharmony_ci * types of devices).
28962306a36Sopenharmony_ci *
29062306a36Sopenharmony_ci * !!!!!!!  XTFIXME:
29162306a36Sopenharmony_ci * !!!!!!!  Handling of IRAM/IROM has not yet
29262306a36Sopenharmony_ci * !!!!!!!  been implemented.
29362306a36Sopenharmony_ci *
29462306a36Sopenharmony_ci * The (general case) algorithm is as follows:
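29562306a36Sopenharmony_ci *   If the destination does not start inside [src, src + len), i.e. the
 *     unsigned difference (dst - src) is >= len, a forward copy cannot
 *     overwrite source bytes that are still to be read, so use memcpy.
29662306a36Sopenharmony_ci *   Otherwise do memcpy backwards, starting from the end of both buffers.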
29762306a36Sopenharmony_ci *
29862306a36Sopenharmony_ci * Register use:
29962306a36Sopenharmony_ci *	a0/ return address
30062306a36Sopenharmony_ci *	a1/ stack pointer
30162306a36Sopenharmony_ci *	a2/ return value
30262306a36Sopenharmony_ci *	a3/ src
30362306a36Sopenharmony_ci *	a4/ length
30462306a36Sopenharmony_ci *	a5/ dst
30562306a36Sopenharmony_ci *	a6/ tmp
30662306a36Sopenharmony_ci *	a7/ tmp
30762306a36Sopenharmony_ci *	a8/ tmp
30862306a36Sopenharmony_ci *	a9/ tmp
30962306a36Sopenharmony_ci *	a10/ tmp
31062306a36Sopenharmony_ci *	a11/ tmp
31162306a36Sopenharmony_ci */
31262306a36Sopenharmony_ci
31362306a36Sopenharmony_ci/*
31462306a36Sopenharmony_ci * Byte by byte copy
 *
 * Backward copy of a4 bytes, one byte at a time.  On entry a3 and a5
 * point one past the last byte still to be copied (source and destination
 * respectively); both pointers are decremented before each access.
 * Branched to from .Lbackdst1mod2 and .Lbackdst2mod4 when the length is
 * below their thresholds.  A length of zero copies nothing.
 * This path does not rejoin the main algorithm: it ends with
 * abi_ret_default.  a2 is not written here.
31562306a36Sopenharmony_ci */
31662306a36Sopenharmony_ci	.align	4
31762306a36Sopenharmony_ci	.byte	0		# 1 mod 4 alignment for LOOPNEZ
31862306a36Sopenharmony_ci				# (0 mod 4 alignment for LBEG)
31962306a36Sopenharmony_ci.Lbackbytecopy:
32062306a36Sopenharmony_ci#if XCHAL_HAVE_LOOPS
32162306a36Sopenharmony_ci	loopnez	a4, .Lbackbytecopydone	# hardware loop, a4 iterations; skipped if a4 == 0
32262306a36Sopenharmony_ci#else /* !XCHAL_HAVE_LOOPS */
32362306a36Sopenharmony_ci	beqz	a4, .Lbackbytecopydone	# nothing to copy
32462306a36Sopenharmony_ci	sub	a7, a3, a4	# a7 = start address for source
32562306a36Sopenharmony_ci#endif /* !XCHAL_HAVE_LOOPS */
32662306a36Sopenharmony_ci.Lbacknextbyte:
32762306a36Sopenharmony_ci	addi	a3, a3, -1	# step back first, then load
32862306a36Sopenharmony_ci	l8ui	a6, a3, 0
32962306a36Sopenharmony_ci	addi	a5, a5, -1
33062306a36Sopenharmony_ci	s8i	a6, a5, 0
33162306a36Sopenharmony_ci#if !XCHAL_HAVE_LOOPS
33262306a36Sopenharmony_ci	bne	a3, a7, .Lbacknextbyte # continue loop if
33362306a36Sopenharmony_ci				       # $a3:src != $a7:src_start
33462306a36Sopenharmony_ci#endif /* !XCHAL_HAVE_LOOPS */
33562306a36Sopenharmony_ci.Lbackbytecopydone:
33662306a36Sopenharmony_ci	abi_ret_default
33762306a36Sopenharmony_ci
33862306a36Sopenharmony_ci/*
33962306a36Sopenharmony_ci * Destination is unaligned
 *
 * Backward-copy counterpart of the forward alignment fix-up.  Entered
 * from __memmove with a3 and a5 pointing one past the end of the source
 * and destination, and a4 = len.  Copies one and/or two trailing bytes
 * (byte loads and byte stores) so that the end pointer in a5 becomes
 * word-aligned, then continues at .Lbackdstaligned.
 * Copies shorter than the thresholds checked below are handed to
 * .Lbackbytecopy instead.  On every path that reaches .Lbackdstaligned
 * from here at least 4 bytes remain in a4.
34062306a36Sopenharmony_ci */
34162306a36Sopenharmony_ci
34262306a36Sopenharmony_ci	.align	4
34362306a36Sopenharmony_ci.Lbackdst1mod2:	# dst is only byte aligned
34462306a36Sopenharmony_ci	_bltui	a4, 7, .Lbackbytecopy	# do short copies byte by byte
34562306a36Sopenharmony_ci
34662306a36Sopenharmony_ci	# copy 1 byte
34762306a36Sopenharmony_ci	addi	a3, a3, -1
34862306a36Sopenharmony_ci	l8ui	a6, a3,  0
34962306a36Sopenharmony_ci	addi	a5, a5, -1
35062306a36Sopenharmony_ci	s8i	a6, a5,  0
35162306a36Sopenharmony_ci	addi	a4, a4, -1
35262306a36Sopenharmony_ci	_bbci.l	a5, 1, .Lbackdstaligned	# if dst is now aligned, then
35362306a36Sopenharmony_ci					# return to main algorithm
					# (otherwise dst is now 2 mod 4:
					# fall through)
35462306a36Sopenharmony_ci.Lbackdst2mod4:	# dst 16-bit aligned
35562306a36Sopenharmony_ci	# copy 2 bytes
35662306a36Sopenharmony_ci	_bltui	a4, 6, .Lbackbytecopy	# do short copies byte by byte
35762306a36Sopenharmony_ci	addi	a3, a3, -2
35862306a36Sopenharmony_ci	l8ui	a6, a3,  0
35962306a36Sopenharmony_ci	l8ui	a7, a3,  1
36062306a36Sopenharmony_ci	addi	a5, a5, -2
36162306a36Sopenharmony_ci	s8i	a6, a5,  0
36262306a36Sopenharmony_ci	s8i	a7, a5,  1
36362306a36Sopenharmony_ci	addi	a4, a4, -2
36462306a36Sopenharmony_ci	j	.Lbackdstaligned	# dst is now aligned,
36562306a36Sopenharmony_ci					# return to main algorithm
36662306a36Sopenharmony_ci
/*
 * memmove entry point, overlap test, and the backward copy path used when
 * the ends of both source and destination are word-aligned.
 *
 * In:  a2 = dst, a3 = src, a4 = len.
 * Out: a2 is never written, so it still holds dst on return.
 *
 * If the unsigned difference (dst - src) is >= len, control transfers to
 * .Lcommon inside __memcpy and the copy runs forward.  Otherwise a3 and a5
 * are advanced to one past the end of the source and destination and the
 * copy runs from the highest address downwards; every step decrements the
 * pointers before accessing memory.
 */
36762306a36Sopenharmony_ciENTRY(__memmove)
36862306a36Sopenharmony_ciWEAK(memmove)
36962306a36Sopenharmony_ci
37062306a36Sopenharmony_ci	abi_entry_default
37162306a36Sopenharmony_ci	# a2/ dst, a3/ src, a4/ len
37262306a36Sopenharmony_ci	mov	a5, a2		# copy dst so that a2 is return value
37362306a36Sopenharmony_ci.Lmovecommon:
37462306a36Sopenharmony_ci	sub	a6, a5, a3	# a6 = dst - src (wraps if dst < src)
37562306a36Sopenharmony_ci	bgeu	a6, a4, .Lcommon	# unsigned >= len: forward copy is safe
37662306a36Sopenharmony_ci
37762306a36Sopenharmony_ci	add	a5, a5, a4	# a5 = one past the end of dst
37862306a36Sopenharmony_ci	add	a3, a3, a4	# a3 = one past the end of src
37962306a36Sopenharmony_ci
	# alignment is tested on the END of the destination
38062306a36Sopenharmony_ci	_bbsi.l	a5, 0, .Lbackdst1mod2	# if dst is 1 mod 2
38162306a36Sopenharmony_ci	_bbsi.l	a5, 1, .Lbackdst2mod4	# if dst is 2 mod 4
38262306a36Sopenharmony_ci.Lbackdstaligned:	# return here from .Lbackdst?mod? once dst is aligned
38362306a36Sopenharmony_ci	srli	a7, a4, 4	# number of loop iterations with 16B
38462306a36Sopenharmony_ci				# per iteration
38562306a36Sopenharmony_ci	movi	a8, 3		# if source is not aligned,
38662306a36Sopenharmony_ci	_bany	a3, a8, .Lbacksrcunaligned	# then use shifting copy
	# note: .Lbacksrcunaligned relies on a8 still being 3 (byte-offset mask)
38762306a36Sopenharmony_ci	/*
38862306a36Sopenharmony_ci	 * Destination and source are word-aligned, use word copy.
38962306a36Sopenharmony_ci	 */
39062306a36Sopenharmony_ci	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
39162306a36Sopenharmony_ci#if XCHAL_HAVE_LOOPS
39262306a36Sopenharmony_ci	loopnez	a7, .LbackLoop1done
39362306a36Sopenharmony_ci#else /* !XCHAL_HAVE_LOOPS */
39462306a36Sopenharmony_ci	beqz	a7, .LbackLoop1done
39562306a36Sopenharmony_ci	slli	a8, a7, 4
39662306a36Sopenharmony_ci	sub	a8, a3, a8	# a8 = start of first 16B source chunk
39762306a36Sopenharmony_ci#endif /* !XCHAL_HAVE_LOOPS */
39862306a36Sopenharmony_ci.LbackLoop1:
	# words are moved from the highest offset (12) down to 0
39962306a36Sopenharmony_ci	addi	a3, a3, -16
40062306a36Sopenharmony_ci	l32i	a7, a3, 12
40162306a36Sopenharmony_ci	l32i	a6, a3,  8
40262306a36Sopenharmony_ci	addi	a5, a5, -16
40362306a36Sopenharmony_ci	s32i	a7, a5, 12
40462306a36Sopenharmony_ci	l32i	a7, a3,  4
40562306a36Sopenharmony_ci	s32i	a6, a5,  8
40662306a36Sopenharmony_ci	l32i	a6, a3,  0
40762306a36Sopenharmony_ci	s32i	a7, a5,  4
40862306a36Sopenharmony_ci	s32i	a6, a5,  0
40962306a36Sopenharmony_ci#if !XCHAL_HAVE_LOOPS
41062306a36Sopenharmony_ci	bne	a3, a8, .LbackLoop1  # continue loop if a3:src != a8:src_start
41162306a36Sopenharmony_ci#endif /* !XCHAL_HAVE_LOOPS */
41262306a36Sopenharmony_ci.LbackLoop1done:
	# The remaining 0..15 bytes are selected by bits 3..0 of the length;
	# a4 itself is not modified from here on, only its bits are tested.
41362306a36Sopenharmony_ci	bbci.l	a4, 3, .Lback2
41462306a36Sopenharmony_ci	# copy 8 bytes
41562306a36Sopenharmony_ci	addi	a3, a3, -8
41662306a36Sopenharmony_ci	l32i	a6, a3,  0
41762306a36Sopenharmony_ci	l32i	a7, a3,  4
41862306a36Sopenharmony_ci	addi	a5, a5, -8
41962306a36Sopenharmony_ci	s32i	a6, a5,  0
42062306a36Sopenharmony_ci	s32i	a7, a5,  4
42162306a36Sopenharmony_ci.Lback2:
42262306a36Sopenharmony_ci	bbsi.l	a4, 2, .Lback3
42362306a36Sopenharmony_ci	bbsi.l	a4, 1, .Lback4
42462306a36Sopenharmony_ci	bbsi.l	a4, 0, .Lback5
42562306a36Sopenharmony_ci	abi_ret_default		# length was a multiple of 8: done
42662306a36Sopenharmony_ci.Lback3:
42762306a36Sopenharmony_ci	# copy 4 bytes
42862306a36Sopenharmony_ci	addi	a3, a3, -4
42962306a36Sopenharmony_ci	l32i	a6, a3,  0
43062306a36Sopenharmony_ci	addi	a5, a5, -4
43162306a36Sopenharmony_ci	s32i	a6, a5,  0
43262306a36Sopenharmony_ci	bbsi.l	a4, 1, .Lback4
43362306a36Sopenharmony_ci	bbsi.l	a4, 0, .Lback5
43462306a36Sopenharmony_ci	abi_ret_default
43562306a36Sopenharmony_ci.Lback4:
43662306a36Sopenharmony_ci	# copy 2 bytes
	# (a3 and a5 were word-aligned on this path and have only moved by
	# multiples of 4, so after stepping back 2 they are 2-byte aligned)
43762306a36Sopenharmony_ci	addi	a3, a3, -2
43862306a36Sopenharmony_ci	l16ui	a6, a3,  0
43962306a36Sopenharmony_ci	addi	a5, a5, -2
44062306a36Sopenharmony_ci	s16i	a6, a5,  0
44162306a36Sopenharmony_ci	bbsi.l	a4, 0, .Lback5
44262306a36Sopenharmony_ci	abi_ret_default
44362306a36Sopenharmony_ci.Lback5:
44462306a36Sopenharmony_ci	# copy 1 byte
44562306a36Sopenharmony_ci	addi	a3, a3, -1
44662306a36Sopenharmony_ci	l8ui	a6, a3,  0
44762306a36Sopenharmony_ci	addi	a5, a5, -1
44862306a36Sopenharmony_ci	s8i	a6, a5,  0
44962306a36Sopenharmony_ci	abi_ret_default
45062306a36Sopenharmony_ci
45162306a36Sopenharmony_ci/*
45262306a36Sopenharmony_ci * Destination is aligned, Source is unaligned
 *
 * Backward shifting copy.  Entered from .Lbackdstaligned with:
 *   a3 = one past the end of the source (low two bits non-zero),
 *   a5 = one past the end of the destination (word-aligned),
 *   a4 = remaining length, a7 = a4 / 16, a8 = 3.
 * Source data is read as aligned words, walking downwards from the
 * rounded-down end address; each destination word is built from two
 * adjacent source words with __src_b, using the shift amount set up by
 * __ssa8 (both macros are defined outside this file).
 * a6 always carries the most recently loaded source word, which is the
 * higher-addressed word of the next pair; __src_b is therefore invoked as
 * (result, lower word, higher word), the same operand order as in the
 * forward copy.
 * a11 holds the byte offset that was removed from a3; it is added back at
 * .Lback13 so the last 2 and 1 bytes can be copied with byte accesses.
45362306a36Sopenharmony_ci */
45462306a36Sopenharmony_ci
45562306a36Sopenharmony_ci	.align	4
45662306a36Sopenharmony_ci.Lbacksrcunaligned:
45762306a36Sopenharmony_ci	_beqz	a4, .Lbackdone	# avoid loading anything for zero-length copies
45862306a36Sopenharmony_ci	# copy 16 bytes per iteration for word-aligned dst and unaligned src
45962306a36Sopenharmony_ci	__ssa8	a3		# set shift amount from byte offset
46062306a36Sopenharmony_ci#define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS with
46162306a36Sopenharmony_ci					 * the lint or ferret client, or 0
46262306a36Sopenharmony_ci					 * to save a few cycles */
/* As written, the value is 1, so the two conditional sections below are
   always assembled regardless of XCHAL_UNALIGNED_LOAD_EXCEPTION. */
46362306a36Sopenharmony_ci#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
46462306a36Sopenharmony_ci	and	a11, a3, a8	# save unalignment offset for below
46562306a36Sopenharmony_ci	sub	a3, a3, a11	# align a3
46662306a36Sopenharmony_ci#endif
46762306a36Sopenharmony_ci	l32i	a6, a3, 0	# load first word
46862306a36Sopenharmony_ci#if XCHAL_HAVE_LOOPS
46962306a36Sopenharmony_ci	loopnez	a7, .LbackLoop2done
47062306a36Sopenharmony_ci#else /* !XCHAL_HAVE_LOOPS */
47162306a36Sopenharmony_ci	beqz	a7, .LbackLoop2done
47262306a36Sopenharmony_ci	slli	a10, a7, 4
47362306a36Sopenharmony_ci	sub	a10, a3, a10	# a10 = start of first 16B source chunk
47462306a36Sopenharmony_ci#endif /* !XCHAL_HAVE_LOOPS */
47562306a36Sopenharmony_ci.LbackLoop2:
	# Per iteration: step back 16 bytes, load the words at a3+12 .. a3+0,
	# combine each neighbouring pair into one destination word and store
	# four words.  The word loaded from a3+0 stays in a6 for the next
	# iteration (or for the tail code).
47662306a36Sopenharmony_ci	addi	a3, a3, -16
47762306a36Sopenharmony_ci	l32i	a7, a3, 12
47862306a36Sopenharmony_ci	l32i	a8, a3,  8
47962306a36Sopenharmony_ci	addi	a5, a5, -16
48062306a36Sopenharmony_ci	__src_b	a6, a7, a6
48162306a36Sopenharmony_ci	s32i	a6, a5, 12
48262306a36Sopenharmony_ci	l32i	a9, a3,  4
48362306a36Sopenharmony_ci	__src_b	a7, a8, a7
48462306a36Sopenharmony_ci	s32i	a7, a5,  8
48562306a36Sopenharmony_ci	l32i	a6, a3,  0
48662306a36Sopenharmony_ci	__src_b	a8, a9, a8
48762306a36Sopenharmony_ci	s32i	a8, a5,  4
48862306a36Sopenharmony_ci	__src_b	a9, a6, a9
48962306a36Sopenharmony_ci	s32i	a9, a5,  0
49062306a36Sopenharmony_ci#if !XCHAL_HAVE_LOOPS
49162306a36Sopenharmony_ci	bne	a3, a10, .LbackLoop2 # continue loop if a3:src != a10:src_start
49262306a36Sopenharmony_ci#endif /* !XCHAL_HAVE_LOOPS */
49362306a36Sopenharmony_ci.LbackLoop2done:
49462306a36Sopenharmony_ci	bbci.l	a4, 3, .Lback12
49562306a36Sopenharmony_ci	# copy 8 bytes
49662306a36Sopenharmony_ci	addi	a3, a3, -8
49762306a36Sopenharmony_ci	l32i	a7, a3,  4
49862306a36Sopenharmony_ci	l32i	a8, a3,  0
49962306a36Sopenharmony_ci	addi	a5, a5, -8
50062306a36Sopenharmony_ci	__src_b	a6, a7, a6
50162306a36Sopenharmony_ci	s32i	a6, a5,  4
50262306a36Sopenharmony_ci	__src_b	a7, a8, a7
50362306a36Sopenharmony_ci	s32i	a7, a5,  0
50462306a36Sopenharmony_ci	mov	a6, a8		# keep the last loaded word in a6 for .Lback12
50562306a36Sopenharmony_ci.Lback12:
50662306a36Sopenharmony_ci	bbci.l	a4, 2, .Lback13
50762306a36Sopenharmony_ci	# copy 4 bytes
50862306a36Sopenharmony_ci	addi	a3, a3, -4
50962306a36Sopenharmony_ci	l32i	a7, a3,  0
51062306a36Sopenharmony_ci	addi	a5, a5, -4
51162306a36Sopenharmony_ci	__src_b	a6, a7, a6
51262306a36Sopenharmony_ci	s32i	a6, a5,  0
51362306a36Sopenharmony_ci	mov	a6, a7
51462306a36Sopenharmony_ci.Lback13:
51562306a36Sopenharmony_ci#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
51662306a36Sopenharmony_ci	add	a3, a3, a11	# readjust a3 with correct misalignment
51762306a36Sopenharmony_ci#endif
	# a3 is again the true address one past the next source byte; the
	# last 0..3 bytes are copied with byte loads and stores
51862306a36Sopenharmony_ci	bbsi.l	a4, 1, .Lback14
51962306a36Sopenharmony_ci	bbsi.l	a4, 0, .Lback15
52062306a36Sopenharmony_ci.Lbackdone:
52162306a36Sopenharmony_ci	abi_ret_default
52262306a36Sopenharmony_ci.Lback14:
52362306a36Sopenharmony_ci	# copy 2 bytes
52462306a36Sopenharmony_ci	addi	a3, a3, -2
52562306a36Sopenharmony_ci	l8ui	a6, a3,  0
52662306a36Sopenharmony_ci	l8ui	a7, a3,  1
52762306a36Sopenharmony_ci	addi	a5, a5, -2
52862306a36Sopenharmony_ci	s8i	a6, a5,  0
52962306a36Sopenharmony_ci	s8i	a7, a5,  1
53062306a36Sopenharmony_ci	bbsi.l	a4, 0, .Lback15
53162306a36Sopenharmony_ci	abi_ret_default
53262306a36Sopenharmony_ci.Lback15:
53362306a36Sopenharmony_ci	# copy 1 byte
53462306a36Sopenharmony_ci	addi	a3, a3, -1
53562306a36Sopenharmony_ci	addi	a5, a5, -1
53662306a36Sopenharmony_ci	l8ui	a6, a3,  0
53762306a36Sopenharmony_ci	s8i	a6, a5,  0
53862306a36Sopenharmony_ci	abi_ret_default
53962306a36Sopenharmony_ci
54062306a36Sopenharmony_ciENDPROC(__memmove)
54162306a36Sopenharmony_ciEXPORT_SYMBOL(__memmove)
54262306a36Sopenharmony_ciEXPORT_SYMBOL(memmove)
543