18c2ecf20Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0 */
28c2ecf20Sopenharmony_ci/* NGmemcpy.S: Niagara optimized memcpy.
38c2ecf20Sopenharmony_ci *
48c2ecf20Sopenharmony_ci * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net)
58c2ecf20Sopenharmony_ci */
68c2ecf20Sopenharmony_ci
/*
 * Build-time configuration.  Apart from GLOBAL_SPARE, RESTORE_ASI and
 * SAVE_AMOUNT, each macro below is only defined when the including file
 * has not already supplied its own version.
 */
#if defined(__KERNEL__)
#include <linux/linkage.h>
#include <asm/asi.h>
#include <asm/thread_info.h>
/* Extra global register used as a temporary by the copy code. */
#define GLOBAL_SPARE	%g7
/* Reload %asi with the byte found at %g6 + TI_CURRENT_DS; TMP is clobbered. */
#define RESTORE_ASI(TMP)	\
	ldub	[%g6 + TI_CURRENT_DS], TMP;  \
	wr	TMP, 0x0, %asi;
#else
#define GLOBAL_SPARE	%g5
/* Outside the kernel %asi is simply set to ASI_PNF; TMP is unused. */
#define RESTORE_ASI(TMP)	\
	wr	%g0, ASI_PNF, %asi
#endif

/* Frame size handed to the save instruction at function entry. */
#if defined(__sparc_v9__)
#define SAVE_AMOUNT	128
#else
#define SAVE_AMOUNT	64
#endif

/* ASI written to %asi before the block-copy loops run. */
#if !defined(STORE_ASI)
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#endif

/*
 * EX_LD / EX_ST wrap every load / store.  The second argument names an
 * NG_ret_* stub (presumably the fault fixup target for includers that
 * override these macros); the defaults below ignore it and emit the
 * bare instruction.
 */
#if !defined(EX_LD)
#define EX_LD(x,y)	x
#endif

#if !defined(EX_ST)
#define EX_ST(x,y)	x
#endif

/* Plain load; with MEMCPY_DEBUG the alternate-space form with ASI 0x80. */
#if !defined(LOAD)
#if defined(MEMCPY_DEBUG)
#define LOAD(type,addr,dest)	type##a [addr] 0x80, dest
#else
#define LOAD(type,addr,dest)	type [addr], dest
#endif
#endif

/* 16-byte twin load; dest1 is implied by dest0 and is not emitted. */
#if !defined(LOAD_TWIN)
#define LOAD_TWIN(addr_reg,dest0,dest1)	\
	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
#endif

#if !defined(STORE)
#define STORE(type,src,addr)	type src, [addr]
#endif

/*
 * 8-byte store used by the block-copy loops: through %asi normally, or
 * a plain stx when SIMULATE_NIAGARA_ON_NON_NIAGARA is defined.
 */
#if !defined(STORE_INIT)
#if defined(SIMULATE_NIAGARA_ON_NON_NIAGARA)
#define STORE_INIT(src,addr)	stx src, [addr + 0x00]
#else
#define STORE_INIT(src,addr)	stxa src, [addr] %asi
#endif
#endif

#if !defined(FUNC_NAME)
#define FUNC_NAME	NGmemcpy
#endif

/* Optional instructions emitted before the save at function entry. */
#if !defined(PREAMBLE)
#define PREAMBLE
#endif

/* Condition-code register named by the %XCC branches. */
#if !defined(XCC)
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
/*
 * Common tail of the NG_ret_* stubs below.  Each stub branches here with
 * its result already in %i0 (written in the branch delay slot).
 *
 * %asi is set to ASI_AIUS first, then the routine returns.  The restore
 * must sit in the delay slot of ret: it pops the register window opened
 * by the save at FUNC_NAME entry, which also turns this window's %i0
 * into the caller's %o0.  Placing any other instruction in that delay
 * slot would return to the caller with the window still pushed.
 */
__restore_asi:
	wr	%g0, ASI_AIUS, %asi
	ret
	 restore
/*
 * Stub for the byte loop that aligns the destination to 64 bytes.
 * At the load/store it guards, %i2 has already had the alignment bytes
 * subtracted and %i4 has already been decremented for the byte being
 * copied, so the residual length is %i2 + %i4 + 1.  (%i5 holds nothing
 * meaningful on that path and must not be used.)
 * Result is left in %i0; %i4 is clobbered.
 */
ENTRY(NG_ret_i2_plus_i4_plus_1)
	add	%i4, 1, %i4
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4_plus_1)
/* Stub: %i0 = %i2 + %g1, then leave through __restore_asi. */
ENTRY(NG_ret_i2_plus_g1)
	ba,pt	%xcc, __restore_asi
	 add	%g1, %i2, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_g1)
/* Stub: %i0 = %i2 + %g1 - 8 (%g1 is clobbered), exit via __restore_asi. */
ENTRY(NG_ret_i2_plus_g1_minus_8)
	dec	8, %g1
	ba,pt	%xcc, __restore_asi
	 add	%g1, %i2, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_g1_minus_8)
/* Stub: %i0 = %i2 + %g1 - 16 (%g1 is clobbered), exit via __restore_asi. */
ENTRY(NG_ret_i2_plus_g1_minus_16)
	dec	16, %g1
	ba,pt	%xcc, __restore_asi
	 add	%g1, %i2, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_g1_minus_16)
/* Stub: %i0 = %i2 + %g1 - 24 (%g1 is clobbered), exit via __restore_asi. */
ENTRY(NG_ret_i2_plus_g1_minus_24)
	dec	24, %g1
	ba,pt	%xcc, __restore_asi
	 add	%g1, %i2, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_g1_minus_24)
/* Stub: %i0 = %i2 + %g1 - 32 (%g1 is clobbered), exit via __restore_asi. */
ENTRY(NG_ret_i2_plus_g1_minus_32)
	dec	32, %g1
	ba,pt	%xcc, __restore_asi
	 add	%g1, %i2, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_g1_minus_32)
/* Stub: %i0 = %i2 + %g1 - 40 (%g1 is clobbered), exit via __restore_asi. */
ENTRY(NG_ret_i2_plus_g1_minus_40)
	dec	40, %g1
	ba,pt	%xcc, __restore_asi
	 add	%g1, %i2, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_g1_minus_40)
/* Stub: %i0 = %i2 + %g1 - 48 (%g1 is clobbered), exit via __restore_asi. */
ENTRY(NG_ret_i2_plus_g1_minus_48)
	dec	48, %g1
	ba,pt	%xcc, __restore_asi
	 add	%g1, %i2, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_g1_minus_48)
/* Stub: %i0 = %i2 + %g1 - 56 (%g1 is clobbered), exit via __restore_asi. */
ENTRY(NG_ret_i2_plus_g1_minus_56)
	dec	56, %g1
	ba,pt	%xcc, __restore_asi
	 add	%g1, %i2, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_g1_minus_56)
/* Stub: %i0 = %i2 + %i4, then leave through __restore_asi. */
ENTRY(NG_ret_i2_plus_i4)
	ba,pt	%xcc, __restore_asi
	 add	%i4, %i2, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_i4)
/* Stub: %i0 = %i2 + %i4 - 8 (%i4 is clobbered), exit via __restore_asi. */
ENTRY(NG_ret_i2_plus_i4_minus_8)
	dec	8, %i4
	ba,pt	%xcc, __restore_asi
	 add	%i4, %i2, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_i4_minus_8)
/* Stub: %i0 = %i2 + 8, then leave through __restore_asi. */
ENTRY(NG_ret_i2_plus_8)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 0x8, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_8)
/* Stub: %i0 = %i2 + 4, then leave through __restore_asi. */
ENTRY(NG_ret_i2_plus_4)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 0x4, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_4)
/* Stub: %i0 = %i2 + 1, then leave through __restore_asi. */
ENTRY(NG_ret_i2_plus_1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 0x1, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_1)
/* Stub: %i0 = %i2 + %g1 + 1 (%g1 is clobbered), exit via __restore_asi. */
ENTRY(NG_ret_i2_plus_g1_plus_1)
	inc	%g1
	ba,pt	%xcc, __restore_asi
	 add	%g1, %i2, %i0		! delay slot
ENDPROC(NG_ret_i2_plus_g1_plus_1)
/* Stub: %i0 = %i2 unchanged, then leave through __restore_asi. */
ENTRY(NG_ret_i2)
	ba,pt	%xcc, __restore_asi
	 or	%g0, %i2, %i0		! delay slot
ENDPROC(NG_ret_i2)
/* Stub: %i0 = (%i2 & 7) + %i4 (%i2 is clobbered), exit via __restore_asi. */
ENTRY(NG_ret_i2_and_7_plus_i4)
	and	%i2, 0x7, %i2
	ba,pt	%xcc, __restore_asi
	 add	%i4, %i2, %i0		! delay slot
ENDPROC(NG_ret_i2_and_7_plus_i4)
1648c2ecf20Sopenharmony_ci#endif
1658c2ecf20Sopenharmony_ci
/*
 * FUNC_NAME(dst, src, len)
 *
 * After the save below: %i0 = dst, %i1 = src, %i2 = len.  A working copy
 * of dst is kept in %o0 so that %i0 stays intact for the final
 * "restore EX_RETVAL(%i0), %g0, %o0".
 *
 * Dispatch by length:
 *   len == 0         -> 85f (return)
 *   len <  16        -> 80f, with %i3 = dst | src | len
 *   16 <= len < 128  -> 70f, with %i3 = dst | src and the condition
 *                       codes set from (%i3 & 7)
 *   len >= 128       -> falls through into the block copy
 */
	.align		64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:
	PREAMBLE
	save		%sp, -SAVE_AMOUNT, %sp
	mov		%i0, %o0		! working dst pointer

	/* Any bit at or above bit 31 of len set: raise software trap 5. */
	srlx		%i2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5

	cmp		%i2, 0
	be,pn		%XCC, 85f
	 or		%i1, %o0, %i3		! %i3 = dst | src
	cmp		%i2, 16
	blu,a,pn	%XCC, 80f
	 or		%i3, %i2, %i3		! annulled unless len < 16

	/* The block copy needs at least 2 blocks (128 bytes): aligning
	 * the destination to 64 bytes may consume up to (64 - 1) bytes
	 * of the length, and the block copy loop must still iterate at
	 * least once afterwards.
	 */
	cmp		%i2, (2 * 64)
	blu,pt		%XCC, 70f
	 andcc		%i3, 0x7, %g0		! tested again at 70:
1948c2ecf20Sopenharmony_ci
/* Block copy path, reached by fall-through when len >= 128. */
	/* On entry:
	 *   %o0 = dst
	 *   %i1 = src
	 *   %i2 = len  (known to be >= 128)
	 *
	 * The loops below use %g2/%g3, %o1-%o5, %o7, %i3-%i5, %g1 and
	 * GLOBAL_SPARE as temporaries while copying the data.
	 */

	LOAD(prefetch, %i1, #one_read)
	wr		%g0, STORE_ASI, %asi

	/* Byte-copy until the destination sits on a 64-byte boundary. */
	andcc		%o0, (64 - 1), %i4
	be,pt		%XCC, 2f
	 sub		%i4, 64, %i4
	neg		%i4			! %i4 = 64 - (dst & 63), bytes to align dst
	sub		%i2, %i4, %i2
1:	subcc		%i4, 1, %i4
	EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_i4_plus_1)
	EX_ST(STORE(stb, %g1, %o0), NG_ret_i2_plus_i4_plus_1)
	add		%i1, 1, %i1
	bne,pt		%XCC, 1b
	 add		%o0, 1, %o0

	/* Pick the loop from the source alignment:
	 *   src 16-byte aligned       -> 50f, direct block copy
	 *   src 8-byte aligned only   -> 10f, twin loads offset by -8
	 *                                bytes, stores shifted by one
	 *                                register
	 *   anything else             -> shift and mask (basically an
	 *                                integer faligndata), 8: or 9:
	 *
	 * The careful bit with init stores is that storing to any part
	 * of a cache line requires storing the whole line, otherwise the
	 * L2 cache line contents can end up corrupt.  Every loop below
	 * writes 64 bytes of 64-byte aligned store data per iteration,
	 * so this is easy to ensure.
	 */
2:
	andcc		%i1, (16 - 1), %i4	! %i4 = src & 15
	andn		%i2, (64 - 1), %g1	! block copy loop iterator
	be,pt		%XCC, 50f
	 sub		%i2, %g1, %i2		! final sub-block copy bytes

	cmp		%i4, 8
	be,pt		%XCC, 10f
	 sub		%i1, %i4, %i1		! src rounded down to 16 bytes

	/* Neither 8-byte nor 16-byte aligned, shift and mask.
	 * GLOBAL_SPARE = left shift in bits, %i5 = 64 - GLOBAL_SPARE.
	 * The condition codes of "cmp %i4, 8" stay live until the bg.
	 */
	and		%i4, 0x7, GLOBAL_SPARE
	sll		GLOBAL_SPARE, 3, GLOBAL_SPARE
	mov		64, %i5
	EX_LD(LOAD_TWIN(%i1, %g2, %g3), NG_ret_i2_plus_g1)
	sub		%i5, GLOBAL_SPARE, %i5
	mov		16, %o4
	mov		32, %o5
	mov		48, %o7
	mov		64, %i3

	bg,pn	   	%XCC, 9f		! (src & 15) > 8
	 nop

/* WORD1 = (WORD1 << POST_SHIFT) | (WORD2 >> PRE_SHIFT)
 * WORD2 = (WORD2 << POST_SHIFT) | (WORD3 >> PRE_SHIFT)
 * WORD3 is only read; TMP is clobbered.
 */
#define MIX_THREE_WORDS(WORD1, WORD2, WORD3, PRE_SHIFT, POST_SHIFT, TMP) \
	sllx		WORD1, POST_SHIFT, WORD1; \
	srlx		WORD2, PRE_SHIFT, TMP; \
	sllx		WORD2, POST_SHIFT, WORD2; \
	or		WORD1, TMP, WORD1; \
	srlx		WORD3, PRE_SHIFT, TMP; \
	or		WORD2, TMP, WORD2;

	/* (src & 15) < 8: output starts in the first word of each twin. */
8:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g2, %o0 + 0x00), NG_ret_i2_plus_g1)
	EX_ST(STORE_INIT(%g3, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g2, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%g3, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 8b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, %i4, %i1		! put the misalignment back into src

	/* (src & 15) > 8: output starts in the second word of each twin. */
9:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g3, %o0 + 0x00), NG_ret_i2_plus_g1)
	EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%g2, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g3, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%g2, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 9b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, %i4, %i1		! put the misalignment back into src

10:	/* Destination is 64-byte aligned, source was only 8-byte
	 * aligned.  The source pointer has been moved back by 8, one
	 * twin load is issued ahead of the loop, and 8 is added back
	 * to the source once the loop has finished.
	 */
	EX_LD(LOAD_TWIN(%i1, %o4, %o5), NG_ret_i2_plus_g1)
	mov	16, %o7
	mov	32, %g2
	mov	48, %g3
	mov	64, %o1
1:	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o5, %o0 + 0x00), NG_ret_i2_plus_g1)	! initializes cache line
	EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o4, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o5, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
	EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o4, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
	subcc		%g1, 64, %g1
	bne,pt		%XCC, 1b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, 0x8, %i1		! undo the -8 applied to src

50:	/* Destination is 64-byte aligned, and source is 16-byte
	 * aligned: four twin loads feed eight stores per iteration.
	 */
	mov	16, %o7
	mov	32, %g2
	mov	48, %g3
	mov	64, %o1
1:	EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5), NG_ret_i2_plus_g1)
	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o4, %o0 + 0x00), NG_ret_i2_plus_g1)	! initializes cache line
	EX_ST(STORE_INIT(%o5, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	add	%i1, 64, %i1
	EX_ST(STORE_INIT(%o4, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o5, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
	EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
	subcc	%g1, 64, %g1
	bne,pt	%XCC, 1b
	 add	%o0, 64, %o0
	/* fall through */

60:
	membar		#Sync

	/* %i2 holds the bytes that did not fill a whole 64-byte block.
	 * If any are left they are copied one byte at a time at 90f,
	 * which expects %i3 = dst - src.
	 */
	RESTORE_ASI(%i3)
	brz,pt		%i2, 85f
	 sub		%o0, %i1, %i3
	ba,a,pt		%XCC, 90f
	 nop
3998c2ecf20Sopenharmony_ci
/* Medium copy path: 16 <= len < 128. */
	.align		64
70:	/* Entered with the condition codes of "andcc %i3, 0x7, %g0",
	 * where %i3 = dst | src: not-equal means that at least one of
	 * the two pointers is not 8-byte aligned.
	 */
	bne,pn		%XCC, 75f
	 sub		%o0, %i1, %i3		! %i3 = dst - src

	/* Both pointers 8-byte aligned: 16 bytes per iteration.
	 * %i4 = bytes to move in whole 16-byte chunks, %i2 = the rest.
	 *
	 * %i4 is decremented only after both stores of an iteration, so
	 * that at each access below it still counts the chunk in flight.
	 * That is what the fault stubs assume: %i2 + %i4 while nothing of
	 * the chunk has been stored, %i2 + %i4 - 8 once the first 8 bytes
	 * have.  None of the instructions between the subcc and the
	 * branch touch the condition codes.
	 */
72:
	andn		%i2, 0xf, %i4
	and		%i2, 0xf, %i2
1:	EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_i4)
	add		%i1, 0x08, %i1
	EX_LD(LOAD(ldx, %i1, %g1), NG_ret_i2_plus_i4)
	sub		%i1, 0x08, %i1
	EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_i4)
	add		%i1, 0x8, %i1
	EX_ST(STORE(stx, %g1, %i1 + %i3), NG_ret_i2_plus_i4_minus_8)
	subcc		%i4, 0x10, %i4
	bgu,pt		%XCC, 1b
	 add		%i1, 0x8, %i1

	/* Fewer than 16 bytes left: one optional 8-byte and one optional
	 * 4-byte move, then the byte loop at 90f for whatever remains.
	 */
73:	andcc		%i2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%i2, 0x8, %i2
	EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_8)
	EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_8)
	add		%i1, 0x8, %i1
1:	andcc		%i2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%i2, 0x4, %i2
	EX_LD(LOAD(lduw, %i1, %i5), NG_ret_i2_plus_4)
	EX_ST(STORE(stw, %i5, %i1 + %i3), NG_ret_i2_plus_4)
	add		%i1, 0x4, %i1
1:	cmp		%i2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f
	 nop

	/* At least one pointer is not 8-byte aligned.  Byte-copy
	 * 8 - (dst & 7) bytes so that the destination becomes 8-byte
	 * aligned; skipped when it already is.
	 */
75:
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1		! %g1 = 8 - (dst & 7)
	sub		%i2, %g1, %i2

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %i1, %i5), NG_ret_i2_plus_g1_plus_1)
	EX_ST(STORE(stb, %i5, %i1 + %i3), NG_ret_i2_plus_g1_plus_1)
	bgu,pt		%icc, 1b
	 add		%i1, 1, %i1

2:	add		%i1, %i3, %o0		! dst = src + (dst - src)
	andcc		%i1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1		! source misalignment in bits

	/* Source is 8-byte aligned as well: reuse the aligned code. */
	cmp		%i2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

	/* Source still misaligned: read aligned 8-byte words and merge
	 * neighbouring pairs with shifts.  %g1 = left shift,
	 * %i3 = 64 - %g1 = right shift, %g2 = carry from the previous
	 * word, %i4 = bytes to store in whole 8-byte words.
	 *
	 * As in the loop at 72, %i4 is decremented only after the store
	 * so that NG_ret_i2_and_7_plus_i4, i.e. (%i2 & 7) + %i4, is the
	 * exact residue at both the load and the store.
	 */
8:	mov		64, %i3
	andn		%i1, 0x7, %i1
	EX_LD(LOAD(ldx, %i1, %g2), NG_ret_i2)
	sub		%i3, %g1, %i3
	andn		%i2, 0x7, %i4
	sllx		%g2, %g1, %g2
1:	add		%i1, 0x8, %i1
	EX_LD(LOAD(ldx, %i1, %g3), NG_ret_i2_and_7_plus_i4)
	srlx		%g3, %i3, %i5
	or		%i5, %g2, %i5
	EX_ST(STORE(stx, %i5, %o0), NG_ret_i2_and_7_plus_i4)
	add		%o0, 0x8, %o0
	subcc		%i4, 0x8, %i4
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	/* Turn the shift back into a byte offset, re-apply it to the
	 * source and finish the last (len & 7) bytes at 90f.
	 */
	srl		%g1, 3, %g1
	andcc		%i2, 0x7, %i2
	be,pn		%icc, 85f
	 add		%i1, %g1, %i1
	ba,pt		%xcc, 90f
	 sub		%o0, %i1, %i3		! %i3 = dst - src for the byte loop
4838c2ecf20Sopenharmony_ci
/* Small copy path (0 < len < 16), shared return and byte-wise tail. */
	.align		64
80:	/* %i3 = dst | src | len on entry.  Unless all three are
	 * multiples of 4 the copy is done bytewise at 90f; either way
	 * %i3 becomes dst - src.
	 */
	btst		0x3, %i3
	bne,pn		%XCC, 90f
	 sub		%o0, %i1, %i3

	/* Word loop: %i2 is decremented before the access, hence the
	 * "+ 4" in the stub named on the load and the store.
	 */
1:	subcc		%i2, 4, %i2
	EX_LD(LOAD(lduw, %i1, %g1), NG_ret_i2_plus_4)
	EX_ST(STORE(stw, %g1, %i1 + %i3), NG_ret_i2_plus_4)
	bgu,pt		%XCC, 1b
	 add		%i1, 4, %i1

	/* Common exit: hand EX_RETVAL(%i0) back in the caller's %o0. */
85:	ret
	 restore	EX_RETVAL(%i0), %g0, %o0

	/* Byte loop: copies %i2 (> 0) bytes from %i1 to %i1 + %i3. */
	.align		32
90:	subcc		%i2, 1, %i2
	EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_1)
	EX_ST(STORE(stb, %g1, %i1 + %i3), NG_ret_i2_plus_1)
	bgu,pt		%XCC, 90b
	 add		%i1, 1, %i1
	ret
	 restore	EX_RETVAL(%i0), %g0, %o0

	.size		FUNC_NAME, .-FUNC_NAME
511