18c2ecf20Sopenharmony_ci/*
28c2ecf20Sopenharmony_ci * M7memcpy: Optimized SPARC M7 memcpy
38c2ecf20Sopenharmony_ci *
48c2ecf20Sopenharmony_ci * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
58c2ecf20Sopenharmony_ci */
68c2ecf20Sopenharmony_ci
78c2ecf20Sopenharmony_ci	.file	"M7memcpy.S"
88c2ecf20Sopenharmony_ci
98c2ecf20Sopenharmony_ci/*
108c2ecf20Sopenharmony_ci * memcpy(s1, s2, len)
118c2ecf20Sopenharmony_ci *
128c2ecf20Sopenharmony_ci * Copy s2 to s1, always copy n bytes.
138c2ecf20Sopenharmony_ci * Note: this C code does not work for overlapped copies.
148c2ecf20Sopenharmony_ci *
158c2ecf20Sopenharmony_ci * Fast assembler language version of the following C-program for memcpy
168c2ecf20Sopenharmony_ci * which represents the `standard' for the C-library.
178c2ecf20Sopenharmony_ci *
188c2ecf20Sopenharmony_ci *	void *
198c2ecf20Sopenharmony_ci *	memcpy(void *s, const void *s0, size_t n)
208c2ecf20Sopenharmony_ci *	{
218c2ecf20Sopenharmony_ci *		if (n != 0) {
228c2ecf20Sopenharmony_ci *		    char *s1 = s;
238c2ecf20Sopenharmony_ci *		    const char *s2 = s0;
248c2ecf20Sopenharmony_ci *		    do {
258c2ecf20Sopenharmony_ci *			*s1++ = *s2++;
268c2ecf20Sopenharmony_ci *		    } while (--n != 0);
278c2ecf20Sopenharmony_ci *		}
288c2ecf20Sopenharmony_ci *		return (s);
298c2ecf20Sopenharmony_ci *	}
308c2ecf20Sopenharmony_ci *
318c2ecf20Sopenharmony_ci *
328c2ecf20Sopenharmony_ci * SPARC T7/M7 Flow :
338c2ecf20Sopenharmony_ci *
348c2ecf20Sopenharmony_ci * if (count < SMALL_MAX) {
358c2ecf20Sopenharmony_ci *   if count < SHORTCOPY              (SHORTCOPY=3)
368c2ecf20Sopenharmony_ci *	copy bytes; exit with dst addr
378c2ecf20Sopenharmony_ci *   if src & dst aligned on word boundary but not long word boundary,
388c2ecf20Sopenharmony_ci *     copy with ldw/stw; branch to finish_up
398c2ecf20Sopenharmony_ci *   if src & dst aligned on long word boundary
408c2ecf20Sopenharmony_ci *     copy with ldx/stx; branch to finish_up
418c2ecf20Sopenharmony_ci *   if src & dst not aligned and length <= SHORTCHECK   (SHORTCHECK=14)
428c2ecf20Sopenharmony_ci *     copy bytes; exit with dst addr
438c2ecf20Sopenharmony_ci *   move enough bytes to get src to word boundary
448c2ecf20Sopenharmony_ci *   if dst now on word boundary
458c2ecf20Sopenharmony_ci * move_words:
468c2ecf20Sopenharmony_ci *     copy words; branch to finish_up
478c2ecf20Sopenharmony_ci *   if dst now on half word boundary
488c2ecf20Sopenharmony_ci *     load words, shift half words, store words; branch to finish_up
498c2ecf20Sopenharmony_ci *   if dst on byte 1
508c2ecf20Sopenharmony_ci *     load words, shift 3 bytes, store words; branch to finish_up
518c2ecf20Sopenharmony_ci *   if dst on byte 3
528c2ecf20Sopenharmony_ci *     load words, shift 1 byte, store words; branch to finish_up
538c2ecf20Sopenharmony_ci * finish_up:
548c2ecf20Sopenharmony_ci *     copy bytes; exit with dst addr
558c2ecf20Sopenharmony_ci * } else {                                         More than SMALL_MAX bytes
568c2ecf20Sopenharmony_ci *   move bytes until dst is on long word boundary
578c2ecf20Sopenharmony_ci *   if( src is on long word boundary ) {
588c2ecf20Sopenharmony_ci *     if (count < MED_MAX) {
598c2ecf20Sopenharmony_ci * finish_long:					   src/dst aligned on 8 bytes
608c2ecf20Sopenharmony_ci *       copy with ldx/stx in 8-way unrolled loop;
618c2ecf20Sopenharmony_ci *       copy final 0-63 bytes; exit with dst addr
628c2ecf20Sopenharmony_ci *     } else {				     src/dst aligned; count > MED_MAX
638c2ecf20Sopenharmony_ci *       align dst on 64 byte boundary; for main data movement:
648c2ecf20Sopenharmony_ci *       prefetch src data to L2 cache; let HW prefetch move data to L1 cache
658c2ecf20Sopenharmony_ci *       Use BIS (block initializing store) to avoid copying store cache
668c2ecf20Sopenharmony_ci *       lines from memory. But pre-store first element of each cache line
678c2ecf20Sopenharmony_ci *       ST_CHUNK lines in advance of the rest of that cache line. That
688c2ecf20Sopenharmony_ci *       gives time for replacement cache lines to be written back without
698c2ecf20Sopenharmony_ci *       excess STQ and Miss Buffer filling. Repeat until near the end,
708c2ecf20Sopenharmony_ci *       then finish up storing before going to finish_long.
718c2ecf20Sopenharmony_ci *     }
728c2ecf20Sopenharmony_ci *   } else {                                   src/dst not aligned on 8 bytes
738c2ecf20Sopenharmony_ci *     if src is word aligned and count < MED_WMAX
748c2ecf20Sopenharmony_ci *       move words in 8-way unrolled loop
758c2ecf20Sopenharmony_ci *       move final 0-31 bytes; exit with dst addr
768c2ecf20Sopenharmony_ci *     if count < MED_UMAX
778c2ecf20Sopenharmony_ci *       use alignaddr/faligndata combined with ldd/std in 8-way
788c2ecf20Sopenharmony_ci *       unrolled loop to move data.
798c2ecf20Sopenharmony_ci *       go to unalign_done
808c2ecf20Sopenharmony_ci *     else
818c2ecf20Sopenharmony_ci *       setup alignaddr for faligndata instructions
828c2ecf20Sopenharmony_ci *       align dst on 64 byte boundary; prefetch src data to L1 cache
838c2ecf20Sopenharmony_ci *       loadx8, falign, block-store, prefetch loop
848c2ecf20Sopenharmony_ci *	 (only use block-init-store when src/dst on 8 byte boundaries.)
858c2ecf20Sopenharmony_ci * unalign_done:
868c2ecf20Sopenharmony_ci *       move remaining bytes for unaligned cases. exit with dst addr.
878c2ecf20Sopenharmony_ci * }
888c2ecf20Sopenharmony_ci *
898c2ecf20Sopenharmony_ci */
908c2ecf20Sopenharmony_ci
918c2ecf20Sopenharmony_ci#include <asm/visasm.h>
928c2ecf20Sopenharmony_ci#include <asm/asi.h>
938c2ecf20Sopenharmony_ci
/*
 * Fault-handling hooks.
 *
 * An including file may predefine EX_LD / EX_LD_FP / EX_ST / EX_ST_FP.
 * Each wraps one load or store instruction `x' and takes a second
 * argument `y'; the uses in this file pass a memcpy_retl_* name there.
 * EX_RETVAL wraps the value that is moved into %o0 on return.
 * Any hook that was not predefined falls back to a pass-through that
 * expands to `x' alone and discards `y'.
 *
 * NON_USER_COPY is defined only when neither EX_LD nor EX_ST was
 * predefined.  In this file it selects VISEntryHalfFast instead of
 * VISEntryHalf at .Lunalignsetup.
 */
948c2ecf20Sopenharmony_ci#if !defined(EX_LD) && !defined(EX_ST)
958c2ecf20Sopenharmony_ci#define NON_USER_COPY
968c2ecf20Sopenharmony_ci#endif
978c2ecf20Sopenharmony_ci
988c2ecf20Sopenharmony_ci#ifndef EX_LD
998c2ecf20Sopenharmony_ci#define EX_LD(x,y)	x
1008c2ecf20Sopenharmony_ci#endif
1018c2ecf20Sopenharmony_ci#ifndef EX_LD_FP
1028c2ecf20Sopenharmony_ci#define EX_LD_FP(x,y)	x
1038c2ecf20Sopenharmony_ci#endif
1048c2ecf20Sopenharmony_ci
1058c2ecf20Sopenharmony_ci#ifndef EX_ST
1068c2ecf20Sopenharmony_ci#define EX_ST(x,y)	x
1078c2ecf20Sopenharmony_ci#endif
1088c2ecf20Sopenharmony_ci#ifndef EX_ST_FP
1098c2ecf20Sopenharmony_ci#define EX_ST_FP(x,y)	x
1108c2ecf20Sopenharmony_ci#endif
1118c2ecf20Sopenharmony_ci
1128c2ecf20Sopenharmony_ci#ifndef EX_RETVAL
1138c2ecf20Sopenharmony_ci#define EX_RETVAL(x)    x
1148c2ecf20Sopenharmony_ci#endif
1158c2ecf20Sopenharmony_ci
/*
 * LOAD / STORE build a single memory instruction from a mnemonic
 * (`type') and its operands.  Either may be predefined by an including
 * file; the defaults emit the plain `type [addr], dest' and
 * `type src, [addr]' forms.
 */
1168c2ecf20Sopenharmony_ci#ifndef LOAD
1178c2ecf20Sopenharmony_ci#define LOAD(type,addr,dest)	type [addr], dest
1188c2ecf20Sopenharmony_ci#endif
1198c2ecf20Sopenharmony_ci
1208c2ecf20Sopenharmony_ci#ifndef STORE
1218c2ecf20Sopenharmony_ci#define STORE(type,src,addr)	type src, [addr]
1228c2ecf20Sopenharmony_ci#endif
1238c2ecf20Sopenharmony_ci
1248c2ecf20Sopenharmony_ci/*
1258c2ecf20Sopenharmony_ci * ASI_BLK_INIT_QUAD_LDD_P/ASI_BLK_INIT_QUAD_LDD_S marks the cache
1268c2ecf20Sopenharmony_ci * line as "least recently used" which means if many threads are
1278c2ecf20Sopenharmony_ci * active, it has a high probability of being pushed out of the cache
1288c2ecf20Sopenharmony_ci * between the first initializing store and the final stores.
1298c2ecf20Sopenharmony_ci * Thus, we use ASI_ST_BLKINIT_MRU_P/ASI_ST_BLKINIT_MRU_S which
1308c2ecf20Sopenharmony_ci * marks the cache line as "most recently used" for all
1318c2ecf20Sopenharmony_ci * but the last store to each cache line
1328c2ecf20Sopenharmony_ci */
/*
 * STORE_ASI is the ASI used for the last store to a cache line and
 * STORE_MRU_ASI the one used for the stores before it (see the
 * .Lalign_loop_start / .Lalign_loop_rest code).  Either may be
 * predefined by an including file.  When
 * SIMULATE_NIAGARA_ON_NON_NIAGARA is defined, both default to
 * 0x80 (ASI_P) instead.
 */
1338c2ecf20Sopenharmony_ci#ifndef STORE_ASI
1348c2ecf20Sopenharmony_ci#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
1358c2ecf20Sopenharmony_ci#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
1368c2ecf20Sopenharmony_ci#else
1378c2ecf20Sopenharmony_ci#define STORE_ASI	0x80		/* ASI_P */
1388c2ecf20Sopenharmony_ci#endif
1398c2ecf20Sopenharmony_ci#endif
1408c2ecf20Sopenharmony_ci
1418c2ecf20Sopenharmony_ci#ifndef STORE_MRU_ASI
1428c2ecf20Sopenharmony_ci#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
1438c2ecf20Sopenharmony_ci#define STORE_MRU_ASI	ASI_ST_BLKINIT_MRU_P
1448c2ecf20Sopenharmony_ci#else
1458c2ecf20Sopenharmony_ci#define STORE_MRU_ASI	0x80		/* ASI_P */
1468c2ecf20Sopenharmony_ci#endif
1478c2ecf20Sopenharmony_ci#endif
1488c2ecf20Sopenharmony_ci
/* Store `src' to [addr] with stxa through STORE_ASI. */
1498c2ecf20Sopenharmony_ci#ifndef STORE_INIT
1508c2ecf20Sopenharmony_ci#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
1518c2ecf20Sopenharmony_ci#endif
1528c2ecf20Sopenharmony_ci
/* Store `src' to [addr] with stxa through STORE_MRU_ASI. */
1538c2ecf20Sopenharmony_ci#ifndef STORE_INIT_MRU
1548c2ecf20Sopenharmony_ci#define STORE_INIT_MRU(src,addr)	stxa src, [addr] STORE_MRU_ASI
1558c2ecf20Sopenharmony_ci#endif
1568c2ecf20Sopenharmony_ci
/*
 * FUNC_NAME is the global symbol the routine below is emitted under
 * (M7memcpy unless an including file predefines it).
 * PREAMBLE is expanded once at function entry, right after the initial
 * length check and before %o0 is saved; it is empty by default.
 */
1578c2ecf20Sopenharmony_ci#ifndef FUNC_NAME
1588c2ecf20Sopenharmony_ci#define FUNC_NAME	M7memcpy
1598c2ecf20Sopenharmony_ci#endif
1608c2ecf20Sopenharmony_ci
1618c2ecf20Sopenharmony_ci#ifndef PREAMBLE
1628c2ecf20Sopenharmony_ci#define PREAMBLE
1638c2ecf20Sopenharmony_ci#endif
1648c2ecf20Sopenharmony_ci
/*
 * Tuning constants.  How the code below consumes them:
 *   BLOCK_SIZE - multiplier for the prefetch offsets, e.g.
 *                [%o1 + (ALIGN_PRE * BLOCK_SIZE)].
 *   SMALL_MAX  - lengths below this branch to .Lmedium_cp; the rest
 *                fall through to .Lmedium.
 *   MED_MAX    - with src and dst both 8-byte aligned, a remaining
 *                count above this goes to .Llarge_align8_copy.
 *   MED_WMAX   - with src only word aligned, a remaining count at or
 *                above this branches to .Lunalignrejoin.
 *   MED_UMAX   - on the unaligned path, a remaining count at or above
 *                this branches to .Lunalign_large.
 *   ST_CHUNK   - number of 64-byte lines handled per batch in
 *                .Lalign_loop_start / .Lalign_loop_rest.
 *   ALIGN_PRE  - prefetch distance in .Lalign_loop_start, counted in
 *                units of BLOCK_SIZE.
 * SHORTCOPY and SHORTCHECK are the thresholds named in the flow
 * description at the top of the file.
 * NOTE(review): the function entry compares the length against the
 * literals 3 and 19 rather than SHORTCOPY/SHORTCHECK -- confirm whether
 * these two macros are still referenced anywhere.
 */
1658c2ecf20Sopenharmony_ci#define	BLOCK_SIZE	64
1668c2ecf20Sopenharmony_ci#define	SHORTCOPY	3
1678c2ecf20Sopenharmony_ci#define	SHORTCHECK	14
1688c2ecf20Sopenharmony_ci#define	SHORT_LONG	64	/* max copy for short longword-aligned case */
1698c2ecf20Sopenharmony_ci				/* must be at least 64 */
1708c2ecf20Sopenharmony_ci#define	SMALL_MAX	128
1718c2ecf20Sopenharmony_ci#define	MED_UMAX	1024	/* max copy for medium un-aligned case */
1728c2ecf20Sopenharmony_ci#define	MED_WMAX	1024	/* max copy for medium word-aligned case */
1738c2ecf20Sopenharmony_ci#define	MED_MAX		1024	/* max copy for medium longword-aligned case */
1748c2ecf20Sopenharmony_ci#define ST_CHUNK	24	/* ST_CHUNK - block of values for BIS Store */
1758c2ecf20Sopenharmony_ci#define ALIGN_PRE	24	/* distance for aligned prefetch loop */
1768c2ecf20Sopenharmony_ci
1778c2ecf20Sopenharmony_ci	.register	%g2,#scratch
1788c2ecf20Sopenharmony_ci
1798c2ecf20Sopenharmony_ci	.section	".text"
1808c2ecf20Sopenharmony_ci	.global		FUNC_NAME
1818c2ecf20Sopenharmony_ci	.type		FUNC_NAME, #function
1828c2ecf20Sopenharmony_ci	.align		16
1838c2ecf20Sopenharmony_ciFUNC_NAME:
1848c2ecf20Sopenharmony_ci	srlx            %o2, 31, %g2
1858c2ecf20Sopenharmony_ci	cmp             %g2, 0
1868c2ecf20Sopenharmony_ci	tne             %xcc, 5
1878c2ecf20Sopenharmony_ci	PREAMBLE
1888c2ecf20Sopenharmony_ci	mov		%o0, %g1	! save %o0
1898c2ecf20Sopenharmony_ci	brz,pn          %o2, .Lsmallx
1908c2ecf20Sopenharmony_ci	 cmp            %o2, 3
1918c2ecf20Sopenharmony_ci	ble,pn          %icc, .Ltiny_cp
1928c2ecf20Sopenharmony_ci	 cmp            %o2, 19
1938c2ecf20Sopenharmony_ci	ble,pn          %icc, .Lsmall_cp
1948c2ecf20Sopenharmony_ci	 or             %o0, %o1, %g2
1958c2ecf20Sopenharmony_ci	cmp             %o2, SMALL_MAX
1968c2ecf20Sopenharmony_ci	bl,pn           %icc, .Lmedium_cp
1978c2ecf20Sopenharmony_ci	 nop
1988c2ecf20Sopenharmony_ci
1998c2ecf20Sopenharmony_ci.Lmedium:
2008c2ecf20Sopenharmony_ci	neg	%o0, %o5
2018c2ecf20Sopenharmony_ci	andcc	%o5, 7, %o5		! bytes till DST 8 byte aligned
2028c2ecf20Sopenharmony_ci	brz,pt	%o5, .Ldst_aligned_on_8
2038c2ecf20Sopenharmony_ci
2048c2ecf20Sopenharmony_ci	! %o5 has the bytes to be written in partial store.
2058c2ecf20Sopenharmony_ci	 sub	%o2, %o5, %o2
2068c2ecf20Sopenharmony_ci	sub	%o1, %o0, %o1		! %o1 gets the difference
2078c2ecf20Sopenharmony_ci7:					! dst aligning loop
2088c2ecf20Sopenharmony_ci	add	%o1, %o0, %o4
2098c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldub, %o4, %o4), memcpy_retl_o2_plus_o5)	! load one byte
2108c2ecf20Sopenharmony_ci	subcc	%o5, 1, %o5
2118c2ecf20Sopenharmony_ci	EX_ST(STORE(stb, %o4, %o0), memcpy_retl_o2_plus_o5_plus_1)
2128c2ecf20Sopenharmony_ci	bgu,pt	%xcc, 7b
2138c2ecf20Sopenharmony_ci	 add	%o0, 1, %o0		! advance dst
2148c2ecf20Sopenharmony_ci	add	%o1, %o0, %o1		! restore %o1
2158c2ecf20Sopenharmony_ci.Ldst_aligned_on_8:
2168c2ecf20Sopenharmony_ci	andcc	%o1, 7, %o5
2178c2ecf20Sopenharmony_ci	brnz,pt	%o5, .Lsrc_dst_unaligned_on_8
2188c2ecf20Sopenharmony_ci	 nop
2198c2ecf20Sopenharmony_ci
2208c2ecf20Sopenharmony_ci.Lsrc_dst_aligned_on_8:
2218c2ecf20Sopenharmony_ci	! check if we are copying more than MED_MAX bytes
2228c2ecf20Sopenharmony_ci	set MED_MAX, %o3
2238c2ecf20Sopenharmony_ci	cmp %o2, %o3 			! limit to store buffer size
2248c2ecf20Sopenharmony_ci	bgu,pn	%xcc, .Llarge_align8_copy
2258c2ecf20Sopenharmony_ci	 nop
2268c2ecf20Sopenharmony_ci
2278c2ecf20Sopenharmony_ci/*
2288c2ecf20Sopenharmony_ci * Special case for handling when src and dest are both long word aligned
2298c2ecf20Sopenharmony_ci * and total data to move is less than MED_MAX bytes
2308c2ecf20Sopenharmony_ci */
2318c2ecf20Sopenharmony_ci.Lmedlong:
2328c2ecf20Sopenharmony_ci	subcc	%o2, 63, %o2		! adjust length to allow cc test
2338c2ecf20Sopenharmony_ci	ble,pn	%xcc, .Lmedl63		! skip big loop if less than 64 bytes
2348c2ecf20Sopenharmony_ci	 nop
2358c2ecf20Sopenharmony_ci.Lmedl64:
2368c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_63)	! load
2378c2ecf20Sopenharmony_ci	subcc	%o2, 64, %o2		! decrement length count
2388c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_63_64)	! and store
2398c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_63_56)	! a block of 64
2408c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_63_56)
2418c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_63_48)
2428c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_63_48)
2438c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1+24, %o3), memcpy_retl_o2_plus_63_40)
2448c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o3, %o0+24), memcpy_retl_o2_plus_63_40)
2458c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_63_32)! load and store
2468c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o4, %o0+32), memcpy_retl_o2_plus_63_32)
2478c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1+40, %o3), memcpy_retl_o2_plus_63_24)! a block of 64
2488c2ecf20Sopenharmony_ci	add	%o1, 64, %o1		! increase src ptr by 64
2498c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o3, %o0+40), memcpy_retl_o2_plus_63_24)
2508c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_63_16)
2518c2ecf20Sopenharmony_ci	add	%o0, 64, %o0		! increase dst ptr by 64
2528c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_63_16)
2538c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_63_8)
2548c2ecf20Sopenharmony_ci	bgu,pt	%xcc, .Lmedl64		! repeat if at least 64 bytes left
2558c2ecf20Sopenharmony_ci	 EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_63_8)
2568c2ecf20Sopenharmony_ci.Lmedl63:
2578c2ecf20Sopenharmony_ci	addcc	%o2, 32, %o2		! adjust remaining count
2588c2ecf20Sopenharmony_ci	ble,pt	%xcc, .Lmedl31		! to skip if 31 or fewer bytes left
2598c2ecf20Sopenharmony_ci	 nop
2608c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_31)	! load
2618c2ecf20Sopenharmony_ci	sub	%o2, 32, %o2		! decrement length count
2628c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_31_32)	! and store
2638c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_31_24)	! a block of 32
2648c2ecf20Sopenharmony_ci	add	%o1, 32, %o1		! increase src ptr by 32
2658c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_31_24)
2668c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
2678c2ecf20Sopenharmony_ci	add	%o0, 32, %o0		! increase dst ptr by 32
2688c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_31_16)
2698c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_31_8)
2708c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_31_8)
2718c2ecf20Sopenharmony_ci.Lmedl31:
2728c2ecf20Sopenharmony_ci	addcc	%o2, 16, %o2		! adjust remaining count
2738c2ecf20Sopenharmony_ci	ble,pt	%xcc, .Lmedl15		! skip if 15 or fewer bytes left
2748c2ecf20Sopenharmony_ci	 nop				!
2758c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_15)
2768c2ecf20Sopenharmony_ci	add	%o1, 16, %o1		! increase src ptr by 16
2778c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_15)
2788c2ecf20Sopenharmony_ci	sub	%o2, 16, %o2		! decrease count by 16
2798c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_15_8)
2808c2ecf20Sopenharmony_ci	add	%o0, 16, %o0		! increase dst ptr by 16
2818c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_15_8)
2828c2ecf20Sopenharmony_ci.Lmedl15:
2838c2ecf20Sopenharmony_ci	addcc	%o2, 15, %o2		! restore count
2848c2ecf20Sopenharmony_ci	bz,pt	%xcc, .Lsmallx	! exit if finished
2858c2ecf20Sopenharmony_ci	 cmp	%o2, 8
2868c2ecf20Sopenharmony_ci	blt,pt	%xcc, .Lmedw7		! skip if 7 or fewer bytes left
2878c2ecf20Sopenharmony_ci	 tst	%o2
2888c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)	! load 8 bytes
2898c2ecf20Sopenharmony_ci	add	%o1, 8, %o1		! increase src ptr by 8
2908c2ecf20Sopenharmony_ci	add	%o0, 8, %o0		! increase dst ptr by 8
2918c2ecf20Sopenharmony_ci	subcc	%o2, 8, %o2		! decrease count by 8
2928c2ecf20Sopenharmony_ci	bnz,pn	%xcc, .Lmedw7
2938c2ecf20Sopenharmony_ci	 EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)	! and store 8
2948c2ecf20Sopenharmony_ci	retl
2958c2ecf20Sopenharmony_ci	 mov	EX_RETVAL(%g1), %o0	! restore %o0
2968c2ecf20Sopenharmony_ci
2978c2ecf20Sopenharmony_ci	.align 16
2988c2ecf20Sopenharmony_ci.Lsrc_dst_unaligned_on_8:
2998c2ecf20Sopenharmony_ci	! DST is 8-byte aligned, src is not
3008c2ecf20Sopenharmony_ci2:
3018c2ecf20Sopenharmony_ci	andcc	%o1, 0x3, %o5		! test word alignment
3028c2ecf20Sopenharmony_ci	bnz,pt	%xcc, .Lunalignsetup	! branch to skip if not word aligned
3038c2ecf20Sopenharmony_ci	 nop
3048c2ecf20Sopenharmony_ci
3058c2ecf20Sopenharmony_ci/*
3068c2ecf20Sopenharmony_ci * Handle all cases where src and dest are aligned on word
3078c2ecf20Sopenharmony_ci * boundaries. Use unrolled loops for better performance.
3088c2ecf20Sopenharmony_ci * This option wins over standard large data move when
3098c2ecf20Sopenharmony_ci * source and destination are in cache for medium
3108c2ecf20Sopenharmony_ci * to short data moves.
3118c2ecf20Sopenharmony_ci */
3128c2ecf20Sopenharmony_ci	set MED_WMAX, %o3
3138c2ecf20Sopenharmony_ci	cmp %o2, %o3 			! limit to store buffer size
3148c2ecf20Sopenharmony_ci	bge,pt	%xcc, .Lunalignrejoin	! otherwise rejoin main loop
3158c2ecf20Sopenharmony_ci	 nop
3168c2ecf20Sopenharmony_ci
3178c2ecf20Sopenharmony_ci	subcc	%o2, 31, %o2		! adjust length to allow cc test
3188c2ecf20Sopenharmony_ci					! for end of loop
3198c2ecf20Sopenharmony_ci	ble,pt	%xcc, .Lmedw31		! skip big loop if less than 16
3208c2ecf20Sopenharmony_ci.Lmedw32:
3218c2ecf20Sopenharmony_ci	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_31)! move a block of 32
3228c2ecf20Sopenharmony_ci	sllx	%o4, 32, %o5
3238c2ecf20Sopenharmony_ci	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_31)
3248c2ecf20Sopenharmony_ci	or	%o4, %o5, %o5
3258c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_31)
3268c2ecf20Sopenharmony_ci	subcc	%o2, 32, %o2		! decrement length count
3278c2ecf20Sopenharmony_ci	EX_LD(LOAD(ld, %o1+8, %o4), memcpy_retl_o2_plus_31_24)
3288c2ecf20Sopenharmony_ci	sllx	%o4, 32, %o5
3298c2ecf20Sopenharmony_ci	EX_LD(LOAD(ld, %o1+12, %o4), memcpy_retl_o2_plus_31_24)
3308c2ecf20Sopenharmony_ci	or	%o4, %o5, %o5
3318c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o5, %o0+8), memcpy_retl_o2_plus_31_24)
3328c2ecf20Sopenharmony_ci	add	%o1, 32, %o1		! increase src ptr by 32
3338c2ecf20Sopenharmony_ci	EX_LD(LOAD(ld, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
3348c2ecf20Sopenharmony_ci	sllx	%o4, 32, %o5
3358c2ecf20Sopenharmony_ci	EX_LD(LOAD(ld, %o1-12, %o4), memcpy_retl_o2_plus_31_16)
3368c2ecf20Sopenharmony_ci	or	%o4, %o5, %o5
3378c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o5, %o0+16), memcpy_retl_o2_plus_31_16)
3388c2ecf20Sopenharmony_ci	add	%o0, 32, %o0		! increase dst ptr by 32
3398c2ecf20Sopenharmony_ci	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_31_8)
3408c2ecf20Sopenharmony_ci	sllx	%o4, 32, %o5
3418c2ecf20Sopenharmony_ci	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_31_8)
3428c2ecf20Sopenharmony_ci	or	%o4, %o5, %o5
3438c2ecf20Sopenharmony_ci	bgu,pt	%xcc, .Lmedw32		! repeat if at least 32 bytes left
3448c2ecf20Sopenharmony_ci	 EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_31_8)
3458c2ecf20Sopenharmony_ci.Lmedw31:
3468c2ecf20Sopenharmony_ci	addcc	%o2, 31, %o2		! restore count
3478c2ecf20Sopenharmony_ci
3488c2ecf20Sopenharmony_ci	bz,pt	%xcc, .Lsmallx	! exit if finished
3498c2ecf20Sopenharmony_ci	 nop
3508c2ecf20Sopenharmony_ci	cmp	%o2, 16
3518c2ecf20Sopenharmony_ci	blt,pt	%xcc, .Lmedw15
3528c2ecf20Sopenharmony_ci	 nop
3538c2ecf20Sopenharmony_ci	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)! move a block of 16 bytes
3548c2ecf20Sopenharmony_ci	sllx	%o4, 32, %o5
3558c2ecf20Sopenharmony_ci	subcc	%o2, 16, %o2		! decrement length count
3568c2ecf20Sopenharmony_ci	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_16)
3578c2ecf20Sopenharmony_ci	or	%o4, %o5, %o5
3588c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_16)
3598c2ecf20Sopenharmony_ci	add	%o1, 16, %o1		! increase src ptr by 16
3608c2ecf20Sopenharmony_ci	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_8)
3618c2ecf20Sopenharmony_ci	add	%o0, 16, %o0		! increase dst ptr by 16
3628c2ecf20Sopenharmony_ci	sllx	%o4, 32, %o5
3638c2ecf20Sopenharmony_ci	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_8)
3648c2ecf20Sopenharmony_ci	or	%o4, %o5, %o5
3658c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_8)
3668c2ecf20Sopenharmony_ci.Lmedw15:
3678c2ecf20Sopenharmony_ci	bz,pt	%xcc, .Lsmallx	! exit if finished
3688c2ecf20Sopenharmony_ci	 cmp	%o2, 8
3698c2ecf20Sopenharmony_ci	blt,pn	%xcc, .Lmedw7		! skip if 7 or fewer bytes left
3708c2ecf20Sopenharmony_ci	 tst	%o2
3718c2ecf20Sopenharmony_ci	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! load 4 bytes
3728c2ecf20Sopenharmony_ci	subcc	%o2, 8, %o2		! decrease count by 8
3738c2ecf20Sopenharmony_ci	EX_ST(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_8)! and store 4 bytes
3748c2ecf20Sopenharmony_ci	add	%o1, 8, %o1		! increase src ptr by 8
3758c2ecf20Sopenharmony_ci	EX_LD(LOAD(ld, %o1-4, %o3), memcpy_retl_o2_plus_4)	! load 4 bytes
3768c2ecf20Sopenharmony_ci	add	%o0, 8, %o0		! increase dst ptr by 8
3778c2ecf20Sopenharmony_ci	EX_ST(STORE(stw, %o3, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes
3788c2ecf20Sopenharmony_ci	bz,pt	%xcc, .Lsmallx	! exit if finished
3798c2ecf20Sopenharmony_ci.Lmedw7:				! count is ge 1, less than 8
3808c2ecf20Sopenharmony_ci	cmp	%o2, 4			! check for 4 bytes left
3818c2ecf20Sopenharmony_ci	blt,pn	%xcc, .Lsmallleft3	! skip if 3 or fewer bytes left
3828c2ecf20Sopenharmony_ci	 nop				!
3838c2ecf20Sopenharmony_ci	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! load 4 bytes
3848c2ecf20Sopenharmony_ci	add	%o1, 4, %o1		! increase src ptr by 4
3858c2ecf20Sopenharmony_ci	add	%o0, 4, %o0		! increase dst ptr by 4
3868c2ecf20Sopenharmony_ci	subcc	%o2, 4, %o2		! decrease count by 4
3878c2ecf20Sopenharmony_ci	bnz	.Lsmallleft3
3888c2ecf20Sopenharmony_ci	 EX_ST(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes
3898c2ecf20Sopenharmony_ci	retl
3908c2ecf20Sopenharmony_ci	 mov	EX_RETVAL(%g1), %o0
3918c2ecf20Sopenharmony_ci
3928c2ecf20Sopenharmony_ci	.align 16
3938c2ecf20Sopenharmony_ci.Llarge_align8_copy:			! Src and dst share 8 byte alignment
3948c2ecf20Sopenharmony_ci	! align dst to 64 byte boundary
3958c2ecf20Sopenharmony_ci	andcc	%o0, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
3968c2ecf20Sopenharmony_ci	brz,pn	%o3, .Laligned_to_64
3978c2ecf20Sopenharmony_ci	 andcc	%o0, 8, %o3		! odd long words to move?
3988c2ecf20Sopenharmony_ci	brz,pt	%o3, .Laligned_to_16
3998c2ecf20Sopenharmony_ci	 nop
4008c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
4018c2ecf20Sopenharmony_ci	sub	%o2, 8, %o2
4028c2ecf20Sopenharmony_ci	add	%o1, 8, %o1		! increment src ptr
4038c2ecf20Sopenharmony_ci	add	%o0, 8, %o0		! increment dst ptr
4048c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
4058c2ecf20Sopenharmony_ci.Laligned_to_16:
4068c2ecf20Sopenharmony_ci	andcc	%o0, 16, %o3		! pair of long words to move?
4078c2ecf20Sopenharmony_ci	brz,pt	%o3, .Laligned_to_32
4088c2ecf20Sopenharmony_ci	 nop
4098c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
4108c2ecf20Sopenharmony_ci	sub	%o2, 16, %o2
4118c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_16)
4128c2ecf20Sopenharmony_ci	add	%o1, 16, %o1		! increment src ptr
4138c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
4148c2ecf20Sopenharmony_ci	add	%o0, 16, %o0		! increment dst ptr
4158c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
4168c2ecf20Sopenharmony_ci.Laligned_to_32:
4178c2ecf20Sopenharmony_ci	andcc	%o0, 32, %o3		! four long words to move?
4188c2ecf20Sopenharmony_ci	brz,pt	%o3, .Laligned_to_64
4198c2ecf20Sopenharmony_ci	 nop
4208c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
4218c2ecf20Sopenharmony_ci	sub	%o2, 32, %o2
4228c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_32)
4238c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_24)
4248c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_24)
4258c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_16)
4268c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_16)
4278c2ecf20Sopenharmony_ci	add	%o1, 32, %o1		! increment src ptr
4288c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
4298c2ecf20Sopenharmony_ci	add	%o0, 32, %o0		! increment dst ptr
4308c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
4318c2ecf20Sopenharmony_ci.Laligned_to_64:
4328c2ecf20Sopenharmony_ci!
4338c2ecf20Sopenharmony_ci!	Using block init store (BIS) instructions to avoid fetching cache
4348c2ecf20Sopenharmony_ci!	lines from memory. Use ST_CHUNK stores to first element of each cache
4358c2ecf20Sopenharmony_ci!	line (similar to prefetching) to avoid overfilling STQ or miss buffers.
4368c2ecf20Sopenharmony_ci!	Gives existing cache lines time to be moved out of L1/L2/L3 cache.
4378c2ecf20Sopenharmony_ci!	Initial stores using MRU version of BIS to keep cache line in
4388c2ecf20Sopenharmony_ci!	cache until we are ready to store final element of cache line.
4398c2ecf20Sopenharmony_ci!	Then store last element using the LRU version of BIS.
4408c2ecf20Sopenharmony_ci!
4418c2ecf20Sopenharmony_ci	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
4428c2ecf20Sopenharmony_ci	and	%o2, 0x3f, %o2		! residue bytes in %o2
4438c2ecf20Sopenharmony_ci!
4448c2ecf20Sopenharmony_ci!	We use STORE_MRU_ASI for the first seven stores to each cache line
4458c2ecf20Sopenharmony_ci!	followed by STORE_ASI (mark as LRU) for the last store. That
4468c2ecf20Sopenharmony_ci!	mixed approach reduces the probability that the cache line is removed
4478c2ecf20Sopenharmony_ci!	before we finish setting it, while minimizing the effects on
4488c2ecf20Sopenharmony_ci!	other cached values during a large memcpy
4498c2ecf20Sopenharmony_ci!
4508c2ecf20Sopenharmony_ci!	ST_CHUNK batches up initial BIS operations for several cache lines
4518c2ecf20Sopenharmony_ci!	to allow multiple requests to not be blocked by overflowing the
4528c2ecf20Sopenharmony_ci!	store miss buffer. Then the matching stores for all those
4538c2ecf20Sopenharmony_ci!	BIS operations are executed.
4548c2ecf20Sopenharmony_ci!
4558c2ecf20Sopenharmony_ci
4568c2ecf20Sopenharmony_ci	sub	%o0, 8, %o0		! adjust %o0 for ASI alignment
4578c2ecf20Sopenharmony_ci.Lalign_loop:
4588c2ecf20Sopenharmony_ci	cmp	%o5, ST_CHUNK*64
4598c2ecf20Sopenharmony_ci	blu,pt	%xcc, .Lalign_loop_fin
4608c2ecf20Sopenharmony_ci	 mov	ST_CHUNK,%o3
4618c2ecf20Sopenharmony_ci.Lalign_loop_start:
4628c2ecf20Sopenharmony_ci	prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
4638c2ecf20Sopenharmony_ci	subcc	%o3, 1, %o3
4648c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
4658c2ecf20Sopenharmony_ci	add	%o1, 64, %o1
4668c2ecf20Sopenharmony_ci	add	%o0, 8, %o0
4678c2ecf20Sopenharmony_ci	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
4688c2ecf20Sopenharmony_ci	bgu	%xcc,.Lalign_loop_start
4698c2ecf20Sopenharmony_ci	 add	%o0, 56, %o0
4708c2ecf20Sopenharmony_ci
4718c2ecf20Sopenharmony_ci	mov	ST_CHUNK,%o3
4728c2ecf20Sopenharmony_ci	sllx	%o3, 6, %o4		! ST_CHUNK*64
4738c2ecf20Sopenharmony_ci	sub	%o1, %o4, %o1		! reset %o1
4748c2ecf20Sopenharmony_ci	sub	%o0, %o4, %o0		! reset %o0
4758c2ecf20Sopenharmony_ci
4768c2ecf20Sopenharmony_ci.Lalign_loop_rest:
4778c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
4788c2ecf20Sopenharmony_ci	add	%o0, 16, %o0
4798c2ecf20Sopenharmony_ci	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
4808c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
4818c2ecf20Sopenharmony_ci	add	%o0, 8, %o0
4828c2ecf20Sopenharmony_ci	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
4838c2ecf20Sopenharmony_ci	subcc	%o3, 1, %o3
4848c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5)
4858c2ecf20Sopenharmony_ci	add	%o0, 8, %o0
4868c2ecf20Sopenharmony_ci	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
4878c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5)
4888c2ecf20Sopenharmony_ci	add	%o0, 8, %o0
4898c2ecf20Sopenharmony_ci	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
4908c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5)
4918c2ecf20Sopenharmony_ci	add	%o0, 8, %o0
4928c2ecf20Sopenharmony_ci	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
4938c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5)
4948c2ecf20Sopenharmony_ci	add	%o1, 64, %o1
4958c2ecf20Sopenharmony_ci	add	%o0, 8, %o0
4968c2ecf20Sopenharmony_ci	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
4978c2ecf20Sopenharmony_ci	add	%o0, 8, %o0
4988c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5)
4998c2ecf20Sopenharmony_ci	sub	%o5, 64, %o5
5008c2ecf20Sopenharmony_ci	bgu	%xcc,.Lalign_loop_rest
5018c2ecf20Sopenharmony_ci	! mark cache line as LRU
5028c2ecf20Sopenharmony_ci	 EX_ST(STORE_INIT(%o4, %o0), memcpy_retl_o2_plus_o5_plus_64)
5038c2ecf20Sopenharmony_ci
5048c2ecf20Sopenharmony_ci	cmp	%o5, ST_CHUNK*64
5058c2ecf20Sopenharmony_ci	bgu,pt	%xcc, .Lalign_loop_start
5068c2ecf20Sopenharmony_ci	 mov	ST_CHUNK,%o3
5078c2ecf20Sopenharmony_ci
5088c2ecf20Sopenharmony_ci	cmp	%o5, 0
5098c2ecf20Sopenharmony_ci	beq	.Lalign_done
5108c2ecf20Sopenharmony_ci	 nop
5118c2ecf20Sopenharmony_ci.Lalign_loop_fin:
	! Tail loop of the aligned block copy: moves 64 bytes per pass with
	! ordinary ldx/stx, using %o4 as the only data register.  %o5 is the
	! byte counter for this loop and drops by 64 per pass.  Every store is
	! addressed relative to %o0+8; .Lalign_done adds that 8 back to %o0.
5128c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
5138c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_o5)
5148c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
5158c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o4, %o0+8+8), memcpy_retl_o2_plus_o5)
5168c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
5178c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o4, %o0+8+16), memcpy_retl_o2_plus_o5)
5188c2ecf20Sopenharmony_ci	subcc	%o5, 64, %o5		! count this block; flags are consumed by the bgu below
5198c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5_64)
5208c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o4, %o0+8+24), memcpy_retl_o2_plus_o5_64)
5218c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5_64)
5228c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o4, %o0+8+32), memcpy_retl_o2_plus_o5_64)
5238c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5_64)
5248c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o4, %o0+8+40), memcpy_retl_o2_plus_o5_64)
5258c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5_64)
5268c2ecf20Sopenharmony_ci	add	%o1, 64, %o1		! src advanced early; the last load uses %o1-8
5278c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o4, %o0+8+48), memcpy_retl_o2_plus_o5_64)
5288c2ecf20Sopenharmony_ci	add	%o0, 64, %o0		! dst advanced early; the last store uses %o0
5298c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5_64)
5308c2ecf20Sopenharmony_ci	bgu	%xcc,.Lalign_loop_fin	! another pass while %o5 is still above zero
5318c2ecf20Sopenharmony_ci	 EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_o5_64)	! delay slot: eighth doubleword
5328c2ecf20Sopenharmony_ci
5338c2ecf20Sopenharmony_ci.Lalign_done:
	! Exit from the aligned block copy: undo the +8 store bias on %o0,
	! issue a StoreStore barrier, and branch to .Lmedl63 for the tail.
5348c2ecf20Sopenharmony_ci	add	%o0, 8, %o0		! restore %o0 from ASI alignment
5358c2ecf20Sopenharmony_ci	membar	#StoreStore
5368c2ecf20Sopenharmony_ci	sub	%o2, 63, %o2		! adjust length to allow cc test
5378c2ecf20Sopenharmony_ci	ba	.Lmedl63		! in .Lmedl63
5388c2ecf20Sopenharmony_ci	 nop
5398c2ecf20Sopenharmony_ci
5408c2ecf20Sopenharmony_ci	.align 16
5418c2ecf20Sopenharmony_ci	! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX
5428c2ecf20Sopenharmony_ci.Lunalignsetup:
5438c2ecf20Sopenharmony_ci.Lunalignrejoin:
	! Enter VIS (FP register) mode and choose the unaligned strategy:
	! counts of MED_UMAX or more branch to .Lunalign_large, smaller ones
	! are split into whole 64-byte blocks (%o5) plus residue (%o2) and
	! fall into .Lunalign_loop.
5448c2ecf20Sopenharmony_ci	mov	%g1, %o3	! save %g1 as VISEntryHalf clobbers it
5458c2ecf20Sopenharmony_ci#ifdef NON_USER_COPY
5468c2ecf20Sopenharmony_ci	VISEntryHalfFast(.Lmedium_vis_entry_fail_cp)
5478c2ecf20Sopenharmony_ci#else
5488c2ecf20Sopenharmony_ci	VISEntryHalf
5498c2ecf20Sopenharmony_ci#endif
5508c2ecf20Sopenharmony_ci	mov	%o3, %g1	! restore %g1
5518c2ecf20Sopenharmony_ci
5528c2ecf20Sopenharmony_ci	set MED_UMAX, %o3
5538c2ecf20Sopenharmony_ci	cmp %o2, %o3 		! check for medium unaligned limit
5548c2ecf20Sopenharmony_ci	bge,pt	%xcc,.Lunalign_large
5558c2ecf20Sopenharmony_ci	 prefetch [%o1 + (4 * BLOCK_SIZE)], 20
5568c2ecf20Sopenharmony_ci	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
5578c2ecf20Sopenharmony_ci	and	%o2, 0x3f, %o2		! residue bytes in %o2
5588c2ecf20Sopenharmony_ci	cmp	%o2, 8			! Insure we do not load beyond
5598c2ecf20Sopenharmony_ci	bgt	.Lunalign_adjust	! end of source buffer
5608c2ecf20Sopenharmony_ci	 andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
5618c2ecf20Sopenharmony_ci	add	%o2, 64, %o2		! adjust to leave loop
5628c2ecf20Sopenharmony_ci	sub	%o5, 64, %o5		! early if necessary
5638c2ecf20Sopenharmony_ci.Lunalign_adjust:
5648c2ecf20Sopenharmony_ci	alignaddr %o1, %g0, %g0		! generate %gsr
5658c2ecf20Sopenharmony_ci	add	%o1, %o5, %o1		! advance %o1 to after blocks
5668c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)	! prime %f0 with the first aligned doubleword
5678c2ecf20Sopenharmony_ci.Lunalign_loop:
	! Medium unaligned copy, BLOCK_SIZE bytes per pass.  Aligned ldd loads
	! from %o4 are merged pairwise with faligndata and written with std.
	! %f0 carries the last doubleword read into the next pass.  After the
	! subcc the fault handler suffix steps from _plus_64 down to _plus_8,
	! dropping by 8 after each doubleword stored.
5688c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
5698c2ecf20Sopenharmony_ci	faligndata %f0, %f2, %f16
5708c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(ldd, %o4+16, %f4), memcpy_retl_o2_plus_o5)
5718c2ecf20Sopenharmony_ci	subcc	%o5, BLOCK_SIZE, %o5	! count this block; flags are consumed by the bgu below
5728c2ecf20Sopenharmony_ci	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5_plus_64)
5738c2ecf20Sopenharmony_ci	faligndata %f2, %f4, %f18
5748c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(ldd, %o4+24, %f6), memcpy_retl_o2_plus_o5_plus_56)
5758c2ecf20Sopenharmony_ci	EX_ST_FP(STORE(std, %f18, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
5768c2ecf20Sopenharmony_ci	faligndata %f4, %f6, %f20
5778c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(ldd, %o4+32, %f8), memcpy_retl_o2_plus_o5_plus_48)
5788c2ecf20Sopenharmony_ci	EX_ST_FP(STORE(std, %f20, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
5798c2ecf20Sopenharmony_ci	faligndata %f6, %f8, %f22
5808c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(ldd, %o4+40, %f10), memcpy_retl_o2_plus_o5_plus_40)
5818c2ecf20Sopenharmony_ci	EX_ST_FP(STORE(std, %f22, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
5828c2ecf20Sopenharmony_ci	faligndata %f8, %f10, %f24
5838c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(ldd, %o4+48, %f12), memcpy_retl_o2_plus_o5_plus_32)
5848c2ecf20Sopenharmony_ci	EX_ST_FP(STORE(std, %f24, %o0+32), memcpy_retl_o2_plus_o5_plus_32)
5858c2ecf20Sopenharmony_ci	faligndata %f10, %f12, %f26
5868c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(ldd, %o4+56, %f14), memcpy_retl_o2_plus_o5_plus_24)
5878c2ecf20Sopenharmony_ci	add	%o4, BLOCK_SIZE, %o4	! aligned src pointer moves to the next block
5888c2ecf20Sopenharmony_ci	EX_ST_FP(STORE(std, %f26, %o0+40), memcpy_retl_o2_plus_o5_plus_24)
5898c2ecf20Sopenharmony_ci	faligndata %f12, %f14, %f28
5908c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5_plus_16)	! first doubleword of the next block
5918c2ecf20Sopenharmony_ci	EX_ST_FP(STORE(std, %f28, %o0+48), memcpy_retl_o2_plus_o5_plus_16)
5928c2ecf20Sopenharmony_ci	faligndata %f14, %f0, %f30
5938c2ecf20Sopenharmony_ci	EX_ST_FP(STORE(std, %f30, %o0+56), memcpy_retl_o2_plus_o5_plus_8)
5948c2ecf20Sopenharmony_ci	add	%o0, BLOCK_SIZE, %o0
5958c2ecf20Sopenharmony_ci	bgu,pt	%xcc, .Lunalign_loop	! another pass while %o5 is still above zero
5968c2ecf20Sopenharmony_ci	 prefetch [%o4 + (5 * BLOCK_SIZE)], 20
5978c2ecf20Sopenharmony_ci	ba	.Lunalign_done		! residue in %o2 is finished there
5988c2ecf20Sopenharmony_ci	 nop
5998c2ecf20Sopenharmony_ci
6008c2ecf20Sopenharmony_ci.Lunalign_large:
	! Large unaligned copy: if dst is not 64-byte aligned, compute in %o3
	! the bytes needed to reach the next 64-byte boundary, take them off
	! %o2, and pick a pre-copy loop from the low two bits of src.
6018c2ecf20Sopenharmony_ci	andcc	%o0, 0x3f, %o3		! is dst 64-byte block aligned?
6028c2ecf20Sopenharmony_ci	bz	%xcc, .Lunalignsrc
6038c2ecf20Sopenharmony_ci	 sub	%o3, 64, %o3		! %o3 will be multiple of 8
6048c2ecf20Sopenharmony_ci	neg	%o3			! bytes until dest is 64 byte aligned
6058c2ecf20Sopenharmony_ci	sub	%o2, %o3, %o2		! update cnt with bytes to be moved
6068c2ecf20Sopenharmony_ci	! Move bytes according to source alignment
6078c2ecf20Sopenharmony_ci	andcc	%o1, 0x1, %o5
6088c2ecf20Sopenharmony_ci	bnz	%xcc, .Lunalignbyte	! check for byte alignment
6098c2ecf20Sopenharmony_ci	 nop
6108c2ecf20Sopenharmony_ci	andcc	%o1, 2, %o5		! check for half word alignment
6118c2ecf20Sopenharmony_ci	bnz	%xcc, .Lunalignhalf
6128c2ecf20Sopenharmony_ci	 nop
6138c2ecf20Sopenharmony_ci	! Src is word aligned
6148c2ecf20Sopenharmony_ci.Lunalignword:
	! Pre-copy for a word aligned src: each pass moves 8 bytes as two
	! 4-byte load/store pairs and repeats until %o3 reaches zero.
6158c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_o3)	! load 4 bytes
6168c2ecf20Sopenharmony_ci	add	%o1, 8, %o1		! increase src ptr by 8
6178c2ecf20Sopenharmony_ci	EX_ST_FP(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_o3)	! and store 4
6188c2ecf20Sopenharmony_ci	subcc	%o3, 8, %o3		! decrease count by 8
6198c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_o3_plus_4)! load 4
6208c2ecf20Sopenharmony_ci	add	%o0, 8, %o0		! increase dst ptr by 8
6218c2ecf20Sopenharmony_ci	bnz	%xcc, .Lunalignword
6228c2ecf20Sopenharmony_ci	 EX_ST_FP(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_o3_plus_4)	! delay slot: second word
6238c2ecf20Sopenharmony_ci	ba	.Lunalignsrc
6248c2ecf20Sopenharmony_ci	 nop
6258c2ecf20Sopenharmony_ci
6268c2ecf20Sopenharmony_ci	! Src is half-word aligned
6278c2ecf20Sopenharmony_ci.Lunalignhalf:
	! Pre-copy for a half-word aligned src: each pass reads 2+4+2 bytes,
	! packs them into %o5 in memory order and writes one 8-byte stx.
	! Repeats until %o3 reaches zero.
6288c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(lduh, %o1, %o4), memcpy_retl_o2_plus_o3)	! load 2 bytes
6298c2ecf20Sopenharmony_ci	sllx	%o4, 32, %o5		! shift left
6308c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(lduw, %o1+2, %o4), memcpy_retl_o2_plus_o3)	! next 4 bytes
6318c2ecf20Sopenharmony_ci	or	%o4, %o5, %o5
6328c2ecf20Sopenharmony_ci	sllx	%o5, 16, %o5		! make room for the last 2 bytes
6338c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(lduh, %o1+6, %o4), memcpy_retl_o2_plus_o3)	! last 2 bytes
6348c2ecf20Sopenharmony_ci	or	%o4, %o5, %o5
6358c2ecf20Sopenharmony_ci	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
6368c2ecf20Sopenharmony_ci	add	%o1, 8, %o1
6378c2ecf20Sopenharmony_ci	subcc	%o3, 8, %o3
6388c2ecf20Sopenharmony_ci	bnz	%xcc, .Lunalignhalf
6398c2ecf20Sopenharmony_ci	 add	%o0, 8, %o0
6408c2ecf20Sopenharmony_ci	ba	.Lunalignsrc
6418c2ecf20Sopenharmony_ci	 nop
6428c2ecf20Sopenharmony_ci
6438c2ecf20Sopenharmony_ci	! Src is Byte aligned
6448c2ecf20Sopenharmony_ci.Lunalignbyte:
	! Pre-copy for an odd src address: each pass reads 1+2+2+2+1 bytes,
	! packs them into %o5 in memory order and writes one 8-byte stx.
	! Between stores %o0 holds (dst - src) so that only %o1 has to be
	! advanced; the real dst is rebuilt around each store.  Falls through
	! into .Lunalignsrc when %o3 reaches zero.
6458c2ecf20Sopenharmony_ci	sub	%o0, %o1, %o0		! share pointer advance
6468c2ecf20Sopenharmony_ci.Lunalignbyte_loop:
6478c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(ldub, %o1, %o4), memcpy_retl_o2_plus_o3)
6488c2ecf20Sopenharmony_ci	sllx	%o4, 56, %o5		! byte 0 goes to the top of %o5
6498c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(lduh, %o1+1, %o4), memcpy_retl_o2_plus_o3)
6508c2ecf20Sopenharmony_ci	sllx	%o4, 40, %o4
6518c2ecf20Sopenharmony_ci	or	%o4, %o5, %o5
6528c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(lduh, %o1+3, %o4), memcpy_retl_o2_plus_o3)
6538c2ecf20Sopenharmony_ci	sllx	%o4, 24, %o4
6548c2ecf20Sopenharmony_ci	or	%o4, %o5, %o5
6558c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(lduh, %o1+5, %o4), memcpy_retl_o2_plus_o3)
6568c2ecf20Sopenharmony_ci	sllx	%o4,  8, %o4
6578c2ecf20Sopenharmony_ci	or	%o4, %o5, %o5
6588c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(ldub, %o1+7, %o4), memcpy_retl_o2_plus_o3)
6598c2ecf20Sopenharmony_ci	or	%o4, %o5, %o5
6608c2ecf20Sopenharmony_ci	add	%o0, %o1, %o0		! rebuild the real dst for the store
6618c2ecf20Sopenharmony_ci	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
6628c2ecf20Sopenharmony_ci	sub	%o0, %o1, %o0		! back to (dst - src)
6638c2ecf20Sopenharmony_ci	subcc	%o3, 8, %o3
6648c2ecf20Sopenharmony_ci	bnz	%xcc, .Lunalignbyte_loop
6658c2ecf20Sopenharmony_ci	 add	%o1, 8, %o1
6668c2ecf20Sopenharmony_ci	add	%o0,%o1, %o0 		! restore pointer
6678c2ecf20Sopenharmony_ci
6688c2ecf20Sopenharmony_ci	! Destination is now block (64 byte aligned)
6698c2ecf20Sopenharmony_ci.Lunalignsrc:
	! Set up .Lunalign_sloop: %o5 = whole 64-byte blocks minus one block,
	! %o2 = residue plus that held-back block, %o4 = src rounded down to 8,
	! %f14 = first aligned doubleword.  %o1 is moved past the blocks the
	! loop will copy.
6708c2ecf20Sopenharmony_ci	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
6718c2ecf20Sopenharmony_ci	and	%o2, 0x3f, %o2		! residue bytes in %o2
6728c2ecf20Sopenharmony_ci	add	%o2, 64, %o2		! Insure we do not load beyond
6738c2ecf20Sopenharmony_ci	sub	%o5, 64, %o5		! end of source buffer
6748c2ecf20Sopenharmony_ci
6758c2ecf20Sopenharmony_ci	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
6768c2ecf20Sopenharmony_ci	alignaddr %o1, %g0, %g0		! generate %gsr
6778c2ecf20Sopenharmony_ci	add	%o1, %o5, %o1		! advance %o1 to after blocks
6788c2ecf20Sopenharmony_ci
6798c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(ldd, %o4, %f14), memcpy_retl_o2_plus_o5)
6808c2ecf20Sopenharmony_ci	add	%o4, 8, %o4		! loop loads start at the second doubleword
6818c2ecf20Sopenharmony_ci.Lunalign_sloop:
	! Large unaligned copy, 64 bytes per pass.  Eight aligned ldd loads
	! from %o4 go into %f16-%f30, are merged pairwise with faligndata into
	! %f0-%f14 and written with std.  %f14 is reloaded from %f30 at the end
	! of the pass so the next pass starts from the last doubleword read.
	!
	! Fault handler accounting: %o5 is reduced by 64 right after the first
	! store, so from there on an access that faults with k bytes of the
	! current block already stored must report %o2 + %o5 + (64 - k).  The
	! handler suffixes below follow that rule (56, 48, 40, 32, 24, 16, 8),
	! the same sequence used in .Lunalign_loop.
6828c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(ldd, %o4, %f16), memcpy_retl_o2_plus_o5)
6838c2ecf20Sopenharmony_ci	faligndata %f14, %f16, %f0
6848c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(ldd, %o4+8, %f18), memcpy_retl_o2_plus_o5)
6858c2ecf20Sopenharmony_ci	faligndata %f16, %f18, %f2
6868c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(ldd, %o4+16, %f20), memcpy_retl_o2_plus_o5)
6878c2ecf20Sopenharmony_ci	faligndata %f18, %f20, %f4
6888c2ecf20Sopenharmony_ci	EX_ST_FP(STORE(std, %f0, %o0), memcpy_retl_o2_plus_o5)
6898c2ecf20Sopenharmony_ci	subcc	%o5, 64, %o5		! count this block; flags are consumed by the bgu below
6908c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(ldd, %o4+24, %f22), memcpy_retl_o2_plus_o5_plus_56)
6918c2ecf20Sopenharmony_ci	faligndata %f20, %f22, %f6
6928c2ecf20Sopenharmony_ci	EX_ST_FP(STORE(std, %f2, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
6938c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(ldd, %o4+32, %f24), memcpy_retl_o2_plus_o5_plus_48)
6948c2ecf20Sopenharmony_ci	faligndata %f22, %f24, %f8
6958c2ecf20Sopenharmony_ci	EX_ST_FP(STORE(std, %f4, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
6968c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(ldd, %o4+40, %f26), memcpy_retl_o2_plus_o5_plus_40)
6978c2ecf20Sopenharmony_ci	faligndata %f24, %f26, %f10
6988c2ecf20Sopenharmony_ci	EX_ST_FP(STORE(std, %f6, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
6998c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(ldd, %o4+48, %f28), memcpy_retl_o2_plus_o5_plus_32)	! 32 bytes stored so far
7008c2ecf20Sopenharmony_ci	faligndata %f26, %f28, %f12
7018c2ecf20Sopenharmony_ci	EX_ST_FP(STORE(std, %f8, %o0+32), memcpy_retl_o2_plus_o5_plus_32)
7028c2ecf20Sopenharmony_ci	add	%o4, 64, %o4		! aligned src pointer moves to the next block
7038c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(ldd, %o4-8, %f30), memcpy_retl_o2_plus_o5_plus_24)	! 40 bytes stored so far
7048c2ecf20Sopenharmony_ci	faligndata %f28, %f30, %f14
7058c2ecf20Sopenharmony_ci	EX_ST_FP(STORE(std, %f10, %o0+40), memcpy_retl_o2_plus_o5_plus_24)
7068c2ecf20Sopenharmony_ci	EX_ST_FP(STORE(std, %f12, %o0+48), memcpy_retl_o2_plus_o5_plus_16)
7078c2ecf20Sopenharmony_ci	add	%o0, 64, %o0
7088c2ecf20Sopenharmony_ci	EX_ST_FP(STORE(std, %f14, %o0-8), memcpy_retl_o2_plus_o5_plus_8)
7098c2ecf20Sopenharmony_ci	fsrc2	%f30, %f14		! carry the last doubleword read into the next pass
7108c2ecf20Sopenharmony_ci	bgu,pt	%xcc, .Lunalign_sloop	! another pass while %o5 is still above zero
7118c2ecf20Sopenharmony_ci	 prefetch [%o4 + (8 * BLOCK_SIZE)], 20
7128c2ecf20Sopenharmony_ci
7138c2ecf20Sopenharmony_ci.Lunalign_done:
7148c2ecf20Sopenharmony_ci	! Handle trailing bytes, 64 to 127
7158c2ecf20Sopenharmony_ci	! Dest long word aligned, Src not long word aligned
	! With 15 or fewer bytes left, skip straight to .Lunalign_short.
	! Otherwise split %o2 into %o5 (multiple of 8, one doubleword held
	! back) and a residue, and prime %f0 for .Lunalign_by8.
7168c2ecf20Sopenharmony_ci	cmp	%o2, 15
7178c2ecf20Sopenharmony_ci	bleu	%xcc, .Lunalign_short
7188c2ecf20Sopenharmony_ci
7198c2ecf20Sopenharmony_ci	 andn	%o2, 0x7, %o5		! %o5 is multiple of 8
7208c2ecf20Sopenharmony_ci	and	%o2, 0x7, %o2		! residue bytes in %o2
7218c2ecf20Sopenharmony_ci	add	%o2, 8, %o2
7228c2ecf20Sopenharmony_ci	sub	%o5, 8, %o5		! insure we do not load past end of src
7238c2ecf20Sopenharmony_ci	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
7248c2ecf20Sopenharmony_ci	add	%o1, %o5, %o1		! advance %o1 to after multiple of 8
7258c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)! fetch partialword
7268c2ecf20Sopenharmony_ci.Lunalign_by8:
	! Unaligned tail, 8 bytes per pass: load the next aligned doubleword,
	! merge it with the previous one (%f0) via faligndata, store the
	! result, then keep the new doubleword in %f0 for the next pass.
	! The store comes after the subcc, so its fault handler must add the
	! 8 bytes back (_plus_8); the load before the subcc uses plain
	! %o2 + %o5.
7278c2ecf20Sopenharmony_ci	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
7288c2ecf20Sopenharmony_ci	add	%o4, 8, %o4
7298c2ecf20Sopenharmony_ci	faligndata %f0, %f2, %f16
7308c2ecf20Sopenharmony_ci	subcc	%o5, 8, %o5		! count this doubleword; flags are consumed by the bgu below
7318c2ecf20Sopenharmony_ci	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5_plus_8)
7328c2ecf20Sopenharmony_ci	fsrc2	%f2, %f0
7338c2ecf20Sopenharmony_ci	bgu,pt	%xcc, .Lunalign_by8
7348c2ecf20Sopenharmony_ci	 add	%o0, 8, %o0
7358c2ecf20Sopenharmony_ci
7368c2ecf20Sopenharmony_ci.Lunalign_short:
	! Leave VIS mode (macro chosen to match the one used on entry in
	! .Lunalignsetup) and let .Lsmallrest copy what is left in %o2.
7378c2ecf20Sopenharmony_ci#ifdef NON_USER_COPY
7388c2ecf20Sopenharmony_ci	VISExitHalfFast
7398c2ecf20Sopenharmony_ci#else
7408c2ecf20Sopenharmony_ci	VISExitHalf
7418c2ecf20Sopenharmony_ci#endif
7428c2ecf20Sopenharmony_ci	ba	.Lsmallrest
7438c2ecf20Sopenharmony_ci	 nop
7448c2ecf20Sopenharmony_ci
7458c2ecf20Sopenharmony_ci/*
7468c2ecf20Sopenharmony_ci * This is a special case of nested memcpy. This can happen when kernel
7478c2ecf20Sopenharmony_ci * calls unaligned memcpy back to back without saving FP registers. We need
7488c2ecf20Sopenharmony_ci * traps(context switch) to save/restore FP registers. If the kernel calls
7498c2ecf20Sopenharmony_ci * memcpy without this trap sequence we will hit FP corruption. Let's use
7508c2ecf20Sopenharmony_ci * the normal integer load/store method in this case.
7518c2ecf20Sopenharmony_ci */
7528c2ecf20Sopenharmony_ci
7538c2ecf20Sopenharmony_ci#ifdef NON_USER_COPY
7548c2ecf20Sopenharmony_ci.Lmedium_vis_entry_fail_cp:
	! Target handed to VISEntryHalfFast in .Lunalignsetup.  The integer
	! only copy below starts by combining both pointers in %g2 so that
	! one test of the low bits covers src and dst together.
7558c2ecf20Sopenharmony_ci	or	%o0, %o1, %g2
7568c2ecf20Sopenharmony_ci#endif
7578c2ecf20Sopenharmony_ci.Lmedium_cp:
	! Expects %g2 = dst | src.  If either pointer is not 8-byte aligned
	! go to .Lmedium_unaligned_cp, otherwise fall into
	! .Lmedium_noprefetch_cp.
7588c2ecf20Sopenharmony_ci	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
7598c2ecf20Sopenharmony_ci	andcc	%g2, 0x7, %g0
7608c2ecf20Sopenharmony_ci	bne,pn	%xcc, .Lmedium_unaligned_cp
7618c2ecf20Sopenharmony_ci	 nop
7628c2ecf20Sopenharmony_ci
7638c2ecf20Sopenharmony_ci.Lmedium_noprefetch_cp:
	! Integer-only copy, reached once src and dst are both 8-byte aligned.
	! Three stages, each taking its share out of %o2 up front:
	!   first 1:  32 bytes per pass (four ldx, then four stx), count in %o5
	!   second 1:  8 bytes per pass for the 0x18 bits of %o2, count in %o5
	!   3:         one optional 4-byte word, then .Ltiny_cp for up to 3 bytes
	! In the 32-byte loop %o5 is reduced before the stores, so each store
	! reports %o2 + %o5 + (32 - bytes already stored): 32, 24, 16, 8.
7648c2ecf20Sopenharmony_ci	andncc	%o2, 0x20 - 1, %o5
7658c2ecf20Sopenharmony_ci	be,pn	%xcc, 2f
7668c2ecf20Sopenharmony_ci	 sub	%o2, %o5, %o2
7678c2ecf20Sopenharmony_ci1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
7688c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1 + 0x08, %g2), memcpy_retl_o2_plus_o5)
7698c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1 + 0x10, %g7), memcpy_retl_o2_plus_o5)
7708c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1 + 0x18, %o4), memcpy_retl_o2_plus_o5)
7718c2ecf20Sopenharmony_ci	add	%o1, 0x20, %o1
7728c2ecf20Sopenharmony_ci	subcc	%o5, 0x20, %o5
7738c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o3, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_32)
7748c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %g2, %o0 + 0x08), memcpy_retl_o2_plus_o5_plus_24)
7758c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %g7, %o0 + 0x10), memcpy_retl_o2_plus_o5_plus_16)
7768c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %o4, %o0 + 0x18), memcpy_retl_o2_plus_o5_plus_8)
7778c2ecf20Sopenharmony_ci	bne,pt	%xcc, 1b
7788c2ecf20Sopenharmony_ci	 add	%o0, 0x20, %o0
7798c2ecf20Sopenharmony_ci2:	andcc	%o2, 0x18, %o5
7808c2ecf20Sopenharmony_ci	be,pt	%xcc, 3f
7818c2ecf20Sopenharmony_ci	 sub	%o2, %o5, %o2
7828c2ecf20Sopenharmony_ci1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
7838c2ecf20Sopenharmony_ci	add	%o1, 0x08, %o1
7848c2ecf20Sopenharmony_ci	add	%o0, 0x08, %o0
7858c2ecf20Sopenharmony_ci	subcc	%o5, 0x08, %o5
7868c2ecf20Sopenharmony_ci	bne,pt	%xcc, 1b
7878c2ecf20Sopenharmony_ci	 EX_ST(STORE(stx, %o3, %o0 - 0x08), memcpy_retl_o2_plus_o5_plus_8)
7888c2ecf20Sopenharmony_ci3:	brz,pt	%o2, .Lexit_cp
7898c2ecf20Sopenharmony_ci	 cmp	%o2, 0x04
7908c2ecf20Sopenharmony_ci	bl,pn	%xcc, .Ltiny_cp
7918c2ecf20Sopenharmony_ci	 nop
7928c2ecf20Sopenharmony_ci	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2)
7938c2ecf20Sopenharmony_ci	add	%o1, 0x04, %o1
7948c2ecf20Sopenharmony_ci	add	%o0, 0x04, %o0
7958c2ecf20Sopenharmony_ci	subcc	%o2, 0x04, %o2
7968c2ecf20Sopenharmony_ci	bne,pn	%xcc, .Ltiny_cp
7978c2ecf20Sopenharmony_ci	 EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_4)
7988c2ecf20Sopenharmony_ci	ba,a,pt	%xcc, .Lexit_cp
7998c2ecf20Sopenharmony_ci
8008c2ecf20Sopenharmony_ci.Lmedium_unaligned_cp:
	! Integer-only copy for pointers that are not both 8-byte aligned.
	!   first 1:  byte loop until dst is 8-byte aligned (%o3 = bytes to go,
	!             already taken out of %o2 by the delay slot below)
	!   2:        if src is now aligned too, use .Lmedium_noprefetch_cp
	!   second 1: shift-and-merge loop: %o3 = src misalignment in bits,
	!             %g2 = 64 - %o3, %o4 = bits carried over from the previous
	!             aligned source doubleword
	! Whatever is left in %o2 afterwards goes to .Lsmall_unaligned_cp.
8018c2ecf20Sopenharmony_ci	/* First get dest 8 byte aligned.  */
8028c2ecf20Sopenharmony_ci	sub	%g0, %o0, %o3
8038c2ecf20Sopenharmony_ci	and	%o3, 0x7, %o3
8048c2ecf20Sopenharmony_ci	brz,pt	%o3, 2f
8058c2ecf20Sopenharmony_ci	 sub	%o2, %o3, %o2
8068c2ecf20Sopenharmony_ci
	! %o3, not %g1, is the counter of this loop (%g1 holds the return
	! value, see EX_RETVAL(%g1) at .Lexit_cp).  Both the load and the store
	! run before %o3 is decremented, so %o2 + %o3 is the number of bytes
	! not yet copied for either fault.
8078c2ecf20Sopenharmony_ci1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_o3)
8088c2ecf20Sopenharmony_ci	add	%o1, 1, %o1
8098c2ecf20Sopenharmony_ci	EX_ST(STORE(stb, %g2, %o0 + 0x00), memcpy_retl_o2_plus_o3)
8108c2ecf20Sopenharmony_ci	subcc	%o3, 1, %o3
8118c2ecf20Sopenharmony_ci	bne,pt	%xcc, 1b
8128c2ecf20Sopenharmony_ci	 add	%o0, 1, %o0
8138c2ecf20Sopenharmony_ci2:
8148c2ecf20Sopenharmony_ci	and	%o1, 0x7, %o3
8158c2ecf20Sopenharmony_ci	brz,pn	%o3, .Lmedium_noprefetch_cp
8168c2ecf20Sopenharmony_ci	 sll	%o3, 3, %o3
8178c2ecf20Sopenharmony_ci	mov	64, %g2
8188c2ecf20Sopenharmony_ci	sub	%g2, %o3, %g2
8198c2ecf20Sopenharmony_ci	andn	%o1, 0x7, %o1
8208c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldx, %o1 + 0x00, %o4), memcpy_retl_o2)
8218c2ecf20Sopenharmony_ci	sllx	%o4, %o3, %o4
8228c2ecf20Sopenharmony_ci	andn	%o2, 0x08 - 1, %o5
8238c2ecf20Sopenharmony_ci	sub	%o2, %o5, %o2
8248c2ecf20Sopenharmony_ci
8258c2ecf20Sopenharmony_ci1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3), memcpy_retl_o2_plus_o5)
8268c2ecf20Sopenharmony_ci	add	%o1, 0x08, %o1
8278c2ecf20Sopenharmony_ci	subcc	%o5, 0x08, %o5
8288c2ecf20Sopenharmony_ci	srlx	%g3, %g2, %g7
8298c2ecf20Sopenharmony_ci	or	%g7, %o4, %g7
8308c2ecf20Sopenharmony_ci	EX_ST(STORE(stx, %g7, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_8)
8318c2ecf20Sopenharmony_ci	add	%o0, 0x08, %o0
8328c2ecf20Sopenharmony_ci	bne,pt	%xcc, 1b
8338c2ecf20Sopenharmony_ci	 sllx	%g3, %o3, %o4
8348c2ecf20Sopenharmony_ci	srl	%o3, 3, %o3		! misalignment back from bits to bytes
8358c2ecf20Sopenharmony_ci	add	%o1, %o3, %o1		! turn %o1 back into the true (unaligned) src
8368c2ecf20Sopenharmony_ci	brz,pn	%o2, .Lexit_cp
8378c2ecf20Sopenharmony_ci	 nop
8388c2ecf20Sopenharmony_ci	ba,a,pt	%xcc, .Lsmall_unaligned_cp	! annulled: no delay-slot instruction follows
8398c2ecf20Sopenharmony_ci
8408c2ecf20Sopenharmony_ci.Ltiny_cp:
	! Unrolled copy of the final 1, 2 or 3 bytes counted in %o2 (must be
	! nonzero on entry).  %o0 and %o1 are not advanced; each store sits in
	! the delay slot of the branch that ends the sequence.
8418c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
8428c2ecf20Sopenharmony_ci	subcc	%o2, 1, %o2
8438c2ecf20Sopenharmony_ci	be,pn	%xcc, .Lexit_cp
8448c2ecf20Sopenharmony_ci	 EX_ST(STORE(stb, %o3, %o0 + 0x00), memcpy_retl_o2_plus_1)
8458c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldub, %o1 + 0x01, %o3), memcpy_retl_o2)
8468c2ecf20Sopenharmony_ci	subcc	%o2, 1, %o2
8478c2ecf20Sopenharmony_ci	be,pn	%xcc, .Lexit_cp
8488c2ecf20Sopenharmony_ci	 EX_ST(STORE(stb, %o3, %o0 + 0x01), memcpy_retl_o2_plus_1)
8498c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldub, %o1 + 0x02, %o3), memcpy_retl_o2)
8508c2ecf20Sopenharmony_ci	ba,pt	%xcc, .Lexit_cp
8518c2ecf20Sopenharmony_ci	 EX_ST(STORE(stb, %o3, %o0 + 0x02), memcpy_retl_o2)
8528c2ecf20Sopenharmony_ci
8538c2ecf20Sopenharmony_ci.Lsmall_cp:
	! Small integer-only copy.  Expects %g2 = dst | src: if either pointer
	! is not 4-byte aligned use the byte loop in .Lsmall_unaligned_cp,
	! otherwise copy word by word and finish in .Ltiny_cp.
	! NOTE(review): the word loop decrements %o5 before testing it, so it
	! presumably relies on %o2 being at least 4 on entry - confirm against
	! the callers, none of which are visible in this part of the file.
8548c2ecf20Sopenharmony_ci	andcc	%g2, 0x3, %g0
8558c2ecf20Sopenharmony_ci	bne,pn	%xcc, .Lsmall_unaligned_cp
8568c2ecf20Sopenharmony_ci	 andn	%o2, 0x4 - 1, %o5
8578c2ecf20Sopenharmony_ci	sub	%o2, %o5, %o2
8588c2ecf20Sopenharmony_ci1:
8598c2ecf20Sopenharmony_ci	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
8608c2ecf20Sopenharmony_ci	add	%o1, 0x04, %o1
8618c2ecf20Sopenharmony_ci	subcc	%o5, 0x04, %o5
8628c2ecf20Sopenharmony_ci	add	%o0, 0x04, %o0
8638c2ecf20Sopenharmony_ci	bne,pt	%xcc, 1b
8648c2ecf20Sopenharmony_ci	 EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_o5_plus_4)
8658c2ecf20Sopenharmony_ci	brz,pt	%o2, .Lexit_cp
8668c2ecf20Sopenharmony_ci	 nop
8678c2ecf20Sopenharmony_ci	ba,a,pt	%xcc, .Ltiny_cp
8688c2ecf20Sopenharmony_ci
8698c2ecf20Sopenharmony_ci.Lsmall_unaligned_cp:
	! Plain byte loop: copies %o2 bytes one at a time, then exits.  %o2 is
	! decremented before it is tested, so it must be nonzero on entry (the
	! visible branch from .Lmedium_unaligned_cp checks this first).
8708c2ecf20Sopenharmony_ci1:	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
8718c2ecf20Sopenharmony_ci	add	%o1, 1, %o1
8728c2ecf20Sopenharmony_ci	add	%o0, 1, %o0
8738c2ecf20Sopenharmony_ci	subcc	%o2, 1, %o2
8748c2ecf20Sopenharmony_ci	bne,pt	%xcc, 1b
8758c2ecf20Sopenharmony_ci	 EX_ST(STORE(stb, %o3, %o0 - 0x01), memcpy_retl_o2_plus_1)
8768c2ecf20Sopenharmony_ci	ba,a,pt	%xcc, .Lexit_cp
8778c2ecf20Sopenharmony_ci
8788c2ecf20Sopenharmony_ci.Lsmallrest:
	! Byte-wise copy of the %o2 bytes that remain: groups of four in
	! .Lsmallnotalign4, then 1 to 3 leftover bytes in .Lsmallleft3.
	! Returns through .Lsmallx with %o0 = EX_RETVAL(%g1).
8798c2ecf20Sopenharmony_ci	tst	%o2
8808c2ecf20Sopenharmony_ci	bz,pt	%xcc, .Lsmallx
8818c2ecf20Sopenharmony_ci	 cmp	%o2, 4
8828c2ecf20Sopenharmony_ci	blt,pn	%xcc, .Lsmallleft3
8838c2ecf20Sopenharmony_ci	 nop
8848c2ecf20Sopenharmony_ci	sub	%o2, 3, %o2		! bias count by -3 so bgu below means 4 or more remain
8858c2ecf20Sopenharmony_ci.Lsmallnotalign4:
8868c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_3)! read byte
8878c2ecf20Sopenharmony_ci	subcc	%o2, 4, %o2		! reduce count by 4
8888c2ecf20Sopenharmony_ci	EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_7)! write byte & repeat
8898c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2_plus_6)! for total of 4
8908c2ecf20Sopenharmony_ci	add	%o1, 4, %o1		! advance SRC by 4
8918c2ecf20Sopenharmony_ci	EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_6)
8928c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldub, %o1-2, %o3), memcpy_retl_o2_plus_5)
8938c2ecf20Sopenharmony_ci	add	%o0, 4, %o0		! advance DST by 4
8948c2ecf20Sopenharmony_ci	EX_ST(STORE(stb, %o3, %o0-2), memcpy_retl_o2_plus_5)
8958c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldub, %o1-1, %o3), memcpy_retl_o2_plus_4)
8968c2ecf20Sopenharmony_ci	bgu,pt	%xcc, .Lsmallnotalign4	! loop til 3 or fewer bytes remain
8978c2ecf20Sopenharmony_ci	EX_ST(STORE(stb, %o3, %o0-1), memcpy_retl_o2_plus_4)	! delay slot: fourth byte
8988c2ecf20Sopenharmony_ci	addcc	%o2, 3, %o2		! restore count
8998c2ecf20Sopenharmony_ci	bz,pt	%xcc, .Lsmallx		! delay slot is the subcc at .Lsmallleft3
9008c2ecf20Sopenharmony_ci.Lsmallleft3:				! 1, 2, or 3 bytes remain
9018c2ecf20Sopenharmony_ci	subcc	%o2, 1, %o2
9028c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_1)	! load one byte
9038c2ecf20Sopenharmony_ci	bz,pt	%xcc, .Lsmallx
9048c2ecf20Sopenharmony_ci	EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_1)	! store one byte
9058c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2)	! load second byte
9068c2ecf20Sopenharmony_ci	subcc	%o2, 1, %o2
9078c2ecf20Sopenharmony_ci	bz,pt	%xcc, .Lsmallx
9088c2ecf20Sopenharmony_ci	EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_1)! store second byte
9098c2ecf20Sopenharmony_ci	EX_LD(LOAD(ldub, %o1+2, %o3), memcpy_retl_o2)	! load third byte
9108c2ecf20Sopenharmony_ci	EX_ST(STORE(stb, %o3, %o0+2), memcpy_retl_o2)	! store third byte
9118c2ecf20Sopenharmony_ci.Lsmallx:
9128c2ecf20Sopenharmony_ci	retl
9138c2ecf20Sopenharmony_ci	 mov	EX_RETVAL(%g1), %o0	! delay slot: return value built from %g1
9148c2ecf20Sopenharmony_ci.Lsmallfin:
	! Finish: with bytes still counted in %o2 go to .Lsmallleft3,
	! otherwise return with %o0 = EX_RETVAL(%g1).
9158c2ecf20Sopenharmony_ci	tst	%o2
9168c2ecf20Sopenharmony_ci	bnz,pn	%xcc, .Lsmallleft3
9178c2ecf20Sopenharmony_ci	 nop
9188c2ecf20Sopenharmony_ci	retl
9198c2ecf20Sopenharmony_ci	 mov	EX_RETVAL(%g1), %o0	! restore %o0
9208c2ecf20Sopenharmony_ci.Lexit_cp:
	! Common exit of the integer-only (_cp) paths: return with
	! %o0 = EX_RETVAL(%g1).
9218c2ecf20Sopenharmony_ci	retl
9228c2ecf20Sopenharmony_ci	 mov	EX_RETVAL(%g1), %o0	! delay slot: set the return value
9238c2ecf20Sopenharmony_ci	.size  FUNC_NAME, .-FUNC_NAME
924