18c2ecf20Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0 */
28c2ecf20Sopenharmony_ci/*
38c2ecf20Sopenharmony_ci * Itanium 2-optimized version of memcpy and copy_user function
48c2ecf20Sopenharmony_ci *
58c2ecf20Sopenharmony_ci * Inputs:
68c2ecf20Sopenharmony_ci * 	in0:	destination address
78c2ecf20Sopenharmony_ci *	in1:	source address
88c2ecf20Sopenharmony_ci *	in2:	number of bytes to copy
98c2ecf20Sopenharmony_ci * Output:
108c2ecf20Sopenharmony_ci *	for memcpy:    return dest
118c2ecf20Sopenharmony_ci * 	for copy_user: return 0 if success,
128c2ecf20Sopenharmony_ci *		       or number of bytes NOT copied if an error occurred.
138c2ecf20Sopenharmony_ci *
148c2ecf20Sopenharmony_ci * Copyright (C) 2002 Intel Corp.
158c2ecf20Sopenharmony_ci * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
168c2ecf20Sopenharmony_ci */
178c2ecf20Sopenharmony_ci#include <asm/asmmacro.h>
188c2ecf20Sopenharmony_ci#include <asm/page.h>
198c2ecf20Sopenharmony_ci#include <asm/export.h>
208c2ecf20Sopenharmony_ci
/* EK() takes the same arguments as EX() and expands to exactly EX(...). */
218c2ecf20Sopenharmony_ci#define EK(y...) EX(y)
228c2ecf20Sopenharmony_ci
238c2ecf20Sopenharmony_ci/* McKinley specific optimization */
248c2ecf20Sopenharmony_ci
/*
 * Return value plus the registers that keep the caller's ar.pfs/ar.lc/pr and
 * the original dest/src/len; the fault handlers at the end of the file read
 * the saved_* values back.
 */
258c2ecf20Sopenharmony_ci#define retval		r8
268c2ecf20Sopenharmony_ci#define saved_pfs	r31
278c2ecf20Sopenharmony_ci#define saved_lc	r10
288c2ecf20Sopenharmony_ci#define saved_pr	r11
298c2ecf20Sopenharmony_ci#define saved_in0	r14
308c2ecf20Sopenharmony_ci#define saved_in1	r15
318c2ecf20Sopenharmony_ci#define saved_in2	r16
328c2ecf20Sopenharmony_ci
/* Working pointers: two source and two destination streams, plus a counter. */
338c2ecf20Sopenharmony_ci#define src0		r2
348c2ecf20Sopenharmony_ci#define src1		r3
358c2ecf20Sopenharmony_ci#define dst0		r17
368c2ecf20Sopenharmony_ci#define dst1		r18
378c2ecf20Sopenharmony_ci#define cnt		r9
388c2ecf20Sopenharmony_ci
398c2ecf20Sopenharmony_ci/* r19-r30 are temp for each code section */
408c2ecf20Sopenharmony_ci#define PREFETCH_DIST	8
418c2ecf20Sopenharmony_ci#define src_pre_mem	r19
428c2ecf20Sopenharmony_ci#define dst_pre_mem	r20
438c2ecf20Sopenharmony_ci#define src_pre_l2	r21
448c2ecf20Sopenharmony_ci#define dst_pre_l2	r22
/*
 * t1..t15 name only seven distinct registers (r23-r26, r28-r30 are shared out
 * below); every name marked "alias!" reuses the register of an earlier temp,
 * so two aliased temps must never be live at the same time.
 */
458c2ecf20Sopenharmony_ci#define t1		r23
468c2ecf20Sopenharmony_ci#define t2		r24
478c2ecf20Sopenharmony_ci#define t3		r25
488c2ecf20Sopenharmony_ci#define t4		r26
498c2ecf20Sopenharmony_ci#define t5		t1	// alias!
508c2ecf20Sopenharmony_ci#define t6		t2	// alias!
518c2ecf20Sopenharmony_ci#define t7		t3	// alias!
528c2ecf20Sopenharmony_ci#define n8		r27
538c2ecf20Sopenharmony_ci#define t9		t5	// alias!
548c2ecf20Sopenharmony_ci#define t10		t4	// alias!
558c2ecf20Sopenharmony_ci#define t11		t7	// alias!
568c2ecf20Sopenharmony_ci#define t12		t6	// alias!
578c2ecf20Sopenharmony_ci#define t14		t10	// alias!
588c2ecf20Sopenharmony_ci#define t13		r28
598c2ecf20Sopenharmony_ci#define t15		r29
608c2ecf20Sopenharmony_ci#define tmp		r30
618c2ecf20Sopenharmony_ci
628c2ecf20Sopenharmony_ci/* defines for long_copy block */
/*
 * A..D are pipeline-stage indices used as p[A]..p[D] / v[A], v[B] in
 * .long_copy: A=0, B=8, C=16, D=17.  N (=18) is the number of stages loaded
 * into ar.ec, and Nrot (=24) is N rounded up to a multiple of 8, used as the
 * rotating-register count in the alloc at .aligned_src.
 */
638c2ecf20Sopenharmony_ci#define	A	0
648c2ecf20Sopenharmony_ci#define B	(PREFETCH_DIST)
658c2ecf20Sopenharmony_ci#define C	(B + PREFETCH_DIST)
668c2ecf20Sopenharmony_ci#define D	(C + 1)
678c2ecf20Sopenharmony_ci#define N	(D + 1)
688c2ecf20Sopenharmony_ci#define Nrot	((N + 7) & ~7)
698c2ecf20Sopenharmony_ci
708c2ecf20Sopenharmony_ci/* alias */
/* in0/in1/in2 are pinned to the physical names r32/r33/r34 for this file. */
718c2ecf20Sopenharmony_ci#define in0		r32
728c2ecf20Sopenharmony_ci#define in1		r33
738c2ecf20Sopenharmony_ci#define in2		r34
748c2ecf20Sopenharmony_ci
// memcpy(in0 = dest, in1 = src, in2 = len): thin entry stub that shares the
// body of __copy_user.  It computes the low three address bits of dest (r28)
// and src (r29), sets f6 = f0 so the fault handler can recognise a memcpy
// call (see the fcmp.eq against f0 in .ex_handler_short), preloads the return
// value with dest, and jumps to .common_code.
758c2ecf20Sopenharmony_ciGLOBAL_ENTRY(memcpy)
768c2ecf20Sopenharmony_ci	and	r28=0x7,in0
778c2ecf20Sopenharmony_ci	and	r29=0x7,in1
788c2ecf20Sopenharmony_ci	mov	f6=f0
798c2ecf20Sopenharmony_ci	mov	retval=in0
808c2ecf20Sopenharmony_ci	br.cond.sptk .common_code
818c2ecf20Sopenharmony_ci	;;
828c2ecf20Sopenharmony_ciEND(memcpy)
838c2ecf20Sopenharmony_ciEXPORT_SYMBOL(memcpy)
// __copy_user(in0 = dest, in1 = src, in2 = len): returns 0 in r8 on success.
// Entry: r28 = dest & 7, r29 = src & 7, f6 = f1 (marks "not memcpy" for the
// fault handler), original dest/src saved in saved_in0/saved_in1, r8 = 0.
848c2ecf20Sopenharmony_ciGLOBAL_ENTRY(__copy_user)
858c2ecf20Sopenharmony_ci	.prologue
868c2ecf20Sopenharmony_ci// check dest alignment
878c2ecf20Sopenharmony_ci	and	r28=0x7,in0
888c2ecf20Sopenharmony_ci	and	r29=0x7,in1
898c2ecf20Sopenharmony_ci	mov	f6=f1
908c2ecf20Sopenharmony_ci	mov	saved_in0=in0	// save dest pointer
918c2ecf20Sopenharmony_ci	mov	saved_in1=in1	// save src pointer
928c2ecf20Sopenharmony_ci	mov	retval=r0	// initialize return value
938c2ecf20Sopenharmony_ci	;;
// Shared dispatch for memcpy and __copy_user:
//   p15 = (len < 8)        -> .memcpy_short
//   p13 = (dest & 7) != 0  -> .align_dest (r30 = 8 - (dest & 7) bytes to crawl)
//   p14 = (src  & 7) != 0  -> .unaligned_src
//   otherwise fall through to .aligned_src.
// The three branches sit in one instruction group, so the first taken one in
// program order wins: short, then align_dest, then unaligned_src.
948c2ecf20Sopenharmony_ci.common_code:
958c2ecf20Sopenharmony_ci	cmp.gt	p15,p0=8,in2	// check for small size
968c2ecf20Sopenharmony_ci	cmp.ne	p13,p0=0,r28	// check dest alignment
978c2ecf20Sopenharmony_ci	cmp.ne	p14,p0=0,r29	// check src alignment
988c2ecf20Sopenharmony_ci	add	src0=0,in1
998c2ecf20Sopenharmony_ci	sub	r30=8,r28	// for .align_dest
1008c2ecf20Sopenharmony_ci	mov	saved_in2=in2	// save len
1018c2ecf20Sopenharmony_ci	;;
1028c2ecf20Sopenharmony_ci	add	dst0=0,in0
1038c2ecf20Sopenharmony_ci	add	dst1=1,in0	// dest odd index
1048c2ecf20Sopenharmony_ci	cmp.le	p6,p0 = 1,r30	// for .align_dest
1058c2ecf20Sopenharmony_ci(p15)	br.cond.dpnt .memcpy_short
1068c2ecf20Sopenharmony_ci(p13)	br.cond.dpnt .align_dest
1078c2ecf20Sopenharmony_ci(p14)	br.cond.dpnt .unaligned_src
1088c2ecf20Sopenharmony_ci	;;
1098c2ecf20Sopenharmony_ci
1108c2ecf20Sopenharmony_ci// both dest and src are aligned on 8-byte boundary
// Allocates a frame with Nrot rotating registers, saves pr and ar.lc, and
// computes cnt = len >> 7 (whole 128-byte lines).
//   p6 = (cnt > 2*PREFETCH_DIST): take the .long_copy path instead.
//   p7/p8 = (cnt > 1) / (cnt <= 1): ar.lc = cnt - 1, or 0, so the .prefetch
//   loop below runs max(cnt, 1) times.
1118c2ecf20Sopenharmony_ci.aligned_src:
1128c2ecf20Sopenharmony_ci	.save ar.pfs, saved_pfs
1138c2ecf20Sopenharmony_ci	alloc	saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
1148c2ecf20Sopenharmony_ci	.save pr, saved_pr
1158c2ecf20Sopenharmony_ci	mov	saved_pr=pr
1168c2ecf20Sopenharmony_ci
1178c2ecf20Sopenharmony_ci	shr.u	cnt=in2,7	// this much cache line
1188c2ecf20Sopenharmony_ci	;;
1198c2ecf20Sopenharmony_ci	cmp.lt	p6,p0=2*PREFETCH_DIST,cnt
1208c2ecf20Sopenharmony_ci	cmp.lt	p7,p8=1,cnt
1218c2ecf20Sopenharmony_ci	.save ar.lc, saved_lc
1228c2ecf20Sopenharmony_ci	mov	saved_lc=ar.lc
1238c2ecf20Sopenharmony_ci	.body
1248c2ecf20Sopenharmony_ci	add	cnt=-1,cnt
1258c2ecf20Sopenharmony_ci	add	src_pre_mem=0,in1	// prefetch src pointer
1268c2ecf20Sopenharmony_ci	add	dst_pre_mem=0,in0	// prefetch dest pointer
1278c2ecf20Sopenharmony_ci	;;
1288c2ecf20Sopenharmony_ci(p7)	mov	ar.lc=cnt	// prefetch count
1298c2ecf20Sopenharmony_ci(p8)	mov	ar.lc=r0
1308c2ecf20Sopenharmony_ci(p6)	br.cond.dpnt .long_copy
1318c2ecf20Sopenharmony_ci	;;
1328c2ecf20Sopenharmony_ci
// Touch one address per 128 bytes of source and destination (destination
// with the .excl hint) before the copy loop starts.
1338c2ecf20Sopenharmony_ci.prefetch:
1348c2ecf20Sopenharmony_ci	lfetch.fault	  [src_pre_mem], 128
1358c2ecf20Sopenharmony_ci	lfetch.fault.excl [dst_pre_mem], 128
1368c2ecf20Sopenharmony_ci	br.cloop.dptk.few .prefetch
1378c2ecf20Sopenharmony_ci	;;
1388c2ecf20Sopenharmony_ci
// Aligned copy in 32-byte steps.  r29 = len >> 5 iterations, tmp = len & 31
// is left for .aligned_src_tail.  src0/dst0 handle offsets 0 and 16 of each
// 32-byte chunk, src1/dst1 (started 8 bytes further) handle offsets 8 and 24.
//   p10 = (r29 == 0): skip the loop entirely.
//   p6 / p7 = (tmp >= 8) / (tmp >= 16): consumed by the tail code.
1398c2ecf20Sopenharmony_ci.medium_copy:
1408c2ecf20Sopenharmony_ci	and	tmp=31,in2	// copy length after iteration
1418c2ecf20Sopenharmony_ci	shr.u	r29=in2,5	// number of 32-byte iteration
1428c2ecf20Sopenharmony_ci	add	dst1=8,dst0	// 2nd dest pointer
1438c2ecf20Sopenharmony_ci	;;
1448c2ecf20Sopenharmony_ci	add	cnt=-1,r29	// ctop iteration adjustment
1458c2ecf20Sopenharmony_ci	cmp.eq	p10,p0=r29,r0	// do we really need to loop?
1468c2ecf20Sopenharmony_ci	add	src1=8,src0	// 2nd src pointer
1478c2ecf20Sopenharmony_ci	cmp.le	p6,p0=8,tmp
1488c2ecf20Sopenharmony_ci	;;
1498c2ecf20Sopenharmony_ci	cmp.le	p7,p0=16,tmp
1508c2ecf20Sopenharmony_ci	mov	ar.lc=cnt	// loop setup
1518c2ecf20Sopenharmony_ci	cmp.eq	p16,p17 = r0,r0
1528c2ecf20Sopenharmony_ci	mov	ar.ec=2
1538c2ecf20Sopenharmony_ci(p10)	br.dpnt.few .aligned_src_tail
1548c2ecf20Sopenharmony_ci	;;
1558c2ecf20Sopenharmony_ci	TEXT_ALIGN(32)
// Two-stage rotating loop (ar.ec = 2).  Stage p16 loads four words; the words
// loaded into r34/r38 in the first group are stored in the second group of
// the same iteration, while the words loaded into r32/r36 are stored one
// rotation later under p17, when they are visible as r33/r37.
1568c2ecf20Sopenharmony_ci1:
1578c2ecf20Sopenharmony_ciEX(.ex_handler, (p16)	ld8	r34=[src0],16)
1588c2ecf20Sopenharmony_ciEK(.ex_handler, (p16)	ld8	r38=[src1],16)
1598c2ecf20Sopenharmony_ciEX(.ex_handler, (p17)	st8	[dst0]=r33,16)
1608c2ecf20Sopenharmony_ciEK(.ex_handler, (p17)	st8	[dst1]=r37,16)
1618c2ecf20Sopenharmony_ci	;;
1628c2ecf20Sopenharmony_ciEX(.ex_handler, (p16)	ld8	r32=[src0],16)
1638c2ecf20Sopenharmony_ciEK(.ex_handler, (p16)	ld8	r36=[src1],16)
1648c2ecf20Sopenharmony_ciEX(.ex_handler, (p16)	st8	[dst0]=r34,16)
1658c2ecf20Sopenharmony_ciEK(.ex_handler, (p16)	st8	[dst1]=r38,16)
1668c2ecf20Sopenharmony_ci	br.ctop.dptk.few 1b
1678c2ecf20Sopenharmony_ci	;;
1688c2ecf20Sopenharmony_ci
// Copies up to three more 8-byte words (p6: tmp >= 8, p7: tmp >= 16,
// p8: tmp >= 24), restores ar.lc, ar.pfs and pr, advances src0/dst0 by
// (tmp & -8), leaves the last tmp & 7 bytes in in2 and hands them to
// .memcpy_short.  The accesses through src1/dst1 use the .ex_hndlr_s /
// .ex_hndlr_d fix-up labels, which add 8 to src0 / dst0 before entering the
// common fault handler.
1698c2ecf20Sopenharmony_ci.aligned_src_tail:
1708c2ecf20Sopenharmony_ciEX(.ex_handler, (p6)	ld8	t1=[src0])
1718c2ecf20Sopenharmony_ci	mov	ar.lc=saved_lc
1728c2ecf20Sopenharmony_ci	mov	ar.pfs=saved_pfs
1738c2ecf20Sopenharmony_ciEX(.ex_hndlr_s, (p7)	ld8	t2=[src1],8)
1748c2ecf20Sopenharmony_ci	cmp.le	p8,p0=24,tmp
1758c2ecf20Sopenharmony_ci	and	r21=-8,tmp
1768c2ecf20Sopenharmony_ci	;;
1778c2ecf20Sopenharmony_ciEX(.ex_hndlr_s, (p8)	ld8	t3=[src1])
1788c2ecf20Sopenharmony_ciEX(.ex_handler, (p6)	st8	[dst0]=t1)	// store byte 1
1798c2ecf20Sopenharmony_ci	and	in2=7,tmp	// remaining length
1808c2ecf20Sopenharmony_ciEX(.ex_hndlr_d, (p7)	st8	[dst1]=t2,8)	// store byte 2
1818c2ecf20Sopenharmony_ci	add	src0=src0,r21	// setting up src pointer
1828c2ecf20Sopenharmony_ci	add	dst0=dst0,r21	// setting up dest pointer
1838c2ecf20Sopenharmony_ci	;;
1848c2ecf20Sopenharmony_ciEX(.ex_handler, (p8)	st8	[dst1]=t3)	// store byte 3
1858c2ecf20Sopenharmony_ci	mov	pr=saved_pr,-1
1868c2ecf20Sopenharmony_ci	br.dptk.many .memcpy_short
1878c2ecf20Sopenharmony_ci	;;
1888c2ecf20Sopenharmony_ci
1898c2ecf20Sopenharmony_ci/* code taken from copy_page_mck */
// Large aligned copy, entered from .aligned_src when len >> 7 exceeds
// 2*PREFETCH_DIST.  Each .line_copy iteration advances every pointer by 128
// bytes, i.e. moves one 128-byte line:
//   - word 0 of a line goes through src_pre_mem/dst_pre_mem; the load (p[A])
//     runs PREFETCH_DIST iterations ahead of the matching store (p[B]);
//   - word 8 (offset 64) goes through src_pre_l2/dst_pre_l2 via n8, loaded
//     under p[C] and stored under p[D];
//   - the remaining 14 words go through src0/src1 and dst0/dst1 (t1..t15).
// tmp = len & 127 is what is left for .medium_copy afterwards.
1908c2ecf20Sopenharmony_ci.long_copy:
1918c2ecf20Sopenharmony_ci	.rotr v[2*PREFETCH_DIST]
1928c2ecf20Sopenharmony_ci	.rotp p[N]
1938c2ecf20Sopenharmony_ci
1948c2ecf20Sopenharmony_ci	mov src_pre_mem = src0
1958c2ecf20Sopenharmony_ci	mov pr.rot = 0x10000
1968c2ecf20Sopenharmony_ci	mov ar.ec = 1				// special unrolled loop
1978c2ecf20Sopenharmony_ci
1988c2ecf20Sopenharmony_ci	mov dst_pre_mem = dst0
1998c2ecf20Sopenharmony_ci
2008c2ecf20Sopenharmony_ci	add src_pre_l2 = 8*8, src0
2018c2ecf20Sopenharmony_ci	add dst_pre_l2 = 8*8, dst0
2028c2ecf20Sopenharmony_ci	;;
2038c2ecf20Sopenharmony_ci	add src0 = 8, src_pre_mem		// first t1 src
2048c2ecf20Sopenharmony_ci	mov ar.lc = 2*PREFETCH_DIST - 1
2058c2ecf20Sopenharmony_ci	shr.u cnt=in2,7				// number of lines
2068c2ecf20Sopenharmony_ci	add src1 = 3*8, src_pre_mem		// first t3 src
2078c2ecf20Sopenharmony_ci	add dst0 = 8, dst_pre_mem		// first t1 dst
2088c2ecf20Sopenharmony_ci	add dst1 = 3*8, dst_pre_mem		// first t3 dst
2098c2ecf20Sopenharmony_ci	;;
2108c2ecf20Sopenharmony_ci	and tmp=127,in2				// remaining bytes after this block
2118c2ecf20Sopenharmony_ci	add cnt = -(2*PREFETCH_DIST) - 1, cnt
2128c2ecf20Sopenharmony_ci	// same as .line_copy loop, but with all predicated-off instructions removed:
	// Runs 2*PREFETCH_DIST times (ar.lc was set to 2*PREFETCH_DIST - 1 above).
2138c2ecf20Sopenharmony_ci.prefetch_loop:
2148c2ecf20Sopenharmony_ciEX(.ex_hndlr_lcpy_1, (p[A])	ld8 v[A] = [src_pre_mem], 128)		// M0
2158c2ecf20Sopenharmony_ciEK(.ex_hndlr_lcpy_1, (p[B])	st8 [dst_pre_mem] = v[B], 128)		// M2
2168c2ecf20Sopenharmony_ci	br.ctop.sptk .prefetch_loop
2178c2ecf20Sopenharmony_ci	;;
2188c2ecf20Sopenharmony_ci	cmp.eq p16, p0 = r0, r0			// reset p16 to 1
2198c2ecf20Sopenharmony_ci	mov ar.lc = cnt
2208c2ecf20Sopenharmony_ci	mov ar.ec = N				// # of stages in pipeline
2218c2ecf20Sopenharmony_ci	;;
// Main loop: eight instruction groups per iteration, each with two loads and
// two stores.  Several t-names share a register (see the "alias!" defines),
// so the exact interleaving of loads and stores below must be preserved.
2228c2ecf20Sopenharmony_ci.line_copy:
2238c2ecf20Sopenharmony_ciEX(.ex_handler,	(p[D])	ld8 t2 = [src0], 3*8)			// M0
2248c2ecf20Sopenharmony_ciEK(.ex_handler,	(p[D])	ld8 t4 = [src1], 3*8)			// M1
2258c2ecf20Sopenharmony_ciEX(.ex_handler_lcpy,	(p[B])	st8 [dst_pre_mem] = v[B], 128)		// M2 prefetch dst from memory
2268c2ecf20Sopenharmony_ciEK(.ex_handler_lcpy,	(p[D])	st8 [dst_pre_l2] = n8, 128)		// M3 prefetch dst from L2
2278c2ecf20Sopenharmony_ci	;;
2288c2ecf20Sopenharmony_ciEX(.ex_handler_lcpy,	(p[A])	ld8 v[A] = [src_pre_mem], 128)		// M0 prefetch src from memory
2298c2ecf20Sopenharmony_ciEK(.ex_handler_lcpy,	(p[C])	ld8 n8 = [src_pre_l2], 128)		// M1 prefetch src from L2
2308c2ecf20Sopenharmony_ciEX(.ex_handler,	(p[D])	st8 [dst0] =  t1, 8)			// M2
2318c2ecf20Sopenharmony_ciEK(.ex_handler,	(p[D])	st8 [dst1] =  t3, 8)			// M3
2328c2ecf20Sopenharmony_ci	;;
2338c2ecf20Sopenharmony_ciEX(.ex_handler,	(p[D])	ld8  t5 = [src0], 8)
2348c2ecf20Sopenharmony_ciEK(.ex_handler,	(p[D])	ld8  t7 = [src1], 3*8)
2358c2ecf20Sopenharmony_ciEX(.ex_handler,	(p[D])	st8 [dst0] =  t2, 3*8)
2368c2ecf20Sopenharmony_ciEK(.ex_handler,	(p[D])	st8 [dst1] =  t4, 3*8)
2378c2ecf20Sopenharmony_ci	;;
2388c2ecf20Sopenharmony_ciEX(.ex_handler,	(p[D])	ld8  t6 = [src0], 3*8)
2398c2ecf20Sopenharmony_ciEK(.ex_handler,	(p[D])	ld8 t10 = [src1], 8)
2408c2ecf20Sopenharmony_ciEX(.ex_handler,	(p[D])	st8 [dst0] =  t5, 8)
2418c2ecf20Sopenharmony_ciEK(.ex_handler,	(p[D])	st8 [dst1] =  t7, 3*8)
2428c2ecf20Sopenharmony_ci	;;
2438c2ecf20Sopenharmony_ciEX(.ex_handler,	(p[D])	ld8  t9 = [src0], 3*8)
2448c2ecf20Sopenharmony_ciEK(.ex_handler,	(p[D])	ld8 t11 = [src1], 3*8)
2458c2ecf20Sopenharmony_ciEX(.ex_handler,	(p[D])	st8 [dst0] =  t6, 3*8)
2468c2ecf20Sopenharmony_ciEK(.ex_handler,	(p[D])	st8 [dst1] = t10, 8)
2478c2ecf20Sopenharmony_ci	;;
2488c2ecf20Sopenharmony_ciEX(.ex_handler,	(p[D])	ld8 t12 = [src0], 8)
2498c2ecf20Sopenharmony_ciEK(.ex_handler,	(p[D])	ld8 t14 = [src1], 8)
2508c2ecf20Sopenharmony_ciEX(.ex_handler,	(p[D])	st8 [dst0] =  t9, 3*8)
2518c2ecf20Sopenharmony_ciEK(.ex_handler,	(p[D])	st8 [dst1] = t11, 3*8)
2528c2ecf20Sopenharmony_ci	;;
2538c2ecf20Sopenharmony_ciEX(.ex_handler,	(p[D])	ld8 t13 = [src0], 4*8)
2548c2ecf20Sopenharmony_ciEK(.ex_handler,	(p[D])	ld8 t15 = [src1], 4*8)
2558c2ecf20Sopenharmony_ciEX(.ex_handler,	(p[D])	st8 [dst0] = t12, 8)
2568c2ecf20Sopenharmony_ciEK(.ex_handler,	(p[D])	st8 [dst1] = t14, 8)
2578c2ecf20Sopenharmony_ci	;;
2588c2ecf20Sopenharmony_ciEX(.ex_handler,	(p[C])	ld8  t1 = [src0], 8)
2598c2ecf20Sopenharmony_ciEK(.ex_handler,	(p[C])	ld8  t3 = [src1], 8)
2608c2ecf20Sopenharmony_ciEX(.ex_handler,	(p[D])	st8 [dst0] = t13, 4*8)
2618c2ecf20Sopenharmony_ciEK(.ex_handler,	(p[D])	st8 [dst1] = t15, 4*8)
2628c2ecf20Sopenharmony_ci	br.ctop.sptk .line_copy
2638c2ecf20Sopenharmony_ci	;;
2648c2ecf20Sopenharmony_ci
	// src0/dst0 were started 8 bytes past the line start at loop setup; take
	// that bias back out, then let .medium_copy finish the last tmp bytes.
2658c2ecf20Sopenharmony_ci	add dst0=-8,dst0
2668c2ecf20Sopenharmony_ci	add src0=-8,src0
2678c2ecf20Sopenharmony_ci	mov in2=tmp
2688c2ecf20Sopenharmony_ci	.restore sp
2698c2ecf20Sopenharmony_ci	br.sptk.many .medium_copy
2708c2ecf20Sopenharmony_ci	;;
2718c2ecf20Sopenharmony_ci
// The unaligned-source path works in chunks of at most BLOCK_SIZE (4096)
// bytes.  blocksize and curlen reuse r23/r24, i.e. the registers named t1/t2
// in the aligned path.
2728c2ecf20Sopenharmony_ci#define BLOCK_SIZE	128*32
2738c2ecf20Sopenharmony_ci#define blocksize	r23
2748c2ecf20Sopenharmony_ci#define curlen		r24
2758c2ecf20Sopenharmony_ci
2768c2ecf20Sopenharmony_ci// dest is on 8-byte boundary, src is not. We need to do
2778c2ecf20Sopenharmony_ci// ld8-ld8, shrp, then st8.  Max 8 byte copy per cycle.
2788c2ecf20Sopenharmony_ci.unaligned_src:
2798c2ecf20Sopenharmony_ci	.prologue
2808c2ecf20Sopenharmony_ci	.save ar.pfs, saved_pfs
2818c2ecf20Sopenharmony_ci	alloc	saved_pfs=ar.pfs,3,5,0,8
2828c2ecf20Sopenharmony_ci	.save ar.lc, saved_lc
2838c2ecf20Sopenharmony_ci	mov	saved_lc=ar.lc
2848c2ecf20Sopenharmony_ci	.save pr, saved_pr
2858c2ecf20Sopenharmony_ci	mov	saved_pr=pr
2868c2ecf20Sopenharmony_ci	.body
// Per-chunk setup.  saved_in0/1/2 are re-snapshotted from the current
// dst0/src0/in2, so the fault handler measures progress relative to the start
// of this chunk.  in2 is clamped to blocksize (p6 = blocksize < in2), then:
//   r21    = in2 >> 7   128-byte lines to prefetch
//   r22    = in2 >> 4   16-byte loop iterations
//   curlen = in2 & 15   bytes left for the tail
//   r30    = src0 & 7   source misalignment (selects the jump-table entry)
2878c2ecf20Sopenharmony_ci.4k_block:
2888c2ecf20Sopenharmony_ci	mov	saved_in0=dst0	// need to save all input arguments
2898c2ecf20Sopenharmony_ci	mov	saved_in2=in2
2908c2ecf20Sopenharmony_ci	mov	blocksize=BLOCK_SIZE
2918c2ecf20Sopenharmony_ci	;;
2928c2ecf20Sopenharmony_ci	cmp.lt	p6,p7=blocksize,in2
2938c2ecf20Sopenharmony_ci	mov	saved_in1=src0
2948c2ecf20Sopenharmony_ci	;;
2958c2ecf20Sopenharmony_ci(p6)	mov	in2=blocksize
2968c2ecf20Sopenharmony_ci	;;
2978c2ecf20Sopenharmony_ci	shr.u	r21=in2,7	// this much cache line
2988c2ecf20Sopenharmony_ci	shr.u	r22=in2,4	// number of 16-byte iteration
2998c2ecf20Sopenharmony_ci	and	curlen=15,in2	// copy length after iteration
3008c2ecf20Sopenharmony_ci	and	r30=7,src0	// source alignment
3018c2ecf20Sopenharmony_ci	;;
3028c2ecf20Sopenharmony_ci	cmp.lt	p7,p8=1,r21
3038c2ecf20Sopenharmony_ci	add	cnt=-1,r21
3048c2ecf20Sopenharmony_ci	;;
3058c2ecf20Sopenharmony_ci
3068c2ecf20Sopenharmony_ci	add	src_pre_mem=0,src0	// prefetch src pointer
3078c2ecf20Sopenharmony_ci	add	dst_pre_mem=0,dst0	// prefetch dest pointer
3088c2ecf20Sopenharmony_ci	and	src0=-8,src0		// 1st src pointer
3098c2ecf20Sopenharmony_ci(p7)	mov	ar.lc = cnt
3108c2ecf20Sopenharmony_ci(p8)	mov	ar.lc = r0
3118c2ecf20Sopenharmony_ci	;;
3128c2ecf20Sopenharmony_ci	TEXT_ALIGN(32)
3138c2ecf20Sopenharmony_ci1:	lfetch.fault	  [src_pre_mem], 128
3148c2ecf20Sopenharmony_ci	lfetch.fault.excl [dst_pre_mem], 128
3158c2ecf20Sopenharmony_ci	br.cloop.dptk.few 1b
3168c2ecf20Sopenharmony_ci	;;
3178c2ecf20Sopenharmony_ci
	// Two streams, each moving 8 bytes per loop iteration: src0/dst0 start at
	// the beginning of the chunk, src1/dst1 start r22*8 bytes further in.
	//   p8/p9 = (r22 == 0) / (r22 != 0): no loop needed / loop needed
	//   p6/p7 = (curlen >= 8) / (curlen < 8): used by COPYU and the tail
3188c2ecf20Sopenharmony_ci	shladd	dst1=r22,3,dst0	// 2nd dest pointer
3198c2ecf20Sopenharmony_ci	shladd	src1=r22,3,src0	// 2nd src pointer
3208c2ecf20Sopenharmony_ci	cmp.eq	p8,p9=r22,r0	// do we really need to loop?
3218c2ecf20Sopenharmony_ci	cmp.le	p6,p7=8,curlen;	// have at least 8 byte remaining?
3228c2ecf20Sopenharmony_ci	add	cnt=-1,r22	// ctop iteration adjustment
3238c2ecf20Sopenharmony_ci	;;
3248c2ecf20Sopenharmony_ciEX(.ex_handler, (p9)	ld8	r33=[src0],8)	// loop primer
3258c2ecf20Sopenharmony_ciEK(.ex_handler, (p9)	ld8	r37=[src1],8)
3268c2ecf20Sopenharmony_ci(p8)	br.dpnt.few .noloop
3278c2ecf20Sopenharmony_ci	;;
3288c2ecf20Sopenharmony_ci
3298c2ecf20Sopenharmony_ci// The jump address is calculated based on src alignment. The COPYU
3308c2ecf20Sopenharmony_ci// macro below need to confine its size to power of two, so an entry
3318c2ecf20Sopenharmony_ci// can be calculated using shl instead of an expensive multiply. The
3328c2ecf20Sopenharmony_ci// size is then hard coded by the following #define to match the
3338c2ecf20Sopenharmony_ci// actual size.  This make it somewhat tedious when COPYU macro gets
3348c2ecf20Sopenharmony_ci// changed and this need to be adjusted to match.
//
// Target = .jump_table + (r30 << LOOP_SIZE) - (.jmp1 - .jump_table), i.e. the
// COPYU entry with index r30 - 1, provided one entry is exactly
// (1 << LOOP_SIZE) bytes long.  r29 starts as the address of label 1 below and
// r28 holds the scaled alignment.
3358c2ecf20Sopenharmony_ci#define LOOP_SIZE 6
3368c2ecf20Sopenharmony_ci1:
3378c2ecf20Sopenharmony_ci	mov	r29=ip		// jmp_table thread
3388c2ecf20Sopenharmony_ci	mov	ar.lc=cnt
3398c2ecf20Sopenharmony_ci	;;
3408c2ecf20Sopenharmony_ci	add	r29=.jump_table - 1b - (.jmp1-.jump_table), r29
3418c2ecf20Sopenharmony_ci	shl	r28=r30, LOOP_SIZE	// jmp_table thread
3428c2ecf20Sopenharmony_ci	mov	ar.ec=2		// loop setup
3438c2ecf20Sopenharmony_ci	;;
3448c2ecf20Sopenharmony_ci	add	r29=r29,r28		// jmp_table thread
3458c2ecf20Sopenharmony_ci	cmp.eq	p16,p17=r0,r0
3468c2ecf20Sopenharmony_ci	;;
3478c2ecf20Sopenharmony_ci	mov	b6=r29			// jmp_table thread
3488c2ecf20Sopenharmony_ci	;;
3498c2ecf20Sopenharmony_ci	br.cond.sptk.few b6
3508c2ecf20Sopenharmony_ci
3518c2ecf20Sopenharmony_ci// for 8-15 byte case
3528c2ecf20Sopenharmony_ci// We will skip the loop, but need to replicate the side effect
3538c2ecf20Sopenharmony_ci// that the loop produces.
// Reached when r22 == 0 (fewer than 16 bytes in this chunk).  With p6
// (curlen >= 8) it builds the next 8 destination bytes in r21 by hand from two
// aligned source words: r21 = (w0 >> (8*r30)) | (w1 << (64 - 8*r30)), which is
// the value the tail store below expects.
3548c2ecf20Sopenharmony_ci.noloop:
3558c2ecf20Sopenharmony_ciEX(.ex_handler, (p6)	ld8	r37=[src1],8)
3568c2ecf20Sopenharmony_ci	add	src0=8,src0
3578c2ecf20Sopenharmony_ci(p6)	shl	r25=r30,3
3588c2ecf20Sopenharmony_ci	;;
3598c2ecf20Sopenharmony_ciEX(.ex_handler, (p6)	ld8	r27=[src1])
3608c2ecf20Sopenharmony_ci(p6)	shr.u	r28=r37,r25
3618c2ecf20Sopenharmony_ci(p6)	sub	r26=64,r25
3628c2ecf20Sopenharmony_ci	;;
3638c2ecf20Sopenharmony_ci(p6)	shl	r27=r27,r26
3648c2ecf20Sopenharmony_ci	;;
3658c2ecf20Sopenharmony_ci(p6)	or	r21=r28,r27
3668c2ecf20Sopenharmony_ci
// Common exit of the COPYU loops and of .noloop.  If the length recorded at
// the top of this chunk exceeds blocksize, advance dst/src by blocksize,
// shrink the length by the same amount and start another chunk.
3678c2ecf20Sopenharmony_ci.unaligned_src_tail:
3688c2ecf20Sopenharmony_ci/* check if we have more than blocksize to copy, if so go back */
3698c2ecf20Sopenharmony_ci	cmp.gt	p8,p0=saved_in2,blocksize
3708c2ecf20Sopenharmony_ci	;;
3718c2ecf20Sopenharmony_ci(p8)	add	dst0=saved_in0,blocksize
3728c2ecf20Sopenharmony_ci(p8)	add	src0=saved_in1,blocksize
3738c2ecf20Sopenharmony_ci(p8)	sub	in2=saved_in2,blocksize
3748c2ecf20Sopenharmony_ci(p8)	br.dpnt	.4k_block
3758c2ecf20Sopenharmony_ci	;;
3768c2ecf20Sopenharmony_ci
3778c2ecf20Sopenharmony_ci/* we have up to 15 byte to copy in the tail.
3788c2ecf20Sopenharmony_ci * part of work is already done in the jump table code
3798c2ecf20Sopenharmony_ci * we are at the following state.
3808c2ecf20Sopenharmony_ci * src side:
3818c2ecf20Sopenharmony_ci *
3828c2ecf20Sopenharmony_ci *   xxxxxx xx                   <----- r21 has xxxxxxxx already
3838c2ecf20Sopenharmony_ci * -------- -------- --------
3848c2ecf20Sopenharmony_ci * 0        8        16
3858c2ecf20Sopenharmony_ci *          ^
3868c2ecf20Sopenharmony_ci *          |
3878c2ecf20Sopenharmony_ci *          src1
3888c2ecf20Sopenharmony_ci *
3898c2ecf20Sopenharmony_ci * dst
3908c2ecf20Sopenharmony_ci * -------- -------- --------
3918c2ecf20Sopenharmony_ci * ^
3928c2ecf20Sopenharmony_ci * |
3938c2ecf20Sopenharmony_ci * dst1
3948c2ecf20Sopenharmony_ci */
// Store the pre-assembled word if at least 8 bytes remain, restore ar.pfs,
// ar.lc and pr, and fall through into .memcpy_short with in2 = bytes left
// (now < 8), dst0 = dst1 and src0 = src1 + r30 (the true, unaligned source
// address).
3958c2ecf20Sopenharmony_ciEX(.ex_handler, (p6)	st8	[dst1]=r21,8)	// more than 8 byte to copy
3968c2ecf20Sopenharmony_ci(p6)	add	curlen=-8,curlen	// update length
3978c2ecf20Sopenharmony_ci	mov	ar.pfs=saved_pfs
3988c2ecf20Sopenharmony_ci	;;
3998c2ecf20Sopenharmony_ci	mov	ar.lc=saved_lc
4008c2ecf20Sopenharmony_ci	mov	pr=saved_pr,-1
4018c2ecf20Sopenharmony_ci	mov	in2=curlen	// remaining length
4028c2ecf20Sopenharmony_ci	mov	dst0=dst1	// dest pointer
4038c2ecf20Sopenharmony_ci	add	src0=src1,r30	// forward by src alignment
4048c2ecf20Sopenharmony_ci	;;
4058c2ecf20Sopenharmony_ci
4068c2ecf20Sopenharmony_ci// 7 byte or smaller.
// Byte-at-a-time copy of in2 bytes from src0 to dst0, then return to the
// caller.  src0/dst0 handle the even byte indices and src1/dst1 the odd ones,
// each stepping by 2.  Predicates are (in2 >= k) / (in2 < k) pairs; each
// "br.ret" fires as soon as the byte count has been exhausted.  All accesses
// use .ex_handler_short because pr, ar.lc and ar.pfs already hold the caller's
// values here.
4078c2ecf20Sopenharmony_ci.memcpy_short:
4088c2ecf20Sopenharmony_ci	cmp.le	p8,p9   = 1,in2
4098c2ecf20Sopenharmony_ci	cmp.le	p10,p11 = 2,in2
4108c2ecf20Sopenharmony_ci	cmp.le	p12,p13 = 3,in2
4118c2ecf20Sopenharmony_ci	cmp.le	p14,p15 = 4,in2
4128c2ecf20Sopenharmony_ci	add	src1=1,src0	// second src pointer
4138c2ecf20Sopenharmony_ci	add	dst1=1,dst0	// second dest pointer
4148c2ecf20Sopenharmony_ci	;;
4158c2ecf20Sopenharmony_ci
4168c2ecf20Sopenharmony_ciEX(.ex_handler_short, (p8)	ld1	t1=[src0],2)
4178c2ecf20Sopenharmony_ciEK(.ex_handler_short, (p10)	ld1	t2=[src1],2)
4188c2ecf20Sopenharmony_ci(p9)	br.ret.dpnt rp		// 0 byte copy
4198c2ecf20Sopenharmony_ci	;;
4208c2ecf20Sopenharmony_ci
4218c2ecf20Sopenharmony_ciEX(.ex_handler_short, (p8)	st1	[dst0]=t1,2)
4228c2ecf20Sopenharmony_ciEK(.ex_handler_short, (p10)	st1	[dst1]=t2,2)
4238c2ecf20Sopenharmony_ci(p11)	br.ret.dpnt rp		// 1 byte copy
4248c2ecf20Sopenharmony_ci
4258c2ecf20Sopenharmony_ciEX(.ex_handler_short, (p12)	ld1	t3=[src0],2)
4268c2ecf20Sopenharmony_ciEK(.ex_handler_short, (p14)	ld1	t4=[src1],2)
4278c2ecf20Sopenharmony_ci(p13)	br.ret.dpnt rp		// 2 byte copy
4288c2ecf20Sopenharmony_ci	;;
4298c2ecf20Sopenharmony_ci
	// p6..p11 are recycled here for the 5-, 6- and 7-byte thresholds.
4308c2ecf20Sopenharmony_ci	cmp.le	p6,p7   = 5,in2
4318c2ecf20Sopenharmony_ci	cmp.le	p8,p9   = 6,in2
4328c2ecf20Sopenharmony_ci	cmp.le	p10,p11 = 7,in2
4338c2ecf20Sopenharmony_ci
4348c2ecf20Sopenharmony_ciEX(.ex_handler_short, (p12)	st1	[dst0]=t3,2)
4358c2ecf20Sopenharmony_ciEK(.ex_handler_short, (p14)	st1	[dst1]=t4,2)
4368c2ecf20Sopenharmony_ci(p15)	br.ret.dpnt rp		// 3 byte copy
4378c2ecf20Sopenharmony_ci	;;
4388c2ecf20Sopenharmony_ci
4398c2ecf20Sopenharmony_ciEX(.ex_handler_short, (p6)	ld1	t5=[src0],2)
4408c2ecf20Sopenharmony_ciEK(.ex_handler_short, (p8)	ld1	t6=[src1],2)
4418c2ecf20Sopenharmony_ci(p7)	br.ret.dpnt rp		// 4 byte copy
4428c2ecf20Sopenharmony_ci	;;
4438c2ecf20Sopenharmony_ci
4448c2ecf20Sopenharmony_ciEX(.ex_handler_short, (p6)	st1	[dst0]=t5,2)
4458c2ecf20Sopenharmony_ciEK(.ex_handler_short, (p8)	st1	[dst1]=t6,2)
4468c2ecf20Sopenharmony_ci(p9)	br.ret.dptk rp		// 5 byte copy
4478c2ecf20Sopenharmony_ci
4488c2ecf20Sopenharmony_ciEX(.ex_handler_short, (p10)	ld1	t7=[src0],2)
4498c2ecf20Sopenharmony_ci(p11)	br.ret.dptk rp		// 6 byte copy
4508c2ecf20Sopenharmony_ci	;;
4518c2ecf20Sopenharmony_ci
4528c2ecf20Sopenharmony_ciEX(.ex_handler_short, (p10)	st1	[dst0]=t7,2)
4538c2ecf20Sopenharmony_ci	br.ret.dptk rp		// done all cases
4548c2ecf20Sopenharmony_ci
4558c2ecf20Sopenharmony_ci
4568c2ecf20Sopenharmony_ci/* Align dest to nearest 8-byte boundary. We know we have at
4578c2ecf20Sopenharmony_ci * least 7 bytes to copy, enough to crawl to 8-byte boundary.
4588c2ecf20Sopenharmony_ci * Actual number of byte to crawl depend on the dest alignment.
4598c2ecf20Sopenharmony_ci * 7 byte or less is taken care at .memcpy_short
4608c2ecf20Sopenharmony_ci
4618c2ecf20Sopenharmony_ci * src0 - source even index
4628c2ecf20Sopenharmony_ci * src1 - source  odd index
4638c2ecf20Sopenharmony_ci * dst0 - dest even index
4648c2ecf20Sopenharmony_ci * dst1 - dest  odd index
4658c2ecf20Sopenharmony_ci * r30  - distance to 8-byte boundary
4668c2ecf20Sopenharmony_ci */
4678c2ecf20Sopenharmony_ci
// On entry (from .common_code): r28 = dest & 7 (non-zero), r29 = src & 7,
// r30 = 8 - r28, p6 = (r30 >= 1), src0 = in1, dst0 = in0, dst1 = in0 + 1.
// p7..p12 are set to (r30 >= 2) .. (r30 >= 7), so exactly r30 bytes are
// copied.  Afterwards in2 is reduced by r30, src0/dst0 are advanced by r30 and
// control goes to .aligned_src when src and dest had the same misalignment
// (r28 == r29), otherwise to .unaligned_src.
4688c2ecf20Sopenharmony_ci.align_dest:
4698c2ecf20Sopenharmony_ci	add	src1=1,in1	// source odd index
4708c2ecf20Sopenharmony_ci	cmp.le	p7,p0 = 2,r30	// for .align_dest
4718c2ecf20Sopenharmony_ci	cmp.le	p8,p0 = 3,r30	// for .align_dest
4728c2ecf20Sopenharmony_ciEX(.ex_handler_short, (p6)	ld1	t1=[src0],2)
4738c2ecf20Sopenharmony_ci	cmp.le	p9,p0 = 4,r30	// for .align_dest
4748c2ecf20Sopenharmony_ci	cmp.le	p10,p0 = 5,r30
4758c2ecf20Sopenharmony_ci	;;
4768c2ecf20Sopenharmony_ciEX(.ex_handler_short, (p7)	ld1	t2=[src1],2)
4778c2ecf20Sopenharmony_ciEK(.ex_handler_short, (p8)	ld1	t3=[src0],2)
4788c2ecf20Sopenharmony_ci	cmp.le	p11,p0 = 6,r30
4798c2ecf20Sopenharmony_ciEX(.ex_handler_short, (p6)	st1	[dst0] = t1,2)
4808c2ecf20Sopenharmony_ci	cmp.le	p12,p0 = 7,r30
4818c2ecf20Sopenharmony_ci	;;
4828c2ecf20Sopenharmony_ciEX(.ex_handler_short, (p9)	ld1	t4=[src1],2)
4838c2ecf20Sopenharmony_ciEK(.ex_handler_short, (p10)	ld1	t5=[src0],2)
4848c2ecf20Sopenharmony_ciEX(.ex_handler_short, (p7)	st1	[dst1] = t2,2)
4858c2ecf20Sopenharmony_ciEK(.ex_handler_short, (p8)	st1	[dst0] = t3,2)
4868c2ecf20Sopenharmony_ci	;;
4878c2ecf20Sopenharmony_ciEX(.ex_handler_short, (p11)	ld1	t6=[src1],2)
4888c2ecf20Sopenharmony_ciEK(.ex_handler_short, (p12)	ld1	t7=[src0],2)
4898c2ecf20Sopenharmony_ci	cmp.eq	p6,p7=r28,r29
4908c2ecf20Sopenharmony_ciEX(.ex_handler_short, (p9)	st1	[dst1] = t4,2)
4918c2ecf20Sopenharmony_ciEK(.ex_handler_short, (p10)	st1	[dst0] = t5,2)
4928c2ecf20Sopenharmony_ci	sub	in2=in2,r30
4938c2ecf20Sopenharmony_ci	;;
4948c2ecf20Sopenharmony_ciEX(.ex_handler_short, (p11)	st1	[dst1] = t6,2)
4958c2ecf20Sopenharmony_ciEK(.ex_handler_short, (p12)	st1	[dst0] = t7)
4968c2ecf20Sopenharmony_ci	add	dst0=in0,r30	// setup arguments
4978c2ecf20Sopenharmony_ci	add	src0=in1,r30
4988c2ecf20Sopenharmony_ci(p6)	br.cond.dptk .aligned_src
4998c2ecf20Sopenharmony_ci(p7)	br.cond.dpnt .unaligned_src
5008c2ecf20Sopenharmony_ci	;;
5018c2ecf20Sopenharmony_ci
5028c2ecf20Sopenharmony_ci/* main loop body in jump table format */
/*
 * COPYU(shift): one jump-table entry for a source that is shift/8 bytes past
 * an 8-byte boundary.  Each iteration of the two-stage loop loads one aligned
 * word per stream (src0, src1), merges it with the previously loaded word of
 * the same stream using shrp, and stores the merged word through dst0/dst1.
 * When p6 is set (at least 8 tail bytes) it also loads the word at src1 into
 * r22 for the tail; when p7 is set it backs src1 up by 8 after the loop.
 * After the loop r21 = shrp(r22, r38, shift) is computed for
 * .unaligned_src_tail, which every entry branches to.
 *
 * The dispatch code before .noloop indexes this table with
 * (r30 << LOOP_SIZE), so every expansion must occupy exactly
 * (1 << LOOP_SIZE) bytes; do not add or remove instructions here without
 * updating LOOP_SIZE.
 */
5038c2ecf20Sopenharmony_ci#define COPYU(shift)									\
5048c2ecf20Sopenharmony_ci1:											\
5058c2ecf20Sopenharmony_ciEX(.ex_handler,  (p16)	ld8	r32=[src0],8);		/* 1 */				\
5068c2ecf20Sopenharmony_ciEK(.ex_handler,  (p16)	ld8	r36=[src1],8);						\
5078c2ecf20Sopenharmony_ci		 (p17)	shrp	r35=r33,r34,shift;;	/* 1 */				\
5088c2ecf20Sopenharmony_ciEX(.ex_handler,  (p6)	ld8	r22=[src1]);	/* common, prime for tail section */	\
5098c2ecf20Sopenharmony_ci		 nop.m	0;								\
5108c2ecf20Sopenharmony_ci		 (p16)	shrp	r38=r36,r37,shift;					\
5118c2ecf20Sopenharmony_ciEX(.ex_handler,  (p17)	st8	[dst0]=r35,8);		/* 1 */				\
5128c2ecf20Sopenharmony_ciEK(.ex_handler,  (p17)	st8	[dst1]=r39,8);						\
5138c2ecf20Sopenharmony_ci		 br.ctop.dptk.few 1b;;							\
5148c2ecf20Sopenharmony_ci		 (p7)	add	src1=-8,src1;	/* back out for <8 byte case */		\
5158c2ecf20Sopenharmony_ci		 shrp	r21=r22,r38,shift;	/* speculative work */			\
5168c2ecf20Sopenharmony_ci		 br.sptk.few .unaligned_src_tail /* branch out of jump table */		\
5178c2ecf20Sopenharmony_ci		 ;;
5188c2ecf20Sopenharmony_ci	TEXT_ALIGN(32)
// Seven entries, one per non-zero source misalignment (1..7 bytes), in
// ascending order.  .jmp1 marks the end of the first entry so that
// (.jmp1 - .jump_table) gives the size of one entry to the dispatch code.
5198c2ecf20Sopenharmony_ci.jump_table:
5208c2ecf20Sopenharmony_ci	COPYU(8)	// unaligned cases
5218c2ecf20Sopenharmony_ci.jmp1:
5228c2ecf20Sopenharmony_ci	COPYU(16)
5238c2ecf20Sopenharmony_ci	COPYU(24)
5248c2ecf20Sopenharmony_ci	COPYU(32)
5258c2ecf20Sopenharmony_ci	COPYU(40)
5268c2ecf20Sopenharmony_ci	COPYU(48)
5278c2ecf20Sopenharmony_ci	COPYU(56)
5288c2ecf20Sopenharmony_ci
// The pipeline-stage indices are no longer needed; the names A..D are reused
// as register aliases by the exception handler below.
5298c2ecf20Sopenharmony_ci#undef A
5308c2ecf20Sopenharmony_ci#undef B
5318c2ecf20Sopenharmony_ci#undef C
5328c2ecf20Sopenharmony_ci#undef D
5338c2ecf20Sopenharmony_ci
5348c2ecf20Sopenharmony_ci/*
5358c2ecf20Sopenharmony_ci * Due to lack of local tag support in gcc 2.x assembler, it is not clear which
5368c2ecf20Sopenharmony_ci * instruction failed in the bundle.  The exception algorithm is that we
5378c2ecf20Sopenharmony_ci * first figure out the faulting address, then detect if there is any
5388c2ecf20Sopenharmony_ci * progress made on the copy, if so, redo the copy from last known copied
5398c2ecf20Sopenharmony_ci * location up to the faulting address (exclusive). In the copy_from_user
5408c2ecf20Sopenharmony_ci * case, remaining byte in kernel buffer will be zeroed.
5418c2ecf20Sopenharmony_ci *
5428c2ecf20Sopenharmony_ci * Take copy_from_user as an example, in the code there are multiple loads
5438c2ecf20Sopenharmony_ci * in a bundle and those multiple loads could span over two pages, the
5448c2ecf20Sopenharmony_ci * faulting address is calculated as page_round_down(max(src0, src1)).
5458c2ecf20Sopenharmony_ci * This is based on knowledge that if we can access one byte in a page, we
5468c2ecf20Sopenharmony_ci * can access any byte in that page.
5478c2ecf20Sopenharmony_ci *
5488c2ecf20Sopenharmony_ci * predicate used in the exception handler:
5498c2ecf20Sopenharmony_ci * p6-p7: direction
5508c2ecf20Sopenharmony_ci * p10-p11: src faulting addr calculation
5518c2ecf20Sopenharmony_ci * p12-p13: dst faulting addr calculation
5528c2ecf20Sopenharmony_ci */
5538c2ecf20Sopenharmony_ci
/*
 * Handler-local names.  A..D hold the byte counts described in the comment
 * further down; F holds the page-aligned faulting address.  They reuse
 * r19-r22 and r28, i.e. the prefetch pointers and a temp of the copy code.
 */
5548c2ecf20Sopenharmony_ci#define A	r19
5558c2ecf20Sopenharmony_ci#define B	r20
5568c2ecf20Sopenharmony_ci#define C	r21
5578c2ecf20Sopenharmony_ci#define D	r22
5588c2ecf20Sopenharmony_ci#define F	r28
5598c2ecf20Sopenharmony_ci
/* Stacked locals of the frame allocated inside the handler. */
5608c2ecf20Sopenharmony_ci#define saved_retval	loc0
5618c2ecf20Sopenharmony_ci#define saved_rtlink	loc1
5628c2ecf20Sopenharmony_ci#define saved_pfs_stack	loc2
5638c2ecf20Sopenharmony_ci
// Entry used by the src1 loads in .aligned_src_tail: bump src0 by 8 first.
5648c2ecf20Sopenharmony_ci.ex_hndlr_s:
5658c2ecf20Sopenharmony_ci	add	src0=8,src0
5668c2ecf20Sopenharmony_ci	br.sptk .ex_handler
5678c2ecf20Sopenharmony_ci	;;
// Entry used by the dst1 store in .aligned_src_tail: bump dst0 by 8 first.
5688c2ecf20Sopenharmony_ci.ex_hndlr_d:
5698c2ecf20Sopenharmony_ci	add	dst0=8,dst0
5708c2ecf20Sopenharmony_ci	br.sptk .ex_handler
5718c2ecf20Sopenharmony_ci	;;
// Entry used by .prefetch_loop.  The prefetch pointers become src1/dst1 (the
// "furthest" addresses); src0/dst0 are set to the original pointer, plus 8 if
// the corresponding prefetch pointer has already moved past it.
5728c2ecf20Sopenharmony_ci.ex_hndlr_lcpy_1:
5738c2ecf20Sopenharmony_ci	mov	src1=src_pre_mem
5748c2ecf20Sopenharmony_ci	mov	dst1=dst_pre_mem
5758c2ecf20Sopenharmony_ci	cmp.gtu	p10,p11=src_pre_mem,saved_in1
5768c2ecf20Sopenharmony_ci	cmp.gtu	p12,p13=dst_pre_mem,saved_in0
5778c2ecf20Sopenharmony_ci	;;
5788c2ecf20Sopenharmony_ci(p10)	add	src0=8,saved_in1
5798c2ecf20Sopenharmony_ci(p11)	mov	src0=saved_in1
5808c2ecf20Sopenharmony_ci(p12)	add	dst0=8,saved_in0
5818c2ecf20Sopenharmony_ci(p13)	mov	dst0=saved_in0
5828c2ecf20Sopenharmony_ci	br.sptk	.ex_handler
5838c2ecf20Sopenharmony_ci.ex_handler_lcpy:
5848c2ecf20Sopenharmony_ci	// in line_copy block, the preload addresses should always ahead
5858c2ecf20Sopenharmony_ci	// of the other two src/dst pointers.  Furthermore, src1/dst1 should
5868c2ecf20Sopenharmony_ci	// always ahead of src0/dst0.
5878c2ecf20Sopenharmony_ci	mov	src1=src_pre_mem
5888c2ecf20Sopenharmony_ci	mov	dst1=dst_pre_mem
5898c2ecf20Sopenharmony_ci.ex_handler:
5908c2ecf20Sopenharmony_ci	mov	pr=saved_pr,-1		// first restore pr, lc, and pfs
5918c2ecf20Sopenharmony_ci	mov	ar.lc=saved_lc
5928c2ecf20Sopenharmony_ci	mov	ar.pfs=saved_pfs
5938c2ecf20Sopenharmony_ci	;;
// Common analysis.  After this group: src1 = max(src0, src1),
// dst0 = min(dst0, dst1), dst1 = max(dst0, dst1).  F is the page-aligned
// address derived from dst1 (p6) or src1 (p7).  p14 is set when the relevant
// pointer has not moved past its saved start, in which case the routine
// returns with retval = saved_in2 (nothing copied).  p8 is set when f6 == f0,
// i.e. the memcpy entry was used; the addresses are then touched again
// without a fix-up entry so the fault is not recovered.
5948c2ecf20Sopenharmony_ci.ex_handler_short: // fault occurred in these sections didn't change pr, lc, pfs
5958c2ecf20Sopenharmony_ci	cmp.ltu	p6,p7=saved_in0, saved_in1	// get the copy direction
5968c2ecf20Sopenharmony_ci	cmp.ltu	p10,p11=src0,src1
5978c2ecf20Sopenharmony_ci	cmp.ltu	p12,p13=dst0,dst1
5988c2ecf20Sopenharmony_ci	fcmp.eq	p8,p0=f6,f0		// is it memcpy?
5998c2ecf20Sopenharmony_ci	mov	tmp = dst0
6008c2ecf20Sopenharmony_ci	;;
6018c2ecf20Sopenharmony_ci(p11)	mov	src1 = src0		// pick the larger of the two
6028c2ecf20Sopenharmony_ci(p13)	mov	dst0 = dst1		// make dst0 the smaller one
6038c2ecf20Sopenharmony_ci(p13)	mov	dst1 = tmp		// and dst1 the larger one
6048c2ecf20Sopenharmony_ci	;;
6058c2ecf20Sopenharmony_ci(p6)	dep	F = r0,dst1,0,PAGE_SHIFT // usr dst round down to page boundary
6068c2ecf20Sopenharmony_ci(p7)	dep	F = r0,src1,0,PAGE_SHIFT // usr src round down to page boundary
6078c2ecf20Sopenharmony_ci	;;
6088c2ecf20Sopenharmony_ci(p6)	cmp.le	p14,p0=dst0,saved_in0	// no progress has been made on store
6098c2ecf20Sopenharmony_ci(p7)	cmp.le	p14,p0=src0,saved_in1	// no progress has been made on load
6108c2ecf20Sopenharmony_ci	mov	retval=saved_in2
6118c2ecf20Sopenharmony_ci(p8)	ld1	tmp=[src1]		// force an oops for memcpy call
6128c2ecf20Sopenharmony_ci(p8)	st1	[dst1]=r0		// force an oops for memcpy call
6138c2ecf20Sopenharmony_ci(p14)	br.ret.sptk.many rp
6148c2ecf20Sopenharmony_ci
6158c2ecf20Sopenharmony_ci/*
6168c2ecf20Sopenharmony_ci * The remaining byte to copy is calculated as:
6178c2ecf20Sopenharmony_ci *
6188c2ecf20Sopenharmony_ci * A =	(faulting_addr - orig_src)	-> len to faulting ld address
6198c2ecf20Sopenharmony_ci *	or
6208c2ecf20Sopenharmony_ci * 	(faulting_addr - orig_dst)	-> len to faulting st address
6218c2ecf20Sopenharmony_ci * B =	(cur_dst - orig_dst)		-> len copied so far
6228c2ecf20Sopenharmony_ci * C =	A - B				-> len need to be copied
6238c2ecf20Sopenharmony_ci * D =	orig_len - A			-> len need to be left alone
6248c2ecf20Sopenharmony_ci */
6258c2ecf20Sopenharmony_ci(p6)	sub	A = F, saved_in0
6268c2ecf20Sopenharmony_ci(p7)	sub	A = F, saved_in1
6278c2ecf20Sopenharmony_ci	clrrrb
6288c2ecf20Sopenharmony_ci	;;
	// Fresh frame (3 inputs, 3 locals, 3 outputs, no rotating registers) so
	// the recursive call below has out0-out2 and loc0-loc2 available.
6298c2ecf20Sopenharmony_ci	alloc	saved_pfs_stack=ar.pfs,3,3,3,0
6308c2ecf20Sopenharmony_ci	cmp.lt	p8,p0=A,r0
6318c2ecf20Sopenharmony_ci	sub	B = dst0, saved_in0	// how many byte copied so far
6328c2ecf20Sopenharmony_ci	;;
6338c2ecf20Sopenharmony_ci(p8)	mov	A = 0;			// A shouldn't be negative, cap it
6348c2ecf20Sopenharmony_ci	;;
6358c2ecf20Sopenharmony_ci	sub	C = A, B
6368c2ecf20Sopenharmony_ci	sub	D = saved_in2, A
6378c2ecf20Sopenharmony_ci	;;
6388c2ecf20Sopenharmony_ci	cmp.gt	p8,p0=C,r0		// more than 1 byte?
6398c2ecf20Sopenharmony_ci	mov	r8=0
6408c2ecf20Sopenharmony_ci	mov	saved_retval = D
6418c2ecf20Sopenharmony_ci	mov	saved_rtlink = b0
6428c2ecf20Sopenharmony_ci
	// If C > 0, copy the C bytes between the last known-good position and the
	// faulting page by calling __copy_user(saved_in0 + B, saved_in1 + B, C).
	// r8 was zeroed above, so skipping the call contributes nothing.
6438c2ecf20Sopenharmony_ci	add	out0=saved_in0, B
6448c2ecf20Sopenharmony_ci	add	out1=saved_in1, B
6458c2ecf20Sopenharmony_ci	mov	out2=C
6468c2ecf20Sopenharmony_ci(p8)	br.call.sptk.few b0=__copy_user	// recursive call
6478c2ecf20Sopenharmony_ci	;;
6488c2ecf20Sopenharmony_ci
6498c2ecf20Sopenharmony_ci	add	saved_retval=saved_retval,r8	// above might return non-zero value
6508c2ecf20Sopenharmony_ci	;;
6518c2ecf20Sopenharmony_ci
	// Return D plus whatever the recursive call could not copy; restore the
	// caller's ar.pfs and return link first.
6528c2ecf20Sopenharmony_ci	mov	retval=saved_retval
6538c2ecf20Sopenharmony_ci	mov	ar.pfs=saved_pfs_stack
6548c2ecf20Sopenharmony_ci	mov	b0=saved_rtlink
6558c2ecf20Sopenharmony_ci	br.ret.sptk.many rp
6568c2ecf20Sopenharmony_ci
6578c2ecf20Sopenharmony_ci/* end of McKinley specific optimization */
6588c2ecf20Sopenharmony_ciEND(__copy_user)
6598c2ecf20Sopenharmony_ciEXPORT_SYMBOL(__copy_user)
660