162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0 */
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Itanium 2-optimized version of memcpy and copy_user function
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * Inputs:
662306a36Sopenharmony_ci * 	in0:	destination address
762306a36Sopenharmony_ci *	in1:	source address
862306a36Sopenharmony_ci *	in2:	number of bytes to copy
962306a36Sopenharmony_ci * Output:
1062306a36Sopenharmony_ci *	for memcpy:    return dest
1162306a36Sopenharmony_ci * 	for copy_user: return 0 if success,
1262306a36Sopenharmony_ci *		       or number of bytes NOT copied if error occurred.
1362306a36Sopenharmony_ci *
1462306a36Sopenharmony_ci * Copyright (C) 2002 Intel Corp.
1562306a36Sopenharmony_ci * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
1662306a36Sopenharmony_ci */
1762306a36Sopenharmony_ci#include <linux/export.h>
1862306a36Sopenharmony_ci#include <asm/asmmacro.h>
1962306a36Sopenharmony_ci#include <asm/page.h>
2062306a36Sopenharmony_ci
2162306a36Sopenharmony_ci#define EK(y...) EX(y)
2262306a36Sopenharmony_ci
2362306a36Sopenharmony_ci/* McKinley specific optimization */
2462306a36Sopenharmony_ci
2562306a36Sopenharmony_ci#define retval		r8
2662306a36Sopenharmony_ci#define saved_pfs	r31
2762306a36Sopenharmony_ci#define saved_lc	r10
2862306a36Sopenharmony_ci#define saved_pr	r11
2962306a36Sopenharmony_ci#define saved_in0	r14
3062306a36Sopenharmony_ci#define saved_in1	r15
3162306a36Sopenharmony_ci#define saved_in2	r16
3262306a36Sopenharmony_ci
3362306a36Sopenharmony_ci#define src0		r2
3462306a36Sopenharmony_ci#define src1		r3
3562306a36Sopenharmony_ci#define dst0		r17
3662306a36Sopenharmony_ci#define dst1		r18
3762306a36Sopenharmony_ci#define cnt		r9
3862306a36Sopenharmony_ci
3962306a36Sopenharmony_ci/* r19-r30 are temp for each code section */
4062306a36Sopenharmony_ci#define PREFETCH_DIST	8
4162306a36Sopenharmony_ci#define src_pre_mem	r19
4262306a36Sopenharmony_ci#define dst_pre_mem	r20
4362306a36Sopenharmony_ci#define src_pre_l2	r21
4462306a36Sopenharmony_ci#define dst_pre_l2	r22
4562306a36Sopenharmony_ci#define t1		r23
4662306a36Sopenharmony_ci#define t2		r24
4762306a36Sopenharmony_ci#define t3		r25
4862306a36Sopenharmony_ci#define t4		r26
4962306a36Sopenharmony_ci#define t5		t1	// alias!
5062306a36Sopenharmony_ci#define t6		t2	// alias!
5162306a36Sopenharmony_ci#define t7		t3	// alias!
5262306a36Sopenharmony_ci#define n8		r27
5362306a36Sopenharmony_ci#define t9		t5	// alias!
5462306a36Sopenharmony_ci#define t10		t4	// alias!
5562306a36Sopenharmony_ci#define t11		t7	// alias!
5662306a36Sopenharmony_ci#define t12		t6	// alias!
5762306a36Sopenharmony_ci#define t14		t10	// alias!
5862306a36Sopenharmony_ci#define t13		r28
5962306a36Sopenharmony_ci#define t15		r29
6062306a36Sopenharmony_ci#define tmp		r30
6162306a36Sopenharmony_ci
6262306a36Sopenharmony_ci/* defines for long_copy block */
6362306a36Sopenharmony_ci#define	A	0
6462306a36Sopenharmony_ci#define B	(PREFETCH_DIST)
6562306a36Sopenharmony_ci#define C	(B + PREFETCH_DIST)
6662306a36Sopenharmony_ci#define D	(C + 1)
6762306a36Sopenharmony_ci#define N	(D + 1)
6862306a36Sopenharmony_ci#define Nrot	((N + 7) & ~7)
6962306a36Sopenharmony_ci
7062306a36Sopenharmony_ci/* alias */
7162306a36Sopenharmony_ci#define in0		r32
7262306a36Sopenharmony_ci#define in1		r33
7362306a36Sopenharmony_ci#define in2		r34
7462306a36Sopenharmony_ci
7562306a36Sopenharmony_ciGLOBAL_ENTRY(memcpy)
7662306a36Sopenharmony_ci	and	r28=0x7,in0
7762306a36Sopenharmony_ci	and	r29=0x7,in1
7862306a36Sopenharmony_ci	mov	f6=f0
7962306a36Sopenharmony_ci	mov	retval=in0
8062306a36Sopenharmony_ci	br.cond.sptk .common_code
8162306a36Sopenharmony_ci	;;
8262306a36Sopenharmony_ciEND(memcpy)
8362306a36Sopenharmony_ciEXPORT_SYMBOL(memcpy)
8462306a36Sopenharmony_ciGLOBAL_ENTRY(__copy_user)
8562306a36Sopenharmony_ci	.prologue
8662306a36Sopenharmony_ci// check dest alignment
8762306a36Sopenharmony_ci	and	r28=0x7,in0
8862306a36Sopenharmony_ci	and	r29=0x7,in1
8962306a36Sopenharmony_ci	mov	f6=f1
9062306a36Sopenharmony_ci	mov	saved_in0=in0	// save dest pointer
9162306a36Sopenharmony_ci	mov	saved_in1=in1	// save src pointer
9262306a36Sopenharmony_ci	mov	retval=r0	// initialize return value
9362306a36Sopenharmony_ci	;;
9462306a36Sopenharmony_ci.common_code:
9562306a36Sopenharmony_ci	cmp.gt	p15,p0=8,in2	// check for small size
9662306a36Sopenharmony_ci	cmp.ne	p13,p0=0,r28	// check dest alignment
9762306a36Sopenharmony_ci	cmp.ne	p14,p0=0,r29	// check src alignment
9862306a36Sopenharmony_ci	add	src0=0,in1
9962306a36Sopenharmony_ci	sub	r30=8,r28	// for .align_dest
10062306a36Sopenharmony_ci	mov	saved_in2=in2	// save len
10162306a36Sopenharmony_ci	;;
10262306a36Sopenharmony_ci	add	dst0=0,in0
10362306a36Sopenharmony_ci	add	dst1=1,in0	// dest odd index
10462306a36Sopenharmony_ci	cmp.le	p6,p0 = 1,r30	// for .align_dest
10562306a36Sopenharmony_ci(p15)	br.cond.dpnt .memcpy_short
10662306a36Sopenharmony_ci(p13)	br.cond.dpnt .align_dest
10762306a36Sopenharmony_ci(p14)	br.cond.dpnt .unaligned_src
10862306a36Sopenharmony_ci	;;
10962306a36Sopenharmony_ci
11062306a36Sopenharmony_ci// both dest and src are aligned on 8-byte boundary
11162306a36Sopenharmony_ci.aligned_src:
11262306a36Sopenharmony_ci	.save ar.pfs, saved_pfs
11362306a36Sopenharmony_ci	alloc	saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
11462306a36Sopenharmony_ci	.save pr, saved_pr
11562306a36Sopenharmony_ci	mov	saved_pr=pr
11662306a36Sopenharmony_ci
11762306a36Sopenharmony_ci	shr.u	cnt=in2,7	// this much cache line
11862306a36Sopenharmony_ci	;;
11962306a36Sopenharmony_ci	cmp.lt	p6,p0=2*PREFETCH_DIST,cnt
12062306a36Sopenharmony_ci	cmp.lt	p7,p8=1,cnt
12162306a36Sopenharmony_ci	.save ar.lc, saved_lc
12262306a36Sopenharmony_ci	mov	saved_lc=ar.lc
12362306a36Sopenharmony_ci	.body
12462306a36Sopenharmony_ci	add	cnt=-1,cnt
12562306a36Sopenharmony_ci	add	src_pre_mem=0,in1	// prefetch src pointer
12662306a36Sopenharmony_ci	add	dst_pre_mem=0,in0	// prefetch dest pointer
12762306a36Sopenharmony_ci	;;
12862306a36Sopenharmony_ci(p7)	mov	ar.lc=cnt	// prefetch count
12962306a36Sopenharmony_ci(p8)	mov	ar.lc=r0
13062306a36Sopenharmony_ci(p6)	br.cond.dpnt .long_copy
13162306a36Sopenharmony_ci	;;
13262306a36Sopenharmony_ci
13362306a36Sopenharmony_ci.prefetch:
13462306a36Sopenharmony_ci	lfetch.fault	  [src_pre_mem], 128
13562306a36Sopenharmony_ci	lfetch.fault.excl [dst_pre_mem], 128
13662306a36Sopenharmony_ci	br.cloop.dptk.few .prefetch
13762306a36Sopenharmony_ci	;;
13862306a36Sopenharmony_ci
13962306a36Sopenharmony_ci.medium_copy:
14062306a36Sopenharmony_ci	and	tmp=31,in2	// copy length after iteration
14162306a36Sopenharmony_ci	shr.u	r29=in2,5	// number of 32-byte iteration
14262306a36Sopenharmony_ci	add	dst1=8,dst0	// 2nd dest pointer
14362306a36Sopenharmony_ci	;;
14462306a36Sopenharmony_ci	add	cnt=-1,r29	// ctop iteration adjustment
14562306a36Sopenharmony_ci	cmp.eq	p10,p0=r29,r0	// do we really need to loop?
14662306a36Sopenharmony_ci	add	src1=8,src0	// 2nd src pointer
14762306a36Sopenharmony_ci	cmp.le	p6,p0=8,tmp
14862306a36Sopenharmony_ci	;;
14962306a36Sopenharmony_ci	cmp.le	p7,p0=16,tmp
15062306a36Sopenharmony_ci	mov	ar.lc=cnt	// loop setup
15162306a36Sopenharmony_ci	cmp.eq	p16,p17 = r0,r0
15262306a36Sopenharmony_ci	mov	ar.ec=2
15362306a36Sopenharmony_ci(p10)	br.dpnt.few .aligned_src_tail
15462306a36Sopenharmony_ci	;;
15562306a36Sopenharmony_ci	TEXT_ALIGN(32)
15662306a36Sopenharmony_ci1:
15762306a36Sopenharmony_ciEX(.ex_handler, (p16)	ld8	r34=[src0],16)
15862306a36Sopenharmony_ciEK(.ex_handler, (p16)	ld8	r38=[src1],16)
15962306a36Sopenharmony_ciEX(.ex_handler, (p17)	st8	[dst0]=r33,16)
16062306a36Sopenharmony_ciEK(.ex_handler, (p17)	st8	[dst1]=r37,16)
16162306a36Sopenharmony_ci	;;
16262306a36Sopenharmony_ciEX(.ex_handler, (p16)	ld8	r32=[src0],16)
16362306a36Sopenharmony_ciEK(.ex_handler, (p16)	ld8	r36=[src1],16)
16462306a36Sopenharmony_ciEX(.ex_handler, (p16)	st8	[dst0]=r34,16)
16562306a36Sopenharmony_ciEK(.ex_handler, (p16)	st8	[dst1]=r38,16)
16662306a36Sopenharmony_ci	br.ctop.dptk.few 1b
16762306a36Sopenharmony_ci	;;
16862306a36Sopenharmony_ci
16962306a36Sopenharmony_ci.aligned_src_tail:
17062306a36Sopenharmony_ciEX(.ex_handler, (p6)	ld8	t1=[src0])
17162306a36Sopenharmony_ci	mov	ar.lc=saved_lc
17262306a36Sopenharmony_ci	mov	ar.pfs=saved_pfs
17362306a36Sopenharmony_ciEX(.ex_hndlr_s, (p7)	ld8	t2=[src1],8)
17462306a36Sopenharmony_ci	cmp.le	p8,p0=24,tmp
17562306a36Sopenharmony_ci	and	r21=-8,tmp
17662306a36Sopenharmony_ci	;;
17762306a36Sopenharmony_ciEX(.ex_hndlr_s, (p8)	ld8	t3=[src1])
17862306a36Sopenharmony_ciEX(.ex_handler, (p6)	st8	[dst0]=t1)	// store byte 1
17962306a36Sopenharmony_ci	and	in2=7,tmp	// remaining length
18062306a36Sopenharmony_ciEX(.ex_hndlr_d, (p7)	st8	[dst1]=t2,8)	// store byte 2
18162306a36Sopenharmony_ci	add	src0=src0,r21	// setting up src pointer
18262306a36Sopenharmony_ci	add	dst0=dst0,r21	// setting up dest pointer
18362306a36Sopenharmony_ci	;;
18462306a36Sopenharmony_ciEX(.ex_handler, (p8)	st8	[dst1]=t3)	// store byte 3
18562306a36Sopenharmony_ci	mov	pr=saved_pr,-1
18662306a36Sopenharmony_ci	br.dptk.many .memcpy_short
18762306a36Sopenharmony_ci	;;
18862306a36Sopenharmony_ci
18962306a36Sopenharmony_ci/* code taken from copy_page_mck */
19062306a36Sopenharmony_ci.long_copy:
19162306a36Sopenharmony_ci	.rotr v[2*PREFETCH_DIST]
19262306a36Sopenharmony_ci	.rotp p[N]
19362306a36Sopenharmony_ci
19462306a36Sopenharmony_ci	mov src_pre_mem = src0
19562306a36Sopenharmony_ci	mov pr.rot = 0x10000
19662306a36Sopenharmony_ci	mov ar.ec = 1				// special unrolled loop
19762306a36Sopenharmony_ci
19862306a36Sopenharmony_ci	mov dst_pre_mem = dst0
19962306a36Sopenharmony_ci
20062306a36Sopenharmony_ci	add src_pre_l2 = 8*8, src0
20162306a36Sopenharmony_ci	add dst_pre_l2 = 8*8, dst0
20262306a36Sopenharmony_ci	;;
20362306a36Sopenharmony_ci	add src0 = 8, src_pre_mem		// first t1 src
20462306a36Sopenharmony_ci	mov ar.lc = 2*PREFETCH_DIST - 1
20562306a36Sopenharmony_ci	shr.u cnt=in2,7				// number of lines
20662306a36Sopenharmony_ci	add src1 = 3*8, src_pre_mem		// first t3 src
20762306a36Sopenharmony_ci	add dst0 = 8, dst_pre_mem		// first t1 dst
20862306a36Sopenharmony_ci	add dst1 = 3*8, dst_pre_mem		// first t3 dst
20962306a36Sopenharmony_ci	;;
21062306a36Sopenharmony_ci	and tmp=127,in2				// remaining bytes after this block
21162306a36Sopenharmony_ci	add cnt = -(2*PREFETCH_DIST) - 1, cnt
21262306a36Sopenharmony_ci	// same as .line_copy loop, but with all predicated-off instructions removed:
21362306a36Sopenharmony_ci.prefetch_loop:
21462306a36Sopenharmony_ciEX(.ex_hndlr_lcpy_1, (p[A])	ld8 v[A] = [src_pre_mem], 128)		// M0
21562306a36Sopenharmony_ciEK(.ex_hndlr_lcpy_1, (p[B])	st8 [dst_pre_mem] = v[B], 128)		// M2
21662306a36Sopenharmony_ci	br.ctop.sptk .prefetch_loop
21762306a36Sopenharmony_ci	;;
21862306a36Sopenharmony_ci	cmp.eq p16, p0 = r0, r0			// reset p16 to 1
21962306a36Sopenharmony_ci	mov ar.lc = cnt
22062306a36Sopenharmony_ci	mov ar.ec = N				// # of stages in pipeline
22162306a36Sopenharmony_ci	;;
22262306a36Sopenharmony_ci.line_copy:
22362306a36Sopenharmony_ciEX(.ex_handler,	(p[D])	ld8 t2 = [src0], 3*8)			// M0
22462306a36Sopenharmony_ciEK(.ex_handler,	(p[D])	ld8 t4 = [src1], 3*8)			// M1
22562306a36Sopenharmony_ciEX(.ex_handler_lcpy,	(p[B])	st8 [dst_pre_mem] = v[B], 128)		// M2 prefetch dst from memory
22662306a36Sopenharmony_ciEK(.ex_handler_lcpy,	(p[D])	st8 [dst_pre_l2] = n8, 128)		// M3 prefetch dst from L2
22762306a36Sopenharmony_ci	;;
22862306a36Sopenharmony_ciEX(.ex_handler_lcpy,	(p[A])	ld8 v[A] = [src_pre_mem], 128)		// M0 prefetch src from memory
22962306a36Sopenharmony_ciEK(.ex_handler_lcpy,	(p[C])	ld8 n8 = [src_pre_l2], 128)		// M1 prefetch src from L2
23062306a36Sopenharmony_ciEX(.ex_handler,	(p[D])	st8 [dst0] =  t1, 8)			// M2
23162306a36Sopenharmony_ciEK(.ex_handler,	(p[D])	st8 [dst1] =  t3, 8)			// M3
23262306a36Sopenharmony_ci	;;
23362306a36Sopenharmony_ciEX(.ex_handler,	(p[D])	ld8  t5 = [src0], 8)
23462306a36Sopenharmony_ciEK(.ex_handler,	(p[D])	ld8  t7 = [src1], 3*8)
23562306a36Sopenharmony_ciEX(.ex_handler,	(p[D])	st8 [dst0] =  t2, 3*8)
23662306a36Sopenharmony_ciEK(.ex_handler,	(p[D])	st8 [dst1] =  t4, 3*8)
23762306a36Sopenharmony_ci	;;
23862306a36Sopenharmony_ciEX(.ex_handler,	(p[D])	ld8  t6 = [src0], 3*8)
23962306a36Sopenharmony_ciEK(.ex_handler,	(p[D])	ld8 t10 = [src1], 8)
24062306a36Sopenharmony_ciEX(.ex_handler,	(p[D])	st8 [dst0] =  t5, 8)
24162306a36Sopenharmony_ciEK(.ex_handler,	(p[D])	st8 [dst1] =  t7, 3*8)
24262306a36Sopenharmony_ci	;;
24362306a36Sopenharmony_ciEX(.ex_handler,	(p[D])	ld8  t9 = [src0], 3*8)
24462306a36Sopenharmony_ciEK(.ex_handler,	(p[D])	ld8 t11 = [src1], 3*8)
24562306a36Sopenharmony_ciEX(.ex_handler,	(p[D])	st8 [dst0] =  t6, 3*8)
24662306a36Sopenharmony_ciEK(.ex_handler,	(p[D])	st8 [dst1] = t10, 8)
24762306a36Sopenharmony_ci	;;
24862306a36Sopenharmony_ciEX(.ex_handler,	(p[D])	ld8 t12 = [src0], 8)
24962306a36Sopenharmony_ciEK(.ex_handler,	(p[D])	ld8 t14 = [src1], 8)
25062306a36Sopenharmony_ciEX(.ex_handler,	(p[D])	st8 [dst0] =  t9, 3*8)
25162306a36Sopenharmony_ciEK(.ex_handler,	(p[D])	st8 [dst1] = t11, 3*8)
25262306a36Sopenharmony_ci	;;
25362306a36Sopenharmony_ciEX(.ex_handler,	(p[D])	ld8 t13 = [src0], 4*8)
25462306a36Sopenharmony_ciEK(.ex_handler,	(p[D])	ld8 t15 = [src1], 4*8)
25562306a36Sopenharmony_ciEX(.ex_handler,	(p[D])	st8 [dst0] = t12, 8)
25662306a36Sopenharmony_ciEK(.ex_handler,	(p[D])	st8 [dst1] = t14, 8)
25762306a36Sopenharmony_ci	;;
25862306a36Sopenharmony_ciEX(.ex_handler,	(p[C])	ld8  t1 = [src0], 8)
25962306a36Sopenharmony_ciEK(.ex_handler,	(p[C])	ld8  t3 = [src1], 8)
26062306a36Sopenharmony_ciEX(.ex_handler,	(p[D])	st8 [dst0] = t13, 4*8)
26162306a36Sopenharmony_ciEK(.ex_handler,	(p[D])	st8 [dst1] = t15, 4*8)
26262306a36Sopenharmony_ci	br.ctop.sptk .line_copy
26362306a36Sopenharmony_ci	;;
26462306a36Sopenharmony_ci
26562306a36Sopenharmony_ci	add dst0=-8,dst0
26662306a36Sopenharmony_ci	add src0=-8,src0
26762306a36Sopenharmony_ci	mov in2=tmp
26862306a36Sopenharmony_ci	.restore sp
26962306a36Sopenharmony_ci	br.sptk.many .medium_copy
27062306a36Sopenharmony_ci	;;
27162306a36Sopenharmony_ci
27262306a36Sopenharmony_ci#define BLOCK_SIZE	128*32
27362306a36Sopenharmony_ci#define blocksize	r23
27462306a36Sopenharmony_ci#define curlen		r24
27562306a36Sopenharmony_ci
27662306a36Sopenharmony_ci// dest is on 8-byte boundary, src is not. We need to do
27762306a36Sopenharmony_ci// ld8-ld8, shrp, then st8.  Max 8 byte copy per cycle.
27862306a36Sopenharmony_ci.unaligned_src:
27962306a36Sopenharmony_ci	.prologue
28062306a36Sopenharmony_ci	.save ar.pfs, saved_pfs
28162306a36Sopenharmony_ci	alloc	saved_pfs=ar.pfs,3,5,0,8
28262306a36Sopenharmony_ci	.save ar.lc, saved_lc
28362306a36Sopenharmony_ci	mov	saved_lc=ar.lc
28462306a36Sopenharmony_ci	.save pr, saved_pr
28562306a36Sopenharmony_ci	mov	saved_pr=pr
28662306a36Sopenharmony_ci	.body
28762306a36Sopenharmony_ci.4k_block:
28862306a36Sopenharmony_ci	mov	saved_in0=dst0	// need to save all input arguments
28962306a36Sopenharmony_ci	mov	saved_in2=in2
29062306a36Sopenharmony_ci	mov	blocksize=BLOCK_SIZE
29162306a36Sopenharmony_ci	;;
29262306a36Sopenharmony_ci	cmp.lt	p6,p7=blocksize,in2
29362306a36Sopenharmony_ci	mov	saved_in1=src0
29462306a36Sopenharmony_ci	;;
29562306a36Sopenharmony_ci(p6)	mov	in2=blocksize
29662306a36Sopenharmony_ci	;;
29762306a36Sopenharmony_ci	shr.u	r21=in2,7	// this much cache line
29862306a36Sopenharmony_ci	shr.u	r22=in2,4	// number of 16-byte iteration
29962306a36Sopenharmony_ci	and	curlen=15,in2	// copy length after iteration
30062306a36Sopenharmony_ci	and	r30=7,src0	// source alignment
30162306a36Sopenharmony_ci	;;
30262306a36Sopenharmony_ci	cmp.lt	p7,p8=1,r21
30362306a36Sopenharmony_ci	add	cnt=-1,r21
30462306a36Sopenharmony_ci	;;
30562306a36Sopenharmony_ci
30662306a36Sopenharmony_ci	add	src_pre_mem=0,src0	// prefetch src pointer
30762306a36Sopenharmony_ci	add	dst_pre_mem=0,dst0	// prefetch dest pointer
30862306a36Sopenharmony_ci	and	src0=-8,src0		// 1st src pointer
30962306a36Sopenharmony_ci(p7)	mov	ar.lc = cnt
31062306a36Sopenharmony_ci(p8)	mov	ar.lc = r0
31162306a36Sopenharmony_ci	;;
31262306a36Sopenharmony_ci	TEXT_ALIGN(32)
31362306a36Sopenharmony_ci1:	lfetch.fault	  [src_pre_mem], 128
31462306a36Sopenharmony_ci	lfetch.fault.excl [dst_pre_mem], 128
31562306a36Sopenharmony_ci	br.cloop.dptk.few 1b
31662306a36Sopenharmony_ci	;;
31762306a36Sopenharmony_ci
31862306a36Sopenharmony_ci	shladd	dst1=r22,3,dst0	// 2nd dest pointer
31962306a36Sopenharmony_ci	shladd	src1=r22,3,src0	// 2nd src pointer
32062306a36Sopenharmony_ci	cmp.eq	p8,p9=r22,r0	// do we really need to loop?
32162306a36Sopenharmony_ci	cmp.le	p6,p7=8,curlen;	// have at least 8 byte remaining?
32262306a36Sopenharmony_ci	add	cnt=-1,r22	// ctop iteration adjustment
32362306a36Sopenharmony_ci	;;
32462306a36Sopenharmony_ciEX(.ex_handler, (p9)	ld8	r33=[src0],8)	// loop primer
32562306a36Sopenharmony_ciEK(.ex_handler, (p9)	ld8	r37=[src1],8)
32662306a36Sopenharmony_ci(p8)	br.dpnt.few .noloop
32762306a36Sopenharmony_ci	;;
32862306a36Sopenharmony_ci
32962306a36Sopenharmony_ci// The jump address is calculated based on src alignment. The COPYU
33062306a36Sopenharmony_ci// macro below need to confine its size to power of two, so an entry
33162306a36Sopenharmony_ci// can be caulated using shl instead of an expensive multiply. The
33262306a36Sopenharmony_ci// size is then hard coded by the following #define to match the
33362306a36Sopenharmony_ci// actual size.  This make it somewhat tedious when COPYU macro gets
33462306a36Sopenharmony_ci// changed and this need to be adjusted to match.
33562306a36Sopenharmony_ci#define LOOP_SIZE 6
33662306a36Sopenharmony_ci1:
33762306a36Sopenharmony_ci	mov	r29=ip		// jmp_table thread
33862306a36Sopenharmony_ci	mov	ar.lc=cnt
33962306a36Sopenharmony_ci	;;
34062306a36Sopenharmony_ci	add	r29=.jump_table - 1b - (.jmp1-.jump_table), r29
34162306a36Sopenharmony_ci	shl	r28=r30, LOOP_SIZE	// jmp_table thread
34262306a36Sopenharmony_ci	mov	ar.ec=2		// loop setup
34362306a36Sopenharmony_ci	;;
34462306a36Sopenharmony_ci	add	r29=r29,r28		// jmp_table thread
34562306a36Sopenharmony_ci	cmp.eq	p16,p17=r0,r0
34662306a36Sopenharmony_ci	;;
34762306a36Sopenharmony_ci	mov	b6=r29			// jmp_table thread
34862306a36Sopenharmony_ci	;;
34962306a36Sopenharmony_ci	br.cond.sptk.few b6
35062306a36Sopenharmony_ci
35162306a36Sopenharmony_ci// for 8-15 byte case
35262306a36Sopenharmony_ci// We will skip the loop, but need to replicate the side effect
35362306a36Sopenharmony_ci// that the loop produces.
35462306a36Sopenharmony_ci.noloop:
35562306a36Sopenharmony_ciEX(.ex_handler, (p6)	ld8	r37=[src1],8)
35662306a36Sopenharmony_ci	add	src0=8,src0
35762306a36Sopenharmony_ci(p6)	shl	r25=r30,3
35862306a36Sopenharmony_ci	;;
35962306a36Sopenharmony_ciEX(.ex_handler, (p6)	ld8	r27=[src1])
36062306a36Sopenharmony_ci(p6)	shr.u	r28=r37,r25
36162306a36Sopenharmony_ci(p6)	sub	r26=64,r25
36262306a36Sopenharmony_ci	;;
36362306a36Sopenharmony_ci(p6)	shl	r27=r27,r26
36462306a36Sopenharmony_ci	;;
36562306a36Sopenharmony_ci(p6)	or	r21=r28,r27
36662306a36Sopenharmony_ci
36762306a36Sopenharmony_ci.unaligned_src_tail:
36862306a36Sopenharmony_ci/* check if we have more than blocksize to copy, if so go back */
36962306a36Sopenharmony_ci	cmp.gt	p8,p0=saved_in2,blocksize
37062306a36Sopenharmony_ci	;;
37162306a36Sopenharmony_ci(p8)	add	dst0=saved_in0,blocksize
37262306a36Sopenharmony_ci(p8)	add	src0=saved_in1,blocksize
37362306a36Sopenharmony_ci(p8)	sub	in2=saved_in2,blocksize
37462306a36Sopenharmony_ci(p8)	br.dpnt	.4k_block
37562306a36Sopenharmony_ci	;;
37662306a36Sopenharmony_ci
37762306a36Sopenharmony_ci/* we have up to 15 byte to copy in the tail.
37862306a36Sopenharmony_ci * part of work is already done in the jump table code
37962306a36Sopenharmony_ci * we are at the following state.
38062306a36Sopenharmony_ci * src side:
38162306a36Sopenharmony_ci *
38262306a36Sopenharmony_ci *   xxxxxx xx                   <----- r21 has xxxxxxxx already
38362306a36Sopenharmony_ci * -------- -------- --------
38462306a36Sopenharmony_ci * 0        8        16
38562306a36Sopenharmony_ci *          ^
38662306a36Sopenharmony_ci *          |
38762306a36Sopenharmony_ci *          src1
38862306a36Sopenharmony_ci *
38962306a36Sopenharmony_ci * dst
39062306a36Sopenharmony_ci * -------- -------- --------
39162306a36Sopenharmony_ci * ^
39262306a36Sopenharmony_ci * |
39362306a36Sopenharmony_ci * dst1
39462306a36Sopenharmony_ci */
39562306a36Sopenharmony_ciEX(.ex_handler, (p6)	st8	[dst1]=r21,8)	// more than 8 byte to copy
39662306a36Sopenharmony_ci(p6)	add	curlen=-8,curlen	// update length
39762306a36Sopenharmony_ci	mov	ar.pfs=saved_pfs
39862306a36Sopenharmony_ci	;;
39962306a36Sopenharmony_ci	mov	ar.lc=saved_lc
40062306a36Sopenharmony_ci	mov	pr=saved_pr,-1
40162306a36Sopenharmony_ci	mov	in2=curlen	// remaining length
40262306a36Sopenharmony_ci	mov	dst0=dst1	// dest pointer
40362306a36Sopenharmony_ci	add	src0=src1,r30	// forward by src alignment
40462306a36Sopenharmony_ci	;;
40562306a36Sopenharmony_ci
40662306a36Sopenharmony_ci// 7 byte or smaller.
40762306a36Sopenharmony_ci.memcpy_short:
40862306a36Sopenharmony_ci	cmp.le	p8,p9   = 1,in2
40962306a36Sopenharmony_ci	cmp.le	p10,p11 = 2,in2
41062306a36Sopenharmony_ci	cmp.le	p12,p13 = 3,in2
41162306a36Sopenharmony_ci	cmp.le	p14,p15 = 4,in2
41262306a36Sopenharmony_ci	add	src1=1,src0	// second src pointer
41362306a36Sopenharmony_ci	add	dst1=1,dst0	// second dest pointer
41462306a36Sopenharmony_ci	;;
41562306a36Sopenharmony_ci
41662306a36Sopenharmony_ciEX(.ex_handler_short, (p8)	ld1	t1=[src0],2)
41762306a36Sopenharmony_ciEK(.ex_handler_short, (p10)	ld1	t2=[src1],2)
41862306a36Sopenharmony_ci(p9)	br.ret.dpnt rp		// 0 byte copy
41962306a36Sopenharmony_ci	;;
42062306a36Sopenharmony_ci
42162306a36Sopenharmony_ciEX(.ex_handler_short, (p8)	st1	[dst0]=t1,2)
42262306a36Sopenharmony_ciEK(.ex_handler_short, (p10)	st1	[dst1]=t2,2)
42362306a36Sopenharmony_ci(p11)	br.ret.dpnt rp		// 1 byte copy
42462306a36Sopenharmony_ci
42562306a36Sopenharmony_ciEX(.ex_handler_short, (p12)	ld1	t3=[src0],2)
42662306a36Sopenharmony_ciEK(.ex_handler_short, (p14)	ld1	t4=[src1],2)
42762306a36Sopenharmony_ci(p13)	br.ret.dpnt rp		// 2 byte copy
42862306a36Sopenharmony_ci	;;
42962306a36Sopenharmony_ci
43062306a36Sopenharmony_ci	cmp.le	p6,p7   = 5,in2
43162306a36Sopenharmony_ci	cmp.le	p8,p9   = 6,in2
43262306a36Sopenharmony_ci	cmp.le	p10,p11 = 7,in2
43362306a36Sopenharmony_ci
43462306a36Sopenharmony_ciEX(.ex_handler_short, (p12)	st1	[dst0]=t3,2)
43562306a36Sopenharmony_ciEK(.ex_handler_short, (p14)	st1	[dst1]=t4,2)
43662306a36Sopenharmony_ci(p15)	br.ret.dpnt rp		// 3 byte copy
43762306a36Sopenharmony_ci	;;
43862306a36Sopenharmony_ci
43962306a36Sopenharmony_ciEX(.ex_handler_short, (p6)	ld1	t5=[src0],2)
44062306a36Sopenharmony_ciEK(.ex_handler_short, (p8)	ld1	t6=[src1],2)
44162306a36Sopenharmony_ci(p7)	br.ret.dpnt rp		// 4 byte copy
44262306a36Sopenharmony_ci	;;
44362306a36Sopenharmony_ci
44462306a36Sopenharmony_ciEX(.ex_handler_short, (p6)	st1	[dst0]=t5,2)
44562306a36Sopenharmony_ciEK(.ex_handler_short, (p8)	st1	[dst1]=t6,2)
44662306a36Sopenharmony_ci(p9)	br.ret.dptk rp		// 5 byte copy
44762306a36Sopenharmony_ci
44862306a36Sopenharmony_ciEX(.ex_handler_short, (p10)	ld1	t7=[src0],2)
44962306a36Sopenharmony_ci(p11)	br.ret.dptk rp		// 6 byte copy
45062306a36Sopenharmony_ci	;;
45162306a36Sopenharmony_ci
45262306a36Sopenharmony_ciEX(.ex_handler_short, (p10)	st1	[dst0]=t7,2)
45362306a36Sopenharmony_ci	br.ret.dptk rp		// done all cases
45462306a36Sopenharmony_ci
45562306a36Sopenharmony_ci
45662306a36Sopenharmony_ci/* Align dest to nearest 8-byte boundary. We know we have at
45762306a36Sopenharmony_ci * least 7 bytes to copy, enough to crawl to 8-byte boundary.
45862306a36Sopenharmony_ci * Actual number of byte to crawl depend on the dest alignment.
45962306a36Sopenharmony_ci * 7 byte or less is taken care at .memcpy_short
46062306a36Sopenharmony_ci
46162306a36Sopenharmony_ci * src0 - source even index
46262306a36Sopenharmony_ci * src1 - source  odd index
46362306a36Sopenharmony_ci * dst0 - dest even index
46462306a36Sopenharmony_ci * dst1 - dest  odd index
46562306a36Sopenharmony_ci * r30  - distance to 8-byte boundary
46662306a36Sopenharmony_ci */
46762306a36Sopenharmony_ci
46862306a36Sopenharmony_ci.align_dest:
46962306a36Sopenharmony_ci	add	src1=1,in1	// source odd index
47062306a36Sopenharmony_ci	cmp.le	p7,p0 = 2,r30	// for .align_dest
47162306a36Sopenharmony_ci	cmp.le	p8,p0 = 3,r30	// for .align_dest
47262306a36Sopenharmony_ciEX(.ex_handler_short, (p6)	ld1	t1=[src0],2)
47362306a36Sopenharmony_ci	cmp.le	p9,p0 = 4,r30	// for .align_dest
47462306a36Sopenharmony_ci	cmp.le	p10,p0 = 5,r30
47562306a36Sopenharmony_ci	;;
47662306a36Sopenharmony_ciEX(.ex_handler_short, (p7)	ld1	t2=[src1],2)
47762306a36Sopenharmony_ciEK(.ex_handler_short, (p8)	ld1	t3=[src0],2)
47862306a36Sopenharmony_ci	cmp.le	p11,p0 = 6,r30
47962306a36Sopenharmony_ciEX(.ex_handler_short, (p6)	st1	[dst0] = t1,2)
48062306a36Sopenharmony_ci	cmp.le	p12,p0 = 7,r30
48162306a36Sopenharmony_ci	;;
48262306a36Sopenharmony_ciEX(.ex_handler_short, (p9)	ld1	t4=[src1],2)
48362306a36Sopenharmony_ciEK(.ex_handler_short, (p10)	ld1	t5=[src0],2)
48462306a36Sopenharmony_ciEX(.ex_handler_short, (p7)	st1	[dst1] = t2,2)
48562306a36Sopenharmony_ciEK(.ex_handler_short, (p8)	st1	[dst0] = t3,2)
48662306a36Sopenharmony_ci	;;
48762306a36Sopenharmony_ciEX(.ex_handler_short, (p11)	ld1	t6=[src1],2)
48862306a36Sopenharmony_ciEK(.ex_handler_short, (p12)	ld1	t7=[src0],2)
48962306a36Sopenharmony_ci	cmp.eq	p6,p7=r28,r29
49062306a36Sopenharmony_ciEX(.ex_handler_short, (p9)	st1	[dst1] = t4,2)
49162306a36Sopenharmony_ciEK(.ex_handler_short, (p10)	st1	[dst0] = t5,2)
49262306a36Sopenharmony_ci	sub	in2=in2,r30
49362306a36Sopenharmony_ci	;;
49462306a36Sopenharmony_ciEX(.ex_handler_short, (p11)	st1	[dst1] = t6,2)
49562306a36Sopenharmony_ciEK(.ex_handler_short, (p12)	st1	[dst0] = t7)
49662306a36Sopenharmony_ci	add	dst0=in0,r30	// setup arguments
49762306a36Sopenharmony_ci	add	src0=in1,r30
49862306a36Sopenharmony_ci(p6)	br.cond.dptk .aligned_src
49962306a36Sopenharmony_ci(p7)	br.cond.dpnt .unaligned_src
50062306a36Sopenharmony_ci	;;
50162306a36Sopenharmony_ci
50262306a36Sopenharmony_ci/* main loop body in jump table format */
50362306a36Sopenharmony_ci#define COPYU(shift)									\
50462306a36Sopenharmony_ci1:											\
50562306a36Sopenharmony_ciEX(.ex_handler,  (p16)	ld8	r32=[src0],8);		/* 1 */				\
50662306a36Sopenharmony_ciEK(.ex_handler,  (p16)	ld8	r36=[src1],8);						\
50762306a36Sopenharmony_ci		 (p17)	shrp	r35=r33,r34,shift;;	/* 1 */				\
50862306a36Sopenharmony_ciEX(.ex_handler,  (p6)	ld8	r22=[src1]);	/* common, prime for tail section */	\
50962306a36Sopenharmony_ci		 nop.m	0;								\
51062306a36Sopenharmony_ci		 (p16)	shrp	r38=r36,r37,shift;					\
51162306a36Sopenharmony_ciEX(.ex_handler,  (p17)	st8	[dst0]=r35,8);		/* 1 */				\
51262306a36Sopenharmony_ciEK(.ex_handler,  (p17)	st8	[dst1]=r39,8);						\
51362306a36Sopenharmony_ci		 br.ctop.dptk.few 1b;;							\
51462306a36Sopenharmony_ci		 (p7)	add	src1=-8,src1;	/* back out for <8 byte case */		\
51562306a36Sopenharmony_ci		 shrp	r21=r22,r38,shift;	/* speculative work */			\
51662306a36Sopenharmony_ci		 br.sptk.few .unaligned_src_tail /* branch out of jump table */		\
51762306a36Sopenharmony_ci		 ;;
51862306a36Sopenharmony_ci	TEXT_ALIGN(32)
51962306a36Sopenharmony_ci.jump_table:
52062306a36Sopenharmony_ci	COPYU(8)	// unaligned cases
52162306a36Sopenharmony_ci.jmp1:
52262306a36Sopenharmony_ci	COPYU(16)
52362306a36Sopenharmony_ci	COPYU(24)
52462306a36Sopenharmony_ci	COPYU(32)
52562306a36Sopenharmony_ci	COPYU(40)
52662306a36Sopenharmony_ci	COPYU(48)
52762306a36Sopenharmony_ci	COPYU(56)
52862306a36Sopenharmony_ci
52962306a36Sopenharmony_ci#undef A
53062306a36Sopenharmony_ci#undef B
53162306a36Sopenharmony_ci#undef C
53262306a36Sopenharmony_ci#undef D
53362306a36Sopenharmony_ci
53462306a36Sopenharmony_ci/*
53562306a36Sopenharmony_ci * Due to lack of local tag support in gcc 2.x assembler, it is not clear which
53662306a36Sopenharmony_ci * instruction failed in the bundle.  The exception algorithm is that we
53762306a36Sopenharmony_ci * first figure out the faulting address, then detect if there is any
53862306a36Sopenharmony_ci * progress made on the copy, if so, redo the copy from last known copied
53962306a36Sopenharmony_ci * location up to the faulting address (exclusive). In the copy_from_user
54062306a36Sopenharmony_ci * case, remaining byte in kernel buffer will be zeroed.
54162306a36Sopenharmony_ci *
54262306a36Sopenharmony_ci * Take copy_from_user as an example, in the code there are multiple loads
54362306a36Sopenharmony_ci * in a bundle and those multiple loads could span over two pages, the
54462306a36Sopenharmony_ci * faulting address is calculated as page_round_down(max(src0, src1)).
54562306a36Sopenharmony_ci * This is based on knowledge that if we can access one byte in a page, we
54662306a36Sopenharmony_ci * can access any byte in that page.
54762306a36Sopenharmony_ci *
54862306a36Sopenharmony_ci * predicate used in the exception handler:
54962306a36Sopenharmony_ci * p6-p7: direction
55062306a36Sopenharmony_ci * p10-p11: src faulting addr calculation
55162306a36Sopenharmony_ci * p12-p13: dst faulting addr calculation
55262306a36Sopenharmony_ci */
55362306a36Sopenharmony_ci
55462306a36Sopenharmony_ci#define A	r19
55562306a36Sopenharmony_ci#define B	r20
55662306a36Sopenharmony_ci#define C	r21
55762306a36Sopenharmony_ci#define D	r22
55862306a36Sopenharmony_ci#define F	r28
55962306a36Sopenharmony_ci
56062306a36Sopenharmony_ci#define saved_retval	loc0
56162306a36Sopenharmony_ci#define saved_rtlink	loc1
56262306a36Sopenharmony_ci#define saved_pfs_stack	loc2
56362306a36Sopenharmony_ci
56462306a36Sopenharmony_ci.ex_hndlr_s:
56562306a36Sopenharmony_ci	add	src0=8,src0
56662306a36Sopenharmony_ci	br.sptk .ex_handler
56762306a36Sopenharmony_ci	;;
56862306a36Sopenharmony_ci.ex_hndlr_d:
56962306a36Sopenharmony_ci	add	dst0=8,dst0
57062306a36Sopenharmony_ci	br.sptk .ex_handler
57162306a36Sopenharmony_ci	;;
57262306a36Sopenharmony_ci.ex_hndlr_lcpy_1:
57362306a36Sopenharmony_ci	mov	src1=src_pre_mem
57462306a36Sopenharmony_ci	mov	dst1=dst_pre_mem
57562306a36Sopenharmony_ci	cmp.gtu	p10,p11=src_pre_mem,saved_in1
57662306a36Sopenharmony_ci	cmp.gtu	p12,p13=dst_pre_mem,saved_in0
57762306a36Sopenharmony_ci	;;
57862306a36Sopenharmony_ci(p10)	add	src0=8,saved_in1
57962306a36Sopenharmony_ci(p11)	mov	src0=saved_in1
58062306a36Sopenharmony_ci(p12)	add	dst0=8,saved_in0
58162306a36Sopenharmony_ci(p13)	mov	dst0=saved_in0
58262306a36Sopenharmony_ci	br.sptk	.ex_handler
58362306a36Sopenharmony_ci.ex_handler_lcpy:
58462306a36Sopenharmony_ci	// in line_copy block, the preload addresses should always ahead
58562306a36Sopenharmony_ci	// of the other two src/dst pointers.  Furthermore, src1/dst1 should
58662306a36Sopenharmony_ci	// always ahead of src0/dst0.
58762306a36Sopenharmony_ci	mov	src1=src_pre_mem
58862306a36Sopenharmony_ci	mov	dst1=dst_pre_mem
58962306a36Sopenharmony_ci.ex_handler:
59062306a36Sopenharmony_ci	mov	pr=saved_pr,-1		// first restore pr, lc, and pfs
59162306a36Sopenharmony_ci	mov	ar.lc=saved_lc
59262306a36Sopenharmony_ci	mov	ar.pfs=saved_pfs
59362306a36Sopenharmony_ci	;;
59462306a36Sopenharmony_ci.ex_handler_short: // fault occurred in these sections didn't change pr, lc, pfs
59562306a36Sopenharmony_ci	cmp.ltu	p6,p7=saved_in0, saved_in1	// get the copy direction
59662306a36Sopenharmony_ci	cmp.ltu	p10,p11=src0,src1
59762306a36Sopenharmony_ci	cmp.ltu	p12,p13=dst0,dst1
59862306a36Sopenharmony_ci	fcmp.eq	p8,p0=f6,f0		// is it memcpy?
59962306a36Sopenharmony_ci	mov	tmp = dst0
60062306a36Sopenharmony_ci	;;
60162306a36Sopenharmony_ci(p11)	mov	src1 = src0		// pick the larger of the two
60262306a36Sopenharmony_ci(p13)	mov	dst0 = dst1		// make dst0 the smaller one
60362306a36Sopenharmony_ci(p13)	mov	dst1 = tmp		// and dst1 the larger one
60462306a36Sopenharmony_ci	;;
60562306a36Sopenharmony_ci(p6)	dep	F = r0,dst1,0,PAGE_SHIFT // usr dst round down to page boundary
60662306a36Sopenharmony_ci(p7)	dep	F = r0,src1,0,PAGE_SHIFT // usr src round down to page boundary
60762306a36Sopenharmony_ci	;;
60862306a36Sopenharmony_ci(p6)	cmp.le	p14,p0=dst0,saved_in0	// no progress has been made on store
60962306a36Sopenharmony_ci(p7)	cmp.le	p14,p0=src0,saved_in1	// no progress has been made on load
61062306a36Sopenharmony_ci	mov	retval=saved_in2
61162306a36Sopenharmony_ci(p8)	ld1	tmp=[src1]		// force an oops for memcpy call
61262306a36Sopenharmony_ci(p8)	st1	[dst1]=r0		// force an oops for memcpy call
61362306a36Sopenharmony_ci(p14)	br.ret.sptk.many rp
61462306a36Sopenharmony_ci
61562306a36Sopenharmony_ci/*
61662306a36Sopenharmony_ci * The remaining byte to copy is calculated as:
61762306a36Sopenharmony_ci *
61862306a36Sopenharmony_ci * A =	(faulting_addr - orig_src)	-> len to faulting ld address
61962306a36Sopenharmony_ci *	or
62062306a36Sopenharmony_ci * 	(faulting_addr - orig_dst)	-> len to faulting st address
62162306a36Sopenharmony_ci * B =	(cur_dst - orig_dst)		-> len copied so far
62262306a36Sopenharmony_ci * C =	A - B				-> len need to be copied
62362306a36Sopenharmony_ci * D =	orig_len - A			-> len need to be left along
62462306a36Sopenharmony_ci */
62562306a36Sopenharmony_ci(p6)	sub	A = F, saved_in0
62662306a36Sopenharmony_ci(p7)	sub	A = F, saved_in1
62762306a36Sopenharmony_ci	clrrrb
62862306a36Sopenharmony_ci	;;
62962306a36Sopenharmony_ci	alloc	saved_pfs_stack=ar.pfs,3,3,3,0
63062306a36Sopenharmony_ci	cmp.lt	p8,p0=A,r0
63162306a36Sopenharmony_ci	sub	B = dst0, saved_in0	// how many byte copied so far
63262306a36Sopenharmony_ci	;;
63362306a36Sopenharmony_ci(p8)	mov	A = 0;			// A shouldn't be negative, cap it
63462306a36Sopenharmony_ci	;;
63562306a36Sopenharmony_ci	sub	C = A, B
63662306a36Sopenharmony_ci	sub	D = saved_in2, A
63762306a36Sopenharmony_ci	;;
63862306a36Sopenharmony_ci	cmp.gt	p8,p0=C,r0		// more than 1 byte?
63962306a36Sopenharmony_ci	mov	r8=0
64062306a36Sopenharmony_ci	mov	saved_retval = D
64162306a36Sopenharmony_ci	mov	saved_rtlink = b0
64262306a36Sopenharmony_ci
64362306a36Sopenharmony_ci	add	out0=saved_in0, B
64462306a36Sopenharmony_ci	add	out1=saved_in1, B
64562306a36Sopenharmony_ci	mov	out2=C
64662306a36Sopenharmony_ci(p8)	br.call.sptk.few b0=__copy_user	// recursive call
64762306a36Sopenharmony_ci	;;
64862306a36Sopenharmony_ci
64962306a36Sopenharmony_ci	add	saved_retval=saved_retval,r8	// above might return non-zero value
65062306a36Sopenharmony_ci	;;
65162306a36Sopenharmony_ci
65262306a36Sopenharmony_ci	mov	retval=saved_retval
65362306a36Sopenharmony_ci	mov	ar.pfs=saved_pfs_stack
65462306a36Sopenharmony_ci	mov	b0=saved_rtlink
65562306a36Sopenharmony_ci	br.ret.sptk.many rp
65662306a36Sopenharmony_ci
65762306a36Sopenharmony_ci/* end of McKinley specific optimization */
65862306a36Sopenharmony_ciEND(__copy_user)
65962306a36Sopenharmony_ciEXPORT_SYMBOL(__copy_user)
660