/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#endif

/*
 * Endian-dependent helpers used by the unaligned VMX copy loop:
 *
 * LVS()   - build the permute control vector from an address
 *           (lvsl on big endian, lvsr on little endian).
 * VPERM() - combine two source vectors through a control vector; on
 *           little endian the two source vectors are handed to vperm
 *           in swapped order.
 */
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

/*
 * memcpy_power7
 *
 * In:	r3 = destination, r4 = source, r5 = length in bytes
 * Out:	r3 = the destination pointer passed in (every exit path reloads
 *	it from the stack slot written below)
 *
 * Dispatch on length:
 *	length < 16	-> .Lshort_copy
 *	length > 4096	-> .Lvmx_copy (only inside the CPU_FTR_ALTIVEC
 *			   feature section, with CONFIG_ALTIVEC)
 *	otherwise	-> fall through to .Lnonvmx_copy
 */
_GLOBAL(memcpy_power7)
	cmpldi	r5,16			/* cr0: length vs 16 */
	cmpldi	cr1,r5,4096		/* cr1: length vs 4096 */
	/*
	 * Stash the destination below the stack pointer. Once a frame of
	 * STACKFRAMESIZE is pushed this slot is STK_REG(R31)(r1).
	 */
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	blt	.Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt	cr1, .Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

/*
 * GPR-only copy path for lengths of 16 bytes and up.
 *
 * r3 = destination cursor, r4 = source cursor, r5 = bytes remaining.
 * On exit fewer than 16 bytes remain (count in r5) and execution falls
 * through into the code that follows.
 *
 * Each mtocrf 0x01 below moves the low four bits of a count into cr7;
 * cr7*4+0 .. cr7*4+3 then test the bits worth 8, 4, 2 and 1.
 */
.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)		/* r6 = 0..7 bytes to reach 8B alignment */

	bf	cr7*4+3,1f		/* 1 byte */
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f		/* 2 bytes */
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f		/* 4 bytes */
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6		/* account for the alignment bytes */
	cmpldi	r5,128
	blt	5f			/* less than one 128B block left */

	/*
	 * Push a stack frame and save r14-r21, which the 128B loop uses as
	 * additional copy registers. LR is stored at 16 bytes above the
	 * caller's stack pointer; this path never reloads it.
	 */
	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7			/* loop count = number of 128B blocks */
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	ld	r14,64(r4)
	ld	r15,72(r4)
	ld	r16,80(r4)
	ld	r17,88(r4)
	ld	r18,96(r4)
	ld	r19,104(r4)
	ld	r20,112(r4)
	ld	r21,120(r4)
	addi	r4,r4,128
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	std	r14,64(r3)
	std	r15,72(r3)
	std	r16,80(r3)
	std	r17,88(r3)
	std	r18,96(r3)
	std	r19,104(r3)
	std	r20,112(r3)
	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)		/* r5 = remaining bytes modulo 128 */

	/* Restore the saved registers and pop the frame. */
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4			/* cr7 bits now count 16B units */
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f		/* 64 bytes */
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	addi	r4,r4,64
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f		/* 32 bytes */
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	addi	r4,r4,32
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f		/* 16 bytes */
	ld	r0,0(r4)
	ld	r6,8(r4)
	addi	r4,r4,16
	std	r0,0(r3)
	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)		/* r5 = remaining bytes modulo 16 */

/*
 * Tail copy: up to 15B to go.
 *
 * r3 = destination cursor, r4 = source cursor, r5 = bytes remaining.
 * The low four bits of r5 are moved into cr7 and tested one at a time,
 * copying 8, 4, 2 and then 1 byte as required. Returns to the caller
 * with r3 reloaded from the stack slot below the stack pointer.
 */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f		/* 8 bytes */
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f		/* 4 bytes */
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f		/* 2 bytes */
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f		/* 1 byte */
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return value */
	blr

/*
 * Reached from the VMX path when the enter_vmx_ops call returned 0:
 * pop the stack frame that path pushed and use the GPR-only copy.
 */
.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

/*
 * VMX copy path, entered from the dispatch code for lengths above 4096
 * bytes. The label sits outside the CONFIG_ALTIVEC conditional; the
 * body is only assembled when CONFIG_ALTIVEC is defined.
 *
 * A stack frame is pushed around the enter_vmx_ops call and kept until
 * the final exit. If enter_vmx_ops returns 0 the copy is handed back to
 * the GPR-only path; if source and destination differ in their low four
 * address bits the permute variant is used instead of this one.
 *
 * Each mtocrf 0x01 below moves the low four bits of a count into cr7;
 * cr7*4+0 .. cr7*4+3 then test the bits worth 8, 4, 2 and 1.
 */
.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	mflr	r0
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)	/* save source */
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)	/* save length */
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	CFUNC(enter_vmx_ops)
	cmpwi	cr1,r3,0		/* result is consumed after the prefetch setup */
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)	/* destination, saved at function entry */
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7			/* source rounded down to 128B */
	clrrdi	r9,r3,7			/* destination rounded down to 128B */
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32		/* drop the sign extension left by lis */

	dcbt	0,r6,0b01000
	dcbt	0,r7,0b01010
	dcbtst	0,r9,0b01000
	dcbtst	0,r10,0b01010
	eieio
	dcbt	0,r8,0b01010	/* GO */

	/* enter_vmx_ops returned 0: fall back to the GPR-only copy */
	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)		/* low 4 bits of src ^ dest, sets cr0 */
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)		/* r6 = 0..15 bytes to reach 16B alignment */

	bf	cr7*4+3,1f		/* 1 byte */
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f		/* 2 bytes */
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f		/* 4 bytes */
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f		/* 8 bytes */
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4			/* cr7 bits now count 16B units */
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)		/* r6 = bytes to reach 128B alignment */

	/* Index registers for the indexed vector loads/stores */
	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f		/* 16 bytes */
	lvx	v1,0,r4
	addi	r4,r4,16
	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f		/* 32 bytes */
	lvx	v1,0,r4
	lvx	v0,r4,r9
	addi	r4,r4,32
	stvx	v1,0,r3
	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f		/* 64 bytes */
	lvx	v3,0,r4
	lvx	v2,r4,r9
	lvx	v1,r4,r10
	lvx	v0,r4,r11
	addi	r4,r4,64
	stvx	v3,0,r3
	stvx	v2,r3,r9
	stvx	v1,r3,r10
	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7			/* loop count = number of 128B blocks */

	/* r14-r16 become index registers; save them in the frame first */
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	v7,0,r4
	lvx	v6,r4,r9
	lvx	v5,r4,r10
	lvx	v4,r4,r11
	lvx	v3,r4,r12
	lvx	v2,r4,r14
	lvx	v1,r4,r15
	lvx	v0,r4,r16
	addi	r4,r4,128
	stvx	v7,0,r3
	stvx	v6,r3,r9
	stvx	v5,r3,r10
	stvx	v4,r3,r11
	stvx	v3,r3,r12
	stvx	v2,r3,r14
	stvx	v1,r3,r15
	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4			/* cr7 bits now count 16B units */
	mtocrf	0x01,r6

	bf	cr7*4+1,9f		/* 64 bytes */
	lvx	v3,0,r4
	lvx	v2,r4,r9
	lvx	v1,r4,r10
	lvx	v0,r4,r11
	addi	r4,r4,64
	stvx	v3,0,r3
	stvx	v2,r3,r9
	stvx	v1,r3,r10
	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f		/* 32 bytes */
	lvx	v1,0,r4
	lvx	v0,r4,r9
	addi	r4,r4,32
	stvx	v1,0,r3
	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f		/* 16 bytes */
	lvx	v1,0,r4
	addi	r4,r4,16
	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f		/* 8 bytes */
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f		/* 4 bytes */
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f		/* 2 bytes */
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f		/* 1 byte */
	lbz	r0,0(r4)
	stb	r0,0(r3)

	/*
	 * Pop the frame and reload the original destination into r3. LR was
	 * already restored after enter_vmx_ops, so the branch below does not
	 * return here.
	 */
15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	CFUNC(exit_vmx_ops)		/* tail call optimise */

/*
 * VMX copy for the case where source and destination differ in their
 * low four address bits.
 *
 * r3 = destination cursor, r4 = source cursor, r5 = bytes remaining;
 * the stack frame pushed by the VMX entry code is still in place.
 *
 * After the destination is 16B aligned, the source is read one quadword
 * ahead: v0 always holds the previously loaded quadword, v16 holds the
 * permute control vector built by LVS(), and each VPERM() combines the
 * previous and the newly loaded quadword into one vector that is stored
 * with stvx. r4 therefore runs 16 bytes ahead of the data actually
 * consumed until it is corrected before the final tail copy.
 *
 * Each mtocrf 0x01 below moves the low four bits of a count into cr7;
 * cr7*4+0 .. cr7*4+3 then test the bits worth 8, 4, 2 and 1.
 */
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)		/* r6 = 0..15 bytes to reach 16B alignment */

	bf	cr7*4+3,1f		/* 1 byte */
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f		/* 2 bytes */
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f		/* 4 bytes */
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f		/* 8 bytes */
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r7,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4			/* cr7 bits now count 16B units */
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)		/* r6 = bytes to reach 128B alignment */

	/* Index registers for the indexed vector loads/stores */
	li	r9,16
	li	r10,32
	li	r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
	lvx	v0,0,r4			/* prime v0 with the first source quadword */
	addi	r4,r4,16		/* from here on r4 runs 16B ahead */

	bf	cr7*4+3,5f		/* 16 bytes */
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1		/* v0 = v1: carry the last quadword forward */

5:	bf	cr7*4+2,6f		/* 32 bytes */
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
	stvx	v8,0,r3
	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f		/* 64 bytes */
	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7			/* loop count = number of 128B blocks */

	/* r14-r16 become index registers; save them in the frame first */
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
	lvx	v0,r4,r16		/* last load lands in v0 for the next pass */
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	stvx	v12,r3,r12
	stvx	v13,r3,r14
	stvx	v14,r3,r15
	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4			/* cr7 bits now count 16B units */
	mtocrf	0x01,r6

	bf	cr7*4+1,9f		/* 64 bytes */
	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f		/* 32 bytes */
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
	stvx	v8,0,r3
	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f		/* 16 bytes */
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f		/* 8 bytes */
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f		/* 4 bytes */
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f		/* 2 bytes */
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f		/* 1 byte */
	lbz	r0,0(r4)
	stb	r0,0(r3)

	/* Pop the frame, reload the original destination into r3 and leave */
15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	CFUNC(exit_vmx_ops)		/* tail call optimise */
#endif /* CONFIG_ALTIVEC */