162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0 */
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * "memcpy" implementation of SuperH
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * Copyright (C) 1999  Niibe Yutaka
662306a36Sopenharmony_ci * Copyright (c) 2002  STMicroelectronics Ltd
762306a36Sopenharmony_ci *   Modified from memcpy.S and micro-optimised for SH4
862306a36Sopenharmony_ci *   Stuart Menefy (stuart.menefy@st.com)
962306a36Sopenharmony_ci *
1062306a36Sopenharmony_ci */
1162306a36Sopenharmony_ci#include <linux/linkage.h>
1262306a36Sopenharmony_ci
1362306a36Sopenharmony_ci/*
1462306a36Sopenharmony_ci * void *memcpy(void *dst, const void *src, size_t n);
1562306a36Sopenharmony_ci *
1662306a36Sopenharmony_ci * It is assumed that there is no overlap between src and dst.
1762306a36Sopenharmony_ci * If there is an overlap, then the results are undefined.
1862306a36Sopenharmony_ci */
1962306a36Sopenharmony_ci
2062306a36Sopenharmony_ci	!
2162306a36Sopenharmony_ci	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
2262306a36Sopenharmony_ci	!
2362306a36Sopenharmony_ci
2462306a36Sopenharmony_ci	! Size is 16 or greater, and may have trailing bytes
2562306a36Sopenharmony_ci
2662306a36Sopenharmony_ci	.balign	32
2762306a36Sopenharmony_ci.Lcase1:
	! Reached from memcpy when (src - dst) & 3 == 1.  The copy runs
	! downwards: r0 is the current (long word aligned) destination
	! pointer, r4 is dst and r5 is src - dst, so r0+r5 is the source
	! address matching r0.  Every long word stored is assembled from two
	! neighbouring aligned source long words.
2862306a36Sopenharmony_ci	! Read a long word and write a long word at once
2962306a36Sopenharmony_ci	! At the start of each iteration, r7 contains last long load
	! r5 is biased by -1 so that r0+r5 is long word aligned for mov.l.
3062306a36Sopenharmony_ci	add	#-1,r5		!  79 EX
3162306a36Sopenharmony_ci	mov	r4,r2		!   5 MT (0 cycles latency)
3262306a36Sopenharmony_ci
3362306a36Sopenharmony_ci	mov.l	@(r0,r5),r7	!  21 LS (2 cycles latency)
3462306a36Sopenharmony_ci	add	#-4,r5		!  50 EX
3562306a36Sopenharmony_ci
	! r2 = dst + 7 is the loop limit.  cmp/hi tests r0 before the store in
	! the branch delay slot pre-decrements it by 4, so the loop stops once
	! fewer than 4 bytes remain above dst.
3662306a36Sopenharmony_ci	add	#7,r2		!  79 EX
3762306a36Sopenharmony_ci	!
3862306a36Sopenharmony_ci#ifdef CONFIG_CPU_LITTLE_ENDIAN
3962306a36Sopenharmony_ci	! 6 cycles, 4 bytes per iteration
4062306a36Sopenharmony_ci3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
4162306a36Sopenharmony_ci	mov	r7, r3		!   5 MT (latency=0)	! RQPO
4262306a36Sopenharmony_ci
4362306a36Sopenharmony_ci	cmp/hi	r2,r0		!  57 MT
4462306a36Sopenharmony_ci	shll16	r3		! 103 EX
4562306a36Sopenharmony_ci
4662306a36Sopenharmony_ci	mov	r1,r6		!   5 MT (latency=0)
4762306a36Sopenharmony_ci	shll8	r3		! 102 EX		! Oxxx
4862306a36Sopenharmony_ci
4962306a36Sopenharmony_ci	shlr8	r6		! 106 EX		! xNML
5062306a36Sopenharmony_ci	mov	r1, r7		!   5 MT (latency=0)
5162306a36Sopenharmony_ci
5262306a36Sopenharmony_ci	or	r6,r3		!  82 EX		! ONML
5362306a36Sopenharmony_ci	bt/s	3b		! 109 BR
5462306a36Sopenharmony_ci
5562306a36Sopenharmony_ci	 mov.l	r3,@-r0		!  30 LS
5662306a36Sopenharmony_ci#else
5762306a36Sopenharmony_ci3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! KLMN
5862306a36Sopenharmony_ci	mov	r7,r3		!   5 MT (latency=0)	! OPQR
5962306a36Sopenharmony_ci
6062306a36Sopenharmony_ci	cmp/hi	r2,r0		!  57 MT
6162306a36Sopenharmony_ci	shlr16	r3		! 107 EX
6262306a36Sopenharmony_ci
6362306a36Sopenharmony_ci	shlr8	r3		! 106 EX		! xxxO
6462306a36Sopenharmony_ci	mov	r1,r6		!   5 MT (latency=0)
6562306a36Sopenharmony_ci
6662306a36Sopenharmony_ci	shll8	r6		! 102 EX		! LMNx
6762306a36Sopenharmony_ci	mov	r1,r7		!   5 MT (latency=0)
6862306a36Sopenharmony_ci
6962306a36Sopenharmony_ci	or	r6,r3		!  82 EX		! LMNO
7062306a36Sopenharmony_ci	bt/s	3b		! 109 BR
7162306a36Sopenharmony_ci
7262306a36Sopenharmony_ci	 mov.l	r3,@-r0		!  30 LS
7362306a36Sopenharmony_ci#endif
7462306a36Sopenharmony_ci	! Finally, copy a byte at once, if necessary
	! r5 goes back to src - dst - 1 (the byte that a pre-decremented r0
	! will receive) and r2 becomes dst + 1.  r0 == r4 means nothing is left.
7562306a36Sopenharmony_ci
7662306a36Sopenharmony_ci	add	#4,r5		!  50 EX
7762306a36Sopenharmony_ci	cmp/eq	r4,r0		!  54 MT
7862306a36Sopenharmony_ci
7962306a36Sopenharmony_ci	add	#-6,r2		!  50 EX
8062306a36Sopenharmony_ci	bt	9f		! 109 BR
8162306a36Sopenharmony_ci
8262306a36Sopenharmony_ci8:	cmp/hi	r2,r0		!  57 MT
8362306a36Sopenharmony_ci	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
8462306a36Sopenharmony_ci
8562306a36Sopenharmony_ci	bt/s	8b		! 109 BR
8662306a36Sopenharmony_ci
8762306a36Sopenharmony_ci	 mov.b	r1,@-r0		!  29 LS
8862306a36Sopenharmony_ci
	! r0 has counted down to r4 (dst) here, which is the value returned.
8962306a36Sopenharmony_ci9:	rts
9062306a36Sopenharmony_ci	 nop
9162306a36Sopenharmony_ci
9262306a36Sopenharmony_ci
9362306a36Sopenharmony_ci	!
9462306a36Sopenharmony_ci	!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
9562306a36Sopenharmony_ci	!
9662306a36Sopenharmony_ci
9762306a36Sopenharmony_ci	! Size is 16 or greater, and may have trailing bytes
9862306a36Sopenharmony_ci
9962306a36Sopenharmony_ci	.balign	32
10062306a36Sopenharmony_ci.Lcase3:
	! Reached from memcpy when (src - dst) & 3 == 3.  The copy runs
	! downwards: r0 is the current (long word aligned) destination
	! pointer, r4 is dst and r5 is src - dst, so r0+r5 is the source
	! address matching r0.  Every long word stored is assembled from two
	! neighbouring aligned source long words.
10162306a36Sopenharmony_ci	! Read a long word and write a long word at once
10262306a36Sopenharmony_ci	! At the start of each iteration, r7 contains last long load
	! r5 is biased by -3 so that r0+r5 is long word aligned for mov.l.
10362306a36Sopenharmony_ci	add	#-3,r5		! 79 EX
10462306a36Sopenharmony_ci	mov	r4,r2		!  5 MT (0 cycles latency)
10562306a36Sopenharmony_ci
10662306a36Sopenharmony_ci	mov.l	@(r0,r5),r7	! 21 LS (2 cycles latency)
10762306a36Sopenharmony_ci	add	#-4,r5		! 50 EX
10862306a36Sopenharmony_ci
	! r2 = dst + 7 is the loop limit.  cmp/hi tests r0 before the store in
	! the branch delay slot pre-decrements it by 4, so the loop stops once
	! fewer than 4 bytes remain above dst.
10962306a36Sopenharmony_ci	add	#7,r2		!  79 EX
11062306a36Sopenharmony_ci	!
11162306a36Sopenharmony_ci#ifdef CONFIG_CPU_LITTLE_ENDIAN
11262306a36Sopenharmony_ci	! 6 cycles, 4 bytes per iteration
11362306a36Sopenharmony_ci3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
11462306a36Sopenharmony_ci	mov	r7, r3		!   5 MT (latency=0)	! RQPO
11562306a36Sopenharmony_ci
11662306a36Sopenharmony_ci	cmp/hi	r2,r0		!  57 MT
11762306a36Sopenharmony_ci	shll8	r3		! 102 EX		! QPOx
11862306a36Sopenharmony_ci
11962306a36Sopenharmony_ci	mov	r1,r6		!   5 MT (latency=0)
12062306a36Sopenharmony_ci	shlr16	r6		! 107 EX
12162306a36Sopenharmony_ci
12262306a36Sopenharmony_ci	shlr8	r6		! 106 EX		! xxxN
12362306a36Sopenharmony_ci	mov	r1, r7		!   5 MT (latency=0)
12462306a36Sopenharmony_ci
12562306a36Sopenharmony_ci	or	r6,r3		!  82 EX		! QPON
12662306a36Sopenharmony_ci	bt/s	3b		! 109 BR
12762306a36Sopenharmony_ci
12862306a36Sopenharmony_ci	 mov.l	r3,@-r0		!  30 LS
12962306a36Sopenharmony_ci#else
	! Big-endian variant: unlike the loops above, the new long word is
	! loaded straight into r7 after the previous value has been consumed,
	! and the loop test is done at the bottom.
13062306a36Sopenharmony_ci3:	mov	r7,r3		! OPQR
13162306a36Sopenharmony_ci	shlr8	r3		! xOPQ
13262306a36Sopenharmony_ci	mov.l	@(r0,r5),r7	! KLMN
13362306a36Sopenharmony_ci	mov	r7,r6
13462306a36Sopenharmony_ci	shll16	r6
13562306a36Sopenharmony_ci	shll8	r6		! Nxxx
13662306a36Sopenharmony_ci	or	r6,r3		! NOPQ
13762306a36Sopenharmony_ci	cmp/hi	r2,r0
13862306a36Sopenharmony_ci	bt/s	3b
13962306a36Sopenharmony_ci	 mov.l	r3,@-r0
14062306a36Sopenharmony_ci#endif
14162306a36Sopenharmony_ci
14262306a36Sopenharmony_ci	! Finally, copy a byte at once, if necessary
	! r5 goes back to src - dst - 1 (-3 - 4 + 6) and r2 becomes dst + 1.
	! r0 == r4 means nothing is left.
14362306a36Sopenharmony_ci
14462306a36Sopenharmony_ci	add	#6,r5		!  50 EX
14562306a36Sopenharmony_ci	cmp/eq	r4,r0		!  54 MT
14662306a36Sopenharmony_ci
14762306a36Sopenharmony_ci	add	#-6,r2		!  50 EX
14862306a36Sopenharmony_ci	bt	9f		! 109 BR
14962306a36Sopenharmony_ci
15062306a36Sopenharmony_ci8:	cmp/hi	r2,r0		!  57 MT
15162306a36Sopenharmony_ci	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
15262306a36Sopenharmony_ci
15362306a36Sopenharmony_ci	bt/s	8b		! 109 BR
15462306a36Sopenharmony_ci
15562306a36Sopenharmony_ci	 mov.b	r1,@-r0		!  29 LS
15662306a36Sopenharmony_ci
	! r0 has counted down to r4 (dst) here, which is the value returned.
15762306a36Sopenharmony_ci9:	rts
15862306a36Sopenharmony_ci	 nop
15962306a36Sopenharmony_ci
16062306a36Sopenharmony_ciENTRY(memcpy)
	! Entry point and dispatcher.
	!
	! In:   r4 = dst, r5 = src, r6 = n
	! Out:  r0 = dst
	!
	! The copy is done from the end of the buffers towards the start.
	! r0 is set to dst + n and is pre-decremented by every store; r5 is
	! turned into src - dst (plus a small per-path bias) so that @(r0,r5)
	! addresses the source.  When r0 has counted down to r4 it also holds
	! the return value.
16162306a36Sopenharmony_ci
16262306a36Sopenharmony_ci	! Calculate the invariants which will be used in the remainder
16362306a36Sopenharmony_ci	! of the code:
16462306a36Sopenharmony_ci	!
16562306a36Sopenharmony_ci	!      r4   -->  [ ...  ] DST             [ ...  ] SRC
16662306a36Sopenharmony_ci	!	         [ ...  ]                 [ ...  ]
16762306a36Sopenharmony_ci	!	           :                        :
16862306a36Sopenharmony_ci	!      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
16962306a36Sopenharmony_ci	!
17062306a36Sopenharmony_ci	!
17162306a36Sopenharmony_ci
17262306a36Sopenharmony_ci	! Short circuit the common case of src, dst and len being 32 bit aligned
17362306a36Sopenharmony_ci	! and test for zero length move
	! r0 = n | dst | src, so its low two bits are clear only when all three
	! are multiples of 4.  The tst #3 in the delay slot sets T for the
	! .Lcase00 branch further down.
17462306a36Sopenharmony_ci
17562306a36Sopenharmony_ci	mov	r6, r0		!   5 MT (0 cycle latency)
17662306a36Sopenharmony_ci	or	r4, r0		!  82 EX
17762306a36Sopenharmony_ci
17862306a36Sopenharmony_ci	or	r5, r0		!  82 EX
17962306a36Sopenharmony_ci	tst	r6, r6		!  86 MT
18062306a36Sopenharmony_ci
18162306a36Sopenharmony_ci	bt/s	99f		! 111 BR		(zero len)
18262306a36Sopenharmony_ci	 tst	#3, r0		!  87 MT
18362306a36Sopenharmony_ci
	! r0 = dst + n (one past the last destination byte), r1 = 16 (the
	! 'small' threshold) and, in the delay slot, r5 = src - dst.
18462306a36Sopenharmony_ci	mov	r4, r0		!   5 MT (0 cycle latency)
18562306a36Sopenharmony_ci	add	r6, r0		!  49 EX
18662306a36Sopenharmony_ci
18762306a36Sopenharmony_ci	mov	#16, r1		!   6 EX
18862306a36Sopenharmony_ci	bt/s	.Lcase00	! 111 BR		(aligned)
18962306a36Sopenharmony_ci
19062306a36Sopenharmony_ci	 sub	r4, r5		!  75 EX
19162306a36Sopenharmony_ci
19262306a36Sopenharmony_ci	! Arguments are not nicely long word aligned or zero len.
19362306a36Sopenharmony_ci	! Check for small copies, and if so do a simple byte at a time copy.
19462306a36Sopenharmony_ci	!
19562306a36Sopenharmony_ci	! Deciding on an exact value of 'small' is not easy, as the point at which
19662306a36Sopenharmony_ci	! using the optimised routines become worthwhile varies (these are the
19762306a36Sopenharmony_ci	! cycle counts for different sizes using byte-at-a-time vs. optimised):
19862306a36Sopenharmony_ci	!	size	byte-at-time	long	word	byte
19962306a36Sopenharmony_ci	!	16	42		39-40	46-50	50-55
20062306a36Sopenharmony_ci	!	24	58		43-44	54-58	62-67
20162306a36Sopenharmony_ci	!	36	82		49-50	66-70	80-85
20262306a36Sopenharmony_ci	! However the penalty for getting it 'wrong' is much higher for long word
20362306a36Sopenharmony_ci	! aligned data (and this is more common), so use a value of 16.
20462306a36Sopenharmony_ci
	! T = (16 > n).  r5 is biased to src - dst - 1 so that @(r0,r5) is the
	! source byte for the next pre-decrementing byte store.
20562306a36Sopenharmony_ci	cmp/gt	r6,r1		!  56 MT
20662306a36Sopenharmony_ci
20762306a36Sopenharmony_ci	add	#-1,r5		!  50 EX
20862306a36Sopenharmony_ci	bf/s	6f		! 108 BR		(not small)
20962306a36Sopenharmony_ci
	! Small copy: r6 = n / 2 byte pairs, T = n & 1.  For an odd n one
	! byte is copied first (and that is all if n == 1).  r3 = r5 - 1 is
	! the offset of the second byte of each pair.
21062306a36Sopenharmony_ci	 mov	r5, r3		!   5 MT (latency=0)
21162306a36Sopenharmony_ci	shlr	r6		! 104 EX
21262306a36Sopenharmony_ci
21362306a36Sopenharmony_ci	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
21462306a36Sopenharmony_ci	bf/s	4f		! 111 BR
21562306a36Sopenharmony_ci
21662306a36Sopenharmony_ci	 add	#-1,r3		!  50 EX
21762306a36Sopenharmony_ci	tst	r6, r6		!  86 MT
21862306a36Sopenharmony_ci
21962306a36Sopenharmony_ci	bt/s	98f		! 110 BR
22062306a36Sopenharmony_ci	 mov.b	r1,@-r0		!  29 LS
22162306a36Sopenharmony_ci
22262306a36Sopenharmony_ci	! 4 cycles, 2 bytes per iteration
22362306a36Sopenharmony_ci3:	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
22462306a36Sopenharmony_ci
22562306a36Sopenharmony_ci4:	mov.b	@(r0,r3),r2	!  20 LS (latency=2)
22662306a36Sopenharmony_ci	dt	r6		!  67 EX
22762306a36Sopenharmony_ci
22862306a36Sopenharmony_ci	mov.b	r1,@-r0		!  29 LS
22962306a36Sopenharmony_ci	bf/s	3b		! 111 BR
23062306a36Sopenharmony_ci
23162306a36Sopenharmony_ci	 mov.b	r2,@-r0		!  29 LS
23262306a36Sopenharmony_ci98:
23362306a36Sopenharmony_ci	rts
23462306a36Sopenharmony_ci	 nop
23562306a36Sopenharmony_ci
	! Zero length: nothing to copy, return dst.
23662306a36Sopenharmony_ci99:	rts
23762306a36Sopenharmony_ci	 mov	r4, r0
23862306a36Sopenharmony_ci
23962306a36Sopenharmony_ci	! Size is not small, so it's worthwhile looking for optimisations.
24062306a36Sopenharmony_ci	! First align destination to a long word boundary.
24162306a36Sopenharmony_ci	!
24262306a36Sopenharmony_ci	! r5 = normal value -1
	!
	! r3 = r0 & 3 is the number of bytes to copy so that r0 drops to a
	! long word boundary; r6 is decremented so it stays the remaining length.
24362306a36Sopenharmony_ci
24462306a36Sopenharmony_ci6:	tst	#3, r0		!  87 MT
24562306a36Sopenharmony_ci        mov	#3, r3		!   6 EX
24662306a36Sopenharmony_ci
24762306a36Sopenharmony_ci	bt/s	2f		! 111 BR
24862306a36Sopenharmony_ci	 and	r0,r3		!  78 EX
24962306a36Sopenharmony_ci
25062306a36Sopenharmony_ci	! 3 cycles, 1 byte per iteration
25162306a36Sopenharmony_ci1:	dt	r3		!  67 EX
25262306a36Sopenharmony_ci	mov.b	@(r0,r5),r1	!  19 LS (latency=2)
25362306a36Sopenharmony_ci
25462306a36Sopenharmony_ci	add	#-1, r6		!  79 EX
25562306a36Sopenharmony_ci	bf/s	1b		! 109 BR
25662306a36Sopenharmony_ci
25762306a36Sopenharmony_ci	 mov.b	r1,@-r0		!  28 LS
25862306a36Sopenharmony_ci
	! Undo the -1 bias: r5 = src - dst again.
25962306a36Sopenharmony_ci2:	add	#1, r5		!  79 EX
26062306a36Sopenharmony_ci
26162306a36Sopenharmony_ci	! Now select the appropriate bulk transfer code based on relative
26262306a36Sopenharmony_ci	! alignment of src and dst.
	!
	! tst #imm only operates on r0, so the destination pointer is parked
	! in r3 and moved back into r0 in the delay slot of each dispatch
	! branch.  With n = remaining length in r6:
	!   (src - dst) & 3 == 0  ->  .Lcase0 (n < 64)  or  .Lcase0b (n >= 64)
	!   (src - dst) & 3 == 2  ->  .Lcase2 (n < 64)  or  .Lcase2b (n >= 64)
	!   (src - dst) & 3 == 1  ->  .Lcase1
	!   (src - dst) & 3 == 3  ->  .Lcase3
26362306a36Sopenharmony_ci
26462306a36Sopenharmony_ci	mov	r0, r3		!   5 MT (latency=0)
26562306a36Sopenharmony_ci
26662306a36Sopenharmony_ci	mov	r5, r0		!   5 MT (latency=0)
26762306a36Sopenharmony_ci	tst	#1, r0		!  87 MT
26862306a36Sopenharmony_ci
26962306a36Sopenharmony_ci	bf/s	1f		! 111 BR
27062306a36Sopenharmony_ci	 mov	#64, r7		!   6 EX
27162306a36Sopenharmony_ci
27262306a36Sopenharmony_ci	! bit 0 clear
27362306a36Sopenharmony_ci
27462306a36Sopenharmony_ci	cmp/ge	r7, r6		!  55 MT
27562306a36Sopenharmony_ci
	! The delay slot replaces T with the bit 1 test, which is what the
	! branches at both destinations consume.
27662306a36Sopenharmony_ci	bt/s	2f		! 111 BR
27762306a36Sopenharmony_ci	 tst	#2, r0		!  87 MT
27862306a36Sopenharmony_ci
27962306a36Sopenharmony_ci	! small
28062306a36Sopenharmony_ci	bt/s	.Lcase0
28162306a36Sopenharmony_ci	 mov	r3, r0
28262306a36Sopenharmony_ci
28362306a36Sopenharmony_ci	bra	.Lcase2
28462306a36Sopenharmony_ci	 nop
28562306a36Sopenharmony_ci
28662306a36Sopenharmony_ci	! big
28762306a36Sopenharmony_ci2:	bt/s	.Lcase0b
28862306a36Sopenharmony_ci	 mov	r3, r0
28962306a36Sopenharmony_ci
29062306a36Sopenharmony_ci	bra	.Lcase2b
29162306a36Sopenharmony_ci	 nop
29262306a36Sopenharmony_ci
29362306a36Sopenharmony_ci	! bit 0 set
29462306a36Sopenharmony_ci1:	tst	#2, r0		! 87 MT
29562306a36Sopenharmony_ci
29662306a36Sopenharmony_ci	bt/s	.Lcase1
29762306a36Sopenharmony_ci	 mov	r3, r0
29862306a36Sopenharmony_ci
29962306a36Sopenharmony_ci	bra	.Lcase3
30062306a36Sopenharmony_ci	 nop
30162306a36Sopenharmony_ci
30262306a36Sopenharmony_ci
30362306a36Sopenharmony_ci	!
30462306a36Sopenharmony_ci	!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
30562306a36Sopenharmony_ci	!
30662306a36Sopenharmony_ci
30762306a36Sopenharmony_ci	! src, dst and size are all long word aligned
30862306a36Sopenharmony_ci	! size is non-zero
30962306a36Sopenharmony_ci
31062306a36Sopenharmony_ci	.balign	32
31162306a36Sopenharmony_ci.Lcase00:
	! Reached from memcpy with r0 = dst + n, r4 = dst, r5 = src - dst and
	! r6 = n.  Copies of 64 bytes or more are handed to .Lcase00b; smaller
	! ones are done here, two long words per iteration.
	!
	! r5 is biased by -4 (source of the first long word of a pair) and
	! r3 ends up as src - dst - 8 (source of the second one).
31262306a36Sopenharmony_ci	mov	#64, r1		!   6 EX
31362306a36Sopenharmony_ci	mov	r5, r3		!   5 MT (latency=0)
31462306a36Sopenharmony_ci
31562306a36Sopenharmony_ci	cmp/gt	r6, r1		!  56 MT
31662306a36Sopenharmony_ci	add	#-4, r5		!  50 EX
31762306a36Sopenharmony_ci
31862306a36Sopenharmony_ci	bf	.Lcase00b	! 108 BR		(big loop)
	! r6 = n / 4 long words, then r6 = number of pairs with T = odd count.
	! An odd long word is copied on its own before the paired loop.
31962306a36Sopenharmony_ci	shlr2	r6		! 105 EX
32062306a36Sopenharmony_ci
32162306a36Sopenharmony_ci	shlr	r6		! 104 EX
32262306a36Sopenharmony_ci	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
32362306a36Sopenharmony_ci
32462306a36Sopenharmony_ci	bf/s	4f		! 111 BR
32562306a36Sopenharmony_ci	 add	#-8, r3		!  50 EX
32662306a36Sopenharmony_ci
32762306a36Sopenharmony_ci	tst	r6, r6		!  86 MT
32862306a36Sopenharmony_ci	bt/s	5f		! 110 BR
32962306a36Sopenharmony_ci
33062306a36Sopenharmony_ci	 mov.l	r1,@-r0		!  30 LS
33162306a36Sopenharmony_ci
33262306a36Sopenharmony_ci	! 4 cycles, 2 long words per iteration
33362306a36Sopenharmony_ci3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
33462306a36Sopenharmony_ci
33562306a36Sopenharmony_ci4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
33662306a36Sopenharmony_ci	dt	r6		!  67 EX
33762306a36Sopenharmony_ci
33862306a36Sopenharmony_ci	mov.l	r1, @-r0	!  30 LS
33962306a36Sopenharmony_ci	bf/s	3b		! 109 BR
34062306a36Sopenharmony_ci
34162306a36Sopenharmony_ci	 mov.l	r2, @-r0	!  30 LS
34262306a36Sopenharmony_ci
34362306a36Sopenharmony_ci5:	rts
34462306a36Sopenharmony_ci	 nop
34562306a36Sopenharmony_ci
34662306a36Sopenharmony_ci
34762306a36Sopenharmony_ci	! Size is 16 or greater and less than 64, but may have trailing bytes
34862306a36Sopenharmony_ci
34962306a36Sopenharmony_ci	.balign	32
35062306a36Sopenharmony_ci.Lcase0:
	! Reached from memcpy when (src - dst) & 3 == 0 and the remaining
	! length in r6 is below 64.  r0 is the long word aligned destination
	! pointer (counting down), r4 = dst and r5 = src - dst.
	!
	! r5 is biased by -4 and r3 = r5 - 4, giving the source offsets of the
	! two long words copied per iteration.  r7 = dst + 11 is the loop
	! limit, compared with r0 before the two pre-decrementing stores.
35162306a36Sopenharmony_ci	add	#-4, r5		!  50 EX
35262306a36Sopenharmony_ci	mov	r4, r7		!   5 MT (latency=0)
35362306a36Sopenharmony_ci
35462306a36Sopenharmony_ci	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
35562306a36Sopenharmony_ci	mov	#4, r2		!   6 EX
35662306a36Sopenharmony_ci
35762306a36Sopenharmony_ci	add	#11, r7		!  50 EX
	! Bit 2 of the remaining length tells whether the number of whole long
	! words is odd; if so one is copied on its own before the paired loop.
35862306a36Sopenharmony_ci	tst	r2, r6		!  86 MT
35962306a36Sopenharmony_ci
36062306a36Sopenharmony_ci	mov	r5, r3		!   5 MT (latency=0)
36162306a36Sopenharmony_ci	bt/s	4f		! 111 BR
36262306a36Sopenharmony_ci
36362306a36Sopenharmony_ci	 add	#-4, r3		!  50 EX
36462306a36Sopenharmony_ci	mov.l	r1,@-r0		!  30 LS
36562306a36Sopenharmony_ci
36662306a36Sopenharmony_ci	! 4 cycles, 2 long words per iteration
36762306a36Sopenharmony_ci3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
36862306a36Sopenharmony_ci
36962306a36Sopenharmony_ci4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
37062306a36Sopenharmony_ci	cmp/hi	r7, r0
37162306a36Sopenharmony_ci
37262306a36Sopenharmony_ci	mov.l	r1, @-r0	!  30 LS
37362306a36Sopenharmony_ci	bt/s	3b		! 109 BR
37462306a36Sopenharmony_ci
37562306a36Sopenharmony_ci	 mov.l	r2, @-r0	!  30 LS
37662306a36Sopenharmony_ci
37762306a36Sopenharmony_ci	! Copy the final 0-3 bytes
	! r5 goes back to src - dst - 1 and r7 becomes dst + 1, the limit for
	! the byte loop.  r0 == r4 means nothing is left.
37862306a36Sopenharmony_ci
37962306a36Sopenharmony_ci	add	#3,r5		!  50 EX
38062306a36Sopenharmony_ci
38162306a36Sopenharmony_ci	cmp/eq	r0, r4		!  54 MT
38262306a36Sopenharmony_ci	add	#-10, r7	!  50 EX
38362306a36Sopenharmony_ci
38462306a36Sopenharmony_ci	bt	9f		! 110 BR
38562306a36Sopenharmony_ci
38662306a36Sopenharmony_ci	! 3 cycles, 1 byte per iteration
38762306a36Sopenharmony_ci1:	mov.b	@(r0,r5),r1	!  19 LS
38862306a36Sopenharmony_ci	cmp/hi	r7,r0		!  57 MT
38962306a36Sopenharmony_ci
39062306a36Sopenharmony_ci	bt/s	1b		! 111 BR
39162306a36Sopenharmony_ci	 mov.b	r1,@-r0		!  28 LS
39262306a36Sopenharmony_ci
39362306a36Sopenharmony_ci9:	rts
39462306a36Sopenharmony_ci	 nop
39562306a36Sopenharmony_ci
39662306a36Sopenharmony_ci	! Size is at least 64 bytes, so will be going round the big loop at least once.
39762306a36Sopenharmony_ci	!
39862306a36Sopenharmony_ci	!   r2 = rounded up r4
39962306a36Sopenharmony_ci	!   r3 = rounded down r0
40062306a36Sopenharmony_ci
40162306a36Sopenharmony_ci	.balign	32
40262306a36Sopenharmony_ci.Lcase0b:
	! Reached from memcpy when (src - dst) & 3 == 0 and at least 64 bytes
	! remain.  r5 arrives as src - dst and is biased by -4 here so that it
	! matches .Lcase00b, which .Lcase00 enters with r5 already decremented.
40362306a36Sopenharmony_ci	add	#-4, r5		!  50 EX
40462306a36Sopenharmony_ci
40562306a36Sopenharmony_ci.Lcase00b:
	! r3 = r0 rounded down to a 32 byte boundary, r2 = r4 rounded up to
	! one.  If r0 is already on such a boundary the head copy is skipped.
40662306a36Sopenharmony_ci	mov	r0, r3		!   5 MT (latency=0)
40762306a36Sopenharmony_ci	mov	#(~0x1f), r1	!   6 EX
40862306a36Sopenharmony_ci
40962306a36Sopenharmony_ci	and	r1, r3		!  78 EX
41062306a36Sopenharmony_ci	mov	r4, r2		!   5 MT (latency=0)
41162306a36Sopenharmony_ci
41262306a36Sopenharmony_ci	cmp/eq	r3, r0		!  54 MT
41362306a36Sopenharmony_ci	add	#0x1f, r2	!  50 EX
41462306a36Sopenharmony_ci
41562306a36Sopenharmony_ci	bt/s	1f		! 110 BR
41662306a36Sopenharmony_ci	 and	r1, r2		!  78 EX
41762306a36Sopenharmony_ci
41862306a36Sopenharmony_ci	! copy initial words until cache line aligned
	! r6 = r5 - 4 is the source offset of the second long word of a pair.
	! r3 is raised by 8 because the loop compares it with r0 before the
	! two pre-decrementing stores.  When bit 2 of r0 is set a single long
	! word is copied first; if that already reaches the boundary
	! ((r0 & 0x18) == 0) the paired loop is skipped.
41962306a36Sopenharmony_ci
42062306a36Sopenharmony_ci	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
42162306a36Sopenharmony_ci	tst	#4, r0		!  87 MT
42262306a36Sopenharmony_ci
42362306a36Sopenharmony_ci	mov	r5, r6		!   5 MT (latency=0)
42462306a36Sopenharmony_ci	add	#-4, r6		!  50 EX
42562306a36Sopenharmony_ci
42662306a36Sopenharmony_ci	bt/s	4f		! 111 BR
42762306a36Sopenharmony_ci	 add	#8, r3		!  50 EX
42862306a36Sopenharmony_ci
42962306a36Sopenharmony_ci	tst	#0x18, r0	!  87 MT
43062306a36Sopenharmony_ci
43162306a36Sopenharmony_ci	bt/s	1f		! 109 BR
43262306a36Sopenharmony_ci	 mov.l	r1,@-r0		!  30 LS
43362306a36Sopenharmony_ci
43462306a36Sopenharmony_ci	! 4 cycles, 2 long words per iteration
43562306a36Sopenharmony_ci3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
43662306a36Sopenharmony_ci
43762306a36Sopenharmony_ci4:	mov.l	@(r0, r6), r7	!  21 LS (latency=2)
43862306a36Sopenharmony_ci	cmp/eq	r3, r0		!  54 MT
43962306a36Sopenharmony_ci
44062306a36Sopenharmony_ci	mov.l	r1, @-r0	!  30 LS
44162306a36Sopenharmony_ci	bf/s	3b		! 109 BR
44262306a36Sopenharmony_ci
44362306a36Sopenharmony_ci	 mov.l	r7, @-r0	!  30 LS
44462306a36Sopenharmony_ci
44562306a36Sopenharmony_ci	! Copy the cache line aligned blocks
44662306a36Sopenharmony_ci	!
44762306a36Sopenharmony_ci	! In use: r0, r2, r4, r5
44862306a36Sopenharmony_ci	! Scratch: r1, r3, r6, r7
44962306a36Sopenharmony_ci	!
45062306a36Sopenharmony_ci	! We could do this with the four scratch registers, but if src
45162306a36Sopenharmony_ci	! and dest hit the same cache line, this will thrash, so make
45262306a36Sopenharmony_ci	! use of additional registers.
45362306a36Sopenharmony_ci	!
45462306a36Sopenharmony_ci	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
45562306a36Sopenharmony_ci	!   r5:	 src (was r0+r5)
45662306a36Sopenharmony_ci	!   r1:	 dest (was r0)
45762306a36Sopenharmony_ci	! this can be reversed at the end, so we don't need to save any extra
45862306a36Sopenharmony_ci	! state.
45962306a36Sopenharmony_ci	!
	! r8-r11 are pushed on the stack here and popped again after the loop.
	! r5 becomes an absolute source pointer, 0x20 below the source address
	! matching r1, i.e. the start of the next 32 byte block to read.
46062306a36Sopenharmony_ci1:	mov.l	r8, @-r15	!  30 LS
46162306a36Sopenharmony_ci	add	r0, r5		!  49 EX
46262306a36Sopenharmony_ci
46362306a36Sopenharmony_ci	mov.l	r9, @-r15	!  30 LS
46462306a36Sopenharmony_ci	mov	r0, r1		!   5 MT (latency=0)
46562306a36Sopenharmony_ci
46662306a36Sopenharmony_ci	mov.l	r10, @-r15	!  30 LS
46762306a36Sopenharmony_ci	add	#-0x1c, r5	!  50 EX
46862306a36Sopenharmony_ci
46962306a36Sopenharmony_ci	mov.l	r11, @-r15	!  30 LS
47062306a36Sopenharmony_ci
47162306a36Sopenharmony_ci	! 16 cycles, 32 bytes per iteration
	! Eight long words are loaded, then written to the 32 byte block at
	! r1; the first store uses movca.l.  The loop ends when r1 reaches r2
	! (dst rounded up to a 32 byte boundary).
47262306a36Sopenharmony_ci2:	mov.l	@(0x00,r5),r0	! 18 LS (latency=2)
47362306a36Sopenharmony_ci	add	#-0x20, r1	! 50 EX
47462306a36Sopenharmony_ci	mov.l	@(0x04,r5),r3	! 18 LS (latency=2)
47562306a36Sopenharmony_ci	mov.l	@(0x08,r5),r6	! 18 LS (latency=2)
47662306a36Sopenharmony_ci	mov.l	@(0x0c,r5),r7	! 18 LS (latency=2)
47762306a36Sopenharmony_ci	mov.l	@(0x10,r5),r8	! 18 LS (latency=2)
47862306a36Sopenharmony_ci	mov.l	@(0x14,r5),r9	! 18 LS (latency=2)
47962306a36Sopenharmony_ci	mov.l	@(0x18,r5),r10	! 18 LS (latency=2)
48062306a36Sopenharmony_ci	mov.l	@(0x1c,r5),r11	! 18 LS (latency=2)
48162306a36Sopenharmony_ci	movca.l	r0,@r1		! 40 LS (latency=3-7)
48262306a36Sopenharmony_ci	mov.l	r3,@(0x04,r1)	! 33 LS
48362306a36Sopenharmony_ci	mov.l	r6,@(0x08,r1)	! 33 LS
48462306a36Sopenharmony_ci	mov.l	r7,@(0x0c,r1)	! 33 LS
48562306a36Sopenharmony_ci
48662306a36Sopenharmony_ci	mov.l	r8,@(0x10,r1)	! 33 LS
48762306a36Sopenharmony_ci	add	#-0x20, r5	! 50 EX
48862306a36Sopenharmony_ci
48962306a36Sopenharmony_ci	mov.l	r9,@(0x14,r1)	! 33 LS
49062306a36Sopenharmony_ci	cmp/eq	r2,r1		! 54 MT
49162306a36Sopenharmony_ci
49262306a36Sopenharmony_ci	mov.l	r10,@(0x18,r1)	!  33 LS
49362306a36Sopenharmony_ci	bf/s	2b		! 109 BR
49462306a36Sopenharmony_ci
49562306a36Sopenharmony_ci	 mov.l	r11,@(0x1c,r1)	!  33 LS
49662306a36Sopenharmony_ci
	! Restore the r0 / r5 invariant (r5 is an offset relative to r0 again)
	! while popping the saved registers.
49762306a36Sopenharmony_ci	mov	r1, r0		!   5 MT (latency=0)
49862306a36Sopenharmony_ci
49962306a36Sopenharmony_ci	mov.l	@r15+, r11	!  15 LS
50062306a36Sopenharmony_ci	sub	r1, r5		!  75 EX
50162306a36Sopenharmony_ci
50262306a36Sopenharmony_ci	mov.l	@r15+, r10	!  15 LS
50362306a36Sopenharmony_ci	cmp/eq	r4, r0		!  54 MT
50462306a36Sopenharmony_ci
50562306a36Sopenharmony_ci	bf/s	1f		! 109 BR
50662306a36Sopenharmony_ci	 mov.l	 @r15+, r9	!  15 LS
50762306a36Sopenharmony_ci
	! Label 1 sits on the delay slot of rts.  If r0 == r4 the copy is
	! complete and r8 is restored in the delay slot of the return.
	! Otherwise the branch above lands on the r8 restore as an ordinary
	! instruction and execution carries on with the trailing bytes.
50862306a36Sopenharmony_ci	rts
50962306a36Sopenharmony_ci1:	 mov.l	@r15+, r8	!  15 LS
51062306a36Sopenharmony_ci	sub	r4, r1		!  75 EX		(len remaining)
51162306a36Sopenharmony_ci
51262306a36Sopenharmony_ci	! number of trailing bytes is non-zero
51362306a36Sopenharmony_ci	!
51462306a36Sopenharmony_ci	! invariants restored (r5 already decremented by 4)
51562306a36Sopenharmony_ci	! also r1=num bytes remaining
51662306a36Sopenharmony_ci
51762306a36Sopenharmony_ci	mov	#4, r2		!   6 EX
51862306a36Sopenharmony_ci	mov	r4, r7		!   5 MT (latency=0)
51962306a36Sopenharmony_ci
52062306a36Sopenharmony_ci	add	#0x1c, r5	!  50 EX		(back to -4)
52162306a36Sopenharmony_ci	cmp/hs	r2, r1		!  58 MT
52262306a36Sopenharmony_ci
	! Fewer than 4 bytes left: go straight to the byte loop.  r7 = dst + 11
	! is the limit for the paired long word loop.
52362306a36Sopenharmony_ci	bf/s	5f		! 108 BR
52462306a36Sopenharmony_ci	 add	 #11, r7	!  50 EX
52562306a36Sopenharmony_ci
52662306a36Sopenharmony_ci	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
52762306a36Sopenharmony_ci	tst	r2, r1		!  86 MT
52862306a36Sopenharmony_ci
52962306a36Sopenharmony_ci	mov	r5, r3		!   5 MT (latency=0)
53062306a36Sopenharmony_ci	bt/s	4f		! 111 BR
53162306a36Sopenharmony_ci
	! NOTE(review): r1 and r2 are unchanged since the cmp/hs above, which
	! already established r1 >= 4 on this path, so the branch to 5 below
	! looks always taken.  After the single long word is stored, anything
	! still left appears to be finished by the byte loop rather than by
	! the paired loop at 3.  The result is the same; confirm whether the
	! comparison was meant to be against 8.
53262306a36Sopenharmony_ci	 add	#-4, r3		!  50 EX
53362306a36Sopenharmony_ci	cmp/hs	r2, r1		!  58 MT
53462306a36Sopenharmony_ci
53562306a36Sopenharmony_ci	bt/s	5f		! 111 BR
53662306a36Sopenharmony_ci	 mov.l	r6,@-r0		!  30 LS
53762306a36Sopenharmony_ci
53862306a36Sopenharmony_ci	! 4 cycles, 2 long words per iteration
53962306a36Sopenharmony_ci3:	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
54062306a36Sopenharmony_ci
54162306a36Sopenharmony_ci4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
54262306a36Sopenharmony_ci	cmp/hi	r7, r0
54362306a36Sopenharmony_ci
54462306a36Sopenharmony_ci	mov.l	r6, @-r0	!  30 LS
54562306a36Sopenharmony_ci	bt/s	3b		! 109 BR
54662306a36Sopenharmony_ci
54762306a36Sopenharmony_ci	 mov.l	r2, @-r0	!  30 LS
54862306a36Sopenharmony_ci
54962306a36Sopenharmony_ci	! Copy the final 0-3 bytes
	! r7 becomes dst + 1 and r5 goes back to src - dst - 1 for the byte
	! loop.  r0 == r4 means nothing is left.
55062306a36Sopenharmony_ci
55162306a36Sopenharmony_ci5:	cmp/eq	r0, r4		!  54 MT
55262306a36Sopenharmony_ci	add	#-10, r7	!  50 EX
55362306a36Sopenharmony_ci
55462306a36Sopenharmony_ci	bt	9f		! 110 BR
55562306a36Sopenharmony_ci	add	#3,r5		!  50 EX
55662306a36Sopenharmony_ci
55762306a36Sopenharmony_ci	! 3 cycles, 1 byte per iteration
55862306a36Sopenharmony_ci1:	mov.b	@(r0,r5),r1	!  19 LS
55962306a36Sopenharmony_ci	cmp/hi	r7,r0		!  57 MT
56062306a36Sopenharmony_ci
56162306a36Sopenharmony_ci	bt/s	1b		! 111 BR
56262306a36Sopenharmony_ci	 mov.b	r1,@-r0		!  28 LS
56362306a36Sopenharmony_ci
56462306a36Sopenharmony_ci9:	rts
56562306a36Sopenharmony_ci	 nop
56662306a36Sopenharmony_ci
56762306a36Sopenharmony_ci	!
56862306a36Sopenharmony_ci	!	GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
56962306a36Sopenharmony_ci	!
57062306a36Sopenharmony_ci
57162306a36Sopenharmony_ci	.balign	32
57262306a36Sopenharmony_ci.Lcase2:
57362306a36Sopenharmony_ci	! Size is 16 or greater and less than 64, but may have trailing bytes
	!
	! Reached from memcpy when (src - dst) & 3 == 2 and the remaining
	! length is below 64.  r0 is the long word aligned destination
	! pointer (counting down), r4 = dst and r5 = src - dst.
	!
	! The source is only 16 bit aligned relative to the destination, so
	! data is moved as two 16 bit words per iteration: r5 = src - dst - 2
	! and r6 = src - dst - 4 are their source offsets.  r2 = dst + 7 is
	! the loop limit, compared with r0 before the two pre-decrementing
	! stores.
57462306a36Sopenharmony_ci
57562306a36Sopenharmony_ci2:	mov	r5, r6		!   5 MT (latency=0)
57662306a36Sopenharmony_ci	add	#-2,r5		!  50 EX
57762306a36Sopenharmony_ci
57862306a36Sopenharmony_ci	mov	r4,r2		!   5 MT (latency=0)
57962306a36Sopenharmony_ci	add	#-4,r6		!  50 EX
58062306a36Sopenharmony_ci
58162306a36Sopenharmony_ci	add	#7,r2		!  50 EX
58262306a36Sopenharmony_ci3:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
58362306a36Sopenharmony_ci
58462306a36Sopenharmony_ci	mov.w	@(r0,r6),r3	!  20 LS (latency=2)
58562306a36Sopenharmony_ci	cmp/hi	r2,r0		!  57 MT
58662306a36Sopenharmony_ci
58762306a36Sopenharmony_ci	mov.w	r1,@-r0		!  29 LS
58862306a36Sopenharmony_ci	bt/s	3b		! 111 BR
58962306a36Sopenharmony_ci
59062306a36Sopenharmony_ci	 mov.w	r3,@-r0		!  29 LS
59162306a36Sopenharmony_ci
	! NOTE(review): label 10 is defined outside the part of the file seen
	! here; presumably it copies the remaining trailing bytes - confirm.
59262306a36Sopenharmony_ci	bra	10f
59362306a36Sopenharmony_ci	 nop
59462306a36Sopenharmony_ci
59562306a36Sopenharmony_ci
59662306a36Sopenharmony_ci	.balign	32
59762306a36Sopenharmony_ci.Lcase2b:
59862306a36Sopenharmony_ci	! Size is at least 64 bytes, so will be going round the big loop at least once.
59962306a36Sopenharmony_ci	!
60062306a36Sopenharmony_ci	!   r2 = rounded up r4
60162306a36Sopenharmony_ci	!   r3 = rounded down r0
60262306a36Sopenharmony_ci
60362306a36Sopenharmony_ci	mov	r0, r3		!   5 MT (latency=0)
60462306a36Sopenharmony_ci	mov	#(~0x1f), r1	!   6 EX
60562306a36Sopenharmony_ci
60662306a36Sopenharmony_ci	and	r1, r3		!  78 EX
60762306a36Sopenharmony_ci	mov	r4, r2		!   5 MT (latency=0)
60862306a36Sopenharmony_ci
60962306a36Sopenharmony_ci	cmp/eq	r3, r0		!  54 MT
61062306a36Sopenharmony_ci	add	#0x1f, r2	!  50 EX
61162306a36Sopenharmony_ci
61262306a36Sopenharmony_ci	add	#-2, r5		!  50 EX
61362306a36Sopenharmony_ci	bt/s	1f		! 110 BR
61462306a36Sopenharmony_ci	 and	r1, r2		!  78 EX
61562306a36Sopenharmony_ci
61662306a36Sopenharmony_ci	! Copy a short word one at a time until we are cache line aligned
61762306a36Sopenharmony_ci	!   Normal values: r0, r2, r3, r4
61862306a36Sopenharmony_ci	!   Unused: r1, r6, r7
61962306a36Sopenharmony_ci	!   Mod: r5 (=r5-2)
62062306a36Sopenharmony_ci	!
62162306a36Sopenharmony_ci	add	#2, r3		!  50 EX
62262306a36Sopenharmony_ci
62362306a36Sopenharmony_ci2:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
62462306a36Sopenharmony_ci	cmp/eq	r3,r0		!  54 MT
62562306a36Sopenharmony_ci
62662306a36Sopenharmony_ci	bf/s	2b		! 111 BR
62762306a36Sopenharmony_ci
62862306a36Sopenharmony_ci	 mov.w	r1,@-r0		!  29 LS
62962306a36Sopenharmony_ci
63062306a36Sopenharmony_ci	! Copy the cache line aligned blocks
63162306a36Sopenharmony_ci	!
63262306a36Sopenharmony_ci	! In use: r0, r2, r4, r5 (=r5-2)
63362306a36Sopenharmony_ci	! Scratch: r1, r3, r6, r7
63462306a36Sopenharmony_ci	!
63562306a36Sopenharmony_ci	! We could do this with the four scratch registers, but if src
63662306a36Sopenharmony_ci	! and dest hit the same cache line, this will thrash, so make
63762306a36Sopenharmony_ci	! use of additional registers.
63862306a36Sopenharmony_ci	!
63962306a36Sopenharmony_ci	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
64062306a36Sopenharmony_ci	!   r5:	 src (was r0+r5)
64162306a36Sopenharmony_ci	!   r1:	 dest (was r0)
64262306a36Sopenharmony_ci	! this can be reversed at the end, so we don't need to save any extra
64362306a36Sopenharmony_ci	! state.
64462306a36Sopenharmony_ci	!
64562306a36Sopenharmony_ci1:	mov.l	r8, @-r15	!  30 LS
64662306a36Sopenharmony_ci	add	r0, r5		!  49 EX
64762306a36Sopenharmony_ci
64862306a36Sopenharmony_ci	mov.l	r9, @-r15	!  30 LS
64962306a36Sopenharmony_ci	mov	r0, r1		!   5 MT (latency=0)
65062306a36Sopenharmony_ci
65162306a36Sopenharmony_ci	mov.l	r10, @-r15	!  30 LS
65262306a36Sopenharmony_ci	add	#-0x1e, r5	!  50 EX
65362306a36Sopenharmony_ci
65462306a36Sopenharmony_ci	mov.l	r11, @-r15	!  30 LS
65562306a36Sopenharmony_ci
65662306a36Sopenharmony_ci	mov.l	r12, @-r15	!  30 LS
65762306a36Sopenharmony_ci
65862306a36Sopenharmony_ci	! 17 cycles, 32 bytes per iteration
65962306a36Sopenharmony_ci#ifdef CONFIG_CPU_LITTLE_ENDIAN
/*
 * Little-endian main loop: 32 bytes are moved per pass.
 *
 * Loads (all post-increment through r5): one 16-bit word into r0, seven
 * longwords into r3 and r6-r11, and one more 16-bit word into r12, i.e.
 * 2 + 28 + 2 = 32 bytes.  The longword loads start at r5 + 2, so r5 is
 * expected to be 2 mod 4 here.
 *
 * Realignment: shll16 moves the first word into the upper half of r0;
 * each xtrct then builds one output longword from the low half of the
 * newer load and the high half of the older register (see the byte-letter
 * diagrams in the right-hand column).  The results end up in r0, r3 and
 * r6-r11.
 *
 * Stores: r1 is lowered by 0x20 at the top of the pass and the eight
 * longwords are written to @(0x00..0x1c, r1).  The first store is a
 * movca.l, which takes its data from r0.
 *
 * Bookkeeping: the loads advanced r5 by 0x20 and "add #-0x40" pulls it
 * back, so r5 and r1 both step down by 0x20 per pass.  The loop repeats
 * until r1 == r2; r2 is set up before this point (not visible here).
 * The last store sits in the bf/s delay slot and runs on every pass.
 */
66062306a36Sopenharmony_ci2:	mov.w	@r5+, r0	!  14 LS (latency=2)		..JI
66162306a36Sopenharmony_ci	add	#-0x20, r1	!  50 EX
66262306a36Sopenharmony_ci
66362306a36Sopenharmony_ci	mov.l	@r5+, r3	!  15 LS (latency=2)		NMLK
66462306a36Sopenharmony_ci
66562306a36Sopenharmony_ci	mov.l	@r5+, r6	!  15 LS (latency=2)		RQPO
66662306a36Sopenharmony_ci	shll16	r0		! 103 EX			JI..
66762306a36Sopenharmony_ci
66862306a36Sopenharmony_ci	mov.l	@r5+, r7	!  15 LS (latency=2)
66962306a36Sopenharmony_ci	xtrct	r3, r0		!  48 EX			LKJI
67062306a36Sopenharmony_ci
67162306a36Sopenharmony_ci	mov.l	@r5+, r8	!  15 LS (latency=2)
67262306a36Sopenharmony_ci	xtrct	r6, r3		!  48 EX			PONM
67362306a36Sopenharmony_ci
67462306a36Sopenharmony_ci	mov.l	@r5+, r9	!  15 LS (latency=2)
67562306a36Sopenharmony_ci	xtrct	r7, r6		!  48 EX
67662306a36Sopenharmony_ci
67762306a36Sopenharmony_ci	mov.l	@r5+, r10	!  15 LS (latency=2)
67862306a36Sopenharmony_ci	xtrct	r8, r7		!  48 EX
67962306a36Sopenharmony_ci
68062306a36Sopenharmony_ci	mov.l	@r5+, r11	!  15 LS (latency=2)
68162306a36Sopenharmony_ci	xtrct	r9, r8		!  48 EX
68262306a36Sopenharmony_ci
68362306a36Sopenharmony_ci	mov.w	@r5+, r12	!  15 LS (latency=2)
68462306a36Sopenharmony_ci	xtrct	r10, r9		!  48 EX
68562306a36Sopenharmony_ci
68662306a36Sopenharmony_ci	movca.l	r0,@r1		!  40 LS (latency=3-7)
68762306a36Sopenharmony_ci	xtrct	r11, r10	!  48 EX
68862306a36Sopenharmony_ci
68962306a36Sopenharmony_ci	mov.l	r3, @(0x04,r1)	!  33 LS
69062306a36Sopenharmony_ci	xtrct	r12, r11	!  48 EX
69162306a36Sopenharmony_ci
69262306a36Sopenharmony_ci	mov.l	r6, @(0x08,r1)	!  33 LS
69362306a36Sopenharmony_ci
69462306a36Sopenharmony_ci	mov.l	r7, @(0x0c,r1)	!  33 LS
69562306a36Sopenharmony_ci
69662306a36Sopenharmony_ci	mov.l	r8, @(0x10,r1)	!  33 LS
69762306a36Sopenharmony_ci	add	#-0x40, r5	!  50 EX
69862306a36Sopenharmony_ci
69962306a36Sopenharmony_ci	mov.l	r9, @(0x14,r1)	!  33 LS
70062306a36Sopenharmony_ci	cmp/eq	r2,r1		!  54 MT
70162306a36Sopenharmony_ci
70262306a36Sopenharmony_ci	mov.l	r10, @(0x18,r1)	!  33 LS
70362306a36Sopenharmony_ci	bf/s	2b		! 109 BR
70462306a36Sopenharmony_ci
70562306a36Sopenharmony_ci	 mov.l	r11, @(0x1c,r1)	!  33 LS
70662306a36Sopenharmony_ci#else
/*
 * Big-endian main loop: 32 bytes are moved per pass, highest address
 * first.
 *
 * Loads (displacement addressing off r5): the 16-bit word at @(0x1e,r5)
 * goes into r0, r5 is lowered by 2, then the eight longwords at
 * @(0x1c..0x00, r5) go into r3 and r6-r12.  Only the low half of r12 is
 * consumed (by the final "xtrct r12, r11").
 *
 * Realignment: shll16 moves the word into the upper half of r0; each
 * xtrct then builds one output longword from the low half of the
 * lower-addressed load and the high half of the higher-addressed
 * register.  The results end up in r0, r3 and r6-r11.
 *
 * Stores: r1 is lowered by 4 and movca.l (data taken from r0) writes the
 * highest longword; r1 is then lowered by a further 0x1c and the other
 * seven longwords are written to @(0x18..0x00, r1).
 *
 * Bookkeeping: r5 drops by 2 + 0x1e and r1 by 4 + 0x1c, so both step down
 * by 0x20 per pass.  The loop repeats until r1 == r2; r2 is set up before
 * this point (not visible here).  The last store sits in the bf/s delay
 * slot and runs on every pass.
 */
70762306a36Sopenharmony_ci2:	mov.w	@(0x1e,r5), r0	!  17 LS (latency=2)
70862306a36Sopenharmony_ci	add	#-2, r5		!  50 EX
70962306a36Sopenharmony_ci
71062306a36Sopenharmony_ci	mov.l	@(0x1c,r5), r3	!  18 LS (latency=2)
71162306a36Sopenharmony_ci	add	#-4, r1		!  50 EX
71262306a36Sopenharmony_ci
71362306a36Sopenharmony_ci	mov.l	@(0x18,r5), r6	!  18 LS (latency=2)
71462306a36Sopenharmony_ci	shll16	r0		! 103 EX
71562306a36Sopenharmony_ci
71662306a36Sopenharmony_ci	mov.l	@(0x14,r5), r7	!  18 LS (latency=2)
71762306a36Sopenharmony_ci	xtrct	r3, r0		!  48 EX
71862306a36Sopenharmony_ci
71962306a36Sopenharmony_ci	mov.l	@(0x10,r5), r8	!  18 LS (latency=2)
72062306a36Sopenharmony_ci	xtrct	r6, r3		!  48 EX
72162306a36Sopenharmony_ci
72262306a36Sopenharmony_ci	mov.l	@(0x0c,r5), r9	!  18 LS (latency=2)
72362306a36Sopenharmony_ci	xtrct	r7, r6		!  48 EX
72462306a36Sopenharmony_ci
72562306a36Sopenharmony_ci	mov.l	@(0x08,r5), r10	!  18 LS (latency=2)
72662306a36Sopenharmony_ci	xtrct	r8, r7		!  48 EX
72762306a36Sopenharmony_ci
72862306a36Sopenharmony_ci	mov.l	@(0x04,r5), r11	!  18 LS (latency=2)
72962306a36Sopenharmony_ci	xtrct	r9, r8		!  48 EX
73062306a36Sopenharmony_ci
73162306a36Sopenharmony_ci	mov.l   @(0x00,r5), r12 !  18 LS (latency=2)
73262306a36Sopenharmony_ci    	xtrct	r10, r9		!  48 EX
73362306a36Sopenharmony_ci
73462306a36Sopenharmony_ci	movca.l	r0,@r1		!  40 LS (latency=3-7)
73562306a36Sopenharmony_ci	add	#-0x1c, r1	!  50 EX
73662306a36Sopenharmony_ci
73762306a36Sopenharmony_ci	mov.l	r3, @(0x18,r1)	!  33 LS
73862306a36Sopenharmony_ci	xtrct	r11, r10	!  48 EX
73962306a36Sopenharmony_ci
74062306a36Sopenharmony_ci	mov.l	r6, @(0x14,r1)	!  33 LS
74162306a36Sopenharmony_ci	xtrct	r12, r11	!  48 EX
74262306a36Sopenharmony_ci
74362306a36Sopenharmony_ci	mov.l	r7, @(0x10,r1)	!  33 LS
74462306a36Sopenharmony_ci
74562306a36Sopenharmony_ci	mov.l	r8, @(0x0c,r1)	!  33 LS
74662306a36Sopenharmony_ci	add	#-0x1e, r5	!  50 EX
74762306a36Sopenharmony_ci
74862306a36Sopenharmony_ci	mov.l	r9, @(0x08,r1)	!  33 LS
74962306a36Sopenharmony_ci	cmp/eq	r2,r1		!  54 MT
75062306a36Sopenharmony_ci
75162306a36Sopenharmony_ci	mov.l	r10, @(0x04,r1)	!  33 LS
75262306a36Sopenharmony_ci	bf/s	2b		! 109 BR
75362306a36Sopenharmony_ci
75462306a36Sopenharmony_ci	 mov.l	r11, @(0x00,r1)	!  33 LS
75562306a36Sopenharmony_ci#endif
75662306a36Sopenharmony_ci
/*
 * The 32-byte loop is done: pop r12, r11, r10, r9 and r8 (in that order)
 * from the stack and put the pointers back into the form the tail code
 * uses.
 *   r0 = r1          the destination pointer moves back into r0
 *   r5 = r5 - r1     r5 becomes an offset relative to the destination
 *                    pointer, as the @(r0,r5) loads below require
 *
 * If r0 == r4 there is nothing left to copy and rts returns.  The pop of
 * r8 at label 1 is shared: it is the rts delay slot on the return path
 * and the first instruction executed when bf/s branches to label 1 (the
 * pop of r9 is the bf/s delay slot and runs on both paths).
 *
 * NOTE(review): r4 looks like the start of the destination buffer, and
 * the final "add #0x1e, r5" presumably removes a bias applied to r5
 * before the loop so that r5 matches the "invariant - 2" the tail loop
 * asks for; both are set up outside this view, please confirm.
 */
75762306a36Sopenharmony_ci	mov.l	@r15+, r12
75862306a36Sopenharmony_ci	mov	r1, r0		!   5 MT (latency=0)
75962306a36Sopenharmony_ci
76062306a36Sopenharmony_ci	mov.l	@r15+, r11	!  15 LS
76162306a36Sopenharmony_ci	sub	r1, r5		!  75 EX
76262306a36Sopenharmony_ci
76362306a36Sopenharmony_ci	mov.l	@r15+, r10	!  15 LS
76462306a36Sopenharmony_ci	cmp/eq	r4, r0		!  54 MT
76562306a36Sopenharmony_ci
76662306a36Sopenharmony_ci	bf/s	1f		! 109 BR
76762306a36Sopenharmony_ci	 mov.l	 @r15+, r9	!  15 LS
76862306a36Sopenharmony_ci
76962306a36Sopenharmony_ci	rts
77062306a36Sopenharmony_ci1:	 mov.l	@r15+, r8	!  15 LS
77162306a36Sopenharmony_ci
77262306a36Sopenharmony_ci	add	#0x1e, r5	!  50 EX
77362306a36Sopenharmony_ci
77462306a36Sopenharmony_ci	! Finish off a short word at a time
77562306a36Sopenharmony_ci	! r5 must be invariant - 2
/*
 * Tail copy, 16 bits at a time, working downward from r0.
 *
 * Entry test: r2 = r4 + 1, and the loop is skipped unless r0 > r2
 * (unsigned), i.e. unless at least two bytes remain above r4.  The
 * "add #2, r2" in the bf/s delay slot runs on both paths and leaves
 * r2 = r4 + 3.
 *
 * Loop at label 3: load the word at r0 + r5, compare the not yet
 * decremented r0 against r2, and store the word at r0 - 2 (pre-decrement)
 * in the bt/s delay slot.  The loop repeats while the decremented r0 is
 * still more than one byte above r4.
 */
77662306a36Sopenharmony_ci10:	mov	r4,r2		!   5 MT (latency=0)
77762306a36Sopenharmony_ci	add	#1,r2		!  50 EX
77862306a36Sopenharmony_ci
77962306a36Sopenharmony_ci	cmp/hi	r2, r0		!  57 MT
78062306a36Sopenharmony_ci	bf/s	1f		! 109 BR
78162306a36Sopenharmony_ci
78262306a36Sopenharmony_ci	 add	#2, r2		!  50 EX
78362306a36Sopenharmony_ci
78462306a36Sopenharmony_ci3:	mov.w	@(r0,r5),r1	!  20 LS
78562306a36Sopenharmony_ci	cmp/hi	r2,r0		!  57 MT
78662306a36Sopenharmony_ci
78762306a36Sopenharmony_ci	bt/s	3b		! 109 BR
78862306a36Sopenharmony_ci
78962306a36Sopenharmony_ci	 mov.w	r1,@-r0		!  29 LS
79062306a36Sopenharmony_ci1:
79162306a36Sopenharmony_ci
79262306a36Sopenharmony_ci	!
79362306a36Sopenharmony_ci	! Finally, copy the last byte if necessary
/*
 * If r0 == r4 nothing is left and control goes back to label 9, which is
 * defined earlier in the file (outside this view).  Otherwise one byte is
 * copied: it is loaded from r0 + r5 and stored at r0 - 1 (pre-decrement)
 * in the rts delay slot.
 *
 * "add #1,r5" is the bt/s delay slot, so it runs on both paths; it moves
 * r5 from the bias used by the word copy above to the one needed for a
 * byte load that pairs with a one-byte pre-decrement store.
 */
79462306a36Sopenharmony_ci	cmp/eq	r4,r0		!  54 MT
79562306a36Sopenharmony_ci	bt/s	9b
79662306a36Sopenharmony_ci	 add	#1,r5
79762306a36Sopenharmony_ci	mov.b	@(r0,r5),r1
79862306a36Sopenharmony_ci	rts
79962306a36Sopenharmony_ci	 mov.b	r1,@-r0
80062306a36Sopenharmony_ci
801