18c2ecf20Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0 */
28c2ecf20Sopenharmony_ci/*
38c2ecf20Sopenharmony_ci * "memcpy" implementation of SuperH
48c2ecf20Sopenharmony_ci *
58c2ecf20Sopenharmony_ci * Copyright (C) 1999  Niibe Yutaka
68c2ecf20Sopenharmony_ci * Copyright (c) 2002  STMicroelectronics Ltd
78c2ecf20Sopenharmony_ci *   Modified from memcpy.S and micro-optimised for SH4
88c2ecf20Sopenharmony_ci *   Stuart Menefy (stuart.menefy@st.com)
98c2ecf20Sopenharmony_ci *
108c2ecf20Sopenharmony_ci */
118c2ecf20Sopenharmony_ci#include <linux/linkage.h>
128c2ecf20Sopenharmony_ci
138c2ecf20Sopenharmony_ci/*
148c2ecf20Sopenharmony_ci * void *memcpy(void *dst, const void *src, size_t n);
158c2ecf20Sopenharmony_ci *
168c2ecf20Sopenharmony_ci * It is assumed that there is no overlap between src and dst.
178c2ecf20Sopenharmony_ci * If there is an overlap, then the results are undefined.
188c2ecf20Sopenharmony_ci */
198c2ecf20Sopenharmony_ci
208c2ecf20Sopenharmony_ci	!
218c2ecf20Sopenharmony_ci	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
228c2ecf20Sopenharmony_ci	!
238c2ecf20Sopenharmony_ci
248c2ecf20Sopenharmony_ci	! Size is 16 or greater, and may have trailing bytes
	!
	! .Lcase1 is selected by the dispatch code in memcpy when bit 0 of
	! r5 is set and bit 1 is clear.  Register roles on entry:
	!   r0 = current dst pointer; the copy runs backwards towards r4
	!   r4 = dst start (only read here)
	!   r5 = offset such that r0+r5 addresses the matching source data
	! Each long stored to dst is built from two neighbouring source longs:
	! the previous load (kept in r7) and the new load (r1) are shifted by
	! 24 and 8 bits in opposite directions and ORed together in r3.
	! Instructions written with a leading space sit in a branch delay slot.
258c2ecf20Sopenharmony_ci
268c2ecf20Sopenharmony_ci	.balign	32
278c2ecf20Sopenharmony_ci.Lcase1:
288c2ecf20Sopenharmony_ci	! Read a long word and write a long word at once
298c2ecf20Sopenharmony_ci	! At the start of each iteration, r7 contains last long load
308c2ecf20Sopenharmony_ci	add	#-1,r5		!  79 EX
318c2ecf20Sopenharmony_ci	mov	r4,r2		!   5 MT (0 cycles latency)
328c2ecf20Sopenharmony_ci
338c2ecf20Sopenharmony_ci	mov.l	@(r0,r5),r7	!  21 LS (2 cycles latency)
348c2ecf20Sopenharmony_ci	add	#-4,r5		!  50 EX
358c2ecf20Sopenharmony_ci
368c2ecf20Sopenharmony_ci	add	#7,r2		!  79 EX  (r2 = r4+7: long-loop limit)
378c2ecf20Sopenharmony_ci	!
388c2ecf20Sopenharmony_ci#ifdef CONFIG_CPU_LITTLE_ENDIAN
398c2ecf20Sopenharmony_ci	! 6 cycles, 4 bytes per iteration
	! The cmp/hi tests r0 before the store in the delay slot decrements
	! it, so the loop repeats while r0 > r4+7 held prior to that store.
408c2ecf20Sopenharmony_ci3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
418c2ecf20Sopenharmony_ci	mov	r7, r3		!   5 MT (latency=0)	! RQPO
428c2ecf20Sopenharmony_ci
438c2ecf20Sopenharmony_ci	cmp/hi	r2,r0		!  57 MT
448c2ecf20Sopenharmony_ci	shll16	r3		! 103 EX
458c2ecf20Sopenharmony_ci
468c2ecf20Sopenharmony_ci	mov	r1,r6		!   5 MT (latency=0)
478c2ecf20Sopenharmony_ci	shll8	r3		! 102 EX		! Oxxx
488c2ecf20Sopenharmony_ci
498c2ecf20Sopenharmony_ci	shlr8	r6		! 106 EX		! xNML
508c2ecf20Sopenharmony_ci	mov	r1, r7		!   5 MT (latency=0)
518c2ecf20Sopenharmony_ci
528c2ecf20Sopenharmony_ci	or	r6,r3		!  82 EX		! ONML
538c2ecf20Sopenharmony_ci	bt/s	3b		! 109 BR
548c2ecf20Sopenharmony_ci
558c2ecf20Sopenharmony_ci	 mov.l	r3,@-r0		!  30 LS
568c2ecf20Sopenharmony_ci#else
	! Big-endian variant: same structure, shift directions swapped.
578c2ecf20Sopenharmony_ci3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! KLMN
588c2ecf20Sopenharmony_ci	mov	r7,r3		!   5 MT (latency=0)	! OPQR
598c2ecf20Sopenharmony_ci
608c2ecf20Sopenharmony_ci	cmp/hi	r2,r0		!  57 MT
618c2ecf20Sopenharmony_ci	shlr16	r3		! 107 EX
628c2ecf20Sopenharmony_ci
638c2ecf20Sopenharmony_ci	shlr8	r3		! 106 EX		! xxxO
648c2ecf20Sopenharmony_ci	mov	r1,r6		!   5 MT (latency=0)
658c2ecf20Sopenharmony_ci
668c2ecf20Sopenharmony_ci	shll8	r6		! 102 EX		! LMNx
678c2ecf20Sopenharmony_ci	mov	r1,r7		!   5 MT (latency=0)
688c2ecf20Sopenharmony_ci
698c2ecf20Sopenharmony_ci	or	r6,r3		!  82 EX		! LMNO
708c2ecf20Sopenharmony_ci	bt/s	3b		! 109 BR
718c2ecf20Sopenharmony_ci
728c2ecf20Sopenharmony_ci	 mov.l	r3,@-r0		!  30 LS
738c2ecf20Sopenharmony_ci#endif
748c2ecf20Sopenharmony_ci	! Finally, copy a byte at once, if necessary
	! r5 was lowered by 1 and then by 4 above; adding 4 leaves it at
	! (entry value - 1), the bias used for byte loads paired with @-r0.
	! r2 becomes r4+1, the limit for the byte loop.
758c2ecf20Sopenharmony_ci
768c2ecf20Sopenharmony_ci	add	#4,r5		!  50 EX
778c2ecf20Sopenharmony_ci	cmp/eq	r4,r0		!  54 MT  (nothing left to copy?)
788c2ecf20Sopenharmony_ci
798c2ecf20Sopenharmony_ci	add	#-6,r2		!  50 EX
808c2ecf20Sopenharmony_ci	bt	9f		! 109 BR
818c2ecf20Sopenharmony_ci
828c2ecf20Sopenharmony_ci8:	cmp/hi	r2,r0		!  57 MT
838c2ecf20Sopenharmony_ci	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
848c2ecf20Sopenharmony_ci
858c2ecf20Sopenharmony_ci	bt/s	8b		! 109 BR
868c2ecf20Sopenharmony_ci
878c2ecf20Sopenharmony_ci	 mov.b	r1,@-r0		!  29 LS
888c2ecf20Sopenharmony_ci
898c2ecf20Sopenharmony_ci9:	rts
908c2ecf20Sopenharmony_ci	 nop
918c2ecf20Sopenharmony_ci
928c2ecf20Sopenharmony_ci
938c2ecf20Sopenharmony_ci	!
948c2ecf20Sopenharmony_ci	!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
958c2ecf20Sopenharmony_ci	!
968c2ecf20Sopenharmony_ci
978c2ecf20Sopenharmony_ci	! Size is 16 or greater, and may have trailing bytes
	!
	! .Lcase3 is selected by the dispatch code in memcpy when bits 0 and 1
	! of r5 are both set.  Register roles on entry:
	!   r0 = current dst pointer; the copy runs backwards towards r4
	!   r4 = dst start (only read here)
	!   r5 = offset such that r0+r5 addresses the matching source data
	! Each long stored to dst is built from two neighbouring source longs,
	! shifted by 8 and 24 bits in opposite directions and ORed in r3.
	! Instructions written with a leading space sit in a branch delay slot.
988c2ecf20Sopenharmony_ci
998c2ecf20Sopenharmony_ci	.balign	32
1008c2ecf20Sopenharmony_ci.Lcase3:
1018c2ecf20Sopenharmony_ci	! Read a long word and write a long word at once
1028c2ecf20Sopenharmony_ci	! At the start of each iteration, r7 contains last long load
1038c2ecf20Sopenharmony_ci	add	#-3,r5		! 79 EX
1048c2ecf20Sopenharmony_ci	mov	r4,r2		!  5 MT (0 cycles latency)
1058c2ecf20Sopenharmony_ci
1068c2ecf20Sopenharmony_ci	mov.l	@(r0,r5),r7	! 21 LS (2 cycles latency)
1078c2ecf20Sopenharmony_ci	add	#-4,r5		! 50 EX
1088c2ecf20Sopenharmony_ci
1098c2ecf20Sopenharmony_ci	add	#7,r2		!  79 EX  (r2 = r4+7: long-loop limit)
1108c2ecf20Sopenharmony_ci	!
1118c2ecf20Sopenharmony_ci#ifdef CONFIG_CPU_LITTLE_ENDIAN
1128c2ecf20Sopenharmony_ci	! 6 cycles, 4 bytes per iteration
	! The cmp/hi tests r0 before the store in the delay slot decrements
	! it, so the loop repeats while r0 > r4+7 held prior to that store.
1138c2ecf20Sopenharmony_ci3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
1148c2ecf20Sopenharmony_ci	mov	r7, r3		!   5 MT (latency=0)	! RQPO
1158c2ecf20Sopenharmony_ci
1168c2ecf20Sopenharmony_ci	cmp/hi	r2,r0		!  57 MT
1178c2ecf20Sopenharmony_ci	shll8	r3		! 102 EX		! QPOx
1188c2ecf20Sopenharmony_ci
1198c2ecf20Sopenharmony_ci	mov	r1,r6		!   5 MT (latency=0)
1208c2ecf20Sopenharmony_ci	shlr16	r6		! 107 EX
1218c2ecf20Sopenharmony_ci
1228c2ecf20Sopenharmony_ci	shlr8	r6		! 106 EX		! xxxN
1238c2ecf20Sopenharmony_ci	mov	r1, r7		!   5 MT (latency=0)
1248c2ecf20Sopenharmony_ci
1258c2ecf20Sopenharmony_ci	or	r6,r3		!  82 EX		! QPON
1268c2ecf20Sopenharmony_ci	bt/s	3b		! 109 BR
1278c2ecf20Sopenharmony_ci
1288c2ecf20Sopenharmony_ci	 mov.l	r3,@-r0		!  30 LS
1298c2ecf20Sopenharmony_ci#else
	! Big-endian variant.  Unlike the little-endian loop, the new long is
	! loaded straight into r7 (no r1 temporary) and the loop test comes
	! last, immediately before the branch.
1308c2ecf20Sopenharmony_ci3:	mov	r7,r3		! OPQR
1318c2ecf20Sopenharmony_ci	shlr8	r3		! xOPQ
1328c2ecf20Sopenharmony_ci	mov.l	@(r0,r5),r7	! KLMN
1338c2ecf20Sopenharmony_ci	mov	r7,r6
1348c2ecf20Sopenharmony_ci	shll16	r6
1358c2ecf20Sopenharmony_ci	shll8	r6		! Nxxx
1368c2ecf20Sopenharmony_ci	or	r6,r3		! NOPQ
1378c2ecf20Sopenharmony_ci	cmp/hi	r2,r0		! r0 > r4+7 before the store below?
1388c2ecf20Sopenharmony_ci	bt/s	3b
1398c2ecf20Sopenharmony_ci	 mov.l	r3,@-r0
1408c2ecf20Sopenharmony_ci#endif
1418c2ecf20Sopenharmony_ci
1428c2ecf20Sopenharmony_ci	! Finally, copy a byte at once, if necessary
	! r5 was lowered by 3 and then by 4 above; adding 6 leaves it at
	! (entry value - 1), the bias used for byte loads paired with @-r0.
	! r2 becomes r4+1, the limit for the byte loop.
1438c2ecf20Sopenharmony_ci
1448c2ecf20Sopenharmony_ci	add	#6,r5		!  50 EX
1458c2ecf20Sopenharmony_ci	cmp/eq	r4,r0		!  54 MT  (nothing left to copy?)
1468c2ecf20Sopenharmony_ci
1478c2ecf20Sopenharmony_ci	add	#-6,r2		!  50 EX
1488c2ecf20Sopenharmony_ci	bt	9f		! 109 BR
1498c2ecf20Sopenharmony_ci
1508c2ecf20Sopenharmony_ci8:	cmp/hi	r2,r0		!  57 MT
1518c2ecf20Sopenharmony_ci	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
1528c2ecf20Sopenharmony_ci
1538c2ecf20Sopenharmony_ci	bt/s	8b		! 109 BR
1548c2ecf20Sopenharmony_ci
1558c2ecf20Sopenharmony_ci	 mov.b	r1,@-r0		!  29 LS
1568c2ecf20Sopenharmony_ci
1578c2ecf20Sopenharmony_ci9:	rts
1588c2ecf20Sopenharmony_ci	 nop
1598c2ecf20Sopenharmony_ci
1608c2ecf20Sopenharmony_ciENTRY(memcpy)
	! Entry point: handles zero length, the fully aligned fast path,
	! small copies, destination alignment and the dispatch to the bulk
	! copy variants.  Register usage as established by the code below:
	!   r4 = dst (only read in this section)
	!   r5 = src on entry, rewritten to (src - dst)
	!   r6 = byte count
	!   r0 = dst + count; data is stored backwards with @-r0
	! The zero-length exit (label 99) returns r4 in r0.
	! Instructions written with a leading space sit in a branch delay slot.
1618c2ecf20Sopenharmony_ci
1628c2ecf20Sopenharmony_ci	! Calculate the invariants which will be used in the remainder
1638c2ecf20Sopenharmony_ci	! of the code:
1648c2ecf20Sopenharmony_ci	!
1658c2ecf20Sopenharmony_ci	!      r4   -->  [ ...  ] DST             [ ...  ] SRC
1668c2ecf20Sopenharmony_ci	!	         [ ...  ]                 [ ...  ]
1678c2ecf20Sopenharmony_ci	!	           :                        :
1688c2ecf20Sopenharmony_ci	!      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
1698c2ecf20Sopenharmony_ci	!
1708c2ecf20Sopenharmony_ci	!
1718c2ecf20Sopenharmony_ci
1728c2ecf20Sopenharmony_ci	! Short circuit the common case of src, dst and len being 32 bit aligned
1738c2ecf20Sopenharmony_ci	! and test for zero length move
	! r0 = r6 | r4 | r5, so its low two bits are clear only when all three
	! values are multiples of 4.
1748c2ecf20Sopenharmony_ci
1758c2ecf20Sopenharmony_ci	mov	r6, r0		!   5 MT (0 cycle latency)
1768c2ecf20Sopenharmony_ci	or	r4, r0		!  82 EX
1778c2ecf20Sopenharmony_ci
1788c2ecf20Sopenharmony_ci	or	r5, r0		!  82 EX
1798c2ecf20Sopenharmony_ci	tst	r6, r6		!  86 MT
1808c2ecf20Sopenharmony_ci
1818c2ecf20Sopenharmony_ci	bt/s	99f		! 111 BR		(zero len)
1828c2ecf20Sopenharmony_ci	 tst	#3, r0		!  87 MT
	! T now reflects the alignment test and is consumed by the bt/s to
	! .Lcase00 below; the mov/add in between are expected to leave T alone.
1838c2ecf20Sopenharmony_ci
1848c2ecf20Sopenharmony_ci	mov	r4, r0		!   5 MT (0 cycle latency)
1858c2ecf20Sopenharmony_ci	add	r6, r0		!  49 EX  (r0 = dst + len)
1868c2ecf20Sopenharmony_ci
1878c2ecf20Sopenharmony_ci	mov	#16, r1		!   6 EX  ('small' threshold, see below)
1888c2ecf20Sopenharmony_ci	bt/s	.Lcase00	! 111 BR		(aligned)
1898c2ecf20Sopenharmony_ci
1908c2ecf20Sopenharmony_ci	 sub	r4, r5		!  75 EX  (r5 = src - dst)
1918c2ecf20Sopenharmony_ci
1928c2ecf20Sopenharmony_ci	! Arguments are not nicely long word aligned or zero len.
1938c2ecf20Sopenharmony_ci	! Check for small copies, and if so do a simple byte at a time copy.
1948c2ecf20Sopenharmony_ci	!
1958c2ecf20Sopenharmony_ci	! Deciding on an exact value of 'small' is not easy, as the point at which
1968c2ecf20Sopenharmony_ci	! using the optimised routines become worthwhile varies (these are the
1978c2ecf20Sopenharmony_ci	! cycle counts for different sizes using byte-at-a-time vs. optimised):
1988c2ecf20Sopenharmony_ci	!	size	byte-at-time	long	word	byte
1998c2ecf20Sopenharmony_ci	!	16	42		39-40	46-50	50-55
2008c2ecf20Sopenharmony_ci	!	24	58		43-44	54-58	62-67
2018c2ecf20Sopenharmony_ci	!	36	82		49-50	66-70	80-85
2028c2ecf20Sopenharmony_ci	! However the penalty for getting it 'wrong' is much higher for long word
2038c2ecf20Sopenharmony_ci	! aligned data (and this is more common), so use a value of 16.
2048c2ecf20Sopenharmony_ci
2058c2ecf20Sopenharmony_ci	cmp/gt	r6,r1		!  56 MT  (T = 16 > len)
2068c2ecf20Sopenharmony_ci
2078c2ecf20Sopenharmony_ci	add	#-1,r5		!  50 EX  (bias for byte loads with @-r0)
2088c2ecf20Sopenharmony_ci	bf/s	6f		! 108 BR		(not small)
2098c2ecf20Sopenharmony_ci
2108c2ecf20Sopenharmony_ci	 mov	r5, r3		!   5 MT (latency=0)
	! Small copy: r6 becomes the number of byte pairs and T receives the
	! old bit 0 of the length.  For an even length the loop is entered at
	! label 4 with the last byte already in r1; for an odd length one
	! byte is stored first and the copy ends at once if no pair remains.
	! r3 = r5-1 addresses the second byte of each pair.
2118c2ecf20Sopenharmony_ci	shlr	r6		! 104 EX
2128c2ecf20Sopenharmony_ci
2138c2ecf20Sopenharmony_ci	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
2148c2ecf20Sopenharmony_ci	bf/s	4f		! 111 BR
2158c2ecf20Sopenharmony_ci
2168c2ecf20Sopenharmony_ci	 add	#-1,r3		!  50 EX
2178c2ecf20Sopenharmony_ci	tst	r6, r6		!  86 MT
2188c2ecf20Sopenharmony_ci
2198c2ecf20Sopenharmony_ci	bt/s	98f		! 110 BR
2208c2ecf20Sopenharmony_ci	 mov.b	r1,@-r0		!  29 LS
2218c2ecf20Sopenharmony_ci
2228c2ecf20Sopenharmony_ci	! 4 cycles, 2 bytes per iteration
2238c2ecf20Sopenharmony_ci3:	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
2248c2ecf20Sopenharmony_ci
2258c2ecf20Sopenharmony_ci4:	mov.b	@(r0,r3),r2	!  20 LS (latency=2)
2268c2ecf20Sopenharmony_ci	dt	r6		!  67 EX
2278c2ecf20Sopenharmony_ci
2288c2ecf20Sopenharmony_ci	mov.b	r1,@-r0		!  29 LS
2298c2ecf20Sopenharmony_ci	bf/s	3b		! 111 BR
2308c2ecf20Sopenharmony_ci
2318c2ecf20Sopenharmony_ci	 mov.b	r2,@-r0		!  29 LS
2328c2ecf20Sopenharmony_ci98:
2338c2ecf20Sopenharmony_ci	rts
2348c2ecf20Sopenharmony_ci	 nop
2358c2ecf20Sopenharmony_ci
	! Zero-length exit: return dst in r0.
2368c2ecf20Sopenharmony_ci99:	rts
2378c2ecf20Sopenharmony_ci	 mov	r4, r0
2388c2ecf20Sopenharmony_ci
2398c2ecf20Sopenharmony_ci	! Size is not small, so it's worthwhile looking for optimisations.
2408c2ecf20Sopenharmony_ci	! First align destination to a long word boundary.
2418c2ecf20Sopenharmony_ci	!
2428c2ecf20Sopenharmony_ci	! r5 = normal value -1
	! r3 = r0 & 3 is the number of bytes copied singly to bring r0 down
	! to a long word boundary; r6 is decremented once per byte.
2438c2ecf20Sopenharmony_ci
2448c2ecf20Sopenharmony_ci6:	tst	#3, r0		!  87 MT
2458c2ecf20Sopenharmony_ci        mov	#3, r3		!   6 EX
2468c2ecf20Sopenharmony_ci
2478c2ecf20Sopenharmony_ci	bt/s	2f		! 111 BR
2488c2ecf20Sopenharmony_ci	 and	r0,r3		!  78 EX
2498c2ecf20Sopenharmony_ci
2508c2ecf20Sopenharmony_ci	! 3 cycles, 1 byte per iteration
2518c2ecf20Sopenharmony_ci1:	dt	r3		!  67 EX
2528c2ecf20Sopenharmony_ci	mov.b	@(r0,r5),r1	!  19 LS (latency=2)
2538c2ecf20Sopenharmony_ci
2548c2ecf20Sopenharmony_ci	add	#-1, r6		!  79 EX
2558c2ecf20Sopenharmony_ci	bf/s	1b		! 109 BR
2568c2ecf20Sopenharmony_ci
2578c2ecf20Sopenharmony_ci	 mov.b	r1,@-r0		!  28 LS
2588c2ecf20Sopenharmony_ci
2598c2ecf20Sopenharmony_ci2:	add	#1, r5		!  79 EX  (r5 back to its normal value)
2608c2ecf20Sopenharmony_ci
2618c2ecf20Sopenharmony_ci	! Now select the appropriate bulk transfer code based on relative
2628c2ecf20Sopenharmony_ci	! alignment of src and dst.
	!
	! The low two bits of r5 select the variant; for even offsets the
	! remaining length r6 is also compared against 64:
	!   bits 1:0 = 00 -> .Lcase0 (r6 < 64)  or .Lcase0b (r6 >= 64)
	!   bits 1:0 = 10 -> .Lcase2 (r6 < 64)  or .Lcase2b (r6 >= 64)
	!   bits 1:0 = 01 -> .Lcase1
	!   bits 1:0 = 11 -> .Lcase3
	! r0 is borrowed for the tst #imm tests, so the dst pointer is parked
	! in r3 and copied back by the mov in each bt/s delay slot, which
	! runs on both the taken and the fall-through path.
2638c2ecf20Sopenharmony_ci
2648c2ecf20Sopenharmony_ci	mov	r0, r3		!   5 MT (latency=0)
2658c2ecf20Sopenharmony_ci
2668c2ecf20Sopenharmony_ci	mov	r5, r0		!   5 MT (latency=0)
2678c2ecf20Sopenharmony_ci	tst	#1, r0		!  87 MT
2688c2ecf20Sopenharmony_ci
2698c2ecf20Sopenharmony_ci	bf/s	1f		! 111 BR
2708c2ecf20Sopenharmony_ci	 mov	#64, r7		!   6 EX
2718c2ecf20Sopenharmony_ci
2728c2ecf20Sopenharmony_ci	! bit 0 clear
2738c2ecf20Sopenharmony_ci
2748c2ecf20Sopenharmony_ci	cmp/ge	r7, r6		!  55 MT  (T = r6 >= 64)
2758c2ecf20Sopenharmony_ci
2768c2ecf20Sopenharmony_ci	bt/s	2f		! 111 BR
2778c2ecf20Sopenharmony_ci	 tst	#2, r0		!  87 MT
2788c2ecf20Sopenharmony_ci
2798c2ecf20Sopenharmony_ci	! small
2808c2ecf20Sopenharmony_ci	bt/s	.Lcase0
2818c2ecf20Sopenharmony_ci	 mov	r3, r0
2828c2ecf20Sopenharmony_ci
2838c2ecf20Sopenharmony_ci	bra	.Lcase2
2848c2ecf20Sopenharmony_ci	 nop
2858c2ecf20Sopenharmony_ci
2868c2ecf20Sopenharmony_ci	! big
2878c2ecf20Sopenharmony_ci2:	bt/s	.Lcase0b
2888c2ecf20Sopenharmony_ci	 mov	r3, r0
2898c2ecf20Sopenharmony_ci
2908c2ecf20Sopenharmony_ci	bra	.Lcase2b
2918c2ecf20Sopenharmony_ci	 nop
2928c2ecf20Sopenharmony_ci
2938c2ecf20Sopenharmony_ci	! bit 0 set
2948c2ecf20Sopenharmony_ci1:	tst	#2, r0		! 87 MT
2958c2ecf20Sopenharmony_ci
2968c2ecf20Sopenharmony_ci	bt/s	.Lcase1
2978c2ecf20Sopenharmony_ci	 mov	r3, r0
2988c2ecf20Sopenharmony_ci
2998c2ecf20Sopenharmony_ci	bra	.Lcase3
3008c2ecf20Sopenharmony_ci	 nop
3018c2ecf20Sopenharmony_ci
3028c2ecf20Sopenharmony_ci
3038c2ecf20Sopenharmony_ci	!
3048c2ecf20Sopenharmony_ci	!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
3058c2ecf20Sopenharmony_ci	!
3068c2ecf20Sopenharmony_ci
3078c2ecf20Sopenharmony_ci	! src, dst and size are all long word aligned
3088c2ecf20Sopenharmony_ci	! size is non-zero
	!
	! .Lcase00 is the fast path taken straight from the memcpy entry code.
	! Register roles on entry:
	!   r0 = dst + len, r4 = dst, r5 = src - dst, r6 = len
	! Lengths of 64 or more go to .Lcase00b.  Otherwise r6 is turned into
	! a count of long-word pairs and the data is copied backwards, two
	! longs per iteration, with one long stored up front when the number
	! of longs is odd.
3098c2ecf20Sopenharmony_ci
3108c2ecf20Sopenharmony_ci	.balign	32
3118c2ecf20Sopenharmony_ci.Lcase00:
3128c2ecf20Sopenharmony_ci	mov	#64, r1		!   6 EX
3138c2ecf20Sopenharmony_ci	mov	r5, r3		!   5 MT (latency=0)
3148c2ecf20Sopenharmony_ci
3158c2ecf20Sopenharmony_ci	cmp/gt	r6, r1		!  56 MT  (T = 64 > len)
3168c2ecf20Sopenharmony_ci	add	#-4, r5		!  50 EX  (bias for long loads with @-r0)
3178c2ecf20Sopenharmony_ci
	! Plain bf (no delay slot): the shlr2 below only runs on the
	! fall-through path.  .Lcase00b is entered with r5 already lowered
	! by 4.
3188c2ecf20Sopenharmony_ci	bf	.Lcase00b	! 108 BR		(big loop)
3198c2ecf20Sopenharmony_ci	shlr2	r6		! 105 EX
3208c2ecf20Sopenharmony_ci
	! r6 = number of long pairs; T = 1 if the number of longs was odd.
3218c2ecf20Sopenharmony_ci	shlr	r6		! 104 EX
3228c2ecf20Sopenharmony_ci	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
3238c2ecf20Sopenharmony_ci
3248c2ecf20Sopenharmony_ci	bf/s	4f		! 111 BR
3258c2ecf20Sopenharmony_ci	 add	#-8, r3		!  50 EX  (r3 = r5-4: second long of a pair)
3268c2ecf20Sopenharmony_ci
3278c2ecf20Sopenharmony_ci	tst	r6, r6		!  86 MT  (only a single long to copy?)
3288c2ecf20Sopenharmony_ci	bt/s	5f		! 110 BR
3298c2ecf20Sopenharmony_ci
3308c2ecf20Sopenharmony_ci	 mov.l	r1,@-r0		!  30 LS
3318c2ecf20Sopenharmony_ci
3328c2ecf20Sopenharmony_ci	! 4 cycles, 2 long words per iteration
3338c2ecf20Sopenharmony_ci3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
3348c2ecf20Sopenharmony_ci
3358c2ecf20Sopenharmony_ci4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
3368c2ecf20Sopenharmony_ci	dt	r6		!  67 EX
3378c2ecf20Sopenharmony_ci
3388c2ecf20Sopenharmony_ci	mov.l	r1, @-r0	!  30 LS
3398c2ecf20Sopenharmony_ci	bf/s	3b		! 109 BR
3408c2ecf20Sopenharmony_ci
3418c2ecf20Sopenharmony_ci	 mov.l	r2, @-r0	!  30 LS
3428c2ecf20Sopenharmony_ci
3438c2ecf20Sopenharmony_ci5:	rts
3448c2ecf20Sopenharmony_ci	 nop
3458c2ecf20Sopenharmony_ci
3468c2ecf20Sopenharmony_ci
3478c2ecf20Sopenharmony_ci	! Size is 16 or greater and less than 64, but may have trailing bytes
	!
	! .Lcase0 is selected by the dispatch code in memcpy when the low two
	! bits of r5 are clear and r6 < 64.  Register roles on entry:
	!   r0 = current dst pointer; the copy runs backwards towards r4
	!   r4 = dst start, r5 = src - dst, r6 = bytes remaining
	! Longs are copied in pairs (one long first if bit 2 of r6 is set),
	! then the last 0-3 bytes are copied singly.
3488c2ecf20Sopenharmony_ci
3498c2ecf20Sopenharmony_ci	.balign	32
3508c2ecf20Sopenharmony_ci.Lcase0:
3518c2ecf20Sopenharmony_ci	add	#-4, r5		!  50 EX  (bias for long loads with @-r0)
3528c2ecf20Sopenharmony_ci	mov	r4, r7		!   5 MT (latency=0)
3538c2ecf20Sopenharmony_ci
3548c2ecf20Sopenharmony_ci	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
3558c2ecf20Sopenharmony_ci	mov	#4, r2		!   6 EX
3568c2ecf20Sopenharmony_ci
3578c2ecf20Sopenharmony_ci	add	#11, r7		!  50 EX  (r7 = r4+11: pair-loop limit)
3588c2ecf20Sopenharmony_ci	tst	r2, r6		!  86 MT  (T = bit 2 of r6 clear)
3598c2ecf20Sopenharmony_ci
3608c2ecf20Sopenharmony_ci	mov	r5, r3		!   5 MT (latency=0)
3618c2ecf20Sopenharmony_ci	bt/s	4f		! 111 BR
3628c2ecf20Sopenharmony_ci
3638c2ecf20Sopenharmony_ci	 add	#-4, r3		!  50 EX  (r3 = r5-4: second long of a pair)
3648c2ecf20Sopenharmony_ci	mov.l	r1,@-r0		!  30 LS
3658c2ecf20Sopenharmony_ci
3668c2ecf20Sopenharmony_ci	! 4 cycles, 2 long words per iteration
	! cmp/hi tests r0 before the two stores lower it by 8, so the loop
	! repeats while r0 > r4+11 held prior to those stores.
3678c2ecf20Sopenharmony_ci3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
3688c2ecf20Sopenharmony_ci
3698c2ecf20Sopenharmony_ci4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
3708c2ecf20Sopenharmony_ci	cmp/hi	r7, r0
3718c2ecf20Sopenharmony_ci
3728c2ecf20Sopenharmony_ci	mov.l	r1, @-r0	!  30 LS
3738c2ecf20Sopenharmony_ci	bt/s	3b		! 109 BR
3748c2ecf20Sopenharmony_ci
3758c2ecf20Sopenharmony_ci	 mov.l	r2, @-r0	!  30 LS
3768c2ecf20Sopenharmony_ci
3778c2ecf20Sopenharmony_ci	! Copy the final 0-3 bytes
	! r5 goes from (normal - 4) to (normal - 1) for byte loads, and r7
	! becomes r4+1, the limit for the byte loop.
3788c2ecf20Sopenharmony_ci
3798c2ecf20Sopenharmony_ci	add	#3,r5		!  50 EX
3808c2ecf20Sopenharmony_ci
3818c2ecf20Sopenharmony_ci	cmp/eq	r0, r4		!  54 MT  (nothing left to copy?)
3828c2ecf20Sopenharmony_ci	add	#-10, r7	!  50 EX
3838c2ecf20Sopenharmony_ci
3848c2ecf20Sopenharmony_ci	bt	9f		! 110 BR
3858c2ecf20Sopenharmony_ci
3868c2ecf20Sopenharmony_ci	! 3 cycles, 1 byte per iteration
3878c2ecf20Sopenharmony_ci1:	mov.b	@(r0,r5),r1	!  19 LS
3888c2ecf20Sopenharmony_ci	cmp/hi	r7,r0		!  57 MT
3898c2ecf20Sopenharmony_ci
3908c2ecf20Sopenharmony_ci	bt/s	1b		! 111 BR
3918c2ecf20Sopenharmony_ci	 mov.b	r1,@-r0		!  28 LS
3928c2ecf20Sopenharmony_ci
3938c2ecf20Sopenharmony_ci9:	rts
3948c2ecf20Sopenharmony_ci	 nop
3958c2ecf20Sopenharmony_ci
3968c2ecf20Sopenharmony_ci	! Size is at least 64 bytes, so will be going round the big loop at least once.
3978c2ecf20Sopenharmony_ci	!
3988c2ecf20Sopenharmony_ci	!   r2 = rounded up r4
3998c2ecf20Sopenharmony_ci	!   r3 = rounded down r0
	!
	! Two entry points share this code:
	!   .Lcase0b  - from the dispatch code in memcpy (low two bits of r5
	!               clear, r6 >= 64); applies the -4 bias to r5 itself.
	!   .Lcase00b - from .Lcase00, which has already lowered r5 by 4.
	! Register roles: r0 = current dst pointer (moving down towards r4),
	! r4 = dst start, r5 = (src - dst) - 4.
	! Three phases: copy longs until r0 is a multiple of 32, copy 32-byte
	! blocks until r0 reaches r2, then copy whatever is left above r4.
	! r8-r11 are saved on the stack around the block loop.
4008c2ecf20Sopenharmony_ci
4018c2ecf20Sopenharmony_ci	.balign	32
4028c2ecf20Sopenharmony_ci.Lcase0b:
4038c2ecf20Sopenharmony_ci	add	#-4, r5		!  50 EX
4048c2ecf20Sopenharmony_ci
4058c2ecf20Sopenharmony_ci.Lcase00b:
4068c2ecf20Sopenharmony_ci	mov	r0, r3		!   5 MT (latency=0)
4078c2ecf20Sopenharmony_ci	mov	#(~0x1f), r1	!   6 EX
4088c2ecf20Sopenharmony_ci
4098c2ecf20Sopenharmony_ci	and	r1, r3		!  78 EX
4108c2ecf20Sopenharmony_ci	mov	r4, r2		!   5 MT (latency=0)
4118c2ecf20Sopenharmony_ci
4128c2ecf20Sopenharmony_ci	cmp/eq	r3, r0		!  54 MT  (r0 already a multiple of 32?)
4138c2ecf20Sopenharmony_ci	add	#0x1f, r2	!  50 EX
4148c2ecf20Sopenharmony_ci
4158c2ecf20Sopenharmony_ci	bt/s	1f		! 110 BR
4168c2ecf20Sopenharmony_ci	 and	r1, r2		!  78 EX
4178c2ecf20Sopenharmony_ci
4188c2ecf20Sopenharmony_ci	! copy initial words until cache line aligned
	! r6 = r5-4 addresses the second long of each pair.  r3 is raised by
	! 8 so that the cmp/eq in the pair loop, which runs before the two
	! stores, detects the final pair.  If bit 2 of r0 is set, one long
	! is stored first; tst #0x18 then tells whether that single long
	! was all that was needed.
4198c2ecf20Sopenharmony_ci
4208c2ecf20Sopenharmony_ci	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
4218c2ecf20Sopenharmony_ci	tst	#4, r0		!  87 MT
4228c2ecf20Sopenharmony_ci
4238c2ecf20Sopenharmony_ci	mov	r5, r6		!   5 MT (latency=0)
4248c2ecf20Sopenharmony_ci	add	#-4, r6		!  50 EX
4258c2ecf20Sopenharmony_ci
4268c2ecf20Sopenharmony_ci	bt/s	4f		! 111 BR
4278c2ecf20Sopenharmony_ci	 add	#8, r3		!  50 EX
4288c2ecf20Sopenharmony_ci
4298c2ecf20Sopenharmony_ci	tst	#0x18, r0	!  87 MT
4308c2ecf20Sopenharmony_ci
4318c2ecf20Sopenharmony_ci	bt/s	1f		! 109 BR
4328c2ecf20Sopenharmony_ci	 mov.l	r1,@-r0		!  30 LS
4338c2ecf20Sopenharmony_ci
4348c2ecf20Sopenharmony_ci	! 4 cycles, 2 long words per iteration
4358c2ecf20Sopenharmony_ci3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
4368c2ecf20Sopenharmony_ci
4378c2ecf20Sopenharmony_ci4:	mov.l	@(r0, r6), r7	!  21 LS (latency=2)
4388c2ecf20Sopenharmony_ci	cmp/eq	r3, r0		!  54 MT
4398c2ecf20Sopenharmony_ci
4408c2ecf20Sopenharmony_ci	mov.l	r1, @-r0	!  30 LS
4418c2ecf20Sopenharmony_ci	bf/s	3b		! 109 BR
4428c2ecf20Sopenharmony_ci
4438c2ecf20Sopenharmony_ci	 mov.l	r7, @-r0	!  30 LS
4448c2ecf20Sopenharmony_ci
4458c2ecf20Sopenharmony_ci	! Copy the cache line aligned blocks
4468c2ecf20Sopenharmony_ci	!
4478c2ecf20Sopenharmony_ci	! In use: r0, r2, r4, r5
4488c2ecf20Sopenharmony_ci	! Scratch: r1, r3, r6, r7
4498c2ecf20Sopenharmony_ci	!
4508c2ecf20Sopenharmony_ci	! We could do this with the four scratch registers, but if src
4518c2ecf20Sopenharmony_ci	! and dest hit the same cache line, this will thrash, so make
4528c2ecf20Sopenharmony_ci	! use of additional registers.
4538c2ecf20Sopenharmony_ci	!
4548c2ecf20Sopenharmony_ci	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
4558c2ecf20Sopenharmony_ci	!   r5:	 src (was r0+r5)
4568c2ecf20Sopenharmony_ci	!   r1:	 dest (was r0)
4578c2ecf20Sopenharmony_ci	! this can be reversed at the end, so we don't need to save any extra
4588c2ecf20Sopenharmony_ci	! state.
4598c2ecf20Sopenharmony_ci	!
	! After the two adjustments below (add r0 and add #-0x1c), r5 holds
	! the address of the 32-byte source block that matches the
	! destination block just below r1.
4608c2ecf20Sopenharmony_ci1:	mov.l	r8, @-r15	!  30 LS
4618c2ecf20Sopenharmony_ci	add	r0, r5		!  49 EX
4628c2ecf20Sopenharmony_ci
4638c2ecf20Sopenharmony_ci	mov.l	r9, @-r15	!  30 LS
4648c2ecf20Sopenharmony_ci	mov	r0, r1		!   5 MT (latency=0)
4658c2ecf20Sopenharmony_ci
4668c2ecf20Sopenharmony_ci	mov.l	r10, @-r15	!  30 LS
4678c2ecf20Sopenharmony_ci	add	#-0x1c, r5	!  50 EX
4688c2ecf20Sopenharmony_ci
4698c2ecf20Sopenharmony_ci	mov.l	r11, @-r15	!  30 LS
4708c2ecf20Sopenharmony_ci
4718c2ecf20Sopenharmony_ci	! 16 cycles, 32 bytes per iteration
	! All eight longs are loaded before any is stored; the loop ends when
	! r1 has come down to r2.
4728c2ecf20Sopenharmony_ci2:	mov.l	@(0x00,r5),r0	! 18 LS (latency=2)
4738c2ecf20Sopenharmony_ci	add	#-0x20, r1	! 50 EX
4748c2ecf20Sopenharmony_ci	mov.l	@(0x04,r5),r3	! 18 LS (latency=2)
4758c2ecf20Sopenharmony_ci	mov.l	@(0x08,r5),r6	! 18 LS (latency=2)
4768c2ecf20Sopenharmony_ci	mov.l	@(0x0c,r5),r7	! 18 LS (latency=2)
4778c2ecf20Sopenharmony_ci	mov.l	@(0x10,r5),r8	! 18 LS (latency=2)
4788c2ecf20Sopenharmony_ci	mov.l	@(0x14,r5),r9	! 18 LS (latency=2)
4798c2ecf20Sopenharmony_ci	mov.l	@(0x18,r5),r10	! 18 LS (latency=2)
4808c2ecf20Sopenharmony_ci	mov.l	@(0x1c,r5),r11	! 18 LS (latency=2)
4818c2ecf20Sopenharmony_ci	movca.l	r0,@r1		! 40 LS (latency=3-7)
4828c2ecf20Sopenharmony_ci	mov.l	r3,@(0x04,r1)	! 33 LS
4838c2ecf20Sopenharmony_ci	mov.l	r6,@(0x08,r1)	! 33 LS
4848c2ecf20Sopenharmony_ci	mov.l	r7,@(0x0c,r1)	! 33 LS
4858c2ecf20Sopenharmony_ci
4868c2ecf20Sopenharmony_ci	mov.l	r8,@(0x10,r1)	! 33 LS
4878c2ecf20Sopenharmony_ci	add	#-0x20, r5	! 50 EX
4888c2ecf20Sopenharmony_ci
4898c2ecf20Sopenharmony_ci	mov.l	r9,@(0x14,r1)	! 33 LS
4908c2ecf20Sopenharmony_ci	cmp/eq	r2,r1		! 54 MT
4918c2ecf20Sopenharmony_ci
4928c2ecf20Sopenharmony_ci	mov.l	r10,@(0x18,r1)	!  33 LS
4938c2ecf20Sopenharmony_ci	bf/s	2b		! 109 BR
4948c2ecf20Sopenharmony_ci
4958c2ecf20Sopenharmony_ci	 mov.l	r11,@(0x1c,r1)	!  33 LS
4968c2ecf20Sopenharmony_ci
	! Restore the r0 / r0+r5 invariant and pop r11..r8, interleaved with
	! the test for "no trailing bytes" (r0 == r4).
4978c2ecf20Sopenharmony_ci	mov	r1, r0		!   5 MT (latency=0)
4988c2ecf20Sopenharmony_ci
4998c2ecf20Sopenharmony_ci	mov.l	@r15+, r11	!  15 LS
5008c2ecf20Sopenharmony_ci	sub	r1, r5		!  75 EX
5018c2ecf20Sopenharmony_ci
5028c2ecf20Sopenharmony_ci	mov.l	@r15+, r10	!  15 LS
5038c2ecf20Sopenharmony_ci	cmp/eq	r4, r0		!  54 MT
5048c2ecf20Sopenharmony_ci
5058c2ecf20Sopenharmony_ci	bf/s	1f		! 109 BR
5068c2ecf20Sopenharmony_ci	 mov.l	 @r15+, r9	!  15 LS
5078c2ecf20Sopenharmony_ci
	! The pop of r8 does double duty: it is the delay slot of this rts
	! when the copy is complete, and the branch target (label 1) when
	! trailing bytes remain, in which case execution continues below.
5088c2ecf20Sopenharmony_ci	rts
5098c2ecf20Sopenharmony_ci1:	 mov.l	@r15+, r8	!  15 LS
5108c2ecf20Sopenharmony_ci	sub	r4, r1		!  75 EX		(len remaining)
5118c2ecf20Sopenharmony_ci
5128c2ecf20Sopenharmony_ci	! number of trailing bytes is non-zero
5138c2ecf20Sopenharmony_ci	!
5148c2ecf20Sopenharmony_ci	! invariants restored (r5 already decremented by 4)
5158c2ecf20Sopenharmony_ci	! also r1=num bytes remaining
5168c2ecf20Sopenharmony_ci
5178c2ecf20Sopenharmony_ci	mov	#4, r2		!   6 EX
5188c2ecf20Sopenharmony_ci	mov	r4, r7		!   5 MT (latency=0)
5198c2ecf20Sopenharmony_ci
5208c2ecf20Sopenharmony_ci	add	#0x1c, r5	!  50 EX		(back to -4)
5218c2ecf20Sopenharmony_ci	cmp/hs	r2, r1		!  58 MT  (T = r1 >= 4, unsigned)
5228c2ecf20Sopenharmony_ci
5238c2ecf20Sopenharmony_ci	bf/s	5f		! 108 BR
5248c2ecf20Sopenharmony_ci	 add	 #11, r7	!  50 EX  (r7 = r4+11: pair-loop limit)
5258c2ecf20Sopenharmony_ci
5268c2ecf20Sopenharmony_ci	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
5278c2ecf20Sopenharmony_ci	tst	r2, r1		!  86 MT  (T = bit 2 of r1 clear)
5288c2ecf20Sopenharmony_ci
5298c2ecf20Sopenharmony_ci	mov	r5, r3		!   5 MT (latency=0)
5308c2ecf20Sopenharmony_ci	bt/s	4f		! 111 BR
5318c2ecf20Sopenharmony_ci
5328c2ecf20Sopenharmony_ci	 add	#-4, r3		!  50 EX
	! NOTE(review): r2 is 4 and r1 >= 4 on this path (the bf/s 5f above
	! was not taken, and neither register has changed since), so this
	! compare always sets T and the bt/s below always branches to 5 after
	! storing one long.  Anything still left is then copied by the byte
	! loop, so the result is correct; confirm whether a different test
	! was intended so that the pair loop could be used here.
5338c2ecf20Sopenharmony_ci	cmp/hs	r2, r1		!  58 MT
5348c2ecf20Sopenharmony_ci
5358c2ecf20Sopenharmony_ci	bt/s	5f		! 111 BR
5368c2ecf20Sopenharmony_ci	 mov.l	r6,@-r0		!  30 LS
5378c2ecf20Sopenharmony_ci
5388c2ecf20Sopenharmony_ci	! 4 cycles, 2 long words per iteration
5398c2ecf20Sopenharmony_ci3:	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
5408c2ecf20Sopenharmony_ci
5418c2ecf20Sopenharmony_ci4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
5428c2ecf20Sopenharmony_ci	cmp/hi	r7, r0
5438c2ecf20Sopenharmony_ci
5448c2ecf20Sopenharmony_ci	mov.l	r6, @-r0	!  30 LS
5458c2ecf20Sopenharmony_ci	bt/s	3b		! 109 BR
5468c2ecf20Sopenharmony_ci
5478c2ecf20Sopenharmony_ci	 mov.l	r2, @-r0	!  30 LS
5488c2ecf20Sopenharmony_ci
5498c2ecf20Sopenharmony_ci	! Copy the final 0-3 bytes
	! r7 becomes r4+1, the limit for the byte loop.  The add to r5
	! follows a plain bt (no delay slot), so it only runs when bytes
	! remain; it moves r5 from (normal - 4) to (normal - 1).
5508c2ecf20Sopenharmony_ci
5518c2ecf20Sopenharmony_ci5:	cmp/eq	r0, r4		!  54 MT
5528c2ecf20Sopenharmony_ci	add	#-10, r7	!  50 EX
5538c2ecf20Sopenharmony_ci
5548c2ecf20Sopenharmony_ci	bt	9f		! 110 BR
5558c2ecf20Sopenharmony_ci	add	#3,r5		!  50 EX
5568c2ecf20Sopenharmony_ci
5578c2ecf20Sopenharmony_ci	! 3 cycles, 1 byte per iteration
5588c2ecf20Sopenharmony_ci1:	mov.b	@(r0,r5),r1	!  19 LS
5598c2ecf20Sopenharmony_ci	cmp/hi	r7,r0		!  57 MT
5608c2ecf20Sopenharmony_ci
5618c2ecf20Sopenharmony_ci	bt/s	1b		! 111 BR
5628c2ecf20Sopenharmony_ci	 mov.b	r1,@-r0		!  28 LS
5638c2ecf20Sopenharmony_ci
5648c2ecf20Sopenharmony_ci9:	rts
5658c2ecf20Sopenharmony_ci	 nop
5668c2ecf20Sopenharmony_ci
5678c2ecf20Sopenharmony_ci	!
5688c2ecf20Sopenharmony_ci	!	GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
5698c2ecf20Sopenharmony_ci	!
	! .Lcase2 is selected by the dispatch code in memcpy when bit 0 of r5
	! is clear, bit 1 is set and r6 < 64.  Register roles on entry:
	!   r0 = current dst pointer; the copy runs backwards towards r4
	!   r4 = dst start, r5 = src - dst
	! Data is moved as 16-bit words, two per iteration, using offsets
	! r5-2 (kept in r5) and r5-4 (kept in r6).  The tail is handled at
	! label 10, which lies outside this part of the file.
5708c2ecf20Sopenharmony_ci
5718c2ecf20Sopenharmony_ci	.balign	32
5728c2ecf20Sopenharmony_ci.Lcase2:
5738c2ecf20Sopenharmony_ci	! Size is 16 or greater and less than 64, but may have trailing bytes
5748c2ecf20Sopenharmony_ci
5758c2ecf20Sopenharmony_ci2:	mov	r5, r6		!   5 MT (latency=0)
5768c2ecf20Sopenharmony_ci	add	#-2,r5		!  50 EX
5778c2ecf20Sopenharmony_ci
5788c2ecf20Sopenharmony_ci	mov	r4,r2		!   5 MT (latency=0)
5798c2ecf20Sopenharmony_ci	add	#-4,r6		!  50 EX
5808c2ecf20Sopenharmony_ci
5818c2ecf20Sopenharmony_ci	add	#7,r2		!  50 EX  (r2 = r4+7: loop limit)
	! cmp/hi tests r0 before the two word stores lower it by 4, so the
	! loop repeats while r0 > r4+7 held prior to those stores.
5828c2ecf20Sopenharmony_ci3:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
5838c2ecf20Sopenharmony_ci
5848c2ecf20Sopenharmony_ci	mov.w	@(r0,r6),r3	!  20 LS (latency=2)
5858c2ecf20Sopenharmony_ci	cmp/hi	r2,r0		!  57 MT
5868c2ecf20Sopenharmony_ci
5878c2ecf20Sopenharmony_ci	mov.w	r1,@-r0		!  29 LS
5888c2ecf20Sopenharmony_ci	bt/s	3b		! 111 BR
5898c2ecf20Sopenharmony_ci
5908c2ecf20Sopenharmony_ci	 mov.w	r3,@-r0		!  29 LS
5918c2ecf20Sopenharmony_ci
5928c2ecf20Sopenharmony_ci	bra	10f
5938c2ecf20Sopenharmony_ci	 nop
5948c2ecf20Sopenharmony_ci
5958c2ecf20Sopenharmony_ci
5968c2ecf20Sopenharmony_ci	.balign	32
5978c2ecf20Sopenharmony_ci.Lcase2b:
5988c2ecf20Sopenharmony_ci	! Size is at least 64 bytes, so will be going round the big loop at least once.
5998c2ecf20Sopenharmony_ci	!
6008c2ecf20Sopenharmony_ci	!   r2 = rounded up r4
6018c2ecf20Sopenharmony_ci	!   r3 = rounded down r0
6028c2ecf20Sopenharmony_ci
6038c2ecf20Sopenharmony_ci	mov	r0, r3		!   5 MT (latency=0)
6048c2ecf20Sopenharmony_ci	mov	#(~0x1f), r1	!   6 EX
6058c2ecf20Sopenharmony_ci
6068c2ecf20Sopenharmony_ci	and	r1, r3		!  78 EX
6078c2ecf20Sopenharmony_ci	mov	r4, r2		!   5 MT (latency=0)
6088c2ecf20Sopenharmony_ci
6098c2ecf20Sopenharmony_ci	cmp/eq	r3, r0		!  54 MT
6108c2ecf20Sopenharmony_ci	add	#0x1f, r2	!  50 EX
6118c2ecf20Sopenharmony_ci
6128c2ecf20Sopenharmony_ci	add	#-2, r5		!  50 EX
6138c2ecf20Sopenharmony_ci	bt/s	1f		! 110 BR
6148c2ecf20Sopenharmony_ci	 and	r1, r2		!  78 EX
6158c2ecf20Sopenharmony_ci
6168c2ecf20Sopenharmony_ci	! Copy a short word one at a time until we are cache line aligned
6178c2ecf20Sopenharmony_ci	!   Normal values: r0, r2, r3, r4
6188c2ecf20Sopenharmony_ci	!   Unused: r1, r6, r7
6198c2ecf20Sopenharmony_ci	!   Mod: r5 (=r5-2)
6208c2ecf20Sopenharmony_ci	!
6218c2ecf20Sopenharmony_ci	add	#2, r3		!  50 EX
6228c2ecf20Sopenharmony_ci
6238c2ecf20Sopenharmony_ci2:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
6248c2ecf20Sopenharmony_ci	cmp/eq	r3,r0		!  54 MT
6258c2ecf20Sopenharmony_ci
6268c2ecf20Sopenharmony_ci	bf/s	2b		! 111 BR
6278c2ecf20Sopenharmony_ci
6288c2ecf20Sopenharmony_ci	 mov.w	r1,@-r0		!  29 LS
6298c2ecf20Sopenharmony_ci
6308c2ecf20Sopenharmony_ci	! Copy the cache line aligned blocks
6318c2ecf20Sopenharmony_ci	!
6328c2ecf20Sopenharmony_ci	! In use: r0, r2, r4, r5 (=r5-2)
6338c2ecf20Sopenharmony_ci	! Scratch: r1, r3, r6, r7
6348c2ecf20Sopenharmony_ci	!
6358c2ecf20Sopenharmony_ci	! We could do this with the four scratch registers, but if src
6368c2ecf20Sopenharmony_ci	! and dest hit the same cache line, this will thrash, so make
6378c2ecf20Sopenharmony_ci	! use of additional registers.
6388c2ecf20Sopenharmony_ci	!
6398c2ecf20Sopenharmony_ci	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
6408c2ecf20Sopenharmony_ci	!   r5:	 src (was r0+r5)
6418c2ecf20Sopenharmony_ci	!   r1:	 dest (was r0)
6428c2ecf20Sopenharmony_ci	! this can be reversed at the end, so we don't need to save any extra
6438c2ecf20Sopenharmony_ci	! state.
6448c2ecf20Sopenharmony_ci	!
6458c2ecf20Sopenharmony_ci1:	mov.l	r8, @-r15	!  30 LS
6468c2ecf20Sopenharmony_ci	add	r0, r5		!  49 EX
6478c2ecf20Sopenharmony_ci
6488c2ecf20Sopenharmony_ci	mov.l	r9, @-r15	!  30 LS
6498c2ecf20Sopenharmony_ci	mov	r0, r1		!   5 MT (latency=0)
6508c2ecf20Sopenharmony_ci
6518c2ecf20Sopenharmony_ci	mov.l	r10, @-r15	!  30 LS
6528c2ecf20Sopenharmony_ci	add	#-0x1e, r5	!  50 EX
6538c2ecf20Sopenharmony_ci
6548c2ecf20Sopenharmony_ci	mov.l	r11, @-r15	!  30 LS
6558c2ecf20Sopenharmony_ci
6568c2ecf20Sopenharmony_ci	mov.l	r12, @-r15	!  30 LS
6578c2ecf20Sopenharmony_ci
6588c2ecf20Sopenharmony_ci	! 17 cycles, 32 bytes per iteration
6598c2ecf20Sopenharmony_ci#ifdef CONFIG_CPU_LITTLE_ENDIAN
/*
 * Little-endian variant of the 32-bytes-per-iteration copy loop.  The
 * destination pointer r1 moves downward and the loop ends when r1 == r2.
 *
 * Per iteration:
 *   - r1 is lowered by 0x20 and eight longwords are stored at
 *     r1+0x00 .. r1+0x1c.
 *   - 32 source bytes are read through r5 with post-increment: a leading
 *     halfword, seven longwords and a trailing halfword.  The later
 *     "add #-0x40, r5" takes back those 32 bytes plus 32 more, so r5 also
 *     finishes each iteration 0x20 lower than it started.
 *   - "xtrct Rm,Rn" replaces Rn with (Rm << 16) | (Rn >> 16): the low half
 *     of the next-higher load joined to the high half of this one.  That
 *     turns the halfword-offset source data into whole output longwords.
 *
 * mov.w sign-extends, but only the low 16 bits of r0 and r12 survive the
 * shll16 / xtrct that consume them.
 *
 * r0 carries the first output longword because movca.l only accepts r0 as
 * its source register.
 *
 * NOTE(review): r1, r2 and r5 are initialised by code before this loop that
 * is not visible here, and r8-r12 are presumably saved there (they are
 * popped after the loop) - confirm.  The mov.l loads need a longword-aligned
 * address, so r5 has to be two bytes past a longword boundary on entry.
 */
6608c2ecf20Sopenharmony_ci2:	mov.w	@r5+, r0	!  14 LS (latency=2)		..JI
6618c2ecf20Sopenharmony_ci	add	#-0x20, r1	!  50 EX
6628c2ecf20Sopenharmony_ci
6638c2ecf20Sopenharmony_ci	mov.l	@r5+, r3	!  15 LS (latency=2)		NMLK
6648c2ecf20Sopenharmony_ci
6658c2ecf20Sopenharmony_ci	mov.l	@r5+, r6	!  15 LS (latency=2)		RQPO
6668c2ecf20Sopenharmony_ci	shll16	r0		! 103 EX			JI..
6678c2ecf20Sopenharmony_ci
6688c2ecf20Sopenharmony_ci	mov.l	@r5+, r7	!  15 LS (latency=2)
6698c2ecf20Sopenharmony_ci	xtrct	r3, r0		!  48 EX			LKJI
6708c2ecf20Sopenharmony_ci
6718c2ecf20Sopenharmony_ci	mov.l	@r5+, r8	!  15 LS (latency=2)
6728c2ecf20Sopenharmony_ci	xtrct	r6, r3		!  48 EX			PONM
6738c2ecf20Sopenharmony_ci
6748c2ecf20Sopenharmony_ci	mov.l	@r5+, r9	!  15 LS (latency=2)
6758c2ecf20Sopenharmony_ci	xtrct	r7, r6		!  48 EX
6768c2ecf20Sopenharmony_ci
6778c2ecf20Sopenharmony_ci	mov.l	@r5+, r10	!  15 LS (latency=2)
6788c2ecf20Sopenharmony_ci	xtrct	r8, r7		!  48 EX
6798c2ecf20Sopenharmony_ci
6808c2ecf20Sopenharmony_ci	mov.l	@r5+, r11	!  15 LS (latency=2)
6818c2ecf20Sopenharmony_ci	xtrct	r9, r8		!  48 EX
6828c2ecf20Sopenharmony_ci
6838c2ecf20Sopenharmony_ci	mov.w	@r5+, r12	!  15 LS (latency=2)
6848c2ecf20Sopenharmony_ci	xtrct	r10, r9		!  48 EX
6858c2ecf20Sopenharmony_ci
	/*
	 * movca.l stores r0 and, on a cache miss, allocates the cache block
	 * without first reading it from memory; the seven mov.l stores that
	 * follow write the other 28 bytes.  NOTE(review): presumably r1 is
	 * 32-byte aligned here so the eight stores cover exactly one cache
	 * line - confirm against the setup code.
	 */
6868c2ecf20Sopenharmony_ci	movca.l	r0,@r1		!  40 LS (latency=3-7)
6878c2ecf20Sopenharmony_ci	xtrct	r11, r10	!  48 EX
6888c2ecf20Sopenharmony_ci
6898c2ecf20Sopenharmony_ci	mov.l	r3, @(0x04,r1)	!  33 LS
6908c2ecf20Sopenharmony_ci	xtrct	r12, r11	!  48 EX
6918c2ecf20Sopenharmony_ci
6928c2ecf20Sopenharmony_ci	mov.l	r6, @(0x08,r1)	!  33 LS
6938c2ecf20Sopenharmony_ci
6948c2ecf20Sopenharmony_ci	mov.l	r7, @(0x0c,r1)	!  33 LS
6958c2ecf20Sopenharmony_ci
6968c2ecf20Sopenharmony_ci	mov.l	r8, @(0x10,r1)	!  33 LS
6978c2ecf20Sopenharmony_ci	add	#-0x40, r5	!  50 EX
6988c2ecf20Sopenharmony_ci
6998c2ecf20Sopenharmony_ci	mov.l	r9, @(0x14,r1)	!  33 LS
7008c2ecf20Sopenharmony_ci	cmp/eq	r2,r1		!  54 MT
7018c2ecf20Sopenharmony_ci
	/*
	 * T from the cmp/eq above is still valid at the bf/s: the mov.l store
	 * in between does not modify it.  The final store sits in the bf/s
	 * delay slot, so it executes on every iteration including the last.
	 */
7028c2ecf20Sopenharmony_ci	mov.l	r10, @(0x18,r1)	!  33 LS
7038c2ecf20Sopenharmony_ci	bf/s	2b		! 109 BR
7048c2ecf20Sopenharmony_ci
7058c2ecf20Sopenharmony_ci	 mov.l	r11, @(0x1c,r1)	!  33 LS
7068c2ecf20Sopenharmony_ci#else
/*
 * Big-endian variant of the 32-bytes-per-iteration copy loop.  As in the
 * little-endian variant, r1 (destination) and r5 (source) each end an
 * iteration 0x20 lower than they started, and the loop ends when r1 == r2.
 *
 * Source data is read with displacement addressing, highest address first:
 * the halfword at r5+0x1e, then (after r5 -= 2) longwords at r5+0x1c down to
 * r5+0x00.  r5 is lowered by 2 here and by 0x1e further down.
 *
 * Output is produced from the top of the block downward: r1 is lowered by 4
 * for the movca.l store (r0 is the only source register movca.l accepts),
 * then by a further 0x1c, and the remaining seven longwords are stored at
 * r1+0x18 down to r1+0x00.
 *
 * "xtrct Rm,Rn" replaces Rn with (Rm << 16) | (Rn >> 16).  Only the low 16
 * bits of r12 are consumed (by "xtrct r12, r11"); the upper half of that
 * longword, i.e. the two bytes just below the 32 being copied in this
 * iteration, is loaded but discarded.
 *
 * NOTE(review): r1, r2 and r5 are initialised by code before this loop that
 * is not visible here - confirm the entry conditions there.
 */
7078c2ecf20Sopenharmony_ci2:	mov.w	@(0x1e,r5), r0	!  17 LS (latency=2)
7088c2ecf20Sopenharmony_ci	add	#-2, r5		!  50 EX
7098c2ecf20Sopenharmony_ci
7108c2ecf20Sopenharmony_ci	mov.l	@(0x1c,r5), r3	!  18 LS (latency=2)
7118c2ecf20Sopenharmony_ci	add	#-4, r1		!  50 EX
7128c2ecf20Sopenharmony_ci
7138c2ecf20Sopenharmony_ci	mov.l	@(0x18,r5), r6	!  18 LS (latency=2)
7148c2ecf20Sopenharmony_ci	shll16	r0		! 103 EX
7158c2ecf20Sopenharmony_ci
7168c2ecf20Sopenharmony_ci	mov.l	@(0x14,r5), r7	!  18 LS (latency=2)
7178c2ecf20Sopenharmony_ci	xtrct	r3, r0		!  48 EX
7188c2ecf20Sopenharmony_ci
7198c2ecf20Sopenharmony_ci	mov.l	@(0x10,r5), r8	!  18 LS (latency=2)
7208c2ecf20Sopenharmony_ci	xtrct	r6, r3		!  48 EX
7218c2ecf20Sopenharmony_ci
7228c2ecf20Sopenharmony_ci	mov.l	@(0x0c,r5), r9	!  18 LS (latency=2)
7238c2ecf20Sopenharmony_ci	xtrct	r7, r6		!  48 EX
7248c2ecf20Sopenharmony_ci
7258c2ecf20Sopenharmony_ci	mov.l	@(0x08,r5), r10	!  18 LS (latency=2)
7268c2ecf20Sopenharmony_ci	xtrct	r8, r7		!  48 EX
7278c2ecf20Sopenharmony_ci
7288c2ecf20Sopenharmony_ci	mov.l	@(0x04,r5), r11	!  18 LS (latency=2)
7298c2ecf20Sopenharmony_ci	xtrct	r9, r8		!  48 EX
7308c2ecf20Sopenharmony_ci
7318c2ecf20Sopenharmony_ci	mov.l   @(0x00,r5), r12 !  18 LS (latency=2)
7328c2ecf20Sopenharmony_ci    	xtrct	r10, r9		!  48 EX
7338c2ecf20Sopenharmony_ci
	/*
	 * movca.l writes the highest longword of the block (r1 was lowered by
	 * only 4 so far); the add that follows moves r1 to the bottom of the
	 * block for the displacement stores.
	 */
7348c2ecf20Sopenharmony_ci	movca.l	r0,@r1		!  40 LS (latency=3-7)
7358c2ecf20Sopenharmony_ci	add	#-0x1c, r1	!  50 EX
7368c2ecf20Sopenharmony_ci
7378c2ecf20Sopenharmony_ci	mov.l	r3, @(0x18,r1)	!  33 LS
7388c2ecf20Sopenharmony_ci	xtrct	r11, r10	!  48 EX
7398c2ecf20Sopenharmony_ci
7408c2ecf20Sopenharmony_ci	mov.l	r6, @(0x14,r1)	!  33 LS
7418c2ecf20Sopenharmony_ci	xtrct	r12, r11	!  48 EX
7428c2ecf20Sopenharmony_ci
7438c2ecf20Sopenharmony_ci	mov.l	r7, @(0x10,r1)	!  33 LS
7448c2ecf20Sopenharmony_ci
7458c2ecf20Sopenharmony_ci	mov.l	r8, @(0x0c,r1)	!  33 LS
7468c2ecf20Sopenharmony_ci	add	#-0x1e, r5	!  50 EX
7478c2ecf20Sopenharmony_ci
7488c2ecf20Sopenharmony_ci	mov.l	r9, @(0x08,r1)	!  33 LS
7498c2ecf20Sopenharmony_ci	cmp/eq	r2,r1		!  54 MT
7508c2ecf20Sopenharmony_ci
	/*
	 * T from the cmp/eq above is still valid at the bf/s: the mov.l store
	 * in between does not modify it.  The final store sits in the bf/s
	 * delay slot, so it executes on every iteration including the last.
	 */
7518c2ecf20Sopenharmony_ci	mov.l	r10, @(0x04,r1)	!  33 LS
7528c2ecf20Sopenharmony_ci	bf/s	2b		! 109 BR
7538c2ecf20Sopenharmony_ci
7548c2ecf20Sopenharmony_ci	 mov.l	r11, @(0x00,r1)	!  33 LS
7558c2ecf20Sopenharmony_ci#endif
7568c2ecf20Sopenharmony_ci
/*
 * Block-loop exit.  r12, r11, r10, r9 and r8 are popped from the stack in
 * that order (NOTE(review): the matching pushes are presumably just before
 * the block loop, outside this chunk - confirm), interleaved with the
 * arithmetic that switches from the absolute r1 / r5 pointers used by the
 * loop back to the r0 / r5 form the tail code needs:
 *   r0 = r1        current destination pointer
 *   r5 = r5 - r1   source pointer expressed as an offset from r0
 *
 * If r0 already equals r4 there is nothing left to copy and the routine
 * returns with r0 == r4.  NOTE(review): r4 is presumably still the dst
 * argument (first argument register in the SH calling convention), which
 * would make this the memcpy return value - confirm nothing earlier
 * modifies r4.
 *
 * The label "1:" is placed on the instruction in the rts delay slot, so the
 * r8 pop is shared by both paths:
 *   - bf/s not taken: rts executes with the r8 pop in its delay slot.
 *   - bf/s taken: execution lands on the r8 pop as an ordinary instruction
 *     and continues with the add below.
 * In both cases r9 has already been popped in the bf/s delay slot.
 */
7578c2ecf20Sopenharmony_ci	mov.l	@r15+, r12
7588c2ecf20Sopenharmony_ci	mov	r1, r0		!   5 MT (latency=0)
7598c2ecf20Sopenharmony_ci
7608c2ecf20Sopenharmony_ci	mov.l	@r15+, r11	!  15 LS
7618c2ecf20Sopenharmony_ci	sub	r1, r5		!  75 EX
7628c2ecf20Sopenharmony_ci
7638c2ecf20Sopenharmony_ci	mov.l	@r15+, r10	!  15 LS
7648c2ecf20Sopenharmony_ci	cmp/eq	r4, r0		!  54 MT
7658c2ecf20Sopenharmony_ci
7668c2ecf20Sopenharmony_ci	bf/s	1f		! 109 BR
7678c2ecf20Sopenharmony_ci	 mov.l	 @r15+, r9	!  15 LS
7688c2ecf20Sopenharmony_ci
7698c2ecf20Sopenharmony_ci	rts
7708c2ecf20Sopenharmony_ci1:	 mov.l	@r15+, r8	!  15 LS
7718c2ecf20Sopenharmony_ci
	/*
	 * Re-bias r5 by +0x1e so that r0 + r5 addresses the source halfword
	 * for destination r0 - 2, as the halfword loop below expects.
	 * NOTE(review): this presumably undoes an adjustment applied to r5
	 * when the block loop was entered, outside this chunk - confirm.
	 */
7728c2ecf20Sopenharmony_ci	add	#0x1e, r5	!  50 EX
7738c2ecf20Sopenharmony_ci
7748c2ecf20Sopenharmony_ci	! Finish off a short word at a time
7758c2ecf20Sopenharmony_ci	! r5 must be invariant - 2
/*
 * Tail copy.  r0 is the destination pointer, still moving down towards r4.
 * Each halfword is loaded from r0 + r5 and stored at r0 - 2 (pre-decrement),
 * which is what the "invariant - 2" requirement on r5 amounts to here
 * (TODO confirm against the place where the invariant is defined).
 *
 * r2 = r4 + 1, and cmp/hi is an unsigned "r0 > r2", so the halfword loop is
 * skipped unless at least two bytes remain (r0 - r4 >= 2).  The add in the
 * bf/s delay slot executes on both paths and raises r2 to r4 + 3.
 */
7768c2ecf20Sopenharmony_ci10:	mov	r4,r2		!   5 MT (latency=0)
7778c2ecf20Sopenharmony_ci	add	#1,r2		!  50 EX
7788c2ecf20Sopenharmony_ci
7798c2ecf20Sopenharmony_ci	cmp/hi	r2, r0		!  57 MT
7808c2ecf20Sopenharmony_ci	bf/s	1f		! 109 BR
7818c2ecf20Sopenharmony_ci
7828c2ecf20Sopenharmony_ci	 add	#2, r2		!  50 EX
7838c2ecf20Sopenharmony_ci
	/*
	 * The cmp/hi tests r0 before the delay-slot store pre-decrements it:
	 * the loop repeats while r0 > r4 + 3, i.e. while at least two bytes
	 * will still remain after the store that is about to happen.
	 */
7848c2ecf20Sopenharmony_ci3:	mov.w	@(r0,r5),r1	!  20 LS
7858c2ecf20Sopenharmony_ci	cmp/hi	r2,r0		!  57 MT
7868c2ecf20Sopenharmony_ci
7878c2ecf20Sopenharmony_ci	bt/s	3b		! 109 BR
7888c2ecf20Sopenharmony_ci
7898c2ecf20Sopenharmony_ci	 mov.w	r1,@-r0		!  29 LS
7908c2ecf20Sopenharmony_ci1:
7918c2ecf20Sopenharmony_ci
7928c2ecf20Sopenharmony_ci	!
7938c2ecf20Sopenharmony_ci	! Finally, copy the last byte if necessary
	/*
	 * r0 == r4 means everything has been copied: branch to label 9.
	 * NOTE(review): label 9 is defined before this chunk; presumably it
	 * is a plain return - confirm.
	 *
	 * The add in the bt/s delay slot executes on both paths.  On the
	 * fall-through path it moves r5 up by one so that r0 + r5 addresses
	 * the source byte for destination r0 - 1; that byte is loaded and
	 * then stored with pre-decrement in the rts delay slot.
	 */
7948c2ecf20Sopenharmony_ci	cmp/eq	r4,r0		!  54 MT
7958c2ecf20Sopenharmony_ci	bt/s	9b
7968c2ecf20Sopenharmony_ci	 add	#1,r5
7978c2ecf20Sopenharmony_ci	mov.b	@(r0,r5),r1
7988c2ecf20Sopenharmony_ci	rts
7998c2ecf20Sopenharmony_ci	 mov.b	r1,@-r0
8008c2ecf20Sopenharmony_ci
801