/* SPDX-License-Identifier: GPL-2.0 */
/*
 * "memcpy" implementation of SuperH
 *
 * Copyright (C) 1999  Niibe Yutaka
 * Copyright (c) 2002  STMicroelectronics Ltd
 *   Modified from memcpy.S and micro-optimised for SH4
 *   Stuart Menefy (stuart.menefy@st.com)
 *
 */
#include <linux/linkage.h>

/*
 * void *memcpy(void *dst, const void *src, size_t n);
 *
 * It is assumed that there is no overlap between src and dst.
 * If there is an overlap, then the results are undefined.
 *
 * Register roles, as set up by the entry sequence at ENTRY(memcpy):
 *   r4      dst on entry. Only ever read, and used as the end marker
 *           for the copy loops.
 *   r5      src on entry. Rewritten to (src - dst) so that @(r0,r5)
 *           addresses the source location that corresponds to the
 *           destination address held in r0. Individual cases bias it by
 *           a small constant (see the per-case comments).
 *   r6      n on entry.
 *   r0      dst + n. The copy runs from the end of the buffers towards
 *           the start using pre-decrement stores, so the loops finish
 *           when r0 has come back down to r4. The zero-length exit
 *           loads r4 into r0 explicitly.
 *   r8-r12  saved on the stack and restored around the cache line loops
 *           (r8-r11 in .Lcase0b, r8-r12 in .Lcase2b).
 *
 * Layout note: an instruction indented by one extra space sits in the
 * delay slot of the branch or rts immediately before it.
 */

	!
	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
	!

	! Size is 16 or greater, and may have trailing bytes

	.balign	32
.Lcase1:
	! Read a long word and write a long word at once
	! At the start of each iteration, r7 contains last long load
	add	#-1,r5		!  79 EX
	mov	r4,r2		!   5 MT (0 cycles latency)

	mov.l	@(r0,r5),r7	!  21 LS (2 cycles latency)
	add	#-4,r5		!  50 EX

	add	#7,r2		!  79 EX
	!
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	! 6 cycles, 4 bytes per iteration
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
	mov	r7, r3		!   5 MT (latency=0)	! RQPO

	cmp/hi	r2,r0		!  57 MT
	shll16	r3		! 103 EX

	mov	r1,r6		!   5 MT (latency=0)
	shll8	r3		! 102 EX		! Oxxx

	shlr8	r6		! 106 EX		! xNML
	mov	r1, r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! ONML
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		!  30 LS
#else
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! KLMN
	mov	r7,r3		!   5 MT (latency=0)	! OPQR

	cmp/hi	r2,r0		!  57 MT
	shlr16	r3		! 107 EX

	shlr8	r3		! 106 EX		! xxxO
	mov	r1,r6		!   5 MT (latency=0)

	shll8	r6		! 102 EX		! LMNx
	mov	r1,r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! LMNO
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		!  30 LS
#endif
	! Finally, copy a byte at once, if necessary

	add	#4,r5		!  50 EX
	cmp/eq	r4,r0		!  54 MT

	add	#-6,r2		!  50 EX
	bt	9f		! 109 BR

8:	cmp/hi	r2,r0		!  57 MT
	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

	bt/s	8b		! 109 BR

	 mov.b	r1,@-r0		!  29 LS

9:	rts
	 nop


	!
	!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
	!

	! Size is 16 or greater, and may have trailing bytes

	.balign	32
.Lcase3:
	! Read a long word and write a long word at once
	! At the start of each iteration, r7 contains last long load
	add	#-3,r5		!  79 EX
	mov	r4,r2		!   5 MT (0 cycles latency)

	mov.l	@(r0,r5),r7	!  21 LS (2 cycles latency)
	add	#-4,r5		!  50 EX

	add	#7,r2		!  79 EX
	!
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	! 6 cycles, 4 bytes per iteration
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
	mov	r7, r3		!   5 MT (latency=0)	! RQPO

	cmp/hi	r2,r0		!  57 MT
	shll8	r3		! 102 EX		! QPOx

	mov	r1,r6		!   5 MT (latency=0)
	shlr16	r6		! 107 EX

	shlr8	r6		! 106 EX		! xxxN
	mov	r1, r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! QPON
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		!  30 LS
#else
3:	mov	r7,r3		! OPQR
	shlr8	r3		! xOPQ
	mov.l	@(r0,r5),r7	! KLMN
	mov	r7,r6
	shll16	r6
	shll8	r6		! Nxxx
	or	r6,r3		! NOPQ
	cmp/hi	r2,r0
	bt/s	3b
	 mov.l	r3,@-r0
#endif

	! Finally, copy a byte at once, if necessary

	add	#6,r5		!  50 EX
	cmp/eq	r4,r0		!  54 MT

	add	#-6,r2		!  50 EX
	bt	9f		! 109 BR

8:	cmp/hi	r2,r0		!  57 MT
	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

	bt/s	8b		! 109 BR

	 mov.b	r1,@-r0		!  29 LS

9:	rts
	 nop

ENTRY(memcpy)

	! Calculate the invariants which will be used in the remainder
	! of the code:
	!
	!	r4 -->	[ ...  ] DST             [ ...  ] SRC
	!	        [ ...  ]                 [ ...  ]
	!	          :                        :
	!	r0 -->  [ ...  ]       r0+r5 --> [ ...  ]
	!
	!

	! Short circuit the common case of src, dst and len being 32 bit aligned
	! and test for zero length move

	mov	r6, r0		!   5 MT (0 cycle latency)
	or	r4, r0		!  82 EX

	or	r5, r0		!  82 EX
	tst	r6, r6		!  86 MT

	bt/s	99f		! 111 BR		(zero len)
	 tst	#3, r0		!  87 MT

	mov	r4, r0		!   5 MT (0 cycle latency)
	add	r6, r0		!  49 EX

	mov	#16, r1		!   6 EX
	bt/s	.Lcase00	! 111 BR		(aligned)

	 sub	r4, r5		!  75 EX

	! Arguments are not nicely long word aligned or zero len.
	! Check for small copies, and if so do a simple byte at a time copy.
	!
	! Deciding on an exact value of 'small' is not easy, as the point at which
	! using the optimised routines become worthwhile varies (these are the
	! cycle counts for different sizes using byte-at-a-time vs. optimised):
	!	size	byte-at-time	long	word	byte
	!	16	42		39-40	46-50	50-55
	!	24	58		43-44	54-58	62-67
	!	36	82		49-50	66-70	80-85
	! However the penalty for getting it 'wrong' is much higher for long word
	! aligned data (and this is more common), so use a value of 16.

	cmp/gt	r6,r1		!  56 MT

	add	#-1,r5		!  50 EX
	bf/s	6f		! 108 BR		(not small)

	 mov	r5, r3		!   5 MT (latency=0)
	shlr	r6		! 104 EX

	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
	bf/s	4f		! 111 BR

	 add	#-1,r3		!  50 EX
	tst	r6, r6		!  86 MT

	bt/s	98f		! 110 BR
	 mov.b	r1,@-r0		!  29 LS

	! 4 cycles, 2 bytes per iteration
3:	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

4:	mov.b	@(r0,r3),r2	!  20 LS (latency=2)
	dt	r6		!  67 EX

	mov.b	r1,@-r0		!  29 LS
	bf/s	3b		! 111 BR

	 mov.b	r2,@-r0		!  29 LS
98:
	rts
	 nop

99:	rts
	 mov	r4, r0

	! Size is not small, so it is worthwhile looking for optimisations.
	! First align destination to a long word boundary.
	!
	! r5 = normal value -1

6:	tst	#3, r0		!  87 MT
	mov	#3, r3		!   6 EX

	bt/s	2f		! 111 BR
	 and	r0,r3		!  78 EX

	! 3 cycles, 1 byte per iteration
1:	dt	r3		!  67 EX
	mov.b	@(r0,r5),r1	!  19 LS (latency=2)

	add	#-1, r6		!  79 EX
	bf/s	1b		! 109 BR

	 mov.b	r1,@-r0		!  28 LS

2:	add	#1, r5		!  79 EX

	! Now select the appropriate bulk transfer code based on relative
	! alignment of src and dst.

	mov	r0, r3		!   5 MT (latency=0)

	mov	r5, r0		!   5 MT (latency=0)
	tst	#1, r0		!  87 MT

	bf/s	1f		! 111 BR
	 mov	#64, r7		!   6 EX

	! bit 0 clear

	cmp/ge	r7, r6		!  55 MT

	bt/s	2f		! 111 BR
	 tst	#2, r0		!  87 MT

	! small
	bt/s	.Lcase0
	 mov	r3, r0

	bra	.Lcase2
	 nop

	! big
2:	bt/s	.Lcase0b
	 mov	r3, r0

	bra	.Lcase2b
	 nop

	! bit 0 set
1:	tst	#2, r0		!  87 MT

	bt/s	.Lcase1
	 mov	r3, r0

	bra	.Lcase3
	 nop


	!
	!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
	!

	! src, dst and size are all long word aligned
	! size is non-zero

	.balign	32
.Lcase00:
	mov	#64, r1		!   6 EX
	mov	r5, r3		!   5 MT (latency=0)

	cmp/gt	r6, r1		!  56 MT
	add	#-4, r5		!  50 EX

	bf	.Lcase00b	! 108 BR		(big loop)
	shlr2	r6		! 105 EX

	shlr	r6		! 104 EX
	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

	bf/s	4f		! 111 BR
	 add	#-8, r3		!  50 EX

	tst	r6, r6		!  86 MT
	bt/s	5f		! 110 BR

	 mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	dt	r6		!  67 EX

	mov.l	r1, @-r0	!  30 LS
	bf/s	3b		! 109 BR

	 mov.l	r2, @-r0	!  30 LS

5:	rts
	 nop


	! Size is 16 or greater and less than 64, but may have trailing bytes

	.balign	32
.Lcase0:
	add	#-4, r5		!  50 EX
	mov	r4, r7		!   5 MT (latency=0)

	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
	mov	#4, r2		!   6 EX

	add	#11, r7		!  50 EX
	tst	r2, r6		!  86 MT

	mov	r5, r3		!   5 MT (latency=0)
	bt/s	4f		! 111 BR

	 add	#-4, r3		!  50 EX
	mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	cmp/hi	r7, r0

	mov.l	r1, @-r0	!  30 LS
	bt/s	3b		! 109 BR

	 mov.l	r2, @-r0	!  30 LS

	! Copy the final 0-3 bytes

	add	#3,r5		!  50 EX

	cmp/eq	r0, r4		!  54 MT
	add	#-10, r7	!  50 EX

	bt	9f		! 110 BR

	! 3 cycles, 1 byte per iteration
1:	mov.b	@(r0,r5),r1	!  19 LS
	cmp/hi	r7,r0		!  57 MT

	bt/s	1b		! 111 BR
	 mov.b	r1,@-r0		!  28 LS

9:	rts
	 nop

	! Size is at least 64 bytes, so will be going round the big loop at least once.
	!
	!   r2 = rounded up r4
	!   r3 = rounded down r0

	.balign	32
.Lcase0b:
	add	#-4, r5		!  50 EX

.Lcase00b:
	mov	r0, r3		!   5 MT (latency=0)
	mov	#(~0x1f), r1	!   6 EX

	and	r1, r3		!  78 EX
	mov	r4, r2		!   5 MT (latency=0)

	cmp/eq	r3, r0		!  54 MT
	add	#0x1f, r2	!  50 EX

	bt/s	1f		! 110 BR
	 and	r1, r2		!  78 EX

	! copy initial words until cache line aligned

	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
	tst	#4, r0		!  87 MT

	mov	r5, r6		!   5 MT (latency=0)
	add	#-4, r6		!  50 EX

	bt/s	4f		! 111 BR
	 add	#8, r3		!  50 EX

	tst	#0x18, r0	!  87 MT

	bt/s	1f		! 109 BR
	 mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r6), r7	!  21 LS (latency=2)
	cmp/eq	r3, r0		!  54 MT

	mov.l	r1, @-r0	!  30 LS
	bf/s	3b		! 109 BR

	 mov.l	r7, @-r0	!  30 LS

	! Copy the cache line aligned blocks
	!
	! In use: r0, r2, r4, r5
	! Scratch: r1, r3, r6, r7
	!
	! We could do this with the four scratch registers, but if src
	! and dest hit the same cache line, this will thrash, so make
	! use of additional registers.
	!
	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
	!   r5:	 src (was r0+r5)
	!   r1:	 dest (was r0)
	! this can be reversed at the end, so we don't need to save any extra
	! state.
	!
1:	mov.l	r8, @-r15	!  30 LS
	add	r0, r5		!  49 EX

	mov.l	r9, @-r15	!  30 LS
	mov	r0, r1		!   5 MT (latency=0)

	mov.l	r10, @-r15	!  30 LS
	add	#-0x1c, r5	!  50 EX

	mov.l	r11, @-r15	!  30 LS

	! 16 cycles, 32 bytes per iteration
2:	mov.l	@(0x00,r5),r0	! 18 LS (latency=2)
	add	#-0x20, r1	! 50 EX
	mov.l	@(0x04,r5),r3	! 18 LS (latency=2)
	mov.l	@(0x08,r5),r6	! 18 LS (latency=2)
	mov.l	@(0x0c,r5),r7	! 18 LS (latency=2)
	mov.l	@(0x10,r5),r8	! 18 LS (latency=2)
	mov.l	@(0x14,r5),r9	! 18 LS (latency=2)
	mov.l	@(0x18,r5),r10	! 18 LS (latency=2)
	mov.l	@(0x1c,r5),r11	! 18 LS (latency=2)
	movca.l	r0,@r1		! 40 LS (latency=3-7)
	mov.l	r3,@(0x04,r1)	! 33 LS
	mov.l	r6,@(0x08,r1)	! 33 LS
	mov.l	r7,@(0x0c,r1)	! 33 LS

	mov.l	r8,@(0x10,r1)	! 33 LS
	add	#-0x20, r5	! 50 EX

	mov.l	r9,@(0x14,r1)	! 33 LS
	cmp/eq	r2,r1		! 54 MT

	mov.l	r10,@(0x18,r1)	!  33 LS
	bf/s	2b		! 109 BR

	 mov.l	r11,@(0x1c,r1)	!  33 LS

	mov	r1, r0		!   5 MT (latency=0)

	mov.l	@r15+, r11	!  15 LS
	sub	r1, r5		!  75 EX

	mov.l	@r15+, r10	!  15 LS
	cmp/eq	r4, r0		!  54 MT

	bf/s	1f		! 109 BR
	 mov.l	 @r15+, r9	!  15 LS

	! The r8 restore below is both the delay slot of this rts and the
	! target of the bf/s above, so it executes on either path.
	rts
1:	 mov.l	@r15+, r8	!  15 LS
	sub	r4, r1		!  75 EX		(len remaining)

	! number of trailing bytes is non-zero
	!
	! invariants restored (r5 already decremented by 4)
	! also r1=num bytes remaining

	mov	#4, r2		!   6 EX
	mov	r4, r7		!   5 MT (latency=0)

	add	#0x1c, r5	!  50 EX		(back to -4)
	cmp/hs	r2, r1		!  58 MT

	bf/s	5f		! 108 BR
	 add	 #11, r7	!  50 EX

	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
	tst	r2, r1		!  86 MT

	mov	r5, r3		!   5 MT (latency=0)
	bt/s	4f		! 111 BR

	 add	#-4, r3		!  50 EX
	cmp/hs	r2, r1		!  58 MT

	bt/s	5f		! 111 BR
	 mov.l	r6,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r6	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	cmp/hi	r7, r0

	mov.l	r6, @-r0	!  30 LS
	bt/s	3b		! 109 BR

	 mov.l	r2, @-r0	!  30 LS

	! Copy the final 0-3 bytes

5:	cmp/eq	r0, r4		!  54 MT
	add	#-10, r7	!  50 EX

	bt	9f		! 110 BR
	add	#3,r5		!  50 EX

	! 3 cycles, 1 byte per iteration
1:	mov.b	@(r0,r5),r1	!  19 LS
	cmp/hi	r7,r0		!  57 MT

	bt/s	1b		! 111 BR
	 mov.b	r1,@-r0		!  28 LS

9:	rts
	 nop

	!
	!	GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
	!

	.balign	32
.Lcase2:
	! Size is 16 or greater and less than 64, but may have trailing bytes

2:	mov	r5, r6		!   5 MT (latency=0)
	add	#-2,r5		!  50 EX

	mov	r4,r2		!   5 MT (latency=0)
	add	#-4,r6		!  50 EX

	add	#7,r2		!  50 EX
3:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)

	mov.w	@(r0,r6),r3	!  20 LS (latency=2)
	cmp/hi	r2,r0		!  57 MT

	mov.w	r1,@-r0		!  29 LS
	bt/s	3b		! 111 BR

	 mov.w	r3,@-r0		!  29 LS

	bra	10f
	 nop


	.balign	32
.Lcase2b:
	! Size is at least 64 bytes, so will be going round the big loop at least once.
	!
	!   r2 = rounded up r4
	!   r3 = rounded down r0

	mov	r0, r3		!   5 MT (latency=0)
	mov	#(~0x1f), r1	!   6 EX

	and	r1, r3		!  78 EX
	mov	r4, r2		!   5 MT (latency=0)

	cmp/eq	r3, r0		!  54 MT
	add	#0x1f, r2	!  50 EX

	add	#-2, r5		!  50 EX
	bt/s	1f		! 110 BR
	 and	r1, r2		!  78 EX

	! Copy a short word one at a time until we are cache line aligned
	!   Normal values: r0, r2, r3, r4
	!   Unused: r1, r6, r7
	!   Mod: r5 (=r5-2)
	!
	add	#2, r3		!  50 EX

2:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
	cmp/eq	r3,r0		!  54 MT

	bf/s	2b		! 111 BR

	 mov.w	r1,@-r0		!  29 LS

	! Copy the cache line aligned blocks
	!
	! In use: r0, r2, r4, r5 (=r5-2)
	! Scratch: r1, r3, r6, r7
	!
	! We could do this with the four scratch registers, but if src
	! and dest hit the same cache line, this will thrash, so make
	! use of additional registers.
	!
	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
	!   r5:	 src (was r0+r5)
	!   r1:	 dest (was r0)
	! this can be reversed at the end, so we don't need to save any extra
	! state.
	!
1:	mov.l	r8, @-r15	!  30 LS
	add	r0, r5		!  49 EX

	mov.l	r9, @-r15	!  30 LS
	mov	r0, r1		!   5 MT (latency=0)

	mov.l	r10, @-r15	!  30 LS
	add	#-0x1e, r5	!  50 EX

	mov.l	r11, @-r15	!  30 LS

	mov.l	r12, @-r15	!  30 LS

	! 17 cycles, 32 bytes per iteration
#ifdef CONFIG_CPU_LITTLE_ENDIAN
2:	mov.w	@r5+, r0	!  14 LS (latency=2)		..JI
	add	#-0x20, r1	!  50 EX

	mov.l	@r5+, r3	!  15 LS (latency=2)		NMLK

	mov.l	@r5+, r6	!  15 LS (latency=2)		RQPO
	shll16	r0		! 103 EX			JI..

	mov.l	@r5+, r7	!  15 LS (latency=2)
	xtrct	r3, r0		!  48 EX			LKJI

	mov.l	@r5+, r8	!  15 LS (latency=2)
	xtrct	r6, r3		!  48 EX			PONM

	mov.l	@r5+, r9	!  15 LS (latency=2)
	xtrct	r7, r6		!  48 EX

	mov.l	@r5+, r10	!  15 LS (latency=2)
	xtrct	r8, r7		!  48 EX

	mov.l	@r5+, r11	!  15 LS (latency=2)
	xtrct	r9, r8		!  48 EX

	mov.w	@r5+, r12	!  15 LS (latency=2)
	xtrct	r10, r9		!  48 EX

	movca.l	r0,@r1		!  40 LS (latency=3-7)
	xtrct	r11, r10	!  48 EX

	mov.l	r3, @(0x04,r1)	!  33 LS
	xtrct	r12, r11	!  48 EX

	mov.l	r6, @(0x08,r1)	!  33 LS

	mov.l	r7, @(0x0c,r1)	!  33 LS

	mov.l	r8, @(0x10,r1)	!  33 LS
	add	#-0x40, r5	!  50 EX

	mov.l	r9, @(0x14,r1)	!  33 LS
	cmp/eq	r2,r1		!  54 MT

	mov.l	r10, @(0x18,r1)	!  33 LS
	bf/s	2b		! 109 BR

	 mov.l	r11, @(0x1c,r1)	!  33 LS
#else
2:	mov.w	@(0x1e,r5), r0	!  17 LS (latency=2)
	add	#-2, r5		!  50 EX

	mov.l	@(0x1c,r5), r3	!  18 LS (latency=2)
	add	#-4, r1		!  50 EX

	mov.l	@(0x18,r5), r6	!  18 LS (latency=2)
	shll16	r0		! 103 EX

	mov.l	@(0x14,r5), r7	!  18 LS (latency=2)
	xtrct	r3, r0		!  48 EX

	mov.l	@(0x10,r5), r8	!  18 LS (latency=2)
	xtrct	r6, r3		!  48 EX

	mov.l	@(0x0c,r5), r9	!  18 LS (latency=2)
	xtrct	r7, r6		!  48 EX

	mov.l	@(0x08,r5), r10	!  18 LS (latency=2)
	xtrct	r8, r7		!  48 EX

	mov.l	@(0x04,r5), r11	!  18 LS (latency=2)
	xtrct	r9, r8		!  48 EX

	mov.l	@(0x00,r5), r12	!  18 LS (latency=2)
	xtrct	r10, r9		!  48 EX

	movca.l	r0,@r1		!  40 LS (latency=3-7)
	add	#-0x1c, r1	!  50 EX

	mov.l	r3, @(0x18,r1)	!  33 LS
	xtrct	r11, r10	!  48 EX

	mov.l	r6, @(0x14,r1)	!  33 LS
	xtrct	r12, r11	!  48 EX

	mov.l	r7, @(0x10,r1)	!  33 LS

	mov.l	r8, @(0x0c,r1)	!  33 LS
	add	#-0x1e, r5	!  50 EX

	mov.l	r9, @(0x08,r1)	!  33 LS
	cmp/eq	r2,r1		!  54 MT

	mov.l	r10, @(0x04,r1)	!  33 LS
	bf/s	2b		! 109 BR

	 mov.l	r11, @(0x00,r1)	!  33 LS
#endif

	mov.l	@r15+, r12
	mov	r1, r0		!   5 MT (latency=0)

	mov.l	@r15+, r11	!  15 LS
	sub	r1, r5		!  75 EX

	mov.l	@r15+, r10	!  15 LS
	cmp/eq	r4, r0		!  54 MT

	bf/s	1f		! 109 BR
	 mov.l	 @r15+, r9	!  15 LS

	! The r8 restore below is both the delay slot of this rts and the
	! target of the bf/s above, so it executes on either path.
	rts
1:	 mov.l	@r15+, r8	!  15 LS

	add	#0x1e, r5	!  50 EX

	! Finish off a short word at a time
	! r5 must be invariant - 2
10:	mov	r4,r2		!   5 MT (latency=0)
	add	#1,r2		!  50 EX

	cmp/hi	r2, r0		!  57 MT
	bf/s	1f		! 109 BR

	 add	#2, r2		!  50 EX

3:	mov.w	@(r0,r5),r1	!  20 LS
	cmp/hi	r2,r0		!  57 MT

	bt/s	3b		! 109 BR

	 mov.w	r1,@-r0		!  29 LS
1:

	!
	! Finally, copy the last byte if necessary
	cmp/eq	r4,r0		!  54 MT
	bt/s	9b
	 add	#1,r5
	mov.b	@(r0,r5),r1
	rts
	 mov.b	r1,@-r0
