/* SPDX-License-Identifier: GPL-2.0 */
/*
 * "memcpy" implementation of SuperH
 *
 * Copyright (C) 1999  Niibe Yutaka
 * Copyright (c) 2002  STMicroelectronics Ltd
 *   Modified from memcpy.S and micro-optimised for SH4
 *   Stuart Menefy (stuart.menefy@st.com)
 *
 */
#include <linux/linkage.h>

/*
 * void *memcpy(void *dst, const void *src, size_t n);
 *
 * It is assumed that there is no overlap between src and dst.
 * If there is an overlap, then the results are undefined.
 *
 * Register usage, as set up by the code at ENTRY(memcpy):
 *
 *   On entry:  r4 = dst, r5 = src, r6 = n.
 *   Invariant: r0 = dst + n (one past the last destination byte still to
 *              be written) and r5 = src - dst, so @(r0,r5) addresses the
 *              source location that corresponds to destination r0.
 *
 * The copy runs backwards, from the end of the buffers towards the start:
 * every store uses the pre-decrement form "@-r0".  Because the load is
 * issued before the store decrements r0, each loop biases r5 by the access
 * size (-1 for bytes, -2 for short words, -4 for long words); the comments
 * at each stage say which bias is in force.
 *
 * Return value: the zero-length path loads r4 (dst) into r0 explicitly.
 * The other paths simply return with r0 after it has been walked back down
 * to r4 by the copy loops.
 *
 * r8-r11 (and r12 in .Lcase2b) are pushed on the stack (r15) around the
 * 32-byte cache-line loops and popped again before returning.
 *
 * Source conventions:
 *   - "!" starts a comment that runs to the end of the line.
 *   - An instruction indented by one extra space sits in the delay slot of
 *     the branch immediately above it (bt/s, bf/s, bra, rts); it executes
 *     whether or not the branch is taken.  Plain bt/bf have no delay slot.
 *   - The trailing "! <number> <EX|MT|LS|BR>" annotations are the original
 *     author's per-instruction scheduling notes (presumably an instruction
 *     index and SH4 issue group, with load latencies where given).
 *   - Numeric local labels are reused; "Nb"/"Nf" refer to the nearest
 *     definition of "N:" backwards/forwards, so label order matters.
 *
 * .Lcase1 and .Lcase3 are placed ahead of the entry point; they are only
 * reached by branches from the alignment dispatch code inside memcpy.
 */

	!
	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
	!

	! Size is 16 or greater, and may have trailing bytes

	.balign	32
.Lcase1:
	! Read a long word and write a long word at once
	! At the start of each iteration, r7 contains last long load
	add	#-1,r5		!  79 EX
	mov	r4,r2		!   5 MT (0 cycles latency)

	mov.l	@(r0,r5),r7	!  21 LS (2 cycles latency)
	add	#-4,r5		!  50 EX

	add	#7,r2		!  79 EX
	!
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	! 6 cycles, 4 bytes per iteration
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
	mov	r7, r3		!   5 MT (latency=0)	! RQPO

	cmp/hi	r2,r0		!  57 MT
	shll16	r3		! 103 EX

	mov	r1,r6		!   5 MT (latency=0)
	shll8	r3		! 102 EX		! Oxxx

	shlr8	r6		! 106 EX		! xNML
	mov	r1, r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! ONML
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		!  30 LS
#else
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! KLMN
	mov	r7,r3		!   5 MT (latency=0)	! OPQR

	cmp/hi	r2,r0		!  57 MT
	shlr16	r3		! 107 EX

	shlr8	r3		! 106 EX		! xxxO
	mov	r1,r6		!   5 MT (latency=0)

	shll8	r6		! 102 EX		! LMNx
	mov	r1,r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! LMNO
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		!  30 LS
#endif
	! Finally, copy a byte at once, if necessary

	add	#4,r5		!  50 EX
	cmp/eq	r4,r0		!  54 MT

	add	#-6,r2		!  50 EX
	bt	9f		! 109 BR

8:	cmp/hi	r2,r0		!  57 MT
	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

	bt/s	8b		! 109 BR

	 mov.b	r1,@-r0		!  29 LS

9:	rts
	 nop


	!
	!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
	!

	! Size is 16 or greater, and may have trailing bytes

	.balign	32
.Lcase3:
	! Read a long word and write a long word at once
	! At the start of each iteration, r7 contains last long load
	add	#-3,r5		!  79 EX
	mov	r4,r2		!   5 MT (0 cycles latency)

	mov.l	@(r0,r5),r7	!  21 LS (2 cycles latency)
	add	#-4,r5		!  50 EX

	add	#7,r2		!  79 EX
	!
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	! 6 cycles, 4 bytes per iteration
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
	mov	r7, r3		!   5 MT (latency=0)	! RQPO

	cmp/hi	r2,r0		!  57 MT
	shll8	r3		! 102 EX		! QPOx

	mov	r1,r6		!   5 MT (latency=0)
	shlr16	r6		! 107 EX

	shlr8	r6		! 106 EX		! xxxN
	mov	r1, r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! QPON
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		!  30 LS
#else
3:	mov	r7,r3		! OPQR
	shlr8	r3		! xOPQ
	mov.l	@(r0,r5),r7	! KLMN
	mov	r7,r6
	shll16	r6
	shll8	r6		! Nxxx
	or	r6,r3		! NOPQ
	cmp/hi	r2,r0
	bt/s	3b
	 mov.l	r3,@-r0
#endif

	! Finally, copy a byte at once, if necessary

	add	#6,r5		!  50 EX
	cmp/eq	r4,r0		!  54 MT

	add	#-6,r2		!  50 EX
	bt	9f		! 109 BR

8:	cmp/hi	r2,r0		!  57 MT
	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

	bt/s	8b		! 109 BR

	 mov.b	r1,@-r0		!  29 LS

9:	rts
	 nop

ENTRY(memcpy)

	! Calculate the invariants which will be used in the remainder
	! of the code:
	!
	!      r4   -->  [ ...  ] DST             [ ...  ] SRC
	!	         [ ...  ]                 [ ...  ]
	!	           :                        :
	!      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
	!
	!

	! Short circuit the common case of src, dst and len being 32 bit aligned
	! and test for zero length move

	mov	r6, r0		!   5 MT (0 cycle latency)
	or	r4, r0		!  82 EX

	or	r5, r0		!  82 EX
	tst	r6, r6		!  86 MT

	bt/s	99f		! 111 BR		(zero len)
	 tst	#3, r0		!  87 MT

	mov	r4, r0		!   5 MT (0 cycle latency)
	add	r6, r0		!  49 EX

	mov	#16, r1		!   6 EX
	bt/s	.Lcase00	! 111 BR		(aligned)

	 sub	r4, r5		!  75 EX

	! Arguments are not nicely long word aligned or zero len.
	! Check for small copies, and if so do a simple byte at a time copy.
	!
	! Deciding on an exact value of 'small' is not easy, as the point at which
	! using the optimised routines become worthwhile varies (these are the
	! cycle counts for different sizes using byte-at-a-time vs. optimised):
	!	size	byte-at-time	long	word	byte
	!	16	42		39-40	46-50	50-55
	!	24	58		43-44	54-58	62-67
	!	36	82		49-50	66-70	80-85
	! However the penalty for getting it 'wrong' is much higher for long word
	! aligned data (and this is more common), so use a value of 16.

	cmp/gt	r6,r1		!  56 MT

	add	#-1,r5		!  50 EX
	bf/s	6f		! 108 BR		(not small)

	 mov	r5, r3		!   5 MT (latency=0)
	shlr	r6		! 104 EX

	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
	bf/s	4f		! 111 BR

	 add	#-1,r3		!  50 EX
	tst	r6, r6		!  86 MT

	bt/s	98f		! 110 BR
	 mov.b	r1,@-r0		!  29 LS

	! 4 cycles, 2 bytes per iteration
3:	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

4:	mov.b	@(r0,r3),r2	!  20 LS (latency=2)
	dt	r6		!  67 EX

	mov.b	r1,@-r0		!  29 LS
	bf/s	3b		! 111 BR

	 mov.b	r2,@-r0		!  29 LS
98:
	rts
	 nop

	! Zero length: nothing was copied, return dst (loaded in the delay slot)
99:	rts
	 mov	r4, r0

	! Size is not small, so it's worthwhile looking for optimisations.
	! First align destination to a long word boundary.
	!
	! r5 = normal value -1

6:	tst	#3, r0		!  87 MT
	mov	#3, r3		!   6 EX

	bt/s	2f		! 111 BR
	 and	r0,r3		!  78 EX

	! 3 cycles, 1 byte per iteration
1:	dt	r3		!  67 EX
	mov.b	@(r0,r5),r1	!  19 LS (latency=2)

	add	#-1, r6		!  79 EX
	bf/s	1b		! 109 BR

	 mov.b	r1,@-r0		!  28 LS

2:	add	#1, r5		!  79 EX

	! Now select the appropriate bulk transfer code based on relative
	! alignment of src and dst.
	!
	! r0 is borrowed for the "tst #imm, r0" tests (that form only works on
	! r0); the real r0 is parked in r3 and put back in each branch delay slot.

	mov	r0, r3		!   5 MT (latency=0)

	mov	r5, r0		!   5 MT (latency=0)
	tst	#1, r0		!  87 MT

	bf/s	1f		! 111 BR
	 mov	#64, r7		!   6 EX

	! bit 0 clear

	cmp/ge	r7, r6		!  55 MT

	bt/s	2f		! 111 BR
	 tst	#2, r0		!  87 MT

	! small
	bt/s	.Lcase0
	 mov	r3, r0

	bra	.Lcase2
	 nop

	! big
2:	bt/s	.Lcase0b
	 mov	r3, r0

	bra	.Lcase2b
	 nop

	! bit 0 set
1:	tst	#2, r0		!  87 MT

	bt/s	.Lcase1
	 mov	r3, r0

	bra	.Lcase3
	 nop


	!
	!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
	!

	! src, dst and size are all long word aligned
	! size is non-zero

	.balign	32
.Lcase00:
	mov	#64, r1		!   6 EX
	mov	r5, r3		!   5 MT (latency=0)

	cmp/gt	r6, r1		!  56 MT
	add	#-4, r5		!  50 EX

	bf	.Lcase00b	! 108 BR		(big loop)
	shlr2	r6		! 105 EX

	shlr	r6		! 104 EX
	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

	bf/s	4f		! 111 BR
	 add	#-8, r3		!  50 EX

	tst	r6, r6		!  86 MT
	bt/s	5f		! 110 BR

	 mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	dt	r6		!  67 EX

	mov.l	r1, @-r0	!  30 LS
	bf/s	3b		! 109 BR

	 mov.l	r2, @-r0	!  30 LS

5:	rts
	 nop


	! Size is 16 or greater and less than 64, but may have trailing bytes

	.balign	32
.Lcase0:
	add	#-4, r5		!  50 EX
	mov	r4, r7		!   5 MT (latency=0)

	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
	mov	#4, r2		!   6 EX

	add	#11, r7		!  50 EX
	tst	r2, r6		!  86 MT

	mov	r5, r3		!   5 MT (latency=0)
	bt/s	4f		! 111 BR

	 add	#-4, r3		!  50 EX
	mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	cmp/hi	r7, r0

	mov.l	r1, @-r0	!  30 LS
	bt/s	3b		! 109 BR

	 mov.l	r2, @-r0	!  30 LS

	! Copy the final 0-3 bytes

	add	#3,r5		!  50 EX

	cmp/eq	r0, r4		!  54 MT
	add	#-10, r7	!  50 EX

	bt	9f		! 110 BR

	! 3 cycles, 1 byte per iteration
1:	mov.b	@(r0,r5),r1	!  19 LS
	cmp/hi	r7,r0		!  57 MT

	bt/s	1b		! 111 BR
	 mov.b	r1,@-r0		!  28 LS

9:	rts
	 nop

	! Size is at least 64 bytes, so will be going round the big loop at least once.
	!
	!   r2 = rounded up r4
	!   r3 = rounded down r0

	.balign	32
.Lcase0b:
	add	#-4, r5		!  50 EX

.Lcase00b:
	mov	r0, r3		!   5 MT (latency=0)
	mov	#(~0x1f), r1	!   6 EX

	and	r1, r3		!  78 EX
	mov	r4, r2		!   5 MT (latency=0)

	cmp/eq	r3, r0		!  54 MT
	add	#0x1f, r2	!  50 EX

	bt/s	1f		! 110 BR
	 and	r1, r2		!  78 EX

	! copy initial words until cache line aligned

	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
	tst	#4, r0		!  87 MT

	mov	r5, r6		!   5 MT (latency=0)
	add	#-4, r6		!  50 EX

	bt/s	4f		! 111 BR
	 add	#8, r3		!  50 EX

	tst	#0x18, r0	!  87 MT

	bt/s	1f		! 109 BR
	 mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r6), r7	!  21 LS (latency=2)
	cmp/eq	r3, r0		!  54 MT

	mov.l	r1, @-r0	!  30 LS
	bf/s	3b		! 109 BR

	 mov.l	r7, @-r0	!  30 LS

	! Copy the cache line aligned blocks
	!
	! In use: r0, r2, r4, r5
	! Scratch: r1, r3, r6, r7
	!
	! We could do this with the four scratch registers, but if src
	! and dest hit the same cache line, this will thrash, so make
	! use of additional registers.
	!
	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
	!   r5:	 src (was r0+r5)
	!   r1:	 dest (was r0)
	! this can be reversed at the end, so we don't need to save any extra
	! state.
	!
1:	mov.l	r8, @-r15	!  30 LS
	add	r0, r5		!  49 EX

	mov.l	r9, @-r15	!  30 LS
	mov	r0, r1		!   5 MT (latency=0)

	mov.l	r10, @-r15	!  30 LS
	add	#-0x1c, r5	!  50 EX

	mov.l	r11, @-r15	!  30 LS

	! 16 cycles, 32 bytes per iteration
2:	mov.l	@(0x00,r5),r0	!  18 LS (latency=2)
	add	#-0x20, r1	!  50 EX
	mov.l	@(0x04,r5),r3	!  18 LS (latency=2)
	mov.l	@(0x08,r5),r6	!  18 LS (latency=2)
	mov.l	@(0x0c,r5),r7	!  18 LS (latency=2)
	mov.l	@(0x10,r5),r8	!  18 LS (latency=2)
	mov.l	@(0x14,r5),r9	!  18 LS (latency=2)
	mov.l	@(0x18,r5),r10	!  18 LS (latency=2)
	mov.l	@(0x1c,r5),r11	!  18 LS (latency=2)
	movca.l	r0,@r1		!  40 LS (latency=3-7)
	mov.l	r3,@(0x04,r1)	!  33 LS
	mov.l	r6,@(0x08,r1)	!  33 LS
	mov.l	r7,@(0x0c,r1)	!  33 LS

	mov.l	r8,@(0x10,r1)	!  33 LS
	add	#-0x20, r5	!  50 EX

	mov.l	r9,@(0x14,r1)	!  33 LS
	cmp/eq	r2,r1		!  54 MT

	mov.l	r10,@(0x18,r1)	!  33 LS
	bf/s	2b		! 109 BR

	 mov.l	r11,@(0x1c,r1)	!  33 LS

	mov	r1, r0		!   5 MT (latency=0)

	mov.l	@r15+, r11	!  15 LS
	sub	r1, r5		!  75 EX

	mov.l	@r15+, r10	!  15 LS
	cmp/eq	r4, r0		!  54 MT

	bf/s	1f		! 109 BR
	 mov.l	 @r15+, r9	!  15 LS

	! The instruction at 1: is both the delay slot of this rts (nothing
	! left to copy) and the target of the bf/s above (trailing bytes
	! remain), so r8 is restored on either path.
	rts
1:	 mov.l	@r15+, r8	!  15 LS
	sub	r4, r1		!  75 EX		(len remaining)

	! number of trailing bytes is non-zero
	!
	! invariants restored (r5 already decremented by 4)
	! also r1=num bytes remaining

	mov	#4, r2		!   6 EX
	mov	r4, r7		!   5 MT (latency=0)

	add	#0x1c, r5	!  50 EX		(back to -4)
	cmp/hs	r2, r1		!  58 MT

	bf/s	5f		! 108 BR
	 add	 #11, r7	!  50 EX

	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
	tst	r2, r1		!  86 MT

	mov	r5, r3		!   5 MT (latency=0)
	bt/s	4f		! 111 BR

	 add	#-4, r3		!  50 EX
	cmp/hs	r2, r1		!  58 MT

	bt/s	5f		! 111 BR
	 mov.l	r6,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r6	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	cmp/hi	r7, r0

	mov.l	r6, @-r0	!  30 LS
	bt/s	3b		! 109 BR

	 mov.l	r2, @-r0	!  30 LS

	! Copy the final 0-3 bytes

5:	cmp/eq	r0, r4		!  54 MT
	add	#-10, r7	!  50 EX

	bt	9f		! 110 BR
	add	#3,r5		!  50 EX

	! 3 cycles, 1 byte per iteration
1:	mov.b	@(r0,r5),r1	!  19 LS
	cmp/hi	r7,r0		!  57 MT

	bt/s	1b		! 111 BR
	 mov.b	r1,@-r0		!  28 LS

9:	rts
	 nop

	!
	!	GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
	!

	.balign	32
.Lcase2:
	! Size is 16 or greater and less than 64, but may have trailing bytes

2:	mov	r5, r6		!   5 MT (latency=0)
	add	#-2,r5		!  50 EX

	mov	r4,r2		!   5 MT (latency=0)
	add	#-4,r6		!  50 EX

	add	#7,r2		!  50 EX
3:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)

	mov.w	@(r0,r6),r3	!  20 LS (latency=2)
	cmp/hi	r2,r0		!  57 MT

	mov.w	r1,@-r0		!  29 LS
	bt/s	3b		! 111 BR

	 mov.w	r3,@-r0		!  29 LS

	bra	10f
	 nop


	.balign	32
.Lcase2b:
	! Size is at least 64 bytes, so will be going round the big loop at least once.
	!
	!   r2 = rounded up r4
	!   r3 = rounded down r0

	mov	r0, r3		!   5 MT (latency=0)
	mov	#(~0x1f), r1	!   6 EX

	and	r1, r3		!  78 EX
	mov	r4, r2		!   5 MT (latency=0)

	cmp/eq	r3, r0		!  54 MT
	add	#0x1f, r2	!  50 EX

	add	#-2, r5		!  50 EX
	bt/s	1f		! 110 BR
	 and	r1, r2		!  78 EX

	! Copy a short word one at a time until we are cache line aligned
	!   Normal values: r0, r2, r3, r4
	!   Unused: r1, r6, r7
	!   Mod: r5 (=r5-2)
	!
	add	#2, r3		!  50 EX

2:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
	cmp/eq	r3,r0		!  54 MT

	bf/s	2b		! 111 BR

	 mov.w	r1,@-r0		!  29 LS

	! Copy the cache line aligned blocks
	!
	! In use: r0, r2, r4, r5 (=r5-2)
	! Scratch: r1, r3, r6, r7
	!
	! We could do this with the four scratch registers, but if src
	! and dest hit the same cache line, this will thrash, so make
	! use of additional registers.
	!
	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
	!   r5:	 src (was r0+r5)
	!   r1:	 dest (was r0)
	! this can be reversed at the end, so we don't need to save any extra
	! state.
	!
1:	mov.l	r8, @-r15	!  30 LS
	add	r0, r5		!  49 EX

	mov.l	r9, @-r15	!  30 LS
	mov	r0, r1		!   5 MT (latency=0)

	mov.l	r10, @-r15	!  30 LS
	add	#-0x1e, r5	!  50 EX

	mov.l	r11, @-r15	!  30 LS

	mov.l	r12, @-r15	!  30 LS

	! 17 cycles, 32 bytes per iteration
#ifdef CONFIG_CPU_LITTLE_ENDIAN
2:	mov.w	@r5+, r0	!  14 LS (latency=2)		..JI
	add	#-0x20, r1	!  50 EX

	mov.l	@r5+, r3	!  15 LS (latency=2)		NMLK

	mov.l	@r5+, r6	!  15 LS (latency=2)		RQPO
	shll16	r0		! 103 EX			JI..

	mov.l	@r5+, r7	!  15 LS (latency=2)
	xtrct	r3, r0		!  48 EX			LKJI

	mov.l	@r5+, r8	!  15 LS (latency=2)
	xtrct	r6, r3		!  48 EX			PONM

	mov.l	@r5+, r9	!  15 LS (latency=2)
	xtrct	r7, r6		!  48 EX

	mov.l	@r5+, r10	!  15 LS (latency=2)
	xtrct	r8, r7		!  48 EX

	mov.l	@r5+, r11	!  15 LS (latency=2)
	xtrct	r9, r8		!  48 EX

	mov.w	@r5+, r12	!  15 LS (latency=2)
	xtrct	r10, r9		!  48 EX

	movca.l	r0,@r1		!  40 LS (latency=3-7)
	xtrct	r11, r10	!  48 EX

	mov.l	r3, @(0x04,r1)	!  33 LS
	xtrct	r12, r11	!  48 EX

	mov.l	r6, @(0x08,r1)	!  33 LS

	mov.l	r7, @(0x0c,r1)	!  33 LS

	mov.l	r8, @(0x10,r1)	!  33 LS
	add	#-0x40, r5	!  50 EX

	mov.l	r9, @(0x14,r1)	!  33 LS
	cmp/eq	r2,r1		!  54 MT

	mov.l	r10, @(0x18,r1)	!  33 LS
	bf/s	2b		! 109 BR

	 mov.l	r11, @(0x1c,r1)	!  33 LS
#else
2:	mov.w	@(0x1e,r5), r0	!  17 LS (latency=2)
	add	#-2, r5		!  50 EX

	mov.l	@(0x1c,r5), r3	!  18 LS (latency=2)
	add	#-4, r1		!  50 EX

	mov.l	@(0x18,r5), r6	!  18 LS (latency=2)
	shll16	r0		! 103 EX

	mov.l	@(0x14,r5), r7	!  18 LS (latency=2)
	xtrct	r3, r0		!  48 EX

	mov.l	@(0x10,r5), r8	!  18 LS (latency=2)
	xtrct	r6, r3		!  48 EX

	mov.l	@(0x0c,r5), r9	!  18 LS (latency=2)
	xtrct	r7, r6		!  48 EX

	mov.l	@(0x08,r5), r10	!  18 LS (latency=2)
	xtrct	r8, r7		!  48 EX

	mov.l	@(0x04,r5), r11	!  18 LS (latency=2)
	xtrct	r9, r8		!  48 EX

	mov.l	@(0x00,r5), r12	!  18 LS (latency=2)
	xtrct	r10, r9		!  48 EX

	movca.l	r0,@r1		!  40 LS (latency=3-7)
	add	#-0x1c, r1	!  50 EX

	mov.l	r3, @(0x18,r1)	!  33 LS
	xtrct	r11, r10	!  48 EX

	mov.l	r6, @(0x14,r1)	!  33 LS
	xtrct	r12, r11	!  48 EX

	mov.l	r7, @(0x10,r1)	!  33 LS

	mov.l	r8, @(0x0c,r1)	!  33 LS
	add	#-0x1e, r5	!  50 EX

	mov.l	r9, @(0x08,r1)	!  33 LS
	cmp/eq	r2,r1		!  54 MT

	mov.l	r10, @(0x04,r1)	!  33 LS
	bf/s	2b		! 109 BR

	 mov.l	r11, @(0x00,r1)	!  33 LS
#endif

	mov.l	@r15+, r12
	mov	r1, r0		!   5 MT (latency=0)

	mov.l	@r15+, r11	!  15 LS
	sub	r1, r5		!  75 EX

	mov.l	@r15+, r10	!  15 LS
	cmp/eq	r4, r0		!  54 MT

	bf/s	1f		! 109 BR
	 mov.l	 @r15+, r9	!  15 LS

	! As in .Lcase00b: the instruction at 1: is both the rts delay slot and
	! the bf/s target, so r8 is restored whichever way we leave.
	rts
1:	 mov.l	@r15+, r8	!  15 LS

	add	#0x1e, r5	!  50 EX

	! Finish off a short word at a time
	! r5 must be invariant - 2
10:	mov	r4,r2		!   5 MT (latency=0)
	add	#1,r2		!  50 EX

	cmp/hi	r2, r0		!  57 MT
	bf/s	1f		! 109 BR

	 add	#2, r2		!  50 EX

3:	mov.w	@(r0,r5),r1	!  20 LS
	cmp/hi	r2,r0		!  57 MT

	bt/s	3b		! 109 BR

	 mov.w	r1,@-r0		!  29 LS
1:

	!
	! Finally, copy the last byte if necessary
	! ("9b" is the nearest preceding "9: rts / nop" pair)
	cmp/eq	r4,r0		!  54 MT
	bt/s	9b
	 add	#1,r5
	mov.b	@(r0,r5),r1
	rts
	 mov.b	r1,@-r0