/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by
 * Linaro, which can be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */


/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 *
 * Registers used as scratch by the visible code: x3, x4, x6-x14 and the
 * condition flags.
 *
 * NOTE(review): ldrb1/strb1, ldrh1/strh1, ldr1/str1, ldp1/stp1 and
 * L1_CACHE_SHIFT are not defined in this file; presumably they are supplied
 * by whatever file includes this template - confirm there.  The code below
 * never advances src or dst itself, so each accessor is expected to
 * post-increment its pointer operand by the trailing immediate.
 *
 * NOTE(review): the body ends at the .Lexitfunc label with no return
 * instruction; the includer presumably provides what follows.
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

	/* Work on a copy of the destination so that x0 is left untouched. */
	mov	dst, dstin
	cmp	count, #16
	/* When the length is less than 16, the accesses are not aligned. */
	b.lo	.Ltiny15

	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy the leading memory data from src to dst in increasing
	 * address order. This way, the risk of overwriting the source
	 * memory data is eliminated when the distance between src and
	 * dst is less than 16. The source accesses here are aligned:
	 * each set bit of tmp2 selects one 1/2/4/8-byte step towards a
	 * 16-byte aligned src.
	 */
	tbz	tmp2, #0, 1f
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1
1:
	tbz	tmp2, #1, 2f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
2:
	tbz	tmp2, #2, 3f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8

.LSrcAligned:
	/*
	 * NOTE(review): b.ge is a signed condition, so a count with bit 63
	 * set would be treated as negative here - presumably callers never
	 * pass such a length.
	 */
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 *
	 * tmp1 = count & 0x30 is 0x00, 0x10, 0x20 or 0x30; the three
	 * ldp1/stp1 pairs below are entered so that exactly that many
	 * bytes are copied (0x30 falls through all three, 0x20 enters
	 * at 1:, 0x10 enters at 2:).
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
.Ltiny15:
	/*
	 * Prefer to break one ldp/stp into several loads/stores that access
	 * memory in increasing address order, rather than loading/storing 16
	 * bytes from (src-16) to (dst-16) and moving src back to an aligned
	 * address, which is the approach used in the original cortex memcpy.
	 * If the original memcpy process were kept here, memmove would need
	 * to satisfy the precondition that the src address is at least 16
	 * bytes bigger than the dst address; otherwise some source data would
	 * be overwritten when memmove calls memcpy directly. To make memmove
	 * simpler and decouple memcpy's dependency on memmove, the original
	 * process was withdrawn.
	 *
	 * Bits 3..0 of count select an 8-, 4-, 2- and 1-byte copy in turn.
	 */
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	/*
	 * From here on count is biased by -128. Only its sign (in the loop
	 * below) and its bottom 6 bits (for the tail) are used afterwards,
	 * and subtracting a multiple of 64 leaves the bottom 6 bits intact.
	 */
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 here and then jump
	 * to the tail.
	 */
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

	/* Anything left over (1..63 bytes) is finished by the tail code. */
	tst	count, #0x3f
	b.ne	.Ltail63
	b	.Lexitfunc

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Pre-load the first 64 bytes of data. */
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
1:
	/*
	 * Interleave the load of the next 64-byte block with the store of
	 * the previously loaded 64 bytes.
	 */
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	subs	count, count, #64
	b.ge	1b
	/* Drain: store the last 64 bytes that were loaded in the loop. */
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
.Lexitfunc: