/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by
 * Linaro, which can be found at:
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Move a buffer from src to dest (alignment handled by the hardware).
 * If dest is below src, or dest starts at or after the end of the source
 * buffer, branch to __memcpy; otherwise copy in reverse order (from the
 * last byte down to the first).
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */

/* Arguments. */
dstin	.req	x0	/* destination pointer as passed by the caller */
src	.req	x1	/* source pointer */
count	.req	x2	/* number of bytes */

/* Scratch registers; each wN name is the 32-bit view of the matching xN. */
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
tmp3	.req	x5
tmp3w	.req	w5

/* Working destination pointer, kept separate from dstin. */
dst	.req	x6

/* Four low/high register pairs used to move 16 bytes at a time. */
A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14
/*
 * memmove / __memmove
 *
 * In:	dstin (x0) = destination, src (x1) = source, count (x2) = bytes.
 * Out:	x0 is only read on the backward-copy path below, so it still
 *	holds the destination at the ret instructions in this file.
 *	(The __memcpy path is outside this file - presumably it returns
 *	x0 as well; confirm against its implementation.)
 * Clobbers on the backward path: src, count, tmp1, tmp2, dst,
 *	A_l..D_h and the condition flags.
 */
SYM_FUNC_START_ALIAS(__memmove)
SYM_FUNC_START_WEAK_PI(memmove)
	/* Destination below source (unsigned): hand over to __memcpy. */
	cmp	dstin, src
	b.lo	__memcpy
	/* Destination at or beyond the end of the source: no overlap. */
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	__memcpy

	/*
	 * Overlapping with dest >= src: copy from the top down.  From here
	 * on dst and src point one byte past the data still to be copied.
	 */
	add	dst, dstin, count
	add	src, src, count
	cmp	count, #16
	b.lo	.Ltail15		/* < 16 bytes: skip alignment, accesses may be unaligned */

	/*
	 * tmp2 = low four bits of src = number of bytes that must be copied
	 * (downwards) before src is 16-byte aligned.
	 */
	ands	tmp2, src, #15
	b.eq	.LSrcAligned
	sub	count, count, tmp2	/* count >= 16 > tmp2, so no underflow */

	/*
	 * Copy those tmp2 bytes as 1, 2, 4 and 8 byte pieces, one piece per
	 * set bit of tmp2.  The few extra instructions are cheap and every
	 * later load is then made from an aligned source address.
	 */
	tbz	tmp2, #0, .Lalign_half
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
.Lalign_half:
	tbz	tmp2, #1, .Lalign_word
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
.Lalign_word:
	tbz	tmp2, #2, .Lalign_dword
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
.Lalign_dword:
	tbz	tmp2, #3, .LSrcAligned
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!

.LSrcAligned:
	/* b.ge is a signed comparison of count against 64. */
	cmp	count, #64
	b.ge	.Lcpy_over64

	/*
	 * Fewer than 64 bytes remain: fall straight into the exit sequence.
	 */
.Ltail63:
	/*
	 * Copy 0, 16, 32 or 48 bytes as selected by bits 5:4 of count.
	 * Only the bottom six bits of count need to be accurate here.
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15		/* no whole 16-byte chunk */
	cmp	tmp1w, #0x20
	b.eq	.Ltail63_copy32		/* exactly 32: two chunks */
	b.lt	.Ltail63_copy16		/* 16: one chunk */
	/* 48: fall through all three chunk copies. */
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
.Ltail63_copy32:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
.Ltail63_copy16:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!

.Ltail15:
	/* Final 0..15 bytes: one piece per set bit in count[3:0]. */
	tbz	count, #3, .Ltail15_word
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
.Ltail15_word:
	tbz	count, #2, .Ltail15_half
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
.Ltail15_half:
	tbz	count, #1, .Ltail15_byte
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
.Ltail15_byte:
	tbz	count, #0, .Lexitfunc
	/* Last access: no write-back, the pointers are not used again. */
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]

.Lexitfunc:
	ret

.Lcpy_over64:
	/* From here count is biased by -128; its low six bits stay valid. */
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Between 64 and 127 bytes remain: copy one 64-byte block here and
	 * let the tail code finish whatever is left.
	 */
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	D_l, D_h, [src, #-64]!	/* write-back: src -= 64 */
	stp	D_l, D_h, [dst, #-64]!	/* write-back: dst -= 64 */

	tst	count, #0x3f		/* any of the last 0..63 bytes left? */
	b.ne	.Ltail63
	ret

	/*
	 * Critical loop.  Start it on a cache line boundary; assuming
	 * 64-byte lines, the whole loop then fits in a single line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Prime the pipeline: load the first (topmost) 64 bytes. */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
.Lcpy_loop64:
	/*
	 * Interleave the stores of the 64 bytes loaded previously with the
	 * loads of the next 64-byte block.
	 */
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	.Lcpy_loop64
	/* Drain: store the block that was loaded last. */
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!

	tst	count, #0x3f		/* low six bits = bytes still to copy */
	b.ne	.Ltail63
	ret
SYM_FUNC_END_PI(memmove)
EXPORT_SYMBOL(memmove)
SYM_FUNC_END_ALIAS(__memmove)
EXPORT_SYMBOL(__memmove)