/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by
 * Linaro, which can be found at:
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */


/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 *
 * This is a template: it has no entry label and no return instruction.
 * Execution ends by reaching .Lexitfunc, the last line of this file, so
 * whatever follows the template in the including file runs next.
 *
 * The ldrb1/strb1, ldrh1/strh1, ldr1/str1 and ldp1/stp1 operations and
 * L1_CACHE_SHIFT are not defined here; they are presumably provided by the
 * including file. The code below relies on each operation advancing its
 * pointer operand by the trailing immediate, since src and dst are never
 * adjusted explicitly.
 *
 * x0 (dstin) is never written by this template. Scratch registers written
 * here: x3, x4 and x6-x14 (see the aliases below), plus the flags.
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4	/* not referenced in this template */
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

	mov	dst, dstin
	cmp	count, #16
	/* When the length is less than 16, the accesses are not aligned. */
	b.lo	.Ltiny15

	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy the leading memory data from src to dst in an increasing
	 * address order. By this way, the risk of overwriting the source
	 * memory data is eliminated when the distance between src and
	 * dst is less than 16. Each load from src here is naturally aligned
	 * for its size: bit k of tmp2 is set exactly when src still needs a
	 * 2^k-byte step to reach the next boundary.
	 */
	tbz	tmp2, #0, 1f
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1
1:
	tbz	tmp2, #1, 2f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
2:
	tbz	tmp2, #2, 3f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8

.LSrcAligned:
	/*
	 * NOTE(review): b.ge is a signed test on a byte count; it agrees
	 * with the unsigned comparison only while count < 2^63.
	 */
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f			/* 0x20: two 16-byte pairs */
	b.lt	2f			/* 0x10: one 16-byte pair */
	ldp1	A_l, A_h, src, #16	/* 0x30: three 16-byte pairs */
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
.Ltiny15:
	/*
	 * Prefer to break one ldp/stp into several load/store to access
	 * memory in an increasing address order, rather than to load/store 16
	 * bytes from (src-16) to (dst-16) and to backward the src to aligned
	 * address, which way is used in original cortex memcpy. If keeping
	 * the original memcpy process here, memmove need to satisfy the
	 * precondition that src address is at least 16 bytes bigger than dst
	 * address, otherwise some source data will be overwritten when memmove
	 * call memcpy directly. To make memmove simpler and decouple the
	 * memcpy's dependency on memmove, the original process was dropped.
	 *
	 * Bits 3..0 of count select an 8-, 4-, 2- and 1-byte copy in turn.
	 */
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	/* From here on count is biased by -128; its low 6 bits stay valid. */
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 here and then jump
	 * to the tail.
	 */
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
	b	.Lexitfunc

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* pre-get 64 bytes data. */
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
1:
	/*
	 * interlace the load of next 64 bytes data block with store of the last
	 * loaded 64 bytes data.
	 */
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	subs	count, count, #64
	b.ge	1b
	/* Store the final 64 bytes that the last loop pass loaded. */
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
.Lexitfunc: