/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2019-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#include "../asmdefs.h"

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_lw	w10
#define tmp1	x14

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The source pointer is 16-byte aligned to minimize unaligned
   accesses.  The loop tail is handled by always copying 64 bytes from the
   end.
*/
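
/* For orientation, an illustrative C-level sketch of the dispatch described
   above.  This is not part of the build: the helper names are hypothetical,
   and the real code falls through between cases in assembly rather than
   calling functions.

     #include <stddef.h>

     static void copy_small  (void *d, const void *s, size_t n);  // 0..32 bytes
     static void copy_medium (void *d, const void *s, size_t n);  // 33..128 bytes
     static void copy_large  (void *d, const void *s, size_t n);  // >128 bytes

     void *sketch_memcpy (void *dst, const void *src, size_t n)
     {
       if (n > 128)
         copy_large (dst, src, n);    // falls back to a backwards loop on overlap
       else if (n > 32)
         copy_medium (dst, src, n);
       else
         copy_small (dst, src, n);
       return dst;
     }
*/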

ENTRY_ALIAS (__memmove_aarch64_simd)
ENTRY (__memcpy_aarch64_simd)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Use backwards copy if there is an overlap.  */
	sub	tmp1, dstin, src
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align src to 16-byte alignment.  */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
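	/* The loop below is software pipelined: each iteration stores the 64
	   bytes loaded on the previous iteration while issuing the loads for
	   the next one, so loads run one iteration ahead of the stores.  */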
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
L(copy_long_backwards):
	cbz	tmp1, L(copy0)
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	str	B_q, [dstend, -16]
	str	A_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	str	D_q, [dstend, -48]
	str	C_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
	ret

END (__memcpy_aarch64_simd)