/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2012-2021, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

/* L(name) expands to .Lname, i.e. an assembler-local label. */
#define L(label) .L ## label

/*
 * Register roles.
 *
 * dstin/src/count are the three incoming arguments.  dstin (x0) is never
 * written anywhere in this file, so it is still intact at every ret.
 */
#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
/* A..F: 16-byte data registers, used as low/high pairs with ldp/stp. */
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
/*
 * G and H are extra data pairs that reuse count/dst and src/srcend.  They
 * are only loaded at points where those pointer/length values are no longer
 * read (the 97..128 byte path and the tail of the backwards copy).
 */
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
/* tmp1 shares x14 with E_l; the two are never live at the same time. */
#define tmp1	x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies. The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

/*
 * __pi_memcpy
 *
 * In:    x0 = dstin (destination), x1 = src (source), x2 = count (bytes).
 * Out:   x0 unchanged (still the destination pointer).
 * Uses:  x1-x17 as scratch, condition flags.  No stack accesses.
 *
 * NOTE(review): the C-level prototype is presumably the usual
 * void *memcpy(void *, const void *, size_t) - confirm against the headers.
 *
 * For every size up to 128 bytes all loads are issued before the first
 * store, which is why those paths need no explicit overlap check.
 */
SYM_FUNC_START(__pi_memcpy)
	add	srcend, src, count		/* one past the last source byte */
	add	dstend, dstin, count		/* one past the last dest byte */
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes. */
	cmp	count, 16
	b.lo	L(copy16)
	/* 16..32 bytes: first 16 and last 16 bytes; the two may overlap. */
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes. */
L(copy16):
	/* count < 16 here; bit 3 clear means count < 8. */
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes. */
L(copy8):
	/* count < 8 here; bit 2 clear means count < 4. */
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence. */
L(copy4):
	cbz	count, L(copy0)
	/*
	 * count is 1..3: copy the bytes at offsets 0, count / 2 and
	 * count - 1.  Some of these coincide for count < 3, which is
	 * harmless because all three loads precede the stores.
	 */
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes. */
L(copy32_128):
	/* First 32 and last 32 bytes are needed by every medium size. */
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes. */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	/*
	 * 97..128 bytes: G/H overwrite count, dst, src and srcend.  None of
	 * those is read again on this path; only dstin and dstend are.
	 */
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes. */
L(copy_long):
	/* Use backwards copy if there is an overlap. */
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)			/* dstin == src: nothing to copy */
	/*
	 * Unsigned compare: (dstin - src) < count means dstin lies inside
	 * [src, src + count).  When dstin is below src the difference
	 * wraps to a large value and the forward path is taken.
	 */
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment. */

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15			/* misalignment of dstin */
	bic	dst, dstin, 15			/* dst = dstin rounded down to 16 */
	sub	src, src, tmp1			/* move src back by the same amount */
	add	count, count, tmp1	/* Count is now 16 too large. */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]		/* unaligned head, before D is reloaded */
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!		/* pre-index: src += 64 */
	subs	count, count, 128 + 16	/* Test and readjust count. */
	b.ls	L(copy64_from_end)

	/*
	 * Software-pipelined loop: each iteration stores the 64 bytes held
	 * in A..D and loads the next 64.  dst and src both advance by 64
	 * through the pre-indexed D accesses.
	 */
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
	/*
	 * Each pending register pair is stored before it is reloaded with
	 * tail data.  E_l reuses x14; tmp1 is no longer needed here.
	 */
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dst to 16-byte alignment. */
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15		/* misalignment of dstend */
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]		/* unaligned tail, dstend not yet adjusted */
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!	/* pre-index: srcend -= 64 */
	sub	dstend, dstend, tmp1		/* dstend is now 16-byte aligned */
	subs	count, count, 128
	b.ls	L(copy64_from_start)

	/* Mirror image of loop64, walking srcend/dstend downwards by 64. */
L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
	/*
	 * src and dstin were never modified on the backwards path, so they
	 * still address the first byte.  G reuses count/dst, which are not
	 * read again from here on.
	 */
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret
SYM_FUNC_END(__pi_memcpy)

/* memcpy entry points: all aliases of the single routine above. */
SYM_FUNC_ALIAS(__memcpy, __pi_memcpy)
EXPORT_SYMBOL(__memcpy)
SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)

/* memmove entry points: the same code, since the routine handles overlap. */
SYM_FUNC_ALIAS(__pi_memmove, __pi_memcpy)

SYM_FUNC_ALIAS(__memmove, __pi_memmove)
EXPORT_SYMBOL(__memmove)
SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
EXPORT_SYMBOL(memmove)