/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2022 Michael T. Kloos <michael@michaelkloos.com>
 */

#include <linux/linkage.h>
#include <asm/asm.h>

SYM_FUNC_START(__memmove)
SYM_FUNC_START_WEAK(memmove)
	/*
	 * Returns
	 *   a0 - dest
	 *
	 * Parameters
	 *   a0 - Inclusive first byte of dest
	 *   a1 - Inclusive first byte of src
	 *   a2 - Length of copy n
	 *
	 * Because the return matches the parameter register a0,
	 * we will not clobber or modify that register.
	 *
	 * Note: This currently only works on little-endian.
	 * To port to big-endian, reverse the direction of shifts
	 * in the two misaligned fixup copy loops.
	 */

	/* Return if nothing to do */
	beq  a0, a1, return_from_memmove
	beqz a2, return_from_memmove

	/*
	 * Register Uses
	 *   Forward Copy: a1 - Index counter of src
	 *   Reverse Copy: a4 - Index counter of src
	 *   Forward Copy: t3 - Index counter of dest
	 *   Reverse Copy: t4 - Index counter of dest
	 *   Both Copy Modes: t5 - Inclusive first multibyte/aligned of dest
	 *   Both Copy Modes: t6 - Non-Inclusive last multibyte/aligned of dest
	 *   Both Copy Modes: t0 - Link / Temporary for load-store
	 *   Both Copy Modes: t1 - Temporary for load-store
	 *   Both Copy Modes: t2 - Temporary for load-store
	 *   Both Copy Modes: a5 - dest to src alignment offset
	 *   Both Copy Modes: a6 - Shift amount
	 *   Both Copy Modes: a7 - Inverse shift amount
	 *   Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
	 */

	/*
	 * Solve for some register values now.
	 * Byte copy does not need t5 or t6.
	 */
	mv   t3, a0
	add  t4, a0, a2
	add  a4, a1, a2

	/*
	 * Byte copy if copying less than (2 * SZREG) bytes. This can
	 * cause problems with the bulk copy implementation and is
	 * small enough not to bother.
	 */
	andi t0, a2, -(2 * SZREG)
	beqz t0, byte_copy

	/*
	 * Now solve for t5 and t6.
	 */
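	/*
	 * Illustrative example (not part of the code), assuming RV64 so
	 * SZREG == 8: t5/t6 below are the dest bounds rounded down to
	 * the nearest SZREG boundary, e.g. dest = 0x1003 and n = 0x20
	 * give t5 = 0x1000 (bumped to 0x1008 by the fixup just below)
	 * and t6 = 0x1020.
	 */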
	andi t5, t3, -SZREG
	andi t6, t4, -SZREG
	/*
	 * If dest (register t3) rounded down to the nearest naturally
	 * aligned SZREG address does not equal dest, then add SZREG
	 * to find the low-bound of SZREG alignment in the dest memory
	 * region. Note that this could overshoot the dest memory
	 * region if n is less than SZREG. This is one reason why
	 * we always byte copy if n is less than SZREG.
	 * Otherwise, dest is already naturally aligned to SZREG.
	 */
	beq  t5, t3, 1f
	addi t5, t5, SZREG
	1:

	/*
	 * If the dest and src are co-aligned to SZREG, then there is
	 * no need for the full rigmarole of a misaligned fixup copy.
	 * Instead, do a simpler co-aligned copy.
	 */
	xor  t0, a0, a1
	andi t1, t0, (SZREG - 1)
	beqz t1, coaligned_copy
	/* Fall through to misaligned fixup copy */

misaligned_fixup_copy:
	bltu a1, a0, misaligned_fixup_copy_reverse

misaligned_fixup_copy_forward:
	jal  t0, byte_copy_until_aligned_forward

	andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub  a5, a1, t3 /* Find the difference between src and dest */
	andi a1, a1, -SZREG /* Align the src pointer */
	addi a2, t6, SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * Two's complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN. XLEN = SZREG * 8.
	 */
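	/*
	 * Worked example (illustrative, assuming RV64 so XLEN == 64):
	 * if src is offset by 3 bytes, a5 = 3 and a6 = 24, so
	 * a7 = ~24 + 65 = -25 + 65 = 40 = 64 - 24. Each store then
	 * combines the upper 40 bits of one source word with the low
	 * 24 bits of the next.
	 */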
	not  a7, a6
	addi a7, a7, (SZREG * 8 + 1)

	/*
	 * Fix Misalignment Copy Loop - Forward
	 * load_val0 = load_ptr[0];
	 * do {
	 *	load_val1 = load_ptr[1];
	 *	store_ptr += 2;
	 *	store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 *	if (store_ptr == {a2})
	 *		break;
	 *
	 *	load_val0 = load_ptr[2];
	 *	load_ptr += 2;
	 *	store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L t0, (0 * SZREG)(a1)
	1:
	REG_L t1, (1 * SZREG)(a1)
	addi  t3, t3, (2 * SZREG)
	srl   t0, t0, a6
	sll   t2, t1, a7
	or    t2, t0, t2
	REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)

	beq   t3, a2, 2f

	REG_L t0, (2 * SZREG)(a1)
	addi  a1, a1, (2 * SZREG)
	srl   t1, t1, a6
	sll   t2, t0, a7
	or    t2, t1, t2
	REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)

	bne   t3, t6, 1b
	2:
	mv    t3, t6 /* Fix the dest pointer in case the loop was broken */

	add  a1, t3, a5 /* Restore the src pointer */
	j    byte_copy_forward /* Copy any remaining bytes */

misaligned_fixup_copy_reverse:
	jal  t0, byte_copy_until_aligned_reverse

	andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub  a5, a4, t4 /* Find the difference between src and dest */
	andi a4, a4, -SZREG /* Align the src pointer */
	addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * Two's complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN. XLEN = SZREG * 8.
	 */
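	/*
	 * Note (illustrative): a6/a7 are computed exactly as in the
	 * forward path above; only the traversal direction differs,
	 * so the per-word merge (val_lo >> a6) | (val_hi << a7) is
	 * unchanged.
	 */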
	not  a7, a6
	addi a7, a7, (SZREG * 8 + 1)

	/*
	 * Fix Misalignment Copy Loop - Reverse
	 * load_val1 = load_ptr[0];
	 * do {
	 *	load_val0 = load_ptr[-1];
	 *	store_ptr -= 2;
	 *	store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 *	if (store_ptr == {a2})
	 *		break;
	 *
	 *	load_val1 = load_ptr[-2];
	 *	load_ptr -= 2;
	 *	store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L t1, ( 0 * SZREG)(a4)
	1:
	REG_L t0, (-1 * SZREG)(a4)
	addi  t4, t4, (-2 * SZREG)
	sll   t1, t1, a7
	srl   t2, t0, a6
	or    t2, t1, t2
	REG_S t2, ( 1 * SZREG)(t4)

	beq   t4, a2, 2f

	REG_L t1, (-2 * SZREG)(a4)
	addi  a4, a4, (-2 * SZREG)
	sll   t0, t0, a7
	srl   t2, t1, a6
	or    t2, t0, t2
	REG_S t2, ( 0 * SZREG)(t4)

	bne   t4, t5, 1b
	2:
	mv    t4, t5 /* Fix the dest pointer in case the loop was broken */

	add  a4, t4, a5 /* Restore the src pointer */
	j    byte_copy_reverse /* Copy any remaining bytes */

/*
 * Simple copy loops for SZREG co-aligned memory locations.
 * These also make calls to do byte copies for any unaligned
 * data at their terminations.
 */
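/*
 * Rough C equivalent of the two co-aligned loops below (illustrative
 * only; register roles per the table at the top of this file, and
 * "long" standing in for an SZREG-sized word):
 *
 *	do { *(long *)t3 = *(long *)a1; a1 += SZREG; t3 += SZREG; } while (t3 != t6);
 *	do { a4 -= SZREG; t4 -= SZREG; *(long *)t4 = *(long *)a4; } while (t4 != t5);
 */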
coaligned_copy:
	bltu a1, a0, coaligned_copy_reverse

coaligned_copy_forward:
	jal  t0, byte_copy_until_aligned_forward

	1:
	REG_L t1, ( 0 * SZREG)(a1)
	addi  a1, a1, SZREG
	addi  t3, t3, SZREG
	REG_S t1, (-1 * SZREG)(t3)
	bne   t3, t6, 1b

	j    byte_copy_forward /* Copy any remaining bytes */

coaligned_copy_reverse:
	jal  t0, byte_copy_until_aligned_reverse

	1:
	REG_L t1, (-1 * SZREG)(a4)
	addi  a4, a4, -SZREG
	addi  t4, t4, -SZREG
	REG_S t1, ( 0 * SZREG)(t4)
	bne   t4, t5, 1b

	j    byte_copy_reverse /* Copy any remaining bytes */

/*
 * These are basically sub-functions within the function. They
 * are used to byte copy until the dest pointer is in alignment,
 * at which point a bulk copy method can be used by the
 * calling code. These work on the same registers as the bulk
 * copy loops. Therefore, the register values can be picked
 * up from where they were left, and we avoid code duplication
 * without any overhead except the call-in and return jumps.
 */
byte_copy_until_aligned_forward:
	beq  t3, t5, 2f
	1:
	lb   t1, 0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb   t1, -1(t3)
	bne  t3, t5, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

byte_copy_until_aligned_reverse:
	beq  t4, t6, 2f
	1:
	lb   t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb   t1, 0(t4)
	bne  t4, t6, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

/*
 * Simple byte copy loops.
 * These will byte copy until they reach the end of data to copy.
 * At that point, they will call to return from memmove.
 */
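/*
 * Direction choice, as a sketch (illustrative only): when src (a1)
 * is below dest (a0), a forward copy through an overlap would
 * clobber source bytes before they are read, so the reverse loop
 * runs; otherwise forward is safe:
 *
 *	if (src < dest) copy_reverse(); else copy_forward();
 */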
byte_copy:
	bltu a1, a0, byte_copy_reverse

byte_copy_forward:
	beq  t3, t4, 2f
	1:
	lb   t1, 0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb   t1, -1(t3)
	bne  t3, t4, 1b
	2:
	ret

byte_copy_reverse:
	beq  t4, t3, 2f
	1:
	lb   t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb   t1, 0(t4)
	bne  t4, t3, 1b
	2:

return_from_memmove:
	ret

SYM_FUNC_END(memmove)
SYM_FUNC_END(__memmove)
SYM_FUNC_ALIAS(__pi_memmove, __memmove)
SYM_FUNC_ALIAS(__pi___memmove, __memmove)