/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/export.h>

.section .noinstr.text, "ax"

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 *
 * The FSRM alternative should be done inline (avoiding the call and
 * the disgusting return handling), but that would require some help
 * from the compiler for better calling conventions.
 *
 * The 'rep movsb' itself is small enough to replace the call, but the
 * two register moves blow up the code. And one of them is "needed"
 * only for the return value that is the same as the destination input,
 * which the compiler could/should do much better anyway.
 */
SYM_TYPED_FUNC_START(__memcpy)
	/*
	 * With X86_FEATURE_FSRM the jump is replaced by the (empty)
	 * alternative and execution falls through to 'rep movsb';
	 * otherwise the open-coded copy in memcpy_orig is used.
	 */
	ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM

	movq %rdi, %rax		/* return value: original destination */
	movq %rdx, %rcx		/* 'rep' takes its byte count from rcx */
	rep movsb
	RET
SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)

SYM_FUNC_ALIAS(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)

/*
 * memcpy_orig - open-coded copy, reached from __memcpy.
 *
 * Same register interface as __memcpy: rdi destination, rsi source,
 * rdx count; rax returns the original destination.
 *
 * Scratch registers written here: rcx, r8, r9, r10, r11 (plus the
 * argument registers rdi, rsi, rdx, which are advanced/consumed).
 *
 * Counts of 32 bytes or more are copied in 32-byte blocks, forward or
 * backward, leaving 0..31 bytes for the tail code. Each tail case for
 * 4 bytes and up copies a chunk anchored at the start and a chunk
 * anchored at the end of the remaining region (the two may overlap),
 * performing all loads before any store.
 */
SYM_FUNC_START_LOCAL(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * We check whether memory false dependence could occur,
	 * then jump to corresponding copy mode.
	 * Only the low bytes of the two pointers are compared (signed).
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	/*
	 * This subtraction sets CF for the 'jae' at the bottom of the
	 * loop: movq and leaq in between leave the flags untouched.
	 */
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
	/*
	 * Undo the last subtraction: edx = bytes still to copy (0..31).
	 * The 32-bit write also clears the upper half of rdx.
	 */
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	/* As in the forward loop, CF from here feeds the 'jae' below. */
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 * edx becomes the number of bytes left (0..31); stepping both
	 * pointers back by that amount puts them at the start of the
	 * not-yet-copied head of the buffers.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes:
	 * first 16 bytes plus last 16 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes:
	 * first 8 bytes plus last 8 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes:
	 * first 4 bytes plus last 4 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	RET
	.p2align 4
.Lless_3bytes:
	/*
	 * 0..3 bytes left. After the decrement edx = count - 1:
	 * a borrow (CF) means the count was 0, ZF means it was 1.
	 */
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 * movzbl does not modify flags, so 'jz' still tests the
	 * result of the subl above.
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	/* Count is 2 or 3: copy byte 1 and the last byte (index rdx). */
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	RET
SYM_FUNC_END(memcpy_orig)