/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/export.h>

.section .noinstr.text, "ax"

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 *
 * The fast path below overwrites rcx, and 'rep movsb' consumes
 * rsi/rdi/rcx.
 *
 * The FSRM alternative should be done inline (avoiding the call and
 * the disgusting return handling), but that would require some help
 * from the compiler for better calling conventions.
 *
 * The 'rep movsb' itself is small enough to replace the call, but the
 * two register moves blow up the code. And one of them is "needed"
 * only for the return value that is the same as the destination input,
 * which the compiler could/should do much better anyway.
 */
SYM_TYPED_FUNC_START(__memcpy)
	/*
	 * Default instruction: jump to the open-coded copy. When
	 * X86_FEATURE_FSRM is set the empty replacement is selected
	 * instead, so execution falls through to 'rep movsb'.
	 */
	ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM

	movq %rdi, %rax			/* return value = original destination */
	movq %rdx, %rcx			/* rep count = byte count */
	rep movsb
	RET
SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)

SYM_FUNC_ALIAS_MEMFUNC(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)

/*
 * memcpy_orig - open-coded copy, reached from __memcpy via the
 * "jmp memcpy_orig" alternative.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 *
 * Modifies: rcx, rdx, rsi, rdi, r8-r11 and the flags (depending on the
 * path taken).
 *
 * Counts of 32 bytes or more are moved in 32-byte blocks, either
 * ascending or descending; whatever is left (0..31 bytes) is finished
 * by .Lhandle_tail, which copies the first and the last part of the
 * remainder with possibly overlapping accesses.
 */
SYM_FUNC_START_LOCAL(memcpy_orig)
	movq %rdi, %rax			/* return value = original destination */

	cmpq $0x20, %rdx
	jb .Lhandle_tail		/* fewer than 32 bytes: tail code only */

	/*
	 * We check whether memory false dependence could occur,
	 * then jump to corresponding copy mode.
	 *
	 * Only the low byte of each address is compared, as signed
	 * values: copy backward when source (sil) < destination (dil).
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx		/* bias: rdx = count - 32 */
.Lcopy_forward_loop:
	/*
	 * This subtraction sets the carry flag that the jae at the
	 * bottom of the loop tests; the movq/leaq in between do not
	 * modify the flags.
	 */
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop		/* no borrow: another full block left */
	/*
	 * Undo the bias: edx = bytes still to copy (0..31). The 32-bit
	 * write also clears the upper half of rdx.
	 */
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx		/* bias: rdx = count - 32 */
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	/* Sets the carry flag tested by the jae at the loop bottom. */
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop	/* no borrow: another full block left */

	/*
	 * Calculate copy position to head.
	 *
	 * edx becomes the number of bytes still to copy (0..31);
	 * stepping rsi/rdi back by that amount points them at the
	 * start of the buffers, where the uncopied bytes are.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 *
	 * First 16 and last 16 bytes; the two halves may overlap.
	 * All loads are done before any store.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 *
	 * First 8 and last 8 bytes; they may overlap.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 *
	 * First 4 and last 4 bytes; they may overlap.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	RET
	.p2align 4
.Lless_3bytes:
	/* Reached with 0..3 bytes left; edx becomes count - 1. */
	subl $1, %edx
	jb .Lend			/* borrow: count was 0, nothing to copy */
	/*
	 * Move data from 1 byte to 3 bytes.
	 *
	 * The jz below still tests the result of the subl above
	 * (movzbl does not modify the flags): zero means exactly one
	 * byte is left.
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8		/* byte 1 */
	movzbq (%rsi, %rdx), %r9	/* last byte (index count - 1) */
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)		/* byte 0 */

.Lend:
	RET
SYM_FUNC_END(memcpy_orig)