/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/export.h>

/*
 * Syntax: GNU as, AT&T operand order (source first, destination last),
 * passed through the C preprocessor.
 */

.pushsection .noinstr.text, "ax"

/*
 * We build a jump to memcpy_orig by default which gets NOPped out on
 * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
 * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
 * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
 */

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 *
 * Modified on the fall-through (REP_GOOD) path: rcx, rdx, rsi, rdi, flags.
 */
SYM_FUNC_START_ALIAS(__memcpy)
SYM_FUNC_START_WEAK(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

	/*
	 * Reached when the jump above has been NOPped out:
	 * copy count / 8 quadwords, then the remaining count % 8 bytes.
	 */
	movq %rdi, %rax		/* return value: original destination */
	movq %rdx, %rcx
	shrq $3, %rcx		/* rcx = number of whole quadwords */
	andl $7, %edx		/* edx = leftover bytes (0..7) */
	rep movsq
	movl %edx, %ecx		/* 32-bit write zero-extends into rcx */
	rep movsb
	RET
SYM_FUNC_END(memcpy)
SYM_FUNC_END_ALIAS(__memcpy)
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 *
 * Same register interface as memcpy: rdi destination, rsi source,
 * rdx count; returns the original destination in rax.
 * The whole copy is a single byte-granular REP MOVSB.
 */
SYM_FUNC_START_LOCAL(memcpy_erms)
	movq %rdi, %rax		/* return value: original destination */
	movq %rdx, %rcx		/* rcx = byte count for rep movsb */
	rep movsb
	RET
SYM_FUNC_END(memcpy_erms)

/*
 * memcpy_orig - copy using plain 8-byte loads and stores (no string
 * instructions).
 *
 * Same register interface as memcpy: rdi destination, rsi source,
 * rdx count; returns the original destination in rax.
 *
 * Counts of 32 bytes or more are copied in 32-byte blocks, either
 * forward from the start or backward from the end; the remaining
 * 0..31 bytes are finished by .Lhandle_tail.
 *
 * May modify: rcx, rdx, rsi, rdi, r8-r11, flags.
 */
SYM_FUNC_START_LOCAL(memcpy_orig)
	movq %rdi, %rax		/* return value: original destination */

	cmpq $0x20, %rdx
	jb .Lhandle_tail	/* fewer than 32 bytes: tail code only */

	/*
	 * We check whether memory false dependence could occur,
	 * then jump to corresponding copy mode.
	 *
	 * Only the low byte of each pointer is compared, as a signed
	 * value: copy backward when the source's low byte is less than
	 * the destination's.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx	/* pre-bias so the loop's subq borrows on the last block */
.Lcopy_forward_loop:
	/*
	 * The carry flag set here is consumed by the jae at the bottom
	 * of the loop; the mov/lea instructions in between leave the
	 * flags untouched.
	 */
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
	addl $0x20, %edx	/* undo the bias: edx = bytes still to copy (0..31) */
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 * rsi/rdi now point one past the end of their buffers.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx	/* same pre-bias as the forward path */
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	/* Carry from this subq feeds the jae below (mov/lea keep flags). */
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 * edx becomes the number of bytes still to copy (0..31); stepping
	 * rsi/rdi back by that amount points them at the start of the
	 * uncopied region.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	/*
	 * Here edx holds the remaining byte count. Each case below loads
	 * the first and the last piece of the region (the two pieces may
	 * overlap) and performs all loads before any store.
	 */
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	RET
	.p2align 4
.Lless_3bytes:
	/* Despite the label name, this handles 0 to 3 remaining bytes. */
	subl $1, %edx		/* edx = count - 1; borrows when count was 0 */
	jb .Lend
	/*
	 * Move data from 1 bytes to 3 bytes.
	 */
	movzbl (%rsi), %ecx	/* first byte; movzbl leaves the subl flags intact */
	jz .Lstore_1byte	/* count was exactly 1 */
	movzbq 1(%rsi), %r8	/* second byte */
	movzbq (%rsi, %rdx), %r9	/* last byte, at offset count - 1 */
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	RET
SYM_FUNC_END(memcpy_orig)

.popsection