/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/export.h>

/*
 * ISO C memset - set a memory block to a byte value. The code that follows
 * the ALTERNATIVE_2 line is the "fast string" variant: it fills whole
 * quadwords with rep stosq and the remaining 0-7 bytes with rep stosb.
 *
 * Syntax: GAS, AT&T operand order (source, destination).
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 *
 * Registers written on the fast-string path: rax, rcx, rdx, rsi, rdi, r9,
 * flags.
 */
SYM_FUNC_START_WEAK(memset)
SYM_FUNC_START(__memset)
	/*
	 * Pick one of three implementations:
	 *
	 *  - X86_FEATURE_ERMS (enhanced REP MOVSB/STOSB): jump to memset_erms.
	 *  - X86_FEATURE_REP_GOOD (fast string instructions): the replacement
	 *    text is empty, so execution falls through into the code below.
	 *  - neither: jump to memset_orig, the open-coded store loops.
	 */
	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memset_erms", X86_FEATURE_ERMS

	movq %rdi,%r9			/* keep dst for the return value */
	movq %rdx,%rcx
	andl $7,%edx			/* edx = count % 8: trailing bytes */
	shrq $3,%rcx			/* rcx = count / 8: whole quadwords */
	/* expand byte value: replicate the low byte of rsi into all 8 bytes of rax */
	movzbl %sil,%esi
	movabs $0x0101010101010101,%rax
	imulq %rsi,%rax
	rep stosq			/* store rcx quadwords, advancing rdi */
	movl %edx,%ecx
	rep stosb			/* store the remaining 0-7 bytes */
	movq %r9,%rax
	RET
SYM_FUNC_END(__memset)
SYM_FUNC_END_ALIAS(memset)
EXPORT_SYMBOL(memset)
EXPORT_SYMBOL(__memset)

/*
 * ISO C memset - set a memory block to a byte value. This variant is the
 * target of the X86_FEATURE_ERMS alternative in __memset and fills the
 * whole block with a single rep stosb; no byte expansion or quadword/tail
 * split is needed.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 *
 * Registers written: rax, rcx, rdi, r9.
 */
SYM_FUNC_START_LOCAL(memset_erms)
	movq %rdi,%r9			/* keep dst for the return value */
	movb %sil,%al			/* rep stosb only reads al */
	movq %rdx,%rcx			/* rcx = byte count */
	rep stosb
	movq %r9,%rax
	RET
SYM_FUNC_END(memset_erms)

/*
 * memset without string instructions; the default target of the
 * ALTERNATIVE_2 in __memset. Same register interface as above:
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 *
 * Register roles inside the function:
 *   rax   fill byte replicated into all 8 bytes
 *   rdi   current store address
 *   rdx   bytes still to be accounted for
 *   rcx   loop counter
 *   r9d   dst & 7 (misalignment of the start address)
 *   r8    bytes skipped to reach 8-byte alignment
 *   r10   saved original destination
 *
 * Registers written: rax, rcx, rdx, rdi, r8, r9, r10, flags.
 *
 * In every loop the counter is decremented first and the pointer is
 * advanced with leaq, which does not modify flags, so the closing jnz still
 * tests the result of the dec.
 */
SYM_FUNC_START_LOCAL(memset_orig)
	movq %rdi,%r10			/* keep dst for the return value */

	/* expand byte value: replicate the low byte of rsi into all 8 bytes of rax */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	imulq  %rcx,%rax

	/* align dst */
	movl  %edi,%r9d
	andl  $7,%r9d			/* r9d = dst & 7 */
	jnz  .Lbad_alignment
.Lafter_bad_alignment:

	movq  %rdx,%rcx
	shrq  $6,%rcx			/* rcx = number of 64-byte blocks */
	jz	 .Lhandle_tail

	.p2align 4
.Lloop_64:
	decq  %rcx
	movq  %rax,(%rdi)
	movq  %rax,8(%rdi)
	movq  %rax,16(%rdi)
	movq  %rax,24(%rdi)
	movq  %rax,32(%rdi)
	movq  %rax,40(%rdi)
	movq  %rax,48(%rdi)
	movq  %rax,56(%rdi)
	leaq  64(%rdi),%rdi
	jnz    .Lloop_64

	/*
	 * Handle the tail in loops. The loops should be faster than
	 * hard-to-predict jump tables.
	 */
	.p2align 4
.Lhandle_tail:
	movl	%edx,%ecx
	andl    $63&(~7),%ecx		/* bytes in the whole quadwords left below 64 */
	jz 		.Lhandle_7
	shrl	$3,%ecx			/* convert bytes to a quadword count */
	.p2align 4
.Lloop_8:
	decl   %ecx
	movq  %rax,(%rdi)
	leaq  8(%rdi),%rdi
	jnz    .Lloop_8

.Lhandle_7:
	andl	$7,%edx			/* edx = final 0-7 bytes */
	jz      .Lende
	.p2align 4
.Lloop_1:
	decl    %edx
	movb 	%al,(%rdi)
	leaq	1(%rdi),%rdi
	jnz     .Lloop_1

.Lende:
	movq	%r10,%rax
	RET

	/*
	 * Destination is not 8-byte aligned. Blocks of at most 7 bytes go
	 * straight to the byte loop. Otherwise one unaligned quadword store
	 * covers the first 8 bytes, then rdi is advanced by r8 = 8 - (dst & 7)
	 * to the next 8-byte boundary and the count is reduced by the same
	 * amount. The bytes between the new rdi and dst + 8 are simply
	 * written again by the stores that follow.
	 */
.Lbad_alignment:
	cmpq $7,%rdx
	jbe	.Lhandle_7
	movq %rax,(%rdi)	/* unaligned store */
	movq $8,%r8
	subq %r9,%r8
	addq %r8,%rdi
	subq %r8,%rdx
	jmp .Lafter_bad_alignment
SYM_FUNC_END(memset_orig)