/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/export.h>

.section .noinstr.text, "ax"

/*
 * ISO C memset - fill a block of memory with a single byte value.
 *
 * In:
 *	rdi	destination
 *	rsi	fill value (char; only the low byte, %sil, is read)
 *	rdx	count in bytes
 * Out:
 *	rax	the destination pointer exactly as it was passed in
 *
 * Registers written by the code below: rax, rcx, rdi, r9.
 *
 * This entry point is the fast-string variant: a single 'rep stosb',
 * which performs better than the open-coded memset_orig further down
 * and is also simpler and shorter.
 *
 * Ideally the FSRS alternative would be emitted inline at each call
 * site, which would avoid both the call and the disgusting return value
 * handling. That needs help from the compiler in the form of a better
 * calling convention.
 *
 * The 'rep stosb' on its own is small enough to replace the call; it is
 * the register shuffling around it that blows up the code. Two of those
 * moves are "needed" only to produce the return value, which is simply
 * the destination pointer the caller already has, so the compiler
 * could/should do a much better job of it anyway.
 */
SYM_FUNC_START(__memset)
	/*
	 * Going by the FSRS note above, the jmp is the default and is
	 * expected to be patched out (replaced by the empty string) when
	 * the CPU has X86_FEATURE_FSRS, so execution falls through to the
	 * 'rep stosb' below. See <asm/alternative.h> for the exact rules.
	 */
	ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS

	movq	%rdi, %r9		/* rep stosb advances rdi; keep dst for the return */
	movb	%sil, %al		/* al = fill byte */
	movq	%rdx, %rcx		/* rcx = byte count for the rep prefix */
	rep stosb
	movq	%r9, %rax		/* return the original destination */
	RET
SYM_FUNC_END(__memset)
EXPORT_SYMBOL(__memset)

SYM_FUNC_ALIAS(memset, __memset)
EXPORT_SYMBOL(memset)

/*
 * memset_orig - open-coded fill, the target of the ALTERNATIVE jmp at
 * the top of __memset.
 *
 * Register contract is the same as __memset: rdi = destination,
 * rsi = fill value, rdx = byte count, rax = original destination on
 * return.
 *
 * Registers written: rax, rcx, rdx, rdi, r8, r9, r10, and the flags.
 *
 * Outline:
 *	1. replicate the fill byte across all eight bytes of rax;
 *	2. if dst is not 8-byte aligned and count > 7, do one unaligned
 *	   8-byte store and step dst up to the next 8-byte boundary;
 *	3. store 64 bytes per iteration while whole 64-byte chunks remain;
 *	4. store the remaining whole qwords, then the remaining bytes.
 */
SYM_FUNC_START_LOCAL(memset_orig)
	movq	%rdi, %r10		/* r10 = original dst, returned in rax */

	/* Expand the byte value: rax = fill byte * 0x0101010101010101. */
	movzbl	%sil, %ecx
	movabs	$0x0101010101010101, %rax
	imulq	%rcx, %rax

	/* Align dst: r9d = dst & 7, non-zero means not 8-byte aligned. */
	movl	%edi, %r9d
	andl	$7, %r9d
	jnz	.Lmisaligned_dst
.Ldst_aligned:

	movq	%rdx, %rcx
	shrq	$6, %rcx		/* rcx = number of whole 64-byte chunks */
	jz	.Ltail_qwords

	.p2align 4
.Lfill_64:
	/*
	 * The decq up front produces the ZF tested by the jnz at the
	 * bottom; the movq stores and the leaq in between do not modify
	 * the flags.
	 */
	decq	%rcx
	movq	%rax, (%rdi)
	movq	%rax, 8(%rdi)
	movq	%rax, 16(%rdi)
	movq	%rax, 24(%rdi)
	movq	%rax, 32(%rdi)
	movq	%rax, 40(%rdi)
	movq	%rax, 48(%rdi)
	movq	%rax, 56(%rdi)
	leaq	64(%rdi), %rdi
	jnz	.Lfill_64

	/*
	 * The tail is handled with small loops, which should be faster
	 * than hard to predict jump tables.
	 */
	.p2align 4
.Ltail_qwords:
	movl	%edx, %ecx
	andl	$0x38, %ecx		/* 63 & ~7: bytes left over in whole qwords */
	jz	.Ltail_bytes
	shrl	$3, %ecx		/* ecx = number of qwords to store */
	.p2align 4
.Lfill_8:
	decl	%ecx			/* flags consumed by the jnz below */
	movq	%rax, (%rdi)
	leaq	8(%rdi), %rdi
	jnz	.Lfill_8

.Ltail_bytes:
	andl	$7, %edx		/* edx = trailing bytes, 0..7 */
	jz	.Ldone
	.p2align 4
.Lfill_1:
	decl	%edx			/* flags consumed by the jnz below */
	movb	%al, (%rdi)
	leaq	1(%rdi), %rdi
	jnz	.Lfill_1

.Ldone:
	movq	%r10, %rax		/* return the original destination */
	RET

.Lmisaligned_dst:
	/*
	 * dst is not 8-byte aligned (r9 = dst & 7, in 1..7).
	 * With 7 bytes or fewer (unsigned compare) go straight to the
	 * byte loop. Otherwise count >= 8, so one 8-byte store at the
	 * unaligned dst stays inside the buffer; after it, skip the
	 * 8 - r9 bytes up to the next 8-byte boundary and continue with
	 * the aligned path.
	 */
	cmpq	$7, %rdx
	jbe	.Ltail_bytes
	movq	%rax, (%rdi)		/* unaligned store */
	movq	$8, %r8
	subq	%r9, %r8		/* r8 = bytes up to the 8-byte boundary */
	addq	%r8, %rdi
	subq	%r8, %rdx
	jmp	.Ldst_aligned
.Lfinal:				/* no reference to this label is visible in this file */
SYM_FUNC_END(memset_orig)