xref: /kernel/linux/linux-6.6/arch/x86/lib/memset_64.S (revision 62306a36)
162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0 */
262306a36Sopenharmony_ci/* Copyright 2002 Andi Kleen, SuSE Labs */
362306a36Sopenharmony_ci
462306a36Sopenharmony_ci#include <linux/linkage.h>
562306a36Sopenharmony_ci#include <asm/cpufeatures.h>
662306a36Sopenharmony_ci#include <asm/alternative.h>
762306a36Sopenharmony_ci#include <asm/export.h>
862306a36Sopenharmony_ci
962306a36Sopenharmony_ci.section .noinstr.text, "ax"
1062306a36Sopenharmony_ci
/*
 * ISO C memset - fill a memory block with a byte value.
 *
 * Fast path: one 'rep stosb'. The ALTERNATIVE at function entry selects
 * between that fast path and a jump to memset_orig, keyed on
 * X86_FEATURE_FSRS (see the ALTERNATIVE macro in <asm/alternative.h>).
 *
 * In:
 *   rdi   destination
 *   rsi   value (char; only the low byte, sil, is read)
 *   rdx   count (bytes)
 *
 * Out:
 *   rax   original destination
 *
 * Scratch on the fast path: rcx, r9, and rdi (rep stosb advances it).
 *
 * Ideally the FSRS variant would be emitted inline at the call site (no
 * call, no awkward return-value handling), but that needs help from the
 * compiler in the form of better calling conventions.
 *
 * 'rep stosb' on its own is small enough to stand in for the call; it is
 * the surrounding register moves that bloat the code. Two of those moves
 * exist only to produce the return value, which is simply the destination
 * pointer that was passed in - something the compiler could/should handle
 * much better itself.
 */
SYM_FUNC_START(__memset)
	/* Either falls through to rep stosb or jumps to the generic version. */
	ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS

	movq	%rdx,%rcx		/* rcx = repeat count for rep stosb */
	movb	%sil,%al		/* al  = fill byte */
	movq	%rdi,%r9		/* stash dst: rep stosb moves rdi */
	rep stosb
	movq	%r9,%rax		/* hand back the original destination */
	RET
SYM_FUNC_END(__memset)
EXPORT_SYMBOL(__memset)

/*
 * memset is provided as an alias of __memset; the exact kind of alias is
 * decided by SYM_FUNC_ALIAS_MEMFUNC, which is defined outside this file.
 */
SYM_FUNC_ALIAS_MEMFUNC(memset, __memset)
EXPORT_SYMBOL(memset)
4562306a36Sopenharmony_ci
/*
 * memset_orig - generic memset using plain 8-byte and 1-byte stores.
 *
 * Entered through the 'jmp memset_orig' in __memset, so it sees the same
 * argument registers and its RET returns to the caller of __memset.
 *
 * In:
 *   rdi   destination
 *   rsi   value (only the low byte, sil, is read)
 *   rdx   count (bytes)
 *
 * Out:
 *   rax   original destination
 *
 * Register use inside the function:
 *   r10   saved destination for the return value
 *   rax   fill byte replicated into all eight bytes
 *   rcx   loop counter (64-byte chunks, then 8-byte words)
 *   r9    dst & 7, i.e. how far dst is from 8-byte alignment
 *   r8    8 - r9, bytes to skip to reach the next 8-byte boundary
 *   rdi   running store pointer; rdx is reduced to the remaining count
 */
SYM_FUNC_START_LOCAL(memset_orig)
	movq	%rdi,%r10		/* keep dst for the return value */

	/* expand byte value: rax = value * 0x0101010101010101 */
	movzbl	%sil,%ecx
	movabs	$0x0101010101010101,%rax
	imulq	%rcx,%rax

	/* align dst: r9d = dst & 7, non-zero means unaligned */
	movl	%edi,%r9d
	andl	$7,%r9d
	jnz	.Lbad_alignment
.Lafter_bad_alignment:

	/* rcx = number of whole 64-byte chunks */
	movq	%rdx,%rcx
	shrq	$6,%rcx
	jz	.Lhandle_tail

	.p2align 4
.Lloop_64:
	/*
	 * The jnz at the bottom tests the flags set by this decq; the
	 * stores and the leaq in between do not modify flags.
	 */
	decq	%rcx
	movq	%rax,(%rdi)
	movq	%rax,8(%rdi)
	movq	%rax,16(%rdi)
	movq	%rax,24(%rdi)
	movq	%rax,32(%rdi)
	movq	%rax,40(%rdi)
	movq	%rax,48(%rdi)
	movq	%rax,56(%rdi)
	leaq	64(%rdi),%rdi
	jnz	.Lloop_64

	/*
	 * Handle tail in loops. The loops should be faster than hard
	 * to predict jump tables.
	 */
	.p2align 4
.Lhandle_tail:
	/* ecx = remaining count & 0x38: bytes that form whole 8-byte words */
	movl	%edx,%ecx
	andl	$63&(~7),%ecx
	jz	.Lhandle_7
	shrl	$3,%ecx			/* byte count -> word count */
	.p2align 4
.Lloop_8:
	decl	%ecx			/* flags consumed by jnz below */
	movq	%rax,(%rdi)
	leaq	8(%rdi),%rdi
	jnz	.Lloop_8

.Lhandle_7:
	/* edx = remaining count & 7: trailing single bytes */
	andl	$7,%edx
	jz	.Lende
	.p2align 4
.Lloop_1:
	decl	%edx			/* flags consumed by jnz below */
	movb	%al,(%rdi)
	leaq	1(%rdi),%rdi
	jnz	.Lloop_1

.Lende:
	movq	%r10,%rax		/* return the original destination */
	RET

.Lbad_alignment:
	/* 7 bytes or fewer in total: the byte loop handles everything. */
	cmpq	$7,%rdx
	jbe	.Lhandle_7
	/*
	 * At least 8 bytes: one unaligned 8-byte store covers the head,
	 * then dst is advanced to the next 8-byte boundary and the count
	 * is reduced by the same amount.
	 */
	movq	%rax,(%rdi)		/* unaligned store */
	movq	$8,%r8
	subq	%r9,%r8			/* r8 = 8 - (dst & 7) */
	addq	%r8,%rdi
	subq	%r8,%rdx
	jmp	.Lafter_bad_alignment
SYM_FUNC_END(memset_orig)
118