/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2002 Andi Kleen, SuSE Labs */
38c2ecf20Sopenharmony_ci
48c2ecf20Sopenharmony_ci#include <linux/linkage.h>
58c2ecf20Sopenharmony_ci#include <asm/cpufeatures.h>
68c2ecf20Sopenharmony_ci#include <asm/alternative.h>
78c2ecf20Sopenharmony_ci#include <asm/export.h>
88c2ecf20Sopenharmony_ci
/*
 * ISO C memset - set a memory block to a byte value. This function uses fast
 * string instructions (REP STOSQ for the bulk, REP STOSB for the last 0-7
 * bytes) to get better performance than the open-coded memset_orig. The code
 * is simpler and shorter than memset_orig as well.
 *
 * In:
 *   rdi   destination
 *   rsi   value (char; only the low byte, %sil, is used)
 *   rdx   count (bytes)
 *
 * Out:
 *   rax   original destination
 *
 * Clobbers: rcx, rdx, rsi, rdi, r9, flags.
 */
SYM_FUNC_START_WEAK(memset)
SYM_FUNC_START(__memset)
	/*
	 * Select the implementation through the alternatives mechanism:
	 *
	 *  - X86_FEATURE_ERMS: the CPU supports the enhanced REP MOVSB/STOSB
	 *    feature, which is recommended whenever available, so jump to
	 *    memset_erms.
	 *  - X86_FEATURE_REP_GOOD (without ERMS): the jump is replaced by
	 *    nothing and execution falls through to the fast string code
	 *    below.
	 *  - neither feature: keep the default jump to memset_orig.
	 */
	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memset_erms", X86_FEATURE_ERMS

	movq %rdi,%r9		/* save dst: rep stos advances %rdi */
	movq %rdx,%rcx
	andl $7,%edx		/* edx = count % 8: bytes left after the qwords */
	shrq $3,%rcx		/* rcx = count / 8: number of qword stores */
	/* expand byte value: replicate the low byte of %rsi into all 8 bytes of %rax */
	movzbl %sil,%esi
	movabs $0x0101010101010101,%rax
	imulq %rsi,%rax
	rep stosq
	movl %edx,%ecx		/* 32-bit move zero-extends into %rcx */
	rep stosb
	movq %r9,%rax		/* return the original destination */
	RET
SYM_FUNC_END(__memset)
SYM_FUNC_END_ALIAS(memset)
EXPORT_SYMBOL(memset)
EXPORT_SYMBOL(__memset)
488c2ecf20Sopenharmony_ci
/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * enhanced rep stosb to override the fast string function.
 * The code is simpler and shorter than the fast string function as well.
 *
 * In:
 *   rdi   destination
 *   rsi   value (char; only the low byte, %sil, is used)
 *   rdx   count (bytes)
 *
 * Out:
 *   rax   original destination
 *
 * Clobbers: rcx, rdi, r9.
 */
SYM_FUNC_START_LOCAL(memset_erms)
	movq %rdi,%r9		/* save dst: rep stosb advances %rdi */
	movb %sil,%al		/* stosb stores %al only; the rest of %rax is irrelevant */
	movq %rdx,%rcx		/* rcx = full byte count */
	rep stosb
	movq %r9,%rax		/* return the original destination */
	RET
SYM_FUNC_END(memset_erms)
688c2ecf20Sopenharmony_ci
/*
 * memset_orig - open-coded memset using plain stores instead of string
 * instructions.
 *
 * In:
 *   rdi   destination
 *   rsi   value (char; only the low byte, %sil, is used)
 *   rdx   count (bytes)
 *
 * Out:
 *   rax   original destination
 *
 * Register roles inside the function:
 *   rax   fill pattern (the byte replicated into all 8 bytes)
 *   r10   saved original destination
 *   r9    destination & 7 (misalignment of the destination)
 *   r8    8 - (destination & 7), bytes needed to reach 8-byte alignment
 *   rcx   loop counter
 *
 * Clobbers: rcx, rdx, rdi, r8, r9, r10, flags.
 */
SYM_FUNC_START_LOCAL(memset_orig)
	movq %rdi,%r10		/* save dst for the return value */

	/* expand byte value: replicate the low byte of %rsi into all 8 bytes of %rax */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	imulq %rcx,%rax

	/* align dst */
	movl %edi,%r9d
	andl $7,%r9d		/* r9d = dst & 7 */
	jnz .Lbad_alignment
.Lafter_bad_alignment:

	movq %rdx,%rcx
	shrq $6,%rcx		/* rcx = number of whole 64-byte blocks */
	jz .Lhandle_tail

	/*
	 * Store 64 bytes per iteration. The counter is decremented first;
	 * neither movq nor leaq modifies the flags, so the jnz at the bottom
	 * still tests the result of the decq.
	 */
	.p2align 4
.Lloop_64:
	decq %rcx
	movq %rax,(%rdi)
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)
	movq %rax,24(%rdi)
	movq %rax,32(%rdi)
	movq %rax,40(%rdi)
	movq %rax,48(%rdi)
	movq %rax,56(%rdi)
	leaq 64(%rdi),%rdi
	jnz .Lloop_64

	/* Handle tail in loops. The loops should be faster than hard
	   to predict jump tables. */
	.p2align 4
.Lhandle_tail:
	movl %edx,%ecx
	andl $63&(~7),%ecx	/* bytes of the sub-64 remainder that form whole qwords */
	jz .Lhandle_7
	shrl $3,%ecx		/* ecx = number of qwords (1-7) */
	.p2align 4
.Lloop_8:
	decl %ecx		/* flags from this decl are consumed by the jnz below */
	movq %rax,(%rdi)
	leaq 8(%rdi),%rdi
	jnz .Lloop_8

.Lhandle_7:
	andl $7,%edx		/* edx = trailing bytes (0-7) */
	jz .Lende
	.p2align 4
.Lloop_1:
	decl %edx		/* flags from this decl are consumed by the jnz below */
	movb %al,(%rdi)
	leaq 1(%rdi),%rdi
	jnz .Lloop_1

.Lende:
	movq %r10,%rax		/* return the original destination */
	RET

	/*
	 * Destination is not 8-byte aligned (r9 = dst & 7, non-zero).
	 * With 7 bytes or fewer there is no room for a qword store, so go
	 * straight to the byte loop. Otherwise count >= 8, so one unaligned
	 * qword store at dst stays inside the buffer; then advance dst to the
	 * next 8-byte boundary and shrink count by the same amount. The bytes
	 * between the new dst and dst + 8 are simply written again by the
	 * aligned stores that follow.
	 */
.Lbad_alignment:
	cmpq $7,%rdx
	jbe .Lhandle_7
	movq %rax,(%rdi)	/* unaligned store */
	movq $8,%r8
	subq %r9,%r8		/* r8 = 8 - (dst & 7) */
	addq %r8,%rdi
	subq %r8,%rdx
	jmp .Lafter_bad_alignment
SYM_FUNC_END(memset_orig)
141