/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/export.h>

.section .noinstr.text, "ax"

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 *
 * The FSRM alternative should be done inline (avoiding the call and
 * the disgusting return handling), but that would require some help
 * from the compiler for better calling conventions.
 *
 * The 'rep movsb' itself is small enough to replace the call, but the
 * two register moves blow up the code. And one of them is "needed"
 * only for the return value, which is simply the destination argument
 * passed in, and which the compiler could/should do much better anyway.
 */
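/*
 * For illustration only: a rough C-level sketch of the FSRM fast path
 * below, not the code the kernel actually builds; the real entry point
 * is the hand-written assembly that follows.
 *
 *	void *memcpy(void *dst, const void *src, size_t len)
 *	{
 *		void *ret = dst;
 *
 *		asm volatile("rep movsb"
 *			     : "+D" (dst), "+S" (src), "+c" (len)
 *			     : : "memory");
 *		return ret;
 *	}
 */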
SYM_TYPED_FUNC_START(__memcpy)
	ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM
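	/*
	 * On CPUs without X86_FEATURE_FSRM the 'jmp memcpy_orig' above is
	 * kept; on FSRM parts it is patched out at boot, so execution
	 * falls through to the 'rep movsb' fast path below.
	 */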

	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	RET
SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)

SYM_FUNC_ALIAS(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)

SYM_FUNC_START_LOCAL(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * Check whether a memory false dependence could occur, then
	 * jump to the corresponding copy direction.
	 */
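	/*
	 * Only the low address bytes are compared below: a cheap
	 * heuristic. If the source's low byte is below the destination's,
	 * a forward copy risks loads whose low address bits alias the
	 * stores just issued, so the backward direction is preferred.
	 */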
	cmp  %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20,	%rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi),	%r8
	movq 1*8(%rsi),	%r9
	movq 2*8(%rsi),	%r10
	movq 3*8(%rsi),	%r11
	leaq 4*8(%rsi),	%rsi

	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	2*8(%rdi)
	movq %r11,	3*8(%rdi)
	leaq 4*8(%rdi),	%rdi
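	/*
	 * 'jae' below consumes CF from the 'subq' at the top of the loop;
	 * the intervening mov/lea instructions do not modify flags.
	 */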
	jae  .Lcopy_forward_loop
	addl $0x20,	%edx
	jmp  .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx,	%rsi
	addq %rdx,	%rdi
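	/*
	 * rsi/rdi now point one past the end of the buffers. As in the
	 * forward path, 0x20 is pre-subtracted from the count below so
	 * the loop's 'jae' can key off the borrow from its own 'subq'.
	 */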
	subq $0x20,	%rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20,	%rdx
	movq -1*8(%rsi),	%r8
	movq -2*8(%rsi),	%r9
	movq -3*8(%rsi),	%r10
	movq -4*8(%rsi),	%r11
	leaq -4*8(%rsi),	%rsi
	movq %r8,		-1*8(%rdi)
	movq %r9,		-2*8(%rdi)
	movq %r10,		-3*8(%rdi)
	movq %r11,		-4*8(%rdi)
	leaq -4*8(%rdi),	%rdi
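	/*
	 * Same pattern as the forward loop: 'jae' uses the borrow from
	 * the 'subq' at the top of this loop.
	 */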
	jae  .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20,	%edx
	subq %rdx,	%rsi
	subq %rdx,	%rdi
.Lhandle_tail:
	cmpl $16,	%edx
	jb   .Lless_16bytes

	/*
	 * Copy 16 to 31 bytes.
	 */
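	/*
	 * The head and tail qword pairs may overlap in the middle when
	 * the count is below 32; the overlapping destination bytes are
	 * simply written twice with the same data.
	 */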
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi),	%r9
	movq -2*8(%rsi, %rdx),	%r10
	movq -1*8(%rsi, %rdx),	%r11
	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	-2*8(%rdi, %rdx)
	movq %r11,	-1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_16bytes:
	cmpl $8,	%edx
	jb   .Lless_8bytes
	/*
	 * Copy 8 to 15 bytes.
	 */
	movq 0*8(%rsi),	%r8
	movq -1*8(%rsi, %rdx),	%r9
	movq %r8,	0*8(%rdi)
	movq %r9,	-1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_8bytes:
	cmpl $4,	%edx
	jb   .Lless_3bytes

	/*
	 * Copy 4 to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	RET
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Copy 1 to 3 bytes.
	 */
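	/*
	 * The flags from the 'subl' above are still live: 'jz' below
	 * fires when the count was exactly 1 (movzbl does not modify
	 * flags).
	 */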
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	RET
SYM_FUNC_END(memcpy_orig)