/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Normally compiler builtins are used, but sometimes the compiler calls out
 * of line code. Based on asm-i386/string.h.
 *
 * This assembly file was rewritten from the memmove_64.c file.
 *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
 */
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/export.h>

#undef memmove

/*
 * Implement memmove(). This can handle overlap between src and dst.
 *
 * Input:
 * rdi: dest
 * rsi: src
 * rdx: count
 *
 * Output:
 * rax: dest
 */
SYM_FUNC_START_WEAK(memmove)
SYM_FUNC_START(__memmove)

	mov %rdi, %rax

	/* Decide forward/backward copy mode */
	cmp %rdi, %rsi
	jge .Lmemmove_begin_forward
	mov %rsi, %r8
	add %rdx, %r8
	cmp %rdi, %r8
	jg 2f
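	/*
	 * src < dest here, but src + count <= dest, so the areas do not
	 * overlap and a forward copy is safe: fall through to
	 * .Lmemmove_begin_forward.
	 */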

	/* FSRM implies ERMS => no length checks, do the copy directly */
.Lmemmove_begin_forward:
	ALTERNATIVE "cmp $0x20, %rdx; jb 1f", "", X86_FEATURE_FSRM
	ALTERNATIVE "", "jmp .Lmemmove_erms", X86_FEATURE_ERMS
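	/*
	 * The ALTERNATIVEs are patched at boot: with FSRM the 32-byte
	 * length check is dropped, and with ERMS the copy jumps straight
	 * to the rep movsb path at .Lmemmove_erms.
	 */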

	/*
	 * The movsq instruction has a high startup latency, so small
	 * sizes are handled with general-purpose registers instead.
	 */
	cmp  $680, %rdx
	jb	3f
	/*
	 * movsq is only worthwhile when src and dest are similarly aligned.
	 */

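	/*
	 * Equal low address bytes imply that src and dest share the same
	 * alignment, so the rep movsq path at 4: can be used.
	 */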
	cmpb %dil, %sil
	je 4f
3:
	sub $0x20, %rdx
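	/*
	 * rdx is biased down by 0x20 here and corrected after the loop,
	 * so the jae at the bottom of the loop can test "at least 32
	 * bytes still to copy" directly from the borrow of the sub at 5:.
	 */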
	/*
	 * We gobble 32 bytes forward in each loop.
	 */
5:
	sub $0x20, %rdx
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r8
	leaq 4*8(%rsi), %rsi

	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, 2*8(%rdi)
	movq %r8, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
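	/*
	 * mov and lea do not modify the flags, so the jae below still
	 * tests the borrow from the sub at the top of the loop.
	 */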
	jae 5b
	addq $0x20, %rdx
	jmp 1f
	/*
	 * Handle data forward by movsq.
	 */
	.p2align 4
4:
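	/*
	 * Stash the last quadword of src and its destination before the
	 * rep movsq; storing the quadword afterwards covers the
	 * count % 8 tail bytes.
	 */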
	movq %rdx, %rcx
	movq -8(%rsi, %rdx), %r11
	lea -8(%rdi, %rdx), %r10
	shrq $3, %rcx
	rep movsq
	movq %r11, (%r10)
	jmp 13f
.Lmemmove_end_forward:

	/*
	 * Handle data backward by movsq.
	 */
	.p2align 4
7:
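	/*
	 * Save the first quadword of src and the start of dest, point
	 * src/dest at their last quadwords, and copy descending with the
	 * direction flag set (cleared again before returning).  Storing
	 * the saved quadword afterwards covers the count % 8 bytes at
	 * the head.
	 */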
	movq %rdx, %rcx
	movq (%rsi), %r11
	movq %rdi, %r10
	leaq -8(%rsi, %rdx), %rsi
	leaq -8(%rdi, %rdx), %rdi
	shrq $3, %rcx
	std
	rep movsq
	cld
	movq %r11, (%r10)
	jmp 13f

	/*
	 * Prepare for the backward copy.
	 */
	.p2align 4
2:
	cmp $0x20, %rdx
	jb 1f
	cmp $680, %rdx
	jb 6f
	cmp %dil, %sil
	je 7b
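	/*
	 * Label 6 handles mid-sized copies (jumped to above) and large
	 * copies whose src/dest alignment differs (fall-through), using a
	 * 32-byte-per-iteration backward register loop.
	 */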
6:
	/*
	 * Advance src and dest to the tail of the copy.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * We gobble 32 bytes backward in each loop.
	 */
8:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r11
	movq -2*8(%rsi), %r10
	movq -3*8(%rsi), %r9
	movq -4*8(%rsi), %r8
	leaq -4*8(%rsi), %rsi

	movq %r11, -1*8(%rdi)
	movq %r10, -2*8(%rdi)
	movq %r9, -3*8(%rdi)
	movq %r8, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
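	/*
	 * As in the forward loop, mov and lea leave the flags intact, so
	 * the jae below tests the borrow from the sub at 8:.
	 */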
	jae 8b
	/*
	 * Rewind src and dest to the start of the uncopied head.
	 */
	addq $0x20, %rdx
	subq %rdx, %rsi
	subq %rdx, %rdi
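	/*
	 * Common tail: 1: handles the remaining 0..31 bytes for both copy
	 * directions.  Each size class loads both ends into registers
	 * before any store, so overlapping src and dest remain safe.
	 */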
1:
	cmpq $16, %rdx
	jb 9f
	/*
	 * Copy 16 to 31 bytes.
	 */
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq -2*8(%rsi, %rdx), %r9
	movq -1*8(%rsi, %rdx), %r8
	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, -2*8(%rdi, %rdx)
	movq %r8, -1*8(%rdi, %rdx)
	jmp 13f
	.p2align 4
9:
	cmpq $8, %rdx
	jb 10f
	/*
	 * Copy 8 to 15 bytes.
	 */
	movq 0*8(%rsi), %r11
	movq -1*8(%rsi, %rdx), %r10
	movq %r11, 0*8(%rdi)
	movq %r10, -1*8(%rdi, %rdx)
	jmp 13f
10:
	cmpq $4, %rdx
	jb 11f
	/*
	 * Copy 4 to 7 bytes.
	 */
	movl (%rsi), %r11d
	movl -4(%rsi, %rdx), %r10d
	movl %r11d, (%rdi)
	movl %r10d, -4(%rdi, %rdx)
	jmp 13f
11:
	cmp $2, %rdx
	jb 12f
	/*
	 * Copy 2 to 3 bytes.
	 */
	movw (%rsi), %r11w
	movw -2(%rsi, %rdx), %r10w
	movw %r11w, (%rdi)
	movw %r10w, -2(%rdi, %rdx)
	jmp 13f
12:
	cmp $1, %rdx
	jb 13f
	/*
	 * Copy the remaining byte.
	 */
	movb (%rsi), %r11b
	movb %r11b, (%rdi)
13:
	RET

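/*
 * Simple variant for CPUs with Enhanced/Fast REP MOVSB: one ascending
 * rep movsb.  This path is reached only from the forward-copy case, where
 * an ascending byte copy is safe even when src and dest overlap.
 */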
.Lmemmove_erms:
	movq %rdx, %rcx
	rep movsb
	RET
SYM_FUNC_END(__memmove)
SYM_FUNC_END_ALIAS(memmove)
EXPORT_SYMBOL(__memmove)
EXPORT_SYMBOL(memmove)
