/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/code-patching-asm.h>
#include <asm/kasan.h>

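/*
 * Copy one 16-byte chunk.  r4 (source) and r6 (destination) point 4 bytes
 * below the chunk, so the final lwzu/stwu leave them 4 bytes below the
 * next one.
 */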
#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)

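/*
 * As COPY_16_BYTES, but every load and store carries a numbered label
 * (8n0 .. 8n7) so that COPY_16_BYTES_EXCODE(n) can attach exception
 * table entries to it for __copy_tofrom_user.
 */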
#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

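/*
 * Fixup code for COPY_16_BYTES_WITHEX(n).  A fault in chunk n of a
 * cacheline lands here: subtract the 16*n bytes already copied by the
 * earlier chunks of that line from r5, then branch to the common read
 * (104f) or write (105f) fault handler.
 */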
#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
	EX_TABLE(8 ## n ## 0b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 1b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 2b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 3b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 4b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 5b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 6b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 7b,9 ## n ## 1b)

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

#ifndef CONFIG_KASAN
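/*
 * void *memset16(u16 *s, u16 v, __kernel_size_t count)
 *
 * r3 = s, r4 = v, r5 = count in 16-bit elements (not bytes).
 * Stores count/2 words of the replicated value, then a trailing
 * halfword if count is odd.  Returns s in r3.
 */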
_GLOBAL(memset16)
	rlwinm.	r0, r5, 31, 1, 31
	addi	r6, r3, -4
	beq-	2f
	rlwimi	r4, r4, 16, 0, 15
	mtctr	r0
1:	stwu	r4, 4(r6)
	bdnz	1b
2:	andi.	r0, r5, 1
	beqlr
	sth	r4, 4(r6)
	blr
EXPORT_SYMBOL(memset16)
#endif

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 *
 * During early init, the cache might not be active yet, so dcbz cannot be
 * used.  We therefore skip the optimised block that uses dcbz.  The branch
 * below is patched to a 'bne' once the cache is active, so the dcbz path
 * is only taken when r4 is zero.  This is done in machine_init().
 */
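/*
 * void *memset(void *s, int c, size_t count)
 *
 * r3 = s, r4 = c (byte value), r5 = count in bytes.  Returns s in r3.
 */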
_GLOBAL_KASAN(memset)
	cmplwi	0,r5,4
	blt	7f

	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15

	stw	r4,0(r3)
	beqlr
	andi.	r0,r3,3
	add	r5,r0,r5
	subf	r6,r0,r3
	cmplwi	0,r4,0
	/*
	 * Skip the optimised block until the cache is enabled.  This branch
	 * is replaced by 'bne' during boot, so the normal procedure is used
	 * whenever r4 is not zero.
	 */
5:	b	2f
	patch_site	5b, patch__memset_nocache

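	/*
	 * dcbz path (r4 is zero and the cache is enabled): store words up
	 * to the next cacheline boundary, zero whole lines with dcbz, then
	 * let the word/byte loops below finish the tail.
	 */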
	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4
10:	dcbz	r7,r6
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4

2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r3,-1
9:	stbu	r4,1(r6)
	bdnz	9b
	blr
EXPORT_SYMBOL(memset)
EXPORT_SYMBOL_KASAN(memset)

/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 *
 * During early init, the cache might not be active yet, so dcbz cannot be
 * used.  We therefore jump to generic_memcpy, which doesn't use dcbz.  This
 * jump is replaced by a nop once the cache is active.  This is done in
 * machine_init().
 */
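/*
 * void *memcpy(void *to, const void *from, size_t n)
 * void *memmove(void *to, const void *from, size_t n)
 *
 * r3 = to, r4 = from, r5 = n.  Both return the original 'to' in r3.
 */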
_GLOBAL_KASAN(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL_KASAN(memcpy)
1:	b	generic_memcpy
	patch_site	1b, patch__memcpy_nocache

	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	generic_memcpy		/* if regions overlap */

	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	addi	r6,r6,1
	stb	r9,3(r6)
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

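	/*
	 * Copy the complete cachelines.  Each destination line is
	 * established with dcbz first, so it is not read in from
	 * memory, and then filled 16 bytes at a time.
	 */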
58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
	dcbz	r11,r6
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
	addi	r4,r4,3
	addi	r6,r6,3
40:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	40b
65:	blr
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(memmove)
EXPORT_SYMBOL_KASAN(memcpy)
EXPORT_SYMBOL_KASAN(memmove)

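/*
 * generic_memcpy: forward copy without dcbz.  Used before the data cache
 * is enabled and when the source and destination regions overlap.
 * Byte-copies until the destination is word-aligned, then moves 8 bytes
 * per iteration, and finishes with word/byte tails.
 */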
generic_memcpy:
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

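/*
 * backwards_memcpy: copy r5 bytes from r4 to r3, walking down from the
 * end of both buffers.  Used by memmove when the destination starts
 * above the source, so overlapping regions are copied safely.
 */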
_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

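/*
 * unsigned long __copy_tofrom_user(void __user *to,
 *		const void __user *from, unsigned long size)
 *
 * r3 = to, r4 = from, r5 = size.  Returns the number of bytes that
 * could not be copied (0 on complete success).  Faulting accesses are
 * caught by the exception table entries and funnelled into the fixup
 * code below, which works out how many bytes remain.
 */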
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	EX_TABLE(70b,100f)
	EX_TABLE(71b,101f)
	EX_TABLE(72b,102f)
	EX_TABLE(73b,103f)

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

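	/*
	 * Copy r0 - r7 cachelines in this pass, with each iteration
	 * touching the source r3 bytes (r7 lines) ahead; the last r7
	 * already-prefetched lines are copied by looping back to 114
	 * with r7 = 0.
	 */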
114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	EX_TABLE(54b,105f)
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr

/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
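/*
 * r3 is the log2 of the element size of the loop that faulted:
 * 0 for the byte loops, 2 for the word loops, and LG_CACHELINE_BYTES
 * for the cacheline loop.
 */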
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* if the byte-at-a-time retry faults too, return the number of bytes not copied in r3 */
132:	mfctr	r3
120:	blr

	EX_TABLE(30b,108b)
	EX_TABLE(31b,109b)
	EX_TABLE(40b,110b)
	EX_TABLE(41b,111b)
	EX_TABLE(130b,132b)
	EX_TABLE(131b,120b)

EXPORT_SYMBOL(__copy_tofrom_user)