/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
 */
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/asm-compat.h>
#include <asm/feature-fixups.h>

#ifndef SELFTEST_CASE
/* 0 == most CPUs, 1 == POWER6, 2 == Cell */
#define SELFTEST_CASE	0
#endif

#ifdef __BIG_ENDIAN__
#define sLd sld		/* Shift towards low-numbered address. */
#define sHd srd		/* Shift towards high-numbered address. */
#else
#define sLd srd		/* Shift towards low-numbered address. */
#define sHd sld		/* Shift towards high-numbered address. */
#endif

/*
 * These macros are used to generate exception table entries.
 * The exception handlers below use the original arguments
 * (stored on the stack) and the point where we're up to in
 * the destination buffer, i.e. the address of the first
 * unmodified byte.  Generally r3 points into the destination
 * buffer, but the first unmodified byte is at a variable
 * offset from r3.  In the code below, the symbol r3_offset
 * is set to indicate the current offset at each point in
 * the code.  This offset is then used as a negative offset
 * from the exception handler code, and those instructions
 * before the exception handlers are addi instructions that
 * adjust r3 to point to the correct place.
 */
	.macro	lex		/* exception handler for load */
100:	EX_TABLE(100b, .Lld_exc - r3_offset)
	.endm

	.macro	stex		/* exception handler for store */
100:	EX_TABLE(100b, .Lst_exc - r3_offset)
	.endm
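/*
 * Example: a load emitted with "lex" while r3_offset = 16 gets an
 * exception-table fixup address of .Lld_exc - 16.  A fault there lands
 * 16 bytes before .Lld_exc, on the "adjust by 16" landing pad below, so
 * two addi r3,r3,8 instructions run and r3 again points at the first
 * unmodified destination byte by the time .Lld_exc itself is reached.
 */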

	.align	7
_GLOBAL_TOC(__copy_tofrom_user)
#ifdef CONFIG_PPC_BOOK3S_64
BEGIN_FTR_SECTION
	nop
FTR_SECTION_ELSE
	b	__copy_tofrom_user_power7
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
#endif
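/*
 * __copy_tofrom_user_base: r3 = destination, r4 = source, r5 = byte
 * count; returns the number of bytes NOT copied in r3 (0 on success).
 * The original arguments are saved at -24/-16/-8(r1) so the exception
 * handlers can work out how far the copy got.  On Book3S-64, CPUs with
 * CPU_FTR_VMX_COPY take the __copy_tofrom_user_power7 path above
 * instead of falling through to this code.
 */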
_GLOBAL(__copy_tofrom_user_base)
	/* first check for a 4kB copy on a 4kB boundary */
	cmpldi	cr1,r5,16
	cmpdi	cr6,r5,4096
	or	r0,r3,r4
	neg	r6,r3		/* LS 3 bits = # bytes to 8-byte dest bdry */
	andi.	r0,r0,4095
	std	r3,-24(r1)
	crand	cr0*4+2,cr0*4+2,cr6*4+2
	std	r4,-16(r1)
	std	r5,-8(r1)
	dcbt	0,r4
	beq	.Lcopy_page_4K
	andi.	r6,r6,7
	PPC_MTOCRF(0x01,r5)
	blt	cr1,.Lshort_copy
/* Below we want to nop out the bne if we're on a CPU that has the
 * CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
 * cleared.
 * At the time of writing the only CPU with this combination is POWER6.
 */
test_feature = (SELFTEST_CASE == 1)
BEGIN_FTR_SECTION
	nop
FTR_SECTION_ELSE
	bne	.Ldst_unaligned
ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
		    CPU_FTR_UNALIGNED_LD_STD)
.Ldst_aligned:
	addi	r3,r3,-16
r3_offset = 16
test_feature = (SELFTEST_CASE == 0)
BEGIN_FTR_SECTION
	andi.	r0,r4,7
	bne	.Lsrc_unaligned
END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
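/*
 * Source and destination are both 8-byte aligned here (or the CPU
 * handles unaligned ld/std well).  The loop below copies 32 bytes per
 * iteration, software-pipelined so that the loads run 16 bytes ahead
 * of the corresponding stores.
 */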
	blt	cr1,.Ldo_tail		/* if < 16 bytes to copy */
	srdi	r0,r5,5
	cmpdi	cr1,r0,0
lex;	ld	r7,0(r4)
lex;	ld	r6,8(r4)
	addi	r4,r4,16
	mtctr	r0
	andi.	r0,r5,0x10
	beq	22f
	addi	r3,r3,16
r3_offset = 0
	addi	r4,r4,-16
	mr	r9,r7
	mr	r8,r6
	beq	cr1,72f
21:
lex;	ld	r7,16(r4)
lex;	ld	r6,24(r4)
	addi	r4,r4,32
stex;	std	r9,0(r3)
r3_offset = 8
stex;	std	r8,8(r3)
r3_offset = 16
22:
lex;	ld	r9,0(r4)
lex;	ld	r8,8(r4)
stex;	std	r7,16(r3)
r3_offset = 24
stex;	std	r6,24(r3)
	addi	r3,r3,32
r3_offset = 0
	bdnz	21b
72:
stex;	std	r9,0(r3)
r3_offset = 8
stex;	std	r8,8(r3)
r3_offset = 16
	andi.	r5,r5,0xf
	beq+	3f
	addi	r4,r4,16
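/*
 * Tail: at most 15 bytes remain.  cr7 holds the low four bits of the
 * length (set via PPC_MTOCRF above), so each bf below skips the 8-, 4-,
 * 2- or 1-byte move when the corresponding length bit is clear.
 */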
.Ldo_tail:
	addi	r3,r3,16
r3_offset = 0
	bf	cr7*4+0,246f
lex;	ld	r9,0(r4)
	addi	r4,r4,8
stex;	std	r9,0(r3)
	addi	r3,r3,8
246:	bf	cr7*4+1,1f
lex;	lwz	r9,0(r4)
	addi	r4,r4,4
stex;	stw	r9,0(r3)
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
lex;	lhz	r9,0(r4)
	addi	r4,r4,2
stex;	sth	r9,0(r3)
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
lex;	lbz	r9,0(r4)
stex;	stb	r9,0(r3)
3:	li	r3,0
	blr

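/*
 * Source not 8-byte aligned: round r4 down to an 8-byte boundary and do
 * aligned doubleword loads, merging each pair of adjacent doublewords
 * into one aligned store with the sLd/sHd shifts (r10 = misalignment in
 * bits, r11 = 64 - r10).
 */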
.Lsrc_unaligned:
r3_offset = 16
	srdi	r6,r5,3
	addi	r5,r5,-16
	subf	r4,r0,r4
	srdi	r7,r5,4
	sldi	r10,r0,3
	cmpldi	cr6,r6,3
	andi.	r5,r5,7
	mtctr	r7
	subfic	r11,r10,64
	add	r5,r5,r0
	bt	cr7*4+0,28f

lex;	ld	r9,0(r4)	/* 3+2n loads, 2+2n stores */
lex;	ld	r0,8(r4)
	sLd	r6,r9,r10
lex;	ldu	r9,16(r4)
	sHd	r7,r0,r11
	sLd	r8,r0,r10
	or	r7,r7,r6
	blt	cr6,79f
lex;	ld	r0,8(r4)
	b	2f

28:
lex;	ld	r0,0(r4)	/* 4+2n loads, 3+2n stores */
lex;	ldu	r9,8(r4)
	sLd	r8,r0,r10
	addi	r3,r3,-8
r3_offset = 24
	blt	cr6,5f
lex;	ld	r0,8(r4)
	sHd	r12,r9,r11
	sLd	r6,r9,r10
lex;	ldu	r9,16(r4)
	or	r12,r8,r12
	sHd	r7,r0,r11
	sLd	r8,r0,r10
	addi	r3,r3,16
r3_offset = 8
	beq	cr6,78f

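/*
 * Steady-state loop: each iteration loads two more aligned doublewords,
 * merges them with the previously loaded data, and stores 16 bytes.
 */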
1:	or	r7,r7,r6
lex;	ld	r0,8(r4)
stex;	std	r12,8(r3)
r3_offset = 16
2:	sHd	r12,r9,r11
	sLd	r6,r9,r10
lex;	ldu	r9,16(r4)
	or	r12,r8,r12
stex;	stdu	r7,16(r3)
r3_offset = 8
	sHd	r7,r0,r11
	sLd	r8,r0,r10
	bdnz	1b

78:
stex;	std	r12,8(r3)
r3_offset = 16
	or	r7,r7,r6
79:
stex;	std	r7,16(r3)
r3_offset = 24
5:	sHd	r12,r9,r11
	or	r12,r8,r12
stex;	std	r12,24(r3)
r3_offset = 32
	bne	6f
	li	r3,0
	blr
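/*
 * 1-7 trailing destination bytes remain.  r5 = trailing bytes plus the
 * source misalignment, and decides whether one more source doubleword
 * must be loaded.  The endian-specific rotates below keep the next
 * bytes to be stored in the low end of r9 for the word/half/byte stores.
 */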
6:	cmpwi	cr1,r5,8
	addi	r3,r3,32
r3_offset = 0
	sLd	r9,r9,r10
	ble	cr1,7f
lex;	ld	r0,8(r4)
	sHd	r7,r0,r11
	or	r9,r7,r9
7:
	bf	cr7*4+1,1f
#ifdef __BIG_ENDIAN__
	rotldi	r9,r9,32
#endif
stex;	stw	r9,0(r3)
#ifdef __LITTLE_ENDIAN__
	rotrdi	r9,r9,32
#endif
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
#ifdef __BIG_ENDIAN__
	rotldi	r9,r9,16
#endif
stex;	sth	r9,0(r3)
#ifdef __LITTLE_ENDIAN__
	rotrdi	r9,r9,16
#endif
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
#ifdef __BIG_ENDIAN__
	rotldi	r9,r9,8
#endif
stex;	stb	r9,0(r3)
#ifdef __LITTLE_ENDIAN__
	rotrdi	r9,r9,8
#endif
3:	li	r3,0
	blr

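/*
 * Destination not 8-byte aligned: copy 1, 2 and/or 4 bytes (r6 = bytes
 * to the next 8-byte boundary, low bits in cr7) to align r3, then rejoin
 * the aligned path.  Faults here go through .Lld_exc_r7/.Lst_exc_r7,
 * which add the running offset in r7 to r3 first.
 */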
.Ldst_unaligned:
r3_offset = 0
	PPC_MTOCRF(0x01,r6)		/* put #bytes to 8B bdry into cr7 */
	subf	r5,r6,r5
	li	r7,0
	cmpldi	cr1,r5,16
	bf	cr7*4+3,1f
100:	EX_TABLE(100b, .Lld_exc_r7)
	lbz	r0,0(r4)
100:	EX_TABLE(100b, .Lst_exc_r7)
	stb	r0,0(r3)
	addi	r7,r7,1
1:	bf	cr7*4+2,2f
100:	EX_TABLE(100b, .Lld_exc_r7)
	lhzx	r0,r7,r4
100:	EX_TABLE(100b, .Lst_exc_r7)
	sthx	r0,r7,r3
	addi	r7,r7,2
2:	bf	cr7*4+1,3f
100:	EX_TABLE(100b, .Lld_exc_r7)
	lwzx	r0,r7,r4
100:	EX_TABLE(100b, .Lst_exc_r7)
	stwx	r0,r7,r3
3:	PPC_MTOCRF(0x01,r5)
	add	r4,r6,r4
	add	r3,r6,r3
	b	.Ldst_aligned

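/*
 * Short copy: fewer than 16 bytes in total.  cr7 again holds the low
 * four bits of the length, selecting 8-, 4-, 2- and 1-byte moves.
 */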
.Lshort_copy:
r3_offset = 0
	bf	cr7*4+0,1f
lex;	lwz	r0,0(r4)
lex;	lwz	r9,4(r4)
	addi	r4,r4,8
stex;	stw	r0,0(r3)
stex;	stw	r9,4(r3)
	addi	r3,r3,8
1:	bf	cr7*4+1,2f
lex;	lwz	r0,0(r4)
	addi	r4,r4,4
stex;	stw	r0,0(r3)
	addi	r3,r3,4
2:	bf	cr7*4+2,3f
lex;	lhz	r0,0(r4)
	addi	r4,r4,2
stex;	sth	r0,0(r3)
	addi	r3,r3,2
3:	bf	cr7*4+3,4f
lex;	lbz	r0,0(r4)
stex;	stb	r0,0(r3)
4:	li	r3,0
	blr

/*
 * exception handlers follow
 * we have to return the number of bytes not copied
 * for an exception on a load, we copy as many of the remaining bytes
 * as we can before returning (the destination is not zeroed here)
 * Note that the number of bytes of instructions for adjusting r3 needs
 * to equal the amount of the adjustment, due to the trick of using
 * .Lld_exc - r3_offset as the handler address.
 */

.Lld_exc_r7:
	add	r3,r3,r7
	b	.Lld_exc

	/* adjust by 24 */
	addi	r3,r3,8
	nop
	/* adjust by 16 */
	addi	r3,r3,8
	nop
	/* adjust by 8 */
	addi	r3,r3,8
	nop

/*
 * Here we have had a fault on a load and r3 points to the first
 * unmodified byte of the destination.  We use the original arguments
 * and r3 to work out how much wasn't copied.  Since we load some
 * distance ahead of the stores, we continue copying byte-by-byte until
 * we hit the load fault again in order to copy as much as possible.
 */
.Lld_exc:
	ld	r6,-24(r1)	/* original destination pointer */
	ld	r4,-16(r1)	/* original source pointer */
	ld	r5,-8(r1)	/* original number of bytes */
	subf	r6,r6,r3	/* #bytes already copied */
	add	r4,r4,r6	/* source address of first uncopied byte */
	subf	r5,r6,r5	/* #bytes left to go */

/*
 * first see if we can copy any more bytes before hitting another exception
 */
	mtctr	r5
r3_offset = 0
100:	EX_TABLE(100b, .Ldone)
43:	lbz	r0,0(r4)
	addi	r4,r4,1
stex;	stb	r0,0(r3)
	addi	r3,r3,1
	bdnz	43b
	li	r3,0		/* huh? all copied successfully this time? */
	blr

/*
 * here we have trapped again, amount remaining is in ctr.
 */
.Ldone:
	mfctr	r3
	blr

/*
 * exception handlers for stores: we need to work out how many bytes
 * weren't copied, and we may need to copy some more.
 * Note that the number of bytes of instructions for adjusting r3 needs
 * to equal the amount of the adjustment, due to the trick of using
 * .Lst_exc - r3_offset as the handler address.
 */
.Lst_exc_r7:
	add	r3,r3,r7
	b	.Lst_exc

	/* adjust by 24 */
	addi	r3,r3,8
	nop
	/* adjust by 16 */
	addi	r3,r3,8
	nop
	/* adjust by 8 */
	addi	r3,r3,4
	/* adjust by 4 */
	addi	r3,r3,4
.Lst_exc:
	ld	r6,-24(r1)	/* original destination pointer */
	ld	r4,-16(r1)	/* original source pointer */
	ld	r5,-8(r1)	/* original number of bytes */
	add	r7,r6,r5	/* address one past the end of the destination */
	/*
	 * If the destination pointer isn't 8-byte aligned,
	 * we may have got the exception as a result of a
	 * store that overlapped a page boundary, so we may be
	 * able to copy a few more bytes.
	 */
17:	andi.	r0,r3,7
	beq	19f
	subf	r8,r6,r3	/* #bytes copied */
100:	EX_TABLE(100b,19f)
	lbzx	r0,r8,r4
100:	EX_TABLE(100b,19f)
	stb	r0,0(r3)
	addi	r3,r3,1
	cmpld	r3,r7
	blt	17b
19:	subf	r3,r3,r7	/* #bytes not copied in r3 */
	blr

/*
 * Routine to copy a whole page of data, optimized for POWER4.
 * On POWER4 it is more than 50% faster than the simple loop
 * above (following the .Ldst_aligned label).
 */
	.macro	exc
100:	EX_TABLE(100b, .Labort)
	.endm
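/*
 * The page copy saves r20-r31 below the stack pointer, then streams the
 * 4kB page through six read streams spaced 128 bytes apart, interleaving
 * loads and stores so each pass of the inner loop moves 6 x 24 bytes.
 * Any fault bails out to .Labort, which restores the registers and
 * redoes the whole copy through the standard path above.
 */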
.Lcopy_page_4K:
	std	r31,-32(1)
	std	r30,-40(1)
	std	r29,-48(1)
	std	r28,-56(1)
	std	r27,-64(1)
	std	r26,-72(1)
	std	r25,-80(1)
	std	r24,-88(1)
	std	r23,-96(1)
	std	r22,-104(1)
	std	r21,-112(1)
	std	r20,-120(1)
	li	r5,4096/32 - 1
	addi	r3,r3,-8
	li	r0,5
0:	addi	r5,r5,-24
	mtctr	r0
exc;	ld	r22,640(4)
exc;	ld	r21,512(4)
exc;	ld	r20,384(4)
exc;	ld	r11,256(4)
exc;	ld	r9,128(4)
exc;	ld	r7,0(4)
exc;	ld	r25,648(4)
exc;	ld	r24,520(4)
exc;	ld	r23,392(4)
exc;	ld	r10,264(4)
exc;	ld	r8,136(4)
exc;	ldu	r6,8(4)
	cmpwi	r5,24
1:
exc;	std	r22,648(3)
exc;	std	r21,520(3)
exc;	std	r20,392(3)
exc;	std	r11,264(3)
exc;	std	r9,136(3)
exc;	std	r7,8(3)
exc;	ld	r28,648(4)
exc;	ld	r27,520(4)
exc;	ld	r26,392(4)
exc;	ld	r31,264(4)
exc;	ld	r30,136(4)
exc;	ld	r29,8(4)
exc;	std	r25,656(3)
exc;	std	r24,528(3)
exc;	std	r23,400(3)
exc;	std	r10,272(3)
exc;	std	r8,144(3)
exc;	std	r6,16(3)
exc;	ld	r22,656(4)
exc;	ld	r21,528(4)
exc;	ld	r20,400(4)
exc;	ld	r11,272(4)
exc;	ld	r9,144(4)
exc;	ld	r7,16(4)
exc;	std	r28,664(3)
exc;	std	r27,536(3)
exc;	std	r26,408(3)
exc;	std	r31,280(3)
exc;	std	r30,152(3)
exc;	stdu	r29,24(3)
exc;	ld	r25,664(4)
exc;	ld	r24,536(4)
exc;	ld	r23,408(4)
exc;	ld	r10,280(4)
exc;	ld	r8,152(4)
exc;	ldu	r6,24(4)
	bdnz	1b
exc;	std	r22,648(3)
exc;	std	r21,520(3)
exc;	std	r20,392(3)
exc;	std	r11,264(3)
exc;	std	r9,136(3)
exc;	std	r7,8(3)
	addi	r4,r4,640
	addi	r3,r3,648
	bge	0b
	mtctr	r5
exc;	ld	r7,0(4)
exc;	ld	r8,8(4)
exc;	ldu	r9,16(4)
3:
exc;	ld	r10,8(4)
exc;	std	r7,8(3)
exc;	ld	r7,16(4)
exc;	std	r8,16(3)
exc;	ld	r8,24(4)
exc;	std	r9,24(3)
exc;	ldu	r9,32(4)
exc;	stdu	r10,32(3)
	bdnz	3b
4:
exc;	ld	r10,8(4)
exc;	std	r7,8(3)
exc;	std	r8,16(3)
exc;	std	r9,24(3)
exc;	std	r10,32(3)
9:	ld	r20,-120(1)
	ld	r21,-112(1)
	ld	r22,-104(1)
	ld	r23,-96(1)
	ld	r24,-88(1)
	ld	r25,-80(1)
	ld	r26,-72(1)
	ld	r27,-64(1)
	ld	r28,-56(1)
	ld	r29,-48(1)
	ld	r30,-40(1)
	ld	r31,-32(1)
	li	r3,0
	blr

/*
 * on an exception, reset to the beginning and jump back into the
 * standard __copy_tofrom_user
 */
.Labort:
	ld	r20,-120(1)
	ld	r21,-112(1)
	ld	r22,-104(1)
	ld	r23,-96(1)
	ld	r24,-88(1)
	ld	r25,-80(1)
	ld	r26,-72(1)
	ld	r27,-64(1)
	ld	r28,-56(1)
	ld	r29,-48(1)
	ld	r30,-40(1)
	ld	r31,-32(1)
	ld	r3,-24(r1)
	ld	r4,-16(r1)
	li	r5,4096
	b	.Ldst_aligned
EXPORT_SYMBOL(__copy_tofrom_user)