/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#endif

/*
 * Endian-neutral helpers for the unaligned VMX loop: LVS builds the
 * permute control vector from the source address, and VPERM merges two
 * adjacent quadwords through it.  On little endian lvsr is used and the
 * vperm data operands are swapped so the same loop body works for both
 * byte orders.
 */
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

/*
 * The errN macros attach an exception-table entry to the immediately
 * following user access.  Each fixup label unwinds progressively more
 * state:
 *   err1 - no stack frame allocated, no VMX in use
 *   err2 - stack frame allocated, r14-r22 saved
 *   err3 - VMX in use, stack frame allocated
 *   err4 - VMX in use, stack frame allocated, r14-r16 saved
 */
	.macro err1
100:
	EX_TABLE(100b,.Ldo_err1)
	.endm

	.macro err2
200:
	EX_TABLE(200b,.Ldo_err2)
	.endm

#ifdef CONFIG_ALTIVEC
	.macro err3
300:
	EX_TABLE(300b,.Ldo_err3)
	.endm

	.macro err4
400:
	EX_TABLE(400b,.Ldo_err4)
	.endm


/*
 * Fault fixups: restore any saved non-volatile GPRs, leave VMX mode,
 * pop the frame, then retry the whole copy via the base implementation.
 */
.Ldo_err4:
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Ldo_err3:
	bl	exit_vmx_usercopy
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	/*
	 * Reload the original (dest, src, len) arguments saved at entry
	 * and hand the copy to the non-POWER7 fallback, which takes care
	 * of the partial-copy return value.
	 */
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	ld	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	ld	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	b	__copy_tofrom_user_base


/*
 * POWER7-optimised copy to/from user space.
 *
 * In:  r3 = destination, r4 = source, r5 = length in bytes
 * Out: r3 = 0 on a complete copy.  On a fault the original arguments
 *      are restored and control falls back to __copy_tofrom_user_base
 *      (defined elsewhere; presumably it produces the bytes-not-copied
 *      return value - confirm against its implementation).
 *
 * Copies of at least 3328 bytes use VMX when CONFIG_ALTIVEC is set and
 * the CPU feature section is patched in (see cr1 compare below);
 * copies shorter than 16 bytes take .Lshort_copy directly.
 */
_GLOBAL(__copy_tofrom_user_power7)
	cmpldi	r5,16
	cmpldi	cr1,r5,3328

	/* Stash the arguments below the SP for the err1 fixup path. */
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt	cr1,.Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	/* At least one full cacheline: save non-volatiles for the big loop. */
	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	/* r5 = remaining length modulo 128 */
	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

	/* Whole copy done - return 0. */
15:	li	r3,0
	blr

/* enter_vmx_usercopy declined VMX: pop the frame and copy with GPRs. */
.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	enter_vmx_usercopy
	cmpwi	cr1,r3,0	/* cr1 = "VMX not granted" (checked below) */
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions.  We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

	/* setup read stream 0 */
	dcbt	0,r6,0b01000	/* addr from */
	dcbt	0,r7,0b01010	/* length and depth from */
	/* setup write stream 1 */
	dcbtst	0,r9,0b01000	/* addr to */
	dcbtst	0,r10,0b01010	/* length and depth to */
	eieio
	dcbt	0,r8,0b01010	/* all streams GO */

	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
err4;	lvx	v6,r4,r9
err4;	lvx	v5,r4,r10
err4;	lvx	v4,r4,r11
err4;	lvx	v3,r4,r12
err4;	lvx	v2,r4,r14
err4;	lvx	v1,r4,r15
err4;	lvx	v0,r4,r16
	addi	r4,r4,128
err4;	stvx	v7,0,r3
err4;	stvx	v6,r3,r9
err4;	stvx	v5,r3,r10
err4;	stvx	v4,r3,r11
err4;	stvx	v3,r3,r12
err4;	stvx	v2,r3,r14
err4;	stvx	v1,r3,r15
err4;	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */

/*
 * Source and destination differ in their low 4 address bits: stream
 * 16B-aligned loads through VPERM to realign the data before 16B
 * aligned stores.  v0 always carries the previous quadword and v16 the
 * permute control vector built by LVS.
 */
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
err3;	lvx	v0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
err4;	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
err4;	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
err4;	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
err4;	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
err4;	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
err4;	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
err4;	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
err4;	stvx	v8,0,r3
err4;	stvx	v9,r3,r9
err4;	stvx	v10,r3,r10
err4;	stvx	v11,r3,r11
err4;	stvx	v12,r3,r12
err4;	stvx	v13,r3,r14
err4;	stvx	v14,r3,r15
err4;	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */