/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_X86_XOR_H
#define _ASM_X86_XOR_H

/*
 * Optimized RAID-5 checksumming functions for SSE.
 */

/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyway.
 */

#include <asm/fpu/api.h>

#ifdef CONFIG_X86_32
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif

#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])	;\n"
#define LD(x, y)	"	movaps "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
#define ST(x, y)	"	movaps %%xmm"#y", "OFFS(x)"(%[p1])	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])	;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])	;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])	;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])	;\n"
#define XO1(x, y)	"	xorps "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
#define XO2(x, y)	"	xorps "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
#define XO3(x, y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
#define XO4(x, y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
#define NOP(x)

#define BLK64(pf, op, i)	\
	pf(i)			\
	op(i, 0)		\
	op(i + 1, 1)		\
	op(i + 2, 2)		\
	op(i + 3, 3)

static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)

	PF0(0)
	PF0(2)

	"	.align 32	;\n"
	"1:			;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
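/*
 * Illustrative sketch (not compiled): the scalar operation that
 * xor_sse_2() above implements.  p1 is both a source and the
 * destination, and bytes must be a non-zero multiple of 256, since
 * each pass of the SSE loop handles 256 bytes (4 BLOCKs of 4 16-byte
 * movaps each, hence lines = bytes >> 8).  The buffers must also be
 * 16-byte aligned, as movaps requires.  The name scalar_xor_2 is
 * hypothetical and not part of this header.
 */
#if 0
static void scalar_xor_2(unsigned long bytes, unsigned long *p1,
			 unsigned long *p2)
{
	unsigned long i;

	for (i = 0; i < bytes / sizeof(*p1); i++)
		p1[i] ^= p2[i];		/* accumulate the XOR into p1 */
}
#endif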
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	BLK64(PF0, LD, i)	\
	BLK64(PF1, XO1, i)	\
	BLK64(NOP, ST, i)

	"	.align 32	;\n"
	"1:			;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)

	PF0(0)
	PF0(2)

	"	.align 32	;\n"
	"1:			;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	BLK64(PF0, LD, i)	\
	BLK64(PF1, XO1, i)	\
	BLK64(PF2, XO2, i)	\
	BLK64(NOP, ST, i)

	"	.align 32	;\n"
	"1:			;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	PF3(i)			\
	PF3(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	XO3(i, 0)		\
	XO3(i + 1, 1)		\
	XO3(i + 2, 2)		\
	XO3(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)

	PF0(0)
	PF0(2)

	"	.align 32	;\n"
	"1:			;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	BLK64(PF0, LD, i)	\
	BLK64(PF1, XO1, i)	\
	BLK64(PF2, XO2, i)	\
	BLK64(PF3, XO3, i)	\
	BLK64(NOP, ST, i)

	"	.align 32	;\n"
	"1:			;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
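/*
 * For reference, this is roughly what one BLK64() group in the _pf64
 * variants above expands to: a single prefetchnta 256 bytes ahead,
 * followed by four 16-byte operations covering one 64-byte cache line
 * (hence the "prefetch64-sse" template name).  BLK64(PF0, LD, 0), for
 * example, becomes:
 *
 *	prefetchnta 256+16*(0)(%[p1])
 *	movaps 16*(0)(%[p1]), %xmm0
 *	movaps 16*(0 + 1)(%[p1]), %xmm1
 *	movaps 16*(0 + 2)(%[p1]), %xmm2
 *	movaps 16*(0 + 3)(%[p1]), %xmm3
 *
 * whereas the plain xor_sse_*() loops issue two prefetches per 64
 * bytes of each source, 32 bytes apart.
 */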
static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	PF3(i)			\
	PF3(i + 2)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	PF4(i)			\
	PF4(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO3(i, 0)		\
	XO3(i + 1, 1)		\
	XO3(i + 2, 2)		\
	XO3(i + 3, 3)		\
	XO4(i, 0)		\
	XO4(i + 1, 1)		\
	XO4(i + 2, 2)		\
	XO4(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)

	PF0(0)
	PF0(2)

	"	.align 32	;\n"
	"1:			;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	add %[inc], %[p5]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	BLK64(PF0, LD, i)	\
	BLK64(PF1, XO1, i)	\
	BLK64(PF2, XO2, i)	\
	BLK64(PF3, XO3, i)	\
	BLK64(PF4, XO4, i)	\
	BLK64(NOP, ST, i)

	"	.align 32	;\n"
	"1:			;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	add %[inc], %[p5]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static struct xor_block_template xor_block_sse_pf64 = {
	.name = "prefetch64-sse",
	.do_2 = xor_sse_2_pf64,
	.do_3 = xor_sse_3_pf64,
	.do_4 = xor_sse_4_pf64,
	.do_5 = xor_sse_5_pf64,
};

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif

#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(FASTEST)

#endif /* _ASM_X86_XOR_H */
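/*
 * Illustrative sketch (not compiled): how a template such as
 * xor_block_sse_pf64 is typically driven.  The generic XOR code
 * (crypto/xor.c) benchmarks the available templates and uses
 * XOR_SELECT_TEMPLATE() to pick the final one; a direct call looks
 * like the hypothetical helper below.  kernel_fpu_begin()/end() is
 * handled inside the xor_sse_*() functions themselves, so callers
 * only need 16-byte-aligned buffers whose length is a non-zero
 * multiple of 256 bytes.
 */
#if 0
static void example_xor_one_page(unsigned long *dst, unsigned long *src)
{
	/* dst ^= src over one 4096-byte page (4096 = 16 * 256) */
	xor_block_sse_pf64.do_2(4096, dst, src);
}
#endif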