18c2ecf20Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-or-later */ 28c2ecf20Sopenharmony_ci#ifndef _ASM_X86_XOR_32_H 38c2ecf20Sopenharmony_ci#define _ASM_X86_XOR_32_H 48c2ecf20Sopenharmony_ci 58c2ecf20Sopenharmony_ci/* 68c2ecf20Sopenharmony_ci * Optimized RAID-5 checksumming functions for MMX. 78c2ecf20Sopenharmony_ci */ 88c2ecf20Sopenharmony_ci 98c2ecf20Sopenharmony_ci/* 108c2ecf20Sopenharmony_ci * High-speed RAID5 checksumming functions utilizing MMX instructions. 118c2ecf20Sopenharmony_ci * Copyright (C) 1998 Ingo Molnar. 128c2ecf20Sopenharmony_ci */ 138c2ecf20Sopenharmony_ci 148c2ecf20Sopenharmony_ci#define LD(x, y) " movq 8*("#x")(%1), %%mm"#y" ;\n" 158c2ecf20Sopenharmony_ci#define ST(x, y) " movq %%mm"#y", 8*("#x")(%1) ;\n" 168c2ecf20Sopenharmony_ci#define XO1(x, y) " pxor 8*("#x")(%2), %%mm"#y" ;\n" 178c2ecf20Sopenharmony_ci#define XO2(x, y) " pxor 8*("#x")(%3), %%mm"#y" ;\n" 188c2ecf20Sopenharmony_ci#define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n" 198c2ecf20Sopenharmony_ci#define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n" 208c2ecf20Sopenharmony_ci 218c2ecf20Sopenharmony_ci#include <asm/fpu/api.h> 228c2ecf20Sopenharmony_ci 238c2ecf20Sopenharmony_cistatic void 248c2ecf20Sopenharmony_cixor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) 258c2ecf20Sopenharmony_ci{ 268c2ecf20Sopenharmony_ci unsigned long lines = bytes >> 7; 278c2ecf20Sopenharmony_ci 288c2ecf20Sopenharmony_ci kernel_fpu_begin(); 298c2ecf20Sopenharmony_ci 308c2ecf20Sopenharmony_ci asm volatile( 318c2ecf20Sopenharmony_ci#undef BLOCK 328c2ecf20Sopenharmony_ci#define BLOCK(i) \ 338c2ecf20Sopenharmony_ci LD(i, 0) \ 348c2ecf20Sopenharmony_ci LD(i + 1, 1) \ 358c2ecf20Sopenharmony_ci LD(i + 2, 2) \ 368c2ecf20Sopenharmony_ci LD(i + 3, 3) \ 378c2ecf20Sopenharmony_ci XO1(i, 0) \ 388c2ecf20Sopenharmony_ci ST(i, 0) \ 398c2ecf20Sopenharmony_ci XO1(i+1, 1) \ 408c2ecf20Sopenharmony_ci ST(i+1, 1) \ 418c2ecf20Sopenharmony_ci XO1(i + 2, 2) \ 428c2ecf20Sopenharmony_ci ST(i + 2, 2) \ 438c2ecf20Sopenharmony_ci XO1(i + 3, 3) \ 448c2ecf20Sopenharmony_ci ST(i + 3, 3) 458c2ecf20Sopenharmony_ci 468c2ecf20Sopenharmony_ci " .align 32 ;\n" 478c2ecf20Sopenharmony_ci " 1: ;\n" 488c2ecf20Sopenharmony_ci 498c2ecf20Sopenharmony_ci BLOCK(0) 508c2ecf20Sopenharmony_ci BLOCK(4) 518c2ecf20Sopenharmony_ci BLOCK(8) 528c2ecf20Sopenharmony_ci BLOCK(12) 538c2ecf20Sopenharmony_ci 548c2ecf20Sopenharmony_ci " addl $128, %1 ;\n" 558c2ecf20Sopenharmony_ci " addl $128, %2 ;\n" 568c2ecf20Sopenharmony_ci " decl %0 ;\n" 578c2ecf20Sopenharmony_ci " jnz 1b ;\n" 588c2ecf20Sopenharmony_ci : "+r" (lines), 598c2ecf20Sopenharmony_ci "+r" (p1), "+r" (p2) 608c2ecf20Sopenharmony_ci : 618c2ecf20Sopenharmony_ci : "memory"); 628c2ecf20Sopenharmony_ci 638c2ecf20Sopenharmony_ci kernel_fpu_end(); 648c2ecf20Sopenharmony_ci} 658c2ecf20Sopenharmony_ci 668c2ecf20Sopenharmony_cistatic void 678c2ecf20Sopenharmony_cixor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, 688c2ecf20Sopenharmony_ci unsigned long *p3) 698c2ecf20Sopenharmony_ci{ 708c2ecf20Sopenharmony_ci unsigned long lines = bytes >> 7; 718c2ecf20Sopenharmony_ci 728c2ecf20Sopenharmony_ci kernel_fpu_begin(); 738c2ecf20Sopenharmony_ci 748c2ecf20Sopenharmony_ci asm volatile( 758c2ecf20Sopenharmony_ci#undef BLOCK 768c2ecf20Sopenharmony_ci#define BLOCK(i) \ 778c2ecf20Sopenharmony_ci LD(i, 0) \ 788c2ecf20Sopenharmony_ci LD(i + 1, 1) \ 798c2ecf20Sopenharmony_ci LD(i + 2, 2) \ 808c2ecf20Sopenharmony_ci LD(i + 3, 3) \ 818c2ecf20Sopenharmony_ci XO1(i, 0) \ 828c2ecf20Sopenharmony_ci XO1(i + 1, 1) \ 838c2ecf20Sopenharmony_ci XO1(i + 2, 2) \ 848c2ecf20Sopenharmony_ci XO1(i + 3, 3) \ 858c2ecf20Sopenharmony_ci XO2(i, 0) \ 868c2ecf20Sopenharmony_ci ST(i, 0) \ 878c2ecf20Sopenharmony_ci XO2(i + 1, 1) \ 888c2ecf20Sopenharmony_ci ST(i + 1, 1) \ 898c2ecf20Sopenharmony_ci XO2(i + 2, 2) \ 908c2ecf20Sopenharmony_ci ST(i + 2, 2) \ 918c2ecf20Sopenharmony_ci XO2(i + 3, 3) \ 928c2ecf20Sopenharmony_ci ST(i + 3, 3) 938c2ecf20Sopenharmony_ci 948c2ecf20Sopenharmony_ci " .align 32 ;\n" 958c2ecf20Sopenharmony_ci " 1: ;\n" 968c2ecf20Sopenharmony_ci 978c2ecf20Sopenharmony_ci BLOCK(0) 988c2ecf20Sopenharmony_ci BLOCK(4) 998c2ecf20Sopenharmony_ci BLOCK(8) 1008c2ecf20Sopenharmony_ci BLOCK(12) 1018c2ecf20Sopenharmony_ci 1028c2ecf20Sopenharmony_ci " addl $128, %1 ;\n" 1038c2ecf20Sopenharmony_ci " addl $128, %2 ;\n" 1048c2ecf20Sopenharmony_ci " addl $128, %3 ;\n" 1058c2ecf20Sopenharmony_ci " decl %0 ;\n" 1068c2ecf20Sopenharmony_ci " jnz 1b ;\n" 1078c2ecf20Sopenharmony_ci : "+r" (lines), 1088c2ecf20Sopenharmony_ci "+r" (p1), "+r" (p2), "+r" (p3) 1098c2ecf20Sopenharmony_ci : 1108c2ecf20Sopenharmony_ci : "memory"); 1118c2ecf20Sopenharmony_ci 1128c2ecf20Sopenharmony_ci kernel_fpu_end(); 1138c2ecf20Sopenharmony_ci} 1148c2ecf20Sopenharmony_ci 1158c2ecf20Sopenharmony_cistatic void 1168c2ecf20Sopenharmony_cixor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, 1178c2ecf20Sopenharmony_ci unsigned long *p3, unsigned long *p4) 1188c2ecf20Sopenharmony_ci{ 1198c2ecf20Sopenharmony_ci unsigned long lines = bytes >> 7; 1208c2ecf20Sopenharmony_ci 1218c2ecf20Sopenharmony_ci kernel_fpu_begin(); 1228c2ecf20Sopenharmony_ci 1238c2ecf20Sopenharmony_ci asm volatile( 1248c2ecf20Sopenharmony_ci#undef BLOCK 1258c2ecf20Sopenharmony_ci#define BLOCK(i) \ 1268c2ecf20Sopenharmony_ci LD(i, 0) \ 1278c2ecf20Sopenharmony_ci LD(i + 1, 1) \ 1288c2ecf20Sopenharmony_ci LD(i + 2, 2) \ 1298c2ecf20Sopenharmony_ci LD(i + 3, 3) \ 1308c2ecf20Sopenharmony_ci XO1(i, 0) \ 1318c2ecf20Sopenharmony_ci XO1(i + 1, 1) \ 1328c2ecf20Sopenharmony_ci XO1(i + 2, 2) \ 1338c2ecf20Sopenharmony_ci XO1(i + 3, 3) \ 1348c2ecf20Sopenharmony_ci XO2(i, 0) \ 1358c2ecf20Sopenharmony_ci XO2(i + 1, 1) \ 1368c2ecf20Sopenharmony_ci XO2(i + 2, 2) \ 1378c2ecf20Sopenharmony_ci XO2(i + 3, 3) \ 1388c2ecf20Sopenharmony_ci XO3(i, 0) \ 1398c2ecf20Sopenharmony_ci ST(i, 0) \ 1408c2ecf20Sopenharmony_ci XO3(i + 1, 1) \ 1418c2ecf20Sopenharmony_ci ST(i + 1, 1) \ 1428c2ecf20Sopenharmony_ci XO3(i + 2, 2) \ 1438c2ecf20Sopenharmony_ci ST(i + 2, 2) \ 1448c2ecf20Sopenharmony_ci XO3(i + 3, 3) \ 1458c2ecf20Sopenharmony_ci ST(i + 3, 3) 1468c2ecf20Sopenharmony_ci 1478c2ecf20Sopenharmony_ci " .align 32 ;\n" 1488c2ecf20Sopenharmony_ci " 1: ;\n" 1498c2ecf20Sopenharmony_ci 1508c2ecf20Sopenharmony_ci BLOCK(0) 1518c2ecf20Sopenharmony_ci BLOCK(4) 1528c2ecf20Sopenharmony_ci BLOCK(8) 1538c2ecf20Sopenharmony_ci BLOCK(12) 1548c2ecf20Sopenharmony_ci 1558c2ecf20Sopenharmony_ci " addl $128, %1 ;\n" 1568c2ecf20Sopenharmony_ci " addl $128, %2 ;\n" 1578c2ecf20Sopenharmony_ci " addl $128, %3 ;\n" 1588c2ecf20Sopenharmony_ci " addl $128, %4 ;\n" 1598c2ecf20Sopenharmony_ci " decl %0 ;\n" 1608c2ecf20Sopenharmony_ci " jnz 1b ;\n" 1618c2ecf20Sopenharmony_ci : "+r" (lines), 1628c2ecf20Sopenharmony_ci "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) 1638c2ecf20Sopenharmony_ci : 1648c2ecf20Sopenharmony_ci : "memory"); 1658c2ecf20Sopenharmony_ci 1668c2ecf20Sopenharmony_ci kernel_fpu_end(); 1678c2ecf20Sopenharmony_ci} 1688c2ecf20Sopenharmony_ci 1698c2ecf20Sopenharmony_ci 1708c2ecf20Sopenharmony_cistatic void 1718c2ecf20Sopenharmony_cixor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, 1728c2ecf20Sopenharmony_ci unsigned long *p3, unsigned long *p4, unsigned long *p5) 1738c2ecf20Sopenharmony_ci{ 1748c2ecf20Sopenharmony_ci unsigned long lines = bytes >> 7; 1758c2ecf20Sopenharmony_ci 1768c2ecf20Sopenharmony_ci kernel_fpu_begin(); 1778c2ecf20Sopenharmony_ci 1788c2ecf20Sopenharmony_ci /* Make sure GCC forgets anything it knows about p4 or p5, 1798c2ecf20Sopenharmony_ci such that it won't pass to the asm volatile below a 1808c2ecf20Sopenharmony_ci register that is shared with any other variable. That's 1818c2ecf20Sopenharmony_ci because we modify p4 and p5 there, but we can't mark them 1828c2ecf20Sopenharmony_ci as read/write, otherwise we'd overflow the 10-asm-operands 1838c2ecf20Sopenharmony_ci limit of GCC < 3.1. */ 1848c2ecf20Sopenharmony_ci asm("" : "+r" (p4), "+r" (p5)); 1858c2ecf20Sopenharmony_ci 1868c2ecf20Sopenharmony_ci asm volatile( 1878c2ecf20Sopenharmony_ci#undef BLOCK 1888c2ecf20Sopenharmony_ci#define BLOCK(i) \ 1898c2ecf20Sopenharmony_ci LD(i, 0) \ 1908c2ecf20Sopenharmony_ci LD(i + 1, 1) \ 1918c2ecf20Sopenharmony_ci LD(i + 2, 2) \ 1928c2ecf20Sopenharmony_ci LD(i + 3, 3) \ 1938c2ecf20Sopenharmony_ci XO1(i, 0) \ 1948c2ecf20Sopenharmony_ci XO1(i + 1, 1) \ 1958c2ecf20Sopenharmony_ci XO1(i + 2, 2) \ 1968c2ecf20Sopenharmony_ci XO1(i + 3, 3) \ 1978c2ecf20Sopenharmony_ci XO2(i, 0) \ 1988c2ecf20Sopenharmony_ci XO2(i + 1, 1) \ 1998c2ecf20Sopenharmony_ci XO2(i + 2, 2) \ 2008c2ecf20Sopenharmony_ci XO2(i + 3, 3) \ 2018c2ecf20Sopenharmony_ci XO3(i, 0) \ 2028c2ecf20Sopenharmony_ci XO3(i + 1, 1) \ 2038c2ecf20Sopenharmony_ci XO3(i + 2, 2) \ 2048c2ecf20Sopenharmony_ci XO3(i + 3, 3) \ 2058c2ecf20Sopenharmony_ci XO4(i, 0) \ 2068c2ecf20Sopenharmony_ci ST(i, 0) \ 2078c2ecf20Sopenharmony_ci XO4(i + 1, 1) \ 2088c2ecf20Sopenharmony_ci ST(i + 1, 1) \ 2098c2ecf20Sopenharmony_ci XO4(i + 2, 2) \ 2108c2ecf20Sopenharmony_ci ST(i + 2, 2) \ 2118c2ecf20Sopenharmony_ci XO4(i + 3, 3) \ 2128c2ecf20Sopenharmony_ci ST(i + 3, 3) 2138c2ecf20Sopenharmony_ci 2148c2ecf20Sopenharmony_ci " .align 32 ;\n" 2158c2ecf20Sopenharmony_ci " 1: ;\n" 2168c2ecf20Sopenharmony_ci 2178c2ecf20Sopenharmony_ci BLOCK(0) 2188c2ecf20Sopenharmony_ci BLOCK(4) 2198c2ecf20Sopenharmony_ci BLOCK(8) 2208c2ecf20Sopenharmony_ci BLOCK(12) 2218c2ecf20Sopenharmony_ci 2228c2ecf20Sopenharmony_ci " addl $128, %1 ;\n" 2238c2ecf20Sopenharmony_ci " addl $128, %2 ;\n" 2248c2ecf20Sopenharmony_ci " addl $128, %3 ;\n" 2258c2ecf20Sopenharmony_ci " addl $128, %4 ;\n" 2268c2ecf20Sopenharmony_ci " addl $128, %5 ;\n" 2278c2ecf20Sopenharmony_ci " decl %0 ;\n" 2288c2ecf20Sopenharmony_ci " jnz 1b ;\n" 2298c2ecf20Sopenharmony_ci : "+r" (lines), 2308c2ecf20Sopenharmony_ci "+r" (p1), "+r" (p2), "+r" (p3) 2318c2ecf20Sopenharmony_ci : "r" (p4), "r" (p5) 2328c2ecf20Sopenharmony_ci : "memory"); 2338c2ecf20Sopenharmony_ci 2348c2ecf20Sopenharmony_ci /* p4 and p5 were modified, and now the variables are dead. 2358c2ecf20Sopenharmony_ci Clobber them just to be sure nobody does something stupid 2368c2ecf20Sopenharmony_ci like assuming they have some legal value. */ 2378c2ecf20Sopenharmony_ci asm("" : "=r" (p4), "=r" (p5)); 2388c2ecf20Sopenharmony_ci 2398c2ecf20Sopenharmony_ci kernel_fpu_end(); 2408c2ecf20Sopenharmony_ci} 2418c2ecf20Sopenharmony_ci 2428c2ecf20Sopenharmony_ci#undef LD 2438c2ecf20Sopenharmony_ci#undef XO1 2448c2ecf20Sopenharmony_ci#undef XO2 2458c2ecf20Sopenharmony_ci#undef XO3 2468c2ecf20Sopenharmony_ci#undef XO4 2478c2ecf20Sopenharmony_ci#undef ST 2488c2ecf20Sopenharmony_ci#undef BLOCK 2498c2ecf20Sopenharmony_ci 2508c2ecf20Sopenharmony_cistatic void 2518c2ecf20Sopenharmony_cixor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) 2528c2ecf20Sopenharmony_ci{ 2538c2ecf20Sopenharmony_ci unsigned long lines = bytes >> 6; 2548c2ecf20Sopenharmony_ci 2558c2ecf20Sopenharmony_ci kernel_fpu_begin(); 2568c2ecf20Sopenharmony_ci 2578c2ecf20Sopenharmony_ci asm volatile( 2588c2ecf20Sopenharmony_ci " .align 32 ;\n" 2598c2ecf20Sopenharmony_ci " 1: ;\n" 2608c2ecf20Sopenharmony_ci " movq (%1), %%mm0 ;\n" 2618c2ecf20Sopenharmony_ci " movq 8(%1), %%mm1 ;\n" 2628c2ecf20Sopenharmony_ci " pxor (%2), %%mm0 ;\n" 2638c2ecf20Sopenharmony_ci " movq 16(%1), %%mm2 ;\n" 2648c2ecf20Sopenharmony_ci " movq %%mm0, (%1) ;\n" 2658c2ecf20Sopenharmony_ci " pxor 8(%2), %%mm1 ;\n" 2668c2ecf20Sopenharmony_ci " movq 24(%1), %%mm3 ;\n" 2678c2ecf20Sopenharmony_ci " movq %%mm1, 8(%1) ;\n" 2688c2ecf20Sopenharmony_ci " pxor 16(%2), %%mm2 ;\n" 2698c2ecf20Sopenharmony_ci " movq 32(%1), %%mm4 ;\n" 2708c2ecf20Sopenharmony_ci " movq %%mm2, 16(%1) ;\n" 2718c2ecf20Sopenharmony_ci " pxor 24(%2), %%mm3 ;\n" 2728c2ecf20Sopenharmony_ci " movq 40(%1), %%mm5 ;\n" 2738c2ecf20Sopenharmony_ci " movq %%mm3, 24(%1) ;\n" 2748c2ecf20Sopenharmony_ci " pxor 32(%2), %%mm4 ;\n" 2758c2ecf20Sopenharmony_ci " movq 48(%1), %%mm6 ;\n" 2768c2ecf20Sopenharmony_ci " movq %%mm4, 32(%1) ;\n" 2778c2ecf20Sopenharmony_ci " pxor 40(%2), %%mm5 ;\n" 2788c2ecf20Sopenharmony_ci " movq 56(%1), %%mm7 ;\n" 2798c2ecf20Sopenharmony_ci " movq %%mm5, 40(%1) ;\n" 2808c2ecf20Sopenharmony_ci " pxor 48(%2), %%mm6 ;\n" 2818c2ecf20Sopenharmony_ci " pxor 56(%2), %%mm7 ;\n" 2828c2ecf20Sopenharmony_ci " movq %%mm6, 48(%1) ;\n" 2838c2ecf20Sopenharmony_ci " movq %%mm7, 56(%1) ;\n" 2848c2ecf20Sopenharmony_ci 2858c2ecf20Sopenharmony_ci " addl $64, %1 ;\n" 2868c2ecf20Sopenharmony_ci " addl $64, %2 ;\n" 2878c2ecf20Sopenharmony_ci " decl %0 ;\n" 2888c2ecf20Sopenharmony_ci " jnz 1b ;\n" 2898c2ecf20Sopenharmony_ci : "+r" (lines), 2908c2ecf20Sopenharmony_ci "+r" (p1), "+r" (p2) 2918c2ecf20Sopenharmony_ci : 2928c2ecf20Sopenharmony_ci : "memory"); 2938c2ecf20Sopenharmony_ci 2948c2ecf20Sopenharmony_ci kernel_fpu_end(); 2958c2ecf20Sopenharmony_ci} 2968c2ecf20Sopenharmony_ci 2978c2ecf20Sopenharmony_cistatic void 2988c2ecf20Sopenharmony_cixor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, 2998c2ecf20Sopenharmony_ci unsigned long *p3) 3008c2ecf20Sopenharmony_ci{ 3018c2ecf20Sopenharmony_ci unsigned long lines = bytes >> 6; 3028c2ecf20Sopenharmony_ci 3038c2ecf20Sopenharmony_ci kernel_fpu_begin(); 3048c2ecf20Sopenharmony_ci 3058c2ecf20Sopenharmony_ci asm volatile( 3068c2ecf20Sopenharmony_ci " .align 32,0x90 ;\n" 3078c2ecf20Sopenharmony_ci " 1: ;\n" 3088c2ecf20Sopenharmony_ci " movq (%1), %%mm0 ;\n" 3098c2ecf20Sopenharmony_ci " movq 8(%1), %%mm1 ;\n" 3108c2ecf20Sopenharmony_ci " pxor (%2), %%mm0 ;\n" 3118c2ecf20Sopenharmony_ci " movq 16(%1), %%mm2 ;\n" 3128c2ecf20Sopenharmony_ci " pxor 8(%2), %%mm1 ;\n" 3138c2ecf20Sopenharmony_ci " pxor (%3), %%mm0 ;\n" 3148c2ecf20Sopenharmony_ci " pxor 16(%2), %%mm2 ;\n" 3158c2ecf20Sopenharmony_ci " movq %%mm0, (%1) ;\n" 3168c2ecf20Sopenharmony_ci " pxor 8(%3), %%mm1 ;\n" 3178c2ecf20Sopenharmony_ci " pxor 16(%3), %%mm2 ;\n" 3188c2ecf20Sopenharmony_ci " movq 24(%1), %%mm3 ;\n" 3198c2ecf20Sopenharmony_ci " movq %%mm1, 8(%1) ;\n" 3208c2ecf20Sopenharmony_ci " movq 32(%1), %%mm4 ;\n" 3218c2ecf20Sopenharmony_ci " movq 40(%1), %%mm5 ;\n" 3228c2ecf20Sopenharmony_ci " pxor 24(%2), %%mm3 ;\n" 3238c2ecf20Sopenharmony_ci " movq %%mm2, 16(%1) ;\n" 3248c2ecf20Sopenharmony_ci " pxor 32(%2), %%mm4 ;\n" 3258c2ecf20Sopenharmony_ci " pxor 24(%3), %%mm3 ;\n" 3268c2ecf20Sopenharmony_ci " pxor 40(%2), %%mm5 ;\n" 3278c2ecf20Sopenharmony_ci " movq %%mm3, 24(%1) ;\n" 3288c2ecf20Sopenharmony_ci " pxor 32(%3), %%mm4 ;\n" 3298c2ecf20Sopenharmony_ci " pxor 40(%3), %%mm5 ;\n" 3308c2ecf20Sopenharmony_ci " movq 48(%1), %%mm6 ;\n" 3318c2ecf20Sopenharmony_ci " movq %%mm4, 32(%1) ;\n" 3328c2ecf20Sopenharmony_ci " movq 56(%1), %%mm7 ;\n" 3338c2ecf20Sopenharmony_ci " pxor 48(%2), %%mm6 ;\n" 3348c2ecf20Sopenharmony_ci " movq %%mm5, 40(%1) ;\n" 3358c2ecf20Sopenharmony_ci " pxor 56(%2), %%mm7 ;\n" 3368c2ecf20Sopenharmony_ci " pxor 48(%3), %%mm6 ;\n" 3378c2ecf20Sopenharmony_ci " pxor 56(%3), %%mm7 ;\n" 3388c2ecf20Sopenharmony_ci " movq %%mm6, 48(%1) ;\n" 3398c2ecf20Sopenharmony_ci " movq %%mm7, 56(%1) ;\n" 3408c2ecf20Sopenharmony_ci 3418c2ecf20Sopenharmony_ci " addl $64, %1 ;\n" 3428c2ecf20Sopenharmony_ci " addl $64, %2 ;\n" 3438c2ecf20Sopenharmony_ci " addl $64, %3 ;\n" 3448c2ecf20Sopenharmony_ci " decl %0 ;\n" 3458c2ecf20Sopenharmony_ci " jnz 1b ;\n" 3468c2ecf20Sopenharmony_ci : "+r" (lines), 3478c2ecf20Sopenharmony_ci "+r" (p1), "+r" (p2), "+r" (p3) 3488c2ecf20Sopenharmony_ci : 3498c2ecf20Sopenharmony_ci : "memory" ); 3508c2ecf20Sopenharmony_ci 3518c2ecf20Sopenharmony_ci kernel_fpu_end(); 3528c2ecf20Sopenharmony_ci} 3538c2ecf20Sopenharmony_ci 3548c2ecf20Sopenharmony_cistatic void 3558c2ecf20Sopenharmony_cixor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, 3568c2ecf20Sopenharmony_ci unsigned long *p3, unsigned long *p4) 3578c2ecf20Sopenharmony_ci{ 3588c2ecf20Sopenharmony_ci unsigned long lines = bytes >> 6; 3598c2ecf20Sopenharmony_ci 3608c2ecf20Sopenharmony_ci kernel_fpu_begin(); 3618c2ecf20Sopenharmony_ci 3628c2ecf20Sopenharmony_ci asm volatile( 3638c2ecf20Sopenharmony_ci " .align 32,0x90 ;\n" 3648c2ecf20Sopenharmony_ci " 1: ;\n" 3658c2ecf20Sopenharmony_ci " movq (%1), %%mm0 ;\n" 3668c2ecf20Sopenharmony_ci " movq 8(%1), %%mm1 ;\n" 3678c2ecf20Sopenharmony_ci " pxor (%2), %%mm0 ;\n" 3688c2ecf20Sopenharmony_ci " movq 16(%1), %%mm2 ;\n" 3698c2ecf20Sopenharmony_ci " pxor 8(%2), %%mm1 ;\n" 3708c2ecf20Sopenharmony_ci " pxor (%3), %%mm0 ;\n" 3718c2ecf20Sopenharmony_ci " pxor 16(%2), %%mm2 ;\n" 3728c2ecf20Sopenharmony_ci " pxor 8(%3), %%mm1 ;\n" 3738c2ecf20Sopenharmony_ci " pxor (%4), %%mm0 ;\n" 3748c2ecf20Sopenharmony_ci " movq 24(%1), %%mm3 ;\n" 3758c2ecf20Sopenharmony_ci " pxor 16(%3), %%mm2 ;\n" 3768c2ecf20Sopenharmony_ci " pxor 8(%4), %%mm1 ;\n" 3778c2ecf20Sopenharmony_ci " movq %%mm0, (%1) ;\n" 3788c2ecf20Sopenharmony_ci " movq 32(%1), %%mm4 ;\n" 3798c2ecf20Sopenharmony_ci " pxor 24(%2), %%mm3 ;\n" 3808c2ecf20Sopenharmony_ci " pxor 16(%4), %%mm2 ;\n" 3818c2ecf20Sopenharmony_ci " movq %%mm1, 8(%1) ;\n" 3828c2ecf20Sopenharmony_ci " movq 40(%1), %%mm5 ;\n" 3838c2ecf20Sopenharmony_ci " pxor 32(%2), %%mm4 ;\n" 3848c2ecf20Sopenharmony_ci " pxor 24(%3), %%mm3 ;\n" 3858c2ecf20Sopenharmony_ci " movq %%mm2, 16(%1) ;\n" 3868c2ecf20Sopenharmony_ci " pxor 40(%2), %%mm5 ;\n" 3878c2ecf20Sopenharmony_ci " pxor 32(%3), %%mm4 ;\n" 3888c2ecf20Sopenharmony_ci " pxor 24(%4), %%mm3 ;\n" 3898c2ecf20Sopenharmony_ci " movq %%mm3, 24(%1) ;\n" 3908c2ecf20Sopenharmony_ci " movq 56(%1), %%mm7 ;\n" 3918c2ecf20Sopenharmony_ci " movq 48(%1), %%mm6 ;\n" 3928c2ecf20Sopenharmony_ci " pxor 40(%3), %%mm5 ;\n" 3938c2ecf20Sopenharmony_ci " pxor 32(%4), %%mm4 ;\n" 3948c2ecf20Sopenharmony_ci " pxor 48(%2), %%mm6 ;\n" 3958c2ecf20Sopenharmony_ci " movq %%mm4, 32(%1) ;\n" 3968c2ecf20Sopenharmony_ci " pxor 56(%2), %%mm7 ;\n" 3978c2ecf20Sopenharmony_ci " pxor 40(%4), %%mm5 ;\n" 3988c2ecf20Sopenharmony_ci " pxor 48(%3), %%mm6 ;\n" 3998c2ecf20Sopenharmony_ci " pxor 56(%3), %%mm7 ;\n" 4008c2ecf20Sopenharmony_ci " movq %%mm5, 40(%1) ;\n" 4018c2ecf20Sopenharmony_ci " pxor 48(%4), %%mm6 ;\n" 4028c2ecf20Sopenharmony_ci " pxor 56(%4), %%mm7 ;\n" 4038c2ecf20Sopenharmony_ci " movq %%mm6, 48(%1) ;\n" 4048c2ecf20Sopenharmony_ci " movq %%mm7, 56(%1) ;\n" 4058c2ecf20Sopenharmony_ci 4068c2ecf20Sopenharmony_ci " addl $64, %1 ;\n" 4078c2ecf20Sopenharmony_ci " addl $64, %2 ;\n" 4088c2ecf20Sopenharmony_ci " addl $64, %3 ;\n" 4098c2ecf20Sopenharmony_ci " addl $64, %4 ;\n" 4108c2ecf20Sopenharmony_ci " decl %0 ;\n" 4118c2ecf20Sopenharmony_ci " jnz 1b ;\n" 4128c2ecf20Sopenharmony_ci : "+r" (lines), 4138c2ecf20Sopenharmony_ci "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) 4148c2ecf20Sopenharmony_ci : 4158c2ecf20Sopenharmony_ci : "memory"); 4168c2ecf20Sopenharmony_ci 4178c2ecf20Sopenharmony_ci kernel_fpu_end(); 4188c2ecf20Sopenharmony_ci} 4198c2ecf20Sopenharmony_ci 4208c2ecf20Sopenharmony_cistatic void 4218c2ecf20Sopenharmony_cixor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, 4228c2ecf20Sopenharmony_ci unsigned long *p3, unsigned long *p4, unsigned long *p5) 4238c2ecf20Sopenharmony_ci{ 4248c2ecf20Sopenharmony_ci unsigned long lines = bytes >> 6; 4258c2ecf20Sopenharmony_ci 4268c2ecf20Sopenharmony_ci kernel_fpu_begin(); 4278c2ecf20Sopenharmony_ci 4288c2ecf20Sopenharmony_ci /* Make sure GCC forgets anything it knows about p4 or p5, 4298c2ecf20Sopenharmony_ci such that it won't pass to the asm volatile below a 4308c2ecf20Sopenharmony_ci register that is shared with any other variable. That's 4318c2ecf20Sopenharmony_ci because we modify p4 and p5 there, but we can't mark them 4328c2ecf20Sopenharmony_ci as read/write, otherwise we'd overflow the 10-asm-operands 4338c2ecf20Sopenharmony_ci limit of GCC < 3.1. */ 4348c2ecf20Sopenharmony_ci asm("" : "+r" (p4), "+r" (p5)); 4358c2ecf20Sopenharmony_ci 4368c2ecf20Sopenharmony_ci asm volatile( 4378c2ecf20Sopenharmony_ci " .align 32,0x90 ;\n" 4388c2ecf20Sopenharmony_ci " 1: ;\n" 4398c2ecf20Sopenharmony_ci " movq (%1), %%mm0 ;\n" 4408c2ecf20Sopenharmony_ci " movq 8(%1), %%mm1 ;\n" 4418c2ecf20Sopenharmony_ci " pxor (%2), %%mm0 ;\n" 4428c2ecf20Sopenharmony_ci " pxor 8(%2), %%mm1 ;\n" 4438c2ecf20Sopenharmony_ci " movq 16(%1), %%mm2 ;\n" 4448c2ecf20Sopenharmony_ci " pxor (%3), %%mm0 ;\n" 4458c2ecf20Sopenharmony_ci " pxor 8(%3), %%mm1 ;\n" 4468c2ecf20Sopenharmony_ci " pxor 16(%2), %%mm2 ;\n" 4478c2ecf20Sopenharmony_ci " pxor (%4), %%mm0 ;\n" 4488c2ecf20Sopenharmony_ci " pxor 8(%4), %%mm1 ;\n" 4498c2ecf20Sopenharmony_ci " pxor 16(%3), %%mm2 ;\n" 4508c2ecf20Sopenharmony_ci " movq 24(%1), %%mm3 ;\n" 4518c2ecf20Sopenharmony_ci " pxor (%5), %%mm0 ;\n" 4528c2ecf20Sopenharmony_ci " pxor 8(%5), %%mm1 ;\n" 4538c2ecf20Sopenharmony_ci " movq %%mm0, (%1) ;\n" 4548c2ecf20Sopenharmony_ci " pxor 16(%4), %%mm2 ;\n" 4558c2ecf20Sopenharmony_ci " pxor 24(%2), %%mm3 ;\n" 4568c2ecf20Sopenharmony_ci " movq %%mm1, 8(%1) ;\n" 4578c2ecf20Sopenharmony_ci " pxor 16(%5), %%mm2 ;\n" 4588c2ecf20Sopenharmony_ci " pxor 24(%3), %%mm3 ;\n" 4598c2ecf20Sopenharmony_ci " movq 32(%1), %%mm4 ;\n" 4608c2ecf20Sopenharmony_ci " movq %%mm2, 16(%1) ;\n" 4618c2ecf20Sopenharmony_ci " pxor 24(%4), %%mm3 ;\n" 4628c2ecf20Sopenharmony_ci " pxor 32(%2), %%mm4 ;\n" 4638c2ecf20Sopenharmony_ci " movq 40(%1), %%mm5 ;\n" 4648c2ecf20Sopenharmony_ci " pxor 24(%5), %%mm3 ;\n" 4658c2ecf20Sopenharmony_ci " pxor 32(%3), %%mm4 ;\n" 4668c2ecf20Sopenharmony_ci " pxor 40(%2), %%mm5 ;\n" 4678c2ecf20Sopenharmony_ci " movq %%mm3, 24(%1) ;\n" 4688c2ecf20Sopenharmony_ci " pxor 32(%4), %%mm4 ;\n" 4698c2ecf20Sopenharmony_ci " pxor 40(%3), %%mm5 ;\n" 4708c2ecf20Sopenharmony_ci " movq 48(%1), %%mm6 ;\n" 4718c2ecf20Sopenharmony_ci " movq 56(%1), %%mm7 ;\n" 4728c2ecf20Sopenharmony_ci " pxor 32(%5), %%mm4 ;\n" 4738c2ecf20Sopenharmony_ci " pxor 40(%4), %%mm5 ;\n" 4748c2ecf20Sopenharmony_ci " pxor 48(%2), %%mm6 ;\n" 4758c2ecf20Sopenharmony_ci " pxor 56(%2), %%mm7 ;\n" 4768c2ecf20Sopenharmony_ci " movq %%mm4, 32(%1) ;\n" 4778c2ecf20Sopenharmony_ci " pxor 48(%3), %%mm6 ;\n" 4788c2ecf20Sopenharmony_ci " pxor 56(%3), %%mm7 ;\n" 4798c2ecf20Sopenharmony_ci " pxor 40(%5), %%mm5 ;\n" 4808c2ecf20Sopenharmony_ci " pxor 48(%4), %%mm6 ;\n" 4818c2ecf20Sopenharmony_ci " pxor 56(%4), %%mm7 ;\n" 4828c2ecf20Sopenharmony_ci " movq %%mm5, 40(%1) ;\n" 4838c2ecf20Sopenharmony_ci " pxor 48(%5), %%mm6 ;\n" 4848c2ecf20Sopenharmony_ci " pxor 56(%5), %%mm7 ;\n" 4858c2ecf20Sopenharmony_ci " movq %%mm6, 48(%1) ;\n" 4868c2ecf20Sopenharmony_ci " movq %%mm7, 56(%1) ;\n" 4878c2ecf20Sopenharmony_ci 4888c2ecf20Sopenharmony_ci " addl $64, %1 ;\n" 4898c2ecf20Sopenharmony_ci " addl $64, %2 ;\n" 4908c2ecf20Sopenharmony_ci " addl $64, %3 ;\n" 4918c2ecf20Sopenharmony_ci " addl $64, %4 ;\n" 4928c2ecf20Sopenharmony_ci " addl $64, %5 ;\n" 4938c2ecf20Sopenharmony_ci " decl %0 ;\n" 4948c2ecf20Sopenharmony_ci " jnz 1b ;\n" 4958c2ecf20Sopenharmony_ci : "+r" (lines), 4968c2ecf20Sopenharmony_ci "+r" (p1), "+r" (p2), "+r" (p3) 4978c2ecf20Sopenharmony_ci : "r" (p4), "r" (p5) 4988c2ecf20Sopenharmony_ci : "memory"); 4998c2ecf20Sopenharmony_ci 5008c2ecf20Sopenharmony_ci /* p4 and p5 were modified, and now the variables are dead. 5018c2ecf20Sopenharmony_ci Clobber them just to be sure nobody does something stupid 5028c2ecf20Sopenharmony_ci like assuming they have some legal value. */ 5038c2ecf20Sopenharmony_ci asm("" : "=r" (p4), "=r" (p5)); 5048c2ecf20Sopenharmony_ci 5058c2ecf20Sopenharmony_ci kernel_fpu_end(); 5068c2ecf20Sopenharmony_ci} 5078c2ecf20Sopenharmony_ci 5088c2ecf20Sopenharmony_cistatic struct xor_block_template xor_block_pII_mmx = { 5098c2ecf20Sopenharmony_ci .name = "pII_mmx", 5108c2ecf20Sopenharmony_ci .do_2 = xor_pII_mmx_2, 5118c2ecf20Sopenharmony_ci .do_3 = xor_pII_mmx_3, 5128c2ecf20Sopenharmony_ci .do_4 = xor_pII_mmx_4, 5138c2ecf20Sopenharmony_ci .do_5 = xor_pII_mmx_5, 5148c2ecf20Sopenharmony_ci}; 5158c2ecf20Sopenharmony_ci 5168c2ecf20Sopenharmony_cistatic struct xor_block_template xor_block_p5_mmx = { 5178c2ecf20Sopenharmony_ci .name = "p5_mmx", 5188c2ecf20Sopenharmony_ci .do_2 = xor_p5_mmx_2, 5198c2ecf20Sopenharmony_ci .do_3 = xor_p5_mmx_3, 5208c2ecf20Sopenharmony_ci .do_4 = xor_p5_mmx_4, 5218c2ecf20Sopenharmony_ci .do_5 = xor_p5_mmx_5, 5228c2ecf20Sopenharmony_ci}; 5238c2ecf20Sopenharmony_ci 5248c2ecf20Sopenharmony_cistatic struct xor_block_template xor_block_pIII_sse = { 5258c2ecf20Sopenharmony_ci .name = "pIII_sse", 5268c2ecf20Sopenharmony_ci .do_2 = xor_sse_2, 5278c2ecf20Sopenharmony_ci .do_3 = xor_sse_3, 5288c2ecf20Sopenharmony_ci .do_4 = xor_sse_4, 5298c2ecf20Sopenharmony_ci .do_5 = xor_sse_5, 5308c2ecf20Sopenharmony_ci}; 5318c2ecf20Sopenharmony_ci 5328c2ecf20Sopenharmony_ci/* Also try the AVX routines */ 5338c2ecf20Sopenharmony_ci#include <asm/xor_avx.h> 5348c2ecf20Sopenharmony_ci 5358c2ecf20Sopenharmony_ci/* Also try the generic routines. */ 5368c2ecf20Sopenharmony_ci#include <asm-generic/xor.h> 5378c2ecf20Sopenharmony_ci 5388c2ecf20Sopenharmony_ci/* We force the use of the SSE xor block because it can write around L2. 5398c2ecf20Sopenharmony_ci We may also be able to load into the L1 only depending on how the cpu 5408c2ecf20Sopenharmony_ci deals with a load to a line that is being prefetched. */ 5418c2ecf20Sopenharmony_ci#undef XOR_TRY_TEMPLATES 5428c2ecf20Sopenharmony_ci#define XOR_TRY_TEMPLATES \ 5438c2ecf20Sopenharmony_cido { \ 5448c2ecf20Sopenharmony_ci AVX_XOR_SPEED; \ 5458c2ecf20Sopenharmony_ci if (boot_cpu_has(X86_FEATURE_XMM)) { \ 5468c2ecf20Sopenharmony_ci xor_speed(&xor_block_pIII_sse); \ 5478c2ecf20Sopenharmony_ci xor_speed(&xor_block_sse_pf64); \ 5488c2ecf20Sopenharmony_ci } else if (boot_cpu_has(X86_FEATURE_MMX)) { \ 5498c2ecf20Sopenharmony_ci xor_speed(&xor_block_pII_mmx); \ 5508c2ecf20Sopenharmony_ci xor_speed(&xor_block_p5_mmx); \ 5518c2ecf20Sopenharmony_ci } else { \ 5528c2ecf20Sopenharmony_ci xor_speed(&xor_block_8regs); \ 5538c2ecf20Sopenharmony_ci xor_speed(&xor_block_8regs_p); \ 5548c2ecf20Sopenharmony_ci xor_speed(&xor_block_32regs); \ 5558c2ecf20Sopenharmony_ci xor_speed(&xor_block_32regs_p); \ 5568c2ecf20Sopenharmony_ci } \ 5578c2ecf20Sopenharmony_ci} while (0) 5588c2ecf20Sopenharmony_ci 5598c2ecf20Sopenharmony_ci#endif /* _ASM_X86_XOR_32_H */ 560