18c2ecf20Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-or-later */ 28c2ecf20Sopenharmony_ci#ifndef _ASM_X86_XOR_H 38c2ecf20Sopenharmony_ci#define _ASM_X86_XOR_H 48c2ecf20Sopenharmony_ci 58c2ecf20Sopenharmony_ci/* 68c2ecf20Sopenharmony_ci * Optimized RAID-5 checksumming functions for SSE. 78c2ecf20Sopenharmony_ci */ 88c2ecf20Sopenharmony_ci 98c2ecf20Sopenharmony_ci/* 108c2ecf20Sopenharmony_ci * Cache avoiding checksumming functions utilizing KNI instructions 118c2ecf20Sopenharmony_ci * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) 128c2ecf20Sopenharmony_ci */ 138c2ecf20Sopenharmony_ci 148c2ecf20Sopenharmony_ci/* 158c2ecf20Sopenharmony_ci * Based on 168c2ecf20Sopenharmony_ci * High-speed RAID5 checksumming functions utilizing SSE instructions. 178c2ecf20Sopenharmony_ci * Copyright (C) 1998 Ingo Molnar. 188c2ecf20Sopenharmony_ci */ 198c2ecf20Sopenharmony_ci 208c2ecf20Sopenharmony_ci/* 218c2ecf20Sopenharmony_ci * x86-64 changes / gcc fixes from Andi Kleen. 228c2ecf20Sopenharmony_ci * Copyright 2002 Andi Kleen, SuSE Labs. 238c2ecf20Sopenharmony_ci * 248c2ecf20Sopenharmony_ci * This hasn't been optimized for the hammer yet, but there are likely 258c2ecf20Sopenharmony_ci * no advantages to be gotten from x86-64 here anyways. 268c2ecf20Sopenharmony_ci */ 278c2ecf20Sopenharmony_ci 288c2ecf20Sopenharmony_ci#include <asm/fpu/api.h> 298c2ecf20Sopenharmony_ci 308c2ecf20Sopenharmony_ci#ifdef CONFIG_X86_32 318c2ecf20Sopenharmony_ci/* reduce register pressure */ 328c2ecf20Sopenharmony_ci# define XOR_CONSTANT_CONSTRAINT "i" 338c2ecf20Sopenharmony_ci#else 348c2ecf20Sopenharmony_ci# define XOR_CONSTANT_CONSTRAINT "re" 358c2ecf20Sopenharmony_ci#endif 368c2ecf20Sopenharmony_ci 378c2ecf20Sopenharmony_ci#define OFFS(x) "16*("#x")" 388c2ecf20Sopenharmony_ci#define PF_OFFS(x) "256+16*("#x")" 398c2ecf20Sopenharmony_ci#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n" 408c2ecf20Sopenharmony_ci#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n" 418c2ecf20Sopenharmony_ci#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n" 428c2ecf20Sopenharmony_ci#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n" 438c2ecf20Sopenharmony_ci#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n" 448c2ecf20Sopenharmony_ci#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n" 458c2ecf20Sopenharmony_ci#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n" 468c2ecf20Sopenharmony_ci#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n" 478c2ecf20Sopenharmony_ci#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n" 488c2ecf20Sopenharmony_ci#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n" 498c2ecf20Sopenharmony_ci#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n" 508c2ecf20Sopenharmony_ci#define NOP(x) 518c2ecf20Sopenharmony_ci 528c2ecf20Sopenharmony_ci#define BLK64(pf, op, i) \ 538c2ecf20Sopenharmony_ci pf(i) \ 548c2ecf20Sopenharmony_ci op(i, 0) \ 558c2ecf20Sopenharmony_ci op(i + 1, 1) \ 568c2ecf20Sopenharmony_ci op(i + 2, 2) \ 578c2ecf20Sopenharmony_ci op(i + 3, 3) 588c2ecf20Sopenharmony_ci 598c2ecf20Sopenharmony_cistatic void 608c2ecf20Sopenharmony_cixor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) 618c2ecf20Sopenharmony_ci{ 628c2ecf20Sopenharmony_ci unsigned long lines = bytes >> 8; 638c2ecf20Sopenharmony_ci 648c2ecf20Sopenharmony_ci kernel_fpu_begin(); 658c2ecf20Sopenharmony_ci 668c2ecf20Sopenharmony_ci asm volatile( 678c2ecf20Sopenharmony_ci#undef BLOCK 688c2ecf20Sopenharmony_ci#define BLOCK(i) \ 698c2ecf20Sopenharmony_ci LD(i, 0) \ 708c2ecf20Sopenharmony_ci LD(i + 1, 1) \ 718c2ecf20Sopenharmony_ci PF1(i) \ 728c2ecf20Sopenharmony_ci PF1(i + 2) \ 738c2ecf20Sopenharmony_ci LD(i + 2, 2) \ 748c2ecf20Sopenharmony_ci LD(i + 3, 3) \ 758c2ecf20Sopenharmony_ci PF0(i + 4) \ 768c2ecf20Sopenharmony_ci PF0(i + 6) \ 778c2ecf20Sopenharmony_ci XO1(i, 0) \ 788c2ecf20Sopenharmony_ci XO1(i + 1, 1) \ 798c2ecf20Sopenharmony_ci XO1(i + 2, 2) \ 808c2ecf20Sopenharmony_ci XO1(i + 3, 3) \ 818c2ecf20Sopenharmony_ci ST(i, 0) \ 828c2ecf20Sopenharmony_ci ST(i + 1, 1) \ 838c2ecf20Sopenharmony_ci ST(i + 2, 2) \ 848c2ecf20Sopenharmony_ci ST(i + 3, 3) \ 858c2ecf20Sopenharmony_ci 868c2ecf20Sopenharmony_ci 878c2ecf20Sopenharmony_ci PF0(0) 888c2ecf20Sopenharmony_ci PF0(2) 898c2ecf20Sopenharmony_ci 908c2ecf20Sopenharmony_ci " .align 32 ;\n" 918c2ecf20Sopenharmony_ci " 1: ;\n" 928c2ecf20Sopenharmony_ci 938c2ecf20Sopenharmony_ci BLOCK(0) 948c2ecf20Sopenharmony_ci BLOCK(4) 958c2ecf20Sopenharmony_ci BLOCK(8) 968c2ecf20Sopenharmony_ci BLOCK(12) 978c2ecf20Sopenharmony_ci 988c2ecf20Sopenharmony_ci " add %[inc], %[p1] ;\n" 998c2ecf20Sopenharmony_ci " add %[inc], %[p2] ;\n" 1008c2ecf20Sopenharmony_ci " dec %[cnt] ;\n" 1018c2ecf20Sopenharmony_ci " jnz 1b ;\n" 1028c2ecf20Sopenharmony_ci : [cnt] "+r" (lines), 1038c2ecf20Sopenharmony_ci [p1] "+r" (p1), [p2] "+r" (p2) 1048c2ecf20Sopenharmony_ci : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 1058c2ecf20Sopenharmony_ci : "memory"); 1068c2ecf20Sopenharmony_ci 1078c2ecf20Sopenharmony_ci kernel_fpu_end(); 1088c2ecf20Sopenharmony_ci} 1098c2ecf20Sopenharmony_ci 1108c2ecf20Sopenharmony_cistatic void 1118c2ecf20Sopenharmony_cixor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2) 1128c2ecf20Sopenharmony_ci{ 1138c2ecf20Sopenharmony_ci unsigned long lines = bytes >> 8; 1148c2ecf20Sopenharmony_ci 1158c2ecf20Sopenharmony_ci kernel_fpu_begin(); 1168c2ecf20Sopenharmony_ci 1178c2ecf20Sopenharmony_ci asm volatile( 1188c2ecf20Sopenharmony_ci#undef BLOCK 1198c2ecf20Sopenharmony_ci#define BLOCK(i) \ 1208c2ecf20Sopenharmony_ci BLK64(PF0, LD, i) \ 1218c2ecf20Sopenharmony_ci BLK64(PF1, XO1, i) \ 1228c2ecf20Sopenharmony_ci BLK64(NOP, ST, i) \ 1238c2ecf20Sopenharmony_ci 1248c2ecf20Sopenharmony_ci " .align 32 ;\n" 1258c2ecf20Sopenharmony_ci " 1: ;\n" 1268c2ecf20Sopenharmony_ci 1278c2ecf20Sopenharmony_ci BLOCK(0) 1288c2ecf20Sopenharmony_ci BLOCK(4) 1298c2ecf20Sopenharmony_ci BLOCK(8) 1308c2ecf20Sopenharmony_ci BLOCK(12) 1318c2ecf20Sopenharmony_ci 1328c2ecf20Sopenharmony_ci " add %[inc], %[p1] ;\n" 1338c2ecf20Sopenharmony_ci " add %[inc], %[p2] ;\n" 1348c2ecf20Sopenharmony_ci " dec %[cnt] ;\n" 1358c2ecf20Sopenharmony_ci " jnz 1b ;\n" 1368c2ecf20Sopenharmony_ci : [cnt] "+r" (lines), 1378c2ecf20Sopenharmony_ci [p1] "+r" (p1), [p2] "+r" (p2) 1388c2ecf20Sopenharmony_ci : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 1398c2ecf20Sopenharmony_ci : "memory"); 1408c2ecf20Sopenharmony_ci 1418c2ecf20Sopenharmony_ci kernel_fpu_end(); 1428c2ecf20Sopenharmony_ci} 1438c2ecf20Sopenharmony_ci 1448c2ecf20Sopenharmony_cistatic void 1458c2ecf20Sopenharmony_cixor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, 1468c2ecf20Sopenharmony_ci unsigned long *p3) 1478c2ecf20Sopenharmony_ci{ 1488c2ecf20Sopenharmony_ci unsigned long lines = bytes >> 8; 1498c2ecf20Sopenharmony_ci 1508c2ecf20Sopenharmony_ci kernel_fpu_begin(); 1518c2ecf20Sopenharmony_ci 1528c2ecf20Sopenharmony_ci asm volatile( 1538c2ecf20Sopenharmony_ci#undef BLOCK 1548c2ecf20Sopenharmony_ci#define BLOCK(i) \ 1558c2ecf20Sopenharmony_ci PF1(i) \ 1568c2ecf20Sopenharmony_ci PF1(i + 2) \ 1578c2ecf20Sopenharmony_ci LD(i, 0) \ 1588c2ecf20Sopenharmony_ci LD(i + 1, 1) \ 1598c2ecf20Sopenharmony_ci LD(i + 2, 2) \ 1608c2ecf20Sopenharmony_ci LD(i + 3, 3) \ 1618c2ecf20Sopenharmony_ci PF2(i) \ 1628c2ecf20Sopenharmony_ci PF2(i + 2) \ 1638c2ecf20Sopenharmony_ci PF0(i + 4) \ 1648c2ecf20Sopenharmony_ci PF0(i + 6) \ 1658c2ecf20Sopenharmony_ci XO1(i, 0) \ 1668c2ecf20Sopenharmony_ci XO1(i + 1, 1) \ 1678c2ecf20Sopenharmony_ci XO1(i + 2, 2) \ 1688c2ecf20Sopenharmony_ci XO1(i + 3, 3) \ 1698c2ecf20Sopenharmony_ci XO2(i, 0) \ 1708c2ecf20Sopenharmony_ci XO2(i + 1, 1) \ 1718c2ecf20Sopenharmony_ci XO2(i + 2, 2) \ 1728c2ecf20Sopenharmony_ci XO2(i + 3, 3) \ 1738c2ecf20Sopenharmony_ci ST(i, 0) \ 1748c2ecf20Sopenharmony_ci ST(i + 1, 1) \ 1758c2ecf20Sopenharmony_ci ST(i + 2, 2) \ 1768c2ecf20Sopenharmony_ci ST(i + 3, 3) \ 1778c2ecf20Sopenharmony_ci 1788c2ecf20Sopenharmony_ci 1798c2ecf20Sopenharmony_ci PF0(0) 1808c2ecf20Sopenharmony_ci PF0(2) 1818c2ecf20Sopenharmony_ci 1828c2ecf20Sopenharmony_ci " .align 32 ;\n" 1838c2ecf20Sopenharmony_ci " 1: ;\n" 1848c2ecf20Sopenharmony_ci 1858c2ecf20Sopenharmony_ci BLOCK(0) 1868c2ecf20Sopenharmony_ci BLOCK(4) 1878c2ecf20Sopenharmony_ci BLOCK(8) 1888c2ecf20Sopenharmony_ci BLOCK(12) 1898c2ecf20Sopenharmony_ci 1908c2ecf20Sopenharmony_ci " add %[inc], %[p1] ;\n" 1918c2ecf20Sopenharmony_ci " add %[inc], %[p2] ;\n" 1928c2ecf20Sopenharmony_ci " add %[inc], %[p3] ;\n" 1938c2ecf20Sopenharmony_ci " dec %[cnt] ;\n" 1948c2ecf20Sopenharmony_ci " jnz 1b ;\n" 1958c2ecf20Sopenharmony_ci : [cnt] "+r" (lines), 1968c2ecf20Sopenharmony_ci [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) 1978c2ecf20Sopenharmony_ci : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 1988c2ecf20Sopenharmony_ci : "memory"); 1998c2ecf20Sopenharmony_ci 2008c2ecf20Sopenharmony_ci kernel_fpu_end(); 2018c2ecf20Sopenharmony_ci} 2028c2ecf20Sopenharmony_ci 2038c2ecf20Sopenharmony_cistatic void 2048c2ecf20Sopenharmony_cixor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2, 2058c2ecf20Sopenharmony_ci unsigned long *p3) 2068c2ecf20Sopenharmony_ci{ 2078c2ecf20Sopenharmony_ci unsigned long lines = bytes >> 8; 2088c2ecf20Sopenharmony_ci 2098c2ecf20Sopenharmony_ci kernel_fpu_begin(); 2108c2ecf20Sopenharmony_ci 2118c2ecf20Sopenharmony_ci asm volatile( 2128c2ecf20Sopenharmony_ci#undef BLOCK 2138c2ecf20Sopenharmony_ci#define BLOCK(i) \ 2148c2ecf20Sopenharmony_ci BLK64(PF0, LD, i) \ 2158c2ecf20Sopenharmony_ci BLK64(PF1, XO1, i) \ 2168c2ecf20Sopenharmony_ci BLK64(PF2, XO2, i) \ 2178c2ecf20Sopenharmony_ci BLK64(NOP, ST, i) \ 2188c2ecf20Sopenharmony_ci 2198c2ecf20Sopenharmony_ci " .align 32 ;\n" 2208c2ecf20Sopenharmony_ci " 1: ;\n" 2218c2ecf20Sopenharmony_ci 2228c2ecf20Sopenharmony_ci BLOCK(0) 2238c2ecf20Sopenharmony_ci BLOCK(4) 2248c2ecf20Sopenharmony_ci BLOCK(8) 2258c2ecf20Sopenharmony_ci BLOCK(12) 2268c2ecf20Sopenharmony_ci 2278c2ecf20Sopenharmony_ci " add %[inc], %[p1] ;\n" 2288c2ecf20Sopenharmony_ci " add %[inc], %[p2] ;\n" 2298c2ecf20Sopenharmony_ci " add %[inc], %[p3] ;\n" 2308c2ecf20Sopenharmony_ci " dec %[cnt] ;\n" 2318c2ecf20Sopenharmony_ci " jnz 1b ;\n" 2328c2ecf20Sopenharmony_ci : [cnt] "+r" (lines), 2338c2ecf20Sopenharmony_ci [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) 2348c2ecf20Sopenharmony_ci : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 2358c2ecf20Sopenharmony_ci : "memory"); 2368c2ecf20Sopenharmony_ci 2378c2ecf20Sopenharmony_ci kernel_fpu_end(); 2388c2ecf20Sopenharmony_ci} 2398c2ecf20Sopenharmony_ci 2408c2ecf20Sopenharmony_cistatic void 2418c2ecf20Sopenharmony_cixor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, 2428c2ecf20Sopenharmony_ci unsigned long *p3, unsigned long *p4) 2438c2ecf20Sopenharmony_ci{ 2448c2ecf20Sopenharmony_ci unsigned long lines = bytes >> 8; 2458c2ecf20Sopenharmony_ci 2468c2ecf20Sopenharmony_ci kernel_fpu_begin(); 2478c2ecf20Sopenharmony_ci 2488c2ecf20Sopenharmony_ci asm volatile( 2498c2ecf20Sopenharmony_ci#undef BLOCK 2508c2ecf20Sopenharmony_ci#define BLOCK(i) \ 2518c2ecf20Sopenharmony_ci PF1(i) \ 2528c2ecf20Sopenharmony_ci PF1(i + 2) \ 2538c2ecf20Sopenharmony_ci LD(i, 0) \ 2548c2ecf20Sopenharmony_ci LD(i + 1, 1) \ 2558c2ecf20Sopenharmony_ci LD(i + 2, 2) \ 2568c2ecf20Sopenharmony_ci LD(i + 3, 3) \ 2578c2ecf20Sopenharmony_ci PF2(i) \ 2588c2ecf20Sopenharmony_ci PF2(i + 2) \ 2598c2ecf20Sopenharmony_ci XO1(i, 0) \ 2608c2ecf20Sopenharmony_ci XO1(i + 1, 1) \ 2618c2ecf20Sopenharmony_ci XO1(i + 2, 2) \ 2628c2ecf20Sopenharmony_ci XO1(i + 3, 3) \ 2638c2ecf20Sopenharmony_ci PF3(i) \ 2648c2ecf20Sopenharmony_ci PF3(i + 2) \ 2658c2ecf20Sopenharmony_ci PF0(i + 4) \ 2668c2ecf20Sopenharmony_ci PF0(i + 6) \ 2678c2ecf20Sopenharmony_ci XO2(i, 0) \ 2688c2ecf20Sopenharmony_ci XO2(i + 1, 1) \ 2698c2ecf20Sopenharmony_ci XO2(i + 2, 2) \ 2708c2ecf20Sopenharmony_ci XO2(i + 3, 3) \ 2718c2ecf20Sopenharmony_ci XO3(i, 0) \ 2728c2ecf20Sopenharmony_ci XO3(i + 1, 1) \ 2738c2ecf20Sopenharmony_ci XO3(i + 2, 2) \ 2748c2ecf20Sopenharmony_ci XO3(i + 3, 3) \ 2758c2ecf20Sopenharmony_ci ST(i, 0) \ 2768c2ecf20Sopenharmony_ci ST(i + 1, 1) \ 2778c2ecf20Sopenharmony_ci ST(i + 2, 2) \ 2788c2ecf20Sopenharmony_ci ST(i + 3, 3) \ 2798c2ecf20Sopenharmony_ci 2808c2ecf20Sopenharmony_ci 2818c2ecf20Sopenharmony_ci PF0(0) 2828c2ecf20Sopenharmony_ci PF0(2) 2838c2ecf20Sopenharmony_ci 2848c2ecf20Sopenharmony_ci " .align 32 ;\n" 2858c2ecf20Sopenharmony_ci " 1: ;\n" 2868c2ecf20Sopenharmony_ci 2878c2ecf20Sopenharmony_ci BLOCK(0) 2888c2ecf20Sopenharmony_ci BLOCK(4) 2898c2ecf20Sopenharmony_ci BLOCK(8) 2908c2ecf20Sopenharmony_ci BLOCK(12) 2918c2ecf20Sopenharmony_ci 2928c2ecf20Sopenharmony_ci " add %[inc], %[p1] ;\n" 2938c2ecf20Sopenharmony_ci " add %[inc], %[p2] ;\n" 2948c2ecf20Sopenharmony_ci " add %[inc], %[p3] ;\n" 2958c2ecf20Sopenharmony_ci " add %[inc], %[p4] ;\n" 2968c2ecf20Sopenharmony_ci " dec %[cnt] ;\n" 2978c2ecf20Sopenharmony_ci " jnz 1b ;\n" 2988c2ecf20Sopenharmony_ci : [cnt] "+r" (lines), [p1] "+r" (p1), 2998c2ecf20Sopenharmony_ci [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) 3008c2ecf20Sopenharmony_ci : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 3018c2ecf20Sopenharmony_ci : "memory"); 3028c2ecf20Sopenharmony_ci 3038c2ecf20Sopenharmony_ci kernel_fpu_end(); 3048c2ecf20Sopenharmony_ci} 3058c2ecf20Sopenharmony_ci 3068c2ecf20Sopenharmony_cistatic void 3078c2ecf20Sopenharmony_cixor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2, 3088c2ecf20Sopenharmony_ci unsigned long *p3, unsigned long *p4) 3098c2ecf20Sopenharmony_ci{ 3108c2ecf20Sopenharmony_ci unsigned long lines = bytes >> 8; 3118c2ecf20Sopenharmony_ci 3128c2ecf20Sopenharmony_ci kernel_fpu_begin(); 3138c2ecf20Sopenharmony_ci 3148c2ecf20Sopenharmony_ci asm volatile( 3158c2ecf20Sopenharmony_ci#undef BLOCK 3168c2ecf20Sopenharmony_ci#define BLOCK(i) \ 3178c2ecf20Sopenharmony_ci BLK64(PF0, LD, i) \ 3188c2ecf20Sopenharmony_ci BLK64(PF1, XO1, i) \ 3198c2ecf20Sopenharmony_ci BLK64(PF2, XO2, i) \ 3208c2ecf20Sopenharmony_ci BLK64(PF3, XO3, i) \ 3218c2ecf20Sopenharmony_ci BLK64(NOP, ST, i) \ 3228c2ecf20Sopenharmony_ci 3238c2ecf20Sopenharmony_ci " .align 32 ;\n" 3248c2ecf20Sopenharmony_ci " 1: ;\n" 3258c2ecf20Sopenharmony_ci 3268c2ecf20Sopenharmony_ci BLOCK(0) 3278c2ecf20Sopenharmony_ci BLOCK(4) 3288c2ecf20Sopenharmony_ci BLOCK(8) 3298c2ecf20Sopenharmony_ci BLOCK(12) 3308c2ecf20Sopenharmony_ci 3318c2ecf20Sopenharmony_ci " add %[inc], %[p1] ;\n" 3328c2ecf20Sopenharmony_ci " add %[inc], %[p2] ;\n" 3338c2ecf20Sopenharmony_ci " add %[inc], %[p3] ;\n" 3348c2ecf20Sopenharmony_ci " add %[inc], %[p4] ;\n" 3358c2ecf20Sopenharmony_ci " dec %[cnt] ;\n" 3368c2ecf20Sopenharmony_ci " jnz 1b ;\n" 3378c2ecf20Sopenharmony_ci : [cnt] "+r" (lines), [p1] "+r" (p1), 3388c2ecf20Sopenharmony_ci [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) 3398c2ecf20Sopenharmony_ci : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 3408c2ecf20Sopenharmony_ci : "memory"); 3418c2ecf20Sopenharmony_ci 3428c2ecf20Sopenharmony_ci kernel_fpu_end(); 3438c2ecf20Sopenharmony_ci} 3448c2ecf20Sopenharmony_ci 3458c2ecf20Sopenharmony_cistatic void 3468c2ecf20Sopenharmony_cixor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, 3478c2ecf20Sopenharmony_ci unsigned long *p3, unsigned long *p4, unsigned long *p5) 3488c2ecf20Sopenharmony_ci{ 3498c2ecf20Sopenharmony_ci unsigned long lines = bytes >> 8; 3508c2ecf20Sopenharmony_ci 3518c2ecf20Sopenharmony_ci kernel_fpu_begin(); 3528c2ecf20Sopenharmony_ci 3538c2ecf20Sopenharmony_ci asm volatile( 3548c2ecf20Sopenharmony_ci#undef BLOCK 3558c2ecf20Sopenharmony_ci#define BLOCK(i) \ 3568c2ecf20Sopenharmony_ci PF1(i) \ 3578c2ecf20Sopenharmony_ci PF1(i + 2) \ 3588c2ecf20Sopenharmony_ci LD(i, 0) \ 3598c2ecf20Sopenharmony_ci LD(i + 1, 1) \ 3608c2ecf20Sopenharmony_ci LD(i + 2, 2) \ 3618c2ecf20Sopenharmony_ci LD(i + 3, 3) \ 3628c2ecf20Sopenharmony_ci PF2(i) \ 3638c2ecf20Sopenharmony_ci PF2(i + 2) \ 3648c2ecf20Sopenharmony_ci XO1(i, 0) \ 3658c2ecf20Sopenharmony_ci XO1(i + 1, 1) \ 3668c2ecf20Sopenharmony_ci XO1(i + 2, 2) \ 3678c2ecf20Sopenharmony_ci XO1(i + 3, 3) \ 3688c2ecf20Sopenharmony_ci PF3(i) \ 3698c2ecf20Sopenharmony_ci PF3(i + 2) \ 3708c2ecf20Sopenharmony_ci XO2(i, 0) \ 3718c2ecf20Sopenharmony_ci XO2(i + 1, 1) \ 3728c2ecf20Sopenharmony_ci XO2(i + 2, 2) \ 3738c2ecf20Sopenharmony_ci XO2(i + 3, 3) \ 3748c2ecf20Sopenharmony_ci PF4(i) \ 3758c2ecf20Sopenharmony_ci PF4(i + 2) \ 3768c2ecf20Sopenharmony_ci PF0(i + 4) \ 3778c2ecf20Sopenharmony_ci PF0(i + 6) \ 3788c2ecf20Sopenharmony_ci XO3(i, 0) \ 3798c2ecf20Sopenharmony_ci XO3(i + 1, 1) \ 3808c2ecf20Sopenharmony_ci XO3(i + 2, 2) \ 3818c2ecf20Sopenharmony_ci XO3(i + 3, 3) \ 3828c2ecf20Sopenharmony_ci XO4(i, 0) \ 3838c2ecf20Sopenharmony_ci XO4(i + 1, 1) \ 3848c2ecf20Sopenharmony_ci XO4(i + 2, 2) \ 3858c2ecf20Sopenharmony_ci XO4(i + 3, 3) \ 3868c2ecf20Sopenharmony_ci ST(i, 0) \ 3878c2ecf20Sopenharmony_ci ST(i + 1, 1) \ 3888c2ecf20Sopenharmony_ci ST(i + 2, 2) \ 3898c2ecf20Sopenharmony_ci ST(i + 3, 3) \ 3908c2ecf20Sopenharmony_ci 3918c2ecf20Sopenharmony_ci 3928c2ecf20Sopenharmony_ci PF0(0) 3938c2ecf20Sopenharmony_ci PF0(2) 3948c2ecf20Sopenharmony_ci 3958c2ecf20Sopenharmony_ci " .align 32 ;\n" 3968c2ecf20Sopenharmony_ci " 1: ;\n" 3978c2ecf20Sopenharmony_ci 3988c2ecf20Sopenharmony_ci BLOCK(0) 3998c2ecf20Sopenharmony_ci BLOCK(4) 4008c2ecf20Sopenharmony_ci BLOCK(8) 4018c2ecf20Sopenharmony_ci BLOCK(12) 4028c2ecf20Sopenharmony_ci 4038c2ecf20Sopenharmony_ci " add %[inc], %[p1] ;\n" 4048c2ecf20Sopenharmony_ci " add %[inc], %[p2] ;\n" 4058c2ecf20Sopenharmony_ci " add %[inc], %[p3] ;\n" 4068c2ecf20Sopenharmony_ci " add %[inc], %[p4] ;\n" 4078c2ecf20Sopenharmony_ci " add %[inc], %[p5] ;\n" 4088c2ecf20Sopenharmony_ci " dec %[cnt] ;\n" 4098c2ecf20Sopenharmony_ci " jnz 1b ;\n" 4108c2ecf20Sopenharmony_ci : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2), 4118c2ecf20Sopenharmony_ci [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5) 4128c2ecf20Sopenharmony_ci : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 4138c2ecf20Sopenharmony_ci : "memory"); 4148c2ecf20Sopenharmony_ci 4158c2ecf20Sopenharmony_ci kernel_fpu_end(); 4168c2ecf20Sopenharmony_ci} 4178c2ecf20Sopenharmony_ci 4188c2ecf20Sopenharmony_cistatic void 4198c2ecf20Sopenharmony_cixor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2, 4208c2ecf20Sopenharmony_ci unsigned long *p3, unsigned long *p4, unsigned long *p5) 4218c2ecf20Sopenharmony_ci{ 4228c2ecf20Sopenharmony_ci unsigned long lines = bytes >> 8; 4238c2ecf20Sopenharmony_ci 4248c2ecf20Sopenharmony_ci kernel_fpu_begin(); 4258c2ecf20Sopenharmony_ci 4268c2ecf20Sopenharmony_ci asm volatile( 4278c2ecf20Sopenharmony_ci#undef BLOCK 4288c2ecf20Sopenharmony_ci#define BLOCK(i) \ 4298c2ecf20Sopenharmony_ci BLK64(PF0, LD, i) \ 4308c2ecf20Sopenharmony_ci BLK64(PF1, XO1, i) \ 4318c2ecf20Sopenharmony_ci BLK64(PF2, XO2, i) \ 4328c2ecf20Sopenharmony_ci BLK64(PF3, XO3, i) \ 4338c2ecf20Sopenharmony_ci BLK64(PF4, XO4, i) \ 4348c2ecf20Sopenharmony_ci BLK64(NOP, ST, i) \ 4358c2ecf20Sopenharmony_ci 4368c2ecf20Sopenharmony_ci " .align 32 ;\n" 4378c2ecf20Sopenharmony_ci " 1: ;\n" 4388c2ecf20Sopenharmony_ci 4398c2ecf20Sopenharmony_ci BLOCK(0) 4408c2ecf20Sopenharmony_ci BLOCK(4) 4418c2ecf20Sopenharmony_ci BLOCK(8) 4428c2ecf20Sopenharmony_ci BLOCK(12) 4438c2ecf20Sopenharmony_ci 4448c2ecf20Sopenharmony_ci " add %[inc], %[p1] ;\n" 4458c2ecf20Sopenharmony_ci " add %[inc], %[p2] ;\n" 4468c2ecf20Sopenharmony_ci " add %[inc], %[p3] ;\n" 4478c2ecf20Sopenharmony_ci " add %[inc], %[p4] ;\n" 4488c2ecf20Sopenharmony_ci " add %[inc], %[p5] ;\n" 4498c2ecf20Sopenharmony_ci " dec %[cnt] ;\n" 4508c2ecf20Sopenharmony_ci " jnz 1b ;\n" 4518c2ecf20Sopenharmony_ci : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2), 4528c2ecf20Sopenharmony_ci [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5) 4538c2ecf20Sopenharmony_ci : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 4548c2ecf20Sopenharmony_ci : "memory"); 4558c2ecf20Sopenharmony_ci 4568c2ecf20Sopenharmony_ci kernel_fpu_end(); 4578c2ecf20Sopenharmony_ci} 4588c2ecf20Sopenharmony_ci 4598c2ecf20Sopenharmony_cistatic struct xor_block_template xor_block_sse_pf64 = { 4608c2ecf20Sopenharmony_ci .name = "prefetch64-sse", 4618c2ecf20Sopenharmony_ci .do_2 = xor_sse_2_pf64, 4628c2ecf20Sopenharmony_ci .do_3 = xor_sse_3_pf64, 4638c2ecf20Sopenharmony_ci .do_4 = xor_sse_4_pf64, 4648c2ecf20Sopenharmony_ci .do_5 = xor_sse_5_pf64, 4658c2ecf20Sopenharmony_ci}; 4668c2ecf20Sopenharmony_ci 4678c2ecf20Sopenharmony_ci#undef LD 4688c2ecf20Sopenharmony_ci#undef XO1 4698c2ecf20Sopenharmony_ci#undef XO2 4708c2ecf20Sopenharmony_ci#undef XO3 4718c2ecf20Sopenharmony_ci#undef XO4 4728c2ecf20Sopenharmony_ci#undef ST 4738c2ecf20Sopenharmony_ci#undef NOP 4748c2ecf20Sopenharmony_ci#undef BLK64 4758c2ecf20Sopenharmony_ci#undef BLOCK 4768c2ecf20Sopenharmony_ci 4778c2ecf20Sopenharmony_ci#undef XOR_CONSTANT_CONSTRAINT 4788c2ecf20Sopenharmony_ci 4798c2ecf20Sopenharmony_ci#ifdef CONFIG_X86_32 4808c2ecf20Sopenharmony_ci# include <asm/xor_32.h> 4818c2ecf20Sopenharmony_ci#else 4828c2ecf20Sopenharmony_ci# include <asm/xor_64.h> 4838c2ecf20Sopenharmony_ci#endif 4848c2ecf20Sopenharmony_ci 4858c2ecf20Sopenharmony_ci#define XOR_SELECT_TEMPLATE(FASTEST) \ 4868c2ecf20Sopenharmony_ci AVX_SELECT(FASTEST) 4878c2ecf20Sopenharmony_ci 4888c2ecf20Sopenharmony_ci#endif /* _ASM_X86_XOR_H */ 489