18c2ecf20Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-only */ 28c2ecf20Sopenharmony_ci#ifndef _ASM_X86_XOR_AVX_H 38c2ecf20Sopenharmony_ci#define _ASM_X86_XOR_AVX_H 48c2ecf20Sopenharmony_ci 58c2ecf20Sopenharmony_ci/* 68c2ecf20Sopenharmony_ci * Optimized RAID-5 checksumming functions for AVX 78c2ecf20Sopenharmony_ci * 88c2ecf20Sopenharmony_ci * Copyright (C) 2012 Intel Corporation 98c2ecf20Sopenharmony_ci * Author: Jim Kukunas <james.t.kukunas@linux.intel.com> 108c2ecf20Sopenharmony_ci * 118c2ecf20Sopenharmony_ci * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines 128c2ecf20Sopenharmony_ci */ 138c2ecf20Sopenharmony_ci 148c2ecf20Sopenharmony_ci#include <linux/compiler.h> 158c2ecf20Sopenharmony_ci#include <asm/fpu/api.h> 168c2ecf20Sopenharmony_ci 178c2ecf20Sopenharmony_ci#define BLOCK4(i) \ 188c2ecf20Sopenharmony_ci BLOCK(32 * i, 0) \ 198c2ecf20Sopenharmony_ci BLOCK(32 * (i + 1), 1) \ 208c2ecf20Sopenharmony_ci BLOCK(32 * (i + 2), 2) \ 218c2ecf20Sopenharmony_ci BLOCK(32 * (i + 3), 3) 228c2ecf20Sopenharmony_ci 238c2ecf20Sopenharmony_ci#define BLOCK16() \ 248c2ecf20Sopenharmony_ci BLOCK4(0) \ 258c2ecf20Sopenharmony_ci BLOCK4(4) \ 268c2ecf20Sopenharmony_ci BLOCK4(8) \ 278c2ecf20Sopenharmony_ci BLOCK4(12) 288c2ecf20Sopenharmony_ci 298c2ecf20Sopenharmony_cistatic void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1) 308c2ecf20Sopenharmony_ci{ 318c2ecf20Sopenharmony_ci unsigned long lines = bytes >> 9; 328c2ecf20Sopenharmony_ci 338c2ecf20Sopenharmony_ci kernel_fpu_begin(); 348c2ecf20Sopenharmony_ci 358c2ecf20Sopenharmony_ci while (lines--) { 368c2ecf20Sopenharmony_ci#undef BLOCK 378c2ecf20Sopenharmony_ci#define BLOCK(i, reg) \ 388c2ecf20Sopenharmony_cido { \ 398c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \ 408c2ecf20Sopenharmony_ci asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 418c2ecf20Sopenharmony_ci "m" (p0[i / sizeof(*p0)])); \ 428c2ecf20Sopenharmony_ci asm volatile("vmovdqa %%ymm" #reg ", %0" : \ 438c2ecf20Sopenharmony_ci "=m" (p0[i / sizeof(*p0)])); \ 448c2ecf20Sopenharmony_ci} while (0); 458c2ecf20Sopenharmony_ci 468c2ecf20Sopenharmony_ci BLOCK16() 478c2ecf20Sopenharmony_ci 488c2ecf20Sopenharmony_ci p0 = (unsigned long *)((uintptr_t)p0 + 512); 498c2ecf20Sopenharmony_ci p1 = (unsigned long *)((uintptr_t)p1 + 512); 508c2ecf20Sopenharmony_ci } 518c2ecf20Sopenharmony_ci 528c2ecf20Sopenharmony_ci kernel_fpu_end(); 538c2ecf20Sopenharmony_ci} 548c2ecf20Sopenharmony_ci 558c2ecf20Sopenharmony_cistatic void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1, 568c2ecf20Sopenharmony_ci unsigned long *p2) 578c2ecf20Sopenharmony_ci{ 588c2ecf20Sopenharmony_ci unsigned long lines = bytes >> 9; 598c2ecf20Sopenharmony_ci 608c2ecf20Sopenharmony_ci kernel_fpu_begin(); 618c2ecf20Sopenharmony_ci 628c2ecf20Sopenharmony_ci while (lines--) { 638c2ecf20Sopenharmony_ci#undef BLOCK 648c2ecf20Sopenharmony_ci#define BLOCK(i, reg) \ 658c2ecf20Sopenharmony_cido { \ 668c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \ 678c2ecf20Sopenharmony_ci asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 688c2ecf20Sopenharmony_ci "m" (p1[i / sizeof(*p1)])); \ 698c2ecf20Sopenharmony_ci asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 708c2ecf20Sopenharmony_ci "m" (p0[i / sizeof(*p0)])); \ 718c2ecf20Sopenharmony_ci asm volatile("vmovdqa %%ymm" #reg ", %0" : \ 728c2ecf20Sopenharmony_ci "=m" (p0[i / sizeof(*p0)])); \ 738c2ecf20Sopenharmony_ci} while (0); 748c2ecf20Sopenharmony_ci 758c2ecf20Sopenharmony_ci BLOCK16() 768c2ecf20Sopenharmony_ci 778c2ecf20Sopenharmony_ci p0 = (unsigned long *)((uintptr_t)p0 + 512); 788c2ecf20Sopenharmony_ci p1 = (unsigned long *)((uintptr_t)p1 + 512); 798c2ecf20Sopenharmony_ci p2 = (unsigned long *)((uintptr_t)p2 + 512); 808c2ecf20Sopenharmony_ci } 818c2ecf20Sopenharmony_ci 828c2ecf20Sopenharmony_ci kernel_fpu_end(); 838c2ecf20Sopenharmony_ci} 848c2ecf20Sopenharmony_ci 858c2ecf20Sopenharmony_cistatic void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1, 868c2ecf20Sopenharmony_ci unsigned long *p2, unsigned long *p3) 878c2ecf20Sopenharmony_ci{ 888c2ecf20Sopenharmony_ci unsigned long lines = bytes >> 9; 898c2ecf20Sopenharmony_ci 908c2ecf20Sopenharmony_ci kernel_fpu_begin(); 918c2ecf20Sopenharmony_ci 928c2ecf20Sopenharmony_ci while (lines--) { 938c2ecf20Sopenharmony_ci#undef BLOCK 948c2ecf20Sopenharmony_ci#define BLOCK(i, reg) \ 958c2ecf20Sopenharmony_cido { \ 968c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \ 978c2ecf20Sopenharmony_ci asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 988c2ecf20Sopenharmony_ci "m" (p2[i / sizeof(*p2)])); \ 998c2ecf20Sopenharmony_ci asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 1008c2ecf20Sopenharmony_ci "m" (p1[i / sizeof(*p1)])); \ 1018c2ecf20Sopenharmony_ci asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 1028c2ecf20Sopenharmony_ci "m" (p0[i / sizeof(*p0)])); \ 1038c2ecf20Sopenharmony_ci asm volatile("vmovdqa %%ymm" #reg ", %0" : \ 1048c2ecf20Sopenharmony_ci "=m" (p0[i / sizeof(*p0)])); \ 1058c2ecf20Sopenharmony_ci} while (0); 1068c2ecf20Sopenharmony_ci 1078c2ecf20Sopenharmony_ci BLOCK16(); 1088c2ecf20Sopenharmony_ci 1098c2ecf20Sopenharmony_ci p0 = (unsigned long *)((uintptr_t)p0 + 512); 1108c2ecf20Sopenharmony_ci p1 = (unsigned long *)((uintptr_t)p1 + 512); 1118c2ecf20Sopenharmony_ci p2 = (unsigned long *)((uintptr_t)p2 + 512); 1128c2ecf20Sopenharmony_ci p3 = (unsigned long *)((uintptr_t)p3 + 512); 1138c2ecf20Sopenharmony_ci } 1148c2ecf20Sopenharmony_ci 1158c2ecf20Sopenharmony_ci kernel_fpu_end(); 1168c2ecf20Sopenharmony_ci} 1178c2ecf20Sopenharmony_ci 1188c2ecf20Sopenharmony_cistatic void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1, 1198c2ecf20Sopenharmony_ci unsigned long *p2, unsigned long *p3, unsigned long *p4) 1208c2ecf20Sopenharmony_ci{ 1218c2ecf20Sopenharmony_ci unsigned long lines = bytes >> 9; 1228c2ecf20Sopenharmony_ci 1238c2ecf20Sopenharmony_ci kernel_fpu_begin(); 1248c2ecf20Sopenharmony_ci 1258c2ecf20Sopenharmony_ci while (lines--) { 1268c2ecf20Sopenharmony_ci#undef BLOCK 1278c2ecf20Sopenharmony_ci#define BLOCK(i, reg) \ 1288c2ecf20Sopenharmony_cido { \ 1298c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \ 1308c2ecf20Sopenharmony_ci asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 1318c2ecf20Sopenharmony_ci "m" (p3[i / sizeof(*p3)])); \ 1328c2ecf20Sopenharmony_ci asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 1338c2ecf20Sopenharmony_ci "m" (p2[i / sizeof(*p2)])); \ 1348c2ecf20Sopenharmony_ci asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 1358c2ecf20Sopenharmony_ci "m" (p1[i / sizeof(*p1)])); \ 1368c2ecf20Sopenharmony_ci asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 1378c2ecf20Sopenharmony_ci "m" (p0[i / sizeof(*p0)])); \ 1388c2ecf20Sopenharmony_ci asm volatile("vmovdqa %%ymm" #reg ", %0" : \ 1398c2ecf20Sopenharmony_ci "=m" (p0[i / sizeof(*p0)])); \ 1408c2ecf20Sopenharmony_ci} while (0); 1418c2ecf20Sopenharmony_ci 1428c2ecf20Sopenharmony_ci BLOCK16() 1438c2ecf20Sopenharmony_ci 1448c2ecf20Sopenharmony_ci p0 = (unsigned long *)((uintptr_t)p0 + 512); 1458c2ecf20Sopenharmony_ci p1 = (unsigned long *)((uintptr_t)p1 + 512); 1468c2ecf20Sopenharmony_ci p2 = (unsigned long *)((uintptr_t)p2 + 512); 1478c2ecf20Sopenharmony_ci p3 = (unsigned long *)((uintptr_t)p3 + 512); 1488c2ecf20Sopenharmony_ci p4 = (unsigned long *)((uintptr_t)p4 + 512); 1498c2ecf20Sopenharmony_ci } 1508c2ecf20Sopenharmony_ci 1518c2ecf20Sopenharmony_ci kernel_fpu_end(); 1528c2ecf20Sopenharmony_ci} 1538c2ecf20Sopenharmony_ci 1548c2ecf20Sopenharmony_cistatic struct xor_block_template xor_block_avx = { 1558c2ecf20Sopenharmony_ci .name = "avx", 1568c2ecf20Sopenharmony_ci .do_2 = xor_avx_2, 1578c2ecf20Sopenharmony_ci .do_3 = xor_avx_3, 1588c2ecf20Sopenharmony_ci .do_4 = xor_avx_4, 1598c2ecf20Sopenharmony_ci .do_5 = xor_avx_5, 1608c2ecf20Sopenharmony_ci}; 1618c2ecf20Sopenharmony_ci 1628c2ecf20Sopenharmony_ci#define AVX_XOR_SPEED \ 1638c2ecf20Sopenharmony_cido { \ 1648c2ecf20Sopenharmony_ci if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \ 1658c2ecf20Sopenharmony_ci xor_speed(&xor_block_avx); \ 1668c2ecf20Sopenharmony_ci} while (0) 1678c2ecf20Sopenharmony_ci 1688c2ecf20Sopenharmony_ci#define AVX_SELECT(FASTEST) \ 1698c2ecf20Sopenharmony_ci (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST) 1708c2ecf20Sopenharmony_ci 1718c2ecf20Sopenharmony_ci#endif 172