18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * arch/arm64/lib/xor-neon.c 48c2ecf20Sopenharmony_ci * 58c2ecf20Sopenharmony_ci * Authors: Jackie Liu <liuyun01@kylinos.cn> 68c2ecf20Sopenharmony_ci * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd. 78c2ecf20Sopenharmony_ci */ 88c2ecf20Sopenharmony_ci 98c2ecf20Sopenharmony_ci#include <linux/raid/xor.h> 108c2ecf20Sopenharmony_ci#include <linux/module.h> 118c2ecf20Sopenharmony_ci#include <asm/neon-intrinsics.h> 128c2ecf20Sopenharmony_ci 138c2ecf20Sopenharmony_civoid xor_arm64_neon_2(unsigned long bytes, unsigned long *p1, 148c2ecf20Sopenharmony_ci unsigned long *p2) 158c2ecf20Sopenharmony_ci{ 168c2ecf20Sopenharmony_ci uint64_t *dp1 = (uint64_t *)p1; 178c2ecf20Sopenharmony_ci uint64_t *dp2 = (uint64_t *)p2; 188c2ecf20Sopenharmony_ci 198c2ecf20Sopenharmony_ci register uint64x2_t v0, v1, v2, v3; 208c2ecf20Sopenharmony_ci long lines = bytes / (sizeof(uint64x2_t) * 4); 218c2ecf20Sopenharmony_ci 228c2ecf20Sopenharmony_ci do { 238c2ecf20Sopenharmony_ci /* p1 ^= p2 */ 248c2ecf20Sopenharmony_ci v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); 258c2ecf20Sopenharmony_ci v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); 268c2ecf20Sopenharmony_ci v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); 278c2ecf20Sopenharmony_ci v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); 288c2ecf20Sopenharmony_ci 298c2ecf20Sopenharmony_ci /* store */ 308c2ecf20Sopenharmony_ci vst1q_u64(dp1 + 0, v0); 318c2ecf20Sopenharmony_ci vst1q_u64(dp1 + 2, v1); 328c2ecf20Sopenharmony_ci vst1q_u64(dp1 + 4, v2); 338c2ecf20Sopenharmony_ci vst1q_u64(dp1 + 6, v3); 348c2ecf20Sopenharmony_ci 358c2ecf20Sopenharmony_ci dp1 += 8; 368c2ecf20Sopenharmony_ci dp2 += 8; 378c2ecf20Sopenharmony_ci } while (--lines > 0); 388c2ecf20Sopenharmony_ci} 398c2ecf20Sopenharmony_ci 408c2ecf20Sopenharmony_civoid xor_arm64_neon_3(unsigned long bytes, unsigned long *p1, 418c2ecf20Sopenharmony_ci unsigned long *p2, unsigned long *p3) 428c2ecf20Sopenharmony_ci{ 438c2ecf20Sopenharmony_ci uint64_t *dp1 = (uint64_t *)p1; 448c2ecf20Sopenharmony_ci uint64_t *dp2 = (uint64_t *)p2; 458c2ecf20Sopenharmony_ci uint64_t *dp3 = (uint64_t *)p3; 468c2ecf20Sopenharmony_ci 478c2ecf20Sopenharmony_ci register uint64x2_t v0, v1, v2, v3; 488c2ecf20Sopenharmony_ci long lines = bytes / (sizeof(uint64x2_t) * 4); 498c2ecf20Sopenharmony_ci 508c2ecf20Sopenharmony_ci do { 518c2ecf20Sopenharmony_ci /* p1 ^= p2 */ 528c2ecf20Sopenharmony_ci v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); 538c2ecf20Sopenharmony_ci v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); 548c2ecf20Sopenharmony_ci v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); 558c2ecf20Sopenharmony_ci v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); 568c2ecf20Sopenharmony_ci 578c2ecf20Sopenharmony_ci /* p1 ^= p3 */ 588c2ecf20Sopenharmony_ci v0 = veorq_u64(v0, vld1q_u64(dp3 + 0)); 598c2ecf20Sopenharmony_ci v1 = veorq_u64(v1, vld1q_u64(dp3 + 2)); 608c2ecf20Sopenharmony_ci v2 = veorq_u64(v2, vld1q_u64(dp3 + 4)); 618c2ecf20Sopenharmony_ci v3 = veorq_u64(v3, vld1q_u64(dp3 + 6)); 628c2ecf20Sopenharmony_ci 638c2ecf20Sopenharmony_ci /* store */ 648c2ecf20Sopenharmony_ci vst1q_u64(dp1 + 0, v0); 658c2ecf20Sopenharmony_ci vst1q_u64(dp1 + 2, v1); 668c2ecf20Sopenharmony_ci vst1q_u64(dp1 + 4, v2); 678c2ecf20Sopenharmony_ci vst1q_u64(dp1 + 6, v3); 688c2ecf20Sopenharmony_ci 698c2ecf20Sopenharmony_ci dp1 += 8; 708c2ecf20Sopenharmony_ci dp2 += 8; 718c2ecf20Sopenharmony_ci dp3 += 8; 728c2ecf20Sopenharmony_ci } while (--lines > 0); 738c2ecf20Sopenharmony_ci} 748c2ecf20Sopenharmony_ci 758c2ecf20Sopenharmony_civoid xor_arm64_neon_4(unsigned long bytes, unsigned long *p1, 768c2ecf20Sopenharmony_ci unsigned long *p2, unsigned long *p3, unsigned long *p4) 778c2ecf20Sopenharmony_ci{ 788c2ecf20Sopenharmony_ci uint64_t *dp1 = (uint64_t *)p1; 798c2ecf20Sopenharmony_ci uint64_t *dp2 = (uint64_t *)p2; 808c2ecf20Sopenharmony_ci uint64_t *dp3 = (uint64_t *)p3; 818c2ecf20Sopenharmony_ci uint64_t *dp4 = (uint64_t *)p4; 828c2ecf20Sopenharmony_ci 838c2ecf20Sopenharmony_ci register uint64x2_t v0, v1, v2, v3; 848c2ecf20Sopenharmony_ci long lines = bytes / (sizeof(uint64x2_t) * 4); 858c2ecf20Sopenharmony_ci 868c2ecf20Sopenharmony_ci do { 878c2ecf20Sopenharmony_ci /* p1 ^= p2 */ 888c2ecf20Sopenharmony_ci v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); 898c2ecf20Sopenharmony_ci v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); 908c2ecf20Sopenharmony_ci v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); 918c2ecf20Sopenharmony_ci v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); 928c2ecf20Sopenharmony_ci 938c2ecf20Sopenharmony_ci /* p1 ^= p3 */ 948c2ecf20Sopenharmony_ci v0 = veorq_u64(v0, vld1q_u64(dp3 + 0)); 958c2ecf20Sopenharmony_ci v1 = veorq_u64(v1, vld1q_u64(dp3 + 2)); 968c2ecf20Sopenharmony_ci v2 = veorq_u64(v2, vld1q_u64(dp3 + 4)); 978c2ecf20Sopenharmony_ci v3 = veorq_u64(v3, vld1q_u64(dp3 + 6)); 988c2ecf20Sopenharmony_ci 998c2ecf20Sopenharmony_ci /* p1 ^= p4 */ 1008c2ecf20Sopenharmony_ci v0 = veorq_u64(v0, vld1q_u64(dp4 + 0)); 1018c2ecf20Sopenharmony_ci v1 = veorq_u64(v1, vld1q_u64(dp4 + 2)); 1028c2ecf20Sopenharmony_ci v2 = veorq_u64(v2, vld1q_u64(dp4 + 4)); 1038c2ecf20Sopenharmony_ci v3 = veorq_u64(v3, vld1q_u64(dp4 + 6)); 1048c2ecf20Sopenharmony_ci 1058c2ecf20Sopenharmony_ci /* store */ 1068c2ecf20Sopenharmony_ci vst1q_u64(dp1 + 0, v0); 1078c2ecf20Sopenharmony_ci vst1q_u64(dp1 + 2, v1); 1088c2ecf20Sopenharmony_ci vst1q_u64(dp1 + 4, v2); 1098c2ecf20Sopenharmony_ci vst1q_u64(dp1 + 6, v3); 1108c2ecf20Sopenharmony_ci 1118c2ecf20Sopenharmony_ci dp1 += 8; 1128c2ecf20Sopenharmony_ci dp2 += 8; 1138c2ecf20Sopenharmony_ci dp3 += 8; 1148c2ecf20Sopenharmony_ci dp4 += 8; 1158c2ecf20Sopenharmony_ci } while (--lines > 0); 1168c2ecf20Sopenharmony_ci} 1178c2ecf20Sopenharmony_ci 1188c2ecf20Sopenharmony_civoid xor_arm64_neon_5(unsigned long bytes, unsigned long *p1, 1198c2ecf20Sopenharmony_ci unsigned long *p2, unsigned long *p3, 1208c2ecf20Sopenharmony_ci unsigned long *p4, unsigned long *p5) 1218c2ecf20Sopenharmony_ci{ 1228c2ecf20Sopenharmony_ci uint64_t *dp1 = (uint64_t *)p1; 1238c2ecf20Sopenharmony_ci uint64_t *dp2 = (uint64_t *)p2; 1248c2ecf20Sopenharmony_ci uint64_t *dp3 = (uint64_t *)p3; 1258c2ecf20Sopenharmony_ci uint64_t *dp4 = (uint64_t *)p4; 1268c2ecf20Sopenharmony_ci uint64_t *dp5 = (uint64_t *)p5; 1278c2ecf20Sopenharmony_ci 1288c2ecf20Sopenharmony_ci register uint64x2_t v0, v1, v2, v3; 1298c2ecf20Sopenharmony_ci long lines = bytes / (sizeof(uint64x2_t) * 4); 1308c2ecf20Sopenharmony_ci 1318c2ecf20Sopenharmony_ci do { 1328c2ecf20Sopenharmony_ci /* p1 ^= p2 */ 1338c2ecf20Sopenharmony_ci v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); 1348c2ecf20Sopenharmony_ci v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); 1358c2ecf20Sopenharmony_ci v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); 1368c2ecf20Sopenharmony_ci v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); 1378c2ecf20Sopenharmony_ci 1388c2ecf20Sopenharmony_ci /* p1 ^= p3 */ 1398c2ecf20Sopenharmony_ci v0 = veorq_u64(v0, vld1q_u64(dp3 + 0)); 1408c2ecf20Sopenharmony_ci v1 = veorq_u64(v1, vld1q_u64(dp3 + 2)); 1418c2ecf20Sopenharmony_ci v2 = veorq_u64(v2, vld1q_u64(dp3 + 4)); 1428c2ecf20Sopenharmony_ci v3 = veorq_u64(v3, vld1q_u64(dp3 + 6)); 1438c2ecf20Sopenharmony_ci 1448c2ecf20Sopenharmony_ci /* p1 ^= p4 */ 1458c2ecf20Sopenharmony_ci v0 = veorq_u64(v0, vld1q_u64(dp4 + 0)); 1468c2ecf20Sopenharmony_ci v1 = veorq_u64(v1, vld1q_u64(dp4 + 2)); 1478c2ecf20Sopenharmony_ci v2 = veorq_u64(v2, vld1q_u64(dp4 + 4)); 1488c2ecf20Sopenharmony_ci v3 = veorq_u64(v3, vld1q_u64(dp4 + 6)); 1498c2ecf20Sopenharmony_ci 1508c2ecf20Sopenharmony_ci /* p1 ^= p5 */ 1518c2ecf20Sopenharmony_ci v0 = veorq_u64(v0, vld1q_u64(dp5 + 0)); 1528c2ecf20Sopenharmony_ci v1 = veorq_u64(v1, vld1q_u64(dp5 + 2)); 1538c2ecf20Sopenharmony_ci v2 = veorq_u64(v2, vld1q_u64(dp5 + 4)); 1548c2ecf20Sopenharmony_ci v3 = veorq_u64(v3, vld1q_u64(dp5 + 6)); 1558c2ecf20Sopenharmony_ci 1568c2ecf20Sopenharmony_ci /* store */ 1578c2ecf20Sopenharmony_ci vst1q_u64(dp1 + 0, v0); 1588c2ecf20Sopenharmony_ci vst1q_u64(dp1 + 2, v1); 1598c2ecf20Sopenharmony_ci vst1q_u64(dp1 + 4, v2); 1608c2ecf20Sopenharmony_ci vst1q_u64(dp1 + 6, v3); 1618c2ecf20Sopenharmony_ci 1628c2ecf20Sopenharmony_ci dp1 += 8; 1638c2ecf20Sopenharmony_ci dp2 += 8; 1648c2ecf20Sopenharmony_ci dp3 += 8; 1658c2ecf20Sopenharmony_ci dp4 += 8; 1668c2ecf20Sopenharmony_ci dp5 += 8; 1678c2ecf20Sopenharmony_ci } while (--lines > 0); 1688c2ecf20Sopenharmony_ci} 1698c2ecf20Sopenharmony_ci 1708c2ecf20Sopenharmony_cistruct xor_block_template const xor_block_inner_neon = { 1718c2ecf20Sopenharmony_ci .name = "__inner_neon__", 1728c2ecf20Sopenharmony_ci .do_2 = xor_arm64_neon_2, 1738c2ecf20Sopenharmony_ci .do_3 = xor_arm64_neon_3, 1748c2ecf20Sopenharmony_ci .do_4 = xor_arm64_neon_4, 1758c2ecf20Sopenharmony_ci .do_5 = xor_arm64_neon_5, 1768c2ecf20Sopenharmony_ci}; 1778c2ecf20Sopenharmony_ciEXPORT_SYMBOL(xor_block_inner_neon); 1788c2ecf20Sopenharmony_ci 1798c2ecf20Sopenharmony_ciMODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>"); 1808c2ecf20Sopenharmony_ciMODULE_DESCRIPTION("ARMv8 XOR Extensions"); 1818c2ecf20Sopenharmony_ciMODULE_LICENSE("GPL"); 182