18c2ecf20Sopenharmony_ci/* ----------------------------------------------------------------------- 28c2ecf20Sopenharmony_ci * 38c2ecf20Sopenharmony_ci * neon.uc - RAID-6 syndrome calculation using ARM NEON instructions 48c2ecf20Sopenharmony_ci * 58c2ecf20Sopenharmony_ci * Copyright (C) 2012 Rob Herring 68c2ecf20Sopenharmony_ci * Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org> 78c2ecf20Sopenharmony_ci * 88c2ecf20Sopenharmony_ci * Based on altivec.uc: 98c2ecf20Sopenharmony_ci * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved 108c2ecf20Sopenharmony_ci * 118c2ecf20Sopenharmony_ci * This program is free software; you can redistribute it and/or modify 128c2ecf20Sopenharmony_ci * it under the terms of the GNU General Public License as published by 138c2ecf20Sopenharmony_ci * the Free Software Foundation, Inc., 53 Temple Place Ste 330, 148c2ecf20Sopenharmony_ci * Boston MA 02111-1307, USA; either version 2 of the License, or 158c2ecf20Sopenharmony_ci * (at your option) any later version; incorporated herein by reference. 168c2ecf20Sopenharmony_ci * 178c2ecf20Sopenharmony_ci * ----------------------------------------------------------------------- */ 188c2ecf20Sopenharmony_ci 198c2ecf20Sopenharmony_ci/* 208c2ecf20Sopenharmony_ci * neon$#.c 218c2ecf20Sopenharmony_ci * 228c2ecf20Sopenharmony_ci * $#-way unrolled NEON intrinsics math RAID-6 instruction set 238c2ecf20Sopenharmony_ci * 248c2ecf20Sopenharmony_ci * This file is postprocessed using unroll.awk 258c2ecf20Sopenharmony_ci */ 268c2ecf20Sopenharmony_ci 278c2ecf20Sopenharmony_ci#include <arm_neon.h> 288c2ecf20Sopenharmony_ci 298c2ecf20Sopenharmony_citypedef uint8x16_t unative_t; 308c2ecf20Sopenharmony_ci 318c2ecf20Sopenharmony_ci#define NSIZE sizeof(unative_t) 328c2ecf20Sopenharmony_ci 338c2ecf20Sopenharmony_ci/* 348c2ecf20Sopenharmony_ci * The SHLBYTE() operation shifts each byte left by 1, *not* 358c2ecf20Sopenharmony_ci * rolling over into the next byte 368c2ecf20Sopenharmony_ci */ 378c2ecf20Sopenharmony_cistatic inline unative_t SHLBYTE(unative_t v) 388c2ecf20Sopenharmony_ci{ 398c2ecf20Sopenharmony_ci return vshlq_n_u8(v, 1); 408c2ecf20Sopenharmony_ci} 418c2ecf20Sopenharmony_ci 428c2ecf20Sopenharmony_ci/* 438c2ecf20Sopenharmony_ci * The MASK() operation returns 0xFF in any byte for which the high 448c2ecf20Sopenharmony_ci * bit is 1, 0x00 for any byte for which the high bit is 0. 458c2ecf20Sopenharmony_ci */ 468c2ecf20Sopenharmony_cistatic inline unative_t MASK(unative_t v) 478c2ecf20Sopenharmony_ci{ 488c2ecf20Sopenharmony_ci return (unative_t)vshrq_n_s8((int8x16_t)v, 7); 498c2ecf20Sopenharmony_ci} 508c2ecf20Sopenharmony_ci 518c2ecf20Sopenharmony_cistatic inline unative_t PMUL(unative_t v, unative_t u) 528c2ecf20Sopenharmony_ci{ 538c2ecf20Sopenharmony_ci return (unative_t)vmulq_p8((poly8x16_t)v, (poly8x16_t)u); 548c2ecf20Sopenharmony_ci} 558c2ecf20Sopenharmony_ci 568c2ecf20Sopenharmony_civoid raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) 578c2ecf20Sopenharmony_ci{ 588c2ecf20Sopenharmony_ci uint8_t **dptr = (uint8_t **)ptrs; 598c2ecf20Sopenharmony_ci uint8_t *p, *q; 608c2ecf20Sopenharmony_ci int d, z, z0; 618c2ecf20Sopenharmony_ci 628c2ecf20Sopenharmony_ci register unative_t wd$$, wq$$, wp$$, w1$$, w2$$; 638c2ecf20Sopenharmony_ci const unative_t x1d = vdupq_n_u8(0x1d); 648c2ecf20Sopenharmony_ci 658c2ecf20Sopenharmony_ci z0 = disks - 3; /* Highest data disk */ 668c2ecf20Sopenharmony_ci p = dptr[z0+1]; /* XOR parity */ 678c2ecf20Sopenharmony_ci q = dptr[z0+2]; /* RS syndrome */ 688c2ecf20Sopenharmony_ci 698c2ecf20Sopenharmony_ci for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { 708c2ecf20Sopenharmony_ci wq$$ = wp$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]); 718c2ecf20Sopenharmony_ci for ( z = z0-1 ; z >= 0 ; z-- ) { 728c2ecf20Sopenharmony_ci wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]); 738c2ecf20Sopenharmony_ci wp$$ = veorq_u8(wp$$, wd$$); 748c2ecf20Sopenharmony_ci w2$$ = MASK(wq$$); 758c2ecf20Sopenharmony_ci w1$$ = SHLBYTE(wq$$); 768c2ecf20Sopenharmony_ci 778c2ecf20Sopenharmony_ci w2$$ = vandq_u8(w2$$, x1d); 788c2ecf20Sopenharmony_ci w1$$ = veorq_u8(w1$$, w2$$); 798c2ecf20Sopenharmony_ci wq$$ = veorq_u8(w1$$, wd$$); 808c2ecf20Sopenharmony_ci } 818c2ecf20Sopenharmony_ci vst1q_u8(&p[d+NSIZE*$$], wp$$); 828c2ecf20Sopenharmony_ci vst1q_u8(&q[d+NSIZE*$$], wq$$); 838c2ecf20Sopenharmony_ci } 848c2ecf20Sopenharmony_ci} 858c2ecf20Sopenharmony_ci 868c2ecf20Sopenharmony_civoid raid6_neon$#_xor_syndrome_real(int disks, int start, int stop, 878c2ecf20Sopenharmony_ci unsigned long bytes, void **ptrs) 888c2ecf20Sopenharmony_ci{ 898c2ecf20Sopenharmony_ci uint8_t **dptr = (uint8_t **)ptrs; 908c2ecf20Sopenharmony_ci uint8_t *p, *q; 918c2ecf20Sopenharmony_ci int d, z, z0; 928c2ecf20Sopenharmony_ci 938c2ecf20Sopenharmony_ci register unative_t wd$$, wq$$, wp$$, w1$$, w2$$; 948c2ecf20Sopenharmony_ci const unative_t x1d = vdupq_n_u8(0x1d); 958c2ecf20Sopenharmony_ci 968c2ecf20Sopenharmony_ci z0 = stop; /* P/Q right side optimization */ 978c2ecf20Sopenharmony_ci p = dptr[disks-2]; /* XOR parity */ 988c2ecf20Sopenharmony_ci q = dptr[disks-1]; /* RS syndrome */ 998c2ecf20Sopenharmony_ci 1008c2ecf20Sopenharmony_ci for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { 1018c2ecf20Sopenharmony_ci wq$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]); 1028c2ecf20Sopenharmony_ci wp$$ = veorq_u8(vld1q_u8(&p[d+$$*NSIZE]), wq$$); 1038c2ecf20Sopenharmony_ci 1048c2ecf20Sopenharmony_ci /* P/Q data pages */ 1058c2ecf20Sopenharmony_ci for ( z = z0-1 ; z >= start ; z-- ) { 1068c2ecf20Sopenharmony_ci wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]); 1078c2ecf20Sopenharmony_ci wp$$ = veorq_u8(wp$$, wd$$); 1088c2ecf20Sopenharmony_ci w2$$ = MASK(wq$$); 1098c2ecf20Sopenharmony_ci w1$$ = SHLBYTE(wq$$); 1108c2ecf20Sopenharmony_ci 1118c2ecf20Sopenharmony_ci w2$$ = vandq_u8(w2$$, x1d); 1128c2ecf20Sopenharmony_ci w1$$ = veorq_u8(w1$$, w2$$); 1138c2ecf20Sopenharmony_ci wq$$ = veorq_u8(w1$$, wd$$); 1148c2ecf20Sopenharmony_ci } 1158c2ecf20Sopenharmony_ci /* P/Q left side optimization */ 1168c2ecf20Sopenharmony_ci for ( z = start-1 ; z >= 3 ; z -= 4 ) { 1178c2ecf20Sopenharmony_ci w2$$ = vshrq_n_u8(wq$$, 4); 1188c2ecf20Sopenharmony_ci w1$$ = vshlq_n_u8(wq$$, 4); 1198c2ecf20Sopenharmony_ci 1208c2ecf20Sopenharmony_ci w2$$ = PMUL(w2$$, x1d); 1218c2ecf20Sopenharmony_ci wq$$ = veorq_u8(w1$$, w2$$); 1228c2ecf20Sopenharmony_ci } 1238c2ecf20Sopenharmony_ci 1248c2ecf20Sopenharmony_ci switch (z) { 1258c2ecf20Sopenharmony_ci case 2: 1268c2ecf20Sopenharmony_ci w2$$ = vshrq_n_u8(wq$$, 5); 1278c2ecf20Sopenharmony_ci w1$$ = vshlq_n_u8(wq$$, 3); 1288c2ecf20Sopenharmony_ci 1298c2ecf20Sopenharmony_ci w2$$ = PMUL(w2$$, x1d); 1308c2ecf20Sopenharmony_ci wq$$ = veorq_u8(w1$$, w2$$); 1318c2ecf20Sopenharmony_ci break; 1328c2ecf20Sopenharmony_ci case 1: 1338c2ecf20Sopenharmony_ci w2$$ = vshrq_n_u8(wq$$, 6); 1348c2ecf20Sopenharmony_ci w1$$ = vshlq_n_u8(wq$$, 2); 1358c2ecf20Sopenharmony_ci 1368c2ecf20Sopenharmony_ci w2$$ = PMUL(w2$$, x1d); 1378c2ecf20Sopenharmony_ci wq$$ = veorq_u8(w1$$, w2$$); 1388c2ecf20Sopenharmony_ci break; 1398c2ecf20Sopenharmony_ci case 0: 1408c2ecf20Sopenharmony_ci w2$$ = MASK(wq$$); 1418c2ecf20Sopenharmony_ci w1$$ = SHLBYTE(wq$$); 1428c2ecf20Sopenharmony_ci 1438c2ecf20Sopenharmony_ci w2$$ = vandq_u8(w2$$, x1d); 1448c2ecf20Sopenharmony_ci wq$$ = veorq_u8(w1$$, w2$$); 1458c2ecf20Sopenharmony_ci } 1468c2ecf20Sopenharmony_ci w1$$ = vld1q_u8(&q[d+NSIZE*$$]); 1478c2ecf20Sopenharmony_ci wq$$ = veorq_u8(wq$$, w1$$); 1488c2ecf20Sopenharmony_ci 1498c2ecf20Sopenharmony_ci vst1q_u8(&p[d+NSIZE*$$], wp$$); 1508c2ecf20Sopenharmony_ci vst1q_u8(&q[d+NSIZE*$$], wq$$); 1518c2ecf20Sopenharmony_ci } 1528c2ecf20Sopenharmony_ci} 153