/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Shared glue code for 128bit block ciphers, AVX2 assembler macros
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

/*
 * Conventions used by the macros below (AT&T operand order: sources first,
 * destination last):
 *  - x0..x7 each carry 32 bytes, i.e. two 128-bit blocks, so eight registers
 *    cover 16 blocks (256 bytes).
 *  - Parameters whose name ends in "x" (t0x, t2x, tivx, ...) are used as the
 *    128-bit view of the same-named register without the suffix; the macros
 *    write through one view and read through the other, so callers must pass
 *    matching register pairs.
 *  - In the inline comments, "ab" / "cd" annotate the low / high 128-bit lane.
 */

/*
 * load_16way: load eight consecutive 32-byte vectors from src into x0..x7
 * with unaligned loads.
 */
#define load_16way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu (0*32)(src), x0; \
	vmovdqu (1*32)(src), x1; \
	vmovdqu (2*32)(src), x2; \
	vmovdqu (3*32)(src), x3; \
	vmovdqu (4*32)(src), x4; \
	vmovdqu (5*32)(src), x5; \
	vmovdqu (6*32)(src), x6; \
	vmovdqu (7*32)(src), x7;

/*
 * store_16way: store x0..x7 to eight consecutive 32-byte slots at dst with
 * unaligned stores.
 */
#define store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu x0, (0*32)(dst); \
	vmovdqu x1, (1*32)(dst); \
	vmovdqu x2, (2*32)(dst); \
	vmovdqu x3, (3*32)(dst); \
	vmovdqu x4, (4*32)(dst); \
	vmovdqu x5, (5*32)(dst); \
	vmovdqu x6, (6*32)(dst); \
	vmovdqu x7, (7*32)(dst);

/*
 * store_cbc_16way: XOR every block in x0..x7 with the block that precedes it
 * in src, then store the result to dst.
 *
 * Block 0 (low lane of x0) is XORed with zero and therefore stored unchanged;
 * block 1 is XORed with the first 16 bytes of src, and so on at a 16-byte
 * offset. NOTE(review): presumably the caller applies the IV to block 0
 * itself -- confirm at the call sites.
 *
 * Clobbers: t0.
 */
#define store_cbc_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7, t0) \
	vpxor t0, t0, t0; \
	vinserti128 $1, (src), t0, t0; /* ab: 0 ; cd: src block 0 */ \
	vpxor t0, x0, x0; \
	vpxor (0*32+16)(src), x1, x1; \
	vpxor (1*32+16)(src), x2, x2; \
	vpxor (2*32+16)(src), x3, x3; \
	vpxor (3*32+16)(src), x4, x4; \
	vpxor (4*32+16)(src), x5, x5; \
	vpxor (5*32+16)(src), x6, x6; \
	vpxor (6*32+16)(src), x7, x7; \
	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);

/*
 * inc_le128: add 1 to the 128-bit little-endian integer held in each 128-bit
 * lane of x.
 *
 * minus_one must hold -1 in the low qword and 0 in the high qword of each
 * lane (this is how load_ctr_16way builds it).
 *
 * Clobbers: tmp.
 */
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; /* low qword: all-ones if it will wrap */ \
	vpsubq minus_one, x, x; /* low qword += 1 */ \
	vpslldq $8, tmp, tmp; /* move the wrap flag to the high qword */ \
	vpsubq tmp, x, x; /* high qword += carry */

/*
 * add2_le128: add 2 to the 128-bit little-endian integer held in each 128-bit
 * lane of x.
 *
 * minus_one / minus_two must hold -1 / -2 in the low qword and 0 in the high
 * qword of each lane. A carry into the high qword is needed when the low
 * qword is either -1 or -2 before the addition.
 *
 * Clobbers: tmp1, tmp2.
 */
#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
	vpcmpeqq minus_one, x, tmp1; \
	vpcmpeqq minus_two, x, tmp2; \
	vpsubq minus_two, x, x; /* low qword += 2 */ \
	vpor tmp2, tmp1, tmp1; /* wraps if low qword was -1 or -2 */ \
	vpslldq $8, tmp1, tmp1; /* move the wrap flag to the high qword */ \
	vpsubq tmp1, x, x; /* high qword += carry */

/*
 * load_ctr_16way: generate 16 consecutive counter blocks into x0..x7 and
 * advance the counter stored at (iv) by 16.
 *
 * The 16 bytes at (iv) are treated as a 128-bit little-endian integer. Each
 * counter value is shuffled through the vpshufb mask loaded from bswap before
 * it is placed in the output registers. The value written back to (iv) is the
 * incremented little-endian counter (not shuffled).
 *
 * Clobbers: t0..t5 (including the 128-bit views t0x..t3x).
 */
#define load_ctr_16way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t0x, t1, \
		       t1x, t2, t2x, t3, t3x, t4, t5) \
	vpcmpeqd t0, t0, t0; \
	vpsrldq $8, t0, t0; /* ab: -1:0 ; cd: -1:0 */ \
	vpaddq t0, t0, t4; /* ab: -2:0 ; cd: -2:0 */\
	\
	/* load IV and byteswap */ \
	vmovdqu (iv), t2x; \
	vmovdqa t2x, t3x; \
	inc_le128(t2x, t0x, t1x); \
	vbroadcasti128 bswap, t1; \
	vinserti128 $1, t2x, t3, t2; /* ab: le0 ; cd: le1 */ \
	vpshufb t1, t2, x0; \
	\
	/* construct IVs */ \
	add2_le128(t2, t0, t4, t3, t5); /* ab: le2 ; cd: le3 */ \
	vpshufb t1, t2, x1; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x2; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x3; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x4; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x5; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x6; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x7; \
	/* high lane holds the last counter used; +1 gives the next IV */ \
	vextracti128 $1, t2, t2x; \
	inc_le128(t2x, t0x, t3x); \
	vmovdqu t2x, (iv);

/*
 * store_ctr_16way: XOR x0..x7 with the 256 bytes at src and store the result
 * to dst.
 */
#define store_ctr_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*32)(src), x0, x0; \
	vpxor (1*32)(src), x1, x1; \
	vpxor (2*32)(src), x2, x2; \
	vpxor (3*32)(src), x3, x3; \
	vpxor (4*32)(src), x4, x4; \
	vpxor (5*32)(src), x5, x5; \
	vpxor (6*32)(src), x6, x6; \
	vpxor (7*32)(src), x7, x7; \
	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);

/*
 * gf128mul_x_ble: shift each 128-bit lane of iv left by one bit, folding the
 * bits shifted out of each qword back in under control of mask.
 *
 * vpaddq doubles each qword independently, so bit 63 and bit 127 are lost.
 * Their values are captured beforehand as per-dword sign masks and routed by
 * vpshufd $0x13 (result dwords = source dwords 3, 0, 1, 0): dword 0 reflects
 * bit 127 and dword 2 reflects bit 63. mask then selects what is XORed in.
 * NOTE(review): mask is defined outside this file; presumably it carries the
 * GF(2^128) reduction constant in dword 0, the carry bit in dword 2 and zeros
 * elsewhere -- confirm against its definition.
 *
 * Clobbers: tmp.
 */
#define gf128mul_x_ble(iv, mask, tmp) \
	vpsrad $31, iv, tmp; \
	vpaddq iv, iv, iv; \
	vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \
	vpxor tmp, iv, iv;

/*
 * gf128mul_x2_ble: same as applying gf128mul_x_ble twice, done in one pass:
 * each 128-bit lane of iv is shifted left by two bits.
 *
 * tmp0 tracks the top bit of each qword of the original iv (combined with
 * mask2), tmp1 tracks the second-highest bit (top bit after one doubling,
 * combined with mask1).
 * NOTE(review): mask1 / mask2 are defined outside this file; presumably mask2
 * is mask1 shifted left by one -- confirm against their definitions.
 *
 * Clobbers: tmp0, tmp1.
 */
#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
	vpsrad $31, iv, tmp0; \
	vpaddq iv, iv, tmp1; \
	vpsllq $2, iv, iv; \
	vpshufd $0x13, tmp0, tmp0; \
	vpsrad $31, tmp1, tmp1; \
	vpand mask2, tmp0, tmp0; \
	vpshufd $0x13, tmp1, tmp1; \
	vpxor tmp0, iv, iv; \
	vpand mask1, tmp1, tmp1; \
	vpxor tmp1, iv, iv;

/*
 * load_xts_16way: derive 16 consecutive tweaks from the one stored at (iv),
 * XOR them with the 256 bytes at src into x0..x7, and save the tweaks to dst
 * so that store_xts_16way can apply them again later.
 *
 * On exit (iv) holds the tweak that follows the 16 used here.
 *
 * The statement order matters: t2x is used as scratch by the first
 * gf128mul_x_ble before t2 is loaded with the second mask, and again after
 * the last use of t2 as a mask.
 *
 * Clobbers: tiv, t0..t3 (including the 128-bit views tivx, t0x..t2x).
 */
#define load_xts_16way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, \
		       tivx, t0, t0x, t1, t1x, t2, t2x, t3, \
		       xts_gf128mul_and_shl1_mask_0, \
		       xts_gf128mul_and_shl1_mask_1) \
	vbroadcasti128 xts_gf128mul_and_shl1_mask_0, t1; \
	\
	/* load IV and construct second IV */ \
	vmovdqu (iv), tivx; \
	vmovdqa tivx, t0x; \
	gf128mul_x_ble(tivx, t1x, t2x); \
	vbroadcasti128 xts_gf128mul_and_shl1_mask_1, t2; \
	vinserti128 $1, tivx, t0, tiv; /* ab: first IV ; cd: second IV */ \
	vpxor (0*32)(src), tiv, x0; \
	vmovdqu tiv, (0*32)(dst); \
	\
	/* construct and store IVs, also xor with source */ \
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (1*32)(src), tiv, x1; \
	vmovdqu tiv, (1*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (2*32)(src), tiv, x2; \
	vmovdqu tiv, (2*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (3*32)(src), tiv, x3; \
	vmovdqu tiv, (3*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (4*32)(src), tiv, x4; \
	vmovdqu tiv, (4*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (5*32)(src), tiv, x5; \
	vmovdqu tiv, (5*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (6*32)(src), tiv, x6; \
	vmovdqu tiv, (6*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (7*32)(src), tiv, x7; \
	vmovdqu tiv, (7*32)(dst); \
	\
	/* high lane holds the last tweak used; one more step gives the next IV */ \
	vextracti128 $1, tiv, tivx; \
	gf128mul_x_ble(tivx, t1x, t2x); \
	vmovdqu tivx, (iv);

/*
 * store_xts_16way: XOR x0..x7 with the 256 bytes currently at dst (the tweaks
 * saved there by load_xts_16way) and write the result back to dst.
 */
#define store_xts_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*32)(dst), x0, x0; \
	vpxor (1*32)(dst), x1, x1; \
	vpxor (2*32)(dst), x2, x2; \
	vpxor (3*32)(dst), x3, x3; \
	vpxor (4*32)(dst), x4, x4; \
	vpxor (5*32)(dst), x5, x5; \
	vpxor (6*32)(dst), x6, x6; \
	vpxor (7*32)(dst), x7, x7; \
	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);