/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Shared glue code for 128bit block ciphers, AVX2 assembler macros
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

/*
 * load_16way - load eight 32-byte vectors from consecutive memory.
 *
 * src:    base address register; read at byte offsets 0, 32, ..., 224
 * x0..x7: destination vector registers; x<k> receives the data at k*32
 *
 * vmovdqu is used, so src needs no particular alignment.  The 32-byte
 * stride assumes x0..x7 are 256-bit (ymm) registers, i.e. two 128-bit
 * cipher blocks per register and 16 blocks in total.
 */
#define load_16way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu (0*32)(src), x0; \
	vmovdqu (1*32)(src), x1; \
	vmovdqu (2*32)(src), x2; \
	vmovdqu (3*32)(src), x3; \
	vmovdqu (4*32)(src), x4; \
	vmovdqu (5*32)(src), x5; \
	vmovdqu (6*32)(src), x6; \
	vmovdqu (7*32)(src), x7;

/*
 * store_16way - store eight 32-byte vectors to consecutive memory.
 *
 * dst:    base address register; written at byte offsets 0, 32, ..., 224
 * x0..x7: source vector registers; x<k> is written to k*32
 *
 * Counterpart of load_16way.  vmovdqu is used, so dst needs no particular
 * alignment.  The registers themselves are left unmodified.
 */
#define store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu x0, (0*32)(dst); \
	vmovdqu x1, (1*32)(dst); \
	vmovdqu x2, (2*32)(dst); \
	vmovdqu x3, (3*32)(dst); \
	vmovdqu x4, (4*32)(dst); \
	vmovdqu x5, (5*32)(dst); \
	vmovdqu x6, (6*32)(dst); \
	vmovdqu x7, (7*32)(dst);

/*
 * store_cbc_16way - xor each block with the preceding block of src, then
 * store the result to dst.
 *
 * src:    base address of the 16 input blocks (the chaining values)
 * dst:    base address of the 16 output blocks
 * x0..x7: block data, two 128-bit blocks per register; modified in place
 * t0:     scratch vector register, clobbered
 *
 * Block n (n = 1..15) is xored with the 16 bytes at src + (n-1)*16.
 * Block 0 (low lane of x0) is left untouched here; the xor with the IV is
 * presumably done by the caller - confirm against the call sites.
 *
 * All reads of src are issued before the first store to dst.
 */
#define store_cbc_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7, t0) \
	vpxor t0, t0, t0; \
	vinserti128 $1, (src), t0, t0; /* t0 = low lane: 0 ; high lane: src block 0 */ \
	vpxor t0, x0, x0; \
	vpxor (0*32+16)(src), x1, x1; \
	vpxor (1*32+16)(src), x2, x2; \
	vpxor (2*32+16)(src), x3, x3; \
	vpxor (3*32+16)(src), x4, x4; \
	vpxor (4*32+16)(src), x5, x5; \
	vpxor (5*32+16)(src), x6, x6; \
	vpxor (6*32+16)(src), x7, x7; \
	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);

/*
 * inc_le128 - add 1 to a 128-bit value held as two little-endian quadwords.
 *
 * x:         value to increment, updated in place
 * minus_one: constant with -1 in the low quadword and 0 in the high
 *            quadword (the layout load_ctr_16way builds for this argument)
 * tmp:       scratch register, clobbered
 *
 * With that constant, the low quadword is incremented by subtracting -1 and
 * the high quadword is unchanged by the first vpsubq.  The carry is
 * recovered from the compare: if the low quadword was all-ones before the
 * add, its compare mask (-1) is moved into the high quadword position and
 * subtracted, which adds 1 to the high quadword.
 */
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; /* low qword of tmp = -1 if low qword of x is about to wrap */ \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; /* move the wrap mask up to the high qword */ \
	vpsubq tmp, x, x;

/*
 * add2_le128 - add 2 to each 128-bit value held as two little-endian
 * quadwords.
 *
 * x:          value(s) to advance, updated in place
 * minus_one:  constant with -1 in the low quadword, 0 in the high quadword
 * minus_two:  constant with -2 in the low quadword, 0 in the high quadword
 * tmp1, tmp2: scratch registers, clobbered
 *
 * Adding 2 wraps the low quadword when it is either -1 or -2, so both
 * compares are or-ed together to form the carry mask.  The mask is then
 * moved into the high quadword position and subtracted, which adds 1 to the
 * high quadword.
 */
#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
	vpcmpeqq minus_one, x, tmp1; \
	vpcmpeqq minus_two, x, tmp2; \
	vpsubq minus_two, x, x; \
	vpor tmp2, tmp1, tmp1; /* carry if the low qword was -1 or -2 */ \
	vpslldq $8, tmp1, tmp1; \
	vpsubq tmp1, x, x;

/*
 * load_ctr_16way - generate 16 consecutive counter blocks and advance the
 * counter in memory by 16.
 *
 * iv:       address of the 16-byte counter, read and written as a
 *           little-endian 128-bit integer
 * bswap:    memory operand holding a 16-byte vpshufb mask applied to every
 *           generated counter (presumably a byte reversal - confirm against
 *           the caller's constant)
 * x0..x7:   outputs; x<k> holds counter+2k in its low lane and counter+2k+1
 *           in its high lane, after the byte shuffle
 * t0..t5:   scratch ymm registers, all clobbered
 * t0x..t3x: must be the xmm (low 128-bit) views of t0..t3; the macro writes
 *           through one name and reads through the other
 *
 * On exit the value stored at (iv) is the original counter plus 16, still
 * in little-endian form.
 */
#define load_ctr_16way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t0x, t1, \
		       t1x, t2, t2x, t3, t3x, t4, t5) \
	vpcmpeqd t0, t0, t0; \
	vpsrldq $8, t0, t0; /* ab: -1:0 ; cd: -1:0 */ \
	vpaddq t0, t0, t4; /* ab: -2:0 ; cd: -2:0 */\
	\
	/* load IV and byteswap */ \
	vmovdqu (iv), t2x; \
	vmovdqa t2x, t3x; /* keep the unincremented counter for the low lane */ \
	inc_le128(t2x, t0x, t1x); \
	vbroadcasti128 bswap, t1; \
	vinserti128 $1, t2x, t3, t2; /* ab: le0 ; cd: le1 */ \
	vpshufb t1, t2, x0; \
	\
	/* construct IVs */ \
	add2_le128(t2, t0, t4, t3, t5); /* ab: le2 ; cd: le3 */ \
	vpshufb t1, t2, x1; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x2; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x3; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x4; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x5; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x6; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x7; \
	vextracti128 $1, t2, t2x; /* high lane = counter + 15 */ \
	inc_le128(t2x, t0x, t3x); \
	vmovdqu t2x, (iv);

/*
 * store_ctr_16way - xor eight 32-byte vectors with src and store them to dst.
 *
 * src:    base address of the 256 input bytes
 * dst:    base address of the 256 output bytes
 * x0..x7: vectors to combine with the input (presumably the encrypted
 *         counter blocks - confirm against the caller); modified in place
 *
 * All reads of src are issued before the first store to dst.
 */
#define store_ctr_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*32)(src), x0, x0; \
	vpxor (1*32)(src), x1, x1; \
	vpxor (2*32)(src), x2, x2; \
	vpxor (3*32)(src), x3, x3; \
	vpxor (4*32)(src), x4, x4; \
	vpxor (5*32)(src), x5, x5; \
	vpxor (6*32)(src), x6, x6; \
	vpxor (7*32)(src), x7, x7; \
	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);

/*
 * gf128mul_x_ble - shift a 128-bit value left by one bit and fold the
 * shifted-out bits back in through a mask.
 *
 * iv:   value to update in place
 * mask: constant and-ed with the rearranged sign fills before the xor
 * tmp:  scratch register, clobbered
 *
 * vpaddq doubles each quadword separately, so bit 63 (the carry into the
 * high quadword) and bit 127 (shifted out of the value) are both lost by
 * the add.  vpsrad turns the top bit of every dword into an all-ones or
 * all-zero fill, and vpshufd $0x13 places the fill for bit 127 in dword 0
 * and the fill for bit 63 in dword 2.  Dwords 1 and 3 receive the fill for
 * bit 31, which is not wanted, so mask has to clear them.
 *
 * NOTE(review): the mask contents are not visible here.  Presumably the low
 * quadword holds the GF(2^128) reduction constant and the high quadword
 * holds 1 - confirm against the caller's constant.
 */
#define gf128mul_x_ble(iv, mask, tmp) \
	vpsrad $31, iv, tmp; \
	vpaddq iv, iv, iv; \
	vpshufd $0x13, tmp, tmp; /* dwords 0..3 = fills of old dwords 3, 0, 1, 0 */ \
	vpand mask, tmp, tmp; \
	vpxor tmp, iv, iv;

/*
 * gf128mul_x2_ble - shift a 128-bit value left by two bits and fold the
 * shifted-out bits back in through two masks.
 *
 * iv:         value to update in place
 * mask1:      constant applied to the fills taken from the second-highest
 *             bit of each dword (bits 126 and 62 after the shuffle)
 * mask2:      constant applied to the fills taken from the highest bit of
 *             each dword (bits 127 and 63 after the shuffle)
 * tmp0, tmp1: scratch registers, clobbered
 *
 * Same scheme as gf128mul_x_ble, done for two bit positions at once.  tmp0
 * captures the top bits of the original value.  tmp1 captures the top bits
 * of the value shifted left by one, i.e. the original second-highest bits.
 * The two dependency chains are interleaved.
 *
 * NOTE(review): the mask contents are not visible here.  Presumably mask1
 * is the single-shift constant and mask2 is the same constant shifted left
 * by one - confirm against the caller's constants.
 */
#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
	vpsrad $31, iv, tmp0; \
	vpaddq iv, iv, tmp1; /* tmp1 = iv << 1 per qword, used only for its top bits */ \
	vpsllq $2, iv, iv; \
	vpshufd $0x13, tmp0, tmp0; \
	vpsrad $31, tmp1, tmp1; \
	vpand mask2, tmp0, tmp0; \
	vpshufd $0x13, tmp1, tmp1; \
	vpxor tmp0, iv, iv; \
	vpand mask1, tmp1, tmp1; \
	vpxor tmp1, iv, iv;

/*
 * load_xts_16way - derive 16 successive tweaks, xor them into the source
 * blocks, save them in dst and advance the tweak in memory.
 *
 * iv:       address of the 16-byte tweak; read on entry, rewritten on exit
 * src:      base address of the 16 input blocks
 * dst:      base address of the 16 output blocks; used here as scratch
 *           storage for the tweaks
 * x0..x7:   outputs; x<k> = 32 bytes at src + k*32 xored with tweaks 2k
 *           (low lane) and 2k+1 (high lane)
 * tiv:      ymm register holding the current pair of tweaks, clobbered
 * t0..t3:   scratch ymm registers, clobbered
 * tivx, t0x, t1x, t2x: must be the xmm (low 128-bit) views of tiv, t0, t1
 *           and t2; the macro writes through one name and reads through the
 *           other
 * xts_gf128mul_and_shl1_mask_0, xts_gf128mul_and_shl1_mask_1: memory
 *           operands holding the 16-byte masks passed to gf128mul_x_ble and
 *           gf128mul_x2_ble
 *
 * Each lane advances by two gf128mul_x_ble steps per iteration, so the low
 * lane holds the even-numbered tweaks and the high lane the odd-numbered
 * ones.  For every k, src + k*32 is read before the tweak pair is written
 * to dst + k*32.  On exit (iv) holds the entry tweak advanced by 16 steps.
 */
#define load_xts_16way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, \
		       tivx, t0, t0x, t1, t1x, t2, t2x, t3, \
		       xts_gf128mul_and_shl1_mask_0, \
		       xts_gf128mul_and_shl1_mask_1) \
	vbroadcasti128 xts_gf128mul_and_shl1_mask_0, t1; \
	\
	/* load IV and construct second IV */ \
	vmovdqu (iv), tivx; \
	vmovdqa tivx, t0x; /* keep the unmodified tweak for the low lane */ \
	gf128mul_x_ble(tivx, t1x, t2x); \
	vbroadcasti128 xts_gf128mul_and_shl1_mask_1, t2; \
	vinserti128 $1, tivx, t0, tiv; /* low lane: tweak 0 ; high lane: tweak 1 */ \
	vpxor (0*32)(src), tiv, x0; \
	vmovdqu tiv, (0*32)(dst); \
	\
	/* construct and store IVs, also xor with source */ \
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (1*32)(src), tiv, x1; \
	vmovdqu tiv, (1*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (2*32)(src), tiv, x2; \
	vmovdqu tiv, (2*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (3*32)(src), tiv, x3; \
	vmovdqu tiv, (3*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (4*32)(src), tiv, x4; \
	vmovdqu tiv, (4*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (5*32)(src), tiv, x5; \
	vmovdqu tiv, (5*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (6*32)(src), tiv, x6; \
	vmovdqu tiv, (6*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (7*32)(src), tiv, x7; \
	vmovdqu tiv, (7*32)(dst); \
	\
	vextracti128 $1, tiv, tivx; /* high lane = tweak 15 */ \
	gf128mul_x_ble(tivx, t1x, t2x); \
	vmovdqu tivx, (iv);

/*
 * store_xts_16way - xor eight 32-byte vectors with the current contents of
 * dst and store the result back to dst.
 *
 * dst:    base address of the 256-byte buffer, read and then overwritten
 * x0..x7: vectors to combine with the buffer contents; modified in place
 *
 * Intended to be paired with load_xts_16way, which leaves the per-block
 * tweaks in dst.  All reads of dst are issued before the first store.
 */
#define store_xts_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*32)(dst), x0, x0; \
	vpxor (1*32)(dst), x1, x1; \
	vpxor (2*32)(dst), x2, x2; \
	vpxor (3*32)(dst), x3, x3; \
	vpxor (4*32)(dst), x4, x4; \
	vpxor (5*32)(dst), x5, x5; \
	vpxor (6*32)(dst), x6, x6; \
	vpxor (7*32)(dst), x7, x7; \
	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);