/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * x86_64/AVX2/AES-NI assembler implementation of Camellia
 *
 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: byte offsets of the fields used by this file */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi
#define RIO %r8

/**********************************************************************
  helper macros
 **********************************************************************/

/*
 * filter_8bit: per-byte 8-bit -> 8-bit table transform built from two
 * 16-entry nibble lookups.
 *
 * IN:
 *   x:        vector of input bytes
 *   lo_t:     vpshufb table indexed by the bits of each byte kept by
 *             mask4bit
 *   hi_t:     vpshufb table indexed by the remaining bits, after they
 *             have been shifted right by 4
 *   mask4bit: nibble mask; the roundsm32 caller in this file loads it
 *             from .L0f0f0f0f (presumably 0x0f in every byte - confirm
 *             at the definition of that constant)
 * OUT:
 *   x:        lo_t[x & mask4bit] ^ hi_t[(x & ~mask4bit) >> 4]
 * CLOBBERS:
 *   tmp0
 *
 * The shift is a 32-bit lane shift (vpsrld); that is safe because the
 * bits selected by mask4bit were cleared by vpandn first, so nothing
 * leaks across byte boundaries.
 */
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;

/*
 * Map a ymm register name to the xmm register with the same number.
 * Macros paste "_x" onto a register argument (e.g. %ymm3##_x), which
 * the preprocessor then expands through these defines to %xmm3.
 */
#define ymm0_x xmm0
#define ymm1_x xmm1
#define ymm2_x xmm2
#define ymm3_x xmm3
#define ymm4_x xmm4
#define ymm5_x xmm5
#define ymm6_x xmm6
#define ymm7_x xmm7
#define ymm8_x xmm8
#define ymm9_x xmm9
#define ymm10_x xmm10
#define ymm11_x xmm11
#define ymm12_x xmm12
#define ymm13_x xmm13
#define ymm14_x xmm14
#define ymm15_x xmm15

/**********************************************************************
  32-way camellia
 **********************************************************************/

/*
 * roundsm32: one Camellia round on byte-sliced state.
 *
 * IN:
 *   x0..x7: byte-sliced AB state
 *   mem_cd: register pointing at the CD state in memory
 *           (eight 32-byte slots)
 *   key:    memory operand holding the 64-bit round subkey
 * OUT:
 *   x0..x7: new byte-sliced CD state
 * CLOBBERS:
 *   t0..t7
 *
 * Steps:
 *   1. S-function. Each byte goes through a pre-filter (filter_8bit),
 *      then vaesenclast with an all-zero round key, then a post-filter.
 *      The bytes are shuffled with .Linv_shift_row beforehand; per the
 *      in-line comments this is the inverse of the ShiftRows step that
 *      vaesenclast performs. vaesenclast is issued on xmm operands, so
 *      each ymm register is split with vextracti128 and rejoined with
 *      vinserti128.
 *   2. P-function: the XOR network over x0..x7.
 *   3. Each byte of the subkey is broadcast to a full vector (vpsrldq
 *      followed by vpshufb with an all-zero index) and XORed into the
 *      state together with the old CD state from mem_cd. Note that
 *      x0..x3 pair with slots 4..7 and x4..x7 with slots 0..3.
 *
 * Constant tables are addressed RIP-relative, so the code does not
 * depend on the tables being reachable through an absolute 32-bit
 * address.
 */
#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
		  t7, mem_cd, key) \
	/* \
	 * S-function with AES subbytes \
	 */ \
	vbroadcasti128 .Linv_shift_row(%rip), t4; \
	vpbroadcastd .L0f0f0f0f(%rip), t7; \
	vbroadcasti128 .Lpre_tf_lo_s1(%rip), t5; \
	vbroadcasti128 .Lpre_tf_hi_s1(%rip), t6; \
	vbroadcasti128 .Lpre_tf_lo_s4(%rip), t2; \
	vbroadcasti128 .Lpre_tf_hi_s4(%rip), t3; \
	\
	/* AES inverse shift rows */ \
	vpshufb t4, x0, x0; \
	vpshufb t4, x7, x7; \
	vpshufb t4, x3, x3; \
	vpshufb t4, x6, x6; \
	vpshufb t4, x2, x2; \
	vpshufb t4, x5, x5; \
	vpshufb t4, x1, x1; \
	vpshufb t4, x4, x4; \
	\
	/* prefilter sboxes 1, 2 and 3 */ \
	/* prefilter sbox 4 */ \
	filter_8bit(x0, t5, t6, t7, t4); \
	filter_8bit(x7, t5, t6, t7, t4); \
	vextracti128 $1, x0, t0##_x; \
	vextracti128 $1, x7, t1##_x; \
	filter_8bit(x3, t2, t3, t7, t4); \
	filter_8bit(x6, t2, t3, t7, t4); \
	vextracti128 $1, x3, t3##_x; \
	vextracti128 $1, x6, t2##_x; \
	filter_8bit(x2, t5, t6, t7, t4); \
	filter_8bit(x5, t5, t6, t7, t4); \
	filter_8bit(x1, t5, t6, t7, t4); \
	filter_8bit(x4, t5, t6, t7, t4); \
	\
	/* all-zero round key for vaesenclast */ \
	vpxor t4##_x, t4##_x, t4##_x; \
	\
	/* AES subbytes + AES shift rows */ \
	vextracti128 $1, x2, t6##_x; \
	vextracti128 $1, x5, t5##_x; \
	vaesenclast t4##_x, x0##_x, x0##_x; \
	vaesenclast t4##_x, t0##_x, t0##_x; \
	vinserti128 $1, t0##_x, x0, x0; \
	vaesenclast t4##_x, x7##_x, x7##_x; \
	vaesenclast t4##_x, t1##_x, t1##_x; \
	vinserti128 $1, t1##_x, x7, x7; \
	vaesenclast t4##_x, x3##_x, x3##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x3, x3; \
	vaesenclast t4##_x, x6##_x, x6##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x6, x6; \
	vextracti128 $1, x1, t3##_x; \
	vextracti128 $1, x4, t2##_x; \
	vbroadcasti128 .Lpost_tf_lo_s1(%rip), t0; \
	vbroadcasti128 .Lpost_tf_hi_s1(%rip), t1; \
	vaesenclast t4##_x, x2##_x, x2##_x; \
	vaesenclast t4##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x2, x2; \
	vaesenclast t4##_x, x5##_x, x5##_x; \
	vaesenclast t4##_x, t5##_x, t5##_x; \
	vinserti128 $1, t5##_x, x5, x5; \
	vaesenclast t4##_x, x1##_x, x1##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x1, x1; \
	vaesenclast t4##_x, x4##_x, x4##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x4, x4; \
	\
	/* postfilter sboxes 1 and 4 */ \
	vbroadcasti128 .Lpost_tf_lo_s3(%rip), t2; \
	vbroadcasti128 .Lpost_tf_hi_s3(%rip), t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	\
	/* postfilter sbox 3 */ \
	vbroadcasti128 .Lpost_tf_lo_s2(%rip), t4; \
	vbroadcasti128 .Lpost_tf_hi_s2(%rip), t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	\
	vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
	\
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	/* t7 = 0: vpshufb index that broadcasts byte 0 of each lane */ \
	vpxor t7, t7, t7; \
	\
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpshufb t7, t1, t1; \
	vpsrldq $3, t0, t3; \
	\
	/* P-function */ \
	vpxor x5, x0, x0; \
	vpxor x6, x1, x1; \
	vpxor x7, x2, x2; \
	vpxor x4, x3, x3; \
	\
	vpshufb t7, t2, t2; \
	vpsrldq $4, t0, t4; \
	vpshufb t7, t3, t3; \
	vpsrldq $5, t0, t5; \
	vpshufb t7, t4, t4; \
	\
	vpxor x2, x4, x4; \
	vpxor x3, x5, x5; \
	vpxor x0, x6, x6; \
	vpxor x1, x7, x7; \
	\
	vpsrldq $6, t0, t6; \
	vpshufb t7, t5, t5; \
	vpshufb t7, t6, t6; \
	\
	vpxor x7, x0, x0; \
	vpxor x4, x1, x1; \
	vpxor x5, x2, x2; \
	vpxor x6, x3, x3; \
	\
	vpxor x3, x4, x4; \
	vpxor x0, x5, x5; \
	vpxor x1, x6, x6; \
	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
	\
	/* Add key material and result to CD (x becomes new CD) */ \
	\
	vpxor t6, x1, x1; \
	vpxor 5 * 32(mem_cd), x1, x1; \
	\
	vpsrldq $7, t0, t6; \
	vpshufb t7, t0, t0; \
	vpshufb t7, t6, t7; \
	\
	vpxor t7, x0, x0; \
	vpxor 4 * 32(mem_cd), x0, x0; \
	\
	vpxor t5, x2, x2; \
	vpxor 6 * 32(mem_cd), x2, x2; \
	\
	vpxor t4, x3, x3; \
	vpxor 7 * 32(mem_cd), x3, x3; \
	\
	vpxor t3, x4, x4; \
	vpxor 0 * 32(mem_cd), x4, x4; \
	\
	vpxor t2, x5, x5; \
	vpxor 1 * 32(mem_cd), x5, x5; \
	\
	vpxor t1, x6, x6; \
	vpxor 2 * 32(mem_cd), x6, x6; \
	\
	vpxor t0, x7, x7; \
	vpxor 3 * 32(mem_cd), x7, x7;

/*
 * Size optimization: with roundsm32 inlined, the binary would be over 5 times
 * larger and would only be marginally faster.
 *
 * The two helpers below are roundsm32 instantiated on fixed registers:
 *   ..._cd: state in %ymm0..%ymm7, temporaries %ymm8..%ymm15,
 *           XORs with the memory state at %rcx
 *   ..._ab: state in %ymm4..%ymm7,%ymm0..%ymm3, temporaries
 *           %ymm12..%ymm15,%ymm8..%ymm11, XORs with the memory state
 *           at %rax
 * Both read the 64-bit round subkey from (%r9).
 */
.align 8
SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
	roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		  %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
		  %rcx, (%r9));
	RET;
SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

.align 8
SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
	roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
		  %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
		  %rax, (%r9));
	RET;
SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)

/*
 * two_roundsm32: two consecutive rounds, using subkeys i and i + dir.
 *
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 * IN:
 *  i:        index (in 8-byte units) of the first subkey in key_table
 *  dir:      added to i to select the second subkey (+1 or -1 in the
 *            callers below)
 *  store_ab: macro invoked at the end with (x0..x7, mem_ab); either
 *            store_ab_state or dummy_store
 * CLOBBERS:
 *  %r9, plus everything the roundsm32 helpers clobber
 *
 * The called helpers are hard-wired to %ymm0..%ymm15, %rcx and %rax,
 * so the register arguments passed here have to match that assignment
 * (x0..x7 = %ymm0..%ymm7, mem_ab = %rax, mem_cd = %rcx).
 */
#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	\
	/* save the new CD state (halves swapped, as roundsm32 leaves it) */ \
	vmovdqu x0, 4 * 32(mem_cd); \
	vmovdqu x1, 5 * 32(mem_cd); \
	vmovdqu x2, 6 * 32(mem_cd); \
	vmovdqu x3, 7 * 32(mem_cd); \
	vmovdqu x4, 0 * 32(mem_cd); \
	vmovdqu x5, 1 * 32(mem_cd); \
	vmovdqu x6, 2 * 32(mem_cd); \
	vmovdqu x7, 3 * 32(mem_cd); \
	\
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	\
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

/* store_ab variant that leaves the AB state in registers only */
#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

/* store_ab variant that writes x0..x7 to slots 0..7 of mem_ab */
#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab);

/*
 * enc_rounds32: six rounds with subkeys i + 2 .. i + 7 in ascending
 * order. The AB state is written back to mem_ab after the first two
 * round pairs; after the last pair it is left in x0..x7 only.
 */
#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

/*
 * dec_rounds32: six rounds with subkeys i + 7 .. i + 2 in descending
 * order; otherwise identical to enc_rounds32.
 */
#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);

/*
 * rol32_1_32: rotate byte-sliced 32-bit integers left by one bit.
 *
 * IN:
 *  v0..3: byte-sliced 32-bit integers
 *  zero:  all-zero vector
 * OUT:
 *  v0..3: (IN <<< 1)
 * CLOBBERS:
 *  t0, t1, t2
 *
 * For each byte vector the top bit is captured as 0/1 (signed compare
 * against zero gives 0xff for bytes with bit 7 set, vpabsb turns that
 * into 1), the byte is doubled with vpaddb, and the captured bit is
 * ORed into the next vector: v0 -> v1 -> v2 -> v3 -> v0.
 */
#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	vpaddb v0, v0, v0; \
	vpabsb t0, t0; \
	\
	vpcmpgtb v1, zero, t1; \
	vpaddb v1, v1, v1; \
	vpabsb t1, t1; \
	\
	vpcmpgtb v2, zero, t2; \
	vpaddb v2, v2, v2; \
	vpabsb t2, t2; \
	\
	vpor t0, v1, v1; \
	\
	vpcmpgtb v3, zero, t0; \
	vpaddb v3, v3, v3; \
	vpabsb t0, t0; \
	\
	vpor t1, v2, v2; \
	vpor t2, v3, v3; \
	vpor t0, v0, v0;

/*
 * fls32: Camellia FL / FL^-1 layer on byte-sliced state.
 *
 * IN:
 *   l:      pointer to the byte-sliced CD state in memory
 *   l0..l7: the same CD state preloaded in registers
 *           (l0..l3 = "ll", l4..l7 = "lr" in the comments below)
 *   r:      pointer to the byte-sliced AB state in memory
 *           (slots 0..3 = "rl", slots 4..7 = "rr")
 *   kll, klr, krl, krr: memory operands holding the four 32-bit
 *           subkey words
 * OUT:
 *   l0..l7: new CD state, also stored to l
 *   r:      new AB state, updated in memory only
 * CLOBBERS:
 *   t0..t3, tt0..tt3 (tt0 is zeroed and used as the vpshufb
 *   broadcast index and as the zero operand of rol32_1_32)
 *
 * Each key word is expanded into four vectors, one per key byte, by
 * alternating vpshufb (broadcast byte 0) with vpsrldq $1.
 */
#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	/* \
	 * t0 = kll; \
	 * t0 &= ll; \
	 * lr ^= rol32(t0, 1); \
	 */ \
	vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
	vpxor tt0, tt0, tt0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand l0, t0, t0; \
	vpand l1, t1, t1; \
	vpand l2, t2, t2; \
	vpand l3, t3, t3; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor l4, t0, l4; \
	vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
	vmovdqu l4, 4 * 32(l); \
	vpxor l5, t1, l5; \
	vmovdqu l5, 5 * 32(l); \
	vpxor l6, t2, l6; \
	vmovdqu l6, 6 * 32(l); \
	vpxor l7, t3, l7; \
	vmovdqu l7, 7 * 32(l); \
	\
	/* \
	 * t2 = krr; \
	 * t2 |= rr; \
	 * rl ^= t2; \
	 */ \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor 4 * 32(r), t0, t0; \
	vpor 5 * 32(r), t1, t1; \
	vpor 6 * 32(r), t2, t2; \
	vpor 7 * 32(r), t3, t3; \
	\
	vpxor 0 * 32(r), t0, t0; \
	vpxor 1 * 32(r), t1, t1; \
	vpxor 2 * 32(r), t2, t2; \
	vpxor 3 * 32(r), t3, t3; \
	vmovdqu t0, 0 * 32(r); \
	vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 1 * 32(r); \
	vmovdqu t2, 2 * 32(r); \
	vmovdqu t3, 3 * 32(r); \
	\
	/* \
	 * t2 = krl; \
	 * t2 &= rl; \
	 * rr ^= rol32(t2, 1); \
	 */ \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand 0 * 32(r), t0, t0; \
	vpand 1 * 32(r), t1, t1; \
	vpand 2 * 32(r), t2, t2; \
	vpand 3 * 32(r), t3, t3; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor 4 * 32(r), t0, t0; \
	vpxor 5 * 32(r), t1, t1; \
	vpxor 6 * 32(r), t2, t2; \
	vpxor 7 * 32(r), t3, t3; \
	vmovdqu t0, 4 * 32(r); \
	vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 5 * 32(r); \
	vmovdqu t2, 6 * 32(r); \
	vmovdqu t3, 7 * 32(r); \
	\
	/* \
	 * t0 = klr; \
	 * t0 |= lr; \
	 * ll ^= t0; \
	 */ \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor l4, t0, t0; \
	vpor l5, t1, t1; \
	vpor l6, t2, t2; \
	vpor l7, t3, t3; \
	\
	vpxor l0, t0, l0; \
	vmovdqu l0, 0 * 32(l); \
	vpxor l1, t1, l1; \
	vmovdqu l1, 1 * 32(l); \
	vpxor l2, t2, l2; \
	vmovdqu l2, 2 * 32(l); \
	vpxor l3, t3, l3; \
	vmovdqu l3, 3 * 32(l);

/*
 * transpose_4x4: transpose a 4x4 matrix of 32-bit elements held in
 * x0..x3 (independently within each 128-bit lane, since the unpack
 * instructions work per lane). Clobbers t1 and t2.
 */
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

/*
 * byteslice_16x16b_fast: in-place byte-slicing of sixteen vectors.
 *
 * IN/OUT:
 *   a0..d3: the sixteen vectors, transformed in place
 *   st0, st1: two 32-byte memory operands used as spill space; their
 *             previous contents are overwritten
 *
 * All sixteen vector registers are live, so two of them are spilled to
 * st0/st1 whenever transpose_4x4 needs temporaries. The sequence is:
 * 4x4 dword transposes, a vpshufb with .Lshufb_16x16b on every vector,
 * then 4x4 dword transposes again.
 */
#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
			      a3, b3, c3, d3, st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	/* a0 holds the shuffle mask; the real a0 stays in st0 for now */ \
	vbroadcasti128 .Lshufb_16x16b(%rip), a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

/*
 * load blocks to registers and apply pre-whitening
 *
 * The 64-bit key is broadcast and shuffled with .Lpack_bswap, then
 * XORed with sixteen 32-byte loads from rio. Loads go to the registers
 * in reverse order: offset 0 lands in y7, offset 15 * 32 in x0.
 */
#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio, key) \
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap(%rip), x0, x0; \
	\
	vpxor 0 * 32(rio), x0, y7; \
	vpxor 1 * 32(rio), x0, y6; \
	vpxor 2 * 32(rio), x0, y5; \
	vpxor 3 * 32(rio), x0, y4; \
	vpxor 4 * 32(rio), x0, y3; \
	vpxor 5 * 32(rio), x0, y2; \
	vpxor 6 * 32(rio), x0, y1; \
	vpxor 7 * 32(rio), x0, y0; \
	vpxor 8 * 32(rio), x0, x7; \
	vpxor 9 * 32(rio), x0, x6; \
	vpxor 10 * 32(rio), x0, x5; \
	vpxor 11 * 32(rio), x0, x4; \
	vpxor 12 * 32(rio), x0, x3; \
	vpxor 13 * 32(rio), x0, x2; \
	vpxor 14 * 32(rio), x0, x1; \
	vpxor 15 * 32(rio), x0, x0;

/*
 * byteslice pre-whitened blocks and store to temporary memory
 *
 * The first 32 bytes at mem_ab and at mem_cd double as the spill slots
 * for byteslice_16x16b_fast before the final state is stored there.
 */
#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
			      y4, y5, y6, y7, (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab); \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu y0, 0 * 32(mem_cd); \
	vmovdqu y1, 1 * 32(mem_cd); \
	vmovdqu y2, 2 * 32(mem_cd); \
	vmovdqu y3, 3 * 32(mem_cd); \
	vmovdqu y4, 4 * 32(mem_cd); \
	vmovdqu y5, 5 * 32(mem_cd); \
	vmovdqu y6, 6 * 32(mem_cd); \
	vmovdqu y7, 7 * 32(mem_cd);

/*
 * de-byteslice, apply post-whitening and store blocks
 *
 * x0 is spilled to stack_tmp0 so that the register can hold the
 * whitening key; it is recombined by the final vpxor.
 */
#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
			      y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
	\
	vmovdqu x0, stack_tmp0; \
	\
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap(%rip), x0, x0; \
	\
	vpxor x0, y7, y7; \
	vpxor x0, y6, y6; \
	vpxor x0, y5, y5; \
	vpxor x0, y4, y4; \
	vpxor x0, y3, y3; \
	vpxor x0, y2, y2; \
	vpxor x0, y1, y1; \
	vpxor x0, y0, y0; \
	vpxor x0, x7, x7; \
	vpxor x0, x6, x6; \
	vpxor x0, x5, x5; \
	vpxor x0, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x0, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor stack_tmp0, x0, x0;

/* store x0..x7 then y0..y7 to sixteen consecutive 32-byte slots at rio */
#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio) \
	vmovdqu x0, 0 * 32(rio); \
	vmovdqu x1, 1 * 32(rio); \
	vmovdqu x2, 2 * 32(rio); \
	vmovdqu x3, 3 * 32(rio); \
	vmovdqu x4, 4 * 32(rio); \
	vmovdqu x5, 5 * 32(rio); \
	vmovdqu x6, 6 * 32(rio); \
	vmovdqu x7, 7 * 32(rio); \
	vmovdqu y0, 8 * 32(rio); \
	vmovdqu y1, 9 * 32(rio); \
	vmovdqu y2, 10 * 32(rio); \
	vmovdqu y3, 11 * 32(rio); \
	vmovdqu y4, 12 * 32(rio); \
	vmovdqu y5, 13 * 32(rio); \
	vmovdqu y6, 14 * 32(rio); \
	vmovdqu y7, 15 * 32(rio);


.section	.rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
/* four byte indices with stride 4: gathers byte idx of each dword */
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.section	.rodata.cst32.pack_bswap, "aM", @progbits, 32
.align 32
/*
 * vpshufb mask: byte-reverses the two low dwords of each 128-bit lane
 * and zeroes the upper eight bytes (index bytes with bit 7 set).
 */
.Lpack_bswap:
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080

/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section	.rodata.cst16, "aM", @progbits, 16
.align 16

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For XTS mode */
.Lxts_gf128mul_and_shl1_mask_0:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lxts_gf128mul_and_shl1_mask_1:
6368c2ecf20Sopenharmony_ci .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 6378c2ecf20Sopenharmony_ci 6388c2ecf20Sopenharmony_ci/* 6398c2ecf20Sopenharmony_ci * pre-SubByte transform 6408c2ecf20Sopenharmony_ci * 6418c2ecf20Sopenharmony_ci * pre-lookup for sbox1, sbox2, sbox3: 6428c2ecf20Sopenharmony_ci * swap_bitendianness( 6438c2ecf20Sopenharmony_ci * isom_map_camellia_to_aes( 6448c2ecf20Sopenharmony_ci * camellia_f( 6458c2ecf20Sopenharmony_ci * swap_bitendianess(in) 6468c2ecf20Sopenharmony_ci * ) 6478c2ecf20Sopenharmony_ci * ) 6488c2ecf20Sopenharmony_ci * ) 6498c2ecf20Sopenharmony_ci * 6508c2ecf20Sopenharmony_ci * (note: '⊕ 0xc5' inside camellia_f()) 6518c2ecf20Sopenharmony_ci */ 6528c2ecf20Sopenharmony_ci.Lpre_tf_lo_s1: 6538c2ecf20Sopenharmony_ci .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86 6548c2ecf20Sopenharmony_ci .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88 6558c2ecf20Sopenharmony_ci.Lpre_tf_hi_s1: 6568c2ecf20Sopenharmony_ci .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a 6578c2ecf20Sopenharmony_ci .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23 6588c2ecf20Sopenharmony_ci 6598c2ecf20Sopenharmony_ci/* 6608c2ecf20Sopenharmony_ci * pre-SubByte transform 6618c2ecf20Sopenharmony_ci * 6628c2ecf20Sopenharmony_ci * pre-lookup for sbox4: 6638c2ecf20Sopenharmony_ci * swap_bitendianness( 6648c2ecf20Sopenharmony_ci * isom_map_camellia_to_aes( 6658c2ecf20Sopenharmony_ci * camellia_f( 6668c2ecf20Sopenharmony_ci * swap_bitendianess(in <<< 1) 6678c2ecf20Sopenharmony_ci * ) 6688c2ecf20Sopenharmony_ci * ) 6698c2ecf20Sopenharmony_ci * ) 6708c2ecf20Sopenharmony_ci * 6718c2ecf20Sopenharmony_ci * (note: '⊕ 0xc5' inside camellia_f()) 6728c2ecf20Sopenharmony_ci */ 6738c2ecf20Sopenharmony_ci.Lpre_tf_lo_s4: 6748c2ecf20Sopenharmony_ci .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25 6758c2ecf20Sopenharmony_ci .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74 6768c2ecf20Sopenharmony_ci.Lpre_tf_hi_s4: 6778c2ecf20Sopenharmony_ci .byte 0x00, 0xf1, 
0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72 6788c2ecf20Sopenharmony_ci .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf 6798c2ecf20Sopenharmony_ci 6808c2ecf20Sopenharmony_ci/* 6818c2ecf20Sopenharmony_ci * post-SubByte transform 6828c2ecf20Sopenharmony_ci * 6838c2ecf20Sopenharmony_ci * post-lookup for sbox1, sbox4: 6848c2ecf20Sopenharmony_ci * swap_bitendianness( 6858c2ecf20Sopenharmony_ci * camellia_h( 6868c2ecf20Sopenharmony_ci * isom_map_aes_to_camellia( 6878c2ecf20Sopenharmony_ci * swap_bitendianness( 6888c2ecf20Sopenharmony_ci * aes_inverse_affine_transform(in) 6898c2ecf20Sopenharmony_ci * ) 6908c2ecf20Sopenharmony_ci * ) 6918c2ecf20Sopenharmony_ci * ) 6928c2ecf20Sopenharmony_ci * ) 6938c2ecf20Sopenharmony_ci * 6948c2ecf20Sopenharmony_ci * (note: '⊕ 0x6e' inside camellia_h()) 6958c2ecf20Sopenharmony_ci */ 6968c2ecf20Sopenharmony_ci.Lpost_tf_lo_s1: 6978c2ecf20Sopenharmony_ci .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31 6988c2ecf20Sopenharmony_ci .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1 6998c2ecf20Sopenharmony_ci.Lpost_tf_hi_s1: 7008c2ecf20Sopenharmony_ci .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8 7018c2ecf20Sopenharmony_ci .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c 7028c2ecf20Sopenharmony_ci 7038c2ecf20Sopenharmony_ci/* 7048c2ecf20Sopenharmony_ci * post-SubByte transform 7058c2ecf20Sopenharmony_ci * 7068c2ecf20Sopenharmony_ci * post-lookup for sbox2: 7078c2ecf20Sopenharmony_ci * swap_bitendianness( 7088c2ecf20Sopenharmony_ci * camellia_h( 7098c2ecf20Sopenharmony_ci * isom_map_aes_to_camellia( 7108c2ecf20Sopenharmony_ci * swap_bitendianness( 7118c2ecf20Sopenharmony_ci * aes_inverse_affine_transform(in) 7128c2ecf20Sopenharmony_ci * ) 7138c2ecf20Sopenharmony_ci * ) 7148c2ecf20Sopenharmony_ci * ) 7158c2ecf20Sopenharmony_ci * ) <<< 1 7168c2ecf20Sopenharmony_ci * 7178c2ecf20Sopenharmony_ci * (note: '⊕ 0x6e' inside camellia_h()) 7188c2ecf20Sopenharmony_ci */ 7198c2ecf20Sopenharmony_ci.Lpost_tf_lo_s2: 7208c2ecf20Sopenharmony_ci .byte 
0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62 7218c2ecf20Sopenharmony_ci .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3 7228c2ecf20Sopenharmony_ci.Lpost_tf_hi_s2: 7238c2ecf20Sopenharmony_ci .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51 7248c2ecf20Sopenharmony_ci .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18 7258c2ecf20Sopenharmony_ci 7268c2ecf20Sopenharmony_ci/* 7278c2ecf20Sopenharmony_ci * post-SubByte transform 7288c2ecf20Sopenharmony_ci * 7298c2ecf20Sopenharmony_ci * post-lookup for sbox3: 7308c2ecf20Sopenharmony_ci * swap_bitendianness( 7318c2ecf20Sopenharmony_ci * camellia_h( 7328c2ecf20Sopenharmony_ci * isom_map_aes_to_camellia( 7338c2ecf20Sopenharmony_ci * swap_bitendianness( 7348c2ecf20Sopenharmony_ci * aes_inverse_affine_transform(in) 7358c2ecf20Sopenharmony_ci * ) 7368c2ecf20Sopenharmony_ci * ) 7378c2ecf20Sopenharmony_ci * ) 7388c2ecf20Sopenharmony_ci * ) >>> 1 7398c2ecf20Sopenharmony_ci * 7408c2ecf20Sopenharmony_ci * (note: '⊕ 0x6e' inside camellia_h()) 7418c2ecf20Sopenharmony_ci */ 7428c2ecf20Sopenharmony_ci.Lpost_tf_lo_s3: 7438c2ecf20Sopenharmony_ci .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98 7448c2ecf20Sopenharmony_ci .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8 7458c2ecf20Sopenharmony_ci.Lpost_tf_hi_s3: 7468c2ecf20Sopenharmony_ci .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54 7478c2ecf20Sopenharmony_ci .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06 7488c2ecf20Sopenharmony_ci 7498c2ecf20Sopenharmony_ci/* For isolating SubBytes from AESENCLAST, inverse shift row */ 7508c2ecf20Sopenharmony_ci.Linv_shift_row: 7518c2ecf20Sopenharmony_ci .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b 7528c2ecf20Sopenharmony_ci .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 7538c2ecf20Sopenharmony_ci 7548c2ecf20Sopenharmony_ci.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4 7558c2ecf20Sopenharmony_ci.align 4 7568c2ecf20Sopenharmony_ci/* 4-bit mask */ 7578c2ecf20Sopenharmony_ci.L0f0f0f0f: 
.long 0x0f0f0f0f

.text

.align 8
SYM_FUNC_START_LOCAL(__camellia_enc_blk32)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 512 bytes
	 *	%ymm0..%ymm15: 32 plaintext blocks
	 * output:
	 *	%ymm0..%ymm15: 32 encrypted blocks, order swapped:
	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 * overwritten:
	 *	%rcx: set to %rax + 8 * 32 (CD half of the temporary storage)
	 *	%r8d: 24 or 32, index of the key word used by outunpack32
	 */
	FRAME_BEGIN

	leaq 8 * 32(%rax), %rcx;

	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %rcx);

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 0);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX),
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 8);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX),
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 16);

	/*
	 * A key_length of 16 finishes here with key index 24; any other
	 * length takes one more fls32 + enc_rounds32 group and uses index 32.
	 */
	movl $24, %r8d;
	cmpl $16, key_length(CTX);
	jne .Lenc_max32;

.Lenc_done:
	/* load CD for output */
	vmovdqu 0 * 32(%rcx), %ymm8;
	vmovdqu 1 * 32(%rcx), %ymm9;
	vmovdqu 2 * 32(%rcx), %ymm10;
	vmovdqu 3 * 32(%rcx), %ymm11;
	vmovdqu 4 * 32(%rcx), %ymm12;
	vmovdqu 5 * 32(%rcx), %ymm13;
	vmovdqu 6 * 32(%rcx), %ymm14;
	vmovdqu 7 * 32(%rcx), %ymm15;

	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		    %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));

	FRAME_END
	RET;

.align 8
.Lenc_max32:
	movl $32, %r8d;

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX),
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 24);

	jmp .Lenc_done;
SYM_FUNC_END(__camellia_enc_blk32)

.align 8
SYM_FUNC_START_LOCAL(__camellia_dec_blk32)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 512 bytes
	 *	%r8d: 24 for 16 byte key, 32 for larger
	 *	%ymm0..%ymm15: 32 encrypted blocks
	 * output:
	 *	%ymm0..%ymm15: 32 plaintext blocks, order swapped:
	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 * overwritten:
	 *	%rcx: set to %rax + 8 * 32 (CD half of the temporary storage)
	 */
	FRAME_BEGIN

	leaq 8 * 32(%rax), %rcx;

	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %rcx);

	/* %r8d == 32: run the extra round group first, then rejoin below */
	cmpl $32, %r8d;
	je .Ldec_max32;

.Ldec_max24:
	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 16);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));

	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 8);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));

	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 0);

	/* load CD for output */
	vmovdqu 0 * 32(%rcx), %ymm8;
	vmovdqu 1 * 32(%rcx), %ymm9;
	vmovdqu 2 * 32(%rcx), %ymm10;
	vmovdqu 3 * 32(%rcx), %ymm11;
	vmovdqu 4 * 32(%rcx), %ymm12;
	vmovdqu 5 * 32(%rcx), %ymm13;
	vmovdqu 6 * 32(%rcx), %ymm14;
	vmovdqu 7 * 32(%rcx), %ymm15;

	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		    %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));

	FRAME_END
	RET;

.align 8
.Ldec_max32:
	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 24);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));

	jmp .Ldec_max24;
SYM_FUNC_END(__camellia_dec_blk32)

SYM_FUNC_START(camellia_ecb_enc_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
(32 blocks)
	 *	%rdx: src (32 blocks)
	 */
	FRAME_BEGIN

	vzeroupper;

	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_enc_blk32;

	/* registers come back in swapped order; pass them so dst is in order */
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(camellia_ecb_enc_32way)

SYM_FUNC_START(camellia_ecb_dec_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 */
	FRAME_BEGIN

	vzeroupper;

	/*
	 * %r8d = 24 when key_length is 16, else 32. The two movl
	 * instructions leave the flags from cmpl intact for cmovel.
	 */
	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_dec_blk32;

	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(camellia_ecb_dec_32way)

SYM_FUNC_START(camellia_cbc_dec_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 */
	FRAME_BEGIN

	vzeroupper;

	/* %r8d = 24 when key_length is 16, else 32 (movl keeps the flags) */
	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));

	/* remember %rsp so the optional stack buffer can be dropped later */
	movq %rsp, %r10;
	cmpq %rsi, %rdx;
	je .Lcbc_dec_use_stack;

	/* dst can be used as temporary storage, src is not overwritten. */
	movq %rsi, %rax;
	jmp .Lcbc_dec_continue;

.Lcbc_dec_use_stack:
	/*
	 * dst still in-use (because dst == src), so use stack for temporary
	 * storage.
	 */
	subq $(16 * 32), %rsp;
	movq %rsp, %rax;

.Lcbc_dec_continue:
	call __camellia_dec_blk32;

	/*
	 * First register: build { low lane = 0, high lane = src block 0 } and
	 * XOR it with the decrypted value parked at (%rax). The low lane is
	 * therefore left as-is here.
	 * NOTE(review): presumably the caller XORs the IV into the first
	 * block - confirm against the C glue code.
	 */
	vmovdqu %ymm7, (%rax);
	vpxor %ymm7, %ymm7, %ymm7;
	vinserti128 $1, (%rdx), %ymm7, %ymm7;
	vpxor (%rax), %ymm7, %ymm7;
	/* last use of the temporary buffer is done; restore %rsp */
	movq %r10, %rsp;
	/* remaining registers: XOR with src shifted by one 16-byte block */
	vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
	vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
	vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
	vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
	vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
	vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
	vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
	vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
	vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
	vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
	vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
	vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
	vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
	vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
	vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(camellia_cbc_dec_32way)

/*
 * inc_le128: x -= minus_one in each 64-bit lane (the callers pass a
 * minus_one whose low qword is -1 and high qword is 0, so this adds 1 to
 * the low qword). When the low qword equalled minus_one before the
 * subtraction, the compare mask is shifted into the high qword and
 * subtracted too, carrying into it. tmp is overwritten.
 */
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

/*
 * add2_le128: same scheme as inc_le128 but subtracts minus_two; the carry
 * into the high qword is taken when the low qword equalled either
 * minus_one or minus_two. tmp1 and tmp2 are overwritten.
 */
#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
	vpcmpeqq minus_one, x, tmp1; \
	vpcmpeqq minus_two, x, tmp2; \
	vpsubq minus_two, x, x; \
	vpor tmp2, tmp1, tmp1; \
	vpslldq $8, tmp1, tmp1; \
	vpsubq tmp1, x, x;

SYM_FUNC_START(camellia_ctr_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 *	%rcx: iv (little endian, 128bit)
	 */
	FRAME_BEGIN

	vzeroupper;

	/* remember %rsp so the optional stack buffer can be dropped later */
	movq %rsp, %r10;
	cmpq %rsi, %rdx;
	je .Lctr_use_stack;

	/* dst can be used as temporary storage, src is not overwritten.
*/
	movq %rsi, %rax;
	jmp .Lctr_continue;

.Lctr_use_stack:
	/* dst == src: take the 16 * 32 byte temporary buffer from the stack */
	subq $(16 * 32), %rsp;
	movq %rsp, %rax;

.Lctr_continue:
	vpcmpeqd %ymm15, %ymm15, %ymm15;
	vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
	vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */

	/* load IV and byteswap */
	vmovdqu (%rcx), %xmm0;
	vmovdqa %xmm0, %xmm1;
	inc_le128(%xmm0, %xmm15, %xmm14);
	vbroadcasti128 .Lbswap128_mask, %ymm14;
	/* %ymm0 = { low lane: IV, high lane: IV + 1 } */
	vinserti128 $1, %xmm0, %ymm1, %ymm0;
	vpshufb %ymm14, %ymm0, %ymm13;
	vmovdqu %ymm13, 15 * 32(%rax);

	/*
	 * construct IVs: each step advances both lanes by 2; the first five
	 * byteswapped pairs are spilled to the temporary buffer, the rest
	 * stay in registers
	 */
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */
	vpshufb %ymm14, %ymm0, %ymm13;
	vmovdqu %ymm13, 14 * 32(%rax);
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm13;
	vmovdqu %ymm13, 13 * 32(%rax);
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm13;
	vmovdqu %ymm13, 12 * 32(%rax);
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm13;
	vmovdqu %ymm13, 11 * 32(%rax);
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm10;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm9;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm8;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm7;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm6;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm5;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm4;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm3;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm2;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm1;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	/* high lane is the last counter used; store that value + 1 as new IV */
	vextracti128 $1, %ymm0, %xmm13;
	vpshufb %ymm14, %ymm0, %ymm0;
	inc_le128(%xmm13, %xmm15, %xmm14);
	vmovdqu %xmm13, (%rcx);

	/* inpack32_pre: */
	vpbroadcastq (key_table)(CTX), %ymm15;
	vpshufb .Lpack_bswap, %ymm15, %ymm15;
	vpxor %ymm0, %ymm15, %ymm0;
	vpxor %ymm1, %ymm15, %ymm1;
	vpxor %ymm2, %ymm15, %ymm2;
	vpxor %ymm3, %ymm15, %ymm3;
	vpxor %ymm4, %ymm15, %ymm4;
	vpxor %ymm5, %ymm15, %ymm5;
	vpxor %ymm6, %ymm15, %ymm6;
	vpxor %ymm7, %ymm15, %ymm7;
	vpxor %ymm8, %ymm15, %ymm8;
	vpxor %ymm9, %ymm15, %ymm9;
	vpxor %ymm10, %ymm15, %ymm10;
	vpxor 11 * 32(%rax), %ymm15, %ymm11;
	vpxor 12 * 32(%rax), %ymm15, %ymm12;
	vpxor 13 * 32(%rax), %ymm15, %ymm13;
	vpxor 14 * 32(%rax), %ymm15, %ymm14;
	/* %ymm15 last: this overwrites the key word held in it */
	vpxor 15 * 32(%rax), %ymm15, %ymm15;

	call __camellia_enc_blk32;

	/* temporary buffer no longer needed; restore %rsp */
	movq %r10, %rsp;

	/* XOR the encrypted counters with src */
	vpxor 0 * 32(%rdx), %ymm7, %ymm7;
	vpxor 1 * 32(%rdx), %ymm6, %ymm6;
	vpxor 2 * 32(%rdx), %ymm5, %ymm5;
	vpxor 3 * 32(%rdx), %ymm4, %ymm4;
	vpxor 4 * 32(%rdx), %ymm3, %ymm3;
	vpxor 5 * 32(%rdx), %ymm2, %ymm2;
	vpxor 6 * 32(%rdx), %ymm1, %ymm1;
	vpxor 7 * 32(%rdx), %ymm0, %ymm0;
	vpxor 8 * 32(%rdx), %ymm15, %ymm15;
	vpxor 9 * 32(%rdx), %ymm14, %ymm14;
	vpxor 10 * 32(%rdx), %ymm13, %ymm13;
	vpxor 11 * 32(%rdx), %ymm12, %ymm12;
	vpxor 12 * 32(%rdx), %ymm11, %ymm11;
	vpxor 13 * 32(%rdx), %ymm10, %ymm10;
	vpxor 14 * 32(%rdx), %ymm9, %ymm9;
	vpxor 15 * 32(%rdx), %ymm8, %ymm8;
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(camellia_ctr_32way)

/*
 * gf128mul_x_ble: shift each 64-bit lane of iv left by one bit (vpaddq
 * iv, iv) and XOR in `mask` under a per-dword sign mask taken from the
 * original iv and rearranged with vpshufd $0x13. tmp is overwritten.
 * NOTE(review): with .Lxts_gf128mul_and_shl1_mask_0 this appears to carry
 * bit 63 into the high qword and reduce bit 127 with 0x87 - confirm.
 */
#define gf128mul_x_ble(iv, mask, tmp) \
	vpsrad $31, iv, tmp; \
	vpaddq iv, iv, iv; \
	vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \
	vpxor tmp, iv, iv;

/*
 * gf128mul_x2_ble: two-bit variant of gf128mul_x_ble. iv is shifted left
 * by two bits per 64-bit lane; mask2 is applied under the sign mask of
 * the original iv and mask1 under the sign mask of iv shifted by one.
 * tmp0 and tmp1 are overwritten.
 */
#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
	vpsrad $31, iv, tmp0; \
	vpaddq iv, iv, tmp1; \
	vpsllq $2, iv, iv; \
	vpshufd $0x13, tmp0, tmp0; \
	vpsrad $31, tmp1, tmp1; \
	vpand mask2, tmp0, tmp0; \
	vpshufd $0x13, tmp1, tmp1; \
	vpxor tmp0, iv, iv; \
	vpand mask1, tmp1, tmp1; \
	vpxor tmp1, iv, iv;

.align 8
SYM_FUNC_START_LOCAL(camellia_xts_crypt_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 *	%r8: index for input whitening key
	 *	%r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
12338c2ecf20Sopenharmony_ci */ 12348c2ecf20Sopenharmony_ci FRAME_BEGIN 12358c2ecf20Sopenharmony_ci 12368c2ecf20Sopenharmony_ci vzeroupper; 12378c2ecf20Sopenharmony_ci 12388c2ecf20Sopenharmony_ci subq $(16 * 32), %rsp; 12398c2ecf20Sopenharmony_ci movq %rsp, %rax; 12408c2ecf20Sopenharmony_ci 12418c2ecf20Sopenharmony_ci vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12; 12428c2ecf20Sopenharmony_ci 12438c2ecf20Sopenharmony_ci /* load IV and construct second IV */ 12448c2ecf20Sopenharmony_ci vmovdqu (%rcx), %xmm0; 12458c2ecf20Sopenharmony_ci vmovdqa %xmm0, %xmm15; 12468c2ecf20Sopenharmony_ci gf128mul_x_ble(%xmm0, %xmm12, %xmm13); 12478c2ecf20Sopenharmony_ci vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13; 12488c2ecf20Sopenharmony_ci vinserti128 $1, %xmm0, %ymm15, %ymm0; 12498c2ecf20Sopenharmony_ci vpxor 0 * 32(%rdx), %ymm0, %ymm15; 12508c2ecf20Sopenharmony_ci vmovdqu %ymm15, 15 * 32(%rax); 12518c2ecf20Sopenharmony_ci vmovdqu %ymm0, 0 * 32(%rsi); 12528c2ecf20Sopenharmony_ci 12538c2ecf20Sopenharmony_ci /* construct IVs */ 12548c2ecf20Sopenharmony_ci gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 12558c2ecf20Sopenharmony_ci vpxor 1 * 32(%rdx), %ymm0, %ymm15; 12568c2ecf20Sopenharmony_ci vmovdqu %ymm15, 14 * 32(%rax); 12578c2ecf20Sopenharmony_ci vmovdqu %ymm0, 1 * 32(%rsi); 12588c2ecf20Sopenharmony_ci 12598c2ecf20Sopenharmony_ci gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 12608c2ecf20Sopenharmony_ci vpxor 2 * 32(%rdx), %ymm0, %ymm15; 12618c2ecf20Sopenharmony_ci vmovdqu %ymm15, 13 * 32(%rax); 12628c2ecf20Sopenharmony_ci vmovdqu %ymm0, 2 * 32(%rsi); 12638c2ecf20Sopenharmony_ci 12648c2ecf20Sopenharmony_ci gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 12658c2ecf20Sopenharmony_ci vpxor 3 * 32(%rdx), %ymm0, %ymm15; 12668c2ecf20Sopenharmony_ci vmovdqu %ymm15, 12 * 32(%rax); 12678c2ecf20Sopenharmony_ci vmovdqu %ymm0, 3 * 32(%rsi); 12688c2ecf20Sopenharmony_ci 12698c2ecf20Sopenharmony_ci gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 
12708c2ecf20Sopenharmony_ci vpxor 4 * 32(%rdx), %ymm0, %ymm11; 12718c2ecf20Sopenharmony_ci vmovdqu %ymm0, 4 * 32(%rsi); 12728c2ecf20Sopenharmony_ci 12738c2ecf20Sopenharmony_ci gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 12748c2ecf20Sopenharmony_ci vpxor 5 * 32(%rdx), %ymm0, %ymm10; 12758c2ecf20Sopenharmony_ci vmovdqu %ymm0, 5 * 32(%rsi); 12768c2ecf20Sopenharmony_ci 12778c2ecf20Sopenharmony_ci gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 12788c2ecf20Sopenharmony_ci vpxor 6 * 32(%rdx), %ymm0, %ymm9; 12798c2ecf20Sopenharmony_ci vmovdqu %ymm0, 6 * 32(%rsi); 12808c2ecf20Sopenharmony_ci 12818c2ecf20Sopenharmony_ci gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 12828c2ecf20Sopenharmony_ci vpxor 7 * 32(%rdx), %ymm0, %ymm8; 12838c2ecf20Sopenharmony_ci vmovdqu %ymm0, 7 * 32(%rsi); 12848c2ecf20Sopenharmony_ci 12858c2ecf20Sopenharmony_ci gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 12868c2ecf20Sopenharmony_ci vpxor 8 * 32(%rdx), %ymm0, %ymm7; 12878c2ecf20Sopenharmony_ci vmovdqu %ymm0, 8 * 32(%rsi); 12888c2ecf20Sopenharmony_ci 12898c2ecf20Sopenharmony_ci gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 12908c2ecf20Sopenharmony_ci vpxor 9 * 32(%rdx), %ymm0, %ymm6; 12918c2ecf20Sopenharmony_ci vmovdqu %ymm0, 9 * 32(%rsi); 12928c2ecf20Sopenharmony_ci 12938c2ecf20Sopenharmony_ci gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 12948c2ecf20Sopenharmony_ci vpxor 10 * 32(%rdx), %ymm0, %ymm5; 12958c2ecf20Sopenharmony_ci vmovdqu %ymm0, 10 * 32(%rsi); 12968c2ecf20Sopenharmony_ci 12978c2ecf20Sopenharmony_ci gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 12988c2ecf20Sopenharmony_ci vpxor 11 * 32(%rdx), %ymm0, %ymm4; 12998c2ecf20Sopenharmony_ci vmovdqu %ymm0, 11 * 32(%rsi); 13008c2ecf20Sopenharmony_ci 13018c2ecf20Sopenharmony_ci gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 13028c2ecf20Sopenharmony_ci vpxor 12 * 32(%rdx), %ymm0, %ymm3; 13038c2ecf20Sopenharmony_ci vmovdqu %ymm0, 12 * 32(%rsi); 
13048c2ecf20Sopenharmony_ci 13058c2ecf20Sopenharmony_ci gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 13068c2ecf20Sopenharmony_ci vpxor 13 * 32(%rdx), %ymm0, %ymm2; 13078c2ecf20Sopenharmony_ci vmovdqu %ymm0, 13 * 32(%rsi); 13088c2ecf20Sopenharmony_ci 13098c2ecf20Sopenharmony_ci gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 13108c2ecf20Sopenharmony_ci vpxor 14 * 32(%rdx), %ymm0, %ymm1; 13118c2ecf20Sopenharmony_ci vmovdqu %ymm0, 14 * 32(%rsi); 13128c2ecf20Sopenharmony_ci 13138c2ecf20Sopenharmony_ci gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 13148c2ecf20Sopenharmony_ci vpxor 15 * 32(%rdx), %ymm0, %ymm15; 13158c2ecf20Sopenharmony_ci vmovdqu %ymm15, 0 * 32(%rax); 13168c2ecf20Sopenharmony_ci vmovdqu %ymm0, 15 * 32(%rsi); 13178c2ecf20Sopenharmony_ci 13188c2ecf20Sopenharmony_ci vextracti128 $1, %ymm0, %xmm0; 13198c2ecf20Sopenharmony_ci gf128mul_x_ble(%xmm0, %xmm12, %xmm15); 13208c2ecf20Sopenharmony_ci vmovdqu %xmm0, (%rcx); 13218c2ecf20Sopenharmony_ci 13228c2ecf20Sopenharmony_ci /* inpack32_pre: */ 13238c2ecf20Sopenharmony_ci vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15; 13248c2ecf20Sopenharmony_ci vpshufb .Lpack_bswap, %ymm15, %ymm15; 13258c2ecf20Sopenharmony_ci vpxor 0 * 32(%rax), %ymm15, %ymm0; 13268c2ecf20Sopenharmony_ci vpxor %ymm1, %ymm15, %ymm1; 13278c2ecf20Sopenharmony_ci vpxor %ymm2, %ymm15, %ymm2; 13288c2ecf20Sopenharmony_ci vpxor %ymm3, %ymm15, %ymm3; 13298c2ecf20Sopenharmony_ci vpxor %ymm4, %ymm15, %ymm4; 13308c2ecf20Sopenharmony_ci vpxor %ymm5, %ymm15, %ymm5; 13318c2ecf20Sopenharmony_ci vpxor %ymm6, %ymm15, %ymm6; 13328c2ecf20Sopenharmony_ci vpxor %ymm7, %ymm15, %ymm7; 13338c2ecf20Sopenharmony_ci vpxor %ymm8, %ymm15, %ymm8; 13348c2ecf20Sopenharmony_ci vpxor %ymm9, %ymm15, %ymm9; 13358c2ecf20Sopenharmony_ci vpxor %ymm10, %ymm15, %ymm10; 13368c2ecf20Sopenharmony_ci vpxor %ymm11, %ymm15, %ymm11; 13378c2ecf20Sopenharmony_ci vpxor 12 * 32(%rax), %ymm15, %ymm12; 13388c2ecf20Sopenharmony_ci vpxor 13 * 32(%rax), %ymm15, %ymm13; 
13398c2ecf20Sopenharmony_ci vpxor 14 * 32(%rax), %ymm15, %ymm14; 13408c2ecf20Sopenharmony_ci vpxor 15 * 32(%rax), %ymm15, %ymm15; 13418c2ecf20Sopenharmony_ci 13428c2ecf20Sopenharmony_ci CALL_NOSPEC r9; 13438c2ecf20Sopenharmony_ci 13448c2ecf20Sopenharmony_ci addq $(16 * 32), %rsp; 13458c2ecf20Sopenharmony_ci 13468c2ecf20Sopenharmony_ci vpxor 0 * 32(%rsi), %ymm7, %ymm7; 13478c2ecf20Sopenharmony_ci vpxor 1 * 32(%rsi), %ymm6, %ymm6; 13488c2ecf20Sopenharmony_ci vpxor 2 * 32(%rsi), %ymm5, %ymm5; 13498c2ecf20Sopenharmony_ci vpxor 3 * 32(%rsi), %ymm4, %ymm4; 13508c2ecf20Sopenharmony_ci vpxor 4 * 32(%rsi), %ymm3, %ymm3; 13518c2ecf20Sopenharmony_ci vpxor 5 * 32(%rsi), %ymm2, %ymm2; 13528c2ecf20Sopenharmony_ci vpxor 6 * 32(%rsi), %ymm1, %ymm1; 13538c2ecf20Sopenharmony_ci vpxor 7 * 32(%rsi), %ymm0, %ymm0; 13548c2ecf20Sopenharmony_ci vpxor 8 * 32(%rsi), %ymm15, %ymm15; 13558c2ecf20Sopenharmony_ci vpxor 9 * 32(%rsi), %ymm14, %ymm14; 13568c2ecf20Sopenharmony_ci vpxor 10 * 32(%rsi), %ymm13, %ymm13; 13578c2ecf20Sopenharmony_ci vpxor 11 * 32(%rsi), %ymm12, %ymm12; 13588c2ecf20Sopenharmony_ci vpxor 12 * 32(%rsi), %ymm11, %ymm11; 13598c2ecf20Sopenharmony_ci vpxor 13 * 32(%rsi), %ymm10, %ymm10; 13608c2ecf20Sopenharmony_ci vpxor 14 * 32(%rsi), %ymm9, %ymm9; 13618c2ecf20Sopenharmony_ci vpxor 15 * 32(%rsi), %ymm8, %ymm8; 13628c2ecf20Sopenharmony_ci write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, 13638c2ecf20Sopenharmony_ci %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, 13648c2ecf20Sopenharmony_ci %ymm8, %rsi); 13658c2ecf20Sopenharmony_ci 13668c2ecf20Sopenharmony_ci vzeroupper; 13678c2ecf20Sopenharmony_ci 13688c2ecf20Sopenharmony_ci FRAME_END 13698c2ecf20Sopenharmony_ci RET; 13708c2ecf20Sopenharmony_ciSYM_FUNC_END(camellia_xts_crypt_32way) 13718c2ecf20Sopenharmony_ci 13728c2ecf20Sopenharmony_ciSYM_FUNC_START(camellia_xts_enc_32way) 13738c2ecf20Sopenharmony_ci /* input: 13748c2ecf20Sopenharmony_ci * %rdi: ctx, CTX 13758c2ecf20Sopenharmony_ci * %rsi: dst (32 
blocks) 13768c2ecf20Sopenharmony_ci * %rdx: src (32 blocks) 13778c2ecf20Sopenharmony_ci * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) 13788c2ecf20Sopenharmony_ci */ 13798c2ecf20Sopenharmony_ci 13808c2ecf20Sopenharmony_ci xorl %r8d, %r8d; /* input whitening key, 0 for enc */ 13818c2ecf20Sopenharmony_ci 13828c2ecf20Sopenharmony_ci leaq __camellia_enc_blk32, %r9; 13838c2ecf20Sopenharmony_ci 13848c2ecf20Sopenharmony_ci jmp camellia_xts_crypt_32way; 13858c2ecf20Sopenharmony_ciSYM_FUNC_END(camellia_xts_enc_32way) 13868c2ecf20Sopenharmony_ci 13878c2ecf20Sopenharmony_ciSYM_FUNC_START(camellia_xts_dec_32way) 13888c2ecf20Sopenharmony_ci /* input: 13898c2ecf20Sopenharmony_ci * %rdi: ctx, CTX 13908c2ecf20Sopenharmony_ci * %rsi: dst (32 blocks) 13918c2ecf20Sopenharmony_ci * %rdx: src (32 blocks) 13928c2ecf20Sopenharmony_ci * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) 13938c2ecf20Sopenharmony_ci */ 13948c2ecf20Sopenharmony_ci 13958c2ecf20Sopenharmony_ci cmpl $16, key_length(CTX); 13968c2ecf20Sopenharmony_ci movl $32, %r8d; 13978c2ecf20Sopenharmony_ci movl $24, %eax; 13988c2ecf20Sopenharmony_ci cmovel %eax, %r8d; /* input whitening key, last for dec */ 13998c2ecf20Sopenharmony_ci 14008c2ecf20Sopenharmony_ci leaq __camellia_dec_blk32, %r9; 14018c2ecf20Sopenharmony_ci 14028c2ecf20Sopenharmony_ci jmp camellia_xts_crypt_32way; 14038c2ecf20Sopenharmony_ciSYM_FUNC_END(camellia_xts_dec_32way) 1404