/*
 * x86_64/AVX/AES-NI assembler implementation of Camellia
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

/*
 * Version licensed under 2-clause BSD License is available at:
 *	http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz
 */

/*
 * Syntax: GNU as, AT&T operand order (source(s) first, destination last),
 * preprocessed with cpp.  Static constant tables are addressed RIP-relative.
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi

/**********************************************************************
  16-way camellia
 **********************************************************************/

/*
 * Per-byte 8-bit -> 8-bit table lookup built from two 4-bit lookups.
 *
 * IN:
 *  x: input bytes
 *  lo_t: 16-entry table indexed by (byte & mask4bit)
 *  hi_t: 16-entry table indexed by ((byte & ~mask4bit) >> 4)
 *  mask4bit: nibble mask (roundsm16 passes a broadcast of .L0f0f0f0f)
 * OUT:
 *  x: lo_t[low nibble] ^ hi_t[high nibble] for every byte
 * Clobbers: tmp0
 */
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;

/*
 * IN:
 *   x0..x7: byte-sliced AB state
 *   mem_cd: register pointer storing CD state
 *   key: index for key material
 * OUT:
 *   x0..x7: new byte-sliced CD state
 * Clobbers: t0..t7
 */
#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
		  t7, mem_cd, key) \
	/* \
	 * S-function with AES subbytes \
	 */ \
	vmovdqa .Linv_shift_row(%rip), t4; \
	vbroadcastss .L0f0f0f0f(%rip), t7; \
	vmovdqa .Lpre_tf_lo_s1(%rip), t0; \
	vmovdqa .Lpre_tf_hi_s1(%rip), t1; \
	\
	/* AES inverse shift rows */ \
	vpshufb t4, x0, x0; \
	vpshufb t4, x7, x7; \
	vpshufb t4, x1, x1; \
	vpshufb t4, x4, x4; \
	vpshufb t4, x2, x2; \
	vpshufb t4, x5, x5; \
	vpshufb t4, x3, x3; \
	vpshufb t4, x6, x6; \
	\
	/* prefilter sboxes 1, 2 and 3 */ \
	vmovdqa .Lpre_tf_lo_s4(%rip), t2; \
	vmovdqa .Lpre_tf_hi_s4(%rip), t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x1, t0, t1, t7, t6); \
	filter_8bit(x4, t0, t1, t7, t6); \
	filter_8bit(x2, t0, t1, t7, t6); \
	filter_8bit(x5, t0, t1, t7, t6); \
	\
	/* prefilter sbox 4 */ \
	vpxor t4, t4, t4; /* t4 = 0: all-zero round key for vaesenclast */ \
	filter_8bit(x3, t2, t3, t7, t6); \
	filter_8bit(x6, t2, t3, t7, t6); \
	\
	/* AES subbytes + AES shift rows */ \
	vmovdqa .Lpost_tf_lo_s1(%rip), t0; \
	vmovdqa .Lpost_tf_hi_s1(%rip), t1; \
	vaesenclast t4, x0, x0; \
	vaesenclast t4, x7, x7; \
	vaesenclast t4, x1, x1; \
	vaesenclast t4, x4, x4; \
	vaesenclast t4, x2, x2; \
	vaesenclast t4, x5, x5; \
	vaesenclast t4, x3, x3; \
	vaesenclast t4, x6, x6; \
	\
	/* postfilter sboxes 1 and 4 */ \
	vmovdqa .Lpost_tf_lo_s3(%rip), t2; \
	vmovdqa .Lpost_tf_hi_s3(%rip), t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	\
	/* postfilter sbox 3 */ \
	vmovdqa .Lpost_tf_lo_s2(%rip), t4; \
	vmovdqa .Lpost_tf_hi_s2(%rip), t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	\
	vpxor t6, t6, t6; /* t6 = 0: vpshufb index that broadcasts byte 0 */ \
	vmovq key, t0; \
	\
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	\
	/* broadcast individual key bytes: shift byte N down, splat byte 0 */ \
	vpsrldq $5, t0, t5; \
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpsrldq $3, t0, t3; \
	vpsrldq $4, t0, t4; \
	vpshufb t6, t0, t0; \
	vpshufb t6, t1, t1; \
	vpshufb t6, t2, t2; \
	vpshufb t6, t3, t3; \
	vpshufb t6, t4, t4; \
	vpsrldq $2, t5, t7; \
	vpshufb t6, t7, t7; \
	\
	/* \
	 * P-function \
	 */ \
	vpxor x5, x0, x0; \
	vpxor x6, x1, x1; \
	vpxor x7, x2, x2; \
	vpxor x4, x3, x3; \
	\
	vpxor x2, x4, x4; \
	vpxor x3, x5, x5; \
	vpxor x0, x6, x6; \
	vpxor x1, x7, x7; \
	\
	vpxor x7, x0, x0; \
	vpxor x4, x1, x1; \
	vpxor x5, x2, x2; \
	vpxor x6, x3, x3; \
	\
	vpxor x3, x4, x4; \
	vpxor x0, x5, x5; \
	vpxor x1, x6, x6; \
	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
	\
	/* \
	 * Add key material and result to CD (x becomes new CD) \
	 */ \
	\
	vpxor t3, x4, x4; \
	vpxor 0 * 16(mem_cd), x4, x4; \
	\
	vpxor t2, x5, x5; \
	vpxor 1 * 16(mem_cd), x5, x5; \
	\
	/* finish broadcasting the remaining key bytes held in t5 */ \
	vpsrldq $1, t5, t3; \
	vpshufb t6, t5, t5; \
	vpshufb t6, t3, t6; \
	\
	vpxor t1, x6, x6; \
	vpxor 2 * 16(mem_cd), x6, x6; \
	\
	vpxor t0, x7, x7; \
	vpxor 3 * 16(mem_cd), x7, x7; \
	\
	vpxor t7, x0, x0; \
	vpxor 4 * 16(mem_cd), x0, x0; \
	\
	vpxor t6, x1, x1; \
	vpxor 5 * 16(mem_cd), x1, x1; \
	\
	vpxor t5, x2, x2; \
	vpxor 6 * 16(mem_cd), x2, x2; \
	\
	vpxor t4, x3, x3; \
	vpxor 7 * 16(mem_cd), x3, x3;

/*
 * Size optimization... with inlined roundsm16, binary would be over 5 times
 * larger and would only be 0.5% faster (on sandy-bridge).
 *
 * Both helpers operate on fixed registers: state in %xmm0..%xmm7, scratch in
 * %xmm8..%xmm15, round key read from (%r9).  The "_cd" variant XORs in the
 * eight vectors at (%rcx); the "_ab" variant XORs in the eight vectors at
 * (%rax).
 */
.align 8
SYM_FUNC_START_LOCAL(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
	roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		  %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
		  %rcx, (%r9));
	RET;
SYM_FUNC_END(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

.align 8
SYM_FUNC_START_LOCAL(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
	roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
		  %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
		  %rax, (%r9));
	RET;
SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)

/*
 * Two Feistel rounds: round key (i), then round key (i) + (dir).
 *
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 *
 * The called helpers use fixed registers (%xmm0..%xmm15, %rcx, %rax), so the
 * register arguments given here must match those.  Clobbers %r9.
 * store_ab is invoked last as store_ab(x0..x7, mem_ab).
 */
#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	\
	vmovdqu x4, 0 * 16(mem_cd); \
	vmovdqu x5, 1 * 16(mem_cd); \
	vmovdqu x6, 2 * 16(mem_cd); \
	vmovdqu x7, 3 * 16(mem_cd); \
	vmovdqu x0, 4 * 16(mem_cd); \
	vmovdqu x1, 5 * 16(mem_cd); \
	vmovdqu x2, 6 * 16(mem_cd); \
	vmovdqu x3, 7 * 16(mem_cd); \
	\
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	\
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

/* store_ab argument for two_roundsm16 when the AB state need not be saved */
#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

/* store_ab argument for two_roundsm16: write x0..x7 to mem_ab */
#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab);

/*
 * Six rounds using round keys (i) + 2 .. (i) + 7 in ascending order.
 * The AB state is not stored to mem_ab after the last pair of rounds.
 */
#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

/*
 * Six rounds using round keys (i) + 7 .. (i) + 2 in descending order.
 * The AB state is not stored to mem_ab after the last pair of rounds.
 */
#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);

/*
 * IN:
 *  v0..3: byte-sliced 32-bit integers
 *  zero: all-zero vector
 * OUT:
 *  v0..3: (IN <<< 1)
 * Clobbers: t0..t2
 *
 * Each byte plane is doubled with vpaddb; its former top bit (extracted as
 * 0/1 via the signed compare against zero plus vpabsb) is ORed into bit 0 of
 * the next plane, with v3's carry wrapping around into v0.
 */
#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	vpaddb v0, v0, v0; \
	vpabsb t0, t0; \
	\
	vpcmpgtb v1, zero, t1; \
	vpaddb v1, v1, v1; \
	vpabsb t1, t1; \
	\
	vpcmpgtb v2, zero, t2; \
	vpaddb v2, v2, v2; \
	vpabsb t2, t2; \
	\
	vpor t0, v1, v1; \
	\
	vpcmpgtb v3, zero, t0; \
	vpaddb v3, v3, v3; \
	vpabsb t0, t0; \
	\
	vpor t1, v2, v2; \
	vpor t2, v3, v3; \
	vpor t0, v0, v0;
/*
 * FL / FL^-1 layer on 16 byte-sliced blocks (see the pseudo-code comments
 * in the body for the four steps).
 *
 * IN:
 *  l0..l7: byte-sliced "l" half in registers (l0..l3 = ll, l4..l7 = lr)
 *  l: memory the updated "l" half is written to
 *  r: byte-sliced "r" half in memory (0..3 = rl, 4..7 = rr)
 *  kll, klr, krl, krr: 32-bit subkey operands
 * OUT:
 *  l0..l7: updated "l" half, also stored to l
 *  r: updated in place in memory
 * Clobbers: t0..t3, tt0..tt3
 *
 * NOTE(review): the previous header labelled r as the AB state and l as the
 * CD state; the callers are not visible here, confirm before relying on it.
 */
#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	/* \
	 * t0 = kll; \
	 * t0 &= ll; \
	 * lr ^= rol32(t0, 1); \
	 */ \
	vpxor tt0, tt0, tt0; /* tt0 = 0: vpshufb index that splats byte 0 */ \
	vmovd kll, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand l0, t0, t0; \
	vpand l1, t1, t1; \
	vpand l2, t2, t2; \
	vpand l3, t3, t3; \
	\
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor l4, t0, l4; \
	vmovdqu l4, 4 * 16(l); \
	vpxor l5, t1, l5; \
	vmovdqu l5, 5 * 16(l); \
	vpxor l6, t2, l6; \
	vmovdqu l6, 6 * 16(l); \
	vpxor l7, t3, l7; \
	vmovdqu l7, 7 * 16(l); \
	\
	/* \
	 * t2 = krr; \
	 * t2 |= rr; \
	 * rl ^= t2; \
	 */ \
	\
	vmovd krr, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor 4 * 16(r), t0, t0; \
	vpor 5 * 16(r), t1, t1; \
	vpor 6 * 16(r), t2, t2; \
	vpor 7 * 16(r), t3, t3; \
	\
	vpxor 0 * 16(r), t0, t0; \
	vpxor 1 * 16(r), t1, t1; \
	vpxor 2 * 16(r), t2, t2; \
	vpxor 3 * 16(r), t3, t3; \
	vmovdqu t0, 0 * 16(r); \
	vmovdqu t1, 1 * 16(r); \
	vmovdqu t2, 2 * 16(r); \
	vmovdqu t3, 3 * 16(r); \
	\
	/* \
	 * t2 = krl; \
	 * t2 &= rl; \
	 * rr ^= rol32(t2, 1); \
	 */ \
	vmovd krl, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand 0 * 16(r), t0, t0; \
	vpand 1 * 16(r), t1, t1; \
	vpand 2 * 16(r), t2, t2; \
	vpand 3 * 16(r), t3, t3; \
	\
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor 4 * 16(r), t0, t0; \
	vpxor 5 * 16(r), t1, t1; \
	vpxor 6 * 16(r), t2, t2; \
	vpxor 7 * 16(r), t3, t3; \
	vmovdqu t0, 4 * 16(r); \
	vmovdqu t1, 5 * 16(r); \
	vmovdqu t2, 6 * 16(r); \
	vmovdqu t3, 7 * 16(r); \
	\
	/* \
	 * t0 = klr; \
	 * t0 |= lr; \
	 * ll ^= t0; \
	 */ \
	\
	vmovd klr, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor l4, t0, t0; \
	vpor l5, t1, t1; \
	vpor l6, t2, t2; \
	vpor l7, t3, t3; \
	\
	vpxor l0, t0, l0; \
	vmovdqu l0, 0 * 16(l); \
	vpxor l1, t1, l1; \
	vmovdqu l1, 1 * 16(l); \
	vpxor l2, t2, l2; \
	vmovdqu l2, 2 * 16(l); \
	vpxor l3, t3, l3; \
	vmovdqu l3, 3 * 16(l);

/*
 * Transpose a 4x4 matrix of 32-bit elements held in x0..x3 (one row per
 * register), in place.  Clobbers: t1, t2
 */
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

/*
 * Byte-slice sixteen 16-byte vectors in place (two rounds of 4x4 dword
 * transposes around a per-vector byte shuffle by .Lshufb_16x16b).
 *
 * All sixteen vector registers are live, so st0 and st1 (two 16-byte memory
 * operands) are used as spill slots; their contents are overwritten.
 */
#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
			 b3, c3, d3, st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vmovdqu .Lshufb_16x16b(%rip), a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

/*
 * load blocks to registers and apply pre-whitening
 *
 * The 64-bit key is shuffled through .Lpack_bswap and XORed into each of the
 * sixteen 16-byte blocks at rio; block 0 lands in y7, block 15 in x0.
 */
#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio, key) \
	vmovq key, x0; \
	vpshufb .Lpack_bswap(%rip), x0, x0; \
	\
	vpxor 0 * 16(rio), x0, y7; \
	vpxor 1 * 16(rio), x0, y6; \
	vpxor 2 * 16(rio), x0, y5; \
	vpxor 3 * 16(rio), x0, y4; \
	vpxor 4 * 16(rio), x0, y3; \
	vpxor 5 * 16(rio), x0, y2; \
	vpxor 6 * 16(rio), x0, y1; \
	vpxor 7 * 16(rio), x0, y0; \
	vpxor 8 * 16(rio), x0, x7; \
	vpxor 9 * 16(rio), x0, x6; \
	vpxor 10 * 16(rio), x0, x5; \
	vpxor 11 * 16(rio), x0, x4; \
	vpxor 12 * 16(rio), x0, x3; \
	vpxor 13 * 16(rio), x0, x2; \
	vpxor 14 * 16(rio), x0, x1; \
	vpxor 15 * 16(rio), x0, x0;

/*
 * byteslice pre-whitened blocks and store to temporary memory
 *
 * The first 16 bytes of mem_ab and mem_cd double as byteslice spill slots
 * before the final state is written over them.
 */
#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
			 y5, y6, y7, (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab); \
	vmovdqu y0, 0 * 16(mem_cd); \
	vmovdqu y1, 1 * 16(mem_cd); \
	vmovdqu y2, 2 * 16(mem_cd); \
	vmovdqu y3, 3 * 16(mem_cd); \
	vmovdqu y4, 4 * 16(mem_cd); \
	vmovdqu y5, 5 * 16(mem_cd); \
	vmovdqu y6, 6 * 16(mem_cd); \
	vmovdqu y7, 7 * 16(mem_cd);

/*
 * de-byteslice, apply post-whitening and store blocks
 *
 * x0 is spilled to stack_tmp0 so that it can hold the shuffled key; it is
 * whitened last, straight from the spill slot.  stack_tmp0 and stack_tmp1
 * are overwritten.
 */
#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
			 y7, x3, x7, stack_tmp0, stack_tmp1); \
	\
	vmovdqu x0, stack_tmp0; \
	\
	vmovq key, x0; \
	vpshufb .Lpack_bswap(%rip), x0, x0; \
	\
	vpxor x0, y7, y7; \
	vpxor x0, y6, y6; \
	vpxor x0, y5, y5; \
	vpxor x0, y4, y4; \
	vpxor x0, y3, y3; \
	vpxor x0, y2, y2; \
	vpxor x0, y1, y1; \
	vpxor x0, y0, y0; \
	vpxor x0, x7, x7; \
	vpxor x0, x6, x6; \
	vpxor x0, x5, x5; \
	vpxor x0, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x0, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor stack_tmp0, x0, x0;

/* Store x0..x7 followed by y0..y7 as sixteen consecutive blocks at rio */
#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio) \
	vmovdqu x0, 0 * 16(rio); \
	vmovdqu x1, 1 * 16(rio); \
	vmovdqu x2, 2 * 16(rio); \
	vmovdqu x3, 3 * 16(rio); \
	vmovdqu x4, 4 * 16(rio); \
	vmovdqu x5, 5 * 16(rio); \
	vmovdqu x6, 6 * 16(rio); \
	vmovdqu x7, 7 * 16(rio); \
	vmovdqu y0, 8 * 16(rio); \
	vmovdqu y1, 9 * 16(rio); \
	vmovdqu y2, 10 * 16(rio); \
	vmovdqu y3, 11 * 16(rio); \
	vmovdqu y4, 12 * 16(rio); \
	vmovdqu y5, 13 * 16(rio); \
	vmovdqu y6, 14 * 16(rio); \
	vmovdqu y7, 15 * 16(rio);


/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section	.rodata.cst16, "aM", @progbits, 16
.align 16

#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

/* vpshufb control: gathers bytes 0,4,8,12 then 1,5,9,13 and so on */
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);

/* vpshufb control used on the whitening key; 0x80 indices yield zero bytes */
.Lpack_bswap:
	.long 0x00010203
	.long 0x04050607
	.long 0x80808080
	.long 0x80808080

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For XTS mode IV generation */
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s1:
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in <<< 1)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s4:
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/*
 * post-SubByte transform
 *
 * post-lookup for sbox1, sbox4:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  )
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s1:
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/*
 * post-SubByte transform
 *
 * post-lookup for sbox2:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) <<< 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s2:
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/*
 * post-SubByte transform
 *
 * post-lookup for sbox3:
 *  swap_bitendianness(
6938c2ecf20Sopenharmony_ci * camellia_h( 6948c2ecf20Sopenharmony_ci * isom_map_aes_to_camellia( 6958c2ecf20Sopenharmony_ci * swap_bitendianness( 6968c2ecf20Sopenharmony_ci * aes_inverse_affine_transform(in) 6978c2ecf20Sopenharmony_ci * ) 6988c2ecf20Sopenharmony_ci * ) 6998c2ecf20Sopenharmony_ci * ) 7008c2ecf20Sopenharmony_ci * ) >>> 1 7018c2ecf20Sopenharmony_ci * 7028c2ecf20Sopenharmony_ci * (note: '⊕ 0x6e' inside camellia_h()) 7038c2ecf20Sopenharmony_ci */ 7048c2ecf20Sopenharmony_ci.Lpost_tf_lo_s3: 7058c2ecf20Sopenharmony_ci .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98 7068c2ecf20Sopenharmony_ci .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8 7078c2ecf20Sopenharmony_ci.Lpost_tf_hi_s3: 7088c2ecf20Sopenharmony_ci .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54 7098c2ecf20Sopenharmony_ci .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06 7108c2ecf20Sopenharmony_ci 7118c2ecf20Sopenharmony_ci/* For isolating SubBytes from AESENCLAST, inverse shift row */ 7128c2ecf20Sopenharmony_ci.Linv_shift_row: 7138c2ecf20Sopenharmony_ci .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b 7148c2ecf20Sopenharmony_ci .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 7158c2ecf20Sopenharmony_ci 7168c2ecf20Sopenharmony_ci/* 4-bit mask */ 7178c2ecf20Sopenharmony_ci.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4 7188c2ecf20Sopenharmony_ci.align 4 7198c2ecf20Sopenharmony_ci.L0f0f0f0f: 7208c2ecf20Sopenharmony_ci .long 0x0f0f0f0f 7218c2ecf20Sopenharmony_ci 7228c2ecf20Sopenharmony_ci.text 7238c2ecf20Sopenharmony_ci 7248c2ecf20Sopenharmony_ci.align 8 7258c2ecf20Sopenharmony_ciSYM_FUNC_START_LOCAL(__camellia_enc_blk16) 7268c2ecf20Sopenharmony_ci /* input: 7278c2ecf20Sopenharmony_ci * %rdi: ctx, CTX 7288c2ecf20Sopenharmony_ci * %rax: temporary storage, 256 bytes 7298c2ecf20Sopenharmony_ci * %xmm0..%xmm15: 16 plaintext blocks 7308c2ecf20Sopenharmony_ci * output: 7318c2ecf20Sopenharmony_ci * %xmm0..%xmm15: 16 encrypted blocks, order swapped: 7328c2ecf20Sopenharmony_ci * 
7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 7338c2ecf20Sopenharmony_ci */ 7348c2ecf20Sopenharmony_ci FRAME_BEGIN 7358c2ecf20Sopenharmony_ci 7368c2ecf20Sopenharmony_ci leaq 8 * 16(%rax), %rcx; 7378c2ecf20Sopenharmony_ci 7388c2ecf20Sopenharmony_ci inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 7398c2ecf20Sopenharmony_ci %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 7408c2ecf20Sopenharmony_ci %xmm15, %rax, %rcx); 7418c2ecf20Sopenharmony_ci 7428c2ecf20Sopenharmony_ci enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 7438c2ecf20Sopenharmony_ci %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 7448c2ecf20Sopenharmony_ci %xmm15, %rax, %rcx, 0); 7458c2ecf20Sopenharmony_ci 7468c2ecf20Sopenharmony_ci fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 7478c2ecf20Sopenharmony_ci %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 7488c2ecf20Sopenharmony_ci %xmm15, 7498c2ecf20Sopenharmony_ci ((key_table + (8) * 8) + 0)(CTX), 7508c2ecf20Sopenharmony_ci ((key_table + (8) * 8) + 4)(CTX), 7518c2ecf20Sopenharmony_ci ((key_table + (8) * 8) + 8)(CTX), 7528c2ecf20Sopenharmony_ci ((key_table + (8) * 8) + 12)(CTX)); 7538c2ecf20Sopenharmony_ci 7548c2ecf20Sopenharmony_ci enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 7558c2ecf20Sopenharmony_ci %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 7568c2ecf20Sopenharmony_ci %xmm15, %rax, %rcx, 8); 7578c2ecf20Sopenharmony_ci 7588c2ecf20Sopenharmony_ci fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 7598c2ecf20Sopenharmony_ci %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 7608c2ecf20Sopenharmony_ci %xmm15, 7618c2ecf20Sopenharmony_ci ((key_table + (16) * 8) + 0)(CTX), 7628c2ecf20Sopenharmony_ci ((key_table + (16) * 8) + 4)(CTX), 7638c2ecf20Sopenharmony_ci ((key_table + (16) * 8) + 8)(CTX), 7648c2ecf20Sopenharmony_ci ((key_table + (16) * 8) + 12)(CTX)); 7658c2ecf20Sopenharmony_ci 
7668c2ecf20Sopenharmony_ci enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 7678c2ecf20Sopenharmony_ci %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 7688c2ecf20Sopenharmony_ci %xmm15, %rax, %rcx, 16); 7698c2ecf20Sopenharmony_ci 7708c2ecf20Sopenharmony_ci movl $24, %r8d; 7718c2ecf20Sopenharmony_ci cmpl $16, key_length(CTX); 7728c2ecf20Sopenharmony_ci jne .Lenc_max32; 7738c2ecf20Sopenharmony_ci 7748c2ecf20Sopenharmony_ci.Lenc_done: 7758c2ecf20Sopenharmony_ci /* load CD for output */ 7768c2ecf20Sopenharmony_ci vmovdqu 0 * 16(%rcx), %xmm8; 7778c2ecf20Sopenharmony_ci vmovdqu 1 * 16(%rcx), %xmm9; 7788c2ecf20Sopenharmony_ci vmovdqu 2 * 16(%rcx), %xmm10; 7798c2ecf20Sopenharmony_ci vmovdqu 3 * 16(%rcx), %xmm11; 7808c2ecf20Sopenharmony_ci vmovdqu 4 * 16(%rcx), %xmm12; 7818c2ecf20Sopenharmony_ci vmovdqu 5 * 16(%rcx), %xmm13; 7828c2ecf20Sopenharmony_ci vmovdqu 6 * 16(%rcx), %xmm14; 7838c2ecf20Sopenharmony_ci vmovdqu 7 * 16(%rcx), %xmm15; 7848c2ecf20Sopenharmony_ci 7858c2ecf20Sopenharmony_ci outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 7868c2ecf20Sopenharmony_ci %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 7878c2ecf20Sopenharmony_ci %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax)); 7888c2ecf20Sopenharmony_ci 7898c2ecf20Sopenharmony_ci FRAME_END 7908c2ecf20Sopenharmony_ci RET; 7918c2ecf20Sopenharmony_ci 7928c2ecf20Sopenharmony_ci.align 8 7938c2ecf20Sopenharmony_ci.Lenc_max32: 7948c2ecf20Sopenharmony_ci movl $32, %r8d; 7958c2ecf20Sopenharmony_ci 7968c2ecf20Sopenharmony_ci fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 7978c2ecf20Sopenharmony_ci %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 7988c2ecf20Sopenharmony_ci %xmm15, 7998c2ecf20Sopenharmony_ci ((key_table + (24) * 8) + 0)(CTX), 8008c2ecf20Sopenharmony_ci ((key_table + (24) * 8) + 4)(CTX), 8018c2ecf20Sopenharmony_ci ((key_table + (24) * 8) + 8)(CTX), 8028c2ecf20Sopenharmony_ci ((key_table + (24) * 8) + 12)(CTX)); 
8038c2ecf20Sopenharmony_ci 8048c2ecf20Sopenharmony_ci enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 8058c2ecf20Sopenharmony_ci %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 8068c2ecf20Sopenharmony_ci %xmm15, %rax, %rcx, 24); 8078c2ecf20Sopenharmony_ci 8088c2ecf20Sopenharmony_ci jmp .Lenc_done; 8098c2ecf20Sopenharmony_ciSYM_FUNC_END(__camellia_enc_blk16) 8108c2ecf20Sopenharmony_ci 8118c2ecf20Sopenharmony_ci.align 8 8128c2ecf20Sopenharmony_ciSYM_FUNC_START_LOCAL(__camellia_dec_blk16) 8138c2ecf20Sopenharmony_ci /* input: 8148c2ecf20Sopenharmony_ci * %rdi: ctx, CTX 8158c2ecf20Sopenharmony_ci * %rax: temporary storage, 256 bytes 8168c2ecf20Sopenharmony_ci * %r8d: 24 for 16 byte key, 32 for larger 8178c2ecf20Sopenharmony_ci * %xmm0..%xmm15: 16 encrypted blocks 8188c2ecf20Sopenharmony_ci * output: 8198c2ecf20Sopenharmony_ci * %xmm0..%xmm15: 16 plaintext blocks, order swapped: 8208c2ecf20Sopenharmony_ci * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 8218c2ecf20Sopenharmony_ci */ 8228c2ecf20Sopenharmony_ci FRAME_BEGIN 8238c2ecf20Sopenharmony_ci 8248c2ecf20Sopenharmony_ci leaq 8 * 16(%rax), %rcx; 8258c2ecf20Sopenharmony_ci 8268c2ecf20Sopenharmony_ci inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 8278c2ecf20Sopenharmony_ci %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 8288c2ecf20Sopenharmony_ci %xmm15, %rax, %rcx); 8298c2ecf20Sopenharmony_ci 8308c2ecf20Sopenharmony_ci cmpl $32, %r8d; 8318c2ecf20Sopenharmony_ci je .Ldec_max32; 8328c2ecf20Sopenharmony_ci 8338c2ecf20Sopenharmony_ci.Ldec_max24: 8348c2ecf20Sopenharmony_ci dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 8358c2ecf20Sopenharmony_ci %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 8368c2ecf20Sopenharmony_ci %xmm15, %rax, %rcx, 16); 8378c2ecf20Sopenharmony_ci 8388c2ecf20Sopenharmony_ci fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 8398c2ecf20Sopenharmony_ci %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, 
%xmm13, %xmm14, 8408c2ecf20Sopenharmony_ci %xmm15, 8418c2ecf20Sopenharmony_ci ((key_table + (16) * 8) + 8)(CTX), 8428c2ecf20Sopenharmony_ci ((key_table + (16) * 8) + 12)(CTX), 8438c2ecf20Sopenharmony_ci ((key_table + (16) * 8) + 0)(CTX), 8448c2ecf20Sopenharmony_ci ((key_table + (16) * 8) + 4)(CTX)); 8458c2ecf20Sopenharmony_ci 8468c2ecf20Sopenharmony_ci dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 8478c2ecf20Sopenharmony_ci %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 8488c2ecf20Sopenharmony_ci %xmm15, %rax, %rcx, 8); 8498c2ecf20Sopenharmony_ci 8508c2ecf20Sopenharmony_ci fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 8518c2ecf20Sopenharmony_ci %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 8528c2ecf20Sopenharmony_ci %xmm15, 8538c2ecf20Sopenharmony_ci ((key_table + (8) * 8) + 8)(CTX), 8548c2ecf20Sopenharmony_ci ((key_table + (8) * 8) + 12)(CTX), 8558c2ecf20Sopenharmony_ci ((key_table + (8) * 8) + 0)(CTX), 8568c2ecf20Sopenharmony_ci ((key_table + (8) * 8) + 4)(CTX)); 8578c2ecf20Sopenharmony_ci 8588c2ecf20Sopenharmony_ci dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 8598c2ecf20Sopenharmony_ci %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 8608c2ecf20Sopenharmony_ci %xmm15, %rax, %rcx, 0); 8618c2ecf20Sopenharmony_ci 8628c2ecf20Sopenharmony_ci /* load CD for output */ 8638c2ecf20Sopenharmony_ci vmovdqu 0 * 16(%rcx), %xmm8; 8648c2ecf20Sopenharmony_ci vmovdqu 1 * 16(%rcx), %xmm9; 8658c2ecf20Sopenharmony_ci vmovdqu 2 * 16(%rcx), %xmm10; 8668c2ecf20Sopenharmony_ci vmovdqu 3 * 16(%rcx), %xmm11; 8678c2ecf20Sopenharmony_ci vmovdqu 4 * 16(%rcx), %xmm12; 8688c2ecf20Sopenharmony_ci vmovdqu 5 * 16(%rcx), %xmm13; 8698c2ecf20Sopenharmony_ci vmovdqu 6 * 16(%rcx), %xmm14; 8708c2ecf20Sopenharmony_ci vmovdqu 7 * 16(%rcx), %xmm15; 8718c2ecf20Sopenharmony_ci 8728c2ecf20Sopenharmony_ci outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 8738c2ecf20Sopenharmony_ci %xmm8, %xmm9, 
%xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 8748c2ecf20Sopenharmony_ci %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax)); 8758c2ecf20Sopenharmony_ci 8768c2ecf20Sopenharmony_ci FRAME_END 8778c2ecf20Sopenharmony_ci RET; 8788c2ecf20Sopenharmony_ci 8798c2ecf20Sopenharmony_ci.align 8 8808c2ecf20Sopenharmony_ci.Ldec_max32: 8818c2ecf20Sopenharmony_ci dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 8828c2ecf20Sopenharmony_ci %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 8838c2ecf20Sopenharmony_ci %xmm15, %rax, %rcx, 24); 8848c2ecf20Sopenharmony_ci 8858c2ecf20Sopenharmony_ci fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 8868c2ecf20Sopenharmony_ci %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 8878c2ecf20Sopenharmony_ci %xmm15, 8888c2ecf20Sopenharmony_ci ((key_table + (24) * 8) + 8)(CTX), 8898c2ecf20Sopenharmony_ci ((key_table + (24) * 8) + 12)(CTX), 8908c2ecf20Sopenharmony_ci ((key_table + (24) * 8) + 0)(CTX), 8918c2ecf20Sopenharmony_ci ((key_table + (24) * 8) + 4)(CTX)); 8928c2ecf20Sopenharmony_ci 8938c2ecf20Sopenharmony_ci jmp .Ldec_max24; 8948c2ecf20Sopenharmony_ciSYM_FUNC_END(__camellia_dec_blk16) 8958c2ecf20Sopenharmony_ci 8968c2ecf20Sopenharmony_ciSYM_FUNC_START(camellia_ecb_enc_16way) 8978c2ecf20Sopenharmony_ci /* input: 8988c2ecf20Sopenharmony_ci * %rdi: ctx, CTX 8998c2ecf20Sopenharmony_ci * %rsi: dst (16 blocks) 9008c2ecf20Sopenharmony_ci * %rdx: src (16 blocks) 9018c2ecf20Sopenharmony_ci */ 9028c2ecf20Sopenharmony_ci FRAME_BEGIN 9038c2ecf20Sopenharmony_ci 9048c2ecf20Sopenharmony_ci inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 9058c2ecf20Sopenharmony_ci %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 9068c2ecf20Sopenharmony_ci %xmm15, %rdx, (key_table)(CTX)); 9078c2ecf20Sopenharmony_ci 9088c2ecf20Sopenharmony_ci /* now dst can be used as temporary buffer (even in src == dst case) */ 9098c2ecf20Sopenharmony_ci movq %rsi, %rax; 9108c2ecf20Sopenharmony_ci 9118c2ecf20Sopenharmony_ci 
call __camellia_enc_blk16; 9128c2ecf20Sopenharmony_ci 9138c2ecf20Sopenharmony_ci write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, 9148c2ecf20Sopenharmony_ci %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, 9158c2ecf20Sopenharmony_ci %xmm8, %rsi); 9168c2ecf20Sopenharmony_ci 9178c2ecf20Sopenharmony_ci FRAME_END 9188c2ecf20Sopenharmony_ci RET; 9198c2ecf20Sopenharmony_ciSYM_FUNC_END(camellia_ecb_enc_16way) 9208c2ecf20Sopenharmony_ci 9218c2ecf20Sopenharmony_ciSYM_FUNC_START(camellia_ecb_dec_16way) 9228c2ecf20Sopenharmony_ci /* input: 9238c2ecf20Sopenharmony_ci * %rdi: ctx, CTX 9248c2ecf20Sopenharmony_ci * %rsi: dst (16 blocks) 9258c2ecf20Sopenharmony_ci * %rdx: src (16 blocks) 9268c2ecf20Sopenharmony_ci */ 9278c2ecf20Sopenharmony_ci FRAME_BEGIN 9288c2ecf20Sopenharmony_ci 9298c2ecf20Sopenharmony_ci cmpl $16, key_length(CTX); 9308c2ecf20Sopenharmony_ci movl $32, %r8d; 9318c2ecf20Sopenharmony_ci movl $24, %eax; 9328c2ecf20Sopenharmony_ci cmovel %eax, %r8d; /* max */ 9338c2ecf20Sopenharmony_ci 9348c2ecf20Sopenharmony_ci inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 9358c2ecf20Sopenharmony_ci %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 9368c2ecf20Sopenharmony_ci %xmm15, %rdx, (key_table)(CTX, %r8, 8)); 9378c2ecf20Sopenharmony_ci 9388c2ecf20Sopenharmony_ci /* now dst can be used as temporary buffer (even in src == dst case) */ 9398c2ecf20Sopenharmony_ci movq %rsi, %rax; 9408c2ecf20Sopenharmony_ci 9418c2ecf20Sopenharmony_ci call __camellia_dec_blk16; 9428c2ecf20Sopenharmony_ci 9438c2ecf20Sopenharmony_ci write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, 9448c2ecf20Sopenharmony_ci %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, 9458c2ecf20Sopenharmony_ci %xmm8, %rsi); 9468c2ecf20Sopenharmony_ci 9478c2ecf20Sopenharmony_ci FRAME_END 9488c2ecf20Sopenharmony_ci RET; 9498c2ecf20Sopenharmony_ciSYM_FUNC_END(camellia_ecb_dec_16way) 9508c2ecf20Sopenharmony_ci 
SYM_FUNC_START(camellia_cbc_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *
	 * Decrypts 16 blocks and XORs plaintext blocks 1..15 with ciphertext
	 * blocks 0..14 read from src.  Block 0 (%xmm7) is written without
	 * any XOR; NOTE(review): presumably the caller applies the IV to
	 * it - confirm against the C glue code.
	 */
	FRAME_BEGIN

	/* %r8d = 24 for a 16 byte key, 32 otherwise (mov leaves the flags alone) */
	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	/* decryption starts from the last whitening key, key_table[%r8] */
	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/*
	 * dst might still be in-use (in case dst == src), so use stack for
	 * temporary storage.
	 */
	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	call __camellia_dec_blk16;

	addq $(16 * 16), %rsp;

	/*
	 * CBC chaining: plaintext[i] ^= ciphertext[i - 1] for i = 1..15.
	 * The decrypted blocks come back in swapped register order
	 * (%xmm7 = block 0, %xmm6 = block 1, ..., %xmm8 = block 15).
	 */
	vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
	vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
	vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
	vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
	vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
	vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
	vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
	vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
	vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
	vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
	vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
	vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
	vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
	vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
	vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	RET;
SYM_FUNC_END(camellia_cbc_dec_16way)

/*
 * Increment the 128-bit little-endian integer in x by one.
 *
 * IN:
 *   x:         128-bit value, low qword in bits 0..63
 *   minus_one: register with low qword = -1, high qword = 0
 * OUT:
 *   x:   x + 1
 *   tmp: clobbered
 *
 * tmp becomes all-ones in the low qword when the low qword of x is -1,
 * i.e. when the increment will wrap; shifting that mask into the high
 * qword and subtracting it propagates the carry.
 */
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

SYM_FUNC_START(camellia_ctr_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (little endian, 128bit)
	 *
	 * Encrypts the 16 counter values iv, iv + 1, ..., iv + 15, XORs the
	 * result with src, writes it to dst and stores iv + 16 back to (%rcx).
	 */
	FRAME_BEGIN

	/* 256 bytes of stack serve as counter spill area and cipher scratch */
	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	vmovdqa .Lbswap128_mask(%rip), %xmm14;

	/* load IV and byteswap */
	vmovdqu (%rcx), %xmm0;
	vpshufb %xmm14, %xmm0, %xmm15;
	vmovdqu %xmm15, 15 * 16(%rax);

	vpcmpeqd %xmm15, %xmm15, %xmm15;
	vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */

	/*
	 * construct IVs: %xmm0 keeps the running little-endian counter, each
	 * byteswapped copy goes to its slot (the first three are spilled to
	 * the stack because %xmm13..%xmm15 are still in use here)
	 */
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm13;
	vmovdqu %xmm13, 14 * 16(%rax);
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm13;
	vmovdqu %xmm13, 13 * 16(%rax);
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm12;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm11;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm10;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm9;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm8;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm7;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm6;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm5;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm4;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm3;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm2;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm1;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vmovdqa %xmm0, %xmm13;
	vpshufb %xmm14, %xmm0, %xmm0;
	/* one more increment gives the IV for the next call (iv + 16) */
	inc_le128(%xmm13, %xmm15, %xmm14);
	vmovdqu %xmm13, (%rcx);

	/* inpack16_pre: */
	vmovq (key_table)(CTX), %xmm15;
	vpshufb .Lpack_bswap(%rip), %xmm15, %xmm15;
	vpxor %xmm0, %xmm15, %xmm0;
	vpxor %xmm1, %xmm15, %xmm1;
	vpxor %xmm2, %xmm15, %xmm2;
	vpxor %xmm3, %xmm15, %xmm3;
	vpxor %xmm4, %xmm15, %xmm4;
	vpxor %xmm5, %xmm15, %xmm5;
	vpxor %xmm6, %xmm15, %xmm6;
	vpxor %xmm7, %xmm15, %xmm7;
	vpxor %xmm8, %xmm15, %xmm8;
	vpxor %xmm9, %xmm15, %xmm9;
	vpxor %xmm10, %xmm15, %xmm10;
	vpxor %xmm11, %xmm15, %xmm11;
	vpxor %xmm12, %xmm15, %xmm12;
	vpxor 13 * 16(%rax), %xmm15, %xmm13;
	vpxor 14 * 16(%rax), %xmm15, %xmm14;
	vpxor 15 * 16(%rax), %xmm15, %xmm15;

	call __camellia_enc_blk16;

	addq $(16 * 16), %rsp;

	/* keystream ^ src; register order undoes the cipher's block swap */
	vpxor 0 * 16(%rdx), %xmm7, %xmm7;
	vpxor 1 * 16(%rdx), %xmm6, %xmm6;
	vpxor 2 * 16(%rdx), %xmm5, %xmm5;
	vpxor 3 * 16(%rdx), %xmm4, %xmm4;
	vpxor 4 * 16(%rdx), %xmm3, %xmm3;
	vpxor 5 * 16(%rdx), %xmm2, %xmm2;
	vpxor 6 * 16(%rdx), %xmm1, %xmm1;
	vpxor 7 * 16(%rdx), %xmm0, %xmm0;
	vpxor 8 * 16(%rdx), %xmm15, %xmm15;
	vpxor 9 * 16(%rdx), %xmm14, %xmm14;
	vpxor 10 * 16(%rdx), %xmm13, %xmm13;
	vpxor 11 * 16(%rdx), %xmm12, %xmm12;
	vpxor 12 * 16(%rdx), %xmm11, %xmm11;
	vpxor 13 * 16(%rdx), %xmm10, %xmm10;
	vpxor 14 * 16(%rdx), %xmm9, %xmm9;
	vpxor 15 * 16(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	RET;
SYM_FUNC_END(camellia_ctr_16way)

/*
 * Multiply the 128-bit XTS tweak in iv by x (shift left by one bit with
 * polynomial reduction).
 *
 * IN:
 *   iv:   tweak
 *   mask: .Lxts_gf128mul_and_shl1_mask (0x87 in the low qword, 1 in the
 *         high qword)
 * OUT:
 *   iv:  tweak * x
 *   tmp: clobbered
 *
 * vpaddq shifts each qword left by one.  The sign masks of dwords 3 and 1
 * (bits 127 and 63) are moved by vpshufd onto the low and high qword
 * respectively, where they select the 0x87 reduction constant and the
 * carry bit from the low into the high qword.
 */
#define gf128mul_x_ble(iv, mask, tmp) \
	vpsrad $31, iv, tmp; \
	vpaddq iv, iv, iv; \
	vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \
	vpxor tmp, iv, iv;

.align 8
SYM_FUNC_START_LOCAL(camellia_xts_crypt_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 *	%r8: index for input whitening key
	 *	%r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16
	 *
	 * The 16 tweaks are parked in dst while the cipher runs and are
	 * XORed back in afterwards; the tweak for the next call is stored
	 * to (%rcx).
	 */
	FRAME_BEGIN

	/* 256 bytes of stack serve as spill area and cipher scratch */
	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	vmovdqa .Lxts_gf128mul_and_shl1_mask(%rip), %xmm14;

	/* load IV */
	vmovdqu (%rcx), %xmm0;
	vpxor 0 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 15 * 16(%rax);
	vmovdqu %xmm0, 0 * 16(%rsi);

	/*
	 * construct IVs: each src block is read before the matching dst slot
	 * is overwritten with its tweak, so src == dst is handled
	 */
	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 1 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 14 * 16(%rax);
	vmovdqu %xmm0, 1 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 2 * 16(%rdx), %xmm0, %xmm13;
	vmovdqu %xmm0, 2 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 3 * 16(%rdx), %xmm0, %xmm12;
	vmovdqu %xmm0, 3 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 4 * 16(%rdx), %xmm0, %xmm11;
	vmovdqu %xmm0, 4 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 5 * 16(%rdx), %xmm0, %xmm10;
	vmovdqu %xmm0, 5 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 6 * 16(%rdx), %xmm0, %xmm9;
	vmovdqu %xmm0, 6 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 7 * 16(%rdx), %xmm0, %xmm8;
	vmovdqu %xmm0, 7 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 8 * 16(%rdx), %xmm0, %xmm7;
	vmovdqu %xmm0, 8 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 9 * 16(%rdx), %xmm0, %xmm6;
	vmovdqu %xmm0, 9 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 10 * 16(%rdx), %xmm0, %xmm5;
	vmovdqu %xmm0, 10 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 11 * 16(%rdx), %xmm0, %xmm4;
	vmovdqu %xmm0, 11 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 12 * 16(%rdx), %xmm0, %xmm3;
	vmovdqu %xmm0, 12 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 13 * 16(%rdx), %xmm0, %xmm2;
	vmovdqu %xmm0, 13 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 14 * 16(%rdx), %xmm0, %xmm1;
	vmovdqu %xmm0, 14 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 15 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 0 * 16(%rax);
	vmovdqu %xmm0, 15 * 16(%rsi);

	/* tweak for the block following this batch */
	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vmovdqu %xmm0, (%rcx);

	/* inpack16_pre: */
	vmovq (key_table)(CTX, %r8, 8), %xmm15;
	vpshufb .Lpack_bswap(%rip), %xmm15, %xmm15;
	vpxor 0 * 16(%rax), %xmm15, %xmm0;
	vpxor %xmm1, %xmm15, %xmm1;
	vpxor %xmm2, %xmm15, %xmm2;
	vpxor %xmm3, %xmm15, %xmm3;
	vpxor %xmm4, %xmm15, %xmm4;
	vpxor %xmm5, %xmm15, %xmm5;
	vpxor %xmm6, %xmm15, %xmm6;
	vpxor %xmm7, %xmm15, %xmm7;
	vpxor %xmm8, %xmm15, %xmm8;
	vpxor %xmm9, %xmm15, %xmm9;
	vpxor %xmm10, %xmm15, %xmm10;
	vpxor %xmm11, %xmm15, %xmm11;
	vpxor %xmm12, %xmm15, %xmm12;
	vpxor %xmm13, %xmm15, %xmm13;
	vpxor 14 * 16(%rax), %xmm15, %xmm14;
	vpxor 15 * 16(%rax), %xmm15, %xmm15;

	CALL_NOSPEC r9;

	addq $(16 * 16), %rsp;

	/* XOR with the tweaks parked in dst; order undoes the block swap */
	vpxor 0 * 16(%rsi), %xmm7, %xmm7;
	vpxor 1 * 16(%rsi), %xmm6, %xmm6;
	vpxor 2 * 16(%rsi), %xmm5, %xmm5;
	vpxor 3 * 16(%rsi), %xmm4, %xmm4;
	vpxor 4 * 16(%rsi), %xmm3, %xmm3;
	vpxor 5 * 16(%rsi), %xmm2, %xmm2;
	vpxor 6 * 16(%rsi), %xmm1, %xmm1;
	vpxor 7 * 16(%rsi), %xmm0, %xmm0;
	vpxor 8 * 16(%rsi), %xmm15, %xmm15;
	vpxor 9 * 16(%rsi), %xmm14, %xmm14;
	vpxor 10 * 16(%rsi), %xmm13, %xmm13;
	vpxor 11 * 16(%rsi), %xmm12, %xmm12;
	vpxor 12 * 16(%rsi), %xmm11, %xmm11;
	vpxor 13 * 16(%rsi), %xmm10, %xmm10;
	vpxor 14 * 16(%rsi), %xmm9, %xmm9;
	vpxor 15 * 16(%rsi), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	RET;
SYM_FUNC_END(camellia_xts_crypt_16way)

SYM_FUNC_START(camellia_xts_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 *
	 * Sets up %r8/%r9 and tail-jumps into camellia_xts_crypt_16way.
	 */
	xorl %r8d, %r8d; /* input whitening key, 0 for enc */

	leaq __camellia_enc_blk16(%rip), %r9;

	jmp camellia_xts_crypt_16way;
SYM_FUNC_END(camellia_xts_enc_16way)

SYM_FUNC_START(camellia_xts_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 *
	 * Sets up %r8/%r9 and tail-jumps into camellia_xts_crypt_16way.
	 * Clobbers %eax.
	 */

	/* %r8d = 24 for a 16 byte key, 32 otherwise (mov leaves the flags alone) */
	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d;  /* input whitening key, last for dec */

	leaq __camellia_dec_blk16(%rip), %r9;

	jmp camellia_xts_crypt_16way;
SYM_FUNC_END(camellia_xts_dec_16way)