/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM3 AVX accelerated transform.
 * specified in: https://datatracker.ietf.org/doc/html/draft-sca-cfrg-sm3-02
 *
 * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (C) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

/* Based on SM3 AES/BMI2 accelerated work by libgcrypt at:
 *  https://gnupg.org/software/libgcrypt/index.html
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/frame.h>

/* Context structure */

/* Byte offsets of the eight 32-bit chaining words, relative to RSTATE. */
#define state_h0 0
#define state_h1 4
#define state_h2 8
#define state_h3 12
#define state_h4 16
#define state_h5 20
#define state_h6 24
#define state_h7 28

/* Constants */

/* Round constant macros */

/*
 * One constant per round.  Going by the hex annotations, K0..K15 are
 * 0x79cc4519 rotated left by the round number and K16 onwards are
 * 0x7a879d8a rotated left by (round number mod 32).
 *
 * The round macro pastes K<round> in as the displacement of an leal, so
 * values with bit 31 set are spelled as negative decimals; the hex comment
 * next to each one shows the same 32-bit pattern.
 */
#define K0   2043430169  /* 0x79cc4519 */
#define K1   -208106958  /* 0xf3988a32 */
#define K2   -416213915  /* 0xe7311465 */
#define K3   -832427829  /* 0xce6228cb */
#define K4  -1664855657  /* 0x9cc45197 */
#define K5    965255983  /* 0x3988a32f */
#define K6   1930511966  /* 0x7311465e */
#define K7   -433943364  /* 0xe6228cbc */
#define K8   -867886727  /* 0xcc451979 */
#define K9  -1735773453  /* 0x988a32f3 */
#define K10   823420391  /* 0x311465e7 */
#define K11  1646840782  /* 0x6228cbce */
#define K12 -1001285732  /* 0xc451979c */
#define K13 -2002571463  /* 0x88a32f39 */
#define K14   289824371  /* 0x11465e73 */
#define K15   579648742  /* 0x228cbce6 */
#define K16 -1651869049  /* 0x9d8a7a87 */
#define K17   991229199  /* 0x3b14f50f */
#define K18  1982458398  /* 0x7629ea1e */
#define K19  -330050500  /* 0xec53d43c */
#define K20  -660100999  /* 0xd8a7a879 */
#define K21 -1320201997  /* 0xb14f50f3 */
#define K22  1654563303  /* 0x629ea1e7 */
#define K23  -985840690  /* 0xc53d43ce */
#define K24 -1971681379  /* 0x8a7a879d */
#define K25   351604539  /* 0x14f50f3b */
#define K26   703209078  /* 0x29ea1e76 */
#define K27  1406418156  /* 0x53d43cec */
#define K28 -1482130984  /* 0xa7a879d8 */
#define K29  1330705329  /* 0x4f50f3b1 */
#define K30 -1633556638  /* 0x9ea1e762 */
#define K31  1027854021  /* 0x3d43cec5 */
#define K32  2055708042  /* 0x7a879d8a */
#define K33  -183551212  /* 0xf50f3b14 */
#define K34  -367102423  /* 0xea1e7629 */
#define K35  -734204845  /* 0xd43cec53 */
#define K36 -1468409689  /* 0xa879d8a7 */
#define K37  1358147919  /* 0x50f3b14f */
#define K38 -1578671458  /* 0xa1e7629e */
#define K39  1137624381  /* 0x43cec53d */
#define K40 -2019718534  /* 0x879d8a7a */
#define K41   255530229  /* 0x0f3b14f5 */
#define K42   511060458  /* 0x1e7629ea */
#define K43  1022120916  /* 0x3cec53d4 */
#define K44  2044241832  /* 0x79d8a7a8 */
#define K45  -206483632  /* 0xf3b14f50 */
#define K46  -412967263  /* 0xe7629ea1 */
#define K47  -825934525  /* 0xcec53d43 */
/* K48..K63 carry the same values as K16..K31 (the rotate count wraps at 32). */
#define K48 -1651869049  /* 0x9d8a7a87 */
#define K49   991229199  /* 0x3b14f50f */
#define K50  1982458398  /* 0x7629ea1e */
#define K51  -330050500  /* 0xec53d43c */
#define K52  -660100999  /* 0xd8a7a879 */
#define K53 -1320201997  /* 0xb14f50f3 */
#define K54  1654563303  /* 0x629ea1e7 */
#define K55  -985840690  /* 0xc53d43ce */
#define K56 -1971681379  /* 0x8a7a879d */
#define K57   351604539  /* 0x14f50f3b */
#define K58   703209078  /* 0x29ea1e76 */
#define K59  1406418156  /* 0x53d43cec */
#define K60 -1482130984  /* 0xa7a879d8 */
#define K61  1330705329  /* 0x4f50f3b1 */
#define K62 -1633556638  /* 0x9ea1e762 */
#define K63  1027854021  /* 0x3d43cec5 */

/* Register macros */

/*
 * The three arguments of sm3_transform_avx() stay in the registers they
 * arrive in (%rdi: state, %rsi: data, %rdx: nblocks).
 */
#define RSTATE %rdi
#define RDATA  %rsi
#define RNBLKS %rdx

#define t0 %eax
#define t1 %ebx
#define t2 %ecx

/*
 * The eight 32-bit working variables, loaded from state_h0..state_h7.
 * Together with t1 (%ebx), e..h live in callee-saved registers, which is
 * why sm3_transform_avx spills %rbx and %r12-%r15 on entry.
 */
#define a %r8d
#define b %r9d
#define c %r10d
#define d %r11d
#define e %r12d
#define f %r13d
#define g %r14d
#define h %r15d

/* Sliding window of message words, three words per register. */
#define W0 %xmm0
#define W1 %xmm1
#define W2 %xmm2
#define W3 %xmm3
#define W4 %xmm4
#define W5 %xmm5

/* Vector scratch registers. */
#define XTMP0 %xmm6
#define XTMP1 %xmm7
#define XTMP2 %xmm8
#define XTMP3 %xmm9
#define XTMP4 %xmm10
#define XTMP5 %xmm11
#define XTMP6 %xmm12

/* Holds the .Lbe32mask byte shuffle for the whole call. */
#define BSWAP_REG %xmm15

/* Stack structure */

/*
 * Frame layout, relative to the 64-byte aligned %rsp:
 *   STACK_W        three 64-byte slots; each slot keeps 16 bytes of W1 at
 *                  offset 0 and 16 bytes of W1^W2 at offset 32
 *   STACK_REG_SAVE spill area for %rbx and %r12-%r15
 */
#define STACK_W_SIZE        (32 * 2 * 3)
#define STACK_REG_SAVE_SIZE (64)

#define STACK_W             (0)
#define STACK_REG_SAVE      (STACK_W + STACK_W_SIZE)
#define STACK_SIZE          (STACK_REG_SAVE + STACK_REG_SAVE_SIZE)

/* Instruction helpers. */

/* Rotate reg left by v bits, in place. */
#define roll2(v, reg) \
	roll $(v), reg;

/* dst = rol(src, v), using mov + rol. */
#define roll3mov(v, src, dst) \
	movl src, dst; \
	roll $(v), dst;

/* dst = rol(src, v), using a single rorx (BMI2); flags are left alone. */
#define roll3(v, src, dst) \
	rorxl $(32-(v)), src, dst;

/* out += a, using lea so the flags are left alone. */
#define addl2(a, out) \
	leal (a, out), out;

/* Round function macros. */

/* Rounds 0-15: o = x ^ y ^ z (t is unused). */
#define GG1(x, y, z, o, t) \
	movl x, o; \
	xorl y, o; \
	xorl z, o;

#define FF1(x, y, z, o, t) GG1(x, y, z, o, t)

/*
 * Rounds 16-63: o = (x & y) | (~x & z).  The two terms never share a set
 * bit, so they are combined with an add instead of an or.  Needs BMI1
 * (andn).
 */
#define GG2(x, y, z, o, t) \
	andnl z, x, o; \
	movl y, t; \
	andl x, t; \
	addl2(t, o);

/* Rounds 16-63: o = ((x ^ y) & z) ^ (x & y), the bitwise majority. */
#define FF2(x, y, z, o, t) \
	movl y, o; \
	xorl x, o; \
	movl y, t; \
	andl x, t; \
	andl z, o; \
	xorl t, o;

/*
 * One SM3 round.
 *
 *   i      selects FF1/GG1 or FF2/GG2
 *   a..h   working variables for this round; the call sites permute the
 *          arguments instead of moving registers around
 *   round  round number, pasted onto K to select the round constant
 *   widx   index of the word inside the stack slot
 *   wtype  IW or XW, selects the address macro family
 *
 * On exit the new A value is in d, the new E value is in h, and b and f
 * have been rotated in place.  Clobbers t0, t1, t2 and the flags.
 */
#define R(i, a, b, c, d, e, f, g, h, round, widx, wtype) \
	/* rol(a, 12) => t0 */ \
	roll3mov(12, a, t0); /* rorxl here would reduce perf by 6% on zen3 */ \
	/* rol((t0 + e + K), 7) => t1 */ \
	leal K##round(t0, e, 1), t1; \
	roll2(7, t1); \
	/* h + w1 => h */ \
	addl wtype##_W1_ADDR(round, widx), h; \
	/* h + t1 => h */ \
	addl2(t1, h); \
	/* t1 ^ t0 => t0 */ \
	xorl t1, t0; \
	/* w1w2 + d => d */ \
	addl wtype##_W1W2_ADDR(round, widx), d; \
	/* FF##i(a,b,c) => t1 */ \
	FF##i(a, b, c, t1, t2); \
	/* d + t1 => d */ \
	addl2(t1, d); \
	/* GG##i(e,f,g) => t2 */ \
	GG##i(e, f, g, t2, t1); \
	/* h + t2 => h */ \
	addl2(t2, h); \
	/* rol (f, 19) => f */ \
	roll2(19, f); \
	/* d + t0 => d */ \
	addl2(t0, d); \
	/* rol (b, 9) => b */ \
	roll2(9, b); \
	/* P0(h) => h */ \
	roll3(9, h, t2); \
	roll3(17, h, t1); \
	xorl t2, h; \
	xorl t1, h;

#define R1(a, b, c, d, e, f, g, h, round, widx, wtype) \
	R(1, a, b, c, d, e, f, g, h, round, widx, wtype)

#define R2(a, b, c, d, e, f, g, h, round, widx, wtype) \
	R(2, a, b, c, d, e, f, g, h, round, widx, wtype)

/* Input expansion macros. */

/* Byte-swapped input address: one 64-byte slot per four rounds. */
#define IW_W_ADDR(round, widx, offs) \
	(STACK_W + ((round) / 4) * 64 + (offs) + ((widx) * 4))(%rsp)

/* Expanded input address: slots 0 and 1 alternate every three rounds. */
#define XW_W_ADDR(round, widx, offs) \
	(STACK_W + ((((round) / 3) - 4) % 2) * 64 + (offs) + ((widx) * 4))(%rsp)

/* Rounds 0-11, byte-swapped input block addresses. */
#define IW_W1_ADDR(round, widx)   IW_W_ADDR(round, widx, 0)
#define IW_W1W2_ADDR(round, widx) IW_W_ADDR(round, widx, 32)

/* Rounds 12-63, expanded input block addresses. */
#define XW_W1_ADDR(round, widx)   XW_W_ADDR(round, widx, 0)
#define XW_W1W2_ADDR(round, widx) XW_W_ADDR(round, widx, 32)

/* Input block loading. */

/*
 * Load one 64-byte block, byte-swap each word, advance RDATA by 64 and
 * store W1 and W1^W2 for rounds 0-7.  XTMP2, XTMP3 and XTMP6 are left
 * live for LOAD_W_XMM_2 and LOAD_W_XMM_3.
 */
#define LOAD_W_XMM_1() \
	vmovdqu 0*16(RDATA), XTMP0; /* XTMP0: w3, w2, w1, w0 */ \
	vmovdqu 1*16(RDATA), XTMP1; /* XTMP1: w7, w6, w5, w4 */ \
	vmovdqu 2*16(RDATA), XTMP2; /* XTMP2: w11, w10, w9, w8 */ \
	vmovdqu 3*16(RDATA), XTMP3; /* XTMP3: w15, w14, w13, w12 */ \
	vpshufb BSWAP_REG, XTMP0, XTMP0; \
	vpshufb BSWAP_REG, XTMP1, XTMP1; \
	vpshufb BSWAP_REG, XTMP2, XTMP2; \
	vpshufb BSWAP_REG, XTMP3, XTMP3; \
	vpxor XTMP0, XTMP1, XTMP4; \
	vpxor XTMP1, XTMP2, XTMP5; \
	vpxor XTMP2, XTMP3, XTMP6; \
	leaq 64(RDATA), RDATA; \
	vmovdqa XTMP0, IW_W1_ADDR(0, 0); \
	vmovdqa XTMP4, IW_W1W2_ADDR(0, 0); \
	vmovdqa XTMP1, IW_W1_ADDR(4, 0); \
	vmovdqa XTMP5, IW_W1W2_ADDR(4, 0);

/* Store W1 and W1^W2 for rounds 8-11. */
#define LOAD_W_XMM_2() \
	vmovdqa XTMP2, IW_W1_ADDR(8, 0); \
	vmovdqa XTMP6, IW_W1W2_ADDR(8, 0);

/* Repack the sixteen input words into W0..W5 for the scheduler. */
#define LOAD_W_XMM_3() \
	vpshufd $0b00000000, XTMP0, W0; /* W0: xx, w0, xx, xx */ \
	vpshufd $0b11111001, XTMP0, W1; /* W1: xx, w3, w2, w1 */ \
	vmovdqa XTMP1, W2;              /* W2: xx, w6, w5, w4 */ \
	vpalignr $12, XTMP1, XTMP2, W3; /* W3: xx, w9, w8, w7 */ \
	vpalignr $8, XTMP2, XTMP3, W4;  /* W4: xx, w12, w11, w10 */ \
	vpshufd $0b11111001, XTMP3, W5; /* W5: xx, w15, w14, w13 */

/*
 * Message scheduling. Note: 3 words per XMM register.
 *
 * SCHED_W_0, SCHED_W_1 and SCHED_W_2 are three stages of one computation
 * and must be issued in that order with the same arguments: XTMP0 and
 * XTMP1 carry intermediate values from one stage to the next.
 */
#define SCHED_W_0(round, w0, w1, w2, w3, w4, w5) \
	/* Load (w[i - 16]) => XTMP0 */ \
	vpshufd $0b10111111, w0, XTMP0; \
	vpalignr $12, XTMP0, w1, XTMP0; /* XTMP0: xx, w2, w1, w0 */ \
	/* Load (w[i - 13]) => XTMP1 */ \
	vpshufd $0b10111111, w1, XTMP1; \
	vpalignr $12, XTMP1, w2, XTMP1; \
	/* w[i - 9] == w3 */ \
	/* w3 ^ XTMP0 => XTMP0 */ \
	vpxor w3, XTMP0, XTMP0;

#define SCHED_W_1(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 3] == w5 */ \
	/* rol(w5, 15) ^ XTMP0 => XTMP0 */ \
	vpslld $15, w5, XTMP2; \
	vpsrld $(32-15), w5, XTMP3; \
	vpxor XTMP2, XTMP3, XTMP3; \
	vpxor XTMP3, XTMP0, XTMP0; \
	/* rol(XTMP1, 7) => XTMP1 */ \
	vpslld $7, XTMP1, XTMP5; \
	vpsrld $(32-7), XTMP1, XTMP1; \
	vpxor XTMP5, XTMP1, XTMP1; \
	/* w[i - 6] == w4 */ \
	/* w4 ^ XTMP1 => XTMP1 */ \
	vpxor w4, XTMP1, XTMP1; \
	/* P1(XTMP0) ^ XTMP1 => w0, P1(x) = x ^ rol(x, 15) ^ rol(x, 23) */ \
	vpslld $15, XTMP0, XTMP5; \
	vpsrld $(32-15), XTMP0, XTMP6; \
	vpslld $23, XTMP0, XTMP2; \
	vpsrld $(32-23), XTMP0, XTMP3; \
	vpxor XTMP0, XTMP1, XTMP1; \
	vpxor XTMP6, XTMP5, XTMP5; \
	vpxor XTMP3, XTMP2, XTMP2; \
	vpxor XTMP2, XTMP5, XTMP5; \
	vpxor XTMP5, XTMP1, w0;

#define SCHED_W_2(round, w0, w1, w2, w3, w4, w5) \
	/* W1 => XTMP4 */ \
	vpshufd $0b10111111, w4, XTMP4; \
	vpalignr $12, XTMP4, w5, XTMP4; \
	vmovdqa XTMP4, XW_W1_ADDR((round), 0); \
	/* W1 ^ W2 => XTMP1 */ \
	vpxor w0, XTMP4, XTMP1; \
	vmovdqa XTMP1, XW_W1W2_ADDR((round), 0);


.section	.rodata.cst16, "aM", @progbits, 16
.align 16

/* vpshufb control that reverses the bytes within each 32-bit lane. */
.Lbe32mask:
	.long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f

.text

/*
 * Transform nblocks*64 bytes (nblocks*16 32-bit words) at DATA.
 *
 * void sm3_transform_avx(struct sm3_state *state,
 *                        const u8 *data, int nblocks);
 *
 * Returns immediately, leaving *state untouched, when nblocks <= 0.
 * Uses AVX, BMI1 (andn) and BMI2 (rorx) instructions.  Every vector
 * register is zeroed and the message words spilled to the stack are wiped
 * before returning.
 */
SYM_TYPED_FUNC_START(sm3_transform_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: data (64*nblks bytes)
	 *	%rdx: nblocks
	 */

	/*
	 * nblocks is a C int, so only %edx is guaranteed to hold it; widen
	 * it before it is used as a 64-bit loop counter.  The block loop
	 * below tests its counter only after a block has been processed,
	 * so a count that is not positive has to be rejected here.
	 */
	movslq %edx, RNBLKS;
	testq RNBLKS, RNBLKS;
	jle .Lout;

	vzeroupper;

	pushq %rbp;
	movq %rsp, %rbp;

	/* Carve out the frame; 64-byte alignment keeps the vmovdqa spills legal. */
	subq $STACK_SIZE, %rsp;
	andq $(~63), %rsp;

	movq %rbx, (STACK_REG_SAVE + 0 * 8)(%rsp);
	movq %r15, (STACK_REG_SAVE + 1 * 8)(%rsp);
	movq %r14, (STACK_REG_SAVE + 2 * 8)(%rsp);
	movq %r13, (STACK_REG_SAVE + 3 * 8)(%rsp);
	movq %r12, (STACK_REG_SAVE + 4 * 8)(%rsp);

	vmovdqa .Lbe32mask(%rip), BSWAP_REG;

	/* Get the values of the chaining variables. */
	movl state_h0(RSTATE), a;
	movl state_h1(RSTATE), b;
	movl state_h2(RSTATE), c;
	movl state_h3(RSTATE), d;
	movl state_h4(RSTATE), e;
	movl state_h5(RSTATE), f;
	movl state_h6(RSTATE), g;
	movl state_h7(RSTATE), h;

.align 16
.Loop:
	/* Load data part1. */
	LOAD_W_XMM_1();

	leaq -1(RNBLKS), RNBLKS;

	/* Transform 0-3 + Load data part2. */
	R1(a, b, c, d, e, f, g, h, 0, 0, IW); LOAD_W_XMM_2();
	R1(d, a, b, c, h, e, f, g, 1, 1, IW);
	R1(c, d, a, b, g, h, e, f, 2, 2, IW);
	R1(b, c, d, a, f, g, h, e, 3, 3, IW); LOAD_W_XMM_3();

	/* Transform 4-7 + Precalc 12-14. */
	R1(a, b, c, d, e, f, g, h, 4, 0, IW);
	R1(d, a, b, c, h, e, f, g, 5, 1, IW);
	R1(c, d, a, b, g, h, e, f, 6, 2, IW); SCHED_W_0(12, W0, W1, W2, W3, W4, W5);
	R1(b, c, d, a, f, g, h, e, 7, 3, IW); SCHED_W_1(12, W0, W1, W2, W3, W4, W5);

	/* Transform 8-11 + Precalc 12-17. */
	R1(a, b, c, d, e, f, g, h, 8, 0, IW); SCHED_W_2(12, W0, W1, W2, W3, W4, W5);
	R1(d, a, b, c, h, e, f, g, 9, 1, IW); SCHED_W_0(15, W1, W2, W3, W4, W5, W0);
	R1(c, d, a, b, g, h, e, f, 10, 2, IW); SCHED_W_1(15, W1, W2, W3, W4, W5, W0);
	R1(b, c, d, a, f, g, h, e, 11, 3, IW); SCHED_W_2(15, W1, W2, W3, W4, W5, W0);

	/* Transform 12-14 + Precalc 18-20 */
	R1(a, b, c, d, e, f, g, h, 12, 0, XW); SCHED_W_0(18, W2, W3, W4, W5, W0, W1);
	R1(d, a, b, c, h, e, f, g, 13, 1, XW); SCHED_W_1(18, W2, W3, W4, W5, W0, W1);
	R1(c, d, a, b, g, h, e, f, 14, 2, XW); SCHED_W_2(18, W2, W3, W4, W5, W0, W1);

	/* Transform 15-17 + Precalc 21-23 */
	R1(b, c, d, a, f, g, h, e, 15, 0, XW); SCHED_W_0(21, W3, W4, W5, W0, W1, W2);
	R2(a, b, c, d, e, f, g, h, 16, 1, XW); SCHED_W_1(21, W3, W4, W5, W0, W1, W2);
	R2(d, a, b, c, h, e, f, g, 17, 2, XW); SCHED_W_2(21, W3, W4, W5, W0, W1, W2);

	/* Transform 18-20 + Precalc 24-26 */
	R2(c, d, a, b, g, h, e, f, 18, 0, XW); SCHED_W_0(24, W4, W5, W0, W1, W2, W3);
	R2(b, c, d, a, f, g, h, e, 19, 1, XW); SCHED_W_1(24, W4, W5, W0, W1, W2, W3);
	R2(a, b, c, d, e, f, g, h, 20, 2, XW); SCHED_W_2(24, W4, W5, W0, W1, W2, W3);

	/* Transform 21-23 + Precalc 27-29 */
	R2(d, a, b, c, h, e, f, g, 21, 0, XW); SCHED_W_0(27, W5, W0, W1, W2, W3, W4);
	R2(c, d, a, b, g, h, e, f, 22, 1, XW); SCHED_W_1(27, W5, W0, W1, W2, W3, W4);
	R2(b, c, d, a, f, g, h, e, 23, 2, XW); SCHED_W_2(27, W5, W0, W1, W2, W3, W4);

	/* Transform 24-26 + Precalc 30-32 */
	R2(a, b, c, d, e, f, g, h, 24, 0, XW); SCHED_W_0(30, W0, W1, W2, W3, W4, W5);
	R2(d, a, b, c, h, e, f, g, 25, 1, XW); SCHED_W_1(30, W0, W1, W2, W3, W4, W5);
	R2(c, d, a, b, g, h, e, f, 26, 2, XW); SCHED_W_2(30, W0, W1, W2, W3, W4, W5);

	/* Transform 27-29 + Precalc 33-35 */
	R2(b, c, d, a, f, g, h, e, 27, 0, XW); SCHED_W_0(33, W1, W2, W3, W4, W5, W0);
	R2(a, b, c, d, e, f, g, h, 28, 1, XW); SCHED_W_1(33, W1, W2, W3, W4, W5, W0);
	R2(d, a, b, c, h, e, f, g, 29, 2, XW); SCHED_W_2(33, W1, W2, W3, W4, W5, W0);

	/* Transform 30-32 + Precalc 36-38 */
	R2(c, d, a, b, g, h, e, f, 30, 0, XW); SCHED_W_0(36, W2, W3, W4, W5, W0, W1);
	R2(b, c, d, a, f, g, h, e, 31, 1, XW); SCHED_W_1(36, W2, W3, W4, W5, W0, W1);
	R2(a, b, c, d, e, f, g, h, 32, 2, XW); SCHED_W_2(36, W2, W3, W4, W5, W0, W1);

	/* Transform 33-35 + Precalc 39-41 */
	R2(d, a, b, c, h, e, f, g, 33, 0, XW); SCHED_W_0(39, W3, W4, W5, W0, W1, W2);
	R2(c, d, a, b, g, h, e, f, 34, 1, XW); SCHED_W_1(39, W3, W4, W5, W0, W1, W2);
	R2(b, c, d, a, f, g, h, e, 35, 2, XW); SCHED_W_2(39, W3, W4, W5, W0, W1, W2);

	/* Transform 36-38 + Precalc 42-44 */
	R2(a, b, c, d, e, f, g, h, 36, 0, XW); SCHED_W_0(42, W4, W5, W0, W1, W2, W3);
	R2(d, a, b, c, h, e, f, g, 37, 1, XW); SCHED_W_1(42, W4, W5, W0, W1, W2, W3);
	R2(c, d, a, b, g, h, e, f, 38, 2, XW); SCHED_W_2(42, W4, W5, W0, W1, W2, W3);

	/* Transform 39-41 + Precalc 45-47 */
	R2(b, c, d, a, f, g, h, e, 39, 0, XW); SCHED_W_0(45, W5, W0, W1, W2, W3, W4);
	R2(a, b, c, d, e, f, g, h, 40, 1, XW); SCHED_W_1(45, W5, W0, W1, W2, W3, W4);
	R2(d, a, b, c, h, e, f, g, 41, 2, XW); SCHED_W_2(45, W5, W0, W1, W2, W3, W4);

	/* Transform 42-44 + Precalc 48-50 */
	R2(c, d, a, b, g, h, e, f, 42, 0, XW); SCHED_W_0(48, W0, W1, W2, W3, W4, W5);
	R2(b, c, d, a, f, g, h, e, 43, 1, XW); SCHED_W_1(48, W0, W1, W2, W3, W4, W5);
	R2(a, b, c, d, e, f, g, h, 44, 2, XW); SCHED_W_2(48, W0, W1, W2, W3, W4, W5);

	/* Transform 45-47 + Precalc 51-53 */
	R2(d, a, b, c, h, e, f, g, 45, 0, XW); SCHED_W_0(51, W1, W2, W3, W4, W5, W0);
	R2(c, d, a, b, g, h, e, f, 46, 1, XW); SCHED_W_1(51, W1, W2, W3, W4, W5, W0);
	R2(b, c, d, a, f, g, h, e, 47, 2, XW); SCHED_W_2(51, W1, W2, W3, W4, W5, W0);

	/* Transform 48-50 + Precalc 54-56 */
	R2(a, b, c, d, e, f, g, h, 48, 0, XW); SCHED_W_0(54, W2, W3, W4, W5, W0, W1);
	R2(d, a, b, c, h, e, f, g, 49, 1, XW); SCHED_W_1(54, W2, W3, W4, W5, W0, W1);
	R2(c, d, a, b, g, h, e, f, 50, 2, XW); SCHED_W_2(54, W2, W3, W4, W5, W0, W1);

	/* Transform 51-53 + Precalc 57-59 */
	R2(b, c, d, a, f, g, h, e, 51, 0, XW); SCHED_W_0(57, W3, W4, W5, W0, W1, W2);
	R2(a, b, c, d, e, f, g, h, 52, 1, XW); SCHED_W_1(57, W3, W4, W5, W0, W1, W2);
	R2(d, a, b, c, h, e, f, g, 53, 2, XW); SCHED_W_2(57, W3, W4, W5, W0, W1, W2);

	/* Transform 54-56 + Precalc 60-62 */
	R2(c, d, a, b, g, h, e, f, 54, 0, XW); SCHED_W_0(60, W4, W5, W0, W1, W2, W3);
	R2(b, c, d, a, f, g, h, e, 55, 1, XW); SCHED_W_1(60, W4, W5, W0, W1, W2, W3);
	R2(a, b, c, d, e, f, g, h, 56, 2, XW); SCHED_W_2(60, W4, W5, W0, W1, W2, W3);

	/* Transform 57-59 + Precalc 63 */
	R2(d, a, b, c, h, e, f, g, 57, 0, XW); SCHED_W_0(63, W5, W0, W1, W2, W3, W4);
	R2(c, d, a, b, g, h, e, f, 58, 1, XW);
	R2(b, c, d, a, f, g, h, e, 59, 2, XW); SCHED_W_1(63, W5, W0, W1, W2, W3, W4);

	/* Transform 60-62 + Precalc 63 */
	R2(a, b, c, d, e, f, g, h, 60, 0, XW);
	R2(d, a, b, c, h, e, f, g, 61, 1, XW); SCHED_W_2(63, W5, W0, W1, W2, W3, W4);
	R2(c, d, a, b, g, h, e, f, 62, 2, XW);

	/* Transform 63 */
	R2(b, c, d, a, f, g, h, e, 63, 0, XW);

	/* Update the chaining variables. */
	xorl state_h0(RSTATE), a;
	xorl state_h1(RSTATE), b;
	xorl state_h2(RSTATE), c;
	xorl state_h3(RSTATE), d;
	movl a, state_h0(RSTATE);
	movl b, state_h1(RSTATE);
	movl c, state_h2(RSTATE);
	movl d, state_h3(RSTATE);
	xorl state_h4(RSTATE), e;
	xorl state_h5(RSTATE), f;
	xorl state_h6(RSTATE), g;
	xorl state_h7(RSTATE), h;
	movl e, state_h4(RSTATE);
	movl f, state_h5(RSTATE);
	movl g, state_h6(RSTATE);
	movl h, state_h7(RSTATE);

	/* Next block, if any remain. */
	testq RNBLKS, RNBLKS;
	jnz .Loop;

	vzeroall;

	movq (STACK_REG_SAVE + 0 * 8)(%rsp), %rbx;
	movq (STACK_REG_SAVE + 1 * 8)(%rsp), %r15;
	movq (STACK_REG_SAVE + 2 * 8)(%rsp), %r14;
	movq (STACK_REG_SAVE + 3 * 8)(%rsp), %r13;
	movq (STACK_REG_SAVE + 4 * 8)(%rsp), %r12;

	/*
	 * Wipe the message words left on the stack.  %xmm0 is zero after
	 * vzeroall, and these six stores cover every 16-byte area that the
	 * load and schedule macros write.
	 */
	vmovdqa %xmm0, IW_W1_ADDR(0, 0);
	vmovdqa %xmm0, IW_W1W2_ADDR(0, 0);
	vmovdqa %xmm0, IW_W1_ADDR(4, 0);
	vmovdqa %xmm0, IW_W1W2_ADDR(4, 0);
	vmovdqa %xmm0, IW_W1_ADDR(8, 0);
	vmovdqa %xmm0, IW_W1W2_ADDR(8, 0);

	movq %rbp, %rsp;
	popq %rbp;
.Lout:
	RET;
SYM_FUNC_END(sm3_transform_avx)