/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * Based on crypto/serpent.c by
 *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
 *                2003 Herbert Valerio Riedel <hvr@gnu.org>
 */

#include <linux/linkage.h>

.file "serpent-sse2-i586-asm_32.S"
.text

/*
 * Byte offsets of the stack-passed arguments, relative to %esp on function
 * entry (offset 0 holds the return address).
 */
#define arg_ctx 4
#define arg_dst 8
#define arg_src 12
#define arg_xor 16

/**********************************************************************
  4-way SSE2 serpent
 **********************************************************************/

/* Base pointer of the key words read by get_key(). */
#define CTX %edx

/*
 * RA..RE are the five working registers that the round macros receive as
 * x0..x4.  Four of them hold state and one is scratch; which physical
 * register plays which role changes from round to round (see the argument
 * lists in the function bodies below).
 */
#define RA %xmm0
#define RB %xmm1
#define RC %xmm2
#define RD %xmm3
#define RE %xmm4

/* Temporaries used for key words and by the 4x4 transpose. */
#define RT0 %xmm5
#define RT1 %xmm6

/*
 * All-ones constant (set with pcmpeqd at function entry); "pxor RNOT, x"
 * is therefore a bitwise NOT of x.
 */
#define RNOT %xmm7

/*
 * get_key(i, j, t): load the 32-bit word at index 4*i+j from CTX and
 * replicate it into all four dword lanes of t.
 */
#define get_key(i, j, t) \
	movd (4*(i)+(j))*4(CTX), t; \
	pshufd $0, t, t;

/*
 * K(x0, x1, x2, x3, x4, i): XOR key words 4*i+0 .. 4*i+3 into x0 .. x3.
 * Clobbers x4, RT0 and RT1.
 */
#define K(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, x4); \
	get_key(i, 1, RT0); \
	get_key(i, 2, RT1); \
	pxor x4, x0; \
	pxor RT0, x1; \
	pxor RT1, x2; \
	get_key(i, 3, x4); \
	pxor x4, x3;

/*
 * LK(x0, x1, x2, x3, x4, i): mixing step used by the encryption rounds,
 * followed by a XOR with key group i.  Each movdqa/pslld/psrld/por group
 * is a 32-bit rotate-left (by 13, 3, 1, 7, 5 and 22); the lone pslld $3
 * and pslld $7 are plain shifts.  Clobbers x4 and RT0.
 */
#define LK(x0, x1, x2, x3, x4, i) \
	movdqa x0, x4; \
	pslld $13, x0; \
	psrld $(32 - 13), x4; \
	por x4, x0; \
	pxor x0, x1; \
	movdqa x2, x4; \
	pslld $3, x2; \
	psrld $(32 - 3), x4; \
	por x4, x2; \
	pxor x2, x1; \
	movdqa x1, x4; \
	pslld $1, x1; \
	psrld $(32 - 1), x4; \
	por x4, x1; \
	movdqa x0, x4; \
	pslld $3, x4; \
	pxor x2, x3; \
	pxor x4, x3; \
	movdqa x3, x4; \
	pslld $7, x3; \
	psrld $(32 - 7), x4; \
	por x4, x3; \
	movdqa x1, x4; \
	pslld $7, x4; \
	pxor x1, x0; \
	pxor x3, x0; \
	pxor x3, x2; \
	pxor x4, x2; \
	movdqa x0, x4; \
	get_key(i, 1, RT0); \
	pxor RT0, x1; \
	get_key(i, 3, RT0); \
	pxor RT0, x3; \
	pslld $5, x0; \
	psrld $(32 - 5), x4; \
	por x4, x0; \
	movdqa x2, x4; \
	pslld $22, x2; \
	psrld $(32 - 22), x4; \
	por x4, x2; \
	get_key(i, 0, RT0); \
	pxor RT0, x0; \
	get_key(i, 2, RT0); \
	pxor RT0, x2;

/*
 * KL(x0, x1, x2, x3, x4, i): counterpart of LK used by the decryption
 * rounds.  The key XOR (K) comes first, then the mixing step is undone
 * with 32-bit rotate-rights (by 5, 22, 1, 7, 13 and 3).
 * Clobbers x4, RT0 and RT1.
 */
#define KL(x0, x1, x2, x3, x4, i) \
	K(x0, x1, x2, x3, x4, i); \
	movdqa x0, x4; \
	psrld $5, x0; \
	pslld $(32 - 5), x4; \
	por x4, x0; \
	movdqa x2, x4; \
	psrld $22, x2; \
	pslld $(32 - 22), x4; \
	por x4, x2; \
	pxor x3, x2; \
	pxor x3, x0; \
	movdqa x1, x4; \
	pslld $7, x4; \
	pxor x1, x0; \
	pxor x4, x2; \
	movdqa x1, x4; \
	psrld $1, x1; \
	pslld $(32 - 1), x4; \
	por x4, x1; \
	movdqa x3, x4; \
	psrld $7, x3; \
	pslld $(32 - 7), x4; \
	por x4, x3; \
	pxor x0, x1; \
	movdqa x0, x4; \
	pslld $3, x4; \
	pxor x4, x3; \
	movdqa x0, x4; \
	psrld $13, x0; \
	pslld $(32 - 13), x4; \
	por x4, x0; \
	pxor x2, x1; \
	pxor x2, x3; \
	movdqa x2, x4; \
	psrld $3, x2; \
	pslld $(32 - 3), x4; \
	por x4, x2;

/*
 * S0..S7: the S-box steps of the encryption rounds, written as straight-line
 * pand/por/pxor sequences over x0..x3.  x4 is overwritten (movdqa) before
 * it is first read, so its incoming value does not matter.  The results are
 * left spread over the five registers; the LK/K invocation that follows
 * each S-box in __serpent_enc_blk_4way names the four that carry on.
 */
#define S0(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	por x0, x3; \
	pxor x4, x0; \
	pxor x2, x4; \
	pxor RNOT, x4; \
	pxor x1, x3; \
	pand x0, x1; \
	pxor x4, x1; \
	pxor x0, x2; \
	pxor x3, x0; \
	por x0, x4; \
	pxor x2, x0; \
	pand x1, x2; \
	pxor x2, x3; \
	pxor RNOT, x1; \
	pxor x4, x2; \
	pxor x2, x1;

#define S1(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x0, x1; \
	pxor x3, x0; \
	pxor RNOT, x3; \
	pand x1, x4; \
	por x1, x0; \
	pxor x2, x3; \
	pxor x3, x0; \
	pxor x3, x1; \
	pxor x4, x3; \
	por x4, x1; \
	pxor x2, x4; \
	pand x0, x2; \
	pxor x1, x2; \
	por x0, x1; \
	pxor RNOT, x0; \
	pxor x2, x0; \
	pxor x1, x4;

#define S2(x0, x1, x2, x3, x4) \
	pxor RNOT, x3; \
	pxor x0, x1; \
	movdqa x0, x4; \
	pand x2, x0; \
	pxor x3, x0; \
	por x4, x3; \
	pxor x1, x2; \
	pxor x1, x3; \
	pand x0, x1; \
	pxor x2, x0; \
	pand x3, x2; \
	por x1, x3; \
	pxor RNOT, x0; \
	pxor x0, x3; \
	pxor x0, x4; \
	pxor x2, x0; \
	por x2, x1;

#define S3(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x3, x1; \
	por x0, x3; \
	pand x0, x4; \
	pxor x2, x0; \
	pxor x1, x2; \
	pand x3, x1; \
	pxor x3, x2; \
	por x4, x0; \
	pxor x3, x4; \
	pxor x0, x1; \
	pand x3, x0; \
	pand x4, x3; \
	pxor x2, x3; \
	por x1, x4; \
	pand x1, x2; \
	pxor x3, x4; \
	pxor x3, x0; \
	pxor x2, x3;

#define S4(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pand x0, x3; \
	pxor x4, x0; \
	pxor x2, x3; \
	por x4, x2; \
	pxor x1, x0; \
	pxor x3, x4; \
	por x0, x2; \
	pxor x1, x2; \
	pand x0, x1; \
	pxor x4, x1; \
	pand x2, x4; \
	pxor x3, x2; \
	pxor x0, x4; \
	por x1, x3; \
	pxor RNOT, x1; \
	pxor x0, x3;

#define S5(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	por x0, x1; \
	pxor x1, x2; \
	pxor RNOT, x3; \
	pxor x0, x4; \
	pxor x2, x0; \
	pand x4, x1; \
	por x3, x4; \
	pxor x0, x4; \
	pand x3, x0; \
	pxor x3, x1; \
	pxor x2, x3; \
	pxor x1, x0; \
	pand x4, x2; \
	pxor x2, x1; \
	pand x0, x2; \
	pxor x2, x3;

#define S6(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x0, x3; \
	pxor x2, x1; \
	pxor x0, x2; \
	pand x3, x0; \
	por x3, x1; \
	pxor RNOT, x4; \
	pxor x1, x0; \
	pxor x2, x1; \
	pxor x4, x3; \
	pxor x0, x4; \
	pand x0, x2; \
	pxor x1, x4; \
	pxor x3, x2; \
	pand x1, x3; \
	pxor x0, x3; \
	pxor x2, x1;

#define S7(x0, x1, x2, x3, x4) \
	pxor RNOT, x1; \
	movdqa x1, x4; \
	pxor RNOT, x0; \
	pand x2, x1; \
	pxor x3, x1; \
	por x4, x3; \
	pxor x2, x4; \
	pxor x3, x2; \
	pxor x0, x3; \
	por x1, x0; \
	pand x0, x2; \
	pxor x4, x0; \
	pxor x3, x4; \
	pand x0, x3; \
	pxor x1, x4; \
	pxor x4, x2; \
	pxor x1, x3; \
	por x0, x4; \
	pxor x1, x4;

/*
 * SI0..SI7: the S-box steps of the decryption rounds, in the same
 * straight-line form as S0..S7.  x4 is again pure scratch on entry, and the
 * KL/K invocation that follows each one in serpent_dec_blk_4way names the
 * registers that carry the result.
 */
#define SI0(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pxor x0, x1; \
	por x1, x3; \
	pxor x1, x4; \
	pxor RNOT, x0; \
	pxor x3, x2; \
	pxor x0, x3; \
	pand x1, x0; \
	pxor x2, x0; \
	pand x3, x2; \
	pxor x4, x3; \
	pxor x3, x2; \
	pxor x3, x1; \
	pand x0, x3; \
	pxor x0, x1; \
	pxor x2, x0; \
	pxor x3, x4;

#define SI1(x0, x1, x2, x3, x4) \
	pxor x3, x1; \
	movdqa x0, x4; \
	pxor x2, x0; \
	pxor RNOT, x2; \
	por x1, x4; \
	pxor x3, x4; \
	pand x1, x3; \
	pxor x2, x1; \
	pand x4, x2; \
	pxor x1, x4; \
	por x3, x1; \
	pxor x0, x3; \
	pxor x0, x2; \
	por x4, x0; \
	pxor x4, x2; \
	pxor x0, x1; \
	pxor x1, x4;

#define SI2(x0, x1, x2, x3, x4) \
	pxor x1, x2; \
	movdqa x3, x4; \
	pxor RNOT, x3; \
	por x2, x3; \
	pxor x4, x2; \
	pxor x0, x4; \
	pxor x1, x3; \
	por x2, x1; \
	pxor x0, x2; \
	pxor x4, x1; \
	por x3, x4; \
	pxor x3, x2; \
	pxor x2, x4; \
	pand x1, x2; \
	pxor x3, x2; \
	pxor x4, x3; \
	pxor x0, x4;

#define SI3(x0, x1, x2, x3, x4) \
	pxor x1, x2; \
	movdqa x1, x4; \
	pand x2, x1; \
	pxor x0, x1; \
	por x4, x0; \
	pxor x3, x4; \
	pxor x3, x0; \
	por x1, x3; \
	pxor x2, x1; \
	pxor x3, x1; \
	pxor x2, x0; \
	pxor x3, x2; \
	pand x1, x3; \
	pxor x0, x1; \
	pand x2, x0; \
	pxor x3, x4; \
	pxor x0, x3; \
	pxor x1, x0;

#define SI4(x0, x1, x2, x3, x4) \
	pxor x3, x2; \
	movdqa x0, x4; \
	pand x1, x0; \
	pxor x2, x0; \
	por x3, x2; \
	pxor RNOT, x4; \
	pxor x0, x1; \
	pxor x2, x0; \
	pand x4, x2; \
	pxor x0, x2; \
	por x4, x0; \
	pxor x3, x0; \
	pand x2, x3; \
	pxor x3, x4; \
	pxor x1, x3; \
	pand x0, x1; \
	pxor x1, x4; \
	pxor x3, x0;

#define SI5(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	por x2, x1; \
	pxor x4, x2; \
	pxor x3, x1; \
	pand x4, x3; \
	pxor x3, x2; \
	por x0, x3; \
	pxor RNOT, x0; \
	pxor x2, x3; \
	por x0, x2; \
	pxor x1, x4; \
	pxor x4, x2; \
	pand x0, x4; \
	pxor x1, x0; \
	pxor x3, x1; \
	pand x2, x0; \
	pxor x3, x2; \
	pxor x2, x0; \
	pxor x4, x2; \
	pxor x3, x4;

#define SI6(x0, x1, x2, x3, x4) \
	pxor x2, x0; \
	movdqa x0, x4; \
	pand x3, x0; \
	pxor x3, x2; \
	pxor x2, x0; \
	pxor x1, x3; \
	por x4, x2; \
	pxor x3, x2; \
	pand x0, x3; \
	pxor RNOT, x0; \
	pxor x1, x3; \
	pand x2, x1; \
	pxor x0, x4; \
	pxor x4, x3; \
	pxor x2, x4; \
	pxor x1, x0; \
	pxor x0, x2;

#define SI7(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pand x0, x3; \
	pxor x2, x0; \
	por x4, x2; \
	pxor x1, x4; \
	pxor RNOT, x0; \
	por x3, x1; \
	pxor x0, x4; \
	pand x2, x0; \
	pxor x1, x0; \
	pand x2, x1; \
	pxor x2, x3; \
	pxor x3, x4; \
	pand x3, x2; \
	por x0, x3; \
	pxor x4, x1; \
	pxor x4, x3; \
	pand x0, x4; \
	pxor x2, x4;

/*
 * transpose_4x4(x0, x1, x2, x3, t0, t1, t2): treat x0..x3 as the rows of a
 * 4x4 matrix of dwords and transpose it in place.  Clobbers t1 and t2; the
 * t0 parameter is accepted but not used by this macro.
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	movdqa x0, t2; \
	punpckldq x1, x0; \
	punpckhdq x1, t2; \
	movdqa x2, t1; \
	punpckhdq x3, x2; \
	punpckldq x3, t1; \
	movdqa x0, x1; \
	punpcklqdq t1, x0; \
	punpckhqdq t1, x1; \
	movdqa t2, x3; \
	punpcklqdq x2, t2; \
	punpckhqdq x2, x3; \
	movdqa t2, x2;

/*
 * read_blocks(in, x0..x3, t0, t1, t2): load four consecutive 16-byte blocks
 * from (in) with unaligned loads, then transpose so that register xN holds
 * dword N of each of the four blocks.
 */
#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
	movdqu (0*4*4)(in), x0; \
	movdqu (1*4*4)(in), x1; \
	movdqu (2*4*4)(in), x2; \
	movdqu (3*4*4)(in), x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/*
 * write_blocks(out, x0..x3, t0, t1, t2): transpose back to one block per
 * register and store the four 16-byte blocks to (out) with unaligned stores.
 */
#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu x0, (0*4*4)(out); \
	movdqu x1, (1*4*4)(out); \
	movdqu x2, (2*4*4)(out); \
	movdqu x3, (3*4*4)(out);

/*
 * xor_blocks(out, x0..x3, t0, t1, t2): like write_blocks, but each block is
 * XORed with the 16 bytes already present at its destination before being
 * stored.  Uses t0 as the load temporary in addition to the transpose
 * clobbers.
 */
#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu (0*4*4)(out), t0; \
	pxor t0, x0; \
	movdqu x0, (0*4*4)(out); \
	movdqu (1*4*4)(out), t0; \
	pxor t0, x1; \
	movdqu x1, (1*4*4)(out); \
	movdqu (2*4*4)(out), t0; \
	pxor t0, x2; \
	movdqu x2, (2*4*4)(out); \
	movdqu (3*4*4)(out), t0; \
	pxor t0, x3; \
	movdqu x3, (3*4*4)(out);

/*
 * __serpent_enc_blk_4way: process four 16-byte blocks from src through the
 * 32 S-box/LK rounds (key groups 0..32) and either store the result to dst
 * or XOR it into dst.
 *
 * Uses %eax, %edx (CTX), %xmm0-%xmm7 and the flags; nothing is pushed on
 * the stack, so the arg_* offsets stay valid for the whole function.
 */
SYM_FUNC_START(__serpent_enc_blk_4way)
	/* input:
	 *	arg_ctx(%esp): ctx, CTX
	 *	arg_dst(%esp): dst
	 *	arg_src(%esp): src
	 *	arg_xor(%esp): bool, if true: xor output
	 */

	/* RNOT = all ones, used by the S-boxes as a NOT mask */
	pcmpeqd RNOT, RNOT;

	movl arg_ctx(%esp), CTX;

	movl arg_src(%esp), %eax;
	read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	/*
	 * Each S-box leaves its outputs in a permuted set of registers; the
	 * argument order of the following LK/K applies that permutation
	 * instead of moving data around.
	 */
					 K(RA, RB, RC, RD, RE, 0);
	S0(RA, RB, RC, RD, RE);		LK(RC, RB, RD, RA, RE, 1);
	S1(RC, RB, RD, RA, RE);		LK(RE, RD, RA, RC, RB, 2);
	S2(RE, RD, RA, RC, RB);		LK(RB, RD, RE, RC, RA, 3);
	S3(RB, RD, RE, RC, RA);		LK(RC, RA, RD, RB, RE, 4);
	S4(RC, RA, RD, RB, RE);		LK(RA, RD, RB, RE, RC, 5);
	S5(RA, RD, RB, RE, RC);		LK(RC, RA, RD, RE, RB, 6);
	S6(RC, RA, RD, RE, RB);		LK(RD, RB, RA, RE, RC, 7);
	S7(RD, RB, RA, RE, RC);		LK(RC, RA, RE, RD, RB, 8);
	S0(RC, RA, RE, RD, RB);		LK(RE, RA, RD, RC, RB, 9);
	S1(RE, RA, RD, RC, RB);		LK(RB, RD, RC, RE, RA, 10);
	S2(RB, RD, RC, RE, RA);		LK(RA, RD, RB, RE, RC, 11);
	S3(RA, RD, RB, RE, RC);		LK(RE, RC, RD, RA, RB, 12);
	S4(RE, RC, RD, RA, RB);		LK(RC, RD, RA, RB, RE, 13);
	S5(RC, RD, RA, RB, RE);		LK(RE, RC, RD, RB, RA, 14);
	S6(RE, RC, RD, RB, RA);		LK(RD, RA, RC, RB, RE, 15);
	S7(RD, RA, RC, RB, RE);		LK(RE, RC, RB, RD, RA, 16);
	S0(RE, RC, RB, RD, RA);		LK(RB, RC, RD, RE, RA, 17);
	S1(RB, RC, RD, RE, RA);		LK(RA, RD, RE, RB, RC, 18);
	S2(RA, RD, RE, RB, RC);		LK(RC, RD, RA, RB, RE, 19);
	S3(RC, RD, RA, RB, RE);		LK(RB, RE, RD, RC, RA, 20);
	S4(RB, RE, RD, RC, RA);		LK(RE, RD, RC, RA, RB, 21);
	S5(RE, RD, RC, RA, RB);		LK(RB, RE, RD, RA, RC, 22);
	S6(RB, RE, RD, RA, RC);		LK(RD, RC, RE, RA, RB, 23);
	S7(RD, RC, RE, RA, RB);		LK(RB, RE, RA, RD, RC, 24);
	S0(RB, RE, RA, RD, RC);		LK(RA, RE, RD, RB, RC, 25);
	S1(RA, RE, RD, RB, RC);		LK(RC, RD, RB, RA, RE, 26);
	S2(RC, RD, RB, RA, RE);		LK(RE, RD, RC, RA, RB, 27);
	S3(RE, RD, RC, RA, RB);		LK(RA, RB, RD, RE, RC, 28);
	S4(RA, RB, RD, RE, RC);		LK(RB, RD, RE, RC, RA, 29);
	S5(RB, RD, RE, RC, RA);		LK(RA, RB, RD, RC, RE, 30);
	S6(RA, RB, RD, RC, RE);		LK(RD, RE, RB, RC, RA, 31);
	S7(RD, RE, RB, RC, RA);		 K(RA, RB, RC, RD, RE, 32);

	movl arg_dst(%esp), %eax;

	/* only the low byte of the xor flag is examined */
	cmpb $0, arg_xor(%esp);
	jnz .L__enc_xor4;

	write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	RET;

.L__enc_xor4:
	xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	RET;
SYM_FUNC_END(__serpent_enc_blk_4way)

/*
 * serpent_dec_blk_4way: process four 16-byte blocks from src through the
 * 32 inverse S-box/KL rounds (key groups 32 down to 0) and store the result
 * to dst.
 *
 * Uses %eax, %edx (CTX), %xmm0-%xmm7; nothing is pushed on the stack.
 */
SYM_FUNC_START(serpent_dec_blk_4way)
	/* input:
	 *	arg_ctx(%esp): ctx, CTX
	 *	arg_dst(%esp): dst
	 *	arg_src(%esp): src
	 */

	/* RNOT = all ones, used by the inverse S-boxes as a NOT mask */
	pcmpeqd RNOT, RNOT;

	movl arg_ctx(%esp), CTX;

	movl arg_src(%esp), %eax;
	read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	/*
	 * As in the encryption path, the register permutation produced by
	 * each inverse S-box is absorbed by the argument order of the next
	 * KL/K.
	 */
					 K(RA, RB, RC, RD, RE, 32);
	SI7(RA, RB, RC, RD, RE);	KL(RB, RD, RA, RE, RC, 31);
	SI6(RB, RD, RA, RE, RC);	KL(RA, RC, RE, RB, RD, 30);
	SI5(RA, RC, RE, RB, RD);	KL(RC, RD, RA, RE, RB, 29);
	SI4(RC, RD, RA, RE, RB);	KL(RC, RA, RB, RE, RD, 28);
	SI3(RC, RA, RB, RE, RD);	KL(RB, RC, RD, RE, RA, 27);
	SI2(RB, RC, RD, RE, RA);	KL(RC, RA, RE, RD, RB, 26);
	SI1(RC, RA, RE, RD, RB);	KL(RB, RA, RE, RD, RC, 25);
	SI0(RB, RA, RE, RD, RC);	KL(RE, RC, RA, RB, RD, 24);
	SI7(RE, RC, RA, RB, RD);	KL(RC, RB, RE, RD, RA, 23);
	SI6(RC, RB, RE, RD, RA);	KL(RE, RA, RD, RC, RB, 22);
	SI5(RE, RA, RD, RC, RB);	KL(RA, RB, RE, RD, RC, 21);
	SI4(RA, RB, RE, RD, RC);	KL(RA, RE, RC, RD, RB, 20);
	SI3(RA, RE, RC, RD, RB);	KL(RC, RA, RB, RD, RE, 19);
	SI2(RC, RA, RB, RD, RE);	KL(RA, RE, RD, RB, RC, 18);
	SI1(RA, RE, RD, RB, RC);	KL(RC, RE, RD, RB, RA, 17);
	SI0(RC, RE, RD, RB, RA);	KL(RD, RA, RE, RC, RB, 16);
	SI7(RD, RA, RE, RC, RB);	KL(RA, RC, RD, RB, RE, 15);
	SI6(RA, RC, RD, RB, RE);	KL(RD, RE, RB, RA, RC, 14);
	SI5(RD, RE, RB, RA, RC);	KL(RE, RC, RD, RB, RA, 13);
	SI4(RE, RC, RD, RB, RA);	KL(RE, RD, RA, RB, RC, 12);
	SI3(RE, RD, RA, RB, RC);	KL(RA, RE, RC, RB, RD, 11);
	SI2(RA, RE, RC, RB, RD);	KL(RE, RD, RB, RC, RA, 10);
	SI1(RE, RD, RB, RC, RA);	KL(RA, RD, RB, RC, RE, 9);
	SI0(RA, RD, RB, RC, RE);	KL(RB, RE, RD, RA, RC, 8);
	SI7(RB, RE, RD, RA, RC);	KL(RE, RA, RB, RC, RD, 7);
	SI6(RE, RA, RB, RC, RD);	KL(RB, RD, RC, RE, RA, 6);
	SI5(RB, RD, RC, RE, RA);	KL(RD, RA, RB, RC, RE, 5);
	SI4(RD, RA, RB, RC, RE);	KL(RD, RB, RE, RC, RA, 4);
	SI3(RD, RB, RE, RC, RA);	KL(RE, RD, RA, RC, RB, 3);
	SI2(RE, RD, RA, RC, RB);	KL(RD, RB, RC, RA, RE, 2);
	SI1(RD, RB, RC, RA, RE);	KL(RE, RB, RC, RA, RD, 1);
	SI0(RE, RB, RC, RA, RD);	 K(RC, RD, RB, RE, RA, 0);

	/* the final state lives in RC, RD, RB, RE; RA is free as a temporary */
	movl arg_dst(%esp), %eax;
	write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);

	RET;
SYM_FUNC_END(serpent_dec_blk_4way)