/*
   BLAKE2 reference source code package - optimized C implementations

   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>

   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.

   You should have received a copy of the CC0 Public Domain Dedication along with
   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2S_LOAD_SSE41_H__
#define __BLAKE2S_LOAD_SSE41_H__

/*
   LOAD_MSG_<r>_<i>(buf) -- message-word gathering macros.

   Each macro assigns to `buf` a vector assembled from the four vectors
   m0, m1, m2 and m3 by means of shuffle / blend / unpack / byte-shift
   intrinsics.  Most of them use t0, t1 and (sometimes) t2 as scratch
   variables and overwrite them.

   Nothing used by the macro bodies is declared in this header.  The
   expansion site must already have in scope:
     - m0, m1, m2, m3   (read only)
     - t0, t1, t2       (clobbered)
     - TOI() and TOF()  (used only by the LOAD_MSG_0_* macros; presumably
                         integer<->float vector casts -- confirm at their
                         definition)
     - the _mm_* intrinsics and _MM_SHUFFLE

   NOTE(review): <r> looks like the round number (0..9) and <i> the position
   of the load inside that round (1..4); confirm against the round macro
   that consumes these.

   Every body already ends with a semicolon and the multi-statement bodies
   are NOT wrapped in a block, so use a macro only as a stand-alone
   statement, never as the unbraced body of an if/else/for/while.
*/

#define LOAD_MSG_0_1(buf) \
  buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(2,0,2,0)));

#define LOAD_MSG_0_2(buf) \
  buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(3,1,3,1)));

#define LOAD_MSG_0_3(buf) \
  buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(2,0,2,0)));

#define LOAD_MSG_0_4(buf) \
  buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(3,1,3,1)));

#define LOAD_MSG_1_1(buf) \
  t0 = _mm_blend_epi16(m1, m2, 0x0C); \
  t1 = _mm_slli_si128(m3, 4); \
  t2 = _mm_blend_epi16(t0, t1, 0xF0); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3));

#define LOAD_MSG_1_2(buf) \
  t0 = _mm_shuffle_epi32(m2,_MM_SHUFFLE(0,0,2,0)); \
  t1 = _mm_blend_epi16(m1,m3,0xC0); \
  t2 = _mm_blend_epi16(t0, t1, 0xF0); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));

#define LOAD_MSG_1_3(buf) \
  t0 = _mm_slli_si128(m1, 4); \
  t1 = _mm_blend_epi16(m2, t0, 0x30); \
  t2 = _mm_blend_epi16(m0, t1, 0xF0); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));

#define LOAD_MSG_1_4(buf) \
  t0 = _mm_unpackhi_epi32(m0,m1); \
  t1 = _mm_slli_si128(m3, 4); \
  t2 = _mm_blend_epi16(t0, t1, 0x0C); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));

#define LOAD_MSG_2_1(buf) \
  t0 = _mm_unpackhi_epi32(m2,m3); \
  t1 = _mm_blend_epi16(m3,m1,0x0C); \
  t2 = _mm_blend_epi16(t0, t1, 0x0F); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));

#define LOAD_MSG_2_2(buf) \
  t0 = _mm_unpacklo_epi32(m2,m0); \
  t1 = _mm_blend_epi16(t0, m0, 0xF0); \
  t2 = _mm_slli_si128(m3, 8); \
  buf = _mm_blend_epi16(t1, t2, 0xC0);

#define LOAD_MSG_2_3(buf) \
  t0 = _mm_blend_epi16(m0, m2, 0x3C); \
  t1 = _mm_srli_si128(m1, 12); \
  t2 = _mm_blend_epi16(t0,t1,0x03); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2));

#define LOAD_MSG_2_4(buf) \
  t0 = _mm_slli_si128(m3, 4); \
  t1 = _mm_blend_epi16(m0, m1, 0x33); \
  t2 = _mm_blend_epi16(t1, t0, 0xC0); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3));

#define LOAD_MSG_3_1(buf) \
  t0 = _mm_unpackhi_epi32(m0,m1); \
  t1 = _mm_unpackhi_epi32(t0, m2); \
  t2 = _mm_blend_epi16(t1, m3, 0x0C); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));

#define LOAD_MSG_3_2(buf) \
  t0 = _mm_slli_si128(m2, 8); \
  t1 = _mm_blend_epi16(m3,m0,0x0C); \
  t2 = _mm_blend_epi16(t1, t0, 0xC0); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));

#define LOAD_MSG_3_3(buf) \
  t0 = _mm_blend_epi16(m0,m1,0x0F); \
  t1 = _mm_blend_epi16(t0, m3, 0xC0); \
  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));

#define LOAD_MSG_3_4(buf) \
  t0 = _mm_unpacklo_epi32(m0,m2); \
  t1 = _mm_unpackhi_epi32(m1,m2); \
  buf = _mm_unpacklo_epi64(t1,t0);

#define LOAD_MSG_4_1(buf) \
  t0 = _mm_unpacklo_epi64(m1,m2); \
  t1 = _mm_unpackhi_epi64(m0,m2); \
  t2 = _mm_blend_epi16(t0,t1,0x33); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));

#define LOAD_MSG_4_2(buf) \
  t0 = _mm_unpackhi_epi64(m1,m3); \
  t1 = _mm_unpacklo_epi64(m0,m1); \
  buf = _mm_blend_epi16(t0,t1,0x33);

#define LOAD_MSG_4_3(buf) \
  t0 = _mm_unpackhi_epi64(m3,m1); \
  t1 = _mm_unpackhi_epi64(m2,m0); \
  buf = _mm_blend_epi16(t1,t0,0x33);

#define LOAD_MSG_4_4(buf) \
  t0 = _mm_blend_epi16(m0,m2,0x03); \
  t1 = _mm_slli_si128(t0, 8); \
  t2 = _mm_blend_epi16(t1,m3,0x0F); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3));

#define LOAD_MSG_5_1(buf) \
  t0 = _mm_unpackhi_epi32(m0,m1); \
  t1 = _mm_unpacklo_epi32(m0,m2); \
  buf = _mm_unpacklo_epi64(t0,t1);

#define LOAD_MSG_5_2(buf) \
  t0 = _mm_srli_si128(m2, 4); \
  t1 = _mm_blend_epi16(m0,m3,0x03); \
  buf = _mm_blend_epi16(t1,t0,0x3C);

#define LOAD_MSG_5_3(buf) \
  t0 = _mm_blend_epi16(m1,m0,0x0C); \
  t1 = _mm_srli_si128(m3, 4); \
  t2 = _mm_blend_epi16(t0,t1,0x30); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0));

#define LOAD_MSG_5_4(buf) \
  t0 = _mm_unpacklo_epi64(m1,m2); \
  t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1)); \
  buf = _mm_blend_epi16(t0,t1,0x33);

#define LOAD_MSG_6_1(buf) \
  t0 = _mm_slli_si128(m1, 12); \
  t1 = _mm_blend_epi16(m0,m3,0x33); \
  buf = _mm_blend_epi16(t1,t0,0xC0);

#define LOAD_MSG_6_2(buf) \
  t0 = _mm_blend_epi16(m3,m2,0x30); \
  t1 = _mm_srli_si128(m1, 4); \
  t2 = _mm_blend_epi16(t0,t1,0x03); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0));

#define LOAD_MSG_6_3(buf) \
  t0 = _mm_unpacklo_epi64(m0,m2); \
  t1 = _mm_srli_si128(m1, 4); \
  buf = _mm_shuffle_epi32(_mm_blend_epi16(t0,t1,0x0C), _MM_SHUFFLE(2,3,1,0));

#define LOAD_MSG_6_4(buf) \
  t0 = _mm_unpackhi_epi32(m1,m2); \
  t1 = _mm_unpackhi_epi64(m0,t0); \
  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));

#define LOAD_MSG_7_1(buf) \
  t0 = _mm_unpackhi_epi32(m0,m1); \
  t1 = _mm_blend_epi16(t0,m3,0x0F); \
  buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(2,0,3,1));

#define LOAD_MSG_7_2(buf) \
  t0 = _mm_blend_epi16(m2,m3,0x30); \
  t1 = _mm_srli_si128(m0,4); \
  t2 = _mm_blend_epi16(t0,t1,0x03); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3));

#define LOAD_MSG_7_3(buf) \
  t0 = _mm_unpackhi_epi64(m0,m3); \
  t1 = _mm_unpacklo_epi64(m1,m2); \
  t2 = _mm_blend_epi16(t0,t1,0x3C); \
  buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,2,3,1));

#define LOAD_MSG_7_4(buf) \
  t0 = _mm_unpacklo_epi32(m0,m1); \
  t1 = _mm_unpackhi_epi32(m1,m2); \
  buf = _mm_unpacklo_epi64(t0,t1);

#define LOAD_MSG_8_1(buf) \
  t0 = _mm_unpackhi_epi32(m1,m3); \
  t1 = _mm_unpacklo_epi64(t0,m0); \
  t2 = _mm_blend_epi16(t1,m2,0xC0); \
  buf = _mm_shufflehi_epi16(t2,_MM_SHUFFLE(1,0,3,2));

#define LOAD_MSG_8_2(buf) \
  t0 = _mm_unpackhi_epi32(m0,m3); \
  t1 = _mm_blend_epi16(m2,t0,0xF0); \
  buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(0,2,1,3));

#define LOAD_MSG_8_3(buf) \
  t0 = _mm_blend_epi16(m2,m0,0x0C); \
  t1 = _mm_slli_si128(t0,4); \
  buf = _mm_blend_epi16(t1,m3,0x0F);

#define LOAD_MSG_8_4(buf) \
  t0 = _mm_blend_epi16(m1,m0,0x30); \
  buf = _mm_shuffle_epi32(t0,_MM_SHUFFLE(1,0,3,2));

#define LOAD_MSG_9_1(buf) \
  t0 = _mm_blend_epi16(m0,m2,0x03); \
  t1 = _mm_blend_epi16(m1,m2,0x30); \
  t2 = _mm_blend_epi16(t1,t0,0x0F); \
  buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(1,3,0,2));

#define LOAD_MSG_9_2(buf) \
  t0 = _mm_slli_si128(m0,4); \
  t1 = _mm_blend_epi16(m1,t0,0xC0); \
  buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(1,2,0,3));

#define LOAD_MSG_9_3(buf) \
  t0 = _mm_unpackhi_epi32(m0,m3); \
  t1 = _mm_unpacklo_epi32(m2,m3); \
  t2 = _mm_unpackhi_epi64(t0,t1); \
  buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(3,0,2,1));

#define LOAD_MSG_9_4(buf) \
  t0 = _mm_blend_epi16(m3,m2,0xC0); \
  t1 = _mm_unpacklo_epi32(m0,m3); \
  t2 = _mm_blend_epi16(t0,t1,0x0F); \
  buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,1,2,3));

#endif