/*
   BLAKE2 reference source code package - optimized C implementations

   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>

   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.

   You should have received a copy of the CC0 Public Domain Dedication along with
   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2S_LOAD_SSE41_H__
#define __BLAKE2S_LOAD_SSE41_H__

/*
   Message-load macros: LOAD_MSG_<r>_<i>(buf), r = 0..9, i = 1..4.

   Each macro assigns to `buf` one 128-bit vector of four 32-bit lanes that is
   assembled from the vectors m0, m1, m2 and m3 with SSE shuffle / blend /
   unpack / byte-shift intrinsics.  (r is presumably the round index and i the
   position of the load within that round -- confirm at the expansion site.)

   Names that must be in scope where a macro is expanded; none of them is
   defined in this header:
     - m0, m1, m2, m3 : source vectors, read only.
     - t0, t1, t2     : scratch vectors, overwritten by every macro with
                        r >= 1 (some use only t0, or only t0 and t1).
     - TOF, TOI       : used by the r = 0 macros only, around _mm_shuffle_ps
                        (presumably integer <-> float vector casts -- confirm
                        against their definition).

   Usage notes:
     - Every macro body already ends in ';', and the multi-statement macros
       are NOT wrapped in a block.  Expand them only as complete statements
       inside a braced block, never as the sole body of an unbraced if/else
       or loop.
     - `buf` is pasted unparenthesised as the left-hand side of an
       assignment, so pass a plain lvalue.
     - _mm_blend_epi16 is an SSE4.1 intrinsic, so this header needs SSE4.1
       support from the compiler and the target CPU.
*/

/*
   r = 0

   _mm_shuffle_ps takes its two low result lanes from the first operand and
   its two high lanes from the second.  _MM_SHUFFLE(2,0,2,0) therefore selects
   32-bit lanes 0 and 2 of each operand, and _MM_SHUFFLE(3,1,3,1) selects
   lanes 1 and 3.
*/
#define LOAD_MSG_0_1(buf) \
  buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(2,0,2,0)));

#define LOAD_MSG_0_2(buf) \
  buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(3,1,3,1)));

#define LOAD_MSG_0_3(buf) \
  buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(2,0,2,0)));

#define LOAD_MSG_0_4(buf) \
  buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(3,1,3,1)));

/* r = 1 */
#define LOAD_MSG_1_1(buf) \
  t0 = _mm_blend_epi16(m1, m2, 0x0C); \
  t1 = _mm_slli_si128(m3, 4); \
  t2 = _mm_blend_epi16(t0, t1, 0xF0); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3));

#define LOAD_MSG_1_2(buf) \
  t0 = _mm_shuffle_epi32(m2,_MM_SHUFFLE(0,0,2,0)); \
  t1 = _mm_blend_epi16(m1,m3,0xC0); \
  t2 = _mm_blend_epi16(t0, t1, 0xF0); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));

#define LOAD_MSG_1_3(buf) \
  t0 = _mm_slli_si128(m1, 4); \
  t1 = _mm_blend_epi16(m2, t0, 0x30); \
  t2 = _mm_blend_epi16(m0, t1, 0xF0); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));

#define LOAD_MSG_1_4(buf) \
  t0 = _mm_unpackhi_epi32(m0,m1); \
  t1 = _mm_slli_si128(m3, 4); \
  t2 = _mm_blend_epi16(t0, t1, 0x0C); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));

/* r = 2 */
#define LOAD_MSG_2_1(buf) \
  t0 = _mm_unpackhi_epi32(m2,m3); \
  t1 = _mm_blend_epi16(m3,m1,0x0C); \
  t2 = _mm_blend_epi16(t0, t1, 0x0F); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));

#define LOAD_MSG_2_2(buf) \
  t0 = _mm_unpacklo_epi32(m2,m0); \
  t1 = _mm_blend_epi16(t0, m0, 0xF0); \
  t2 = _mm_slli_si128(m3, 8); \
  buf = _mm_blend_epi16(t1, t2, 0xC0);

#define LOAD_MSG_2_3(buf) \
  t0 = _mm_blend_epi16(m0, m2, 0x3C); \
  t1 = _mm_srli_si128(m1, 12); \
  t2 = _mm_blend_epi16(t0,t1,0x03); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2));

#define LOAD_MSG_2_4(buf) \
  t0 = _mm_slli_si128(m3, 4); \
  t1 = _mm_blend_epi16(m0, m1, 0x33); \
  t2 = _mm_blend_epi16(t1, t0, 0xC0); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3));

/* r = 3 */
#define LOAD_MSG_3_1(buf) \
  t0 = _mm_unpackhi_epi32(m0,m1); \
  t1 = _mm_unpackhi_epi32(t0, m2); \
  t2 = _mm_blend_epi16(t1, m3, 0x0C); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));

#define LOAD_MSG_3_2(buf) \
  t0 = _mm_slli_si128(m2, 8); \
  t1 = _mm_blend_epi16(m3,m0,0x0C); \
  t2 = _mm_blend_epi16(t1, t0, 0xC0); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));

#define LOAD_MSG_3_3(buf) \
  t0 = _mm_blend_epi16(m0,m1,0x0F); \
  t1 = _mm_blend_epi16(t0, m3, 0xC0); \
  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));

#define LOAD_MSG_3_4(buf) \
  t0 = _mm_unpacklo_epi32(m0,m2); \
  t1 = _mm_unpackhi_epi32(m1,m2); \
  buf = _mm_unpacklo_epi64(t1,t0);

/* r = 4 */
#define LOAD_MSG_4_1(buf) \
  t0 = _mm_unpacklo_epi64(m1,m2); \
  t1 = _mm_unpackhi_epi64(m0,m2); \
  t2 = _mm_blend_epi16(t0,t1,0x33); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));

#define LOAD_MSG_4_2(buf) \
  t0 = _mm_unpackhi_epi64(m1,m3); \
  t1 = _mm_unpacklo_epi64(m0,m1); \
  buf = _mm_blend_epi16(t0,t1,0x33);

#define LOAD_MSG_4_3(buf) \
  t0 = _mm_unpackhi_epi64(m3,m1); \
  t1 = _mm_unpackhi_epi64(m2,m0); \
  buf = _mm_blend_epi16(t1,t0,0x33);

#define LOAD_MSG_4_4(buf) \
  t0 = _mm_blend_epi16(m0,m2,0x03); \
  t1 = _mm_slli_si128(t0, 8); \
  t2 = _mm_blend_epi16(t1,m3,0x0F); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3));

/* r = 5 */
#define LOAD_MSG_5_1(buf) \
  t0 = _mm_unpackhi_epi32(m0,m1); \
  t1 = _mm_unpacklo_epi32(m0,m2); \
  buf = _mm_unpacklo_epi64(t0,t1);

#define LOAD_MSG_5_2(buf) \
  t0 = _mm_srli_si128(m2, 4); \
  t1 = _mm_blend_epi16(m0,m3,0x03); \
  buf = _mm_blend_epi16(t1,t0,0x3C);

#define LOAD_MSG_5_3(buf) \
  t0 = _mm_blend_epi16(m1,m0,0x0C); \
  t1 = _mm_srli_si128(m3, 4); \
  t2 = _mm_blend_epi16(t0,t1,0x30); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0));

#define LOAD_MSG_5_4(buf) \
  t0 = _mm_unpacklo_epi64(m1,m2); \
  t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1)); \
  buf = _mm_blend_epi16(t0,t1,0x33);

/* r = 6 */
#define LOAD_MSG_6_1(buf) \
  t0 = _mm_slli_si128(m1, 12); \
  t1 = _mm_blend_epi16(m0,m3,0x33); \
  buf = _mm_blend_epi16(t1,t0,0xC0);

#define LOAD_MSG_6_2(buf) \
  t0 = _mm_blend_epi16(m3,m2,0x30); \
  t1 = _mm_srli_si128(m1, 4); \
  t2 = _mm_blend_epi16(t0,t1,0x03); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0));

#define LOAD_MSG_6_3(buf) \
  t0 = _mm_unpacklo_epi64(m0,m2); \
  t1 = _mm_srli_si128(m1, 4); \
  buf = _mm_shuffle_epi32(_mm_blend_epi16(t0,t1,0x0C), _MM_SHUFFLE(2,3,1,0));

#define LOAD_MSG_6_4(buf) \
  t0 = _mm_unpackhi_epi32(m1,m2); \
  t1 = _mm_unpackhi_epi64(m0,t0); \
  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));

/* r = 7 */
#define LOAD_MSG_7_1(buf) \
  t0 = _mm_unpackhi_epi32(m0,m1); \
  t1 = _mm_blend_epi16(t0,m3,0x0F); \
  buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(2,0,3,1));

#define LOAD_MSG_7_2(buf) \
  t0 = _mm_blend_epi16(m2,m3,0x30); \
  t1 = _mm_srli_si128(m0,4); \
  t2 = _mm_blend_epi16(t0,t1,0x03); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3));

#define LOAD_MSG_7_3(buf) \
  t0 = _mm_unpackhi_epi64(m0,m3); \
  t1 = _mm_unpacklo_epi64(m1,m2); \
  t2 = _mm_blend_epi16(t0,t1,0x3C); \
  buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,2,3,1));

#define LOAD_MSG_7_4(buf) \
  t0 = _mm_unpacklo_epi32(m0,m1); \
  t1 = _mm_unpackhi_epi32(m1,m2); \
  buf = _mm_unpacklo_epi64(t0,t1);

/* r = 8 */
/* Note: LOAD_MSG_8_1 finishes with _mm_shufflehi_epi16 (16-bit lanes of the
   upper 64 bits), unlike the other macros, which finish with a 32-bit-lane
   shuffle, a blend or an unpack. */
#define LOAD_MSG_8_1(buf) \
  t0 = _mm_unpackhi_epi32(m1,m3); \
  t1 = _mm_unpacklo_epi64(t0,m0); \
  t2 = _mm_blend_epi16(t1,m2,0xC0); \
  buf = _mm_shufflehi_epi16(t2,_MM_SHUFFLE(1,0,3,2));

#define LOAD_MSG_8_2(buf) \
  t0 = _mm_unpackhi_epi32(m0,m3); \
  t1 = _mm_blend_epi16(m2,t0,0xF0); \
  buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(0,2,1,3));

#define LOAD_MSG_8_3(buf) \
  t0 = _mm_blend_epi16(m2,m0,0x0C); \
  t1 = _mm_slli_si128(t0,4); \
  buf = _mm_blend_epi16(t1,m3,0x0F);

#define LOAD_MSG_8_4(buf) \
  t0 = _mm_blend_epi16(m1,m0,0x30); \
  buf = _mm_shuffle_epi32(t0,_MM_SHUFFLE(1,0,3,2));

/* r = 9 */
#define LOAD_MSG_9_1(buf) \
  t0 = _mm_blend_epi16(m0,m2,0x03); \
  t1 = _mm_blend_epi16(m1,m2,0x30); \
  t2 = _mm_blend_epi16(t1,t0,0x0F); \
  buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(1,3,0,2));

#define LOAD_MSG_9_2(buf) \
  t0 = _mm_slli_si128(m0,4); \
  t1 = _mm_blend_epi16(m1,t0,0xC0); \
  buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(1,2,0,3));

#define LOAD_MSG_9_3(buf) \
  t0 = _mm_unpackhi_epi32(m0,m3); \
  t1 = _mm_unpacklo_epi32(m2,m3); \
  t2 = _mm_unpackhi_epi64(t0,t1); \
  buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(3,0,2,1));

#define LOAD_MSG_9_4(buf) \
  t0 = _mm_blend_epi16(m3,m2,0xC0); \
  t1 = _mm_unpacklo_epi32(m0,m3); \
  t2 = _mm_blend_epi16(t0,t1,0x0F); \
  buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,1,2,3));

#endif /* __BLAKE2S_LOAD_SSE41_H__ */