/*
   BLAKE2 reference source code package - optimized C implementations

   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>

   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.

   You should have received a copy of the CC0 Public Domain Dedication along with
   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
137db96d56Sopenharmony_ci#pragma once
147db96d56Sopenharmony_ci#ifndef __BLAKE2S_ROUND_H__
157db96d56Sopenharmony_ci#define __BLAKE2S_ROUND_H__
167db96d56Sopenharmony_ci
/*
 * Thin wrappers around the 128-bit integer load/store intrinsics.
 *
 * LOAD/STORE map to the aligned forms (_mm_load_si128/_mm_store_si128), so
 * `p` has to be 16-byte aligned; LOADU/STOREU map to the unaligned forms.
 * The load macros cast to a pointer-to-const so that const-qualified buffers
 * can be read without discarding the qualifier.
 */
#define LOAD(p)  _mm_load_si128( (const __m128i *)(p) )
#define STORE(p,r) _mm_store_si128((__m128i *)(p), (r))

#define LOADU(p)  _mm_loadu_si128( (const __m128i *)(p) )
#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), (r))

/* Reinterpret a vector register as float (TOF) or integer (TOI) lanes. */
#define TOF(reg) _mm_castsi128_ps((reg))
#define TOI(reg) _mm_castps_si128((reg))

/*
 * Branch-prediction hint: evaluates to 1 when `x` is true and 0 otherwise.
 * `!!` normalises the condition first, because __builtin_expect takes a
 * `long` and would otherwise truncate wide values or reject pointers.
 * Compilers without the builtin simply get the normalised condition.
 */
#if defined(__GNUC__) || defined(__clang__)
#define LIKELY(x) __builtin_expect(!!(x), 1)
#else
#define LIKELY(x) (!!(x))
#endif
277db96d56Sopenharmony_ci
287db96d56Sopenharmony_ci
/* Microarchitecture-specific macros */

/*
 * _mm_roti_epi32(r, c): rotate each 32-bit lane of `r`.
 *
 * The fallbacks below negate `c` to obtain the shift amount, so they only
 * work for negative `c`, which then means "rotate right by -c bits"
 * (the G1/G2 macros in this header pass -16, -12, -8 and -7).
 * The two shifted halves never share a set bit, so XOR acts as OR here.
 */
#ifndef HAVE_XOP
#ifdef HAVE_SSSE3
/*
 * Rotations by a whole number of bytes (8 or 16 bits) are done with a single
 * byte shuffle.  The shuffle masks `r8` and `r16` are NOT defined in this
 * header: they must be in scope wherever the macro is expanded.
 */
#define _mm_roti_epi32(r, c) ( \
                (8==-(c)) ? _mm_shuffle_epi8((r),r8) \
              : (16==-(c)) ? _mm_shuffle_epi8((r),r16) \
              : _mm_xor_si128(_mm_srli_epi32( (r), -(c) ),_mm_slli_epi32( (r), 32-(-(c)) )) )
#else
/* Plain SSE2: shift right, shift left, and merge the two halves. */
#define _mm_roti_epi32(r, c) _mm_xor_si128(_mm_srli_epi32( (r), -(c) ),_mm_slli_epi32( (r), 32-(-(c)) ))
#endif
#else
/*
 * HAVE_XOP: no fallback is defined here; _mm_roti_epi32 is expected to be
 * supplied by the compiler's XOP intrinsics.
 */
#endif
427db96d56Sopenharmony_ci
437db96d56Sopenharmony_ci
/*
 * First half of the BLAKE2s G mixing step, applied to every 32-bit lane:
 *
 *   row1 += buf + row2;   row4 = rotr(row4 ^ row1, 16);
 *   row3 += row4;         row2 = rotr(row2 ^ row3, 12);
 *
 * row1..row4 are read and assigned in place, so they must be modifiable
 * lvalues; `buf` is only read.  The macro expands to several statements:
 * put braces around it when it is the body of an if/else/loop.
 */
#define G1(row1,row2,row3,row4,buf) \
  row1 = _mm_add_epi32( _mm_add_epi32( row1, (buf) ), row2 ); \
  row4 = _mm_xor_si128( row4, row1 ); \
  row4 = _mm_roti_epi32(row4, -16); \
  row3 = _mm_add_epi32( row3, row4 );   \
  row2 = _mm_xor_si128( row2, row3 ); \
  row2 = _mm_roti_epi32(row2, -12);
517db96d56Sopenharmony_ci
/*
 * Second half of the BLAKE2s G mixing step, applied to every 32-bit lane:
 *
 *   row1 += buf + row2;   row4 = rotr(row4 ^ row1, 8);
 *   row3 += row4;         row2 = rotr(row2 ^ row3, 7);
 *
 * Same contract as the first half: row1..row4 are updated in place and must
 * be modifiable lvalues, `buf` is only read, and the expansion is a sequence
 * of statements (brace it inside an unbraced if/else/loop).
 */
#define G2(row1,row2,row3,row4,buf) \
  row1 = _mm_add_epi32( _mm_add_epi32( row1, (buf) ), row2 ); \
  row4 = _mm_xor_si128( row4, row1 ); \
  row4 = _mm_roti_epi32(row4, -8); \
  row3 = _mm_add_epi32( row3, row4 );   \
  row2 = _mm_xor_si128( row2, row3 ); \
  row2 = _mm_roti_epi32(row2, -7);
597db96d56Sopenharmony_ci
/*
 * Rotate the lanes of rows 2-4 so that the diagonals of the 4x4 state line
 * up as columns: row2 moves left by one lane, row3 by two, row4 by three.
 * row1 is accepted for symmetry with the other macros but is not touched.
 * Expands to several statements; rows are updated in place.
 */
#define DIAGONALIZE(row1,row2,row3,row4) \
  row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(2,1,0,3) ); \
  row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \
  row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(0,3,2,1) );
647db96d56Sopenharmony_ci
/*
 * Inverse lane rotation of rows 2-4: row2 moves right by one lane, row3 by
 * two, row4 by three, restoring the column layout after a diagonal step.
 * row1 is accepted for symmetry with the other macros but is not touched.
 * Expands to several statements; rows are updated in place.
 */
#define UNDIAGONALIZE(row1,row2,row3,row4) \
  row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(0,3,2,1) ); \
  row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \
  row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(2,1,0,3) );
697db96d56Sopenharmony_ci
707db96d56Sopenharmony_ci#if defined(HAVE_XOP)
717db96d56Sopenharmony_ci#include "blake2s-load-xop.h"
727db96d56Sopenharmony_ci#elif defined(HAVE_SSE4_1)
737db96d56Sopenharmony_ci#include "blake2s-load-sse41.h"
747db96d56Sopenharmony_ci#else
757db96d56Sopenharmony_ci#include "blake2s-load-sse2.h"
767db96d56Sopenharmony_ci#endif
777db96d56Sopenharmony_ci
/*
 * One full round `r`: a column step (G1 + G2), then the same two half-steps
 * on the diagonals, bracketed by DIAGONALIZE/UNDIAGONALIZE.
 *
 * Before each half-step the matching message words are fetched with
 * LOAD_MSG_<r>_<1..4> (presumably supplied by the blake2s-load-*.h header
 * selected above - confirm there).  `r` is token-pasted into those macro
 * names, so it must be a literal round number, not an expression.
 *
 * The expansion uses the identifiers row1..row4 and buf1..buf4 directly;
 * they must be declared in the calling scope.  It is a sequence of
 * statements, so brace it inside an unbraced if/else/loop.
 *
 * The last line deliberately has no trailing backslash, so the macro ends
 * here and cannot absorb the line that follows it.
 */
#define ROUND(r)  \
  LOAD_MSG_ ##r ##_1(buf1); \
  G1(row1,row2,row3,row4,buf1); \
  LOAD_MSG_ ##r ##_2(buf2); \
  G2(row1,row2,row3,row4,buf2); \
  DIAGONALIZE(row1,row2,row3,row4); \
  LOAD_MSG_ ##r ##_3(buf3); \
  G1(row1,row2,row3,row4,buf3); \
  LOAD_MSG_ ##r ##_4(buf4); \
  G2(row1,row2,row3,row4,buf4); \
  UNDIAGONALIZE(row1,row2,row3,row4);

907db96d56Sopenharmony_ci#endif
917db96d56Sopenharmony_ci
92