/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
 *
 * Copyright 2018 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

/*
 * Register allocation.
 *
 * PASSn_SUMS: one accumulator per pass.  Each holds two independent 64-bit
 *	       partial sums (low and high quadword) that are only combined
 *	       at the very end of nh_sse2.
 * K0-K3:      four consecutive 16-byte key strides.  Their roles rotate by
 *	       one register on every message stride (see _nh_stride).
 * T0-T7:      scratch.  T0 is only used by the final reduction.
 */
#define		PASS0_SUMS	%xmm0
#define		PASS1_SUMS	%xmm1
#define		PASS2_SUMS	%xmm2
#define		PASS3_SUMS	%xmm3
#define		K0		%xmm4
#define		K1		%xmm5
#define		K2		%xmm6
#define		K3		%xmm7
#define		T0		%xmm8
#define		T1		%xmm9
#define		T2		%xmm10
#define		T3		%xmm11
#define		T4		%xmm12
#define		T5		%xmm13
#define		T6		%xmm14
#define		T7		%xmm15

/* Function arguments, in the registers nh_sse2 receives them in. */
#define		KEY		%rdi
#define		MESSAGE		%rsi
#define		MESSAGE_LEN	%rdx
#define		HASH		%rcx

/*
 * _nh_stride k0, k1, k2, k3, offset
 *
 * Process one 16-byte message stride located at offset(MESSAGE) for all four
 * passes and add the results into PASS0_SUMS..PASS3_SUMS.
 *
 * On entry k0, k1 and k2 must already hold key strides; k3 is overwritten
 * with the 16 bytes at offset(KEY).  nh_sse2 advances KEY by 0x30 before the
 * first stride, so that load fetches the key stride three positions ahead of
 * the message stride.
 *
 * k0 is used as a scratch destination and does not hold key material on
 * exit, while k1..k3 are left intact.  The next invocation is therefore made
 * with the arguments rotated by one (k1, k2, k3, k0).
 *
 * Clobbers T1-T7.
 */
.macro _nh_stride	k0, k1, k2, k3, offset

	// Load next message stride
	movdqu		\offset(MESSAGE), T1

	// Load next key stride
	movdqu		\offset(KEY), \k3

	// Add message words to key words (four independent 32-bit lanes,
	// wrapping).  T1 is copied first because it is consumed in place as
	// the pass-1 destination below.
	movdqa		T1, T2
	movdqa		T1, T3
	paddd		T1, \k0    // reuse k0 to avoid a move
	paddd		\k1, T1
	paddd		\k2, T2
	paddd		\k3, T3

	// Multiply 32x32 => 64 and accumulate
	//
	// With the four sums of a pass named w0..w3, shuffle 0x10 places
	// (w0, w1) and shuffle 0x32 places (w2, w3) in the even dword slots.
	// pmuludq multiplies only the even dwords, so each pass yields the
	// two 64-bit products w0*w2 and w1*w3.
	pshufd		$0x10, \k0, T4
	pshufd		$0x32, \k0, \k0
	pshufd		$0x10, T1, T5
	pshufd		$0x32, T1, T1
	pshufd		$0x10, T2, T6
	pshufd		$0x32, T2, T2
	pshufd		$0x10, T3, T7
	pshufd		$0x32, T3, T3
	pmuludq		T4, \k0
	pmuludq		T5, T1
	pmuludq		T6, T2
	pmuludq		T7, T3
	paddq		\k0, PASS0_SUMS
	paddq		T1, PASS1_SUMS
	paddq		T2, PASS2_SUMS
	paddq		T3, PASS3_SUMS
.endm

/*
 * void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
 *		u8 hash[NH_HASH_BYTES])
 *
 * It's guaranteed that message_len % 16 == 0.
 *
 * Hashes 'message' with 'key' in four passes and stores four 64-bit sums
 * (32 bytes in total, pass 0 first) to 'hash'.
 *
 * Reads 0x30 bytes of key up front plus 16 further bytes of key per 16-byte
 * message stride.  All loads and stores use movdqu, so none of the pointers
 * needs to be 16-byte aligned.
 *
 * Clobbers %xmm0-%xmm15 and modifies KEY, MESSAGE and MESSAGE_LEN.
 */
SYM_FUNC_START(nh_sse2)

	// Preload the first three key strides; every _nh_stride then loads
	// the fourth one it needs relative to the advanced KEY pointer.
	movdqu		0x00(KEY), K0
	movdqu		0x10(KEY), K1
	movdqu		0x20(KEY), K2
	add		$0x30, KEY
	pxor		PASS0_SUMS, PASS0_SUMS
	pxor		PASS1_SUMS, PASS1_SUMS
	pxor		PASS2_SUMS, PASS2_SUMS
	pxor		PASS3_SUMS, PASS3_SUMS

	// Main loop: 64 bytes (four strides) per iteration.  MESSAGE_LEN is
	// kept biased by -0x40 so the sign of the subtraction tells whether
	// another full iteration is available.
	sub		$0x40, MESSAGE_LEN
	jl		.Lloop4_done
.Lloop4:
	// After four strides the key-register rotation is back at K0..K3.
	_nh_stride	K0, K1, K2, K3, 0x00
	_nh_stride	K1, K2, K3, K0, 0x10
	_nh_stride	K2, K3, K0, K1, 0x20
	_nh_stride	K3, K0, K1, K2, 0x30
	add		$0x40, KEY
	add		$0x40, MESSAGE
	sub		$0x40, MESSAGE_LEN
	jge		.Lloop4

.Lloop4_done:
	// Undo the -0x40 bias: the low six bits are the number of bytes
	// left, which is 0x00, 0x10, 0x20 or 0x30 given the length guarantee.
	and		$0x3f, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K0, K1, K2, K3, 0x00

	sub		$0x10, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K1, K2, K3, K0, 0x10

	sub		$0x10, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K2, K3, K0, K1, 0x20

.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'
	//
	// SUM_A / SUM_B are the low / high quadword of an accumulator.  The
	// unpacks regroup them so that two passes are folded per paddq.
	movdqa		PASS0_SUMS, T0
	movdqa		PASS2_SUMS, T1
	punpcklqdq	PASS1_SUMS, T0		// => (PASS0_SUM_A PASS1_SUM_A)
	punpcklqdq	PASS3_SUMS, T1		// => (PASS2_SUM_A PASS3_SUM_A)
	punpckhqdq	PASS1_SUMS, PASS0_SUMS	// => (PASS0_SUM_B PASS1_SUM_B)
	punpckhqdq	PASS3_SUMS, PASS2_SUMS	// => (PASS2_SUM_B PASS3_SUM_B)
	paddq		PASS0_SUMS, T0
	paddq		PASS2_SUMS, T1
	movdqu		T0, 0x00(HASH)
	movdqu		T1, 0x10(HASH)
	RET
SYM_FUNC_END(nh_sse2)