/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
 * instructions. This file contains accelerated part of ghash
 * implementation. More information about PCLMULQDQ can be found at:
 *
 * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
 *
 * Copyright (c) 2009 Intel Corp.
 *   Author: Huang Ying <ying.huang@intel.com>
 *	     Vinodh Gopal
 *	     Erdinc Ozturk
 *	     Deniz Karakoyunlu
 */

#include <linux/linkage.h>
#include <asm/frame.h>

/* pshufb control mask that reverses the order of the 16 bytes in a register */
.section	.rodata.cst16.bswap_mask, "aM", @progbits, 16
.align 16
.Lbswap_mask:
	.octa 0x000102030405060708090a0b0c0d0e0f

/* Register roles shared by every routine in this file */
#define DATA	%xmm0
#define SHASH	%xmm1
#define T1	%xmm2
#define T2	%xmm3
#define T3	%xmm4
#define BSWAP	%xmm5
#define IN1	%xmm6

.text

/*
 * __clmul_gf128mul_ble:	internal ABI
 * input:
 *	DATA:			operand1
 *	SHASH:			operand2, hash_key << 1 mod poly
 * output:
 *	DATA:			operand1 * operand2 mod poly
 * changed:
 *	T1
 *	T2
 *	T3
 */
SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble)
	movaps DATA, T1
	/* pshufd $0b01001110 swaps the two 64-bit halves of its source */
	pshufd $0b01001110, DATA, T2
	pshufd $0b01001110, SHASH, T3
	pxor DATA, T2			# low half of T2 = a1 + a0
	pxor SHASH, T3			# low half of T3 = b1 + b0

	pclmulqdq $0x00, SHASH, DATA	# DATA = a0 * b0
	pclmulqdq $0x11, SHASH, T1	# T1 = a1 * b1
	pclmulqdq $0x00, T3, T2		# T2 = (a1 + a0) * (b1 + b0)
	pxor DATA, T2
	pxor T1, T2			# T2 = a0 * b1 + a1 * b0

	/* fold the 128-bit middle term into the high and low products */
	movaps T2, T3
	pslldq $8, T3
	psrldq $8, T2
	pxor T3, DATA
	pxor T2, T1			# <T1:DATA> is result of
					# carry-less multiplication

	# first phase of the reduction
	movaps DATA, T3
	psllq $1, T3
	pxor DATA, T3
	psllq $5, T3
	pxor DATA, T3
	psllq $57, T3
	movaps T3, T2
	pslldq $8, T2
	psrldq $8, T3
	pxor T2, DATA
	pxor T3, T1

	# second phase of the reduction
	movaps DATA, T2
	psrlq $5, T2
	pxor DATA, T2
	psrlq $1, T2
	pxor DATA, T2
	psrlq $1, T2
	pxor T2, T1
	pxor T1, DATA
	RET
SYM_FUNC_END(__clmul_gf128mul_ble)

/*
 * void clmul_ghash_mul(char *dst, const u128 *shash)
 *
 * Multiplies the 16-byte value at dst by *shash and stores the product
 * back to dst. The value is byte-reversed on load and again on store.
 *
 * input:
 *	%rdi:			dst, 16 bytes, read and written (unaligned ok)
 *	%rsi:			shash, 16 bytes, read only (unaligned ok)
 * changed:
 *	DATA, SHASH, T1, T2, T3, BSWAP
 */
SYM_FUNC_START(clmul_ghash_mul)
	FRAME_BEGIN
	movups (%rdi), DATA
	movups (%rsi), SHASH
	/* RIP-relative: shorter encoding and position-independent */
	movaps .Lbswap_mask(%rip), BSWAP
	pshufb BSWAP, DATA
	call __clmul_gf128mul_ble
	pshufb BSWAP, DATA
	movups DATA, (%rdi)
	FRAME_END
	RET
SYM_FUNC_END(clmul_ghash_mul)

/*
 * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
 *			   const u128 *shash);
 *
 * For every complete 16-byte block of src: xor the byte-reversed block
 * into the running value loaded from dst, then multiply by *shash.
 * The final value is stored back to dst.
 *
 * If srclen < 16, dst is left untouched. Trailing bytes that do not fill
 * a whole 16-byte block (srclen % 16) are not consumed.
 *
 * input:
 *	%rdi:			dst, 16 bytes, read and written (unaligned ok)
 *	%rsi:			src (unaligned ok)
 *	%edx:			srclen in bytes
 *	%rcx:			shash, 16 bytes, read only (unaligned ok)
 * changed:
 *	%rsi, %rdx, flags, DATA, SHASH, T1, T2, T3, BSWAP, IN1
 */
SYM_FUNC_START(clmul_ghash_update)
	FRAME_BEGIN
	/*
	 * srclen is a 32-bit argument, so only %edx is meaningful: the upper
	 * half of %rdx is unspecified on entry and must not be compared.
	 */
	cmp $16, %edx
	jb .Lupdate_just_ret	# check length
	movaps .Lbswap_mask(%rip), BSWAP
	movups (%rdi), DATA
	movups (%rcx), SHASH
	pshufb BSWAP, DATA
.align 4
.Lupdate_loop:
	/* invariant: at least 16 bytes remain at (%rsi) */
	movups (%rsi), IN1
	pshufb BSWAP, IN1
	pxor IN1, DATA
	call __clmul_gf128mul_ble
	sub $16, %edx
	add $16, %rsi
	cmp $16, %edx
	jae .Lupdate_loop	# unsigned, matching the entry check
	pshufb BSWAP, DATA
	movups DATA, (%rdi)
.Lupdate_just_ret:
	FRAME_END
	RET
SYM_FUNC_END(clmul_ghash_update)