/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
 * instructions. This file contains accelerated part of ghash
 * implementation. More information about PCLMULQDQ can be found at:
 *
 * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
 *
 * Copyright (c) 2009 Intel Corp.
 *   Author: Huang Ying <ying.huang@intel.com>
 *	     Vinodh Gopal
 *	     Erdinc Ozturk
 *	     Deniz Karakoyunlu
 */

#include <linux/linkage.h>
#include <asm/frame.h>

/*
 * 16-byte shuffle control used with pshufb.
 *
 * .octa stores the value little-endian, so the bytes in memory are
 * 0x0f, 0x0e, ..., 0x01, 0x00. Used as a pshufb mask, byte i of the
 * result is taken from byte (15 - i) of the source, i.e. the mask
 * reverses the byte order of a whole xmm register.
 *
 * The constant is 16-byte aligned because it is loaded with movaps.
 */
.section	.rodata.cst16.bswap_mask, "aM", @progbits, 16
.balign 16
.Lbswap_mask:
	.octa 0x000102030405060708090a0b0c0d0e0f

/*
 * Symbolic names for the xmm registers used in this file:
 *
 *	DATA	value being multiplied / running hash value (in and out)
 *	SHASH	second multiplicand, loaded from the shash argument
 *	T1-T3	scratch registers of __clmul_gf128mul_ble
 *	BSWAP	byte-reversal mask loaded from .Lbswap_mask
 *	IN1	current 16-byte input block in clmul_ghash_update
 */
#define DATA	%xmm0
#define SHASH	%xmm1
#define T1	%xmm2
#define T2	%xmm3
#define T3	%xmm4
#define BSWAP	%xmm5
#define IN1	%xmm6

.text

/*
 * __clmul_gf128mul_ble - 128x128-bit carry-less multiply with reduction
 *
 * Internal ABI: operands and result live in xmm registers only; this
 * routine is local to the file.
 *
 * In:
 *	DATA	operand1
 *	SHASH	operand2, hash_key << 1 mod poly
 * Out:
 *	DATA	operand1 * operand2 mod poly
 * Clobbered:
 *	T1, T2, T3
 *
 * SHASH is only ever used as a source operand and is left unmodified.
 */
SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble)
	/*
	 * Write DATA = a1:a0 and SHASH = b1:b0 as 64-bit halves.
	 * pshufd with 0b01001110 swaps the two halves of its source, so
	 * after the pxor each half of T2 holds a1 + a0 and each half of
	 * T3 holds b1 + b0. T1 keeps a copy of the original DATA.
	 */
	pshufd $0b01001110, DATA, T2
	pxor DATA, T2
	pshufd $0b01001110, SHASH, T3
	pxor SHASH, T3
	movaps DATA, T1

	/* Karatsuba: three 64x64 carry-less products instead of four. */
	pclmulqdq $0x11, SHASH, T1	# T1 = a1 * b1
	pclmulqdq $0x00, SHASH, DATA	# DATA = a0 * b0
	pclmulqdq $0x00, T3, T2		# T2 = (a1 + a0) * (b1 + b0)
	pxor T1, T2
	pxor DATA, T2			# T2 = a0 * b1 + a1 * b0

	/*
	 * Add the 128-bit middle term at bit offset 64 of the 256-bit
	 * product: its high half goes into the low half of T1, its low
	 * half into the high half of DATA.
	 */
	movaps T2, T3
	psrldq $8, T2
	pxor T2, T1
	pslldq $8, T3
	pxor T3, DATA			# <T1:DATA> is result of
					# carry-less multiplication

	/*
	 * First phase of the reduction.
	 * Per 64-bit lane this builds
	 *	T3 = (DATA << 63) ^ (DATA << 62) ^ (DATA << 57)
	 * as ((DATA << 1 ^ DATA) << 5 ^ DATA) << 57.
	 */
	movaps DATA, T3
	psllq $1, T3
	pxor DATA, T3
	psllq $5, T3
	pxor DATA, T3
	psllq $57, T3
	/* High half of T3 folds into T1, low half into the top of DATA. */
	movaps T3, T2
	psrldq $8, T3
	pxor T3, T1
	pslldq $8, T2
	pxor T2, DATA

	/*
	 * Second phase of the reduction.
	 * Per 64-bit lane this builds
	 *	T2 = (DATA >> 7) ^ (DATA >> 2) ^ (DATA >> 1)
	 * as (((DATA >> 5) ^ DATA) >> 1 ^ DATA) >> 1.
	 */
	movaps DATA, T2
	psrlq $5, T2
	pxor DATA, T2
	psrlq $1, T2
	pxor DATA, T2
	psrlq $1, T2
	pxor T2, T1
	pxor T1, DATA			# DATA = reduced 128-bit result
	RET
SYM_FUNC_END(__clmul_gf128mul_ble)

/*
 * void clmul_ghash_mul(char *dst, const u128 *shash)
 *
 * Multiply the 16-byte value at dst by the 16-byte value at shash with
 * __clmul_gf128mul_ble and write the product back to dst.
 *
 *	%rdi	dst:   16 bytes, read and then overwritten
 *	%rsi	shash: 16 bytes, read only
 *
 * Both pointers are accessed with unaligned moves, so no particular
 * alignment is required. The value from dst is byte-reversed after the
 * load and again before the store; shash is used exactly as stored.
 *
 * Clobbers DATA, SHASH, T1-T3 and BSWAP (%xmm0-%xmm5).
 * NOTE(review): callers presumably have to own the SIMD register state
 * while this runs - confirm against the C glue code.
 */
SYM_FUNC_START(clmul_ghash_mul)
	FRAME_BEGIN
	movups (%rdi), DATA
	movups (%rsi), SHASH
	/* RIP-relative load: no absolute relocation, shorter encoding */
	movaps .Lbswap_mask(%rip), BSWAP
	pshufb BSWAP, DATA
	call __clmul_gf128mul_ble
	pshufb BSWAP, DATA
	movups DATA, (%rdi)
	FRAME_END
	RET
SYM_FUNC_END(clmul_ghash_mul)

/*
 * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
 *			   const u128 *shash);
 *
 * Fold every complete 16-byte block of src into the 16-byte value at
 * dst. For each block: byte-reverse it, xor it into the running value
 * and multiply the result by shash with __clmul_gf128mul_ble. The final
 * value is byte-reversed back and stored to dst.
 *
 *	%rdi	dst:    16 bytes, read once and written once
 *	%rsi	src:    input data, advanced by 16 per block
 *	%edx	srclen: length of src in bytes
 *	%rcx	shash:  16 bytes, read only
 *
 * When srclen < 16 the function returns without reading or writing dst.
 * A trailing partial block (srclen % 16 bytes) is not consumed.
 *
 * srclen is a 32-bit unsigned argument, so only %edx is examined: the
 * calling convention does not define the upper half of %rdx for it, and
 * all length comparisons are unsigned.
 *
 * Clobbers DATA, SHASH, T1-T3, BSWAP and IN1 (%xmm0-%xmm6), %rsi, %rdx.
 */
SYM_FUNC_START(clmul_ghash_update)
	FRAME_BEGIN
	cmp $16, %edx
	jb .Lupdate_just_ret	# no complete block to process
	/* RIP-relative load: no absolute relocation, shorter encoding */
	movaps .Lbswap_mask(%rip), BSWAP
	movups (%rdi), DATA
	movups (%rcx), SHASH
	pshufb BSWAP, DATA
.align 4
.Lupdate_loop:
	movups (%rsi), IN1
	pshufb BSWAP, IN1
	pxor IN1, DATA
	call __clmul_gf128mul_ble
	sub $16, %edx		# one block consumed
	add $16, %rsi
	cmp $16, %edx
	jae .Lupdate_loop	# unsigned, same as the entry check
	pshufb BSWAP, DATA
	movups DATA, (%rdi)
.Lupdate_just_ret:
	FRAME_END
	RET
SYM_FUNC_END(clmul_ghash_update)