/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
 * instructions. This file contains the accelerated part of the ghash
 * implementation. More information about PCLMULQDQ can be found at:
 *
 * https://www.intel.com/content/dam/develop/external/us/en/documents/clmul-wp-rev-2-02-2014-04-20.pdf
 *
 * Copyright (c) 2009 Intel Corp.
 *   Author: Huang Ying <ying.huang@intel.com>
 *	     Vinodh Gopal
 *	     Erdinc Ozturk
 *	     Deniz Karakoyunlu
 */
1562306a36Sopenharmony_ci
#include <linux/linkage.h>
#include <asm/frame.h>
1862306a36Sopenharmony_ci
/*
 * Shuffle control for pshufb.  Stored little-endian, byte i of the mask
 * holds the index 15 - i, so shuffling with it reverses the byte order of
 * a 128-bit value.  The constant is 16-byte aligned because it is loaded
 * with movaps.
 */
.section	.rodata.cst16.bswap_mask, "aM", @progbits, 16
.align 16
.Lbswap_mask:
	.octa 0x000102030405060708090a0b0c0d0e0f
2362306a36Sopenharmony_ci
/*
 * XMM register roles used by the routines in this file:
 *
 *	DATA	operand1 and result of __clmul_gf128mul_ble; carries the
 *		hash value in the exported functions
 *	SHASH	operand2 of the multiplication (the hash key operand)
 *	T1..T3	scratch registers changed by __clmul_gf128mul_ble
 *	BSWAP	copy of .Lbswap_mask used as the pshufb control
 *	IN1	source block being folded in by clmul_ghash_update
 */
#define DATA	%xmm0
#define SHASH	%xmm1
#define T1	%xmm2
#define T2	%xmm3
#define T3	%xmm4
#define BSWAP	%xmm5
#define IN1	%xmm6

.text
3362306a36Sopenharmony_ci
/*
 * __clmul_gf128mul_ble:	internal ABI
 *
 * Carry-less multiply of two 128-bit operands followed by a two-phase
 * reduction of the 256-bit product back to 128 bits.  Only XMM registers
 * are used; SHASH is read but never written.
 *
 * With a1:a0 the high:low 64-bit halves of DATA and b1:b0 those of SHASH,
 * the product is built from three 64x64 multiplications:
 *	a0 * b0, a1 * b1 and (a1 + a0) * (b1 + b0)
 * where "+" is XOR.
 *
 * input:
 *	DATA:			operand1
 *	SHASH:			operand2, hash_key << 1 mod poly
 * output:
 *	DATA:			operand1 * operand2 mod poly
 * changed:
 *	T1
 *	T2
 *	T3
 */
SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble)
	movaps DATA, T1			# copy of operand1, consumed by a1 * b1
	pshufd $0b01001110, DATA, T2	# T2 = DATA with 64-bit halves swapped
	pshufd $0b01001110, SHASH, T3	# T3 = SHASH with 64-bit halves swapped
	pxor DATA, T2			# each half of T2 = a1 + a0
	pxor SHASH, T3			# each half of T3 = b1 + b0

	pclmulqdq $0x00, SHASH, DATA	# DATA = a0 * b0
	pclmulqdq $0x11, SHASH, T1	# T1 = a1 * b1
	pclmulqdq $0x00, T3, T2		# T2 = (a1 + a0) * (b1 + b0)
	pxor DATA, T2
	pxor T1, T2			# T2 = a0 * b1 + a1 * b0

	# Fold the 128-bit middle term into the product at bit offset 64:
	# its low half lands in the top of DATA, its high half in the
	# bottom of T1.
	movaps T2, T3
	pslldq $8, T3
	psrldq $8, T2
	pxor T3, DATA
	pxor T2, T1			# <T1:DATA> is result of
					# carry-less multiplication

	# first phase of the reduction
	# Per 64-bit lane: T3 = (DATA << 63) ^ (DATA << 62) ^ (DATA << 57)
	movaps DATA, T3
	psllq $1, T3
	pxor DATA, T3
	psllq $5, T3
	pxor DATA, T3
	psllq $57, T3
	movaps T3, T2
	pslldq $8, T2			# low lane of T3 -> high half of DATA
	psrldq $8, T3			# high lane of T3 -> low half of T1
	pxor T2, DATA
	pxor T3, T1

	# second phase of the reduction
	# Per 64-bit lane: T2 = (DATA >> 1) ^ (DATA >> 2) ^ (DATA >> 7)
	movaps DATA, T2
	psrlq $5, T2
	pxor DATA, T2
	psrlq $1, T2
	pxor DATA, T2
	psrlq $1, T2
	pxor T2, T1
	pxor T1, DATA			# DATA ^= T1 ^ T2: the reduced product
	RET
SYM_FUNC_END(__clmul_gf128mul_ble)
9062306a36Sopenharmony_ci
/*
 * void clmul_ghash_mul(char *dst, const le128 *shash)
 *
 * Multiply the 16-byte value at dst by the operand at shash and store the
 * product back to dst.
 *
 *	%rdi:	dst,   16 bytes, read and written (no alignment required)
 *	%rsi:	shash, 16 bytes, read only (no alignment required)
 *
 * The value at dst is byte-reversed on load and again on store; shash is
 * used exactly as stored.
 *
 * Changes DATA, SHASH, T1, T2, T3 and BSWAP.
 */
SYM_FUNC_START(clmul_ghash_mul)
	FRAME_BEGIN
	movups (%rdi), DATA
	movups (%rsi), SHASH
	movaps .Lbswap_mask(%rip), BSWAP
	pshufb BSWAP, DATA		# byte-reverse into the multiply's order
	call __clmul_gf128mul_ble	# DATA = DATA * SHASH mod poly
	pshufb BSWAP, DATA		# back to the in-memory byte order
	movups DATA, (%rdi)
	FRAME_END
	RET
SYM_FUNC_END(clmul_ghash_mul)
10462306a36Sopenharmony_ci
/*
 * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
 *			   const le128 *shash);
 *
 * For every complete 16-byte block of src, XOR the block into the hash
 * value at dst and multiply the result by the operand at shash.
 *
 *	%rdi:	dst,    16-byte hash value, read and written
 *	%rsi:	src,    input data, advanced by 16 per block
 *	%edx:	srclen, length of src in bytes
 *	%rcx:	shash,  16 bytes, read only
 *
 * If srclen is below 16 nothing is read or written.  A trailing partial
 * block (srclen % 16 bytes) is not consumed.
 *
 * srclen is a 32-bit unsigned value, so only %edx is examined: the upper
 * half of %rdx is not defined by the calling convention for such an
 * argument, and the length is compared with unsigned conditions throughout.
 *
 * Changes DATA, SHASH, T1, T2, T3, BSWAP and IN1, plus %rsi and %rdx.
 */
SYM_FUNC_START(clmul_ghash_update)
	FRAME_BEGIN
	cmp $16, %edx
	jb .Lupdate_just_ret	# check length
	movaps .Lbswap_mask(%rip), BSWAP
	movups (%rdi), DATA
	movups (%rcx), SHASH
	pshufb BSWAP, DATA		# byte-reverse the running hash once
.align 4
.Lupdate_loop:
	movups (%rsi), IN1
	pshufb BSWAP, IN1		# same byte order as DATA
	pxor IN1, DATA			# fold the block into the hash
	call __clmul_gf128mul_ble	# DATA = DATA * SHASH mod poly
	sub $16, %edx			# %edx = bytes still unprocessed
	add $16, %rsi
	cmp $16, %edx
	jae .Lupdate_loop		# unsigned: another full block is left
	pshufb BSWAP, DATA		# back to the in-memory byte order
	movups DATA, (%rdi)
.Lupdate_just_ret:
	FRAME_END
	RET
SYM_FUNC_END(clmul_ghash_update)
133