162306a36Sopenharmony_ci########################################################################
262306a36Sopenharmony_ci# Implement fast SHA-256 with AVX2 instructions. (x86_64)
362306a36Sopenharmony_ci#
462306a36Sopenharmony_ci# Copyright (C) 2013 Intel Corporation.
562306a36Sopenharmony_ci#
662306a36Sopenharmony_ci# Authors:
762306a36Sopenharmony_ci#     James Guilford <james.guilford@intel.com>
862306a36Sopenharmony_ci#     Kirk Yap <kirk.s.yap@intel.com>
962306a36Sopenharmony_ci#     Tim Chen <tim.c.chen@linux.intel.com>
1062306a36Sopenharmony_ci#
1162306a36Sopenharmony_ci# This software is available to you under a choice of one of two
1262306a36Sopenharmony_ci# licenses.  You may choose to be licensed under the terms of the GNU
1362306a36Sopenharmony_ci# General Public License (GPL) Version 2, available from the file
1462306a36Sopenharmony_ci# COPYING in the main directory of this source tree, or the
1562306a36Sopenharmony_ci# OpenIB.org BSD license below:
1662306a36Sopenharmony_ci#
1762306a36Sopenharmony_ci#     Redistribution and use in source and binary forms, with or
1862306a36Sopenharmony_ci#     without modification, are permitted provided that the following
1962306a36Sopenharmony_ci#     conditions are met:
2062306a36Sopenharmony_ci#
2162306a36Sopenharmony_ci#      - Redistributions of source code must retain the above
2262306a36Sopenharmony_ci#        copyright notice, this list of conditions and the following
2362306a36Sopenharmony_ci#        disclaimer.
2462306a36Sopenharmony_ci#
2562306a36Sopenharmony_ci#      - Redistributions in binary form must reproduce the above
2662306a36Sopenharmony_ci#        copyright notice, this list of conditions and the following
2762306a36Sopenharmony_ci#        disclaimer in the documentation and/or other materials
2862306a36Sopenharmony_ci#        provided with the distribution.
2962306a36Sopenharmony_ci#
3062306a36Sopenharmony_ci# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
3162306a36Sopenharmony_ci# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
3262306a36Sopenharmony_ci# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
3362306a36Sopenharmony_ci# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
3462306a36Sopenharmony_ci# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
3562306a36Sopenharmony_ci# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
3662306a36Sopenharmony_ci# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
3762306a36Sopenharmony_ci# SOFTWARE.
3862306a36Sopenharmony_ci#
3962306a36Sopenharmony_ci########################################################################
4062306a36Sopenharmony_ci#
4162306a36Sopenharmony_ci# This code is described in an Intel White-Paper:
4262306a36Sopenharmony_ci# "Fast SHA-256 Implementations on Intel Architecture Processors"
4362306a36Sopenharmony_ci#
4462306a36Sopenharmony_ci# To find it, surf to http://www.intel.com/p/en_US/embedded
4562306a36Sopenharmony_ci# and search for that title.
4662306a36Sopenharmony_ci#
4762306a36Sopenharmony_ci########################################################################
4862306a36Sopenharmony_ci# This code schedules 2 blocks at a time, with 4 lanes per block
4962306a36Sopenharmony_ci########################################################################
5062306a36Sopenharmony_ci
5162306a36Sopenharmony_ci#include <linux/linkage.h>
5262306a36Sopenharmony_ci#include <linux/cfi_types.h>
5362306a36Sopenharmony_ci
5462306a36Sopenharmony_ci## assume buffers not aligned
## VMOVDQ is the vector load used on the input data; it maps to vmovdqu,
## the form that tolerates unaligned addresses.
5562306a36Sopenharmony_ci#define	VMOVDQ vmovdqu
5662306a36Sopenharmony_ci
5762306a36Sopenharmony_ci################################ Define Macros
5862306a36Sopenharmony_ci
5962306a36Sopenharmony_ci# addm [mem], reg
6062306a36Sopenharmony_ci# Add reg to mem using reg-mem add and store
# p1 is the memory operand, p2 the register operand.
# The add accumulates mem into reg, then the mov writes the sum back, so
# on exit BOTH reg and mem hold (old mem + old reg). Flags are those left
# by the add (mov does not modify flags).
6162306a36Sopenharmony_ci.macro addm p1 p2
6262306a36Sopenharmony_ci	add	\p1, \p2
6362306a36Sopenharmony_ci	mov	\p2, \p1
6462306a36Sopenharmony_ci.endm
6562306a36Sopenharmony_ci
6662306a36Sopenharmony_ci################################
6762306a36Sopenharmony_ci
# Message schedule vectors. rotate_Xs renames X0..X3 at the end of every
# FOUR_ROUNDS_AND_SCHED, so these are only the initial assignments.
6862306a36Sopenharmony_ciX0 = %ymm4
6962306a36Sopenharmony_ciX1 = %ymm5
7062306a36Sopenharmony_ciX2 = %ymm6
7162306a36Sopenharmony_ciX3 = %ymm7
7262306a36Sopenharmony_ci
7362306a36Sopenharmony_ci# XMM versions of above
# (low 128 bits of the initial X0..X3 registers; rotate_Xs does not
# rename these)
7462306a36Sopenharmony_ciXWORD0 = %xmm4
7562306a36Sopenharmony_ciXWORD1 = %xmm5
7662306a36Sopenharmony_ciXWORD2 = %xmm6
7762306a36Sopenharmony_ciXWORD3 = %xmm7
7862306a36Sopenharmony_ci
# Vector temporaries used by the message schedule computation.
# XFER is not referenced by the round macros below; presumably it stages
# the k+w values before they are stored to the stack - TODO confirm
# against the rest of the function.
7962306a36Sopenharmony_ciXTMP0 = %ymm0
8062306a36Sopenharmony_ciXTMP1 = %ymm1
8162306a36Sopenharmony_ciXTMP2 = %ymm2
8262306a36Sopenharmony_ciXTMP3 = %ymm3
8362306a36Sopenharmony_ciXTMP4 = %ymm8
8462306a36Sopenharmony_ciXFER  = %ymm9
8562306a36Sopenharmony_ciXTMP5 = %ymm11
8662306a36Sopenharmony_ci
# Constant vpshufb control masks, loaded once by the function.
8762306a36Sopenharmony_ciSHUF_00BA =	%ymm10 # shuffle xBxA -> 00BA
8862306a36Sopenharmony_ciSHUF_DC00 =	%ymm12 # shuffle xDxC -> DC00
8962306a36Sopenharmony_ciBYTE_FLIP_MASK = %ymm13
9062306a36Sopenharmony_ci
9162306a36Sopenharmony_ciX_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
9262306a36Sopenharmony_ci
# Function arguments, followed by working variables that deliberately
# reuse the argument registers: once the round macros run, NUM_BLKS and
# INP no longer hold the argument values.
9362306a36Sopenharmony_ciNUM_BLKS = %rdx	# 3rd arg
9462306a36Sopenharmony_ciINP	= %rsi  # 2nd arg
9562306a36Sopenharmony_ciCTX	= %rdi	# 1st arg
9662306a36Sopenharmony_cic	= %ecx
9762306a36Sopenharmony_cid	= %r8d
9862306a36Sopenharmony_cie       = %edx	# clobbers NUM_BLKS
9962306a36Sopenharmony_ciy3	= %esi	# clobbers INP
10062306a36Sopenharmony_ci
# SRND is the index register in the disp(%rsp, SRND) operands of the
# round macros.
10162306a36Sopenharmony_ciSRND	= CTX	# SRND is same register as CTX
10262306a36Sopenharmony_ci
# Remaining SHA-256 working variables (32-bit). ROTATE_ARGS renames a..h
# after every round, so these are only the initial assignments.
# old_h starts as an alias of h; ROTATE_ARGS repoints it at the register
# that held h before each rotation.
10362306a36Sopenharmony_cia = %eax
10462306a36Sopenharmony_cib = %ebx
10562306a36Sopenharmony_cif = %r9d
10662306a36Sopenharmony_cig = %r10d
10762306a36Sopenharmony_cih = %r11d
10862306a36Sopenharmony_ciold_h = %r11d
10962306a36Sopenharmony_ci
# Scalar scratch registers for the round computation. These are the low
# halves of r12..r15, which the function prologue pushes.
11062306a36Sopenharmony_ciT1 = %r12d
11162306a36Sopenharmony_ciy0 = %r13d
11262306a36Sopenharmony_ciy1 = %r14d
11362306a36Sopenharmony_ciy2 = %r15d
11462306a36Sopenharmony_ci
11562306a36Sopenharmony_ci
# Stack frame layout: region sizes first, then byte offsets from %rsp.
# The function prologue subtracts STACK_SIZE and then aligns %rsp down to
# a 32-byte boundary, so these offsets are relative to an aligned base.
11662306a36Sopenharmony_ci_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
11762306a36Sopenharmony_ci_XMM_SAVE_SIZE	= 0	# empty region
11862306a36Sopenharmony_ci_INP_END_SIZE	= 8
11962306a36Sopenharmony_ci_INP_SIZE	= 8
12062306a36Sopenharmony_ci_CTX_SIZE	= 8
12162306a36Sopenharmony_ci
# Resulting offsets: _XFER = 0, _XMM_SAVE = 512, _INP_END = 512 (same as
# _XMM_SAVE because that region is empty), _INP = 520, _CTX = 528,
# STACK_SIZE = 536.
# _INP_END holds the pointer to the last input block and _CTX holds the
# saved CTX argument (both are stored by the function). _XFER is
# presumably the k+w area read by the round macros and _INP presumably a
# saved input pointer - their uses are outside this part of the file,
# TODO confirm.
12262306a36Sopenharmony_ci_XFER		= 0
12362306a36Sopenharmony_ci_XMM_SAVE	= _XFER     + _XFER_SIZE
12462306a36Sopenharmony_ci_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
12562306a36Sopenharmony_ci_INP		= _INP_END  + _INP_END_SIZE
12662306a36Sopenharmony_ci_CTX		= _INP      + _INP_SIZE
12762306a36Sopenharmony_ciSTACK_SIZE	= _CTX      + _CTX_SIZE
12862306a36Sopenharmony_ci
12962306a36Sopenharmony_ci# rotate_Xs
13062306a36Sopenharmony_ci# Rotate values of symbols X0...X3
# Assembly-time renaming only; no instructions are emitted. X0 takes the
# register of X1, X1 that of X2, X2 that of X3, and X3 takes the register
# X0 had before the rotation. X_ is a scratch symbol used for the swap.
13162306a36Sopenharmony_ci.macro rotate_Xs
13262306a36Sopenharmony_ci	X_ = X0
13362306a36Sopenharmony_ci	X0 = X1
13462306a36Sopenharmony_ci	X1 = X2
13562306a36Sopenharmony_ci	X2 = X3
13662306a36Sopenharmony_ci	X3 = X_
13762306a36Sopenharmony_ci.endm
13862306a36Sopenharmony_ci
13962306a36Sopenharmony_ci# ROTATE_ARGS
14062306a36Sopenharmony_ci# Rotate values of symbols a...h
# Assembly-time renaming only; no instructions are emitted. Each symbol
# takes the register of its predecessor (b <- a, c <- b, ..., h <- g) and
# a takes the register that held h before the rotation. old_h is pointed
# at that same register, so after the rotation old_h and a name the same
# register. DO_4ROUNDS relies on this to finish the h additions of the
# previous round at the start of the next one. TMP_ is a scratch symbol.
14162306a36Sopenharmony_ci.macro ROTATE_ARGS
14262306a36Sopenharmony_ci	old_h = h
14362306a36Sopenharmony_ci	TMP_ = h
14462306a36Sopenharmony_ci	h = g
14562306a36Sopenharmony_ci	g = f
14662306a36Sopenharmony_ci	f = e
14762306a36Sopenharmony_ci	e = d
14862306a36Sopenharmony_ci	d = c
14962306a36Sopenharmony_ci	c = b
15062306a36Sopenharmony_ci	b = a
15162306a36Sopenharmony_ci	a = TMP_
15262306a36Sopenharmony_ci.endm
15362306a36Sopenharmony_ci
# FOUR_ROUNDS_AND_SCHED disp
# Runs four SHA-256 rounds on the working variables a..h and, interleaved
# with them, computes the next four message schedule words (per 128-bit
# lane) from X0..X3.
#   disp - byte displacement of the k+w dword for round N+0, relative to
#          %rsp + SRND; rounds N+1..N+3 read disp+4, disp+8 and disp+12.
# The new schedule words are written to the register named X0 on entry.
# ROTATE_ARGS is applied after every round and rotate_Xs once at the end.
# Clobbers y0-y3, T1, XTMP0-XTMP5 and the flags.
# In the scalar comments ">>" stands for a rotate right (the instruction
# is rorx). Scalar and vector instructions are interleaved, presumably
# for instruction-level parallelism, so keep the statement order when
# editing.
15462306a36Sopenharmony_ci.macro FOUR_ROUNDS_AND_SCHED disp
15562306a36Sopenharmony_ci################################### RND N + 0 ############################
15662306a36Sopenharmony_ci
15762306a36Sopenharmony_ci	mov	a, y3		# y3 = a                                # MAJA
15862306a36Sopenharmony_ci	rorx	$25, e, y0	# y0 = e >> 25				# S1A
15962306a36Sopenharmony_ci	rorx	$11, e, y1	# y1 = e >> 11				# S1B
16062306a36Sopenharmony_ci
16162306a36Sopenharmony_ci	addl	\disp(%rsp, SRND), h		# h = k + w + h         # --
16262306a36Sopenharmony_ci	or	c, y3		# y3 = a|c                              # MAJA
16362306a36Sopenharmony_ci	vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
16462306a36Sopenharmony_ci	mov	f, y2		# y2 = f                                # CH
16562306a36Sopenharmony_ci	rorx	$13, a, T1	# T1 = a >> 13				# S0B
16662306a36Sopenharmony_ci
16762306a36Sopenharmony_ci	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
16862306a36Sopenharmony_ci	xor	g, y2		# y2 = f^g                              # CH
16962306a36Sopenharmony_ci	vpaddd	X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
17062306a36Sopenharmony_ci	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
17162306a36Sopenharmony_ci
17262306a36Sopenharmony_ci	and	e, y2		# y2 = (f^g)&e                          # CH
17362306a36Sopenharmony_ci	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
17462306a36Sopenharmony_ci	rorx	$22, a, y1	# y1 = a >> 22				# S0A
17562306a36Sopenharmony_ci	add	h, d		# d = k + w + h + d                     # --
17662306a36Sopenharmony_ci
17762306a36Sopenharmony_ci	and	b, y3		# y3 = (a|c)&b                          # MAJA
17862306a36Sopenharmony_ci	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
17962306a36Sopenharmony_ci	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
18062306a36Sopenharmony_ci	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
18162306a36Sopenharmony_ci
18262306a36Sopenharmony_ci	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
18362306a36Sopenharmony_ci	vpsrld	$7, XTMP1, XTMP2
18462306a36Sopenharmony_ci	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
18562306a36Sopenharmony_ci	mov	a, T1		# T1 = a                                # MAJB
18662306a36Sopenharmony_ci	and	c, T1		# T1 = a&c                              # MAJB
18762306a36Sopenharmony_ci
18862306a36Sopenharmony_ci	add	y0, y2		# y2 = S1 + CH                          # --
18962306a36Sopenharmony_ci	vpslld	$(32-7), XTMP1, XTMP3
19062306a36Sopenharmony_ci	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
19162306a36Sopenharmony_ci	add	y1, h		# h = k + w + h + S0                    # --
19262306a36Sopenharmony_ci
19362306a36Sopenharmony_ci	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
19462306a36Sopenharmony_ci	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7
19562306a36Sopenharmony_ci
19662306a36Sopenharmony_ci	vpsrld	$18, XTMP1, XTMP2
19762306a36Sopenharmony_ci	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
19862306a36Sopenharmony_ci	add	y3, h		# h = t1 + S0 + MAJ                     # --
19962306a36Sopenharmony_ci
20062306a36Sopenharmony_ci
20162306a36Sopenharmony_ci	ROTATE_ARGS
20262306a36Sopenharmony_ci
20362306a36Sopenharmony_ci################################### RND N + 1 ############################
20462306a36Sopenharmony_ci
20562306a36Sopenharmony_ci	mov	a, y3		# y3 = a                                # MAJA
20662306a36Sopenharmony_ci	rorx	$25, e, y0	# y0 = e >> 25				# S1A
20762306a36Sopenharmony_ci	rorx	$11, e, y1	# y1 = e >> 11				# S1B
20862306a36Sopenharmony_ci	offset = \disp + 1*4
20962306a36Sopenharmony_ci	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
21062306a36Sopenharmony_ci	or	c, y3		# y3 = a|c                              # MAJA
21162306a36Sopenharmony_ci
21262306a36Sopenharmony_ci
21362306a36Sopenharmony_ci	vpsrld	$3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
21462306a36Sopenharmony_ci	mov	f, y2		# y2 = f                                # CH
21562306a36Sopenharmony_ci	rorx	$13, a, T1	# T1 = a >> 13				# S0B
21662306a36Sopenharmony_ci	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
21762306a36Sopenharmony_ci	xor	g, y2		# y2 = f^g                              # CH
21862306a36Sopenharmony_ci
21962306a36Sopenharmony_ci
22062306a36Sopenharmony_ci	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
22162306a36Sopenharmony_ci	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
22262306a36Sopenharmony_ci	rorx	$22, a, y1	# y1 = a >> 22				# S0A
22362306a36Sopenharmony_ci	and	e, y2		# y2 = (f^g)&e                          # CH
22462306a36Sopenharmony_ci	add	h, d		# d = k + w + h + d                     # --
22562306a36Sopenharmony_ci
22662306a36Sopenharmony_ci	vpslld	$(32-18), XTMP1, XTMP1
22762306a36Sopenharmony_ci	and	b, y3		# y3 = (a|c)&b                          # MAJA
22862306a36Sopenharmony_ci	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
22962306a36Sopenharmony_ci
23062306a36Sopenharmony_ci	vpxor	XTMP1, XTMP3, XTMP3
23162306a36Sopenharmony_ci	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
23262306a36Sopenharmony_ci	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
23362306a36Sopenharmony_ci
23462306a36Sopenharmony_ci	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
23562306a36Sopenharmony_ci	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
23662306a36Sopenharmony_ci	mov	a, T1		# T1 = a                                # MAJB
23762306a36Sopenharmony_ci	and	c, T1		# T1 = a&c                              # MAJB
23862306a36Sopenharmony_ci	add	y0, y2		# y2 = S1 + CH                          # --
23962306a36Sopenharmony_ci
24062306a36Sopenharmony_ci	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
24162306a36Sopenharmony_ci	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
24262306a36Sopenharmony_ci	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
24362306a36Sopenharmony_ci	add	y1, h		# h = k + w + h + S0                    # --
24462306a36Sopenharmony_ci
24562306a36Sopenharmony_ci	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
24662306a36Sopenharmony_ci	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
24762306a36Sopenharmony_ci	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
24862306a36Sopenharmony_ci	add	y3, h		# h = t1 + S0 + MAJ                     # --
24962306a36Sopenharmony_ci
25062306a36Sopenharmony_ci	vpsrld	$10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
25162306a36Sopenharmony_ci
25262306a36Sopenharmony_ci
25362306a36Sopenharmony_ci	ROTATE_ARGS
25462306a36Sopenharmony_ci
25562306a36Sopenharmony_ci################################### RND N + 2 ############################
25662306a36Sopenharmony_ci
25762306a36Sopenharmony_ci	mov	a, y3		# y3 = a                                # MAJA
25862306a36Sopenharmony_ci	rorx	$25, e, y0	# y0 = e >> 25				# S1A
25962306a36Sopenharmony_ci	offset = \disp + 2*4
26062306a36Sopenharmony_ci	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
26162306a36Sopenharmony_ci
26262306a36Sopenharmony_ci	vpsrlq	$19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
26362306a36Sopenharmony_ci	rorx	$11, e, y1	# y1 = e >> 11				# S1B
26462306a36Sopenharmony_ci	or	c, y3		# y3 = a|c                              # MAJA
26562306a36Sopenharmony_ci	mov	f, y2		# y2 = f                                # CH
26662306a36Sopenharmony_ci	xor	g, y2		# y2 = f^g                              # CH
26762306a36Sopenharmony_ci
26862306a36Sopenharmony_ci	rorx	$13, a, T1	# T1 = a >> 13				# S0B
26962306a36Sopenharmony_ci	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
27062306a36Sopenharmony_ci	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
27162306a36Sopenharmony_ci	and	e, y2		# y2 = (f^g)&e                          # CH
27262306a36Sopenharmony_ci
27362306a36Sopenharmony_ci	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
27462306a36Sopenharmony_ci	vpxor	XTMP3, XTMP2, XTMP2
27562306a36Sopenharmony_ci	add	h, d		# d = k + w + h + d                     # --
27662306a36Sopenharmony_ci	and	b, y3		# y3 = (a|c)&b                          # MAJA
27762306a36Sopenharmony_ci
27862306a36Sopenharmony_ci	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
27962306a36Sopenharmony_ci	rorx	$22, a, y1	# y1 = a >> 22				# S0A
28062306a36Sopenharmony_ci	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
28162306a36Sopenharmony_ci	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
28262306a36Sopenharmony_ci
28362306a36Sopenharmony_ci	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
28462306a36Sopenharmony_ci	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
28562306a36Sopenharmony_ci	rorx	$2, a ,T1	# T1 = (a >> 2)				# S0
28662306a36Sopenharmony_ci	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}
28762306a36Sopenharmony_ci
28862306a36Sopenharmony_ci	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
28962306a36Sopenharmony_ci	mov	a, T1		# T1 = a                                # MAJB
29062306a36Sopenharmony_ci	and	c, T1		# T1 = a&c                              # MAJB
29162306a36Sopenharmony_ci	add	y0, y2		# y2 = S1 + CH                          # --
29262306a36Sopenharmony_ci	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}
29362306a36Sopenharmony_ci
29462306a36Sopenharmony_ci	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
29562306a36Sopenharmony_ci	add	y1,h		# h = k + w + h + S0                    # --
29662306a36Sopenharmony_ci	add	y2,d		# d = k + w + h + d + S1 + CH = d + t1  # --
29762306a36Sopenharmony_ci	add	y2,h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
29862306a36Sopenharmony_ci
29962306a36Sopenharmony_ci	add	y3,h		# h = t1 + S0 + MAJ                     # --
30062306a36Sopenharmony_ci
30162306a36Sopenharmony_ci
30262306a36Sopenharmony_ci	ROTATE_ARGS
30362306a36Sopenharmony_ci
30462306a36Sopenharmony_ci################################### RND N + 3 ############################
30562306a36Sopenharmony_ci
30662306a36Sopenharmony_ci	mov	a, y3		# y3 = a                                # MAJA
30762306a36Sopenharmony_ci	rorx	$25, e, y0	# y0 = e >> 25				# S1A
30862306a36Sopenharmony_ci	rorx	$11, e, y1	# y1 = e >> 11				# S1B
30962306a36Sopenharmony_ci	offset = \disp + 3*4
31062306a36Sopenharmony_ci	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
31162306a36Sopenharmony_ci	or	c, y3		# y3 = a|c                              # MAJA
31262306a36Sopenharmony_ci
31362306a36Sopenharmony_ci
31462306a36Sopenharmony_ci	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
31562306a36Sopenharmony_ci	mov	f, y2		# y2 = f                                # CH
31662306a36Sopenharmony_ci	rorx	$13, a, T1	# T1 = a >> 13				# S0B
31762306a36Sopenharmony_ci	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
31862306a36Sopenharmony_ci	xor	g, y2		# y2 = f^g                              # CH
31962306a36Sopenharmony_ci
32062306a36Sopenharmony_ci
32162306a36Sopenharmony_ci	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
32262306a36Sopenharmony_ci	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
32362306a36Sopenharmony_ci	and	e, y2		# y2 = (f^g)&e                          # CH
32462306a36Sopenharmony_ci	add	h, d		# d = k + w + h + d                     # --
32562306a36Sopenharmony_ci	and	b, y3		# y3 = (a|c)&b                          # MAJA
32662306a36Sopenharmony_ci
32762306a36Sopenharmony_ci	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
32862306a36Sopenharmony_ci	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
32962306a36Sopenharmony_ci	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
33062306a36Sopenharmony_ci
33162306a36Sopenharmony_ci	vpxor	XTMP3, XTMP2, XTMP2
33262306a36Sopenharmony_ci	rorx	$22, a, y1	# y1 = a >> 22				# S0A
33362306a36Sopenharmony_ci	add	y0, y2		# y2 = S1 + CH                          # --
33462306a36Sopenharmony_ci
33562306a36Sopenharmony_ci	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
33662306a36Sopenharmony_ci	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
33762306a36Sopenharmony_ci	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
33862306a36Sopenharmony_ci
33962306a36Sopenharmony_ci	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
34062306a36Sopenharmony_ci	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}
34162306a36Sopenharmony_ci
34262306a36Sopenharmony_ci	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
34362306a36Sopenharmony_ci	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
34462306a36Sopenharmony_ci	mov	a, T1		# T1 = a                                # MAJB
34562306a36Sopenharmony_ci	and	c, T1		# T1 = a&c                              # MAJB
34662306a36Sopenharmony_ci	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
34762306a36Sopenharmony_ci
34862306a36Sopenharmony_ci	add	y1, h		# h = k + w + h + S0                    # --
34962306a36Sopenharmony_ci	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
35062306a36Sopenharmony_ci	add	y3, h		# h = t1 + S0 + MAJ                     # --
35162306a36Sopenharmony_ci
35262306a36Sopenharmony_ci	ROTATE_ARGS
35362306a36Sopenharmony_ci	rotate_Xs
35462306a36Sopenharmony_ci.endm
35562306a36Sopenharmony_ci
# DO_4ROUNDS disp
# Runs four SHA-256 rounds on the working variables a..h with no message
# scheduling.
#   disp - byte displacement of the k+w dword for round N+0, relative to
#          %rsp + SRND; rounds N+1..N+3 read disp+4, disp+8 and disp+12.
# For rounds N+0..N+2 the last two additions into h (S1 + CH, then MAJ)
# are postponed and issued through old_h at the start of the following
# round, after ROTATE_ARGS has renamed the registers. Round N+3 completes
# its own h before the final ROTATE_ARGS.
# Clobbers y0-y3, T1 and the flags.
# In the comments ">>" stands for a rotate right (the instruction is
# rorx).
35662306a36Sopenharmony_ci.macro DO_4ROUNDS disp
35762306a36Sopenharmony_ci################################### RND N + 0 ###########################
35862306a36Sopenharmony_ci
35962306a36Sopenharmony_ci	mov	f, y2		# y2 = f                                # CH
36062306a36Sopenharmony_ci	rorx	$25, e, y0	# y0 = e >> 25				# S1A
36162306a36Sopenharmony_ci	rorx	$11, e, y1	# y1 = e >> 11				# S1B
36262306a36Sopenharmony_ci	xor	g, y2		# y2 = f^g                              # CH
36362306a36Sopenharmony_ci
36462306a36Sopenharmony_ci	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
36562306a36Sopenharmony_ci	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
36662306a36Sopenharmony_ci	and	e, y2		# y2 = (f^g)&e                          # CH
36762306a36Sopenharmony_ci
36862306a36Sopenharmony_ci	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
36962306a36Sopenharmony_ci	rorx	$13, a, T1	# T1 = a >> 13				# S0B
37062306a36Sopenharmony_ci	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
37162306a36Sopenharmony_ci	rorx	$22, a, y1	# y1 = a >> 22				# S0A
37262306a36Sopenharmony_ci	mov	a, y3		# y3 = a                                # MAJA
37362306a36Sopenharmony_ci
37462306a36Sopenharmony_ci	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
37562306a36Sopenharmony_ci	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
37662306a36Sopenharmony_ci	addl	\disp(%rsp, SRND), h		# h = k + w + h # --
37762306a36Sopenharmony_ci	or	c, y3		# y3 = a|c                              # MAJA
37862306a36Sopenharmony_ci
37962306a36Sopenharmony_ci	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
38062306a36Sopenharmony_ci	mov	a, T1		# T1 = a                                # MAJB
38162306a36Sopenharmony_ci	and	b, y3		# y3 = (a|c)&b                          # MAJA
38262306a36Sopenharmony_ci	and	c, T1		# T1 = a&c                              # MAJB
38362306a36Sopenharmony_ci	add	y0, y2		# y2 = S1 + CH                          # --
38462306a36Sopenharmony_ci
38562306a36Sopenharmony_ci
38662306a36Sopenharmony_ci	add	h, d		# d = k + w + h + d                     # --
38762306a36Sopenharmony_ci	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
38862306a36Sopenharmony_ci	add	y1, h		# h = k + w + h + S0                    # --
38962306a36Sopenharmony_ci	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
39062306a36Sopenharmony_ci
39162306a36Sopenharmony_ci	ROTATE_ARGS
39262306a36Sopenharmony_ci
39362306a36Sopenharmony_ci################################### RND N + 1 ###########################
39462306a36Sopenharmony_ci
39562306a36Sopenharmony_ci	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
39662306a36Sopenharmony_ci	mov	f, y2		# y2 = f                                # CH
39762306a36Sopenharmony_ci	rorx	$25, e, y0	# y0 = e >> 25				# S1A
39862306a36Sopenharmony_ci	rorx	$11, e, y1	# y1 = e >> 11				# S1B
39962306a36Sopenharmony_ci	xor	g, y2		# y2 = f^g                              # CH
40062306a36Sopenharmony_ci
40162306a36Sopenharmony_ci	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
40262306a36Sopenharmony_ci	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
40362306a36Sopenharmony_ci	and	e, y2		# y2 = (f^g)&e                          # CH
40462306a36Sopenharmony_ci	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
40562306a36Sopenharmony_ci
40662306a36Sopenharmony_ci	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
40762306a36Sopenharmony_ci	rorx	$13, a, T1	# T1 = a >> 13				# S0B
40862306a36Sopenharmony_ci	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
40962306a36Sopenharmony_ci	rorx	$22, a, y1	# y1 = a >> 22				# S0A
41062306a36Sopenharmony_ci	mov	a, y3		# y3 = a                                # MAJA
41162306a36Sopenharmony_ci
41262306a36Sopenharmony_ci	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
41362306a36Sopenharmony_ci	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
41462306a36Sopenharmony_ci	offset = 4*1 + \disp
41562306a36Sopenharmony_ci	addl	offset(%rsp, SRND), h		# h = k + w + h # --
41662306a36Sopenharmony_ci	or	c, y3		# y3 = a|c                              # MAJA
41762306a36Sopenharmony_ci
41862306a36Sopenharmony_ci	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
41962306a36Sopenharmony_ci	mov	a, T1		# T1 = a                                # MAJB
42062306a36Sopenharmony_ci	and	b, y3		# y3 = (a|c)&b                          # MAJA
42162306a36Sopenharmony_ci	and	c, T1		# T1 = a&c                              # MAJB
42262306a36Sopenharmony_ci	add	y0, y2		# y2 = S1 + CH                          # --
42362306a36Sopenharmony_ci
42462306a36Sopenharmony_ci
42562306a36Sopenharmony_ci	add	h, d		# d = k + w + h + d                     # --
42662306a36Sopenharmony_ci	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
42762306a36Sopenharmony_ci	add	y1, h		# h = k + w + h + S0                    # --
42862306a36Sopenharmony_ci
42962306a36Sopenharmony_ci	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
43062306a36Sopenharmony_ci
43162306a36Sopenharmony_ci	ROTATE_ARGS
43262306a36Sopenharmony_ci
43362306a36Sopenharmony_ci################################### RND N + 2 ##############################
43462306a36Sopenharmony_ci
43562306a36Sopenharmony_ci	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
43662306a36Sopenharmony_ci	mov	f, y2		# y2 = f                                # CH
43762306a36Sopenharmony_ci	rorx	$25, e, y0	# y0 = e >> 25				# S1A
43862306a36Sopenharmony_ci	rorx	$11, e, y1	# y1 = e >> 11				# S1B
43962306a36Sopenharmony_ci	xor	g, y2		# y2 = f^g                              # CH
44062306a36Sopenharmony_ci
44162306a36Sopenharmony_ci	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
44262306a36Sopenharmony_ci	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
44362306a36Sopenharmony_ci	and	e, y2		# y2 = (f^g)&e                          # CH
44462306a36Sopenharmony_ci	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
44562306a36Sopenharmony_ci
44662306a36Sopenharmony_ci	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
44762306a36Sopenharmony_ci	rorx	$13, a, T1	# T1 = a >> 13				# S0B
44862306a36Sopenharmony_ci	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
44962306a36Sopenharmony_ci	rorx	$22, a, y1	# y1 = a >> 22				# S0A
45062306a36Sopenharmony_ci	mov	a, y3		# y3 = a                                # MAJA
45162306a36Sopenharmony_ci
45262306a36Sopenharmony_ci	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
45362306a36Sopenharmony_ci	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
45462306a36Sopenharmony_ci	offset = 4*2 + \disp
45562306a36Sopenharmony_ci	addl	offset(%rsp, SRND), h		# h = k + w + h # --
45662306a36Sopenharmony_ci	or	c, y3		# y3 = a|c                              # MAJA
45762306a36Sopenharmony_ci
45862306a36Sopenharmony_ci	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
45962306a36Sopenharmony_ci	mov	a, T1		# T1 = a                                # MAJB
46062306a36Sopenharmony_ci	and	b, y3		# y3 = (a|c)&b                          # MAJA
46162306a36Sopenharmony_ci	and	c, T1		# T1 = a&c                              # MAJB
46262306a36Sopenharmony_ci	add	y0, y2		# y2 = S1 + CH                          # --
46362306a36Sopenharmony_ci
46462306a36Sopenharmony_ci
46562306a36Sopenharmony_ci	add	h, d		# d = k + w + h + d                     # --
46662306a36Sopenharmony_ci	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
46762306a36Sopenharmony_ci	add	y1, h		# h = k + w + h + S0                    # --
46862306a36Sopenharmony_ci
46962306a36Sopenharmony_ci	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
47062306a36Sopenharmony_ci
47162306a36Sopenharmony_ci	ROTATE_ARGS
47262306a36Sopenharmony_ci
47362306a36Sopenharmony_ci################################### RND N + 3 ###########################
47462306a36Sopenharmony_ci
47562306a36Sopenharmony_ci	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
47662306a36Sopenharmony_ci	mov	f, y2		# y2 = f                                # CH
47762306a36Sopenharmony_ci	rorx	$25, e, y0	# y0 = e >> 25				# S1A
47862306a36Sopenharmony_ci	rorx	$11, e, y1	# y1 = e >> 11				# S1B
47962306a36Sopenharmony_ci	xor	g, y2		# y2 = f^g                              # CH
48062306a36Sopenharmony_ci
48162306a36Sopenharmony_ci	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
48262306a36Sopenharmony_ci	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
48362306a36Sopenharmony_ci	and	e, y2		# y2 = (f^g)&e                          # CH
48462306a36Sopenharmony_ci	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
48562306a36Sopenharmony_ci
48662306a36Sopenharmony_ci	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
48762306a36Sopenharmony_ci	rorx	$13, a, T1	# T1 = a >> 13				# S0B
48862306a36Sopenharmony_ci	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
48962306a36Sopenharmony_ci	rorx	$22, a, y1	# y1 = a >> 22				# S0A
49062306a36Sopenharmony_ci	mov	a, y3		# y3 = a                                # MAJA
49162306a36Sopenharmony_ci
49262306a36Sopenharmony_ci	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
49362306a36Sopenharmony_ci	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
49462306a36Sopenharmony_ci	offset = 4*3 + \disp
49562306a36Sopenharmony_ci	addl	offset(%rsp, SRND), h		# h = k + w + h # --
49662306a36Sopenharmony_ci	or	c, y3		# y3 = a|c                              # MAJA
49762306a36Sopenharmony_ci
49862306a36Sopenharmony_ci	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
49962306a36Sopenharmony_ci	mov	a, T1		# T1 = a                                # MAJB
50062306a36Sopenharmony_ci	and	b, y3		# y3 = (a|c)&b                          # MAJA
50162306a36Sopenharmony_ci	and	c, T1		# T1 = a&c                              # MAJB
50262306a36Sopenharmony_ci	add	y0, y2		# y2 = S1 + CH                          # --
50362306a36Sopenharmony_ci
50462306a36Sopenharmony_ci
50562306a36Sopenharmony_ci	add	h, d		# d = k + w + h + d                     # --
50662306a36Sopenharmony_ci	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
50762306a36Sopenharmony_ci	add	y1, h		# h = k + w + h + S0                    # --
50862306a36Sopenharmony_ci
50962306a36Sopenharmony_ci	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
51062306a36Sopenharmony_ci
51162306a36Sopenharmony_ci
51262306a36Sopenharmony_ci	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
51362306a36Sopenharmony_ci
51462306a36Sopenharmony_ci	add	y3, h		# h = t1 + S0 + MAJ                     # --
51562306a36Sopenharmony_ci
51662306a36Sopenharmony_ci	ROTATE_ARGS
51762306a36Sopenharmony_ci
51862306a36Sopenharmony_ci.endm
51962306a36Sopenharmony_ci
########################################################################
## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state (CTX); eight 32-bit digest words at offsets
##         4*0 .. 4*7 are loaded on entry and updated after every block
## arg 2 : pointer to input data (INP)
## arg 3 : Num blocks (NUM_BLKS); converted to a byte count (x64) on entry
##
## Blocks are processed two at a time: the message schedule for both
## blocks is computed in one pass (first block in the low 128-bit lane,
## second block in the high lane) and K+W is spilled to the _XFER stack
## area.  The rounds of the first block run interleaved with the
## scheduling; the rounds of the second block are then replayed from the
## upper 16 bytes of each 32-byte _XFER slot.  An odd trailing block (or
## a single block) goes through .Ldo_last_block using 128-bit loads.
##
## Callee-saved rbx, rbp, r12-r15 are preserved.  The stack is realigned
## to 32 bytes; rbp keeps the original stack pointer for the epilogue.
########################################################################
.text
SYM_TYPED_FUNC_START(sha256_transform_rorx)
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	push	%rbp
	mov	%rsp, %rbp	# rbp = stack pointer to restore in epilogue

	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary (vmovdqa spills below)

	shl	$6, NUM_BLKS	# convert to bytes
	jz	.Ldone_hash	# zero blocks: nothing to do
	lea	-64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP	# first block == last block ?
	je	.Lonly_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)	# saved; reloaded from the stack after the rounds

.Lloop0:
	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	## (low lane = first block, high lane = second block)
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3

.Llast_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)	# _INP = address just past the first block

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	## (SRND is a byte offset into both K256 and the _XFER area)
	xor	SRND, SRND

	## The macro arguments below are parenthesised so that the whole
	## expression is passed as a single argument independent of how the
	## assembler treats whitespace inside macro operands.
.align 16
.Lloop1:
	leaq	K256+0*32(%rip), INP		## reuse INP as scratch reg
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	(_XFER + 0*32)

	leaq	K256+1*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	(_XFER + 1*32)

	leaq	K256+2*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	(_XFER + 2*32)

	leaq	K256+3*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	(_XFER + 3*32)

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	.Lloop1

.Lloop2:
	## Do last 16 rounds with no scheduling
	## (two passes of 8 rounds; X2/X3 are moved into X0/X1 between them)
	leaq	K256+0*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	(_XFER + 0*32)

	leaq	K256+1*32(%rip), INP
	vpaddd	(INP, SRND), X1, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	(_XFER + 1*32)
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	.Lloop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP	# INP was used as scratch above; restore it

	## fold the working variables into the digest (first block done)
	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	## INP is now one block past the block just hashed; if that is beyond
	## the last block there is no second block to replay.
	cmp	_INP_END(%rsp), INP
	ja	.Ldone_hash

	#### Do second block using previously scheduled results
	#### (the +16 selects the upper half of each 32-byte _XFER slot)
	xor	SRND, SRND
.align 16
.Lloop3:
	DO_4ROUNDS	(_XFER + 0*32 + 16)
	DO_4ROUNDS	(_XFER + 1*32 + 16)
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	.Lloop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP	# advance past the second block

	## fold the working variables into the digest (second block done)
	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	## INP <  last block : at least two blocks remain -> next pair
	## INP >  last block : nothing remains            -> done
	## INP == last block : exactly one block remains  -> fall through
	cmp	_INP_END(%rsp), INP
	jb	.Lloop0
	ja	.Ldone_hash

.Ldo_last_block:
	## Single remaining block: load 4 x 16 bytes only, so nothing past the
	## end of the block is read.
	## NOTE(review): XWORD0..3 / X_BYTE_FLIP_MASK are presumably the xmm
	## views of X0..X3 / BYTE_FLIP_MASK - confirm against their defines.
	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	.Llast_block_enter

.Lonly_one_block:
	## Entry path for a one-block request: same setup as the main path,
	## then straight to the single-block loader.

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	.Ldo_last_block

.Ldone_hash:

	mov	%rbp, %rsp	# drop the aligned frame
	pop	%rbp

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	## This function dirtied the upper halves of the YMM registers; clear
	## them so SSE code executed afterwards does not pay AVX/SSE
	## transition penalties.
	vzeroupper
	RET
SYM_FUNC_END(sha256_transform_rorx)
72162306a36Sopenharmony_ci
# SHA-256 round constants K[0..63], four per row.  Every row is emitted
# twice back to back so that a 32-byte load yields the same four constants
# in both 128-bit lanes (one lane per block of the two-block schedule).
# 16 rows x 2 copies x 16 bytes = 512 bytes, matching the section entsize.
.macro K256_ROW_X2 k0, k1, k2, k3
	.rept	2
	.long	\k0,\k1,\k2,\k3
	.endr
.endm

.section	.rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
	K256_ROW_X2	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	K256_ROW_X2	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	K256_ROW_X2	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	K256_ROW_X2	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	K256_ROW_X2	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	K256_ROW_X2	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	K256_ROW_X2	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	K256_ROW_X2	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	K256_ROW_X2	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	K256_ROW_X2	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	K256_ROW_X2	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	K256_ROW_X2	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	K256_ROW_X2	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	K256_ROW_X2	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	K256_ROW_X2	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	K256_ROW_X2	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
75762306a36Sopenharmony_ci
# vpshufb control that reverses the byte order inside every 32-bit dword
# (source byte indices 3,2,1,0, 7,6,5,4, ...); the same 16-byte pattern is
# replicated for both 128-bit lanes.
.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.rept	2
	.octa	0x0c0d0e0f08090a0b0405060700010203
	.endr
76262306a36Sopenharmony_ci
# shuffle xBxA -> 00BA
# Byte-shuffle control, one copy per 128-bit lane: source dwords 0 and 2
# are packed into the low two dwords; the 0xFF selectors zero the upper
# two dwords.  (Presumably consumed by vpshufb in the scheduling macro.)
.section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.rept	2
	.octa	0xFFFFFFFFFFFFFFFF0b0a090803020100
	.endr
76862306a36Sopenharmony_ci
# shuffle xDxC -> DC00
# Byte-shuffle control, one copy per 128-bit lane: source dwords 0 and 2
# are packed into the high two dwords; the 0xFF selectors zero the lower
# two dwords.  (Presumably consumed by vpshufb in the scheduling macro.)
.section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.rept	2
	.octa	0x0b0a090803020100FFFFFFFFFFFFFFFF
	.endr
774