########################################################################
# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################

#include <linux/linkage.h>
#include <linux/cfi_types.h>

## assume buffers not aligned
## MOVDQ is the instruction used to load message data from the input
## pointer; movdqu accepts any address alignment.
#define    MOVDQ movdqu

################################ Define Macros

# addm [mem], reg
# Add the dword at mem into reg, then store the sum back to mem.
# On exit both mem and reg hold (old mem + old reg).
# Clobbers: flags.
.macro addm mem reg
        add     \mem, \reg
        mov     \reg, \mem
.endm

################################

# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load 16 bytes from mem into xmm (unaligned load via MOVDQ), then
# permute the bytes of xmm with byte_flip_mask using pshufb.
# With the flip mask used in this file each dword is byte swapped.
.macro COPY_XMM_AND_BSWAP dst src mask
        MOVDQ \src, \dst
        pshufb \mask, \dst
.endm

################################

# Message schedule: four xmm registers, four 32-bit W words each.
# rotate_Xs renames these symbols after every four rounds.
X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

# Vector scratch registers
XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER = %xmm9            # holds K + W before it is spilled to the stack

SHUF_00BA = %xmm10      # shuffle xBxA -> 00BA
SHUF_DC00 = %xmm11      # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm12

NUM_BLKS = %rdx   # 3rd arg
INP = %rsi        # 2nd arg
CTX = %rdi        # 1st arg

# Working variables a..h (32-bit). ROTATE_ARGS renames these symbols
# after every round.
# Note that e lives in %edx, the low half of NUM_BLKS: the block count
# must be consumed before e is loaded.
SRND = %rsi       # clobbers INP
c = %ecx
d = %r8d
e = %edx
TBL = %r12        # pointer into the K256 table
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

# Scalar scratch registers used inside each round
y0 = %r13d
y1 = %r14d
y2 = %r15d



# Stack frame layout, as offsets from the (16-byte aligned) %rsp:
#   _INP_END  =  0   saved pointer to the end of the input   (8 bytes)
#   _INP      =  8   saved pointer to the current block      (8 bytes)
#   _XFER     = 16   four K + W dwords for the next 4 rounds (16 bytes)
#   _XMM_SAVE = 32   unused, size 0
# _XFER lands on a 16-byte boundary, which the movdqa stores to it need.
_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0
_INP            = _INP_END  + _INP_END_SIZE
_XFER           = _INP      + _INP_SIZE
_XMM_SAVE       = _XFER     + _XFER_SIZE
STACK_SIZE      = _XMM_SAVE + _XMM_SAVE_SIZE

# rotate_Xs
# Rotate values of symbols X0...X3
# This only rebinds assembler symbols; no instructions are emitted.
# Afterwards X0..X2 name the registers previously called X1..X3 and
# X3 names the register previously called X0.
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
# This only rebinds assembler symbols; no instructions are emitted.
# Afterwards a names the register previously called h, b the one
# previously called a, and so on down to h, which names the old g.
# Eight invocations restore the original binding.
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm

# FOUR_ROUNDS_AND_SCHED
# Run four rounds and, interleaved with them, compute the next four
# message schedule words.
#
# In:   X0..X3       = the sixteen most recent W words, X0 oldest
#       _XFER(%rsp)  = four K + W dwords for these four rounds
#       a..h         = working variables
# Out:  the register named X0 on entry holds the four new W words and
#       is renamed to X3 by rotate_Xs; a..h are renamed four times.
# Clobbers: y0, y1, y2, XTMP0..XTMP4, flags.
#
# In the comments "ror" is a 32-bit rotate right, ">>" and "<<" are
# logical shifts, and lane lists such as {BBAA} are written from the
# highest dword to the lowest.
.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time
	movdqa  X3, XTMP0
	mov     e, y0                   # y0 = e
	ror     $(25-11), y0            # y0 = e ror (25-11)
	mov     a, y1                   # y1 = a
	palignr $4, X2, XTMP0           # XTMP0 = W[-7]
	ror     $(22-13), y1            # y1 = a ror (22-13)
	xor     e, y0                   # y0 = e ^ (e ror (25-11))
	mov     f, y2                   # y2 = f
	ror     $(11-6), y0             # y0 = (e ror (11-6)) ^ (e ror (25-6))
	movdqa  X1, XTMP1
	xor     a, y1                   # y1 = a ^ (a ror (22-13))
	xor     g, y2                   # y2 = f^g
	paddd   X0, XTMP0               # XTMP0 = W[-7] + W[-16]
	xor     e, y0                   # y0 = e ^ (e ror (11-6)) ^ (e ror (25-6))
	and     e, y2                   # y2 = (f^g)&e
	ror     $(13-2), y1             # y1 = (a ror (13-2)) ^ (a ror (22-2))
	## compute s0
	palignr $4, X0, XTMP1           # XTMP1 = W[-15]
	xor     a, y1                   # y1 = a ^ (a ror (13-2)) ^ (a ror (22-2))
	ror     $6, y0                  # y0 = S1 = (e ror 6) ^ (e ror 11) ^ (e ror 25)
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	movdqa  XTMP1, XTMP2            # XTMP2 = W[-15]
	ror     $2, y1                  # y1 = S0 = (a ror 2) ^ (a ror 13) ^ (a ror 22)
	add     y0, y2                  # y2 = S1 + CH
	add     _XFER(%rsp) , y2        # y2 = k + w + S1 + CH
	movdqa  XTMP1, XTMP3            # XTMP3 = W[-15]
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	pslld   $(32-7), XTMP1          # XTMP1 = W[-15] << (32-7)
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	psrld   $7, XTMP2               # XTMP2 = W[-15] >> 7
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	por     XTMP2, XTMP1            # XTMP1 = W[-15] ror 7
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa  XTMP3, XTMP2            # XTMP2 = W[-15]
	mov     e, y0                   # y0 = e
	mov     a, y1                   # y1 = a
	movdqa  XTMP3, XTMP4            # XTMP4 = W[-15]
	ror     $(25-11), y0            # y0 = e ror (25-11)
	xor     e, y0                   # y0 = e ^ (e ror (25-11))
	mov     f, y2                   # y2 = f
	ror     $(22-13), y1            # y1 = a ror (22-13)
	pslld   $(32-18), XTMP3         # XTMP3 = W[-15] << (32-18)
	xor     a, y1                   # y1 = a ^ (a ror (22-13))
	ror     $(11-6), y0             # y0 = (e ror (11-6)) ^ (e ror (25-6))
	xor     g, y2                   # y2 = f^g
	psrld   $18, XTMP2              # XTMP2 = W[-15] >> 18
	ror     $(13-2), y1             # y1 = (a ror (13-2)) ^ (a ror (22-2))
	xor     e, y0                   # y0 = e ^ (e ror (11-6)) ^ (e ror (25-6))
	and     e, y2                   # y2 = (f^g)&e
	ror     $6, y0                  # y0 = S1 = (e ror 6) ^ (e ror 11) ^ (e ror 25)
	pxor    XTMP3, XTMP1            # XTMP1 = W[-15] ror 7 ^ W[-15] << (32-18)
	xor     a, y1                   # y1 = a ^ (a ror (13-2)) ^ (a ror (22-2))
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	psrld   $3, XTMP4               # XTMP4 = W[-15] >> 3
	add     y0, y2                  # y2 = S1 + CH
	add     (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	ror     $2, y1                  # y1 = S0 = (a ror 2) ^ (a ror 13) ^ (a ror 22)
	pxor    XTMP2, XTMP1            # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	pxor    XTMP4, XTMP1            # XTMP1 = s0
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	## compute low s1
	pshufd  $0b11111010, X3, XTMP2   # XTMP2 = W[-2] {BBAA}
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	paddd   XTMP1, XTMP0            # XTMP0 = W[-16] + W[-7] + s0
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa  XTMP2, XTMP3            # XTMP3 = W[-2] {BBAA}
	mov     e, y0                   # y0 = e
	mov     a, y1                   # y1 = a
	ror     $(25-11), y0            # y0 = e ror (25-11)
	movdqa  XTMP2, XTMP4            # XTMP4 = W[-2] {BBAA}
	xor     e, y0                   # y0 = e ^ (e ror (25-11))
	ror     $(22-13), y1            # y1 = a ror (22-13)
	mov     f, y2                   # y2 = f
	xor     a, y1                   # y1 = a ^ (a ror (22-13))
	ror     $(11-6), y0             # y0 = (e ror (11-6)) ^ (e ror (25-6))
	psrlq   $17, XTMP2              # XTMP2 = W[-2] ror 17 {xBxA}
	xor     g, y2                   # y2 = f^g
	psrlq   $19, XTMP3              # XTMP3 = W[-2] ror 19 {xBxA}
	xor     e, y0                   # y0 = e ^ (e ror (11-6)) ^ (e ror (25-6))
	and     e, y2                   # y2 = (f^g)&e
	psrld   $10, XTMP4              # XTMP4 = W[-2] >> 10 {BBAA}
	ror     $(13-2), y1             # y1 = (a ror (13-2)) ^ (a ror (22-2))
	xor     a, y1                   # y1 = a ^ (a ror (13-2)) ^ (a ror (22-2))
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	ror     $6, y0                  # y0 = S1 = (e ror 6) ^ (e ror 11) ^ (e ror 25)
	pxor    XTMP3, XTMP2            # XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xBxA}
	add     y0, y2                  # y2 = S1 + CH
	ror     $2, y1                  # y1 = S0 = (a ror 2) ^ (a ror 13) ^ (a ror 22)
	add     (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	pxor    XTMP2, XTMP4            # XTMP4 = s1 {xBxA}
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	pshufb  SHUF_00BA, XTMP4        # XTMP4 = s1 {00BA}
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	paddd   XTMP4, XTMP0            # XTMP0 = {..., ..., W[1], W[0]}
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	## compute high s1
	pshufd  $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa  XTMP2, XTMP3            # XTMP3 = W[-2] {DDCC}
	mov     e, y0                   # y0 = e
	ror     $(25-11), y0            # y0 = e ror (25-11)
	mov     a, y1                   # y1 = a
	movdqa  XTMP2, X0               # X0    = W[-2] {DDCC}
	ror     $(22-13), y1            # y1 = a ror (22-13)
	xor     e, y0                   # y0 = e ^ (e ror (25-11))
	mov     f, y2                   # y2 = f
	ror     $(11-6), y0             # y0 = (e ror (11-6)) ^ (e ror (25-6))
	psrlq   $17, XTMP2              # XTMP2 = W[-2] ror 17 {xDxC}
	xor     a, y1                   # y1 = a ^ (a ror (22-13))
	xor     g, y2                   # y2 = f^g
	psrlq   $19, XTMP3              # XTMP3 = W[-2] ror 19 {xDxC}
	xor     e, y0                   # y0 = e ^ (e ror (11-6)) ^ (e ror (25-6))
	and     e, y2                   # y2 = (f^g)&e
	ror     $(13-2), y1             # y1 = (a ror (13-2)) ^ (a ror (22-2))
	psrld   $10, X0                 # X0 = W[-2] >> 10 {DDCC}
	xor     a, y1                   # y1 = a ^ (a ror (13-2)) ^ (a ror (22-2))
	ror     $6, y0                  # y0 = S1 = (e ror 6) ^ (e ror 11) ^ (e ror 25)
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	pxor    XTMP3, XTMP2            # XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xDxC}
	ror     $2, y1                  # y1 = S0 = (a ror 2) ^ (a ror 13) ^ (a ror 22)
	add     y0, y2                  # y2 = S1 + CH
	add     (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	pxor    XTMP2, X0               # X0 = s1 {xDxC}
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	pshufb  SHUF_DC00, X0           # X0 = s1 {DC00}
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	paddd   XTMP0, X0               # X0 = {W[3], W[2], W[1], W[0]}
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	rotate_Xs
.endm

# DO_ROUND round
# Run one round with no message scheduling.
# In:   round = index 0..3 of the K + W dword to use; it is read from
#       the stack at _XFER + round * 4
#       a..h  = working variables
# Out:  a..h renamed once by ROTATE_ARGS
# Clobbers: y0, y1, y2, flags. Rebinds the assembler symbol "offset".
# In the comments "ror" is a 32-bit rotate right.
.macro DO_ROUND round
	mov     e, y0                 # y0 = e
	ror     $(25-11), y0          # y0 = e ror (25-11)
	mov     a, y1                 # y1 = a
	xor     e, y0                 # y0 = e ^ (e ror (25-11))
	ror     $(22-13), y1          # y1 = a ror (22-13)
	mov     f, y2                 # y2 = f
	xor     a, y1                 # y1 = a ^ (a ror (22-13))
	ror     $(11-6), y0           # y0 = (e ror (11-6)) ^ (e ror (25-6))
	xor     g, y2                 # y2 = f^g
	xor     e, y0                 # y0 = e ^ (e ror (11-6)) ^ (e ror (25-6))
	ror     $(13-2), y1           # y1 = (a ror (13-2)) ^ (a ror (22-2))
	and     e, y2                 # y2 = (f^g)&e
	xor     a, y1                 # y1 = a ^ (a ror (13-2)) ^ (a ror (22-2))
	ror     $6, y0                # y0 = S1 = (e ror 6) ^ (e ror 11) ^ (e ror 25)
	xor     g, y2                 # y2 = CH = ((f^g)&e)^g
	add     y0, y2                # y2 = S1 + CH
	ror     $2, y1                # y1 = S0 = (a ror 2) ^ (a ror 13) ^ (a ror 22)
	offset = \round * 4 + _XFER
	add     offset(%rsp), y2      # y2 = k + w + S1 + CH
	mov     a, y0                 # y0 = a
	add     y2, h                 # h = h + S1 + CH + k + w
	mov     a, y2                 # y2 = a
	or      c, y0                 # y0 = a|c
	add     h, d                  # d = d + h + S1 + CH + k + w
	and     c, y2                 # y2 = a&c
	and     b, y0                 # y0 = (a|c)&b
	add     y1, h                 # h = h + S1 + CH + k + w + S0
	or      y2, y0                # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                 # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm

########################################################################
## void sha256_transform_ssse3(struct sha256_state *state, const u8 *data,
##			       int blocks);
## arg 1 : pointer to state
##	   (struct sha256_state is assumed to begin with u32 state[8])
## arg 2 : pointer to input data
## arg 3 : Num blocks (64 bytes each; zero is a no-op)
##
## For every block: 48 rounds interleaved with message scheduling
## (.Lloop1, three passes of 16 rounds), then 16 plain rounds
## (.Lloop2, two passes of 8 rounds), then the working variables are
## added back into the state.
##
## Saves and restores rbx, rbp and r12-r15. The other integer registers
## named in this file and xmm0-xmm12 are overwritten.
########################################################################
.text
SYM_TYPED_FUNC_START(sha256_transform_ssse3)
	pushq   %rbx
	pushq   %r12
	pushq   %r13
	pushq   %r14
	pushq   %r15
	pushq   %rbp
	mov	%rsp, %rbp		 # rbp remembers the unaligned rsp

	subq    $STACK_SIZE, %rsp
	and	$~15, %rsp		 # movdqa stores to _XFER need 16-byte alignment

	## blocks is a 32-bit int, and the calling convention leaves bits
	## 63:32 of the argument register unspecified. Widen it before it
	## is used in 64-bit pointer arithmetic.
	movslq  %edx, NUM_BLKS
	shl     $6, NUM_BLKS		 # convert to bytes
	jz      .Ldone_hash
	add     INP, NUM_BLKS
	mov     NUM_BLKS, _INP_END(%rsp) # pointer to end of data

	## load initial digest
	## (this overwrites NUM_BLKS, which shares a register with e)
	mov     4*0(CTX), a
	mov     4*1(CTX), b
	mov     4*2(CTX), c
	mov     4*3(CTX), d
	mov     4*4(CTX), e
	mov     4*5(CTX), f
	mov     4*6(CTX), g
	mov     4*7(CTX), h

	movdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	movdqa  _SHUF_00BA(%rip), SHUF_00BA
	movdqa  _SHUF_DC00(%rip), SHUF_DC00

.Lloop0:
	lea     K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP      X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X3, 3*16(INP), BYTE_FLIP_MASK

	mov     INP, _INP(%rsp)		 # SRND reuses the INP register below

	## schedule 48 input dwords, by doing 3 rounds of 16 each
	mov     $3, SRND
.align 16
.Lloop1:
	movdqa  (TBL), XFER
	paddd   X0, XFER
	movdqa  XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa  1*16(TBL), XFER
	paddd   X0, XFER
	movdqa  XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa  2*16(TBL), XFER
	paddd   X0, XFER
	movdqa  XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa  3*16(TBL), XFER
	paddd   X0, XFER
	movdqa  XFER, _XFER(%rsp)
	add     $4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub     $1, SRND
	jne     .Lloop1

	## last 16 rounds: the schedule is complete, only add K and run
	mov     $2, SRND
.Lloop2:
	paddd   (TBL), X0
	movdqa  X0, _XFER(%rsp)
	DO_ROUND        0
	DO_ROUND        1
	DO_ROUND        2
	DO_ROUND        3
	paddd   1*16(TBL), X1
	movdqa  X1, _XFER(%rsp)
	add     $2*16, TBL
	DO_ROUND        0
	DO_ROUND        1
	DO_ROUND        2
	DO_ROUND        3

	movdqa  X2, X0			 # second pass consumes X2 and X3
	movdqa  X3, X1

	sub     $1, SRND
	jne     .Lloop2

	## add the working variables into the state and keep the sums in
	## registers as the starting values for the next block
	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	mov     _INP(%rsp), INP
	add     $64, INP
	cmp     _INP_END(%rsp), INP
	jne     .Lloop0

.Ldone_hash:

	mov	%rbp, %rsp
	popq	%rbp
	popq    %r15
	popq    %r14
	popq    %r13
	popq    %r12
	popq    %rbx

	RET
SYM_FUNC_END(sha256_transform_ssse3)

.section	.rodata.cst256.K256, "aM", @progbits, 256
.align 64
# K256: the 64 SHA-256 round constants, four per line, in round order.
# The transform reads them sixteen bytes at a time with aligned loads.
K256:
        .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
# pshufb control that reverses the byte order inside each dword
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203

.section	.rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
# (source dwords 0 and 2 move to dwords 0 and 1; the upper half,
# selected by 0xFF control bytes, becomes zero)
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

.section	.rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
# (source dwords 0 and 2 move to dwords 2 and 3; the lower half,
# selected by 0xFF control bytes, becomes zero)
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF