########################################################################
# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################

#include <linux/linkage.h>
#include <linux/cfi_types.h>

## assume buffers not aligned
#define    MOVDQ movdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
# p1 is the memory operand and p2 the register.  The memory value is
# added into the register and the sum is written back, so on exit both
# p1 and p2 hold the sum.
.macro addm p1 p2
	add     \p1, \p2
	mov     \p2, \p1
.endm

################################

# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword
# p1 is the destination xmm register, p2 the 16-byte memory source
# (loaded with an unaligned move) and p3 the xmm register holding the
# pshufb control mask.
.macro COPY_XMM_AND_BSWAP p1 p2 p3
	MOVDQ \p2, \p1
	pshufb \p3, \p1
.endm

################################

# Message schedule registers.  Each holds four consecutive schedule
# dwords; the names are rebound by rotate_Xs as the schedule advances.
X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

# Vector temporaries used while computing new schedule dwords, plus the
# register used to build the K + W values that are spilled to the stack.
XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER = %xmm9

# pshufb control masks kept resident for the whole call
SHUF_00BA = %xmm10      # shuffle xBxA -> 00BA
SHUF_DC00 = %xmm11      # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm12

NUM_BLKS = %rdx   # 3rd arg
INP = %rsi	# 2nd arg
CTX = %rdi	# 1st arg

# Working variables a..h of the compression function, the round-constant
# table pointer, and three scalar temporaries.  SRND shares rsi with INP,
# so INP is spilled to the stack before SRND is first written.  e shares
# rdx with NUM_BLKS, which is no longer needed once the end pointer has
# been stored.
SRND = %rsi       # clobbers INP
c = %ecx
d = %r8d
e = %edx
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

y0 = %r13d
y1 = %r14d
y2 = %r15d



# Scratch frame layout, as offsets from the 16-byte aligned rsp:
#   _INP_END   end-of-input pointer (8 bytes)
#   _INP       saved input pointer (8 bytes)
#   _XFER      four K + W dwords for the scalar rounds (16 bytes)
#   _XMM_SAVE  zero-sized, nothing is stored there
_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0
_INP            = _INP_END  + _INP_END_SIZE
_XFER           = _INP      + _INP_SIZE
_XMM_SAVE       = _XFER     + _XFER_SIZE
STACK_SIZE      = _XMM_SAVE + _XMM_SAVE_SIZE

# rotate_Xs
# Rotate values of symbols X0...X3
# Only the assembler symbols are rebound; no instruction is emitted.
# After four invocations the original binding is restored.
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
# Only the assembler symbols are rebound; no instruction is emitted.
# After eight invocations the original binding is restored.
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm

# FOUR_ROUNDS_AND_SCHED
# Runs four scalar rounds using the four K + W dwords stored at _XFER on
# the stack and, interleaved with them, computes the next four schedule
# dwords from X0..X3.  The new dwords are left in the register bound to
# X0 on entry; the X names are then rotated so that register becomes X3.
# The working variable names are rotated after every round.
# In the scalar comments >> denotes a 32-bit rotate right (ror).
# Clobbers y0, y1, y2, XTMP0..XTMP4 and flags.
.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time
	movdqa  X3, XTMP0
	mov     e, y0			# y0 = e
	ror     $(25-11), y0		# y0 = e >> (25-11)
	mov     a, y1			# y1 = a
	palignr $4, X2, XTMP0		# XTMP0 = W[-7]
	ror     $(22-13), y1		# y1 = a >> (22-13)
	xor     e, y0			# y0 = e ^ (e >> (25-11))
	mov     f, y2			# y2 = f
	ror     $(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	movdqa  X1, XTMP1
	xor     a, y1			# y1 = a ^ (a >> (22-13))
	xor     g, y2			# y2 = f^g
	paddd   X0, XTMP0		# XTMP0 = W[-7] + W[-16]
	xor     e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2			# y2 = (f^g)&e
	ror     $(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	## compute s0
	palignr $4, X0, XTMP1		# XTMP1 = W[-15]
	xor     a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror     $6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor     g, y2			# y2 = CH = ((f^g)&e)^g
	movdqa  XTMP1, XTMP2		# XTMP2 = W[-15]
	ror     $2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     y0, y2			# y2 = S1 + CH
	add     _XFER(%rsp) , y2	# y2 = k + w + S1 + CH
	movdqa  XTMP1, XTMP3		# XTMP3 = W[-15]
	mov     a, y0			# y0 = a
	add     y2, h			# h = h + S1 + CH + k + w
	mov     a, y2			# y2 = a
	pslld   $(32-7), XTMP1		# XTMP1 = W[-15] << (32-7)
	or      c, y0			# y0 = a|c
	add     h, d			# d = d + h + S1 + CH + k + w
	and     c, y2			# y2 = a&c
	psrld   $7, XTMP2		# XTMP2 = W[-15] >> 7
	and     b, y0			# y0 = (a|c)&b
	add     y1, h			# h = h + S1 + CH + k + w + S0
	por     XTMP2, XTMP1		# XTMP1 = W[-15] ror 7
	or      y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
					#
	ROTATE_ARGS			#
	movdqa  XTMP3, XTMP2		# XTMP2 = W[-15]
	mov     e, y0			# y0 = e
	mov     a, y1			# y1 = a
	movdqa  XTMP3, XTMP4		# XTMP4 = W[-15]
	ror     $(25-11), y0		# y0 = e >> (25-11)
	xor     e, y0			# y0 = e ^ (e >> (25-11))
	mov     f, y2			# y2 = f
	ror     $(22-13), y1		# y1 = a >> (22-13)
	pslld   $(32-18), XTMP3		# XTMP3 = W[-15] << (32-18)
	xor     a, y1			# y1 = a ^ (a >> (22-13))
	ror     $(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor     g, y2			# y2 = f^g
	psrld   $18, XTMP2		# XTMP2 = W[-15] >> 18
	ror     $(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor     e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2			# y2 = (f^g)&e
	ror     $6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	pxor    XTMP3, XTMP1		# XTMP1 = W[-15] ror 7 ^ W[-15] << (32-18)
	xor     a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor     g, y2			# y2 = CH = ((f^g)&e)^g
	psrld   $3, XTMP4		# XTMP4 = W[-15] >> 3
	add     y0, y2			# y2 = S1 + CH
	add     (1*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	ror     $2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	pxor    XTMP2, XTMP1		# XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
	mov     a, y0			# y0 = a
	add     y2, h			# h = h + S1 + CH + k + w
	mov     a, y2			# y2 = a
	pxor    XTMP4, XTMP1		# XTMP1 = s0
	or      c, y0			# y0 = a|c
	add     h, d			# d = d + h + S1 + CH + k + w
	and     c, y2			# y2 = a&c
	## compute low s1
	pshufd  $0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	and     b, y0			# y0 = (a|c)&b
	add     y1, h			# h = h + S1 + CH + k + w + S0
	paddd   XTMP1, XTMP0		# XTMP0 = W[-16] + W[-7] + s0
	or      y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h			# h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa  XTMP2, XTMP3		# XTMP3 = W[-2] {BBAA}
	mov     e, y0			# y0 = e
	mov     a, y1			# y1 = a
	ror     $(25-11), y0		# y0 = e >> (25-11)
	movdqa  XTMP2, XTMP4		# XTMP4 = W[-2] {BBAA}
	xor     e, y0			# y0 = e ^ (e >> (25-11))
	ror     $(22-13), y1		# y1 = a >> (22-13)
	mov     f, y2			# y2 = f
	xor     a, y1			# y1 = a ^ (a >> (22-13))
	ror     $(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	psrlq   $17, XTMP2		# XTMP2 = W[-2] ror 17 {xBxA}
	xor     g, y2			# y2 = f^g
	psrlq   $19, XTMP3		# XTMP3 = W[-2] ror 19 {xBxA}
	xor     e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2			# y2 = (f^g)&e
	psrld   $10, XTMP4		# XTMP4 = W[-2] >> 10 {BBAA}
	ror     $(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor     a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor     g, y2			# y2 = CH = ((f^g)&e)^g
	ror     $6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	pxor    XTMP3, XTMP2		# XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xBxA}
	add     y0, y2			# y2 = S1 + CH
	ror     $2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     (2*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	pxor    XTMP2, XTMP4		# XTMP4 = s1 {xBxA}
	mov     a, y0			# y0 = a
	add     y2, h			# h = h + S1 + CH + k + w
	mov     a, y2			# y2 = a
	pshufb  SHUF_00BA, XTMP4	# XTMP4 = s1 {00BA}
	or      c, y0			# y0 = a|c
	add     h, d			# d = d + h + S1 + CH + k + w
	and     c, y2			# y2 = a&c
	paddd   XTMP4, XTMP0		# XTMP0 = {..., ..., W[1], W[0]}
	and     b, y0			# y0 = (a|c)&b
	add     y1, h			# h = h + S1 + CH + k + w + S0
	## compute high s1
	pshufd  $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or      y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
					#
	ROTATE_ARGS			#
	movdqa  XTMP2, XTMP3		# XTMP3 = W[-2] {DDCC}
	mov     e, y0			# y0 = e
	ror     $(25-11), y0		# y0 = e >> (25-11)
	mov     a, y1			# y1 = a
	movdqa  XTMP2, X0		# X0    = W[-2] {DDCC}
	ror     $(22-13), y1		# y1 = a >> (22-13)
	xor     e, y0			# y0 = e ^ (e >> (25-11))
	mov     f, y2			# y2 = f
	ror     $(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	psrlq   $17, XTMP2		# XTMP2 = W[-2] ror 17 {xDxC}
	xor     a, y1			# y1 = a ^ (a >> (22-13))
	xor     g, y2			# y2 = f^g
	psrlq   $19, XTMP3		# XTMP3 = W[-2] ror 19 {xDxC}
	xor     e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2			# y2 = (f^g)&e
	ror     $(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	psrld   $10, X0			# X0 = W[-2] >> 10 {DDCC}
	xor     a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror     $6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor     g, y2			# y2 = CH = ((f^g)&e)^g
	pxor    XTMP3, XTMP2		# XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xDxC}
	ror     $2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     y0, y2			# y2 = S1 + CH
	add     (3*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	pxor    XTMP2, X0		# X0 = s1 {xDxC}
	mov     a, y0			# y0 = a
	add     y2, h			# h = h + S1 + CH + k + w
	mov     a, y2			# y2 = a
	pshufb  SHUF_DC00, X0		# X0 = s1 {DC00}
	or      c, y0			# y0 = a|c
	add     h, d			# d = d + h + S1 + CH + k + w
	and     c, y2			# y2 = a&c
	paddd   XTMP0, X0		# X0 = {W[3], W[2], W[1], W[0]}
	and     b, y0			# y0 = (a|c)&b
	add     y1, h			# h = h + S1 + CH + k + w + S0
	or      y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h			# h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	rotate_Xs
.endm

## DO_ROUND round
## One scalar round with no message scheduling.
## input is [rsp + _XFER + round * 4]
## Clobbers y0, y1, y2 and flags, then rotates the names a...h.
.macro DO_ROUND round
	mov     e, y0			# y0 = e
	ror     $(25-11), y0		# y0 = e >> (25-11)
	mov     a, y1			# y1 = a
	xor     e, y0			# y0 = e ^ (e >> (25-11))
	ror     $(22-13), y1		# y1 = a >> (22-13)
	mov     f, y2			# y2 = f
	xor     a, y1			# y1 = a ^ (a >> (22-13))
	ror     $(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor     g, y2			# y2 = f^g
	xor     e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	ror     $(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	and     e, y2			# y2 = (f^g)&e
	xor     a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror     $6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor     g, y2			# y2 = CH = ((f^g)&e)^g
	add     y0, y2			# y2 = S1 + CH
	ror     $2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	offset = \round * 4 + _XFER
	add     offset(%rsp), y2	# y2 = k + w + S1 + CH
	mov     a, y0			# y0 = a
	add     y2, h			# h = h + S1 + CH + k + w
	mov     a, y2			# y2 = a
	or      c, y0			# y0 = a|c
	add     h, d			# d = d + h + S1 + CH + k + w
	and     c, y2			# y2 = a&c
	and     b, y0			# y0 = (a|c)&b
	add     y1, h			# h = h + S1 + CH + k + w + S0
	or      y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm

########################################################################
## void sha256_transform_ssse3(struct sha256_state *state, const u8 *data,
##			       int blocks);
## arg 1 : pointer to state
##	   (struct sha256_state is assumed to begin with u32 state[8])
## arg 2 : pointer to input data
## arg 3 : Num blocks
##
## Each block is 64 bytes: the block count is shifted left by 6 to get
## a byte length.  A zero count returns without touching the state.
## rbx, rbp and r12-r15 are saved on entry and restored before the
## return.  xmm0-xmm12 are used without being saved.
## A 16-byte aligned scratch frame below the saved registers holds the
## end-of-input pointer, the saved input pointer and one 16-byte slot
## of K + W values that the scalar rounds read back.
########################################################################
.text
SYM_TYPED_FUNC_START(sha256_transform_ssse3)
	pushq   %rbx
	pushq   %r12
	pushq   %r13
	pushq   %r14
	pushq   %r15
	pushq   %rbp
	mov	%rsp, %rbp		# rbp keeps the unaligned rsp for the exit path

	subq    $STACK_SIZE, %rsp
	and	$~15, %rsp		# the _XFER slot is accessed with movdqa

	shl     $6, NUM_BLKS		 # convert to bytes
	jz      .Ldone_hash
	add     INP, NUM_BLKS
	mov     NUM_BLKS, _INP_END(%rsp) # pointer to end of data

	## load initial digest
	mov     4*0(CTX), a
	mov     4*1(CTX), b
	mov     4*2(CTX), c
	mov     4*3(CTX), d
	mov     4*4(CTX), e
	mov     4*5(CTX), f
	mov     4*6(CTX), g
	mov     4*7(CTX), h

	movdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	movdqa  _SHUF_00BA(%rip), SHUF_00BA
	movdqa  _SHUF_DC00(%rip), SHUF_DC00

	## one iteration of .Lloop0 per 64-byte block
.Lloop0:
	lea     K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP      X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X3, 3*16(INP), BYTE_FLIP_MASK

	mov     INP, _INP(%rsp)		# rsi is reused as SRND from here on

	## schedule 48 input dwords, by doing 3 rounds of 16 each
	mov     $3, SRND
.align 16
.Lloop1:
	## Each group stores K + W for four rounds, then runs those rounds
	## while scheduling four new dwords.  Four groups per iteration
	## bring the X and a...h name rotations back to their start.
	movdqa  (TBL), XFER
	paddd   X0, XFER
	movdqa  XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa  1*16(TBL), XFER
	paddd   X0, XFER
	movdqa  XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa  2*16(TBL), XFER
	paddd   X0, XFER
	movdqa  XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa  3*16(TBL), XFER
	paddd   X0, XFER
	movdqa  XFER, _XFER(%rsp)
	add     $4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub     $1, SRND
	jne     .Lloop1

	## last 16 rounds: the schedule is complete, so only scalar rounds
	## remain, eight per iteration
	mov     $2, SRND
.Lloop2:
	paddd   (TBL), X0
	movdqa  X0, _XFER(%rsp)
	DO_ROUND        0
	DO_ROUND        1
	DO_ROUND        2
	DO_ROUND        3
	paddd   1*16(TBL), X1
	movdqa  X1, _XFER(%rsp)
	add     $2*16, TBL
	DO_ROUND        0
	DO_ROUND        1
	DO_ROUND        2
	DO_ROUND        3

	movdqa  X2, X0			# second pass uses the last two groups
	movdqa  X3, X1

	sub     $1, SRND
	jne     .Lloop2

	## add the working variables back into the digest in memory
	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	mov     _INP(%rsp), INP
	add     $64, INP
	cmp     _INP_END(%rsp), INP
	jne     .Lloop0

.Ldone_hash:

	mov	%rbp, %rsp
	popq	%rbp
	popq    %r15
	popq    %r14
	popq    %r13
	popq	%r12
	popq    %rbx

	RET
SYM_FUNC_END(sha256_transform_ssse3)

# SHA-256 round constants, 64 dwords laid out four per row so that each
# row can be fetched as one aligned 16-byte vector.
.section	.rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

# pshufb control that reverses the byte order inside each dword
.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203

.section	.rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
# Source dwords 0 and 2 are packed into dwords 0 and 1; the 0xFF
# selector bytes zero the upper two dwords.
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

.section	.rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
# Source dwords 0 and 2 are packed into dwords 2 and 3; the 0xFF
# selector bytes zero the lower two dwords.
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF