########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################

#include <linux/linkage.h>
#include <linux/cfi_types.h>

## assume buffers not aligned
#define	VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
#   \p1 : memory operand
#   \p2 : register operand
# On exit both \p1 and \p2 hold the sum (\p1 + \p2).
# The add instruction modifies the flags.
.macro addm p1 p2
	add	\p1, \p2		# reg += mem
	mov	\p2, \p1		# mem  = reg
.endm

################################

# Message schedule registers. rotate_Xs renames these symbols after
# every four rounds.
X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

# Vector temporaries
XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER = %ymm9
XTMP5 = %ymm11

SHUF_00BA = %ymm10		# shuffle xBxA -> 00BA
SHUF_DC00 = %ymm12		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13	# XMM version of BYTE_FLIP_MASK

# Function arguments. Three of the argument registers are reused as
# working registers (see e, y3 and SRND below), so code that uses both
# roles has to keep the argument value somewhere else.
NUM_BLKS = %rdx			# 3rd arg
INP = %rsi			# 2nd arg
CTX = %rdi			# 1st arg
c = %ecx
d = %r8d
e = %edx			# clobbers NUM_BLKS
y3 = %esi			# clobbers INP

SRND = CTX			# SRND is same register as CTX

# Remaining working variables. ROTATE_ARGS renames a...h after every
# round; old_h names the register that held h in the previous round.
a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

# Scalar temporaries
T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


# Stack frame layout: sizes first, then offsets from the aligned %rsp.
_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER     + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END  + _INP_END_SIZE
_CTX		= _INP      + _INP_SIZE
STACK_SIZE	= _CTX      + _CTX_SIZE

# rotate_Xs
# Rotate values of symbols X0...X3
# Assembly-time renaming only; no instruction is emitted.
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
# Assembly-time renaming only; no instruction is emitted. old_h is set
# to the register that held h before the rotation, so the following
# round can still finish the previous round's h.
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm

# FOUR_ROUNDS_AND_SCHED disp
# Four SHA-256 rounds interleaved with the computation of the next four
# message schedule dwords per 128-bit lane.
#   \disp : displacement from (%rsp, SRND) of the four K+W dwords
#           consumed by these rounds
# The new schedule dwords are written to the register currently named
# X0, then rotate_Xs renames X0...X3. ROTATE_ARGS is applied after each
# round.
# Clobbers y0, y1, y2, y3, T1, XTMP0...XTMP5 and the flags.
# Comment notation: "ror" is a rotate right, ">>" a logical shift right.
# The trailing tag on a comment names the term being built
# (S0, S1, CH, MAJ).
.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov	a, y3			# y3 = a # MAJA
	rorx	$25, e, y0		# y0 = e ror 25 # S1A
	rorx	$11, e, y1		# y1 = e ror 11 # S1B

	addl	\disp(%rsp, SRND), h	# h = k + w + h # --
	or	c, y3			# y3 = a|c # MAJA
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	mov	f, y2			# y2 = f # CH
	rorx	$13, a, T1		# T1 = a ror 13 # S0B

	xor	y1, y0			# y0 = (e ror 25) ^ (e ror 11) # S1
	xor	g, y2			# y2 = f^g # CH
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1		# y1 = e ror 6 # S1

	and	e, y2			# y2 = (f^g)&e # CH
	xor	y1, y0			# y0 = (e ror 25) ^ (e ror 11) ^ (e ror 6) # S1
	rorx	$22, a, y1		# y1 = a ror 22 # S0A
	add	h, d			# d = k + w + h + d # --

	and	b, y3			# y3 = (a|c)&b # MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1			# y1 = (a ror 22) ^ (a ror 13) # S0
	rorx	$2, a, T1		# T1 = a ror 2 # S0

	xor	g, y2			# y2 = CH = ((f^g)&e)^g # CH
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1			# y1 = (a ror 22) ^ (a ror 13) ^ (a ror 2) # S0
	mov	a, T1			# T1 = a # MAJB
	and	c, T1			# T1 = a&c # MAJB

	add	y0, y2			# y2 = S1 + CH # --
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add	y1, h			# h = k + w + h + S0 # --

	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1 # --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2
	add	y2, h			# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	add	y3, h			# h = t1 + S0 + MAJ # --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov	a, y3			# y3 = a # MAJA
	rorx	$25, e, y0		# y0 = e ror 25 # S1A
	rorx	$11, e, y1		# y1 = e ror 11 # S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h # --
	or	c, y3			# y3 = a|c # MAJA


	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	mov	f, y2			# y2 = f # CH
	rorx	$13, a, T1		# T1 = a ror 13 # S0B
	xor	y1, y0			# y0 = (e ror 25) ^ (e ror 11) # S1
	xor	g, y2			# y2 = f^g # CH


	rorx	$6, e, y1		# y1 = e ror 6 # S1
	xor	y1, y0			# y0 = (e ror 25) ^ (e ror 11) ^ (e ror 6) # S1
	rorx	$22, a, y1		# y1 = a ror 22 # S0A
	and	e, y2			# y2 = (f^g)&e # CH
	add	h, d			# d = k + w + h + d # --

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3			# y3 = (a|c)&b # MAJA
	xor	T1, y1			# y1 = (a ror 22) ^ (a ror 13) # S0

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1		# T1 = a ror 2 # S0
	xor	g, y2			# y2 = CH = ((f^g)&e)^g # CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1			# y1 = (a ror 22) ^ (a ror 13) ^ (a ror 2) # S0
	mov	a, T1			# T1 = a # MAJB
	and	c, T1			# T1 = a&c # MAJB
	add	y0, y2			# y2 = S1 + CH # --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add	y1, h			# h = k + w + h + S0 # --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1 # --
	add	y2, h			# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	add	y3, h			# h = t1 + S0 + MAJ # --

	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3			# y3 = a # MAJA
	rorx	$25, e, y0		# y0 = e ror 25 # S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h # --

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1		# y1 = e ror 11 # S1B
	or	c, y3			# y3 = a|c # MAJA
	mov	f, y2			# y2 = f # CH
	xor	g, y2			# y2 = f^g # CH

	rorx	$13, a, T1		# T1 = a ror 13 # S0B
	xor	y1, y0			# y0 = (e ror 25) ^ (e ror 11) # S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2			# y2 = (f^g)&e # CH

	rorx	$6, e, y1		# y1 = e ror 6 # S1
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d			# d = k + w + h + d # --
	and	b, y3			# y3 = (a|c)&b # MAJA

	xor	y1, y0			# y0 = (e ror 25) ^ (e ror 11) ^ (e ror 6) # S1
	rorx	$22, a, y1		# y1 = a ror 22 # S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2			# y2 = CH = ((f^g)&e)^g # CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1			# y1 = (a ror 22) ^ (a ror 13) # S0
	rorx	$2, a, T1		# T1 = a ror 2 # S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1			# y1 = (a ror 22) ^ (a ror 13) ^ (a ror 2) # S0
	mov	a, T1			# T1 = a # MAJB
	and	c, T1			# T1 = a&c # MAJB
	add	y0, y2			# y2 = S1 + CH # --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add	y1, h			# h = k + w + h + S0 # --
	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1 # --
	add	y2, h			# h = k + w + h + S0 + S1 + CH = t1 + S0 # --

	add	y3, h			# h = t1 + S0 + MAJ # --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov	a, y3			# y3 = a # MAJA
	rorx	$25, e, y0		# y0 = e ror 25 # S1A
	rorx	$11, e, y1		# y1 = e ror 11 # S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h # --
	or	c, y3			# y3 = a|c # MAJA


	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2			# y2 = f # CH
	rorx	$13, a, T1		# T1 = a ror 13 # S0B
	xor	y1, y0			# y0 = (e ror 25) ^ (e ror 11) # S1
	xor	g, y2			# y2 = f^g # CH


	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1		# y1 = e ror 6 # S1
	and	e, y2			# y2 = (f^g)&e # CH
	add	h, d			# d = k + w + h + d # --
	and	b, y3			# y3 = (a|c)&b # MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0			# y0 = (e ror 25) ^ (e ror 11) ^ (e ror 6) # S1
	xor	g, y2			# y2 = CH = ((f^g)&e)^g # CH

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1		# y1 = a ror 22 # S0A
	add	y0, y2			# y2 = S1 + CH # --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1			# y1 = (a ror 22) ^ (a ror 13) # S0
	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1 # --

	rorx	$2, a, T1		# T1 = a ror 2 # S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1			# y1 = (a ror 22) ^ (a ror 13) ^ (a ror 2) # S0
	mov	a, T1			# T1 = a # MAJB
	and	c, T1			# T1 = a&c # MAJB
	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c) # MAJ

	add	y1, h			# h = k + w + h + S0 # --
	add	y2, h			# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	add	y3, h			# h = t1 + S0 + MAJ # --

	ROTATE_ARGS
	rotate_Xs
.endm

# DO_4ROUNDS disp
# Four SHA-256 rounds without message scheduling.
#   \disp : displacement from (%rsp, SRND) of the four K+W dwords
#           consumed by these rounds
# The last two additions into h of rounds N+0...N+2 are deferred: they
# are issued at the start of the following round through old_h, the
# name ROTATE_ARGS gives to the previous round's h register. Round N+3
# completes its own h before the final ROTATE_ARGS.
# Clobbers y0, y1, y2, y3, T1 and the flags.
# Comment notation: "ror" is a rotate right.
.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov	f, y2			# y2 = f # CH
	rorx	$25, e, y0		# y0 = e ror 25 # S1A
	rorx	$11, e, y1		# y1 = e ror 11 # S1B
	xor	g, y2			# y2 = f^g # CH

	xor	y1, y0			# y0 = (e ror 25) ^ (e ror 11) # S1
	rorx	$6, e, y1		# y1 = e ror 6 # S1
	and	e, y2			# y2 = (f^g)&e # CH

	xor	y1, y0			# y0 = (e ror 25) ^ (e ror 11) ^ (e ror 6) # S1
	rorx	$13, a, T1		# T1 = a ror 13 # S0B
	xor	g, y2			# y2 = CH = ((f^g)&e)^g # CH
	rorx	$22, a, y1		# y1 = a ror 22 # S0A
	mov	a, y3			# y3 = a # MAJA

	xor	T1, y1			# y1 = (a ror 22) ^ (a ror 13) # S0
	rorx	$2, a, T1		# T1 = a ror 2 # S0
	addl	\disp(%rsp, SRND), h	# h = k + w + h # --
	or	c, y3			# y3 = a|c # MAJA

	xor	T1, y1			# y1 = (a ror 22) ^ (a ror 13) ^ (a ror 2) # S0
	mov	a, T1			# T1 = a # MAJB
	and	b, y3			# y3 = (a|c)&b # MAJA
	and	c, T1			# T1 = a&c # MAJB
	add	y0, y2			# y2 = S1 + CH # --


	add	h, d			# d = k + w + h + d # --
	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add	y1, h			# h = k + w + h + S0 # --
	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1 # --

	ROTATE_ARGS

################################### RND N + 1 ###########################

	add	y2, old_h		# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	mov	f, y2			# y2 = f # CH
	rorx	$25, e, y0		# y0 = e ror 25 # S1A
	rorx	$11, e, y1		# y1 = e ror 11 # S1B
	xor	g, y2			# y2 = f^g # CH

	xor	y1, y0			# y0 = (e ror 25) ^ (e ror 11) # S1
	rorx	$6, e, y1		# y1 = e ror 6 # S1
	and	e, y2			# y2 = (f^g)&e # CH
	add	y3, old_h		# h = t1 + S0 + MAJ # --

	xor	y1, y0			# y0 = (e ror 25) ^ (e ror 11) ^ (e ror 6) # S1
	rorx	$13, a, T1		# T1 = a ror 13 # S0B
	xor	g, y2			# y2 = CH = ((f^g)&e)^g # CH
	rorx	$22, a, y1		# y1 = a ror 22 # S0A
	mov	a, y3			# y3 = a # MAJA

	xor	T1, y1			# y1 = (a ror 22) ^ (a ror 13) # S0
	rorx	$2, a, T1		# T1 = a ror 2 # S0
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h # --
	or	c, y3			# y3 = a|c # MAJA

	xor	T1, y1			# y1 = (a ror 22) ^ (a ror 13) ^ (a ror 2) # S0
	mov	a, T1			# T1 = a # MAJB
	and	b, y3			# y3 = (a|c)&b # MAJA
	and	c, T1			# T1 = a&c # MAJB
	add	y0, y2			# y2 = S1 + CH # --


	add	h, d			# d = k + w + h + d # --
	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add	y1, h			# h = k + w + h + S0 # --

	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1 # --


	ROTATE_ARGS

################################### RND N + 2 ##############################

	add	y2, old_h		# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	mov	f, y2			# y2 = f # CH
	rorx	$25, e, y0		# y0 = e ror 25 # S1A
	rorx	$11, e, y1		# y1 = e ror 11 # S1B
	xor	g, y2			# y2 = f^g # CH

	xor	y1, y0			# y0 = (e ror 25) ^ (e ror 11) # S1
	rorx	$6, e, y1		# y1 = e ror 6 # S1
	and	e, y2			# y2 = (f^g)&e # CH
	add	y3, old_h		# h = t1 + S0 + MAJ # --

	xor	y1, y0			# y0 = (e ror 25) ^ (e ror 11) ^ (e ror 6) # S1
	rorx	$13, a, T1		# T1 = a ror 13 # S0B
	xor	g, y2			# y2 = CH = ((f^g)&e)^g # CH
	rorx	$22, a, y1		# y1 = a ror 22 # S0A
	mov	a, y3			# y3 = a # MAJA

	xor	T1, y1			# y1 = (a ror 22) ^ (a ror 13) # S0
	rorx	$2, a, T1		# T1 = a ror 2 # S0
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h # --
	or	c, y3			# y3 = a|c # MAJA

	xor	T1, y1			# y1 = (a ror 22) ^ (a ror 13) ^ (a ror 2) # S0
	mov	a, T1			# T1 = a # MAJB
	and	b, y3			# y3 = (a|c)&b # MAJA
	and	c, T1			# T1 = a&c # MAJB
	add	y0, y2			# y2 = S1 + CH # --


	add	h, d			# d = k + w + h + d # --
	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add	y1, h			# h = k + w + h + S0 # --

	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1 # --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add	y2, old_h		# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	mov	f, y2			# y2 = f # CH
	rorx	$25, e, y0		# y0 = e ror 25 # S1A
	rorx	$11, e, y1		# y1 = e ror 11 # S1B
	xor	g, y2			# y2 = f^g # CH

	xor	y1, y0			# y0 = (e ror 25) ^ (e ror 11) # S1
	rorx	$6, e, y1		# y1 = e ror 6 # S1
	and	e, y2			# y2 = (f^g)&e # CH
	add	y3, old_h		# h = t1 + S0 + MAJ # --

	xor	y1, y0			# y0 = (e ror 25) ^ (e ror 11) ^ (e ror 6) # S1
	rorx	$13, a, T1		# T1 = a ror 13 # S0B
	xor	g, y2			# y2 = CH = ((f^g)&e)^g # CH
	rorx	$22, a, y1		# y1 = a ror 22 # S0A
	mov	a, y3			# y3 = a # MAJA

	xor	T1, y1			# y1 = (a ror 22) ^ (a ror 13) # S0
	rorx	$2, a, T1		# T1 = a ror 2 # S0
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h # --
	or	c, y3			# y3 = a|c # MAJA

	xor	T1, y1			# y1 = (a ror 22) ^ (a ror 13) ^ (a ror 2) # S0
	mov	a, T1			# T1 = a # MAJB
	and	b, y3			# y3 = (a|c)&b # MAJA
	and	c, T1			# T1 = a&c # MAJB
	add	y0, y2			# y2 = S1 + CH # --


	add	h, d			# d = k + w + h + d # --
	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add	y1, h			# h = k + w + h + S0 # --

	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1 # --


	add	y2, h			# h = k + w + h + S0 + S1 + CH = t1 + S0 # --

	add	y3, h			# h = t1 + S0 + MAJ # --

	ROTATE_ARGS

.endm

########################################################################
## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
##
## Register use: %rbx, %r12-%r15 and %rbp are pushed on entry. %rbp
## keeps the entry stack pointer while %rsp is lowered by STACK_SIZE
## and aligned to 32 bytes. The three argument registers double as
## working registers (e, y3, SRND), so the end pointer, the input
## pointer and the state pointer are kept in the _INP_END, _INP and
## _CTX stack slots.
##
## Flow: blocks are consumed two at a time at .Lloop0. The K+W values
## of both blocks are stored in the _XFER area while the first block is
## hashed, and .Lloop3 then hashes the second block from the stored
## values at +16 in each 32-byte row. A single remaining block is
## loaded at .Ldo_last_block and joins the common path at
## .Llast_block_enter. A block count of zero goes straight to
## .Ldone_hash.
########################################################################
.text
SYM_TYPED_FUNC_START(sha256_transform_rorx)
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	push	%rbp
	mov	%rsp, %rbp

	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp		# align rsp to 32 byte boundary

	shl	$6, NUM_BLKS		# convert to bytes
	jz	.Ldone_hash
	lea	-64(INP, NUM_BLKS), NUM_BLKS	# pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	.Lonly_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)		# CTX register is reused as SRND

.Lloop0:
	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3

.Llast_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)		# INP register is reused as scratch and y3

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	xor	SRND, SRND

.align 16
.Lloop1:
	leaq	K256+0*32(%rip), INP		## reuse INP as scratch reg
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	leaq	K256+1*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	leaq	K256+2*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa	XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	leaq	K256+3*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa	XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	.Lloop1

.Lloop2:
	## Do last 16 rounds with no scheduling
	leaq	K256+0*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32

	leaq	K256+1*32(%rip), INP
	vpaddd	(INP, SRND), X1, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	.Lloop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	addm	(4*0)(CTX),a
	addm	(4*1)(CTX),b
	addm	(4*2)(CTX),c
	addm	(4*3)(CTX),d
	addm	(4*4)(CTX),e
	addm	(4*5)(CTX),f
	addm	(4*6)(CTX),g
	addm	(4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	ja	.Ldone_hash		# the block just hashed was the last one

	#### Do second block using previously scheduled results
	xor	SRND, SRND
.align 16
.Lloop3:
	DO_4ROUNDS	_XFER + 0*32 + 16
	DO_4ROUNDS	_XFER + 1*32 + 16
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	.Lloop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm	(4*0)(CTX),a
	addm	(4*1)(CTX),b
	addm	(4*2)(CTX),c
	addm	(4*3)(CTX),d
	addm	(4*4)(CTX),e
	addm	(4*5)(CTX),f
	addm	(4*6)(CTX),g
	addm	(4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	jb	.Lloop0			# two or more blocks left
	ja	.Ldone_hash		# nothing left
					# equal: exactly one block left

.Ldo_last_block:
	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	.Llast_block_enter

.Lonly_one_block:

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	.Ldo_last_block

.Ldone_hash:

	mov	%rbp, %rsp
	pop	%rbp

	popq	%r15
	popq	%r14
	popq	%r13
# Epilogue, continued: restore the last two saved registers and return.
	popq	%r12
	popq	%rbx

	## The rounds above use 256-bit AVX2 operations on ymm registers
	## (for example vperm2i128).  Zero the upper ymm halves before
	## returning so that legacy SSE code executed afterwards does not
	## pay the AVX-to-SSE transition penalty.
	vzeroupper
	RET
SYM_FUNC_END(sha256_transform_rorx)

# SHA-256 round constants (FIPS 180-4), four per .long row, in round order.
# Every row is emitted twice, so each 32-byte entry carries the same four
# constants in both of its 128-bit halves.  The round loops add one entry
# per vpaddd to a register that holds message words of two blocks, one
# block per half.
# Table size is 16 entries of 32 bytes, which gives the 512 bytes declared
# as the entity size of this mergeable section.
.section	.rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

# Byte-reversal control for vpshufb: reverses the four bytes inside every
# dword (the byte swap applied to freshly loaded message data).  The same
# pattern is repeated for both 128-bit lanes.
.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
# Moves dwords 0 and 2 of each 128-bit lane into dwords 0 and 1; the 0xFF
# selector bytes make vpshufb write zeros to the upper two dwords.
.section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
# Moves dwords 0 and 2 of each 128-bit lane into dwords 2 and 3; the 0xFF
# selector bytes make vpshufb write zeros to the lower two dwords.
.section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF