########################################################################
# Implement fast SHA-256 with AVX1 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 1 block at a time, with 4 lanes per block
########################################################################

#include <linux/linkage.h>
#include <linux/cfi_types.h>

## assume buffers not aligned
#define VMOVDQ vmovdqu

################################ Define Macros

# addm mem, reg
# Add the dword at mem into reg, then store reg back to mem.
# On exit both mem and reg hold (old mem + old reg); flags are clobbered.
.macro addm p1 p2
	add     \p1, \p2
	mov     \p2, \p1
.endm


# MY_ROR count, reg
# Rotate reg right by count bits.  SHLD with the same register as both
# source and destination is a rotate left by (32 - count), which for the
# 32-bit registers used in this file equals a rotate right by count.
.macro MY_ROR p1 p2
	shld    $(32-(\p1)), \p2, \p2
.endm

################################

# COPY_XMM_AND_BSWAP xmm, mem, byte_flip_mask
# Load xmm from mem (unaligned load) and permute its bytes with
# byte_flip_mask; with PSHUFFLE_BYTE_FLIP_MASK this byte swaps each dword.
.macro COPY_XMM_AND_BSWAP p1 p2 p3
	VMOVDQ  \p2, \p1
	vpshufb \p3, \p1, \p1
.endm

################################

## Message schedule window, four dwords per register.  These symbols are
## re-bound by rotate_Xs, so the register behind each name changes.
X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

## Vector scratch registers
XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER = %xmm9			# k + w for four rounds, spilled to _XFER(%rsp)
XTMP5 = %xmm11

## Constant masks, loaded once per call
SHUF_00BA = %xmm10		# shuffle xBxA -> 00BA
SHUF_DC00 = %xmm12		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm13

## Function arguments
NUM_BLKS = %rdx			# 3rd arg
INP = %rsi			# 2nd arg
CTX = %rdi			# 1st arg

## Scalar working state.  a..h are re-bound by ROTATE_ARGS.
SRND = %rsi			# clobbers INP
c = %ecx
d = %r8d
e = %edx			# low half of the register behind NUM_BLKS
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

## Scalar scratch registers
y0 = %r13d
y1 = %r14d
y2 = %r15d


## Stack frame layout, as byte offsets from the aligned %rsp
_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0					# offset 0
_INP = _INP_END + _INP_END_SIZE			# offset 8
_XFER = _INP + _INP_SIZE			# offset 16
_XMM_SAVE = _XFER + _XFER_SIZE			# offset 32, zero bytes long
STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE		# 32 bytes in total

# rotate_Xs
# Rotate values of symbols X0...X3: X0 takes the old X1, ..., X3 takes
# the old X0.  This only re-binds assembler symbols and emits no code.
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm
################################

# ROTATE_ARGS
# Rotate values of symbols a...h: a takes the old h, b the old a, ...,
# h the old g.  This only re-binds assembler symbols and emits no code.
# Eight invocations restore the original binding.
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm

# FOUR_ROUNDS_AND_SCHED
# Perform four SHA-256 rounds while computing the next four message
# schedule dwords; scalar round code and vector schedule code are
# interleaved instruction by instruction.
#
# In:   a..h         working variables
#       _XFER(%rsp)  four dwords of k + w, one per round
#       X0..X3       the previous 16 schedule dwords, X0 = W[-16..-13]
# Out:  a..h updated, their symbols rotated four times
#       the new dwords W[0..3] are written to X0, which rotate_Xs then
#       renames to X3
# Uses: y0, y1, y2, XTMP0..XTMP5, SHUF_00BA, SHUF_DC00, flags
#
# Comment notation: "ror" is a 32-bit rotate right, ">>" is a logical
# shift right, {DCBA} lists the four dword lanes from high to low.
.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time

	## ---- round 0, schedule: W[-7] + W[-16], start of s0 ----
	mov     e, y0                       # y0 = e
	MY_ROR  (25-11), y0                 # y0 = e ror (25-11)
	mov     a, y1                       # y1 = a
	vpalignr $4, X2, X3, XTMP0          # XTMP0 = W[-7]
	MY_ROR  (22-13), y1                 # y1 = a ror (22-13)
	xor     e, y0                       # y0 = e ^ (e ror (25-11))
	mov     f, y2                       # y2 = f
	MY_ROR  (11-6), y0                  # y0 = (e ror (11-6)) ^ (e ror (25-6))
	xor     a, y1                       # y1 = a ^ (a ror (22-13))
	xor     g, y2                       # y2 = f^g
	vpaddd  X0, XTMP0, XTMP0            # XTMP0 = W[-7] + W[-16]
	xor     e, y0                       # y0 = e ^ (e ror (11-6)) ^ (e ror (25-6))
	and     e, y2                       # y2 = (f^g)&e
	MY_ROR  (13-2), y1                  # y1 = (a ror (13-2)) ^ (a ror (22-2))
	## compute s0
	vpalignr $4, X0, X1, XTMP1          # XTMP1 = W[-15]
	xor     a, y1                       # y1 = a ^ (a ror (13-2)) ^ (a ror (22-2))
	MY_ROR  6, y0                       # y0 = S1 = (e ror 6) ^ (e ror 11) ^ (e ror 25)
	xor     g, y2                       # y2 = CH = ((f^g)&e)^g
	MY_ROR  2, y1                       # y1 = S0 = (a ror 2) ^ (a ror 13) ^ (a ror 22)
	add     y0, y2                      # y2 = S1 + CH
	add     _XFER(%rsp), y2             # y2 = k + w + S1 + CH
	mov     a, y0                       # y0 = a
	add     y2, h                       # h = h + S1 + CH + k + w
	mov     a, y2                       # y2 = a
	vpsrld  $7, XTMP1, XTMP2            # XTMP2 = W[-15] >> 7
	or      c, y0                       # y0 = a|c
	add     h, d                        # d = d + h + S1 + CH + k + w
	and     c, y2                       # y2 = a&c
	vpslld  $(32-7), XTMP1, XTMP3       # XTMP3 = W[-15] << (32-7)
	and     b, y0                       # y0 = (a|c)&b
	add     y1, h                       # h = h + S1 + CH + k + w + S0
	vpor    XTMP2, XTMP3, XTMP3         # XTMP3 = W[-15] ror 7
	or      y2, y0                      # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                       # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS

	## ---- round 1, schedule: finish s0, add it, start low s1 ----
	mov     e, y0                       # y0 = e
	mov     a, y1                       # y1 = a
	MY_ROR  (25-11), y0                 # y0 = e ror (25-11)
	xor     e, y0                       # y0 = e ^ (e ror (25-11))
	mov     f, y2                       # y2 = f
	MY_ROR  (22-13), y1                 # y1 = a ror (22-13)
	vpsrld  $18, XTMP1, XTMP2           # XTMP2 = W[-15] >> 18
	xor     a, y1                       # y1 = a ^ (a ror (22-13))
	MY_ROR  (11-6), y0                  # y0 = (e ror (11-6)) ^ (e ror (25-6))
	xor     g, y2                       # y2 = f^g
	vpsrld  $3, XTMP1, XTMP4            # XTMP4 = W[-15] >> 3
	MY_ROR  (13-2), y1                  # y1 = (a ror (13-2)) ^ (a ror (22-2))
	xor     e, y0                       # y0 = e ^ (e ror (11-6)) ^ (e ror (25-6))
	and     e, y2                       # y2 = (f^g)&e
	MY_ROR  6, y0                       # y0 = S1 = (e ror 6) ^ (e ror 11) ^ (e ror 25)
	vpslld  $(32-18), XTMP1, XTMP1      # XTMP1 = W[-15] << (32-18)
	xor     a, y1                       # y1 = a ^ (a ror (13-2)) ^ (a ror (22-2))
	xor     g, y2                       # y2 = CH = ((f^g)&e)^g
	vpxor   XTMP1, XTMP3, XTMP3         # XTMP3 ^= W[-15] << (32-18)
	add     y0, y2                      # y2 = S1 + CH
	add     (1*4 + _XFER)(%rsp), y2     # y2 = k + w + S1 + CH
	MY_ROR  2, y1                       # y1 = S0 = (a ror 2) ^ (a ror 13) ^ (a ror 22)
	vpxor   XTMP2, XTMP3, XTMP3         # XTMP3 = (W[-15] ror 7) ^ (W[-15] ror 18)
	mov     a, y0                       # y0 = a
	add     y2, h                       # h = h + S1 + CH + k + w
	mov     a, y2                       # y2 = a
	vpxor   XTMP4, XTMP3, XTMP1         # XTMP1 = s0
	or      c, y0                       # y0 = a|c
	add     h, d                        # d = d + h + S1 + CH + k + w
	and     c, y2                       # y2 = a&c
	## compute low s1
	vpshufd $0b11111010, X3, XTMP2      # XTMP2 = W[-2] {BBAA}
	and     b, y0                       # y0 = (a|c)&b
	add     y1, h                       # h = h + S1 + CH + k + w + S0
	vpaddd  XTMP1, XTMP0, XTMP0         # XTMP0 = W[-16] + W[-7] + s0
	or      y2, y0                      # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                       # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS

	## ---- round 2, schedule: low s1 gives W[0], W[1]; start high s1 ----
	mov     e, y0                       # y0 = e
	mov     a, y1                       # y1 = a
	MY_ROR  (25-11), y0                 # y0 = e ror (25-11)
	xor     e, y0                       # y0 = e ^ (e ror (25-11))
	MY_ROR  (22-13), y1                 # y1 = a ror (22-13)
	mov     f, y2                       # y2 = f
	xor     a, y1                       # y1 = a ^ (a ror (22-13))
	MY_ROR  (11-6), y0                  # y0 = (e ror (11-6)) ^ (e ror (25-6))
	vpsrld  $10, XTMP2, XTMP4           # XTMP4 = W[-2] >> 10 {BBAA}
	xor     g, y2                       # y2 = f^g
	vpsrlq  $19, XTMP2, XTMP3           # XTMP3 = W[-2] ror 19 {xBxA}
	xor     e, y0                       # y0 = e ^ (e ror (11-6)) ^ (e ror (25-6))
	and     e, y2                       # y2 = (f^g)&e
	vpsrlq  $17, XTMP2, XTMP2           # XTMP2 = W[-2] ror 17 {xBxA}
	MY_ROR  (13-2), y1                  # y1 = (a ror (13-2)) ^ (a ror (22-2))
	xor     a, y1                       # y1 = a ^ (a ror (13-2)) ^ (a ror (22-2))
	xor     g, y2                       # y2 = CH = ((f^g)&e)^g
	MY_ROR  6, y0                       # y0 = S1 = (e ror 6) ^ (e ror 11) ^ (e ror 25)
	vpxor   XTMP3, XTMP2, XTMP2         # XTMP2 = (W[-2] ror 17) ^ (W[-2] ror 19) {xBxA}
	add     y0, y2                      # y2 = S1 + CH
	MY_ROR  2, y1                       # y1 = S0 = (a ror 2) ^ (a ror 13) ^ (a ror 22)
	add     (2*4 + _XFER)(%rsp), y2     # y2 = k + w + S1 + CH
	vpxor   XTMP2, XTMP4, XTMP4         # XTMP4 = s1 {xBxA}
	mov     a, y0                       # y0 = a
	add     y2, h                       # h = h + S1 + CH + k + w
	mov     a, y2                       # y2 = a
	vpshufb SHUF_00BA, XTMP4, XTMP4     # XTMP4 = s1 {00BA}
	or      c, y0                       # y0 = a|c
	add     h, d                        # d = d + h + S1 + CH + k + w
	and     c, y2                       # y2 = a&c
	vpaddd  XTMP4, XTMP0, XTMP0         # XTMP0 = {..., ..., W[1], W[0]}
	and     b, y0                       # y0 = (a|c)&b
	add     y1, h                       # h = h + S1 + CH + k + w + S0
	## compute high s1
	vpshufd $0b01010000, XTMP0, XTMP2   # XTMP2 = W[-2] {DDCC}
	or      y2, y0                      # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                       # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS

	## ---- round 3, schedule: high s1 gives W[2], W[3]; write X0 ----
	mov     e, y0                       # y0 = e
	MY_ROR  (25-11), y0                 # y0 = e ror (25-11)
	mov     a, y1                       # y1 = a
	MY_ROR  (22-13), y1                 # y1 = a ror (22-13)
	xor     e, y0                       # y0 = e ^ (e ror (25-11))
	mov     f, y2                       # y2 = f
	MY_ROR  (11-6), y0                  # y0 = (e ror (11-6)) ^ (e ror (25-6))
	vpsrld  $10, XTMP2, XTMP5           # XTMP5 = W[-2] >> 10 {DDCC}
	xor     a, y1                       # y1 = a ^ (a ror (22-13))
	xor     g, y2                       # y2 = f^g
	vpsrlq  $19, XTMP2, XTMP3           # XTMP3 = W[-2] ror 19 {xDxC}
	xor     e, y0                       # y0 = e ^ (e ror (11-6)) ^ (e ror (25-6))
	and     e, y2                       # y2 = (f^g)&e
	MY_ROR  (13-2), y1                  # y1 = (a ror (13-2)) ^ (a ror (22-2))
	vpsrlq  $17, XTMP2, XTMP2           # XTMP2 = W[-2] ror 17 {xDxC}
	xor     a, y1                       # y1 = a ^ (a ror (13-2)) ^ (a ror (22-2))
	MY_ROR  6, y0                       # y0 = S1 = (e ror 6) ^ (e ror 11) ^ (e ror 25)
	xor     g, y2                       # y2 = CH = ((f^g)&e)^g
	vpxor   XTMP3, XTMP2, XTMP2         # XTMP2 = (W[-2] ror 17) ^ (W[-2] ror 19) {xDxC}
	MY_ROR  2, y1                       # y1 = S0 = (a ror 2) ^ (a ror 13) ^ (a ror 22)
	add     y0, y2                      # y2 = S1 + CH
	add     (3*4 + _XFER)(%rsp), y2     # y2 = k + w + S1 + CH
	vpxor   XTMP2, XTMP5, XTMP5         # XTMP5 = s1 {xDxC}
	mov     a, y0                       # y0 = a
	add     y2, h                       # h = h + S1 + CH + k + w
	mov     a, y2                       # y2 = a
	vpshufb SHUF_DC00, XTMP5, XTMP5     # XTMP5 = s1 {DC00}
	or      c, y0                       # y0 = a|c
	add     h, d                        # d = d + h + S1 + CH + k + w
	and     c, y2                       # y2 = a&c
	vpaddd  XTMP0, XTMP5, X0            # X0 = {W[3], W[2], W[1], W[0]}
	and     b, y0                       # y0 = (a|c)&b
	add     y1, h                       # h = h + S1 + CH + k + w + S0
	or      y2, y0                      # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                       # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	rotate_Xs
.endm

# DO_ROUND round
# Perform one SHA-256 round without any message scheduling.
#
# In:   a..h   working variables
#       round  index 0..3 of the k + w dword to use; the input is the
#              dword at (round*4 + _XFER)(%rsp)
# Out:  a..h updated, their symbols rotated once
# Uses: y0, y1, y2, flags; re-assigns the assembler symbol "offset"
.macro DO_ROUND round
	mov     e, y0                       # y0 = e
	MY_ROR  (25-11), y0                 # y0 = e ror (25-11)
	mov     a, y1                       # y1 = a
	xor     e, y0                       # y0 = e ^ (e ror (25-11))
	MY_ROR  (22-13), y1                 # y1 = a ror (22-13)
	mov     f, y2                       # y2 = f
	xor     a, y1                       # y1 = a ^ (a ror (22-13))
	MY_ROR  (11-6), y0                  # y0 = (e ror (11-6)) ^ (e ror (25-6))
	xor     g, y2                       # y2 = f^g
	xor     e, y0                       # y0 = e ^ (e ror (11-6)) ^ (e ror (25-6))
	MY_ROR  (13-2), y1                  # y1 = (a ror (13-2)) ^ (a ror (22-2))
	and     e, y2                       # y2 = (f^g)&e
	xor     a, y1                       # y1 = a ^ (a ror (13-2)) ^ (a ror (22-2))
	MY_ROR  6, y0                       # y0 = S1 = (e ror 6) ^ (e ror 11) ^ (e ror 25)
	xor     g, y2                       # y2 = CH = ((f^g)&e)^g
	add     y0, y2                      # y2 = S1 + CH
	MY_ROR  2, y1                       # y1 = S0 = (a ror 2) ^ (a ror 13) ^ (a ror 22)
	offset = \round * 4 + _XFER         #
	add     offset(%rsp), y2            # y2 = k + w + S1 + CH
	mov     a, y0                       # y0 = a
	add     y2, h                       # h = h + S1 + CH + k + w
	mov     a, y2                       # y2 = a
	or      c, y0                       # y0 = a|c
	add     h, d                        # d = d + h + S1 + CH + k + w
	and     c, y2                       # y2 = a&c
	and     b, y0                       # y0 = (a|c)&b
	add     y1, h                       # h = h + S1 + CH + k + w + S0
	or      y2, y0                      # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                       # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm

########################################################################
## void sha256_transform_avx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
##
## In:    CTX (%rdi) = state; eight consecutive dwords at offsets 0..28
##                     are read as a..h and updated once per block
##        INP (%rsi) = input data, read with unaligned loads
##        NUM_BLKS (%rdx) = number of 64-byte blocks; 0 returns at once
## Saved: %rbx, %rbp, %r12-%r15 are pushed on entry and popped on exit
## Uses:  %rax, %rcx, %rdx, %rsi, %r8-%r11, %xmm0-%xmm13, flags
## Stack: STACK_SIZE bytes below a 16-byte aligned %rsp holding the end
##        pointer, the saved INP and the 16-byte k + w transfer slot
########################################################################
.text
SYM_TYPED_FUNC_START(sha256_transform_avx)
	pushq   %rbx
	pushq   %r12
	pushq   %r13
	pushq   %r14
	pushq   %r15
	pushq   %rbp
	movq    %rsp, %rbp                  # %rbp keeps the unaligned %rsp for the exit path

	subq    $STACK_SIZE, %rsp           # allocate stack space
	and     $~15, %rsp                  # align stack pointer; _XFER(%rsp) is written with vmovdqa

	## NOTE(review): all 64 bits of NUM_BLKS are used below although the
	## prototype comment declares a 32-bit int; this relies on the caller
	## passing a clean upper half - confirm against the C glue code.
	shl     $6, NUM_BLKS                # convert to bytes
	jz      .Ldone_hash                 # nothing to hash
	add     INP, NUM_BLKS               # pointer to end of data
	mov     NUM_BLKS, _INP_END(%rsp)    # saved because e shares this register

	## load initial digest
	mov     4*0(CTX), a
	mov     4*1(CTX), b
	mov     4*2(CTX), c
	mov     4*3(CTX), d
	mov     4*4(CTX), e
	mov     4*5(CTX), f
	mov     4*6(CTX), g
	mov     4*7(CTX), h

	vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa _SHUF_DC00(%rip), SHUF_DC00
	## one iteration of .Lloop0 per 64-byte block
.Lloop0:
	lea     K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK

	mov     INP, _INP(%rsp)             # saved because SRND shares this register

	## schedule 48 input dwords, by doing 3 rounds of 16 each
	mov     $3, SRND
.align 16
.Lloop1:
	## Four macro invocations rotate X0..X3 four times and a..h sixteen
	## times, so every symbol is back on its starting register at the
	## bottom of the loop body.
	vpaddd  (TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd  1*16(TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd  2*16(TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd  3*16(TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	add     $4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub     $1, SRND
	jne     .Lloop1

	## last 16 rounds: 2 iterations of 8 rounds, no further scheduling
	mov     $2, SRND
.Lloop2:
	vpaddd  (TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	DO_ROUND 0
	DO_ROUND 1
	DO_ROUND 2
	DO_ROUND 3

	vpaddd  1*16(TBL), X1, XFER
	vmovdqa XFER, _XFER(%rsp)
	add     $2*16, TBL
	DO_ROUND 0
	DO_ROUND 1
	DO_ROUND 2
	DO_ROUND 3

	vmovdqa X2, X0                      # second iteration consumes the old X2 and X3
	vmovdqa X3, X1

	sub     $1, SRND
	jne     .Lloop2

	## add the working variables into the state; a..h keep the new
	## sums and so carry over into the next block
	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	mov     _INP(%rsp), INP
	add     $64, INP                    # advance to the next block
	cmp     _INP_END(%rsp), INP
	jne     .Lloop0

.Ldone_hash:

	mov     %rbp, %rsp                  # drop the aligned frame
	popq    %rbp
	popq    %r15
	popq    %r14
	popq    %r13
	popq    %r12
	popq    %rbx
	RET
SYM_FUNC_END(sha256_transform_avx)

## Table of 64 dword constants added to the schedule dwords, read 16
## bytes at a time through TBL.
.section	.rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

## vpshufb mask that reverses the byte order inside each dword
.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203

.section	.rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

.section	.rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF