########################################################################
# Implement fast SHA-256 with AVX1 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 1 block at a time, with 4 lanes per block
########################################################################
#
# Syntax: GNU as, AT&T operand order (source first, destination last),
# run through the C preprocessor before assembly.
#

#include <linux/linkage.h>

## assume buffers not aligned
#define VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
# On exit both reg and mem hold (old mem + old reg).
.macro addm p1 p2
	add     \p1, \p2
	mov     \p2, \p1
.endm


# MY_ROR n, reg
# Rotate the 32-bit register reg right by n bits.  A double shift of a
# register with itself by (32-n) is a rotate left by (32-n), which is
# the same as a rotate right by n.  The round comments below write this
# rotation as ">>".
.macro MY_ROR p1 p2
	shld    $(32-(\p1)), \p2, \p2
.endm

################################

# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword
.macro COPY_XMM_AND_BSWAP p1 p2 p3
	VMOVDQ  \p2, \p1
	vpshufb \p3, \p1, \p1
.endm

################################

## Message schedule: X0..X3 hold the 16 most recent schedule dwords.
X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

## Vector scratch registers.
XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER = %xmm9
XTMP5 = %xmm11

SHUF_00BA = %xmm10              # shuffle xBxA -> 00BA
SHUF_DC00 = %xmm12              # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm13

NUM_BLKS = %rdx                 # 3rd arg
INP = %rsi                      # 2nd arg
CTX = %rdi                      # 1st arg

## SRND shares %rsi with INP, so INP is parked in the _INP stack slot
## while the round loops run.  Likewise e shares %rdx with NUM_BLKS,
## which is stored to the _INP_END stack slot before e is first loaded.
SRND = %rsi                     # clobbers INP
c = %ecx
d = %r8d
e = %edx
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

## Scalar scratch registers for the round function.
y0 = %r13d
y1 = %r14d
y2 = %r15d


## Stack frame layout, as offsets from the 16-byte aligned %rsp.
_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0
_INP = _INP_END + _INP_END_SIZE
_XFER = _INP + _INP_SIZE
_XMM_SAVE = _XFER + _XFER_SIZE
STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE

# rotate_Xs
# Rotate values of symbols X0...X3
# This is an assemble-time renaming only; no instruction is emitted.
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
# This is an assemble-time renaming only; no instruction is emitted.
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm

# FOUR_ROUNDS_AND_SCHED
# Run four SHA-256 rounds using the four k+w dwords stored at
# _XFER(%rsp), interleaved with the vector computation of the next four
# message schedule dwords.  On exit the new dwords are in the register
# that was named X0 on entry, and the X symbols have been rotated once
# and the a...h symbols four times.
# Clobbers y0, y1, y2, XTMP0...XTMP5 and the flags.
.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time

	mov     e, y0                   # y0 = e
	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
	mov     a, y1                   # y1 = a
	vpalignr $4, X2, X3, XTMP0      # XTMP0 = W[-7]
	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	mov     f, y2                   # y2 = f
	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	xor     g, y2                   # y2 = f^g
	vpaddd  X0, XTMP0, XTMP0        # XTMP0 = W[-7] + W[-16]
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
	## compute s0
	vpalignr $4, X0, X1, XTMP1      # XTMP1 = W[-15]
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     y0, y2                  # y2 = S1 + CH
	add     _XFER(%rsp), y2         # y2 = k + w + S1 + CH
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	vpsrld  $7, XTMP1, XTMP2        # XTMP2 = W[-15] >> 7 (logical)
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	vpslld  $(32-7), XTMP1, XTMP3   # XTMP3 = W[-15] << (32-7)
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	vpor    XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] MY_ROR 7
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov     e, y0                   # y0 = e
	mov     a, y1                   # y1 = a
	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	mov     f, y2                   # y2 = f
	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
	vpsrld  $18, XTMP1, XTMP2       # XTMP2 = W[-15] >> 18 (logical)
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor     g, y2                   # y2 = f^g
	vpsrld  $3, XTMP1, XTMP4        # XTMP4 = W[-15] >> 3
	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpslld  $(32-18), XTMP1, XTMP1  # XTMP1 = W[-15] << (32-18)
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	vpxor   XTMP1, XTMP3, XTMP3     #
	add     y0, y2                  # y2 = S1 + CH
	add     (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	vpxor   XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	vpxor   XTMP4, XTMP3, XTMP1     # XTMP1 = s0
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	## compute low s1
	vpshufd $0b11111010, X3, XTMP2  # XTMP2 = W[-2] {BBAA}
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	vpaddd  XTMP1, XTMP0, XTMP0     # XTMP0 = W[-16] + W[-7] + s0
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov     e, y0                   # y0 = e
	mov     a, y1                   # y1 = a
	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
	mov     f, y2                   # y2 = f
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld  $10, XTMP2, XTMP4       # XTMP4 = W[-2] >> 10 {BBAA}
	xor     g, y2                   # y2 = f^g
	vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] MY_ROR 19 {xBxA}
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] MY_ROR 17 {xBxA}
	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpxor   XTMP3, XTMP2, XTMP2     #
	add     y0, y2                  # y2 = S1 + CH
	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	vpxor   XTMP2, XTMP4, XTMP4     # XTMP4 = s1 {xBxA}
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	vpaddd  XTMP4, XTMP0, XTMP0     # XTMP0 = {..., ..., W[1], W[0]}
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	## compute high s1
	vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov     e, y0                   # y0 = e
	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
	mov     a, y1                   # y1 = a
	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	mov     f, y2                   # y2 = f
	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld  $10, XTMP2, XTMP5       # XTMP5 = W[-2] >> 10 {DDCC}
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	xor     g, y2                   # y2 = f^g
	vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] MY_ROR 19 {xDxC}
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
	vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] MY_ROR 17 {xDxC}
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	vpxor   XTMP3, XTMP2, XTMP2
	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     y0, y2                  # y2 = S1 + CH
	add     (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	vpxor   XTMP2, XTMP5, XTMP5     # XTMP5 = s1 {xDxC}
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	vpaddd  XTMP0, XTMP5, X0        # X0 = {W[3], W[2], W[1], W[0]}
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	rotate_Xs
.endm

# DO_ROUND round
# Run one SHA-256 round without any message scheduling, then rotate the
# a...h symbols once.
## input is [rsp + _XFER + round * 4]
# Clobbers y0, y1, y2 and the flags.
.macro DO_ROUND round
	mov     e, y0                   # y0 = e
	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
	mov     a, y1                   # y1 = a
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
	mov     f, y2                   # y2 = f
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor     g, y2                   # y2 = f^g
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
	and     e, y2                   # y2 = (f^g)&e
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	add     y0, y2                  # y2 = S1 + CH
	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	offset = \round * 4 + _XFER     #
	add     offset(%rsp), y2        # y2 = k + w + S1 + CH
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm

########################################################################
## void sha256_transform_avx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
##
## In:    %rdi = state; the eight 32-bit digest words are read from and
##               written back to offsets 0..28
##        %rsi = data; 64 bytes are consumed per block, loaded with
##               unaligned moves
##        %rdx = number of 64-byte blocks; zero returns without
##               touching the state
## Saved: %rbx, %rbp, %r12...%r15 are pushed on entry and popped on exit
## Clobb: %rax, %rcx, %rdx, %rsi, %r8...%r11, %xmm0...%xmm13, flags
## Stack: STACK_SIZE bytes of locals below the saved registers, with
##        %rsp rounded down to a 16-byte boundary so that the aligned
##        stores to _XFER(%rsp) are legal
########################################################################
.text
SYM_FUNC_START(sha256_transform_avx)
## NOTE(review): this .align comes after the entry label, so any padding
## it emits lands between the label and the first push - confirm that
## this is intended.
.align 32
	pushq   %rbx
	pushq   %r12
	pushq   %r13
	pushq   %r14
	pushq   %r15
	pushq   %rbp
	movq    %rsp, %rbp              # %rbp = %rsp before alignment, restored on exit

	subq    $STACK_SIZE, %rsp       # allocate stack space
	and     $~15, %rsp              # align stack pointer

	## NOTE(review): the prototype above declares blocks as int, yet
	## the full 64-bit register is shifted here - confirm that callers
	## always pass a zero-extended value.
	shl     $6, NUM_BLKS            # convert to bytes
	jz      .Ldone_hash
	add     INP, NUM_BLKS           # pointer to end of data
	mov     NUM_BLKS, _INP_END(%rsp) # saved because e reuses %rdx below

	## load initial digest
	mov     4*0(CTX), a
	mov     4*1(CTX), b
	mov     4*2(CTX), c
	mov     4*3(CTX), d
	mov     4*4(CTX), e
	mov     4*5(CTX), f
	mov     4*6(CTX), g
	mov     4*7(CTX), h

	vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa _SHUF_DC00(%rip), SHUF_DC00
	## Outer loop: one pass per 64-byte input block.
.Lloop0:
	lea     K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK

	mov     INP, _INP(%rsp)         # saved because SRND reuses %rsi below

	## schedule 48 input dwords, by doing 3 rounds of 16 each
	## Each pass expands FOUR_ROUNDS_AND_SCHED four times, which
	## rotates the X symbols four times and the a...h symbols sixteen
	## times, so both sets of names map to the same registers again
	## when the branch back to the top of the loop is taken.
	mov     $3, SRND
.align 16
.Lloop1:
	vpaddd  (TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd  1*16(TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd  2*16(TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd  3*16(TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	add     $4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub     $1, SRND
	jne     .Lloop1

	## Last 16 rounds: the schedule is complete, so only the round
	## function runs.  Each pass consumes X0 and X1 (eight rounds),
	## then X2 and X3 are copied down for the second pass.
	mov     $2, SRND
.Lloop2:
	vpaddd  (TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	DO_ROUND 0
	DO_ROUND 1
	DO_ROUND 2
	DO_ROUND 3

	vpaddd  1*16(TBL), X1, XFER
	vmovdqa XFER, _XFER(%rsp)
	add     $2*16, TBL
	DO_ROUND 0
	DO_ROUND 1
	DO_ROUND 2
	DO_ROUND 3

	vmovdqa X2, X0
	vmovdqa X3, X1

	sub     $1, SRND
	jne     .Lloop2

	## Add the working variables into the digest; addm leaves the
	## sum in both the state word and the register, so a...h already
	## hold the starting values for the next block.
	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	mov     _INP(%rsp), INP
	add     $64, INP
	cmp     _INP_END(%rsp), INP
	jne     .Lloop0

.Ldone_hash:

	mov     %rbp, %rsp
	popq    %rbp
	popq    %r15
	popq    %r14
	popq    %r13
	popq    %r12
	popq    %rbx
	RET
SYM_FUNC_END(sha256_transform_avx)

## SHA-256 round constants, 64 dwords, consumed 16 bytes at a time
## through TBL.
.section .rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

## vpshufb mask that reverses the byte order inside each dword.
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203

.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF