########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################

#include <linux/linkage.h>

## assume buffers not aligned
#define	VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
# Result: reg = reg + [mem], then [mem] = reg (both end up holding the sum).
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm

################################

# Message-schedule vectors. The assignments are rotated at assembly time
# by rotate_Xs, so X0 always names the oldest four dwords per lane.
X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

# Vector scratch registers
XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA	= %ymm10	# shuffle xBxA -> 00BA
SHUF_DC00	= %ymm12	# shuffle xDxC -> DC00
BYTE_FLIP_MASK	= %ymm13

X_BYTE_FLIP_MASK = %xmm13	# XMM version of BYTE_FLIP_MASK

# Argument registers. Some of them double as working variables once the
# argument value has been consumed or spilled to the stack frame.
NUM_BLKS = %rdx	# 3rd arg
INP	= %rsi	# 2nd arg
CTX	= %rdi	# 1st arg
c	= %ecx
d	= %r8d
e	= %edx	# clobbers NUM_BLKS
y3	= %esi	# clobbers INP

SRND	= CTX	# SRND is same register as CTX

# Remaining SHA-256 working variables. These symbols are rotated at
# assembly time by ROTATE_ARGS.
a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

# Scalar scratch registers
T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


# Stack frame layout: sizes first, then offsets from the aligned %rsp.
_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8
_RSP_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER     + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END  + _INP_END_SIZE
_CTX		= _INP      + _INP_SIZE
_RSP		= _CTX      + _CTX_SIZE
STACK_SIZE	= _RSP      + _RSP_SIZE

# rotate_Xs
# Rotate values of symbols X0...X3
# Assembly-time renaming only; emits no instructions.
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
# Assembly-time renaming only; emits no instructions. old_h keeps naming
# the register that held h before the rotation, so a following round can
# finish the deferred additions into it.
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm

# FOUR_ROUNDS_AND_SCHED disp
# Run four SHA-256 rounds on the scalar state a..h, reading the four
# pre-added K+W dwords at disp, disp+4, disp+8 and disp+12 from
# (%rsp, SRND). Interleaved with the rounds, the vector unit computes the
# next four message-schedule dwords per 128-bit lane into X0, after which
# X0..X3 are rotated.
# Clobbers: y0, y1, y2, y3, T1, XTMP0-XTMP5, flags.
# The scalar/vector interleaving is deliberate; keep the instruction order.
.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25                          # S1A
	rorx	$11, e, y1	# y1 = e >> 11                          # S1B

	addl	\disp(%rsp, SRND), h	# h = k + w + h                 # --
	or	c, y3		# y3 = a|c                              # MAJA
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13                          # S0B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)                # S1
	xor	g, y2		# y2 = f^g                              # CH
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1	# y1 = (e >> 6)                         # S1

	and	e, y2		# y2 = (f^g)&e                          # CH
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
	rorx	$22, a, y1	# y1 = a >> 22                          # S0A
	add	h, d		# d = k + w + h + d                     # --

	and	b, y3		# y3 = (a|c)&b                          # MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)                # S0
	rorx	$2, a, T1	# T1 = (a >> 2)                         # S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB

	add	y0, y2		# y2 = S1 + CH                          # --
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25                          # S1A
	rorx	$11, e, y1	# y1 = e >> 11                          # S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h                 # --
	or	c, y3		# y3 = a|c                              # MAJA


	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13                          # S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)                # S1
	xor	g, y2		# y2 = f^g                              # CH


	rorx	$6, e, y1	# y1 = (e >> 6)                         # S1
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
	rorx	$22, a, y1	# y1 = a >> 22                          # S0A
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)                # S0

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1	# T1 = (a >> 2)                         # S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25                          # S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h                 # --

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e >> 11                          # S1B
	or	c, y3		# y3 = a|c                              # MAJA
	mov	f, y2		# y2 = f                                # CH
	xor	g, y2		# y2 = f^g                              # CH

	rorx	$13, a, T1	# T1 = a >> 13                          # S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)                # S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e                          # CH

	rorx	$6, e, y1	# y1 = (e >> 6)                         # S1
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
	rorx	$22, a, y1	# y1 = a >> 22                          # S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)                # S0
	rorx	$2, a, T1	# T1 = (a >> 2)                         # S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25                          # S1A
	rorx	$11, e, y1	# y1 = e >> 11                          # S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h                 # --
	or	c, y3		# y3 = a|c                              # MAJA


	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13                          # S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)                # S1
	xor	g, y2		# y2 = f^g                              # CH


	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = (e >> 6)                         # S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1	# y1 = a >> 22                          # S0A
	add	y0, y2		# y2 = S1 + CH                          # --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)                # S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	rorx	$2, a, T1	# T1 = (a >> 2)                         # S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ

	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	ROTATE_ARGS
	rotate_Xs
.endm

# DO_4ROUNDS disp
# Run four SHA-256 rounds with no message scheduling, reading the four
# pre-added K+W dwords at disp, disp+4, disp+8 and disp+12 from
# (%rsp, SRND).
# The last two additions into h of rounds N+0..N+2 are deferred: they are
# issued at the start of the following round through old_h, which names
# the register that was h before ROTATE_ARGS. Round N+3 completes its own
# additions, so the state is fully up to date when the macro ends.
# Clobbers: y0, y1, y2, y3, T1, flags.
.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25                          # S1A
	rorx	$11, e, y1	# y1 = e >> 11                          # S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)                # S1
	rorx	$6, e, y1	# y1 = (e >> 6)                         # S1
	and	e, y2		# y2 = (f^g)&e                          # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
	rorx	$13, a, T1	# T1 = a >> 13                          # S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22                          # S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)                # S0
	rorx	$2, a, T1	# T1 = (a >> 2)                         # S0
	addl	\disp(%rsp, SRND), h	# h = k + w + h                 # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 1 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25                          # S1A
	rorx	$11, e, y1	# y1 = e >> 11                          # S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)                # S1
	rorx	$6, e, y1	# y1 = (e >> 6)                         # S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
	rorx	$13, a, T1	# T1 = a >> 13                          # S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22                          # S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)                # S0
	rorx	$2, a, T1	# T1 = (a >> 2)                         # S0
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h                 # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 2 ##############################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25                          # S1A
	rorx	$11, e, y1	# y1 = e >> 11                          # S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)                # S1
	rorx	$6, e, y1	# y1 = (e >> 6)                         # S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
	rorx	$13, a, T1	# T1 = a >> 13                          # S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22                          # S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)                # S0
	rorx	$2, a, T1	# T1 = (a >> 2)                         # S0
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h                 # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25                          # S1A
	rorx	$11, e, y1	# y1 = e >> 11                          # S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)                # S1
	rorx	$6, e, y1	# y1 = (e >> 6)                         # S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
	rorx	$13, a, T1	# T1 = a >> 13                          # S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22                          # S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)                # S0
	rorx	$2, a, T1	# T1 = (a >> 2)                         # S0
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h                 # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --


	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --

	ROTATE_ARGS

.endm

########################################################################
## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
##
## Register use (see the symbol assignments at the top of the file):
##   In:       CTX = %rdi, INP = %rsi, NUM_BLKS = %rdx
##   Saved:    %rbx, %r12-%r15 are pushed on entry and popped on exit
##   Clobbers: %rax, %rcx, %rdx, %rsi, %rdi, %r8-%r11, %ymm0-%ymm13, flags
##   Stack:    STACK_SIZE bytes below a 32-byte aligned %rsp; the entry
##             %rsp is kept in the _RSP slot and restored at done_hash
##
## Blocks are consumed two at a time: the low 128-bit lane of X0..X3
## carries the first block and the high lane the second. K+W for both
## lanes is stored to the _XFER area while the first block is hashed, and
## the second block is then hashed from the stored high halves. A single
## trailing block is loaded into the XMM halves only. Nothing is hashed
## when the block count is zero.
########################################################################
.text
SYM_FUNC_START(sha256_transform_rorx)
.align 32
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	mov	%rsp, %rax
	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary
	mov	%rax, _RSP(%rsp)	# remember entry %rsp for done_hash


	shl	$6, NUM_BLKS	# convert to bytes
	jz	done_hash
	lea	-64(INP, NUM_BLKS), NUM_BLKS	# pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	only_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)	# CTX register is reused as SRND below

loop0:
	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP), XTMP0
	VMOVDQ	1*32(INP), XTMP1
	VMOVDQ	2*32(INP), XTMP2
	VMOVDQ	3*32(INP), XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3

last_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)	# INP register is reused as y3 in the rounds

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	xor	SRND, SRND

.align 16
loop1:
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X0, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	vpaddd	K256+2*32(SRND), X0, XFER
	vmovdqa	XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	vpaddd	K256+3*32(SRND), X0, XFER
	vmovdqa	XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	loop1

loop2:
	## Do last 16 rounds with no scheduling
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X1, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	loop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	## fold the working variables of the first block into the digest
	addm	(4*0)(CTX), a
	addm	(4*1)(CTX), b
	addm	(4*2)(CTX), c
	addm	(4*3)(CTX), d
	addm	(4*4)(CTX), e
	addm	(4*5)(CTX), f
	addm	(4*6)(CTX), g
	addm	(4*7)(CTX), h

	cmp	_INP_END(%rsp), INP
	ja	done_hash	# that was the final block; no second lane

	#### Do second block using previously scheduled results
	xor	SRND, SRND
.align 16
loop3:
	## +16 selects the high half of each stored K+W vector
	DO_4ROUNDS	_XFER + 0*32 + 16
	DO_4ROUNDS	_XFER + 1*32 + 16
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	loop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	## fold the working variables of the second block into the digest
	addm	(4*0)(CTX), a
	addm	(4*1)(CTX), b
	addm	(4*2)(CTX), c
	addm	(4*3)(CTX), d
	addm	(4*4)(CTX), e
	addm	(4*5)(CTX), f
	addm	(4*6)(CTX), g
	addm	(4*7)(CTX), h

	cmp	_INP_END(%rsp), INP
	jb	loop0		# two or more blocks remain
	ja	done_hash	# no blocks remain
				# equal: one block remains, fall through

do_last_block:
	VMOVDQ	0*16(INP), XWORD0
	VMOVDQ	1*16(INP), XWORD1
	VMOVDQ	2*16(INP), XWORD2
	VMOVDQ	3*16(INP), XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	last_block_enter

only_one_block:

	## load initial digest
	mov	(4*0)(CTX), a
	mov	(4*1)(CTX), b
	mov	(4*2)(CTX), c
	mov	(4*3)(CTX), d
	mov	(4*4)(CTX), e
	mov	(4*5)(CTX), f
	mov	(4*6)(CTX), g
	mov	(4*7)(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	do_last_block

done_hash:

	mov	_RSP(%rsp), %rsp	# back to the entry %rsp saved above

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	vzeroupper		# clear upper YMM halves so later SSE code
				# pays no AVX-to-SSE transition penalty
	RET
SYM_FUNC_END(sha256_transform_rorx)

.section	.rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
# Body of the K256 table: the SHA-256 round constants, four per row.
# Every row is written out twice so that one row fills 32 bytes, which is
# the stride of the K256+n*32(SRND) operands in the round loops; a 256-bit
# vpaddd therefore adds the same four constants to both 128-bit halves.
# Sixteen rows of 32 bytes give the 512 bytes promised by the section
# entry size.
.macro K256_ROW_X2 k0, k1, k2, k3
	.long	\k0, \k1, \k2, \k3
	.long	\k0, \k1, \k2, \k3
.endm

	K256_ROW_X2	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	K256_ROW_X2	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	K256_ROW_X2	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	K256_ROW_X2	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	K256_ROW_X2	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	K256_ROW_X2	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	K256_ROW_X2	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	K256_ROW_X2	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	K256_ROW_X2	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	K256_ROW_X2	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	K256_ROW_X2	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	K256_ROW_X2	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	K256_ROW_X2	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	K256_ROW_X2	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	K256_ROW_X2	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	K256_ROW_X2	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2

.purgem K256_ROW_X2

# Selector bytes that reverse the byte order inside every 32-bit word
# (03 02 01 00, 07 06 05 04, ...); the same pattern is stored for each
# 16-byte half.  Loaded into BYTE_FLIP_MASK for the "byte swap data" step.
.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa	0x0c0d0e0f08090a0b0405060700010203
	.octa	0x0c0d0e0f08090a0b0405060700010203

# Shuffle xBxA -> 00BA: dwords 0 and 2 are gathered into the low 8 bytes
# of each 16-byte half, and the high 8 selectors are 0xFF.
# NOTE(review): presumably consumed by vpshufb, where a selector with the
# top bit set yields a zero byte; the consuming macro is outside this view.
.section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa	0xFFFFFFFFFFFFFFFF0b0a090803020100
	.octa	0xFFFFFFFFFFFFFFFF0b0a090803020100

# Shuffle xDxC -> DC00: the mirror image of the mask above; dwords 0 and 2
# land in the high 8 bytes of each 16-byte half and the low 8 selectors
# are 0xFF.
.section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa	0x0b0a090803020100FFFFFFFFFFFFFFFF
	.octa	0x0b0a090803020100FFFFFFFFFFFFFFFF