########################################################################
# Implement fast SHA-512 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     David Cote <david.m.cote@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-512 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 1 block at a time, with 4 lanes per block
########################################################################

#include <linux/linkage.h>

.text

# Virtual Registers
Y_0 = %ymm4
Y_1 = %ymm5
Y_2 = %ymm6
Y_3 = %ymm7

YTMP0 = %ymm0
YTMP1 = %ymm1
YTMP2 = %ymm2
YTMP3 = %ymm3
YTMP4 = %ymm8
XFER  = YTMP0

BYTE_FLIP_MASK  = %ymm9

# 1st arg is %rdi, which is saved to the stack and accessed later via %r12
CTX1        = %rdi
CTX2        = %r12
# 2nd arg
INP         = %rsi
# 3rd arg
NUM_BLKS    = %rdx

c           = %rcx
d           = %r8
e           = %rdx
y3          = %rsi

TBL   = %rdi # clobbers CTX1

a     = %rax
b     = %rbx

f     = %r9
g     = %r10
h     = %r11
old_h = %r11

T1    = %r12 # clobbers CTX2
y0    = %r13
y1    = %r14
y2    = %r15

# Local variables (stack frame)
XFER_SIZE = 4*8
SRND_SIZE = 1*8
INP_SIZE = 1*8
INPEND_SIZE = 1*8
CTX_SIZE = 1*8
RSPSAVE_SIZE = 1*8
GPRSAVE_SIZE = 5*8

frame_XFER = 0
frame_SRND = frame_XFER + XFER_SIZE
frame_INP = frame_SRND + SRND_SIZE
frame_INPEND = frame_INP + INP_SIZE
frame_CTX = frame_INPEND + INPEND_SIZE
frame_RSPSAVE = frame_CTX + CTX_SIZE
frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
frame_size = frame_GPRSAVE + GPRSAVE_SIZE

## assume buffers not aligned
#define	VMOVDQ vmovdqu

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm
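# Illustrative usage (not part of the original comments):
#	addm	8*0(CTX2), a	# a += mem[CTX2]; mem[CTX2] = a
# This is how the working variables are folded back into the digest at the
# end of each block.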


# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
# Load ymm with mem and byte swap each qword
.macro COPY_YMM_AND_BSWAP p1 p2 p3
	VMOVDQ \p2, \p1
	vpshufb \p3, \p1, \p1
.endm
# rotate_Ys
# Rotate values of symbols Y_0...Y_3
.macro rotate_Ys
	Y_ = Y_0
	Y_0 = Y_1
	Y_1 = Y_2
	Y_2 = Y_3
	Y_3 = Y_
.endm

# RotateState
.macro RotateState
	# Rotate symbols a..h right
	old_h  = h
	TMP_   = h
	h      = g
	g      = f
	f      = e
	e      = d
	d      = c
	c      = b
	b      = a
	a      = TMP_
.endm

# macro MY_VPALIGNR	YDST, YSRC1, YSRC2, RVAL
# YDST = {YSRC1, YSRC2} >> RVAL*8
.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
	vperm2f128      $0x3, \YSRC2, \YSRC1, \YDST     # YDST = {YS1_LO, YS2_HI}
	vpalignr        $\RVAL, \YSRC2, \YDST, \YDST    # YDST = {YDS1, YS2} >> RVAL*8
.endm
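# Illustrative example (not from the original source): MY_VPALIGNR YTMP0, Y_3, Y_2, 8
# with Y_3 = {q7,q6,q5,q4} and Y_2 = {q3,q2,q1,q0} (qwords, high to low) yields
# YTMP0 = {q4,q3,q2,q1}, i.e. the 8-qword value {Y_3,Y_2} shifted right by one
# qword.  The schedule below uses exactly this form to extract W[-7] and W[-15].
#
# For reference, the macros below implement the standard SHA-512 round and
# message schedule (C-style sketch, shown here only as a comment):
#	T1 = h + S1(e) + Ch(e,f,g) + K[t] + W[t]
#	T2 = S0(a) + Maj(a,b,c)
#	h = g; g = f; f = e; e = d + T1; d = c; c = b; b = a; a = T1 + T2
#	W[t] = W[t-16] + s0(W[t-15]) + W[t-7] + s1(W[t-2])
# where
#	S0(x) = ror64(x,28) ^ ror64(x,34) ^ ror64(x,39)
#	S1(x) = ror64(x,14) ^ ror64(x,18) ^ ror64(x,41)
#	s0(x) = ror64(x,1)  ^ ror64(x,8)  ^ (x >> 7)
#	s1(x) = ror64(x,19) ^ ror64(x,61) ^ (x >> 6)
#	Ch(e,f,g)  = ((f ^ g) & e) ^ g
#	Maj(a,b,c) = ((a | c) & b) | (a & c)
# K[t] + W[t] is precomputed into frame_XFER before each group of four rounds.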
.macro FOUR_ROUNDS_AND_SCHED
################################### RND N + 0 #########################################

	# Extract w[t-7]
	MY_VPALIGNR	YTMP0, Y_3, Y_2, 8		# YTMP0 = W[-7]
	# Calculate w[t-16] + w[t-7]
	vpaddq		Y_0, YTMP0, YTMP0		# YTMP0 = W[-7] + W[-16]
	# Extract w[t-15]
	MY_VPALIGNR	YTMP1, Y_1, Y_0, 8		# YTMP1 = W[-15]

	# Calculate sigma0

	# Calculate w[t-15] ror 1
	vpsrlq		$1, YTMP1, YTMP2
	vpsllq		$(64-1), YTMP1, YTMP3
	vpor		YTMP2, YTMP3, YTMP3		# YTMP3 = W[-15] ror 1
	# Calculate w[t-15] shr 7
	vpsrlq		$7, YTMP1, YTMP4		# YTMP4 = W[-15] >> 7

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	add	frame_XFER(%rsp),h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA
	mov	f, y2		# y2 = f                                # CH
	rorx	$34, a, T1	# T1 = a >> 34				# S0B

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	xor	g, y2		# y2 = f^g                              # CH
	rorx	$14, e, y1	# y1 = (e >> 14)			# S1

	and	e, y2		# y2 = (f^g)&e                          # CH
	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	add	h, d		# d = k + w + h + d                     # --

	and	b, y3		# y3 = (a|c)&b                          # MAJA
	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	rorx	$28, a, T1	# T1 = (a >> 28)			# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB

	add	y0, y2		# y2 = S1 + CH                          # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	RotateState

################################### RND N + 1 #########################################

	# Calculate w[t-15] ror 8
	vpsrlq		$8, YTMP1, YTMP2
	vpsllq		$(64-8), YTMP1, YTMP1
	vpor		YTMP2, YTMP1, YTMP1		# YTMP1 = W[-15] ror 8
	# XOR the three components
	vpxor		YTMP4, YTMP3, YTMP3		# YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
	vpxor		YTMP1, YTMP3, YTMP1		# YTMP1 = s0


	# Add three components, w[t-16], w[t-7] and sigma0
	vpaddq		YTMP1, YTMP0, YTMP0		# YTMP0 = W[-16] + W[-7] + s0
	# Move to appropriate lanes for calculating w[16] and w[17]
	vperm2f128	$0x0, YTMP0, YTMP0, Y_0		# Y_0 = W[-16] + W[-7] + s0 {BABA}
	# Move to appropriate lanes for calculating w[18] and w[19]
	vpand		MASK_YMM_LO(%rip), YTMP0, YTMP0	# YTMP0 = W[-16] + W[-7] + s0 {DC00}

	# Calculate w[16] and w[17] in both 128 bit lanes

	# Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
	vperm2f128	$0x11, Y_3, Y_3, YTMP2		# YTMP2 = W[-2] {BABA}
	vpsrlq		$6, YTMP2, YTMP4		# YTMP4 = W[-2] >> 6 {BABA}


	mov	a, y3		# y3 = a                                # MAJA
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	add	1*8+frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	mov	f, y2		# y2 = f                                # CH
	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --

	and	b, y3		# y3 = (a|c)&b                          # MAJA
	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0

	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	RotateState


################################### RND N + 2 #########################################

	vpsrlq		$19, YTMP2, YTMP3		# YTMP3 = W[-2] >> 19 {BABA}
	vpsllq		$(64-19), YTMP2, YTMP1		# YTMP1 = W[-2] << 19 {BABA}
	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 19 {BABA}
	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
	vpsrlq		$61, YTMP2, YTMP3		# YTMP3 = W[-2] >> 61 {BABA}
	vpsllq		$(64-61), YTMP2, YTMP1		# YTMP1 = W[-2] << 61 {BABA}
	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 61 {BABA}
	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = s1 = (W[-2] ror 19) ^
							#  (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}

	# Add sigma1 to the other components to get w[16] and w[17]
	vpaddq		YTMP4, Y_0, Y_0			# Y_0 = {W[1], W[0], W[1], W[0]}

	# Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
	vpsrlq		$6, Y_0, YTMP4			# YTMP4 = W[-2] >> 6 {DC--}

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	add	2*8+frame_XFER(%rsp), h		# h = k + w + h         # --

	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	or	c, y3		# y3 = a|c                              # MAJA
	mov	f, y2		# y2 = f                                # CH
	xor	g, y2		# y2 = f^g                              # CH

	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	and	e, y2		# y2 = (f^g)&e                          # CH

	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	rorx	$28, a, T1	# T1 = (a >> 28)			# S0

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --

	RotateState

################################### RND N + 3 #########################################

	vpsrlq		$19, Y_0, YTMP3			# YTMP3 = W[-2] >> 19 {DC--}
	vpsllq		$(64-19), Y_0, YTMP1		# YTMP1 = W[-2] << 19 {DC--}
	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 19 {DC--}
	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
	vpsrlq		$61, Y_0, YTMP3			# YTMP3 = W[-2] >> 61 {DC--}
	vpsllq		$(64-61), Y_0, YTMP1		# YTMP1 = W[-2] << 61 {DC--}
	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 61 {DC--}
	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = s1 = (W[-2] ror 19) ^
							#  (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}

	# Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
	# to newly calculated sigma1 to get w[18] and w[19]
	vpaddq		YTMP4, YTMP0, YTMP2		# YTMP2 = {W[3], W[2], --, --}

	# Form w[19], w[18], w[17], w[16]
	vpblendd		$0xF0, YTMP2, Y_0, Y_0		# Y_0 = {W[3], W[2], W[1], W[0]}

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	add	3*8+frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	mov	f, y2		# y2 = f                                # CH
	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	add	y0, y2		# y2 = S1 + CH                          # --

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	rorx	$28, a, T1	# T1 = (a >> 28)			# S0

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ

	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	RotateState

	rotate_Ys
.endm

.macro DO_4ROUNDS

################################### RND N + 0 #########################################

	mov	f, y2		# y2 = f                                # CH
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	and	e, y2		# y2 = (f^g)&e                          # CH

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
	add	frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	RotateState

################################### RND N + 1 #########################################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
	add	8*1+frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	RotateState

################################### RND N + 2 #########################################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
	add	8*2+frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	RotateState

################################### RND N + 3 #########################################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
	add	8*3+frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --

	RotateState

.endm

########################################################################
# void sha512_transform_rorx(struct sha512_state *state, const u8 *data, int blocks)
# Purpose: Updates the SHA512 digest stored at "state" with the message
# stored in "data".
# The size of the message pointed to by "data" must be an integer multiple
# of SHA512 message blocks.
# "blocks" is the message length in SHA512 blocks
########################################################################
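# A matching C-side declaration (illustrative only; the glue code is not part
# of this file) would look roughly like:
#	asmlinkage void sha512_transform_rorx(struct sha512_state *state,
#					      const u8 *data, int blocks);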
SYM_FUNC_START(sha512_transform_rorx)
	# Allocate Stack Space
	mov	%rsp, %rax
	sub	$frame_size, %rsp
	and	$~(0x20 - 1), %rsp
	mov	%rax, frame_RSPSAVE(%rsp)

	# Save GPRs
	mov	%rbx, 8*0+frame_GPRSAVE(%rsp)
	mov	%r12, 8*1+frame_GPRSAVE(%rsp)
	mov	%r13, 8*2+frame_GPRSAVE(%rsp)
	mov	%r14, 8*3+frame_GPRSAVE(%rsp)
	mov	%r15, 8*4+frame_GPRSAVE(%rsp)

	shl	$7, NUM_BLKS	# convert to bytes
	jz	done_hash
	add	INP, NUM_BLKS	# pointer to end of data
	mov	NUM_BLKS, frame_INPEND(%rsp)

	## load initial digest
	mov	8*0(CTX1), a
	mov	8*1(CTX1), b
	mov	8*2(CTX1), c
	mov	8*3(CTX1), d
	mov	8*4(CTX1), e
	mov	8*5(CTX1), f
	mov	8*6(CTX1), g
	mov	8*7(CTX1), h

	# save %rdi (CTX) before it gets clobbered
	mov	%rdi, frame_CTX(%rsp)

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK

loop0:
	lea	K512(%rip), TBL

	## byte swap first 16 qwords
	COPY_YMM_AND_BSWAP	Y_0, (INP), BYTE_FLIP_MASK
	COPY_YMM_AND_BSWAP	Y_1, 1*32(INP), BYTE_FLIP_MASK
	COPY_YMM_AND_BSWAP	Y_2, 2*32(INP), BYTE_FLIP_MASK
	COPY_YMM_AND_BSWAP	Y_3, 3*32(INP), BYTE_FLIP_MASK

	mov	INP, frame_INP(%rsp)

	## schedule the remaining 64 message qwords, by doing 4 iterations of 16 rounds each
	movq	$4, frame_SRND(%rsp)

.align 16
loop1:
	vpaddq	(TBL), Y_0, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddq	1*32(TBL), Y_0, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddq	2*32(TBL), Y_0, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddq	3*32(TBL), Y_0, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	add	$(4*32), TBL
	FOUR_ROUNDS_AND_SCHED

	subq	$1, frame_SRND(%rsp)
	jne	loop1

	## do the final 16 rounds; no further message scheduling is needed
	movq	$2, frame_SRND(%rsp)
loop2:
	vpaddq	(TBL), Y_0, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	DO_4ROUNDS
	vpaddq	1*32(TBL), Y_1, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	add	$(2*32), TBL
	DO_4ROUNDS

	vmovdqa	Y_2, Y_0
	vmovdqa	Y_3, Y_1

	subq	$1, frame_SRND(%rsp)
	jne	loop2

	mov	frame_CTX(%rsp), CTX2
	addm	8*0(CTX2), a
	addm	8*1(CTX2), b
	addm	8*2(CTX2), c
	addm	8*3(CTX2), d
	addm	8*4(CTX2), e
	addm	8*5(CTX2), f
	addm	8*6(CTX2), g
	addm	8*7(CTX2), h

	mov	frame_INP(%rsp), INP
	add	$128, INP
	cmp	frame_INPEND(%rsp), INP
	jne	loop0

done_hash:

# Restore GPRs
	mov	8*0+frame_GPRSAVE(%rsp), %rbx
	mov	8*1+frame_GPRSAVE(%rsp), %r12
	mov	8*2+frame_GPRSAVE(%rsp), %r13
	mov	8*3+frame_GPRSAVE(%rsp), %r14
	mov	8*4+frame_GPRSAVE(%rsp), %r15

	# Restore Stack Pointer
	mov	frame_RSPSAVE(%rsp), %rsp
	RET
SYM_FUNC_END(sha512_transform_rorx)

########################################################################
### Binary Data

# Mergeable 640-byte rodata section. This allows the linker to merge the table
# with an identical 640-byte fragment of another rodata section
# (if such a section exists).
.section	.rodata.cst640.K512, "aM", @progbits, 640
.align 64
# K[t] used in SHA512 hashing
K512:
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817

.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
# Mask for byte-swapping the qwords in a YMM register using vpshufb.
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x08090a0b0c0d0e0f0001020304050607
	.octa 0x18191a1b1c1d1e1f1011121314151617
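# Note (added for clarity): when applied with vpshufb, each 8-byte group of
# shuffle indices above reverses the byte order of one qword, so the big-endian
# message words become the little-endian qwords the rounds operate on.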
.section	.rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32
.align 32
MASK_YMM_LO:
	.octa 0x00000000000000000000000000000000
	.octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF