########################################################################
# Implement fast SHA-512 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     David Cote <david.m.cote@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-512 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules one block at a time, with 4 lanes per block
########################################################################
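########################################################################
# Reference equations (FIPS 180-4) behind the S0/S1/CH/MAJ round tags
# and the message schedule computed below:
#
#   W[t]  = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
#   sigma0(x) = (x ror 1)  ^ (x ror 8)  ^ (x >> 7)
#   sigma1(x) = (x ror 19) ^ (x ror 61) ^ (x >> 6)
#
#   S0  = (a ror 28) ^ (a ror 34) ^ (a ror 39)
#   S1  = (e ror 14) ^ (e ror 18) ^ (e ror 41)
#   CH  = (e & f) ^ (~e & g)            # computed as ((f^g)&e)^g
#   MAJ = (a & b) ^ (a & c) ^ (b & c)   # computed as ((a|c)&b)|(a&c)
#   t1  = h + S1 + CH + K[t] + W[t];  d += t1;  h = t1 + S0 + MAJ
########################################################################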

#include <linux/linkage.h>
#include <linux/cfi_types.h>

.text

# Virtual Registers
Y_0 = %ymm4
Y_1 = %ymm5
Y_2 = %ymm6
Y_3 = %ymm7

YTMP0 = %ymm0
YTMP1 = %ymm1
YTMP2 = %ymm2
YTMP3 = %ymm3
YTMP4 = %ymm8
XFER  = YTMP0

BYTE_FLIP_MASK  = %ymm9

# 1st arg is %rdi, which is saved to the stack and accessed later via %r12
CTX1        = %rdi
CTX2        = %r12
# 2nd arg
INP         = %rsi
# 3rd arg
NUM_BLKS    = %rdx

c           = %rcx
d           = %r8
e           = %rdx
y3          = %rsi

TBL   = %rdi # clobbers CTX1

a     = %rax
b     = %rbx

f     = %r9
g     = %r10
h     = %r11
old_h = %r11

T1    = %r12 # clobbers CTX2
y0    = %r13
y1    = %r14
y2    = %r15

# Local variables (stack frame)
XFER_SIZE = 4*8
SRND_SIZE = 1*8
INP_SIZE = 1*8
INPEND_SIZE = 1*8
CTX_SIZE = 1*8

frame_XFER = 0
frame_SRND = frame_XFER + XFER_SIZE
frame_INP = frame_SRND + SRND_SIZE
frame_INPEND = frame_INP + INP_SIZE
frame_CTX = frame_INPEND + INPEND_SIZE
frame_size = frame_CTX + CTX_SIZE
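# Resulting layout relative to %rsp (frame_size = 64 bytes):
#   [ 0..31]  XFER   - current group of four K[t] + W[t] values
#   [32..39]  SRND   - loop counter for the round-group loops
#   [40..47]  INP    - saved input pointer for the current block
#   [48..55]  INPEND - pointer to the end of the input data
#   [56..63]  CTX    - saved digest pointer (%rdi), reloaded into %r12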
113 
114 ## assume buffers not aligned
115 #define	VMOVDQ vmovdqu
116 
117 # addm [mem], reg
118 # Add reg to mem using reg-mem add and store
119 .macro addm p1 p2
120 	add	\p1, \p2
121 	mov	\p2, \p1
122 .endm
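# Example: "addm 8*0(CTX2), a" adds the digest word in memory to register
# a and stores the sum back, as used when folding the working variables
# into the digest at the end of each block.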


# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
# Load ymm with mem and byte swap each qword
.macro COPY_YMM_AND_BSWAP p1 p2 p3
	VMOVDQ \p2, \p1
	vpshufb \p3, \p1, \p1
.endm
# rotate_Ys
# Rotate values of symbols Y_0...Y_3
.macro rotate_Ys
	Y_ = Y_0
	Y_0 = Y_1
	Y_1 = Y_2
	Y_2 = Y_3
	Y_3 = Y_
.endm

# RotateState
.macro RotateState
	# Rotate symbols a..h right
	old_h  = h
	TMP_   = h
	h      = g
	g      = f
	f      = e
	e      = d
	d      = c
	c      = b
	b      = a
	a      = TMP_
.endm
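# Note: rotate_Ys and RotateState only reassign assembler symbols; they
# emit no instructions, so rotating the ymm/GPR roles between rounds has
# no run-time cost.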

# macro MY_VPALIGNR	YDST, YSRC1, YSRC2, RVAL
# YDST = {YSRC1, YSRC2} >> RVAL*8
.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
	vperm2f128      $0x3, \YSRC2, \YSRC1, \YDST     # YDST = {YS1_LO, YS2_HI}
	vpalignr        $\RVAL, \YSRC2, \YDST, \YDST    # YDST = {YS1, YS2} >> RVAL*8
.endm
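# Example (RVAL = 8, as used below): with YSRC1 = {W[-1], W[-2], W[-3], W[-4]}
# and YSRC2 = {W[-5], W[-6], W[-7], W[-8]} (highest qword first), the result
# is {W[-4], W[-5], W[-6], W[-7]}, i.e. the four-qword window shifted down
# by one qword across the 128-bit lane boundary.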

.macro FOUR_ROUNDS_AND_SCHED
################################### RND N + 0 #########################################

	# Extract w[t-7]
	MY_VPALIGNR	YTMP0, Y_3, Y_2, 8		# YTMP0 = W[-7]
	# Calculate w[t-16] + w[t-7]
	vpaddq		Y_0, YTMP0, YTMP0		# YTMP0 = W[-7] + W[-16]
	# Extract w[t-15]
	MY_VPALIGNR	YTMP1, Y_1, Y_0, 8		# YTMP1 = W[-15]

	# Calculate sigma0

	# Calculate w[t-15] ror 1
	vpsrlq		$1, YTMP1, YTMP2
	vpsllq		$(64-1), YTMP1, YTMP3
	vpor		YTMP2, YTMP3, YTMP3		# YTMP3 = W[-15] ror 1
	# Calculate w[t-15] shr 7
	vpsrlq		$7, YTMP1, YTMP4		# YTMP4 = W[-15] >> 7

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	add	frame_XFER(%rsp),h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA
	mov	f, y2		# y2 = f                                # CH
	rorx	$34, a, T1	# T1 = a >> 34				# S0B

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	xor	g, y2		# y2 = f^g                              # CH
	rorx	$14, e, y1	# y1 = (e >> 14)			# S1

	and	e, y2		# y2 = (f^g)&e                          # CH
	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	add	h, d		# d = k + w + h + d                     # --

	and	b, y3		# y3 = (a|c)&b                          # MAJA
	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	rorx	$28, a, T1	# T1 = (a >> 28)			# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB

	add	y0, y2		# y2 = S1 + CH                          # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	RotateState

################################### RND N + 1 #########################################

	# Calculate w[t-15] ror 8
	vpsrlq		$8, YTMP1, YTMP2
	vpsllq		$(64-8), YTMP1, YTMP1
	vpor		YTMP2, YTMP1, YTMP1		# YTMP1 = W[-15] ror 8
	# XOR the three components
	vpxor		YTMP4, YTMP3, YTMP3		# YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
	vpxor		YTMP1, YTMP3, YTMP1		# YTMP1 = s0


	# Add three components, w[t-16], w[t-7] and sigma0
	vpaddq		YTMP1, YTMP0, YTMP0		# YTMP0 = W[-16] + W[-7] + s0
	# Move to appropriate lanes for calculating w[16] and w[17]
	vperm2f128	$0x0, YTMP0, YTMP0, Y_0		# Y_0 = W[-16] + W[-7] + s0 {BABA}
	# Move to appropriate lanes for calculating w[18] and w[19]
	vpand		MASK_YMM_LO(%rip), YTMP0, YTMP0	# YTMP0 = W[-16] + W[-7] + s0 {DC00}

	# Calculate w[16] and w[17] in both 128 bit lanes

	# Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
	vperm2f128	$0x11, Y_3, Y_3, YTMP2		# YTMP2 = W[-2] {BABA}
	vpsrlq		$6, YTMP2, YTMP4		# YTMP4 = W[-2] >> 6 {BABA}


	mov	a, y3		# y3 = a                                # MAJA
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	add	1*8+frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	mov	f, y2		# y2 = f                                # CH
	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --

	and	b, y3		# y3 = (a|c)&b                          # MAJA
	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0

	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	RotateState



################################### RND N + 2 #########################################

	vpsrlq		$19, YTMP2, YTMP3		# YTMP3 = W[-2] >> 19 {BABA}
	vpsllq		$(64-19), YTMP2, YTMP1		# YTMP1 = W[-2] << 19 {BABA}
	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 19 {BABA}
	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
	vpsrlq		$61, YTMP2, YTMP3		# YTMP3 = W[-2] >> 61 {BABA}
	vpsllq		$(64-61), YTMP2, YTMP1		# YTMP1 = W[-2] << 61 {BABA}
	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 61 {BABA}
	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = s1 = (W[-2] ror 19) ^
							#  (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}

	# Add sigma1 to the other components to get w[16] and w[17]
	vpaddq		YTMP4, Y_0, Y_0			# Y_0 = {W[1], W[0], W[1], W[0]}

	# Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
	vpsrlq		$6, Y_0, YTMP4			# YTMP4 = W[-2] >> 6 {DC--}

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	add	2*8+frame_XFER(%rsp), h		# h = k + w + h         # --

	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	or	c, y3		# y3 = a|c                              # MAJA
	mov	f, y2		# y2 = f                                # CH
	xor	g, y2		# y2 = f^g                              # CH

	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	and	e, y2		# y2 = (f^g)&e                          # CH

	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	rorx	$28, a, T1	# T1 = (a >> 28)			# S0

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --

	RotateState

################################### RND N + 3 #########################################

	vpsrlq		$19, Y_0, YTMP3			# YTMP3 = W[-2] >> 19 {DC--}
	vpsllq		$(64-19), Y_0, YTMP1		# YTMP1 = W[-2] << 19 {DC--}
	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 19 {DC--}
	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
	vpsrlq		$61, Y_0, YTMP3			# YTMP3 = W[-2] >> 61 {DC--}
	vpsllq		$(64-61), Y_0, YTMP1		# YTMP1 = W[-2] << 61 {DC--}
	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 61 {DC--}
	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = s1 = (W[-2] ror 19) ^
							#  (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}

	# Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
	# to newly calculated sigma1 to get w[18] and w[19]
	vpaddq		YTMP4, YTMP0, YTMP2		# YTMP2 = {W[3], W[2], --, --}

	# Form w[19], w[18], w[17], w[16]
	vpblendd		$0xF0, YTMP2, Y_0, Y_0		# Y_0 = {W[3], W[2], W[1], W[0]}

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	add	3*8+frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	mov	f, y2		# y2 = f                                # CH
	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	add	y0, y2		# y2 = S1 + CH                          # --

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	rorx	$28, a, T1	# T1 = (a >> 28)			# S0

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ

	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	RotateState

	rotate_Ys
.endm

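# DO_4ROUNDS performs four rounds using the precomputed K[t] + W[t] values
# in frame_XFER, without scheduling any new message words. It is used for
# the final 16 rounds, where no further W[t] values are needed.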
.macro DO_4ROUNDS

################################### RND N + 0 #########################################

	mov	f, y2		# y2 = f                                # CH
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	and	e, y2		# y2 = (f^g)&e                          # CH

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
	add	frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	RotateState

################################### RND N + 1 #########################################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
	add	8*1+frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	RotateState

################################### RND N + 2 #########################################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
	add	8*2+frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	RotateState

################################### RND N + 3 #########################################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
	add	8*3+frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --

	RotateState

.endm

########################################################################
# void sha512_transform_rorx(sha512_state *state, const u8 *data, int blocks)
# Purpose: Updates the SHA512 digest stored at "state" with the message
# stored in "data".
# The message pointed to by "data" must consist of an integer number of
# complete SHA512 blocks (128 bytes each).
# "blocks" is the message length in SHA512 blocks.
########################################################################
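# Per 128-byte block: load and byte-swap 16 message qwords into Y_0..Y_3,
# run .Lloop1 four times (16 rounds each, with message scheduling), then
# .Lloop2 twice (8 rounds each, no scheduling), and finally fold the
# working variables back into the digest saved at frame_CTX.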
SYM_TYPED_FUNC_START(sha512_transform_rorx)
	# Save GPRs
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	# Allocate Stack Space
	push	%rbp
	mov	%rsp, %rbp
	sub	$frame_size, %rsp
	and	$~(0x20 - 1), %rsp

	shl	$7, NUM_BLKS	# convert to bytes
	jz	.Ldone_hash
	add	INP, NUM_BLKS	# pointer to end of data
	mov	NUM_BLKS, frame_INPEND(%rsp)

	## load initial digest
	mov	8*0(CTX1), a
	mov	8*1(CTX1), b
	mov	8*2(CTX1), c
	mov	8*3(CTX1), d
	mov	8*4(CTX1), e
	mov	8*5(CTX1), f
	mov	8*6(CTX1), g
	mov	8*7(CTX1), h

	# save %rdi (CTX) before it gets clobbered
	mov	%rdi, frame_CTX(%rsp)

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK

.Lloop0:
	lea	K512(%rip), TBL

	## byte swap first 16 qwords
	COPY_YMM_AND_BSWAP	Y_0, (INP), BYTE_FLIP_MASK
	COPY_YMM_AND_BSWAP	Y_1, 1*32(INP), BYTE_FLIP_MASK
	COPY_YMM_AND_BSWAP	Y_2, 2*32(INP), BYTE_FLIP_MASK
	COPY_YMM_AND_BSWAP	Y_3, 3*32(INP), BYTE_FLIP_MASK

	mov	INP, frame_INP(%rsp)

	## schedule 64 input qwords, doing 16 groups of 4 rounds each
	movq	$4, frame_SRND(%rsp)

.align 16
.Lloop1:
	vpaddq	(TBL), Y_0, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddq	1*32(TBL), Y_0, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddq	2*32(TBL), Y_0, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddq	3*32(TBL), Y_0, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	add	$(4*32), TBL
	FOUR_ROUNDS_AND_SCHED

	subq	$1, frame_SRND(%rsp)
	jne	.Lloop1

	movq	$2, frame_SRND(%rsp)
.Lloop2:
	vpaddq	(TBL), Y_0, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	DO_4ROUNDS
	vpaddq	1*32(TBL), Y_1, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	add	$(2*32), TBL
	DO_4ROUNDS

	vmovdqa	Y_2, Y_0
	vmovdqa	Y_3, Y_1

	subq	$1, frame_SRND(%rsp)
	jne	.Lloop2

	mov	frame_CTX(%rsp), CTX2
	addm	8*0(CTX2), a
	addm	8*1(CTX2), b
	addm	8*2(CTX2), c
	addm	8*3(CTX2), d
	addm	8*4(CTX2), e
	addm	8*5(CTX2), f
	addm	8*6(CTX2), g
	addm	8*7(CTX2), h

	mov	frame_INP(%rsp), INP
	add	$128, INP
	cmp	frame_INPEND(%rsp), INP
	jne	.Lloop0

.Ldone_hash:

	# Restore Stack Pointer
	mov	%rbp, %rsp
	pop	%rbp

	# Restore GPRs
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx

	RET
SYM_FUNC_END(sha512_transform_rorx)

########################################################################
### Binary Data


# Mergeable 640-byte rodata section. This allows the linker to merge the
# table with an identical 640-byte fragment of another rodata section
# (if such a section exists).
.section	.rodata.cst640.K512, "aM", @progbits, 640
.align 64
# K[t] used in SHA512 hashing
K512:
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817

.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
# Mask for byte-swapping the qwords in a YMM register using (v)pshufb.
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x08090a0b0c0d0e0f0001020304050607
	.octa 0x18191a1b1c1d1e1f1011121314151617

.section	.rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32
.align 32
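# Mask that clears the lower 128-bit lane of a ymm register, keeping only
# the upper lane (used to isolate the {DC00} schedule words).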
MASK_YMM_LO:
	.octa 0x00000000000000000000000000000000
	.octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
