########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################

#include <linux/linkage.h>

## assume buffers not aligned
#define	VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm
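
# Example: at the call sites below, "addm (4*0)(CTX), a" (with a = %eax)
# expands to the reg-mem add/store pair
#	add	(4*0)(CTX), %eax
#	mov	%eax, (4*0)(CTX)
# so the digest word in memory is updated in place with the working value.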

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA =	%ymm10 # shuffle xBxA -> 00BA
SHUF_DC00 =	%ymm12 # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx	# 3rd arg
INP	= %rsi  # 2nd arg
CTX	= %rdi	# 1st arg
c	= %ecx
d	= %r8d
e       = %edx	# clobbers NUM_BLKS
y3	= %esi	# clobbers INP

SRND	= CTX	# SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8
_RSP_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER     + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END  + _INP_END_SIZE
_CTX		= _INP      + _INP_SIZE
_RSP		= _CTX      + _CTX_SIZE
STACK_SIZE	= _RSP      + _RSP_SIZE
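
# With the sizes above, the stack frame works out to:
#	_XFER      =   0	(512 bytes of pre-computed W[t]+K[t] for 2 blocks)
#	_XMM_SAVE  = 512	(empty, size 0)
#	_INP_END   = 512	(pointer to the last input block)
#	_INP       = 520	(current input pointer)
#	_CTX       = 528	(saved state pointer)
#	_RSP       = 536	(saved original %rsp)
#	STACK_SIZE = 544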

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm

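
# Both macros only rename assembler symbols; they emit no instructions.
# For example, starting from the assignments above (a = %eax ... h = %r11d),
# one ROTATE_ARGS leaves b..h naming the old a..g registers and
# a = %r11d (the old h), with old_h still naming %r11d.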
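# FOUR_ROUNDS_AND_SCHED: four SHA-256 rounds for the first block, interleaved
# with the message-schedule (sigma0/sigma1) work that derives the next four
# W values for both blocks from X0..X3.  \disp is the byte offset of this
# round group's pre-computed W+K values within the _XFER area (indexed by SRND).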
.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	addl	\disp(%rsp, SRND), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA
	vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1

	and	e, y2		# y2 = (f^g)&e                          # CH
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	h, d		# d = k + w + h + d                     # --

	and	b, y3		# y3 = (a|c)&b                          # MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB

	add	y0, y2		# y2 = S1 + CH                          # --
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	vpsrld	$3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	vpsrld	$10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h         # --

	vpsrlq	$19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	or	c, y3		# y3 = a|c                              # MAJA
	mov	f, y2		# y2 = f                                # CH
	xor	g, y2		# y2 = f^g                              # CH

	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e                          # CH

	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	y0, y2		# y2 = S1 + CH                          # --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ

	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	ROTATE_ARGS
	rotate_Xs
.endm

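# DO_4ROUNDS: four SHA-256 rounds with no message scheduling, reading the
# pre-computed W+K values at \disp(%rsp, SRND).  The final two additions
# into h are deferred into the start of the following round (via old_h)
# to shorten the per-round dependency chain.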
.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	addl	\disp(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 1 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 2 ##############################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --


	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --

	ROTATE_ARGS

.endm

########################################################################
## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
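#
# A rough caller-side sketch (the C prototype is assumed from the argument
# list above; the declaration shown here is illustrative, not the actual
# glue code):
#
#	asmlinkage void sha256_transform_rorx(struct sha256_state *state,
#					      const u8 *data, int blocks);
#
#	/* hash a whole number of 64-byte blocks */
#	sha256_transform_rorx(state, data, len / 64);
#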
.text
SYM_FUNC_START(sha256_transform_rorx)
.align 32
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	mov	%rsp, %rax
	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary
	mov	%rax, _RSP(%rsp)


	shl	$6, NUM_BLKS	# convert to bytes
	jz	done_hash
	lea	-64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	only_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)

loop0:
	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3
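
	## After the loads and permutes above, each X register holds one
	## 16-byte group of message dwords from both blocks
	## (high 128-bit lane | low 128-bit lane):
	##	X0 = { blk2 w3..w0   | blk1 w3..w0   }
	##	X1 = { blk2 w7..w4   | blk1 w7..w4   }
	##	X2 = { blk2 w11..w8  | blk1 w11..w8  }
	##	X3 = { blk2 w15..w12 | blk1 w15..w12 }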

last_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
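	## SRND advances as a byte offset used both into the K256 table and
	## into the _XFER save area on the stack; each 32-byte step covers
	## 4 rounds' worth of W+K for both blocks.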
	xor	SRND, SRND

.align 16
loop1:
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X0, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	vpaddd	K256+2*32(SRND), X0, XFER
	vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	vpaddd	K256+3*32(SRND), X0, XFER
	vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	loop1

loop2:
	## Do last 16 rounds with no scheduling
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X1, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	loop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	ja	done_hash

	#### Do second block using previously scheduled results
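	#### Each 32-byte _XFER entry holds W+K for block 1 in its low 16 bytes
	#### and for block 2 in its high 16 bytes, so the "+ 16" displacement
	#### below replays the rounds against block 2's saved values.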
	xor	SRND, SRND
.align 16
loop3:
	DO_4ROUNDS	 _XFER + 0*32 + 16
	DO_4ROUNDS	 _XFER + 1*32 + 16
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	loop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	jb	loop0
	ja	done_hash

do_last_block:
	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	last_block_enter

only_one_block:

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	do_last_block

done_hash:

	mov	_RSP(%rsp), %rsp

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	RET
SYM_FUNC_END(sha256_transform_rorx)

.section	.rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
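
# Each row of four K constants appears twice so that a single 256-bit vpaddd
# adds the same round constants to both blocks' message words (one block per
# 128-bit lane).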

.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
