1 ########################################################################
2 # Implement fast SHA-256 with AVX2 instructions. (x86_64)
3 #
4 # Copyright (C) 2013 Intel Corporation.
5 #
6 # Authors:
7 #     James Guilford <james.guilford@intel.com>
8 #     Kirk Yap <kirk.s.yap@intel.com>
9 #     Tim Chen <tim.c.chen@linux.intel.com>
10 #
11 # This software is available to you under a choice of one of two
12 # licenses.  You may choose to be licensed under the terms of the GNU
13 # General Public License (GPL) Version 2, available from the file
14 # COPYING in the main directory of this source tree, or the
15 # OpenIB.org BSD license below:
16 #
17 #     Redistribution and use in source and binary forms, with or
18 #     without modification, are permitted provided that the following
19 #     conditions are met:
20 #
21 #      - Redistributions of source code must retain the above
22 #        copyright notice, this list of conditions and the following
23 #        disclaimer.
24 #
25 #      - Redistributions in binary form must reproduce the above
26 #        copyright notice, this list of conditions and the following
27 #        disclaimer in the documentation and/or other materials
28 #        provided with the distribution.
29 #
30 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
31 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
32 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
33 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
34 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
35 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
36 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
37 # SOFTWARE.
38 #
39 ########################################################################
40 #
41 # This code is described in an Intel White-Paper:
42 # "Fast SHA-256 Implementations on Intel Architecture Processors"
43 #
44 # To find it, surf to http://www.intel.com/p/en_US/embedded
45 # and search for that title.
46 #
47 ########################################################################
48 # This code schedules 2 blocks at a time, with 4 lanes per block
49 ########################################################################
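########################################################################
# Layout: after loading and byte-swapping, each 256-bit X register holds
# four message dwords of block 1 in its low 128-bit lane and the matching
# four dwords of block 2 in its high lane, so one vector instruction
# advances the message schedule of both blocks at once.  The scalar rounds
# consume only the low-lane (block 1) words; the high-lane words are kept
# in the _XFER stack area and replayed for block 2 in .Lloop3 without
# recomputing the schedule.
########################################################################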
50 
51 #include <linux/linkage.h>
52 #include <linux/cfi_types.h>
53 
54 ## assume buffers not aligned
55 #define	VMOVDQ vmovdqu
56 
57 ################################ Define Macros
58 
59 # addm [mem], reg
60 # Add reg to mem (mem += reg) using a reg-mem add and a store back to mem
61 .macro addm p1 p2
62 	add	\p1, \p2
63 	mov	\p2, \p1
64 .endm
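# e.g. "addm (4*0)(CTX),a" computes state[0] += a, leaving the sum in both
# the memory word and the register.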
65 
66 ################################
67 
68 X0 = %ymm4
69 X1 = %ymm5
70 X2 = %ymm6
71 X3 = %ymm7
72 
73 # XMM versions of above
74 XWORD0 = %xmm4
75 XWORD1 = %xmm5
76 XWORD2 = %xmm6
77 XWORD3 = %xmm7
78 
79 XTMP0 = %ymm0
80 XTMP1 = %ymm1
81 XTMP2 = %ymm2
82 XTMP3 = %ymm3
83 XTMP4 = %ymm8
84 XFER  = %ymm9
85 XTMP5 = %ymm11
86 
87 SHUF_00BA =	%ymm10 # shuffle xBxA -> 00BA
88 SHUF_DC00 =	%ymm12 # shuffle xDxC -> DC00
89 BYTE_FLIP_MASK = %ymm13
90 
91 X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
92 
93 NUM_BLKS = %rdx	# 3rd arg
94 INP	= %rsi  # 2nd arg
95 CTX	= %rdi	# 1st arg
96 c	= %ecx
97 d	= %r8d
98 e       = %edx	# clobbers NUM_BLKS
99 y3	= %esi	# clobbers INP
100 
101 SRND	= CTX	# SRND is same register as CTX
102 
103 a = %eax
104 b = %ebx
105 f = %r9d
106 g = %r10d
107 h = %r11d
108 old_h = %r11d
109 
110 T1 = %r12d
111 y0 = %r13d
112 y1 = %r14d
113 y2 = %r15d
114 
115 
116 _XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
117 _XMM_SAVE_SIZE	= 0
118 _INP_END_SIZE	= 8
119 _INP_SIZE	= 8
120 _CTX_SIZE	= 8
121 
122 _XFER		= 0
123 _XMM_SAVE	= _XFER     + _XFER_SIZE
124 _INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
125 _INP		= _INP_END  + _INP_END_SIZE
126 _CTX		= _INP      + _INP_SIZE
127 STACK_SIZE	= _CTX      + _CTX_SIZE
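# Stack frame layout (from the 32-byte-aligned %rsp):
#	_XFER     : 2 blocks x 64 rounds x 4 bytes of scheduled W[i]+K[i];
#	            each 32-byte slot covers 4 rounds, block 1 words in the
#	            low 16 bytes and block 2 words in the high 16 bytes
#	_INP_END  : pointer to the last input block
#	_INP      : saved input pointer
#	_CTX      : saved state pointer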
128 
129 # rotate_Xs
130 # Rotate values of symbols X0...X3
131 .macro rotate_Xs
132 	X_ = X0
133 	X0 = X1
134 	X1 = X2
135 	X2 = X3
136 	X3 = X_
137 .endm
138 
139 # ROTATE_ARGS
140 # Rotate values of symbols a...h
141 .macro ROTATE_ARGS
142 	old_h = h
143 	TMP_ = h
144 	h = g
145 	g = f
146 	f = e
147 	e = d
148 	d = c
149 	c = b
150 	b = a
151 	a = TMP_
152 .endm
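# Note: rotate_Xs and ROTATE_ARGS reassign assembler symbols rather than
# move data.  After ROTATE_ARGS, "a" names the register that held h, "b"
# the one that held a, and so on, so the eight working variables rotate
# between rounds without any run-time register moves.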
153 
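# FOUR_ROUNDS_AND_SCHED interleaves four scalar SHA-256 rounds with the
# vector message schedule for the next four W values of both blocks:
#
#	S1  = (e ror 6)  ^ (e ror 11) ^ (e ror 25)
#	S0  = (a ror 2)  ^ (a ror 13) ^ (a ror 22)
#	CH  = (e & f) ^ (~e & g)		# computed as ((f^g)&e)^g
#	MAJ = (a&b) ^ (a&c) ^ (b&c)		# computed as ((a|c)&b)|(a&c)
#	t1  = h + S1 + CH + K[i] + W[i]
#	d  += t1		# becomes next round's e
#	h   = t1 + S0 + MAJ	# becomes next round's a (via ROTATE_ARGS)
#
#	s0   = (W[-15] ror 7)  ^ (W[-15] ror 18) ^ (W[-15] >> 3)
#	s1   = (W[-2]  ror 17) ^ (W[-2]  ror 19) ^ (W[-2]  >> 10)
#	W[0] = W[-16] + W[-7] + s0 + s1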
154 .macro FOUR_ROUNDS_AND_SCHED disp
155 ################################### RND N + 0 ############################
156 
157 	mov	a, y3		# y3 = a                                # MAJA
158 	rorx	$25, e, y0	# y0 = e >> 25				# S1A
159 	rorx	$11, e, y1	# y1 = e >> 11				# S1B
160 
161 	addl	\disp(%rsp, SRND), h		# h = k + w + h         # --
162 	or	c, y3		# y3 = a|c                              # MAJA
163 	vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
164 	mov	f, y2		# y2 = f                                # CH
165 	rorx	$13, a, T1	# T1 = a >> 13				# S0B
166 
167 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
168 	xor	g, y2		# y2 = f^g                              # CH
169 	vpaddd	X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
170 	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
171 
172 	and	e, y2		# y2 = (f^g)&e                          # CH
173 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
174 	rorx	$22, a, y1	# y1 = a >> 22				# S0A
175 	add	h, d		# d = k + w + h + d                     # --
176 
177 	and	b, y3		# y3 = (a|c)&b                          # MAJA
178 	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
179 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
180 	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
181 
182 	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
183 	vpsrld	$7, XTMP1, XTMP2
184 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
185 	mov	a, T1		# T1 = a                                # MAJB
186 	and	c, T1		# T1 = a&c                              # MAJB
187 
188 	add	y0, y2		# y2 = S1 + CH                          # --
189 	vpslld	$(32-7), XTMP1, XTMP3
190 	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
191 	add	y1, h		# h = k + w + h + S0                    # --
192 
193 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
194 	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7
195 
196 	vpsrld	$18, XTMP1, XTMP2
197 	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
198 	add	y3, h		# h = t1 + S0 + MAJ                     # --
199 
200 
201 	ROTATE_ARGS
202 
203 ################################### RND N + 1 ############################
204 
205 	mov	a, y3		# y3 = a                                # MAJA
206 	rorx	$25, e, y0	# y0 = e >> 25				# S1A
207 	rorx	$11, e, y1	# y1 = e >> 11				# S1B
208 	offset = \disp + 1*4
209 	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
210 	or	c, y3		# y3 = a|c                              # MAJA
211 
212 
213 	vpsrld	$3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
214 	mov	f, y2		# y2 = f                                # CH
215 	rorx	$13, a, T1	# T1 = a >> 13				# S0B
216 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
217 	xor	g, y2		# y2 = f^g                              # CH
218 
219 
220 	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
221 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
222 	rorx	$22, a, y1	# y1 = a >> 22				# S0A
223 	and	e, y2		# y2 = (f^g)&e                          # CH
224 	add	h, d		# d = k + w + h + d                     # --
225 
226 	vpslld	$(32-18), XTMP1, XTMP1
227 	and	b, y3		# y3 = (a|c)&b                          # MAJA
228 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
229 
230 	vpxor	XTMP1, XTMP3, XTMP3
231 	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
232 	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
233 
234 	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
235 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
236 	mov	a, T1		# T1 = a                                # MAJB
237 	and	c, T1		# T1 = a&c                              # MAJB
238 	add	y0, y2		# y2 = S1 + CH                          # --
239 
240 	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
241 	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
242 	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
243 	add	y1, h		# h = k + w + h + S0                    # --
244 
245 	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
246 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
247 	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
248 	add	y3, h		# h = t1 + S0 + MAJ                     # --
249 
250 	vpsrld	$10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
251 
252 
253 	ROTATE_ARGS
254 
255 ################################### RND N + 2 ############################
256 
257 	mov	a, y3		# y3 = a                                # MAJA
258 	rorx	$25, e, y0	# y0 = e >> 25				# S1A
259 	offset = \disp + 2*4
260 	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
261 
262 	vpsrlq	$19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
263 	rorx	$11, e, y1	# y1 = e >> 11				# S1B
264 	or	c, y3		# y3 = a|c                              # MAJA
265 	mov	f, y2		# y2 = f                                # CH
266 	xor	g, y2		# y2 = f^g                              # CH
267 
268 	rorx	$13, a, T1	# T1 = a >> 13				# S0B
269 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
270 	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
271 	and	e, y2		# y2 = (f^g)&e                          # CH
272 
273 	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
274 	vpxor	XTMP3, XTMP2, XTMP2
275 	add	h, d		# d = k + w + h + d                     # --
276 	and	b, y3		# y3 = (a|c)&b                          # MAJA
277 
278 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
279 	rorx	$22, a, y1	# y1 = a >> 22				# S0A
280 	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
281 	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
282 
283 	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
284 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
285 	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
286 	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}
287 
288 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
289 	mov	a, T1		# T1 = a                                # MAJB
290 	and	c, T1		# T1 = a&c                              # MAJB
291 	add	y0, y2		# y2 = S1 + CH                          # --
292 	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}
293 
294 	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
295 	add	y1, h		# h = k + w + h + S0                    # --
296 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
297 	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
298 
299 	add	y3, h		# h = t1 + S0 + MAJ                     # --
300 
301 
302 	ROTATE_ARGS
303 
304 ################################### RND N + 3 ############################
305 
306 	mov	a, y3		# y3 = a                                # MAJA
307 	rorx	$25, e, y0	# y0 = e >> 25				# S1A
308 	rorx	$11, e, y1	# y1 = e >> 11				# S1B
309 	offset = \disp + 3*4
310 	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
311 	or	c, y3		# y3 = a|c                              # MAJA
312 
313 
314 	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
315 	mov	f, y2		# y2 = f                                # CH
316 	rorx	$13, a, T1	# T1 = a >> 13				# S0B
317 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
318 	xor	g, y2		# y2 = f^g                              # CH
319 
320 
321 	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
322 	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
323 	and	e, y2		# y2 = (f^g)&e                          # CH
324 	add	h, d		# d = k + w + h + d                     # --
325 	and	b, y3		# y3 = (a|c)&b                          # MAJA
326 
327 	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
328 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
329 	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
330 
331 	vpxor	XTMP3, XTMP2, XTMP2
332 	rorx	$22, a, y1	# y1 = a >> 22				# S0A
333 	add	y0, y2		# y2 = S1 + CH                          # --
334 
335 	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
336 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
337 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
338 
339 	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
340 	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}
341 
342 	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
343 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
344 	mov	a, T1		# T1 = a                                # MAJB
345 	and	c, T1		# T1 = a&c                              # MAJB
346 	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
347 
348 	add	y1, h		# h = k + w + h + S0                    # --
349 	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
350 	add	y3, h		# h = t1 + S0 + MAJ                     # --
351 
352 	ROTATE_ARGS
353 	rotate_Xs
354 .endm
355 
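# DO_4ROUNDS performs four SHA-256 rounds using message words that were
# already scheduled into the _XFER area, with no vector work interleaved.
# It handles the last 16 rounds of the first block (.Lloop2) and all 64
# rounds of the second block (.Lloop3).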
356 .macro DO_4ROUNDS disp
357 ################################### RND N + 0 ###########################
358 
359 	mov	f, y2		# y2 = f                                # CH
360 	rorx	$25, e, y0	# y0 = e >> 25				# S1A
361 	rorx	$11, e, y1	# y1 = e >> 11				# S1B
362 	xor	g, y2		# y2 = f^g                              # CH
363 
364 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
365 	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
366 	and	e, y2		# y2 = (f^g)&e                          # CH
367 
368 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
369 	rorx	$13, a, T1	# T1 = a >> 13				# S0B
370 	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
371 	rorx	$22, a, y1	# y1 = a >> 22				# S0A
372 	mov	a, y3		# y3 = a                                # MAJA
373 
374 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
375 	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
376 	addl	\disp(%rsp, SRND), h		# h = k + w + h # --
377 	or	c, y3		# y3 = a|c                              # MAJA
378 
379 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
380 	mov	a, T1		# T1 = a                                # MAJB
381 	and	b, y3		# y3 = (a|c)&b                          # MAJA
382 	and	c, T1		# T1 = a&c                              # MAJB
383 	add	y0, y2		# y2 = S1 + CH                          # --
384 
385 
386 	add	h, d		# d = k + w + h + d                     # --
387 	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
388 	add	y1, h		# h = k + w + h + S0                    # --
389 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
390 
391 	ROTATE_ARGS
392 
393 ################################### RND N + 1 ###########################
394 
395 	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
396 	mov	f, y2		# y2 = f                                # CH
397 	rorx	$25, e, y0	# y0 = e >> 25				# S1A
398 	rorx	$11, e, y1	# y1 = e >> 11				# S1B
399 	xor	g, y2		# y2 = f^g                              # CH
400 
401 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
402 	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
403 	and	e, y2		# y2 = (f^g)&e                          # CH
404 	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
405 
406 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
407 	rorx	$13, a, T1	# T1 = a >> 13				# S0B
408 	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
409 	rorx	$22, a, y1	# y1 = a >> 22				# S0A
410 	mov	a, y3		# y3 = a                                # MAJA
411 
412 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
413 	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
414 	offset = 4*1 + \disp
415 	addl	offset(%rsp, SRND), h		# h = k + w + h # --
416 	or	c, y3		# y3 = a|c                              # MAJA
417 
418 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
419 	mov	a, T1		# T1 = a                                # MAJB
420 	and	b, y3		# y3 = (a|c)&b                          # MAJA
421 	and	c, T1		# T1 = a&c                              # MAJB
422 	add	y0, y2		# y2 = S1 + CH                          # --
423 
424 
425 	add	h, d		# d = k + w + h + d                     # --
426 	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
427 	add	y1, h		# h = k + w + h + S0                    # --
428 
429 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
430 
431 	ROTATE_ARGS
432 
433 ################################### RND N + 2 ##############################
434 
435 	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
436 	mov	f, y2		# y2 = f                                # CH
437 	rorx	$25, e, y0	# y0 = e >> 25				# S1A
438 	rorx	$11, e, y1	# y1 = e >> 11				# S1B
439 	xor	g, y2		# y2 = f^g                              # CH
440 
441 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
442 	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
443 	and	e, y2		# y2 = (f^g)&e                          # CH
444 	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
445 
446 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
447 	rorx	$13, a, T1	# T1 = a >> 13				# S0B
448 	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
449 	rorx	$22, a, y1	# y1 = a >> 22				# S0A
450 	mov	a, y3		# y3 = a                                # MAJA
451 
452 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
453 	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
454 	offset = 4*2 + \disp
455 	addl	offset(%rsp, SRND), h		# h = k + w + h # --
456 	or	c, y3		# y3 = a|c                              # MAJA
457 
458 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
459 	mov	a, T1		# T1 = a                                # MAJB
460 	and	b, y3		# y3 = (a|c)&b                          # MAJA
461 	and	c, T1		# T1 = a&c                              # MAJB
462 	add	y0, y2		# y2 = S1 + CH                          # --
463 
464 
465 	add	h, d		# d = k + w + h + d                     # --
466 	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
467 	add	y1, h		# h = k + w + h + S0                    # --
468 
469 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
470 
471 	ROTATE_ARGS
472 
473 ################################### RND N + 3 ###########################
474 
475 	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
476 	mov	f, y2		# y2 = f                                # CH
477 	rorx	$25, e, y0	# y0 = e >> 25				# S1A
478 	rorx	$11, e, y1	# y1 = e >> 11				# S1B
479 	xor	g, y2		# y2 = f^g                              # CH
480 
481 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
482 	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
483 	and	e, y2		# y2 = (f^g)&e                          # CH
484 	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
485 
486 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
487 	rorx	$13, a, T1	# T1 = a >> 13				# S0B
488 	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
489 	rorx	$22, a, y1	# y1 = a >> 22				# S0A
490 	mov	a, y3		# y3 = a                                # MAJA
491 
492 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
493 	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
494 	offset = 4*3 + \disp
495 	addl	offset(%rsp, SRND), h		# h = k + w + h # --
496 	or	c, y3		# y3 = a|c                              # MAJA
497 
498 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
499 	mov	a, T1		# T1 = a                                # MAJB
500 	and	b, y3		# y3 = (a|c)&b                          # MAJA
501 	and	c, T1		# T1 = a&c                              # MAJB
502 	add	y0, y2		# y2 = S1 + CH                          # --
503 
504 
505 	add	h, d		# d = k + w + h + d                     # --
506 	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
507 	add	y1, h		# h = k + w + h + S0                    # --
508 
509 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
510 
511 
512 	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
513 
514 	add	y3, h		# h = t1 + S0 + MAJ                     # --
515 
516 	ROTATE_ARGS
517 
518 .endm
519 
520 ########################################################################
521 ## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
522 ## arg 1 : pointer to state
523 ## arg 2 : pointer to input data
524 ## arg 3 : Num blocks
525 ########################################################################
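# Each pass through .Lloop0 hashes two blocks with a shared message
# schedule; a single (or final odd) block is handled via .Lonly_one_block
# and .Ldo_last_block, which load just 64 bytes into the XMM halves of
# the X registers.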
526 .text
527 SYM_TYPED_FUNC_START(sha256_transform_rorx)
528 	pushq	%rbx
529 	pushq	%r12
530 	pushq	%r13
531 	pushq	%r14
532 	pushq	%r15
533 
534 	push	%rbp
535 	mov	%rsp, %rbp
536 
537 	subq	$STACK_SIZE, %rsp
538 	and	$-32, %rsp	# align rsp to 32 byte boundary
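	# keep the frame 32-byte aligned: the vmovdqa stores into _XFER
	# below require 32-byte alignment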
539 
540 	shl	$6, NUM_BLKS	# convert to bytes
541 	jz	.Ldone_hash
542 	lea	-64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
543 	mov	NUM_BLKS, _INP_END(%rsp)
544 
545 	cmp	NUM_BLKS, INP
546 	je	.Lonly_one_block
547 
548 	## load initial digest
549 	mov	(CTX), a
550 	mov	4*1(CTX), b
551 	mov	4*2(CTX), c
552 	mov	4*3(CTX), d
553 	mov	4*4(CTX), e
554 	mov	4*5(CTX), f
555 	mov	4*6(CTX), g
556 	mov	4*7(CTX), h
557 
558 	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
559 	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
560 	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00
561 
562 	mov	CTX, _CTX(%rsp)
563 
564 .Lloop0:
565 	## Load first 16 dwords from two blocks
566 	VMOVDQ	0*32(INP),XTMP0
567 	VMOVDQ	1*32(INP),XTMP1
568 	VMOVDQ	2*32(INP),XTMP2
569 	VMOVDQ	3*32(INP),XTMP3
570 
571 	## byte swap data
572 	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
573 	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
574 	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
575 	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3
576 
577 	## transpose data into high/low halves
578 	vperm2i128	$0x20, XTMP2, XTMP0, X0
579 	vperm2i128	$0x31, XTMP2, XTMP0, X1
580 	vperm2i128	$0x20, XTMP3, XTMP1, X2
581 	vperm2i128	$0x31, XTMP3, XTMP1, X3
582 
583 .Llast_block_enter:
584 	add	$64, INP
585 	mov	INP, _INP(%rsp)
586 
587 	## schedule 48 input dwords, by doing 3 iterations of 16 each
588 	xor	SRND, SRND
589 
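	## SRND advances by 4*32 bytes per iteration so the K256 loads and
	## the _XFER stores move forward in step with the round number;
	## .Lloop1 exits after three iterations (rounds 0-47).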
590 .align 16
591 .Lloop1:
592 	leaq	K256+0*32(%rip), INP		## reuse INP as scratch reg
593 	vpaddd	(INP, SRND), X0, XFER
594 	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
595 	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32
596 
597 	leaq	K256+1*32(%rip), INP
598 	vpaddd	(INP, SRND), X0, XFER
599 	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
600 	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32
601 
602 	leaq	K256+2*32(%rip), INP
603 	vpaddd	(INP, SRND), X0, XFER
604 	vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
605 	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32
606 
607 	leaq	K256+3*32(%rip), INP
608 	vpaddd	(INP, SRND), X0, XFER
609 	vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
610 	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32
611 
612 	add	$4*32, SRND
613 	cmp	$3*4*32, SRND
614 	jb	.Lloop1
615 
616 .Lloop2:
617 	## Do last 16 rounds with no scheduling
618 	leaq	K256+0*32(%rip), INP
619 	vpaddd	(INP, SRND), X0, XFER
620 	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
621 	DO_4ROUNDS	_XFER + 0*32
622 
623 	leaq	K256+1*32(%rip), INP
624 	vpaddd	(INP, SRND), X1, XFER
625 	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
626 	DO_4ROUNDS	_XFER + 1*32
627 	add	$2*32, SRND
628 
629 	vmovdqa	X2, X0
630 	vmovdqa	X3, X1
631 
632 	cmp	$4*4*32, SRND
633 	jb	.Lloop2
634 
635 	mov	_CTX(%rsp), CTX
636 	mov	_INP(%rsp), INP
637 
638 	addm    (4*0)(CTX),a
639 	addm    (4*1)(CTX),b
640 	addm    (4*2)(CTX),c
641 	addm    (4*3)(CTX),d
642 	addm    (4*4)(CTX),e
643 	addm    (4*5)(CTX),f
644 	addm    (4*6)(CTX),g
645 	addm    (4*7)(CTX),h
646 
647 	cmp	_INP_END(%rsp), INP
648 	ja	.Ldone_hash
649 
650 	#### Do second block using previously scheduled results
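	#### The +16 byte offset selects the high half of each 32-byte _XFER
	#### slot, i.e. the second block's W[i]+K[i] values that were
	#### scheduled alongside the first block.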
651 	xor	SRND, SRND
652 .align 16
653 .Lloop3:
654 	DO_4ROUNDS	 _XFER + 0*32 + 16
655 	DO_4ROUNDS	 _XFER + 1*32 + 16
656 	add	$2*32, SRND
657 	cmp	$4*4*32, SRND
658 	jb	.Lloop3
659 
660 	mov	_CTX(%rsp), CTX
661 	mov	_INP(%rsp), INP
662 	add	$64, INP
663 
664 	addm    (4*0)(CTX),a
665 	addm    (4*1)(CTX),b
666 	addm    (4*2)(CTX),c
667 	addm    (4*3)(CTX),d
668 	addm    (4*4)(CTX),e
669 	addm    (4*5)(CTX),f
670 	addm    (4*6)(CTX),g
671 	addm    (4*7)(CTX),h
672 
673 	cmp	_INP_END(%rsp), INP
674 	jb	.Lloop0
675 	ja	.Ldone_hash
676 
677 .Ldo_last_block:
678 	VMOVDQ	0*16(INP),XWORD0
679 	VMOVDQ	1*16(INP),XWORD1
680 	VMOVDQ	2*16(INP),XWORD2
681 	VMOVDQ	3*16(INP),XWORD3
682 
683 	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
684 	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
685 	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
686 	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3
687 
688 	jmp	.Llast_block_enter
689 
690 .Lonly_one_block:
691 
692 	## load initial digest
693 	mov	(4*0)(CTX),a
694 	mov	(4*1)(CTX),b
695 	mov	(4*2)(CTX),c
696 	mov	(4*3)(CTX),d
697 	mov	(4*4)(CTX),e
698 	mov	(4*5)(CTX),f
699 	mov	(4*6)(CTX),g
700 	mov	(4*7)(CTX),h
701 
702 	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
703 	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
704 	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00
705 
706 	mov	CTX, _CTX(%rsp)
707 	jmp	.Ldo_last_block
708 
709 .Ldone_hash:
710 
711 	mov	%rbp, %rsp
712 	pop	%rbp
713 
714 	popq	%r15
715 	popq	%r14
716 	popq	%r13
717 	popq	%r12
718 	popq	%rbx
719 	RET
720 SYM_FUNC_END(sha256_transform_rorx)
721 
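# SHA-256 round constants K[0..63].  Each group of four constants is stored
# twice so that both 128-bit lanes of a 32-byte row hold the same values,
# letting one 256-bit vpaddd add them to the message words of both blocks.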
722 .section	.rodata.cst512.K256, "aM", @progbits, 512
723 .align 64
724 K256:
725 	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
726 	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
727 	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
728 	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
729 	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
730 	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
731 	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
732 	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
733 	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
734 	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
735 	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
736 	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
737 	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
738 	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
739 	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
740 	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
741 	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
742 	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
743 	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
744 	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
745 	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
746 	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
747 	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
748 	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
749 	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
750 	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
751 	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
752 	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
753 	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
754 	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
755 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
756 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
757 
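# vpshufb mask that byte-swaps each 32-bit dword, converting the big-endian
# message words to the little-endian order the arithmetic expects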
758 .section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
759 .align 32
760 PSHUFFLE_BYTE_FLIP_MASK:
761 	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
762 
763 # shuffle xBxA -> 00BA
764 .section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
765 .align 32
766 _SHUF_00BA:
767 	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
768 
769 # shuffle xDxC -> DC00
770 .section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
771 .align 32
772 _SHUF_DC00:
773 	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
774