########################################################################
# Implement fast SHA-256 with AVX1 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 1 block at a time, with 4 lanes per block
########################################################################
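########################################################################
# Reference for the scalar round math implemented below (FIPS 180-4),
# kept here as documentation only:
#
#   Ch(e,f,g)  = (e & f) ^ (~e & g)            computed as ((f ^ g) & e) ^ g
#   Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c)   computed as ((a | c) & b) | (a & c)
#   S1(e)      = (e ror 6) ^ (e ror 11) ^ (e ror 25)
#   S0(a)      = (a ror 2) ^ (a ror 13) ^ (a ror 22)
#
#   T1 = h + S1(e) + Ch(e,f,g) + K[t] + W[t]
#   T2 = S0(a) + Maj(a,b,c)
#   h = g; g = f; f = e; e = d + T1; d = c; c = b; b = a; a = T1 + T2
########################################################################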

#include <linux/linkage.h>

## assume buffers not aligned
#define    VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add     \p1, \p2
	mov     \p2, \p1
.endm
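
# For example, "addm (4*0)(CTX), a" (used when updating the digest below)
# adds the dword at CTX into a and stores the sum back to the same location.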


# MY_ROR amt, reg
# Rotate reg right by amt bits.  shld with the same register as source and
# destination rotates left, so shifting by (32 - amt) rotates right by amt.
.macro MY_ROR p1 p2
	shld    $(32-(\p1)), \p2, \p2
.endm

################################

# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword
.macro COPY_XMM_AND_BSWAP p1 p2 p3
	VMOVDQ \p2, \p1
	vpshufb \p3, \p1, \p1
.endm
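
# For example, "COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK" loads the
# first 16 message bytes and byte-swaps each big-endian dword into X0.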

################################

X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER = %xmm9
XTMP5 = %xmm11

SHUF_00BA = %xmm10      # shuffle xBxA -> 00BA
SHUF_DC00 = %xmm12      # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm13

NUM_BLKS = %rdx   # 3rd arg
INP = %rsi        # 2nd arg
CTX = %rdi        # 1st arg

SRND = %rsi       # clobbers INP
c = %ecx
d = %r8d
e = %edx
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

y0 = %r13d
y1 = %r14d
y2 = %r15d


_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0
_INP            = _INP_END  + _INP_END_SIZE
_XFER           = _INP      + _INP_SIZE
_XMM_SAVE       = _XFER     + _XFER_SIZE
STACK_SIZE      = _XMM_SAVE + _XMM_SAVE_SIZE
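
# Resulting frame layout relative to %rsp (STACK_SIZE = 32 bytes):
#   _INP_END (bytes  0..7 )  pointer to the end of the input data
#   _INP     (bytes  8..15)  saved input pointer for the block being processed
#   _XFER    (bytes 16..31)  K[t] + W[t] values for the four rounds in flight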

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
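
# Note: ROTATE_ARGS moves no data.  Only the symbolic names rotate, so the
# register written as "h" in one round is referred to as "a" in the next.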

.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time
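	##
	## Message schedule reference (FIPS 180-4) for the vector code below:
	##   s0(x) = (x ror 7)  ^ (x ror 18) ^ (x >> 3)
	##   s1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
	##   W[t]  = W[t-16] + s0(W[t-15]) + W[t-7] + s1(W[t-2])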

	mov     e, y0			# y0 = e
	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
	mov     a, y1                   # y1 = a
	vpalignr $4, X2, X3, XTMP0      # XTMP0 = W[-7]
	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	mov     f, y2                   # y2 = f
	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	xor     g, y2                   # y2 = f^g
	vpaddd  X0, XTMP0, XTMP0        # XTMP0 = W[-7] + W[-16]
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
	## compute s0
	vpalignr $4, X0, X1, XTMP1      # XTMP1 = W[-15]
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     y0, y2                  # y2 = S1 + CH
	add     _XFER(%rsp), y2         # y2 = k + w + S1 + CH
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	vpsrld  $7, XTMP1, XTMP2        # XTMP2 = W[-15] >> 7
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	vpslld  $(32-7), XTMP1, XTMP3   # XTMP3 = W[-15] << (32-7)
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	vpor    XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] MY_ROR 7
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov     e, y0                   # y0 = e
	mov     a, y1                   # y1 = a
	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	mov     f, y2                   # y2 = f
	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
	vpsrld  $18, XTMP1, XTMP2       # XTMP2 = W[-15] >> 18
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor     g, y2                   # y2 = f^g
	vpsrld  $3, XTMP1, XTMP4        # XTMP4 = W[-15] >> 3
	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpslld  $(32-18), XTMP1, XTMP1  # XTMP1 = W[-15] << (32-18)
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	vpxor   XTMP1, XTMP3, XTMP3     # XTMP3 = W[-15] MY_ROR 7 ^ W[-15] << (32-18)
	add     y0, y2                  # y2 = S1 + CH
	add     (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	vpxor   XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	vpxor   XTMP4, XTMP3, XTMP1     # XTMP1 = s0
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	## compute low s1
	vpshufd $0b11111010, X3, XTMP2  # XTMP2 = W[-2] {BBAA}
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	vpaddd  XTMP1, XTMP0, XTMP0     # XTMP0 = W[-16] + W[-7] + s0
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov     e, y0                   # y0 = e
	mov     a, y1                   # y1 = a
	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
	mov     f, y2                   # y2 = f
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld  $10, XTMP2, XTMP4       # XTMP4 = W[-2] >> 10 {BBAA}
	xor     g, y2                   # y2 = f^g
	vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] MY_ROR 19 {xBxA}
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] MY_ROR 17 {xBxA}
	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpxor   XTMP3, XTMP2, XTMP2     # XTMP2 = W[-2] MY_ROR 17 ^ W[-2] MY_ROR 19 {xBxA}
	add     y0, y2                  # y2 = S1 + CH
	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	vpxor   XTMP2, XTMP4, XTMP4     # XTMP4 = s1 {xBxA}
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	vpaddd  XTMP4, XTMP0, XTMP0     # XTMP0 = {..., ..., W[1], W[0]}
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	## compute high s1
	vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov     e, y0                   # y0 = e
	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
	mov     a, y1                   # y1 = a
	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	mov     f, y2                   # y2 = f
	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld  $10, XTMP2, XTMP5       # XTMP5 = W[-2] >> 10 {DDCC}
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	xor     g, y2                   # y2 = f^g
	vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] MY_ROR 19 {xDxC}
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
	vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] MY_ROR 17 {xDxC}
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	vpxor   XTMP3, XTMP2, XTMP2     # XTMP2 = W[-2] MY_ROR 17 ^ W[-2] MY_ROR 19 {xDxC}
	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     y0, y2                  # y2 = S1 + CH
	add     (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	vpxor   XTMP2, XTMP5, XTMP5     # XTMP5 = s1 {xDxC}
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	vpaddd  XTMP0, XTMP5, X0        # X0 = {W[3], W[2], W[1], W[0]}
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	rotate_Xs
.endm

## input is [rsp + _XFER + round * 4]
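## One scalar round, with K[t] + W[t] precomputed in the XFER slot:
##   h += S1(e) + Ch(e,f,g) + K[t] + W[t];  d += h;  h += S0(a) + Maj(a,b,c)
## followed by ROTATE_ARGS to rename the working variables.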
.macro DO_ROUND round
	mov	e, y0			# y0 = e
        MY_ROR  (25-11), y0             # y0 = e >> (25-11)
        mov     a, y1                   # y1 = a
        xor     e, y0                   # y0 = e ^ (e >> (25-11))
        MY_ROR  (22-13), y1             # y1 = a >> (22-13)
        mov     f, y2                   # y2 = f
        xor     a, y1                   # y1 = a ^ (a >> (22-13))
        MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor     g, y2                   # y2 = f^g
        xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
        and     e, y2                   # y2 = (f^g)&e
        xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g
        add     y0, y2                  # y2 = S1 + CH
        MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        offset = \round * 4 + _XFER     # offset of K[t] + W[t] for this round
        add     offset(%rsp), y2	# y2 = k + w + S1 + CH
        mov     a, y0			# y0 = a
        add     y2, h                   # h = h + S1 + CH + k + w
        mov     a, y2                   # y2 = a
        or      c, y0                   # y0 = a|c
        add     h, d                    # d = d + h + S1 + CH + k + w
        and     c, y2                   # y2 = a&c
        and     b, y0                   # y0 = (a|c)&b
        add     y1, h                   # h = h + S1 + CH + k + w + S0
        or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
        add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
        ROTATE_ARGS
.endm

########################################################################
## void sha256_transform_avx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : number of 64-byte blocks to process
########################################################################
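##
## Rough C-side usage sketch, for orientation only (not the actual kernel
## glue code; names other than sha256_transform_avx are illustrative):
##
##	kernel_fpu_begin();	/* vector registers need FPU context */
##	sha256_transform_avx(state, data, nblocks);
##	kernel_fpu_end();
##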
.text
SYM_FUNC_START(sha256_transform_avx)
.align 32
	pushq   %rbx
	pushq   %r12
	pushq   %r13
	pushq   %r14
	pushq   %r15
	pushq	%rbp
	movq	%rsp, %rbp

	subq    $STACK_SIZE, %rsp	# allocate stack space
	and	$~15, %rsp		# align stack pointer

	shl     $6, NUM_BLKS		# convert to bytes
	jz      done_hash
	add     INP, NUM_BLKS		# pointer to end of data
	mov     NUM_BLKS, _INP_END(%rsp)

	## load initial digest
	mov     4*0(CTX), a
	mov     4*1(CTX), b
	mov     4*2(CTX), c
	mov     4*3(CTX), d
	mov     4*4(CTX), e
	mov     4*5(CTX), f
	mov     4*6(CTX), g
	mov     4*7(CTX), h

	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00
loop0:
	lea     K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP      X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X3, 3*16(INP), BYTE_FLIP_MASK

	mov     INP, _INP(%rsp)

	## schedule 48 message dwords, in 3 iterations of 16 rounds each
	mov     $3, SRND
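	## Each loop1 iteration runs 16 rounds while scheduling the next 16
	## message dwords and consumes 4*16 bytes of K256; after 3 iterations
	## (48 rounds) X0..X3 already hold W[48..63], so loop2 below only needs
	## the plain DO_ROUND path for the last 16 rounds.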
.align 16
loop1:
	vpaddd  (TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd  1*16(TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd  2*16(TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd  3*16(TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	add	$4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub     $1, SRND
	jne     loop1

	mov     $2, SRND		# final 16 rounds: 2 iterations of 8
loop2:
	vpaddd  (TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	DO_ROUND        0
	DO_ROUND        1
	DO_ROUND        2
	DO_ROUND        3

	vpaddd  1*16(TBL), X1, XFER
	vmovdqa XFER, _XFER(%rsp)
	add     $2*16, TBL
	DO_ROUND        0
	DO_ROUND        1
	DO_ROUND        2
	DO_ROUND        3

	vmovdqa X2, X0			# move the remaining scheduled
	vmovdqa X3, X1			# dwords down for the next pass

	sub     $1, SRND
	jne     loop2

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	mov     _INP(%rsp), INP
	add     $64, INP
	cmp     _INP_END(%rsp), INP
	jne     loop0

done_hash:

	mov	%rbp, %rsp
	popq	%rbp
	popq    %r15
	popq    %r14
	popq    %r13
	popq	%r12
	popq    %rbx
	RET
SYM_FUNC_END(sha256_transform_avx)

.section	.rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203

.section	.rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

.section	.rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF