########################################################################
# Implement fast SHA-256 with AVX1 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 1 block at a time, with 4 lanes per block
########################################################################

#include <linux/linkage.h>
#include <linux/cfi_types.h>

## assume buffers not aligned
#define    VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add     \p1, \p2
	mov     \p2, \p1
.endm
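# For example, "addm (4*0)(CTX), a" expands to
#	add     (4*0)(CTX), %eax
#	mov     %eax, (4*0)(CTX)
# folding a working variable back into its in-memory digest word.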


.macro MY_ROR p1 p2
	shld    $(32-(\p1)), \p2, \p2
.endm
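# With identical source and destination, shld shifts \p2 left by (32 - \p1)
# while filling the vacated bits from \p2's own top bits, i.e. it performs a
# right-rotate by \p1. A C sketch of the same operation (assuming 0 < n < 32):
#
#	uint32_t ror32(uint32_t x, unsigned int n)
#	{
#		return (x >> n) | (x << (32 - n));
#	}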

################################

# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword
.macro COPY_XMM_AND_BSWAP p1 p2 p3
	VMOVDQ \p2, \p1
	vpshufb \p3, \p1, \p1
.endm
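# vpshufb with BYTE_FLIP_MASK reverses the four bytes within each dword,
# converting the big-endian message words to host order; per dword this
# matches the C expression w = __builtin_bswap32(w) (GCC/Clang builtin).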

################################

X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER = %xmm9
XTMP5 = %xmm11

SHUF_00BA = %xmm10      # shuffle xBxA -> 00BA
SHUF_DC00 = %xmm12      # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm13

NUM_BLKS = %rdx   # 3rd arg
INP = %rsi        # 2nd arg
CTX = %rdi        # 1st arg

SRND = %rsi       # clobbers INP
c = %ecx
d = %r8d
e = %edx
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

y0 = %r13d
y1 = %r14d
y2 = %r15d


_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0
_INP            = _INP_END  + _INP_END_SIZE
_XFER           = _INP      + _INP_SIZE
_XMM_SAVE       = _XFER     + _XFER_SIZE
STACK_SIZE      = _XMM_SAVE + _XMM_SAVE_SIZE
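# Resulting frame layout, relative to the 16-byte-aligned %rsp:
#
#	[rsp +  0]	_INP_END  (8 bytes)	pointer to end of input data
#	[rsp +  8]	_INP      (8 bytes)	saved pointer to current block
#	[rsp + 16]	_XFER     (16 bytes)	K[t] + W[t] for the current 4 rounds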

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
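# Renaming the symbols instead of moving data keeps the rounds free of
# register-to-register copies; after eight applications of ROTATE_ARGS the
# a..h mapping returns to its original assignment.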

.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time
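	## Reference message-schedule step, sketched in C (ror32 as above,
	## FIPS 180-4 notation); each invocation of this macro computes four
	## consecutive W[t], interleaved with four scalar rounds:
	##
	##	s0   = ror32(W[t-15], 7) ^ ror32(W[t-15], 18) ^ (W[t-15] >> 3);
	##	s1   = ror32(W[t-2], 17) ^ ror32(W[t-2], 19)  ^ (W[t-2] >> 10);
	##	W[t] = W[t-16] + s0 + W[t-7] + s1;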

	mov     e, y0			# y0 = e
	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
	mov     a, y1                   # y1 = a
	vpalignr $4, X2, X3, XTMP0      # XTMP0 = W[-7]
	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	mov     f, y2                   # y2 = f
	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	xor     g, y2                   # y2 = f^g
	vpaddd  X0, XTMP0, XTMP0        # XTMP0 = W[-7] + W[-16]
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
	## compute s0
	vpalignr $4, X0, X1, XTMP1      # XTMP1 = W[-15]
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     y0, y2                  # y2 = S1 + CH
	add     _XFER(%rsp), y2         # y2 = k + w + S1 + CH
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	vpsrld  $7, XTMP1, XTMP2        # XTMP2 = W[-15] >> 7
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	vpslld  $(32-7), XTMP1, XTMP3   # XTMP3 = W[-15] << (32-7)
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	vpor    XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] ror 7
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov     e, y0                   # y0 = e
	mov     a, y1                   # y1 = a
	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	mov     f, y2                   # y2 = f
	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
	vpsrld  $18, XTMP1, XTMP2       # XTMP2 = W[-15] >> 18
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor     g, y2                   # y2 = f^g
	vpsrld  $3, XTMP1, XTMP4        # XTMP4 = W[-15] >> 3
	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpslld  $(32-18), XTMP1, XTMP1  # XTMP1 = W[-15] << (32-18)
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	vpxor   XTMP1, XTMP3, XTMP3     # XTMP3 = W[-15] ror 7 ^ W[-15] << (32-18)
	add     y0, y2                  # y2 = S1 + CH
	add     (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	vpxor   XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	vpxor   XTMP4, XTMP3, XTMP1     # XTMP1 = s0
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	## compute low s1
	vpshufd $0b11111010, X3, XTMP2  # XTMP2 = W[-2] {BBAA}
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	vpaddd  XTMP1, XTMP0, XTMP0     # XTMP0 = W[-16] + W[-7] + s0
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov     e, y0                   # y0 = e
	mov     a, y1                   # y1 = a
	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
	mov     f, y2                   # y2 = f
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld  $10, XTMP2, XTMP4       # XTMP4 = W[-2] >> 10 {BBAA}
	xor     g, y2                   # y2 = f^g
	vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] ror 19 {xBxA}
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] ror 17 {xBxA}
	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpxor   XTMP3, XTMP2, XTMP2     # XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xBxA}
	add     y0, y2                  # y2 = S1 + CH
	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	vpxor   XTMP2, XTMP4, XTMP4     # XTMP4 = s1 {xBxA}
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	vpaddd  XTMP4, XTMP0, XTMP0     # XTMP0 = {..., ..., W[1], W[0]}
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	## compute high s1
	vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov     e, y0                   # y0 = e
	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
	mov     a, y1                   # y1 = a
	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	mov     f, y2                   # y2 = f
	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld  $10, XTMP2, XTMP5       # XTMP5 = W[-2] >> 10 {DDCC}
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	xor     g, y2                   # y2 = f^g
	vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] ror 19 {xDxC}
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
	vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] ror 17 {xDxC}
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	vpxor   XTMP3, XTMP2, XTMP2     # XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xDxC}
	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     y0, y2                  # y2 = S1 + CH
	add     (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	vpxor   XTMP2, XTMP5, XTMP5     # XTMP5 = s1 {xDxC}
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	vpaddd  XTMP0, XTMP5, X0        # X0 = {W[3], W[2], W[1], W[0]}
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	rotate_Xs
.endm

## input is [rsp + _XFER + \round * 4]
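## One scalar SHA-256 round, sketched in C (ror32 as above; the K[t] + W[t]
## sum is read pre-added from the _XFER slot):
##
##	S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
##	ch  = (e & f) ^ (~e & g);		/* below: ((f^g) & e) ^ g */
##	t1  = h + S1 + ch + K[t] + W[t];
##	S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
##	maj = (a & b) ^ (a & c) ^ (b & c);	/* below: ((a|c) & b) | (a & c) */
##	d  += t1;
##	h   = t1 + S0 + maj;			/* then ROTATE_ARGS renames a..h */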
.macro DO_ROUND round
	mov	e, y0			# y0 = e
	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
	mov     a, y1                   # y1 = a
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
	mov     f, y2                   # y2 = f
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor     g, y2                   # y2 = f^g
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
	and     e, y2                   # y2 = (f^g)&e
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	add     y0, y2                  # y2 = S1 + CH
	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	offset = \round * 4 + _XFER
	add     offset(%rsp), y2	# y2 = k + w + S1 + CH
	mov     a, y0			# y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm

########################################################################
## void sha256_transform_avx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
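## A sketch of the matching C-side declaration (an assumption based on the
## kernel's SHA-256 glue code; the digest words are the first eight u32s of
## the state):
##
##	asmlinkage void sha256_transform_avx(struct sha256_state *state,
##					     const u8 *data, int blocks);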
.text
SYM_TYPED_FUNC_START(sha256_transform_avx)
	pushq   %rbx
	pushq   %r12
	pushq   %r13
	pushq   %r14
	pushq   %r15
	pushq	%rbp
	movq	%rsp, %rbp

	subq    $STACK_SIZE, %rsp	# allocate stack space
	and	$~15, %rsp		# align stack pointer

	shl     $6, NUM_BLKS		# convert to bytes
	jz      .Ldone_hash
	add     INP, NUM_BLKS		# pointer to end of data
	mov     NUM_BLKS, _INP_END(%rsp)

	## load initial digest
	mov     4*0(CTX), a
	mov     4*1(CTX), b
	mov     4*2(CTX), c
	mov     4*3(CTX), d
	mov     4*4(CTX), e
	mov     4*5(CTX), f
	mov     4*6(CTX), g
	mov     4*7(CTX), h

	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00
.Lloop0:
	lea     K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP      X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X3, 3*16(INP), BYTE_FLIP_MASK

	mov     INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 rounds of 16 each
	mov     $3, SRND
.align 16
.Lloop1:
	vpaddd  (TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd  1*16(TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd  2*16(TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd  3*16(TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	add	$4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub     $1, SRND
	jne     .Lloop1

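	## final 16 rounds (48..63): W[48..63] already sit in X0..X3, so only
	## the K[t] additions and the scalar rounds remain, 8 per iteration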
	mov     $2, SRND
.Lloop2:
	vpaddd  (TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	DO_ROUND        0
	DO_ROUND        1
	DO_ROUND        2
	DO_ROUND        3

	vpaddd  1*16(TBL), X1, XFER
	vmovdqa XFER, _XFER(%rsp)
	add     $2*16, TBL
	DO_ROUND        0
	DO_ROUND        1
	DO_ROUND        2
	DO_ROUND        3

	vmovdqa X2, X0
	vmovdqa X3, X1

	sub     $1, SRND
	jne     .Lloop2

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	mov     _INP(%rsp), INP
	add     $64, INP
	cmp     _INP_END(%rsp), INP
	jne     .Lloop0

.Ldone_hash:

	mov	%rbp, %rsp
	popq	%rbp
	popq    %r15
	popq    %r14
	popq    %r13
	popq	%r12
	popq    %rbx
	RET
SYM_FUNC_END(sha256_transform_avx)

.section	.rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203
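# Stored little-endian, dword 0's selector bytes are 03,02,01,00 (and
# likewise for the other dwords): vpshufb byte-swaps each dword of the lane.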

.section	.rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
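# Selector bytes with the top bit set (0xFF) make vpshufb write zero, so the
# high qword is cleared while the valid xBxA dwords pack into the low qword.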

.section	.rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF