########################################################################
# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################

#include <linux/linkage.h>

## assume buffers not aligned
#define    MOVDQ movdqu
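# movdqu tolerates unaligned addresses; if the caller guaranteed 16-byte
# aligned input, MOVDQ could be redefined to movdqa, which faults on
# unaligned memory operands.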

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store mem with reg
.macro addm p1 p2
        add     \p1, \p2
        mov     \p2, \p1
.endm
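# For example, "addm (4*0)(CTX), a" leaves both a and the first state word
# holding their sum; this is how the digest feed-forward at the end of the
# function is written.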

################################

# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword
.macro COPY_XMM_AND_BSWAP p1 p2 p3
        MOVDQ \p2, \p1
        pshufb \p3, \p1
.endm
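# The pshufb with BYTE_FLIP_MASK reverses the bytes within each dword, turning
# the big-endian message words of the SHA-256 block into little-endian dwords
# for the arithmetic below.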

################################

X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER = %xmm9

SHUF_00BA = %xmm10      # shuffle xBxA -> 00BA
SHUF_DC00 = %xmm11      # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm12

NUM_BLKS = %rdx   # 3rd arg
INP = %rsi        # 2nd arg
CTX = %rdi        # 1st arg

SRND = %rsi       # clobbers INP
c = %ecx
d = %r8d
e = %edx
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

y0 = %r13d
y1 = %r14d
y2 = %r15d



_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0
_INP            = _INP_END  + _INP_END_SIZE
_XFER           = _INP      + _INP_SIZE
_XMM_SAVE       = _XFER     + _XFER_SIZE
STACK_SIZE      = _XMM_SAVE + _XMM_SAVE_SIZE
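
# Resulting frame layout (offsets from the 16-byte-aligned %rsp set up below):
#   _INP_END (rsp +  0, 8 bytes)  pointer just past the last input block
#   _INP     (rsp +  8, 8 bytes)  saved input pointer for the current block
#   _XFER    (rsp + 16, 16 bytes) W[i..i+3] + K[i..i+3] for the next 4 rounds
# STACK_SIZE = 32; no XMM save area is reserved (_XMM_SAVE_SIZE = 0).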

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm
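# After each rotate_Xs, X3 names the register that just received the four
# newly scheduled message dwords and X0 again names the oldest four, so
# FOUR_ROUNDS_AND_SCHED can use the same register names every time.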

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
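# ROTATE_ARGS renames the working variables instead of moving registers: the
# round body accumulates T1 + T2 into h and adds T1 into d, so after the
# rename the old h is the new a and the old d is the new e, exactly as the
# SHA-256 round function requires.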

.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time
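	## For reference (FIPS 180-4), the message schedule being vectorized is:
	##   W[i]  = s1(W[i-2]) + W[i-7] + s0(W[i-15]) + W[i-16]
	##   s0(x) = (x ror 7)  ^ (x ror 18) ^ (x >> 3)
	##   s1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
	## s1 is computed only two dwords at a time because the psrlq-based
	## rotate trick yields a valid rotate only in the low dword of each qword.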
	movdqa  X3, XTMP0
	mov     e, y0			# y0 = e
	ror     $(25-11), y0            # y0 = e >> (25-11)
	mov     a, y1                   # y1 = a
	palignr $4, X2, XTMP0           # XTMP0 = W[-7]
	ror     $(22-13), y1            # y1 = a >> (22-13)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	mov     f, y2                   # y2 = f
	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
	movdqa  X1, XTMP1
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	xor     g, y2                   # y2 = f^g
	paddd   X0, XTMP0               # XTMP0 = W[-7] + W[-16]
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
	## compute s0
	palignr $4, X0, XTMP1           # XTMP1 = W[-15]
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	movdqa  XTMP1, XTMP2            # XTMP2 = W[-15]
	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     y0, y2                  # y2 = S1 + CH
	add     _XFER(%rsp) , y2        # y2 = k + w + S1 + CH
	movdqa  XTMP1, XTMP3            # XTMP3 = W[-15]
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	pslld   $(32-7), XTMP1          #
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	psrld   $7, XTMP2               #
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	por     XTMP2, XTMP1            # XTMP1 = W[-15] ror 7
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
					#
	ROTATE_ARGS                     #
	movdqa  XTMP3, XTMP2            # XTMP2 = W[-15]
	mov     e, y0                   # y0 = e
	mov     a, y1                   # y1 = a
	movdqa  XTMP3, XTMP4            # XTMP4 = W[-15]
	ror     $(25-11), y0            # y0 = e >> (25-11)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	mov     f, y2                   # y2 = f
	ror     $(22-13), y1            # y1 = a >> (22-13)
	pslld   $(32-18), XTMP3         #
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor     g, y2                   # y2 = f^g
	psrld   $18, XTMP2              #
	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	pxor    XTMP3, XTMP1
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	psrld   $3, XTMP4               # XTMP4 = W[-15] >> 3
	add     y0, y2                  # y2 = S1 + CH
	add     (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	pxor    XTMP2, XTMP1            # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	pxor    XTMP4, XTMP1            # XTMP1 = s0
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	## compute low s1
	pshufd  $0b11111010, X3, XTMP2   # XTMP2 = W[-2] {BBAA}
	and     b, y0			# y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	paddd   XTMP1, XTMP0            # XTMP0 = W[-16] + W[-7] + s0
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa  XTMP2, XTMP3            # XTMP3 = W[-2] {BBAA}
	mov     e, y0                   # y0 = e
	mov     a, y1                   # y1 = a
	ror     $(25-11), y0            # y0 = e >> (25-11)
	movdqa  XTMP2, XTMP4            # XTMP4 = W[-2] {BBAA}
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	ror     $(22-13), y1            # y1 = a >> (22-13)
	mov     f, y2                   # y2 = f
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
	psrlq   $17, XTMP2              # XTMP2 = W[-2] ror 17 {xBxA}
	xor     g, y2                   # y2 = f^g
	psrlq   $19, XTMP3              # XTMP3 = W[-2] ror 19 {xBxA}
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	psrld   $10, XTMP4              # XTMP4 = W[-2] >> 10 {BBAA}
	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	pxor    XTMP3, XTMP2
	add     y0, y2                  # y2 = S1 + CH
	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	pxor    XTMP2, XTMP4            # XTMP4 = s1 {xBxA}
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	pshufb  SHUF_00BA, XTMP4        # XTMP4 = s1 {00BA}
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	paddd   XTMP4, XTMP0            # XTMP0 = {..., ..., W[1], W[0]}
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	## compute high s1
	pshufd  $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
					#
	ROTATE_ARGS                     #
	movdqa  XTMP2, XTMP3            # XTMP3 = W[-2] {DDCC}
	mov     e, y0                   # y0 = e
	ror     $(25-11), y0            # y0 = e >> (25-11)
	mov     a, y1                   # y1 = a
	movdqa  XTMP2, X0               # X0    = W[-2] {DDCC}
	ror     $(22-13), y1            # y1 = a >> (22-13)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	mov     f, y2                   # y2 = f
	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
	psrlq   $17, XTMP2              # XTMP2 = W[-2] ror 17 {xDxC}
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	xor     g, y2                   # y2 = f^g
	psrlq   $19, XTMP3              # XTMP3 = W[-2] ror 19 {xDxC}
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
	psrld   $10, X0                 # X0 = W[-2] >> 10 {DDCC}
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	pxor    XTMP3, XTMP2            #
	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     y0, y2                  # y2 = S1 + CH
	add     (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	pxor    XTMP2, X0               # X0 = s1 {xDxC}
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	pshufb  SHUF_DC00, X0           # X0 = s1 {DC00}
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	paddd   XTMP0, X0               # X0 = {W[3], W[2], W[1], W[0]}
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	rotate_Xs
.endm

## input is [rsp + _XFER + \round * 4]
.macro DO_ROUND round
	mov     e, y0                 # y0 = e
	ror     $(25-11), y0          # y0 = e >> (25-11)
	mov     a, y1                 # y1 = a
	xor     e, y0                 # y0 = e ^ (e >> (25-11))
	ror     $(22-13), y1          # y1 = a >> (22-13)
	mov     f, y2                 # y2 = f
	xor     a, y1                 # y1 = a ^ (a >> (22-13))
	ror     $(11-6), y0           # y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor     g, y2                 # y2 = f^g
	xor     e, y0                 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	ror     $(13-2), y1           # y1 = (a >> (13-2)) ^ (a >> (22-2))
	and     e, y2                 # y2 = (f^g)&e
	xor     a, y1                 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror     $6, y0                # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor     g, y2                 # y2 = CH = ((f^g)&e)^g
	add     y0, y2                # y2 = S1 + CH
	ror     $2, y1                # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	offset = \round * 4 + _XFER
	add     offset(%rsp), y2      # y2 = k + w + S1 + CH
	mov     a, y0                 # y0 = a
	add     y2, h                 # h = h + S1 + CH + k + w
	mov     a, y2                 # y2 = a
	or      c, y0                 # y0 = a|c
	add     h, d                  # d = d + h + S1 + CH + k + w
	and     c, y2                 # y2 = a&c
	and     b, y0                 # y0 = (a|c)&b
	add     y1, h                 # h = h + S1 + CH + k + w + S0
	or      y2, y0		      # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h		      # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm
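
# For reference (FIPS 180-4), each round in DO_ROUND and FOUR_ROUNDS_AND_SCHED
# above is the standard SHA-256 round with T2 folded into the final addition:
#	S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
#	CH  = (e & f) ^ (~e & g)		# computed here as ((f^g) & e) ^ g
#	T1  = h + S1 + CH + K[t] + W[t]		# K[t]+W[t] comes from the _XFER slot
#	S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
#	MAJ = (a & b) ^ (a & c) ^ (b & c)	# computed here as ((a|c) & b) | (a & c)
#	d  += T1				# becomes the new e after ROTATE_ARGS
#	h  += T1 + S0 + MAJ			# becomes the new a after ROTATE_ARGS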

########################################################################
## void sha256_transform_ssse3(struct sha256_state *state, const u8 *data,
##			       int blocks);
## arg 1 : pointer to state
##	   (struct sha256_state is assumed to begin with u32 state[8])
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
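# The routine processes arg 3 consecutive 64-byte blocks from arg 2 and
# updates the eight 32-bit state words of arg 1 in place; the data pointer
# does not need to be 16-byte aligned (block loads use MOVDQ, i.e. movdqu).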
.text
SYM_FUNC_START(sha256_transform_ssse3)
.align 32
	pushq   %rbx
	pushq   %r12
	pushq   %r13
	pushq   %r14
	pushq   %r15
	pushq   %rbp
	mov	%rsp, %rbp

	subq    $STACK_SIZE, %rsp
	and	$~15, %rsp

	shl     $6, NUM_BLKS		 # convert to bytes
	jz      done_hash
	add     INP, NUM_BLKS
	mov     NUM_BLKS, _INP_END(%rsp) # pointer to end of data

	## load initial digest
	mov     4*0(CTX), a
	mov     4*1(CTX), b
	mov     4*2(CTX), c
	mov     4*3(CTX), d
	mov     4*4(CTX), e
	mov     4*5(CTX), f
	mov     4*6(CTX), g
	mov     4*7(CTX), h

	movdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	movdqa  _SHUF_00BA(%rip), SHUF_00BA
	movdqa  _SHUF_DC00(%rip), SHUF_DC00

loop0:
	lea     K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP      X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X3, 3*16(INP), BYTE_FLIP_MASK

	mov     INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 rounds of 16 each
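	## (each loop1 iteration runs 16 rounds via 4 x FOUR_ROUNDS_AND_SCHED and
	## schedules W[16..63]; the last 16 rounds run in loop2 below without
	## scheduling, covering all 64 rounds)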
	mov     $3, SRND
.align 16
loop1:
	movdqa  (TBL), XFER
	paddd   X0, XFER
	movdqa  XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa  1*16(TBL), XFER
	paddd   X0, XFER
	movdqa  XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa  2*16(TBL), XFER
	paddd   X0, XFER
	movdqa  XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa  3*16(TBL), XFER
	paddd   X0, XFER
	movdqa  XFER, _XFER(%rsp)
	add     $4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub     $1, SRND
	jne     loop1

	mov     $2, SRND
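	## final 16 rounds: X0..X3 already hold W[48..63], so each loop2 iteration
	## just adds the round constants and runs 8 rounds (2 iterations total)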
loop2:
	paddd   (TBL), X0
	movdqa  X0, _XFER(%rsp)
	DO_ROUND        0
	DO_ROUND        1
	DO_ROUND        2
	DO_ROUND        3
	paddd   1*16(TBL), X1
	movdqa  X1, _XFER(%rsp)
	add     $2*16, TBL
	DO_ROUND        0
	DO_ROUND        1
	DO_ROUND        2
	DO_ROUND        3

	movdqa  X2, X0
	movdqa  X3, X1

	sub     $1, SRND
	jne     loop2

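	## feed-forward: add the working variables back into the state words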
	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	mov     _INP(%rsp), INP
	add     $64, INP
	cmp     _INP_END(%rsp), INP
	jne     loop0

done_hash:

	mov	%rbp, %rsp
	popq	%rbp
	popq    %r15
	popq    %r14
	popq    %r13
	popq    %r12
	popq    %rbx

	RET
SYM_FUNC_END(sha256_transform_ssse3)

.section	.rodata.cst256.K256, "aM", @progbits, 256
.align 64
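# K256: the 64 SHA-256 round constants (the first 32 bits of the fractional
# parts of the cube roots of the first 64 primes, per FIPS 180-4)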
K256:
        .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
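# byte-swap mask: reverses the bytes within each dword (big-endian message
# words -> little-endian dwords), used by COPY_XMM_AND_BSWAP above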
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203

.section	.rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

.section	.rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF