########################################################################
# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################

#include <linux/linkage.h>
#include <linux/cfi_types.h>

## assume buffers not aligned
#define    MOVDQ movdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
        add     \p1, \p2
        mov     \p2, \p1
.endm
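
# For illustration only: "addm (4*0)(CTX), a" (as used for the final
# digest update below) computes state[0] += a, leaving the sum both in
# the a register and in the memory word.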

################################

# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword
.macro COPY_XMM_AND_BSWAP p1 p2 p3
        MOVDQ \p2, \p1
        pshufb \p3, \p1
.endm

################################

X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER = %xmm9

SHUF_00BA = %xmm10      # shuffle xBxA -> 00BA
SHUF_DC00 = %xmm11      # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm12

NUM_BLKS = %rdx   # 3rd arg
INP = %rsi        # 2nd arg
CTX = %rdi        # 1st arg

SRND = %rsi       # clobbers INP
c = %ecx
d = %r8d
e = %edx
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

y0 = %r13d
y1 = %r14d
y2 = %r15d



_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0
_INP            = _INP_END  + _INP_END_SIZE
_XFER           = _INP      + _INP_SIZE
_XMM_SAVE       = _XFER     + _XFER_SIZE
STACK_SIZE      = _XMM_SAVE + _XMM_SAVE_SIZE
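
# With the sizes above, the frame layout works out to (derived from the
# definitions above, shown here only for readability):
#
#   _INP_END   = 0    8-byte saved pointer to the end of the input data
#   _INP       = 8    8-byte saved pointer to the current input block
#   _XFER      = 16   16-byte scratch slot holding K[t] + W[t] for 4 rounds
#   STACK_SIZE = 32   no XMM save area is needed (_XMM_SAVE_SIZE = 0)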

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm

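########################################################################
# For reference, a scalar C sketch of the message-schedule step that the
# FOUR_ROUNDS_AND_SCHED macro below computes four words at a time.  This
# is only the standard SHA-256 schedule written out; the helper names
# (ror32_c, sched) are illustrative and not part of this file:
#
#	static inline u32 ror32_c(u32 x, int n)
#	{
#		return (x >> n) | (x << (32 - n));
#	}
#
#	/* W[i] = W[i-16] + s0(W[i-15]) + W[i-7] + s1(W[i-2]) */
#	static u32 sched(const u32 *W, int i)
#	{
#		u32 s0 = ror32_c(W[i-15], 7) ^ ror32_c(W[i-15], 18) ^ (W[i-15] >> 3);
#		u32 s1 = ror32_c(W[i-2], 17) ^ ror32_c(W[i-2], 19) ^ (W[i-2] >> 10);
#
#		return W[i-16] + s0 + W[i-7] + s1;
#	}
########################################################################
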
.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time
	movdqa  X3, XTMP0
	mov     e, y0			# y0 = e
	ror     $(25-11), y0            # y0 = e >> (25-11)
	mov     a, y1                   # y1 = a
	palignr $4, X2, XTMP0           # XTMP0 = W[-7]
	ror     $(22-13), y1            # y1 = a >> (22-13)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	mov     f, y2                   # y2 = f
	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
	movdqa  X1, XTMP1
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	xor     g, y2                   # y2 = f^g
	paddd   X0, XTMP0               # XTMP0 = W[-7] + W[-16]
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
	## compute s0
	palignr $4, X0, XTMP1           # XTMP1 = W[-15]
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	movdqa  XTMP1, XTMP2            # XTMP2 = W[-15]
	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     y0, y2                  # y2 = S1 + CH
	add     _XFER(%rsp), y2         # y2 = k + w + S1 + CH
	movdqa  XTMP1, XTMP3            # XTMP3 = W[-15]
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	pslld   $(32-7), XTMP1          #
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	psrld   $7, XTMP2               #
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	por     XTMP2, XTMP1            # XTMP1 = W[-15] ror 7
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
					#
	ROTATE_ARGS                     #
	movdqa  XTMP3, XTMP2            # XTMP2 = W[-15]
	mov     e, y0                   # y0 = e
	mov     a, y1                   # y1 = a
	movdqa  XTMP3, XTMP4            # XTMP4 = W[-15]
	ror     $(25-11), y0            # y0 = e >> (25-11)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	mov     f, y2                   # y2 = f
	ror     $(22-13), y1            # y1 = a >> (22-13)
	pslld   $(32-18), XTMP3         #
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor     g, y2                   # y2 = f^g
	psrld   $18, XTMP2              #
	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	pxor    XTMP3, XTMP1
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	psrld   $3, XTMP4               # XTMP4 = W[-15] >> 3
	add     y0, y2                  # y2 = S1 + CH
	add     (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	pxor    XTMP2, XTMP1            # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	pxor    XTMP4, XTMP1            # XTMP1 = s0
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	## compute low s1
	pshufd  $0b11111010, X3, XTMP2   # XTMP2 = W[-2] {BBAA}
	and     b, y0			# y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	paddd   XTMP1, XTMP0            # XTMP0 = W[-16] + W[-7] + s0
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa  XTMP2, XTMP3            # XTMP3 = W[-2] {BBAA}
	mov     e, y0                   # y0 = e
	mov     a, y1                   # y1 = a
	ror     $(25-11), y0            # y0 = e >> (25-11)
	movdqa  XTMP2, XTMP4            # XTMP4 = W[-2] {BBAA}
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	ror     $(22-13), y1            # y1 = a >> (22-13)
	mov     f, y2                   # y2 = f
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
	psrlq   $17, XTMP2              # XTMP2 = W[-2] ror 17 {xBxA}
	xor     g, y2                   # y2 = f^g
	psrlq   $19, XTMP3              # XTMP3 = W[-2] ror 19 {xBxA}
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	psrld   $10, XTMP4              # XTMP4 = W[-2] >> 10 {BBAA}
	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	pxor    XTMP3, XTMP2
	add     y0, y2                  # y2 = S1 + CH
	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	pxor    XTMP2, XTMP4            # XTMP4 = s1 {xBxA}
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	pshufb  SHUF_00BA, XTMP4        # XTMP4 = s1 {00BA}
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	paddd   XTMP4, XTMP0            # XTMP0 = {..., ..., W[1], W[0]}
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	## compute high s1
	pshufd  $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
					#
	ROTATE_ARGS                     #
	movdqa  XTMP2, XTMP3            # XTMP3 = W[-2] {DDCC}
	mov     e, y0                   # y0 = e
	ror     $(25-11), y0            # y0 = e >> (25-11)
	mov     a, y1                   # y1 = a
	movdqa  XTMP2, X0               # X0    = W[-2] {DDCC}
	ror     $(22-13), y1            # y1 = a >> (22-13)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	mov     f, y2                   # y2 = f
	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
	psrlq   $17, XTMP2              # XTMP2 = W[-2] ror 17 {xDxC}
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	xor     g, y2                   # y2 = f^g
	psrlq   $19, XTMP3              # XTMP3 = W[-2] ror 19 {xDxC}
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
	psrld   $10, X0                 # X0 = W[-2] >> 10 {DDCC}
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	pxor    XTMP3, XTMP2            #
	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     y0, y2                  # y2 = S1 + CH
	add     (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	pxor    XTMP2, X0               # X0 = s1 {xDxC}
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	pshufb  SHUF_DC00, X0           # X0 = s1 {DC00}
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	paddd   XTMP0, X0               # X0 = {W[3], W[2], W[1], W[0]}
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	rotate_Xs
.endm

## input is [rsp + _XFER + \round * 4]
.macro DO_ROUND round
	mov     e, y0                 # y0 = e
	ror     $(25-11), y0          # y0 = e >> (25-11)
	mov     a, y1                 # y1 = a
	xor     e, y0                 # y0 = e ^ (e >> (25-11))
	ror     $(22-13), y1          # y1 = a >> (22-13)
	mov     f, y2                 # y2 = f
	xor     a, y1                 # y1 = a ^ (a >> (22-13))
	ror     $(11-6), y0           # y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor     g, y2                 # y2 = f^g
	xor     e, y0                 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	ror     $(13-2), y1           # y1 = (a >> (13-2)) ^ (a >> (22-2))
	and     e, y2                 # y2 = (f^g)&e
	xor     a, y1                 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror     $6, y0                # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor     g, y2                 # y2 = CH = ((f^g)&e)^g
	add     y0, y2                # y2 = S1 + CH
	ror     $2, y1                # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	offset = \round * 4 + _XFER
	add     offset(%rsp), y2      # y2 = k + w + S1 + CH
	mov     a, y0                 # y0 = a
	add     y2, h                 # h = h + S1 + CH + k + w
	mov     a, y2                 # y2 = a
	or      c, y0                 # y0 = a|c
	add     h, d                  # d = d + h + S1 + CH + k + w
	and     c, y2                 # y2 = a&c
	and     b, y0                 # y0 = (a|c)&b
	add     y1, h                 # h = h + S1 + CH + k + w + S0
	or      y2, y0		      # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h		      # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm
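
########################################################################
# For reference, a scalar C sketch of the single round computed by
# DO_ROUND (and by each round-quarter of FOUR_ROUNDS_AND_SCHED).  kw is
# the precomputed K[t] + W[t] word read from the _XFER slot; ror32_c is
# the helper from the schedule sketch above.  Names are illustrative
# and not part of this file:
#
#	static void do_round(u32 *a, u32 *b, u32 *c, u32 *d,
#			     u32 *e, u32 *f, u32 *g, u32 *h, u32 kw)
#	{
#		u32 S1  = ror32_c(*e, 6) ^ ror32_c(*e, 11) ^ ror32_c(*e, 25);
#		u32 ch  = ((*f ^ *g) & *e) ^ *g;        /* == (e&f) ^ (~e&g)    */
#		u32 S0  = ror32_c(*a, 2) ^ ror32_c(*a, 13) ^ ror32_c(*a, 22);
#		u32 maj = ((*a | *c) & *b) | (*a & *c); /* == (a&b)^(a&c)^(b&c) */
#		u32 t1  = *h + S1 + ch + kw;
#
#		*d += t1;            /* d = d + h + S1 + CH + k + w           */
#		*h  = t1 + S0 + maj; /* h = h + S1 + CH + k + w + S0 + MAJ    */
#		/* the caller then renames the variables, as ROTATE_ARGS does */
#	}
########################################################################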

########################################################################
## void sha256_transform_ssse3(struct sha256_state *state, const u8 *data,
##			       int blocks);
## arg 1 : pointer to state
##	   (struct sha256_state is assumed to begin with u32 state[8])
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
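##
## A minimal C-side usage sketch, assuming only the contract documented
## above and a message buffer of len bytes already padded to whole
## 64-byte blocks (the declaration mirrors the comment; it is not
## generated by this file):
##
##	void sha256_transform_ssse3(struct sha256_state *state,
##				    const u8 *data, int blocks);
##
##	/* digest every complete 64-byte block in the buffer */
##	sha256_transform_ssse3(state, data, len / 64);
##
########################################################################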
.text
SYM_TYPED_FUNC_START(sha256_transform_ssse3)
	pushq   %rbx
	pushq   %r12
	pushq   %r13
	pushq   %r14
	pushq   %r15
	pushq   %rbp
	mov	%rsp, %rbp

	subq    $STACK_SIZE, %rsp
	and	$~15, %rsp

	shl     $6, NUM_BLKS		 # convert to bytes
	jz      .Ldone_hash
	add     INP, NUM_BLKS
	mov     NUM_BLKS, _INP_END(%rsp) # pointer to end of data

	## load initial digest
	mov     4*0(CTX), a
	mov     4*1(CTX), b
	mov     4*2(CTX), c
	mov     4*3(CTX), d
	mov     4*4(CTX), e
	mov     4*5(CTX), f
	mov     4*6(CTX), g
	mov     4*7(CTX), h

	movdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	movdqa  _SHUF_00BA(%rip), SHUF_00BA
	movdqa  _SHUF_DC00(%rip), SHUF_DC00

.Lloop0:
	lea     K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP      X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X3, 3*16(INP), BYTE_FLIP_MASK

	mov     INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	mov     $3, SRND
.align 16
.Lloop1:
	movdqa  (TBL), XFER
	paddd   X0, XFER
	movdqa  XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa  1*16(TBL), XFER
	paddd   X0, XFER
	movdqa  XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa  2*16(TBL), XFER
	paddd   X0, XFER
	movdqa  XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa  3*16(TBL), XFER
	paddd   X0, XFER
	movdqa  XFER, _XFER(%rsp)
	add     $4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub     $1, SRND
	jne     .Lloop1

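	## last 16 rounds: the message words are already scheduled in
	## X0..X3, so each .Lloop2 iteration just adds the round constants
	## and runs 8 rounds (two groups of four)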
	mov     $2, SRND
.Lloop2:
	paddd   (TBL), X0
	movdqa  X0, _XFER(%rsp)
	DO_ROUND        0
	DO_ROUND        1
	DO_ROUND        2
	DO_ROUND        3
	paddd   1*16(TBL), X1
	movdqa  X1, _XFER(%rsp)
	add     $2*16, TBL
	DO_ROUND        0
	DO_ROUND        1
	DO_ROUND        2
	DO_ROUND        3

	movdqa  X2, X0
	movdqa  X3, X1

	sub     $1, SRND
	jne     .Lloop2

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	mov     _INP(%rsp), INP
	add     $64, INP
	cmp     _INP_END(%rsp), INP
	jne     .Lloop0

.Ldone_hash:

	mov	%rbp, %rsp
	popq	%rbp
	popq    %r15
	popq    %r14
	popq    %r13
	popq    %r12
	popq    %rbx

	RET
SYM_FUNC_END(sha256_transform_ssse3)

.section	.rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
        .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
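# pshufb mask: byte swap each dword (big-endian message words to host order)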
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203

.section	.rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

.section	.rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF