1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3  * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
4  * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
5  * processors. CPUs supporting Intel(R) AVX extensions will get an additional
6  * boost.
7  *
8  * This work was inspired by the vectorized implementation of Dean Gaudet.
9  * Additional information on it can be found at:
10  *    http://www.arctic.org/~dean/crypto/sha1.html
11  *
12  * It was improved upon with more efficient vectorization of the message
13  * scheduling. This implementation has also been optimized for all current and
14  * several future generations of Intel CPUs.
15  *
16  * See this article for more information about the implementation details:
17  *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
18  *
19  * Copyright (C) 2010, Intel Corp.
20  *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
21  *            Ronen Zohar <ronen.zohar@intel.com>
22  *
23  * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
24  *   Author: Mathias Krause <minipli@googlemail.com>
25  */
26 
27 #include <linux/linkage.h>
28 
29 #define CTX	%rdi	// arg1
30 #define BUF	%rsi	// arg2
31 #define CNT	%rdx	// arg3
32 
33 #define REG_A	%ecx
34 #define REG_B	%esi
35 #define REG_C	%edi
36 #define REG_D	%r12d
37 #define REG_E	%edx
38 
39 #define REG_T1	%eax
40 #define REG_T2	%ebx
41 
42 #define K_BASE		%r8
43 #define HASH_PTR	%r9
44 #define BUFFER_PTR	%r10
45 #define BUFFER_END	%r11
46 
47 #define W_TMP1	%xmm0
48 #define W_TMP2	%xmm9
49 
50 #define W0	%xmm1
51 #define W4	%xmm2
52 #define W8	%xmm3
53 #define W12	%xmm4
54 #define W16	%xmm5
55 #define W20	%xmm6
56 #define W24	%xmm7
57 #define W28	%xmm8
58 
59 #define XMM_SHUFB_BSWAP	%xmm10
60 
/* we keep 16 pre-calculated w[i]+K values (64 bytes) in a circular buffer */
62 #define WK(t)	(((t) & 15) * 4)(%rsp)
63 #define W_PRECALC_AHEAD	16
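/*
 * For illustration: WK(t) selects one of 16 dword slots, so rounds t and t+16
 * share a slot - e.g. WK(0) and WK(16) both resolve to 0(%rsp), and WK(15) to
 * 60(%rsp).  This is why the pre-calculation may run at most W_PRECALC_AHEAD
 * (16) rounds ahead of the rounds consuming the values.
 */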
64 
65 /*
66  * This macro implements the SHA-1 function's body for single 64-byte block
67  * param: function's name
68  */
69 .macro SHA1_VECTOR_ASM  name
70 	SYM_FUNC_START(\name)
71 
72 	push	%rbx
73 	push	%r12
74 	push	%rbp
75 	mov	%rsp, %rbp
76 
77 	sub	$64, %rsp		# allocate workspace
78 	and	$~15, %rsp		# align stack
79 
80 	mov	CTX, HASH_PTR
81 	mov	BUF, BUFFER_PTR
82 
83 	shl	$6, CNT			# multiply by 64
84 	add	BUF, CNT
85 	mov	CNT, BUFFER_END
86 
87 	lea	K_XMM_AR(%rip), K_BASE
88 	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP
89 
90 	SHA1_PIPELINED_MAIN_BODY
91 
92 	# cleanup workspace
93 	mov	$8, %ecx
94 	mov	%rsp, %rdi
95 	xor	%eax, %eax
96 	rep stosq
97 
98 	mov	%rbp, %rsp		# deallocate workspace
99 	pop	%rbp
100 	pop	%r12
101 	pop	%rbx
102 	RET
103 
104 	SYM_FUNC_END(\name)
105 .endm
106 
107 /*
108  * This macro implements 80 rounds of SHA-1 for one 64-byte block
109  */
110 .macro SHA1_PIPELINED_MAIN_BODY
111 	INIT_REGALLOC
112 
113 	mov	  (HASH_PTR), A
114 	mov	 4(HASH_PTR), B
115 	mov	 8(HASH_PTR), C
116 	mov	12(HASH_PTR), D
117 	mov	16(HASH_PTR), E
118 
119   .set i, 0
120   .rept W_PRECALC_AHEAD
121 	W_PRECALC i
122     .set i, (i+1)
123   .endr
124 
125 .align 4
126 1:
127 	RR F1,A,B,C,D,E,0
128 	RR F1,D,E,A,B,C,2
129 	RR F1,B,C,D,E,A,4
130 	RR F1,E,A,B,C,D,6
131 	RR F1,C,D,E,A,B,8
132 
133 	RR F1,A,B,C,D,E,10
134 	RR F1,D,E,A,B,C,12
135 	RR F1,B,C,D,E,A,14
136 	RR F1,E,A,B,C,D,16
137 	RR F1,C,D,E,A,B,18
138 
139 	RR F2,A,B,C,D,E,20
140 	RR F2,D,E,A,B,C,22
141 	RR F2,B,C,D,E,A,24
142 	RR F2,E,A,B,C,D,26
143 	RR F2,C,D,E,A,B,28
144 
145 	RR F2,A,B,C,D,E,30
146 	RR F2,D,E,A,B,C,32
147 	RR F2,B,C,D,E,A,34
148 	RR F2,E,A,B,C,D,36
149 	RR F2,C,D,E,A,B,38
150 
151 	RR F3,A,B,C,D,E,40
152 	RR F3,D,E,A,B,C,42
153 	RR F3,B,C,D,E,A,44
154 	RR F3,E,A,B,C,D,46
155 	RR F3,C,D,E,A,B,48
156 
157 	RR F3,A,B,C,D,E,50
158 	RR F3,D,E,A,B,C,52
159 	RR F3,B,C,D,E,A,54
160 	RR F3,E,A,B,C,D,56
161 	RR F3,C,D,E,A,B,58
162 
163 	add	$64, BUFFER_PTR		# move to the next 64-byte block
164 	cmp	BUFFER_END, BUFFER_PTR	# if the current is the last one use
165 	cmovae	K_BASE, BUFFER_PTR	# dummy source to avoid buffer overrun
166 
167 	RR F4,A,B,C,D,E,60
168 	RR F4,D,E,A,B,C,62
169 	RR F4,B,C,D,E,A,64
170 	RR F4,E,A,B,C,D,66
171 	RR F4,C,D,E,A,B,68
172 
173 	RR F4,A,B,C,D,E,70
174 	RR F4,D,E,A,B,C,72
175 	RR F4,B,C,D,E,A,74
176 	RR F4,E,A,B,C,D,76
177 	RR F4,C,D,E,A,B,78
178 
179 	UPDATE_HASH   (HASH_PTR), A
180 	UPDATE_HASH  4(HASH_PTR), B
181 	UPDATE_HASH  8(HASH_PTR), C
182 	UPDATE_HASH 12(HASH_PTR), D
183 	UPDATE_HASH 16(HASH_PTR), E
184 
185 	RESTORE_RENAMED_REGS
	cmp	K_BASE, BUFFER_PTR	# BUFFER_PTR == K_BASE means we reached the end
187 	jne	1b
188 .endm
189 
190 .macro INIT_REGALLOC
191   .set A, REG_A
192   .set B, REG_B
193   .set C, REG_C
194   .set D, REG_D
195   .set E, REG_E
196   .set T1, REG_T1
197   .set T2, REG_T2
198 .endm
199 
200 .macro RESTORE_RENAMED_REGS
201 	# order is important (REG_C is where it should be)
202 	mov	B, REG_B
203 	mov	D, REG_D
204 	mov	A, REG_A
205 	mov	E, REG_E
206 .endm
207 
208 .macro SWAP_REG_NAMES  a, b
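  # Exchange the .set aliases of the two given registers: after "mov c, T1"
  # followed by "SWAP_REG_NAMES c, T1", the name c refers to the register
  # holding the copy while T1 names c's original register, which is then free
  # to be clobbered as scratch.  This renames registers at assembly time with
  # no extra moves at run time.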
209   .set _T, \a
210   .set \a, \b
211   .set \b, _T
212 .endm
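
/*
 * SHA-1 round functions; each leaves its result in (the register currently
 * named) T1:
 *   F1(b,c,d) = (b & c) | (~b & d), computed as d ^ (b & (c ^ d))
 *   F2(b,c,d) = b ^ c ^ d
 *   F3(b,c,d) = (b & c) | (b & d) | (c & d)
 *   F4(b,c,d) = F2(b,c,d)
 */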
213 
214 .macro F1  b, c, d
215 	mov	\c, T1
216 	SWAP_REG_NAMES \c, T1
217 	xor	\d, T1
218 	and	\b, T1
219 	xor	\d, T1
220 .endm
221 
222 .macro F2  b, c, d
223 	mov	\d, T1
224 	SWAP_REG_NAMES \d, T1
225 	xor	\c, T1
226 	xor	\b, T1
227 .endm
228 
.macro F3  b, c, d
230 	mov	\c, T1
231 	SWAP_REG_NAMES \c, T1
232 	mov	\b, T2
233 	or	\b, T1
234 	and	\c, T2
235 	and	\d, T1
236 	or	T2, T1
237 .endm
238 
239 .macro F4  b, c, d
240 	F2 \b, \c, \d
241 .endm
242 
243 .macro UPDATE_HASH  hash, val
244 	add	\hash, \val
245 	mov	\val, \hash
246 .endm
247 
248 /*
249  * RR does two rounds of SHA-1 back to back with W[] pre-calc
250  *   t1 = F(b, c, d);   e += w(i)
251  *   e += t1;           b <<= 30;   d  += w(i+1);
252  *   t1 = F(a, b, c);
253  *   d += t1;           a <<= 5;
254  *   e += a;
255  *   t1 = e;            a >>= 7;
256  *   t1 <<= 5;
257  *   d += t1;
258  */
259 .macro RR  F, a, b, c, d, e, round
260 	add	WK(\round), \e
261 	\F   \b, \c, \d		# t1 = F(b, c, d);
262 	W_PRECALC (\round + W_PRECALC_AHEAD)
263 	rol	$30, \b
264 	add	T1, \e
265 	add	WK(\round + 1), \d
266 
267 	\F   \a, \b, \c
268 	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
269 	rol	$5, \a
270 	add	\a, \e
271 	add	T1, \d
	ror	$7, \a		# (a rol 5) ror 7 => a rol 30
273 
274 	mov	\e, T1
275 	SWAP_REG_NAMES \e, T1
276 
277 	rol	$5, T1
278 	add	T1, \d
279 
280 	# write:  \a, \b
281 	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
282 .endm
283 
284 .macro W_PRECALC  r
285   .set i, \r
286 
287   .if (i < 20)
288     .set K_XMM, 0
289   .elseif (i < 40)
290     .set K_XMM, 16
291   .elseif (i < 60)
292     .set K_XMM, 32
293   .elseif (i < 80)
294     .set K_XMM, 48
295   .endif
296 
297   .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
298     .set i, ((\r) % 80)	    # pre-compute for the next iteration
299     .if (i == 0)
300 	W_PRECALC_RESET
301     .endif
302 	W_PRECALC_00_15
303   .elseif (i<32)
304 	W_PRECALC_16_31
305   .elseif (i < 80)   // rounds 32-79
306 	W_PRECALC_32_79
307   .endif
308 .endm
309 
310 .macro W_PRECALC_RESET
311   .set W,          W0
312   .set W_minus_04, W4
313   .set W_minus_08, W8
314   .set W_minus_12, W12
315   .set W_minus_16, W16
316   .set W_minus_20, W20
317   .set W_minus_24, W24
318   .set W_minus_28, W28
319   .set W_minus_32, W
320 .endm
321 
322 .macro W_PRECALC_ROTATE
323   .set W_minus_32, W_minus_28
324   .set W_minus_28, W_minus_24
325   .set W_minus_24, W_minus_20
326   .set W_minus_20, W_minus_16
327   .set W_minus_16, W_minus_12
328   .set W_minus_12, W_minus_08
329   .set W_minus_08, W_minus_04
330   .set W_minus_04, W
331   .set W,          W_minus_32
332 .endm
333 
334 .macro W_PRECALC_SSSE3
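/*
 * W_PRECALC_SSSE3 binds the W_PRECALC_00_15/16_31/32_79 helpers used by
 * W_PRECALC above to their SSSE3 bodies; W_PRECALC_AVX further down purges
 * these definitions and rebinds them to the AVX bodies before the second
 * SHA1_VECTOR_ASM instantiation.
 */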
335 
336 .macro W_PRECALC_00_15
337 	W_PRECALC_00_15_SSSE3
338 .endm
339 .macro W_PRECALC_16_31
340 	W_PRECALC_16_31_SSSE3
341 .endm
342 .macro W_PRECALC_32_79
343 	W_PRECALC_32_79_SSSE3
344 .endm
345 
346 /* message scheduling pre-compute for rounds 0-15 */
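/*
 * Scalar reference of what one 4-wide group computes (illustration only):
 *   w[i]  = be32_to_cpu(((__be32 *)block)[i]);		for i = 0..15
 *   wk[i] = w[i] + K1;					stored via WK(i)
 * i.e. load, byte-swap with pshufb, add the round constant.
 */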
347 .macro W_PRECALC_00_15_SSSE3
348   .if ((i & 3) == 0)
349 	movdqu	(i*4)(BUFFER_PTR), W_TMP1
350   .elseif ((i & 3) == 1)
351 	pshufb	XMM_SHUFB_BSWAP, W_TMP1
352 	movdqa	W_TMP1, W
353   .elseif ((i & 3) == 2)
354 	paddd	(K_BASE), W_TMP1
355   .elseif ((i & 3) == 3)
356 	movdqa  W_TMP1, WK(i&~3)
357 	W_PRECALC_ROTATE
358   .endif
359 .endm
360 
/* message scheduling pre-compute for rounds 16-31
 *
 * - the most recent 32 w[i] values are kept in 8 XMM registers
 * - w[i]+K values are pre-calculated and stored to memory, to be loaded later
 *   by the scalar ALU add instructions
 *
 * the vectorization needs some "heavy lifting" for rounds 16-31 because of
 * the w[i] -> w[i-3] dependency, but it gets simpler for rounds 32-79
 */
370 .macro W_PRECALC_16_31_SSSE3
371   # blended scheduling of vector and scalar instruction streams, one 4-wide
372   # vector iteration / 4 scalar rounds
373   .if ((i & 3) == 0)
374 	movdqa	W_minus_12, W
375 	palignr	$8, W_minus_16, W	# w[i-14]
376 	movdqa	W_minus_04, W_TMP1
377 	psrldq	$4, W_TMP1		# w[i-3]
378 	pxor	W_minus_08, W
379   .elseif ((i & 3) == 1)
380 	pxor	W_minus_16, W_TMP1
381 	pxor	W_TMP1, W
382 	movdqa	W, W_TMP2
383 	movdqa	W, W_TMP1
384 	pslldq	$12, W_TMP2
385   .elseif ((i & 3) == 2)
386 	psrld	$31, W
387 	pslld	$1, W_TMP1
388 	por	W, W_TMP1
389 	movdqa	W_TMP2, W
390 	psrld	$30, W_TMP2
391 	pslld	$2, W
392   .elseif ((i & 3) == 3)
393 	pxor	W, W_TMP1
394 	pxor	W_TMP2, W_TMP1
395 	movdqa	W_TMP1, W
396 	paddd	K_XMM(K_BASE), W_TMP1
397 	movdqa	W_TMP1, WK(i&~3)
398 	W_PRECALC_ROTATE
399   .endif
400 .endm
401 
/* message scheduling pre-compute for rounds 32-79
 *
 * the SHA-1 specification defines: w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 * we instead use the equivalent:   w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which vectorizes more efficiently because the w[i] -> w[i-3] dependency is
 * broken
 */
408 .macro W_PRECALC_32_79_SSSE3
409   .if ((i & 3) == 0)
410 	movdqa	W_minus_04, W_TMP1
411 	pxor	W_minus_28, W		# W is W_minus_32 before xor
412 	palignr	$8, W_minus_08, W_TMP1
413   .elseif ((i & 3) == 1)
414 	pxor	W_minus_16, W
415 	pxor	W_TMP1, W
416 	movdqa	W, W_TMP1
417   .elseif ((i & 3) == 2)
418 	psrld	$30, W
419 	pslld	$2, W_TMP1
420 	por	W, W_TMP1
421   .elseif ((i & 3) == 3)
422 	movdqa	W_TMP1, W
423 	paddd	K_XMM(K_BASE), W_TMP1
424 	movdqa	W_TMP1, WK(i&~3)
425 	W_PRECALC_ROTATE
426   .endif
427 .endm
428 
429 .endm		// W_PRECALC_SSSE3
430 
431 
432 #define K1	0x5a827999
433 #define K2	0x6ed9eba1
434 #define K3	0x8f1bbcdc
435 #define K4	0xca62c1d6
436 
437 .section .rodata
438 .align 16
439 
440 K_XMM_AR:
441 	.long K1, K1, K1, K1
442 	.long K2, K2, K2, K2
443 	.long K3, K3, K3, K3
444 	.long K4, K4, K4, K4
445 
446 BSWAP_SHUFB_CTL:
447 	.long 0x00010203
448 	.long 0x04050607
449 	.long 0x08090a0b
450 	.long 0x0c0d0e0f
451 
452 
453 .section .text
454 
455 W_PRECALC_SSSE3
456 .macro xmm_mov a, b
457 	movdqu	\a,\b
458 .endm
459 
460 /*
461  * SSSE3 optimized implementation:
462  *
463  * extern "C" void sha1_transform_ssse3(struct sha1_state *state,
464  *					const u8 *data, int blocks);
465  *
466  * Note that struct sha1_state is assumed to begin with u32 state[5].
467  */
468 SHA1_VECTOR_ASM     sha1_transform_ssse3
469 
470 .macro W_PRECALC_AVX
471 
472 .purgem W_PRECALC_00_15
473 .macro  W_PRECALC_00_15
474     W_PRECALC_00_15_AVX
475 .endm
476 .purgem W_PRECALC_16_31
477 .macro  W_PRECALC_16_31
478     W_PRECALC_16_31_AVX
479 .endm
480 .purgem W_PRECALC_32_79
481 .macro  W_PRECALC_32_79
482     W_PRECALC_32_79_AVX
483 .endm
484 
485 .macro W_PRECALC_00_15_AVX
486   .if ((i & 3) == 0)
487 	vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
488   .elseif ((i & 3) == 1)
489 	vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
490   .elseif ((i & 3) == 2)
491 	vpaddd	(K_BASE), W, W_TMP1
492   .elseif ((i & 3) == 3)
493 	vmovdqa	W_TMP1, WK(i&~3)
494 	W_PRECALC_ROTATE
495   .endif
496 .endm
497 
498 .macro W_PRECALC_16_31_AVX
499   .if ((i & 3) == 0)
500 	vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
501 	vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
502 	vpxor	W_minus_08, W, W
503 	vpxor	W_minus_16, W_TMP1, W_TMP1
504   .elseif ((i & 3) == 1)
505 	vpxor	W_TMP1, W, W
506 	vpslldq	$12, W, W_TMP2
507 	vpslld	$1, W, W_TMP1
508   .elseif ((i & 3) == 2)
509 	vpsrld	$31, W, W
510 	vpor	W, W_TMP1, W_TMP1
511 	vpslld	$2, W_TMP2, W
512 	vpsrld	$30, W_TMP2, W_TMP2
513   .elseif ((i & 3) == 3)
514 	vpxor	W, W_TMP1, W_TMP1
515 	vpxor	W_TMP2, W_TMP1, W
516 	vpaddd	K_XMM(K_BASE), W, W_TMP1
517 	vmovdqu	W_TMP1, WK(i&~3)
518 	W_PRECALC_ROTATE
519   .endif
520 .endm
521 
522 .macro W_PRECALC_32_79_AVX
523   .if ((i & 3) == 0)
524 	vpalignr $8, W_minus_08, W_minus_04, W_TMP1
525 	vpxor	W_minus_28, W, W		# W is W_minus_32 before xor
526   .elseif ((i & 3) == 1)
527 	vpxor	W_minus_16, W_TMP1, W_TMP1
528 	vpxor	W_TMP1, W, W
529   .elseif ((i & 3) == 2)
530 	vpslld	$2, W, W_TMP1
531 	vpsrld	$30, W, W
532 	vpor	W, W_TMP1, W
533   .elseif ((i & 3) == 3)
534 	vpaddd	K_XMM(K_BASE), W, W_TMP1
535 	vmovdqu	W_TMP1, WK(i&~3)
536 	W_PRECALC_ROTATE
537   .endif
538 .endm
539 
540 .endm    // W_PRECALC_AVX
541 
542 W_PRECALC_AVX
543 .purgem xmm_mov
544 .macro xmm_mov a, b
545 	vmovdqu	\a,\b
546 .endm
547 
548 
549 /* AVX optimized implementation:
550  *  extern "C" void sha1_transform_avx(struct sha1_state *state,
551  *				       const u8 *data, int blocks);
552  */
553 SHA1_VECTOR_ASM     sha1_transform_avx
554