/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Fast SHA-256 implementation for SPE instruction set (PPC)
 *
 * This code makes use of the SPE SIMD instruction set as defined in
 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
 * Implementation is based on optimization guide notes from
 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
 *
 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
 */

#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
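
/*
 * The eight 32 bit hash values are kept in GPRs while the 16 message
 * schedule words live in eight 64 bit SPE registers, two words each.
 * That way one set of 64 bit vector operations updates the schedule
 * for two consecutive rounds, which is why each of the round macros
 * below processes two SHA-256 rounds at a time.
 */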

#define rHP	r3	/* pointer to hash values in memory		*/
#define rKP	r24	/* pointer to round constants			*/
#define rWP	r4	/* pointer to input data			*/

#define rH0	r5	/* 8 32 bit hash values in 8 registers		*/
#define rH1	r6
#define rH2	r7
#define rH3	r8
#define rH4	r9
#define rH5	r10
#define rH6	r11
#define rH7	r12

#define rW0	r14	/* 64 bit registers. 16 words in 8 registers	*/
#define rW1	r15
#define rW2	r16
#define rW3	r17
#define rW4	r18
#define rW5	r19
#define rW6	r20
#define rW7	r21

#define rT0	r22	/* 64 bit temporaries				*/
#define rT1	r23
#define rT2	r0	/* 32 bit temporaries				*/
#define rT3	r25

#define CMP_KN_LOOP
#define CMP_KC_LOOP \
	cmpwi		rT1,0;
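
/*
 * Loop control for the 16 round loop below: instead of keeping a
 * separate counter, the final round pair of each pass (k == C)
 * compares the round constant pair it just loaded. K[31] and K[47]
 * are positive as signed words while K[63] (0xc67178f2) is negative,
 * so the "bt gt" at the end of the loop falls through after the
 * third pass.
 */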

#define INITIALIZE \
	stwu		r1,-128(r1);	/* create stack frame		*/ \
	evstdw		r14,8(r1);	/* We must save non volatile	*/ \
	evstdw		r15,16(r1);	/* registers. Take the chance	*/ \
	evstdw		r16,24(r1);	/* and save the SPE part too	*/ \
	evstdw		r17,32(r1);					   \
	evstdw		r18,40(r1);					   \
	evstdw		r19,48(r1);					   \
	evstdw		r20,56(r1);					   \
	evstdw		r21,64(r1);					   \
	evstdw		r22,72(r1);					   \
	evstdw		r23,80(r1);					   \
	stw		r24,88(r1);	/* save normal registers	*/ \
	stw		r25,92(r1);


#define FINALIZE \
	evldw		r14,8(r1);	/* restore SPE registers	*/ \
	evldw		r15,16(r1);					   \
	evldw		r16,24(r1);					   \
	evldw		r17,32(r1);					   \
	evldw		r18,40(r1);					   \
	evldw		r19,48(r1);					   \
	evldw		r20,56(r1);					   \
	evldw		r21,64(r1);					   \
	evldw		r22,72(r1);					   \
	evldw		r23,80(r1);					   \
	lwz		r24,88(r1);	/* restore normal registers	*/ \
	lwz		r25,92(r1);					   \
	xor		r0,r0,r0;					   \
	stw		r0,8(r1);	/* Delete sensitive data	*/ \
	stw		r0,16(r1);	/* that we might have pushed	*/ \
	stw		r0,24(r1);	/* from another context that	*/ \
	stw		r0,32(r1);	/* ran the same code. Assume	*/ \
	stw		r0,40(r1);	/* that the lower part of the	*/ \
	stw		r0,48(r1);	/* GPRs was already overwritten	*/ \
	stw		r0,56(r1);	/* on the way down to here	*/ \
	stw		r0,64(r1);					   \
	stw		r0,72(r1);					   \
	stw		r0,80(r1);					   \
	addi		r1,r1,128;	/* cleanup stack frame		*/

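/*
 * SHA-256 processes the input as big endian 32 bit words. On little
 * endian the words are loaded byte reversed; lwbrx has no immediate
 * offset form, so rWP is advanced after every word and NEXT_BLOCK
 * becomes a no-op.
 */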
#ifdef __BIG_ENDIAN__
#define LOAD_DATA(reg, off) \
	lwz		reg,off(rWP);	/* load data			*/
#define NEXT_BLOCK \
	addi		rWP,rWP,64;	/* increment per block		*/
#else
#define LOAD_DATA(reg, off) \
	lwbrx		reg,0,rWP;	/* load data			*/ \
	addi		rWP,rWP,4;	/* increment per word		*/
#define NEXT_BLOCK			/* nothing to do		*/
#endif

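/*
 * R_LOAD_W performs two SHA-256 rounds (tagged 1: and 2: in the
 * comments) on two message words loaded from the input block. For
 * reference, one round as specified in FIPS 180-4, with ror32
 * denoting a 32 bit right rotation:
 *
 *	S1    = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
 *	ch    = (e & f) ^ (~e & g);
 *	temp1 = h + S1 + ch + K[i] + w[i];
 *	S0    = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
 *	maj   = (a & b) ^ (a & c) ^ (b & c);
 *	d    += temp1;
 *	h     = temp1 + S0 + maj;
 *
 * Instead of shifting the eight working variables around, consecutive
 * macro invocations rotate the register arguments. The two data words
 * are also merged into the 64 bit register w for the later schedule
 * calculation in R_CALC_W.
 */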
#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \
	LOAD_DATA(w, off)		/* 1: W				*/ \
	rotrwi		rT0,e,6;	/* 1: S1 = e rotr 6		*/ \
	rotrwi		rT1,e,11;	/* 1: S1' = e rotr 11		*/ \
	rotrwi		rT2,e,25;	/* 1: S1" = e rotr 25		*/ \
	xor		rT0,rT0,rT1;	/* 1: S1 = S1 xor S1'		*/ \
	and		rT3,e,f;	/* 1: ch = e and f		*/ \
	xor		rT0,rT0,rT2;	/* 1: S1 = S1 xor S1"		*/ \
	andc		rT1,g,e;	/* 1: ch' = ~e and g		*/ \
	lwz		rT2,off(rKP);	/* 1: K				*/ \
	xor		rT3,rT3,rT1;	/* 1: ch = ch xor ch'		*/ \
	add		h,h,rT0;	/* 1: temp1 = h + S1		*/ \
	add		rT3,rT3,w;	/* 1: temp1' = ch + w		*/ \
	rotrwi		rT0,a,2;	/* 1: S0 = a rotr 2		*/ \
	add		h,h,rT3;	/* 1: temp1 = temp1 + temp1'	*/ \
	rotrwi		rT1,a,13;	/* 1: S0' = a rotr 13		*/ \
	add		h,h,rT2;	/* 1: temp1 = temp1 + K		*/ \
	rotrwi		rT3,a,22;	/* 1: S0" = a rotr 22		*/ \
	xor		rT0,rT0,rT1;	/* 1: S0 = S0 xor S0'		*/ \
	add		d,d,h;		/* 1: d = d + temp1		*/ \
	xor		rT3,rT0,rT3;	/* 1: S0 = S0 xor S0"		*/ \
	evmergelo	w,w,w;		/*    shift W			*/ \
	or		rT2,a,b;	/* 1: maj = a or b		*/ \
	and		rT1,a,b;	/* 1: maj' = a and b		*/ \
	and		rT2,rT2,c;	/* 1: maj = maj and c		*/ \
	LOAD_DATA(w, off+4)		/* 2: W				*/ \
	or		rT2,rT1,rT2;	/* 1: maj = maj or maj'		*/ \
	rotrwi		rT0,d,6;	/* 2: S1 = e rotr 6		*/ \
	add		rT3,rT3,rT2;	/* 1: temp2 = S0 + maj		*/ \
	rotrwi		rT1,d,11;	/* 2: S1' = e rotr 11		*/ \
	add		h,h,rT3;	/* 1: h = temp1 + temp2		*/ \
	rotrwi		rT2,d,25;	/* 2: S1" = e rotr 25		*/ \
	xor		rT0,rT0,rT1;	/* 2: S1 = S1 xor S1'		*/ \
	and		rT3,d,e;	/* 2: ch = e and f		*/ \
	xor		rT0,rT0,rT2;	/* 2: S1 = S1 xor S1"		*/ \
	andc		rT1,f,d;	/* 2: ch' = ~e and g		*/ \
	lwz		rT2,off+4(rKP);	/* 2: K				*/ \
	xor		rT3,rT3,rT1;	/* 2: ch = ch xor ch'		*/ \
	add		g,g,rT0;	/* 2: temp1 = h + S1		*/ \
	add		rT3,rT3,w;	/* 2: temp1' = ch + w		*/ \
	rotrwi		rT0,h,2;	/* 2: S0 = a rotr 2		*/ \
	add		g,g,rT3;	/* 2: temp1 = temp1 + temp1'	*/ \
	rotrwi		rT1,h,13;	/* 2: S0' = a rotr 13		*/ \
	add		g,g,rT2;	/* 2: temp1 = temp1 + K		*/ \
	rotrwi		rT3,h,22;	/* 2: S0" = a rotr 22		*/ \
	xor		rT0,rT0,rT1;	/* 2: S0 = S0 xor S0'		*/ \
	or		rT2,h,a;	/* 2: maj = a or b		*/ \
	xor		rT3,rT0,rT3;	/* 2: S0 = S0 xor S0"		*/ \
	and		rT1,h,a;	/* 2: maj' = a and b		*/ \
	and		rT2,rT2,b;	/* 2: maj = maj and c		*/ \
	add		c,c,g;		/* 2: d = d + temp1		*/ \
	or		rT2,rT1,rT2;	/* 2: maj = maj or maj'		*/ \
	add		rT3,rT3,rT2;	/* 2: temp2 = S0 + maj		*/ \
	add		g,g,rT3		/* 2: h = temp1 + temp2		*/

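/*
 * R_CALC_W extends the message schedule and feeds the two new words
 * into two further rounds. The schedule update per FIPS 180-4 is:
 *
 *	s0   = ror32(w[i-15], 7) ^ ror32(w[i-15], 18) ^ (w[i-15] >> 3);
 *	s1   = ror32(w[i-2], 17) ^ ror32(w[i-2], 19) ^ (w[i-2] >> 10);
 *	w[i] = w[i-16] + s0 + w[i-7] + s1;
 *
 * Both words of a pair are computed at once: SPE has no vector rotate
 * right, so evrlwi (rotate left) is used with the complementary shift
 * counts (rotl 25 == rotr 7 etc.), and evmergelohi assembles the
 * misaligned w[-15] and w[-7] pairs from neighbouring registers.
 */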
#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \
	rotrwi		rT2,e,6;	/* 1: S1 = e rotr 6		*/ \
	evmergelohi	rT0,w0,w1;	/*    w[-15]			*/ \
	rotrwi		rT3,e,11;	/* 1: S1' = e rotr 11		*/ \
	evsrwiu		rT1,rT0,3;	/*    s0 = w[-15] >> 3		*/ \
	xor		rT2,rT2,rT3;	/* 1: S1 = S1 xor S1'		*/ \
	evrlwi		rT0,rT0,25;	/*    s0' = w[-15] rotr 7	*/ \
	rotrwi		rT3,e,25;	/* 1: S1" = e rotr 25		*/ \
	evxor		rT1,rT1,rT0;	/*    s0 = s0 xor s0'		*/ \
	xor		rT2,rT2,rT3;	/* 1: S1 = S1 xor S1"		*/ \
	evrlwi		rT0,rT0,21;	/*    s0' = w[-15] rotr 18	*/ \
	add		h,h,rT2;	/* 1: temp1 = h + S1		*/ \
	evxor		rT0,rT0,rT1;	/*    s0 = s0 xor s0'		*/ \
	and		rT2,e,f;	/* 1: ch = e and f		*/ \
	evaddw		w0,w0,rT0;	/*    w = w[-16] + s0		*/ \
	andc		rT3,g,e;	/* 1: ch' = ~e and g		*/ \
	evsrwiu		rT0,w7,10;	/*    s1 = w[-2] >> 10		*/ \
	xor		rT2,rT2,rT3;	/* 1: ch = ch xor ch'		*/ \
	evrlwi		rT1,w7,15;	/*    s1' = w[-2] rotr 17	*/ \
	add		h,h,rT2;	/* 1: temp1 = temp1 + ch	*/ \
	evxor		rT0,rT0,rT1;	/*    s1 = s1 xor s1'		*/ \
	rotrwi		rT2,a,2;	/* 1: S0 = a rotr 2		*/ \
	evrlwi		rT1,w7,13;	/*    s1' = w[-2] rotr 19	*/ \
	rotrwi		rT3,a,13;	/* 1: S0' = a rotr 13		*/ \
	evxor		rT0,rT0,rT1;	/*    s1 = s1 xor s1'		*/ \
	xor		rT2,rT2,rT3;	/* 1: S0 = S0 xor S0'		*/ \
	evldw		rT1,off(rKP);	/*    k				*/ \
	rotrwi		rT3,a,22;	/* 1: S0" = a rotr 22		*/ \
	evaddw		w0,w0,rT0;	/*    w = w + s1		*/ \
	xor		rT2,rT2,rT3;	/* 1: S0 = S0 xor S0"		*/ \
	evmergelohi	rT0,w4,w5;	/*    w[-7]			*/ \
	and		rT3,a,b;	/* 1: maj = a and b		*/ \
	evaddw		w0,w0,rT0;	/*    w = w + w[-7]		*/ \
	CMP_K##k##_LOOP							   \
	add		rT2,rT2,rT3;	/* 1: temp2 = S0 + maj		*/ \
	evaddw		rT1,rT1,w0;	/*    wk = w + k		*/ \
	xor		rT3,a,b;	/* 1: maj = a xor b		*/ \
	evmergehi	rT0,rT1,rT1;	/*    wk1/wk2			*/ \
	and		rT3,rT3,c;	/* 1: maj = maj and c		*/ \
	add		h,h,rT0;	/* 1: temp1 = temp1 + wk	*/ \
	add		rT2,rT2,rT3;	/* 1: temp2 = temp2 + maj	*/ \
	add		g,g,rT1;	/* 2: temp1 = temp1 + wk	*/ \
	add		d,d,h;		/* 1: d = d + temp1		*/ \
	rotrwi		rT0,d,6;	/* 2: S1 = e rotr 6		*/ \
	add		h,h,rT2;	/* 1: h = temp1 + temp2		*/ \
	rotrwi		rT1,d,11;	/* 2: S1' = e rotr 11		*/ \
	rotrwi		rT2,d,25;	/* 2: S1" = e rotr 25		*/ \
	xor		rT0,rT0,rT1;	/* 2: S1 = S1 xor S1'		*/ \
	and		rT3,d,e;	/* 2: ch = e and f		*/ \
	xor		rT0,rT0,rT2;	/* 2: S1 = S1 xor S1"		*/ \
	andc		rT1,f,d;	/* 2: ch' = ~e and g		*/ \
	add		g,g,rT0;	/* 2: temp1 = h + S1		*/ \
	xor		rT3,rT3,rT1;	/* 2: ch = ch xor ch'		*/ \
	rotrwi		rT0,h,2;	/* 2: S0 = a rotr 2		*/ \
	add		g,g,rT3;	/* 2: temp1 = temp1 + ch	*/ \
	rotrwi		rT1,h,13;	/* 2: S0' = a rotr 13		*/ \
	rotrwi		rT3,h,22;	/* 2: S0" = a rotr 22		*/ \
	xor		rT0,rT0,rT1;	/* 2: S0 = S0 xor S0'		*/ \
	or		rT2,h,a;	/* 2: maj = a or b		*/ \
	and		rT1,h,a;	/* 2: maj' = a and b		*/ \
	and		rT2,rT2,b;	/* 2: maj = maj and c		*/ \
	xor		rT3,rT0,rT3;	/* 2: S0 = S0 xor S0"		*/ \
	or		rT2,rT1,rT2;	/* 2: maj = maj or maj'		*/ \
	add		c,c,g;		/* 2: d = d + temp1		*/ \
	add		rT3,rT3,rT2;	/* 2: temp2 = S0 + maj		*/ \
	add		g,g,rT3		/* 2: h = temp1 + temp2		*/

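/*
 * The C glue code is expected to call this as
 *
 *	void ppc_spe_sha256_transform(u32 *hash, const u8 *src, u32 blocks)
 *
 * i.e. r3 = pointer to the eight hash words, r4 = input data,
 * r5 = number of 64 byte blocks to process.
 */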
_GLOBAL(ppc_spe_sha256_transform)
	INITIALIZE

	mtctr		r5
	lwz		rH0,0(rHP)
	lwz		rH1,4(rHP)
	lwz		rH2,8(rHP)
	lwz		rH3,12(rHP)
	lwz		rH4,16(rHP)
	lwz		rH5,20(rHP)
	lwz		rH6,24(rHP)
	lwz		rH7,28(rHP)

ppc_spe_sha256_main:
	lis		rKP,PPC_SPE_SHA256_K@ha
	addi		rKP,rKP,PPC_SPE_SHA256_K@l

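	/* rounds 0 to 15: the message words come straight from the input */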
	R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0)
	R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8)
	R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16)
	R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24)
	R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32)
	R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40)
	R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48)
	R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56)
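
	/* rounds 16 to 63: three passes through the loop below, extending
	 * the message schedule on the fly; see CMP_KC_LOOP above for the
	 * exit condition
	 */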
ppc_spe_sha256_16_rounds:
	addi		rKP,rKP,64
	R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
		 rW0, rW1, rW4, rW5, rW7, N, 0)
	R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
		 rW1, rW2, rW5, rW6, rW0, N, 8)
	R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
		 rW2, rW3, rW6, rW7, rW1, N, 16)
	R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
		 rW3, rW4, rW7, rW0, rW2, N, 24)
	R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
		 rW4, rW5, rW0, rW1, rW3, N, 32)
	R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
		 rW5, rW6, rW1, rW2, rW4, N, 40)
	R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
		 rW6, rW7, rW2, rW3, rW5, N, 48)
	R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
		 rW7, rW0, rW3, rW4, rW6, C, 56)
	bt		gt,ppc_spe_sha256_16_rounds

	lwz		rW0,0(rHP)
	NEXT_BLOCK
	lwz		rW1,4(rHP)
	lwz		rW2,8(rHP)
	lwz		rW3,12(rHP)
	lwz		rW4,16(rHP)
	lwz		rW5,20(rHP)
	lwz		rW6,24(rHP)
	lwz		rW7,28(rHP)

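	/* feed forward: add the previous intermediate hash values to the
	 * new working values and store the result; the rW registers are
	 * free to clobber here because the next block restarts the
	 * message schedule
	 */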
	add		rH0,rH0,rW0
	stw		rH0,0(rHP)
	add		rH1,rH1,rW1
	stw		rH1,4(rHP)
	add		rH2,rH2,rW2
	stw		rH2,8(rHP)
	add		rH3,rH3,rW3
	stw		rH3,12(rHP)
	add		rH4,rH4,rW4
	stw		rH4,16(rHP)
	add		rH5,rH5,rW5
	stw		rH5,20(rHP)
	add		rH6,rH6,rW6
	stw		rH6,24(rHP)
	add		rH7,rH7,rW7
	stw		rH7,28(rHP)

	bdnz		ppc_spe_sha256_main

	FINALIZE
	blr

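/*
 * The 64 SHA-256 round constants as defined in FIPS 180-4. The table
 * is 32 byte aligned (.align 5) so that every evldw above loads a
 * naturally aligned pair of constants.
 */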
.data
.align 5
PPC_SPE_SHA256_K:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2