1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3  * Fast SHA-1 implementation for SPE instruction set (PPC)
4  *
5  * This code makes use of the SPE SIMD instruction set as defined in
6  * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
7  * Implementation is based on optimization guide notes from
8  * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
9  *
10  * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
11  */
12 
13 #include <asm/ppc_asm.h>
14 #include <asm/asm-offsets.h>
15 
/*
 * Register conventions. r14-r23 are non volatile and are saved/restored
 * (including their upper SPE halves) by INITIALIZE/FINALIZE below; all
 * other registers used here are volatile.
 */
#define rHP	r3	/* pointer to hash value			*/
#define rWP	r4	/* pointer to input				*/
#define rKP	r5	/* pointer to constants				*/

#define rW0	r14	/* 64 bit round words				*/
#define rW1	r15
#define rW2	r16
#define rW3	r17
#define rW4	r18
#define rW5	r19
#define rW6	r20
#define rW7	r21

#define rH0	r6	/* 32 bit hash values 				*/
#define rH1	r7
#define rH2	r8
#define rH3	r9
#define rH4	r10

#define rT0	r22	/* 64 bit temporary				*/
#define rT1	r0	/* 32 bit temporaries				*/
#define rT2	r11
#define rT3	r12

#define rK	r23	/* 64 bit round constant (r23 is non volatile,
			   saved by INITIALIZE)				*/

/*
 * LOAD_K<k>1: constant prefetch hooks expanded as LOAD_K##k##1 inside
 * the round macros. LOAD_K01 (k=0) is a no-op; LOAD_K11..LOAD_K41
 * splat the 32 bit constant at offset 0/4/8/12 of the table at rKP
 * into both halves of the 64 bit SPE register rK, so the constant for
 * the NEXT group of rounds is loaded while the current group runs.
 */
#define LOAD_K01

#define LOAD_K11 \
	evlwwsplat	rK,0(rKP);

#define LOAD_K21 \
	evlwwsplat	rK,4(rKP);

#define LOAD_K31 \
	evlwwsplat	rK,8(rKP);

#define LOAD_K41 \
	evlwwsplat	rK,12(rKP);
55 
/*
 * INITIALIZE: create a 128 byte stack frame and save the non volatile
 * registers r14-r23 at offsets 8..80. evstdw stores the full 64 bits
 * of each register, i.e. the upper SPE half together with the lower
 * 32 bit GPR half.
 */
#define INITIALIZE \
	stwu		r1,-128(r1);	/* create stack frame		*/ \
	evstdw		r14,8(r1);	/* We must save non volatile	*/ \
	evstdw		r15,16(r1);	/* registers. Take the chance	*/ \
	evstdw		r16,24(r1);	/* and save the SPE part too	*/ \
	evstdw		r17,32(r1);					   \
	evstdw		r18,40(r1);					   \
	evstdw		r19,48(r1);					   \
	evstdw		r20,56(r1);					   \
	evstdw		r21,64(r1);					   \
	evstdw		r22,72(r1);					   \
	evstdw		r23,80(r1);


/*
 * FINALIZE: restore r14-r23 (both halves, via evldw), then scrub the
 * save area so no round state leaks via the stack. Each stw clears
 * only the first word of its 8 byte save slot — on this big endian
 * target that is the upper (SPE-only) half; the lower GPR halves are
 * assumed to have been overwritten by normal stack use on the way
 * back down. Finally the 128 byte frame is released.
 */
#define FINALIZE \
	evldw		r14,8(r1);	/* restore SPE registers	*/ \
	evldw		r15,16(r1);					   \
	evldw		r16,24(r1);					   \
	evldw		r17,32(r1);					   \
	evldw		r18,40(r1);					   \
	evldw		r19,48(r1);					   \
	evldw		r20,56(r1);					   \
	evldw		r21,64(r1);					   \
	evldw		r22,72(r1);					   \
	evldw		r23,80(r1);					   \
	xor		r0,r0,r0;					   \
	stw		r0,8(r1);	/* Delete sensitive data	*/ \
	stw		r0,16(r1);	/* that we might have pushed	*/ \
	stw		r0,24(r1);	/* from other context that runs	*/ \
	stw		r0,32(r1);	/* the same code. Assume that	*/ \
	stw		r0,40(r1);	/* the lower part of the GPRs	*/ \
	stw		r0,48(r1);	/* were already overwritten on	*/ \
	stw		r0,56(r1);	/* the way down to here		*/ \
	stw		r0,64(r1);					   \
	stw		r0,72(r1);					   \
	stw		r0,80(r1);					   \
	addi		r1,r1,128;	/* cleanup stack frame		*/
93 
/*
 * SHA-1 consumes the input as 32 bit big endian words. On big endian
 * a plain lwz at off(rWP) suffices and rWP is advanced once per
 * 64 byte block. On little endian, lwbrx byte-reverses each load; it
 * ignores the off argument and instead post-increments rWP by 4 after
 * every word, so no per block increment is needed there.
 */
#ifdef __BIG_ENDIAN__
#define LOAD_DATA(reg, off) \
	lwz		reg,off(rWP);	/* load data			*/
#define NEXT_BLOCK \
	addi		rWP,rWP,64;	/* increment per block		*/
#else
#define LOAD_DATA(reg, off) \
	lwbrx		reg,0,rWP;	/* load data			*/ \
	addi		rWP,rWP,4;	/* increment per word		*/
#define NEXT_BLOCK			/* nothing to do		*/
#endif
105 
/*
 * R_00_15: two interleaved SHA-1 rounds of rounds 0..15.
 *
 *   a..e: round variables A..E by role (the actual registers rotate
 *         between invocations)
 *   w0:   receives the first message word loaded this step
 *   w1:   receives the second word; evmergelo then pairs both words
 *         into the 64 bit SPE register w1 for the vectorized message
 *         schedule used from round 16 on
 *   k:    expands LOAD_K##k##1 to prefetch the next round constant
 *   off:  byte offset of w0 within the current block (used by the
 *         big endian LOAD_DATA only)
 *
 * F(B,C,D) = (B and C) or ((not B) and D). Note rotrwi x,x,27 is
 * rotl 5 and rotrwi x,x,2 is rotl 30. Comments tagged 1:/2: belong to
 * the first/second interleaved round; instruction order deliberately
 * mixes the two rounds and the loads to hide latencies.
 */
#define	R_00_15(a, b, c, d, e, w0, w1, k, off) \
	LOAD_DATA(w0, off)		/* 1: W				*/ \
	and		rT2,b,c;	/* 1: F' = B and C 		*/ \
	LOAD_K##k##1							   \
	andc		rT1,d,b;	/* 1: F" = ~B and D 		*/ \
	rotrwi		rT0,a,27;	/* 1: A' = A rotl 5		*/ \
	or		rT2,rT2,rT1;	/* 1: F = F' or F"		*/ \
	add		e,e,rT0;	/* 1: E = E + A'		*/ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
	add		e,e,w0;		/* 1: E = E + W			*/ \
	LOAD_DATA(w1, off+4)		/* 2: W				*/ \
	add		e,e,rT2;	/* 1: E = E + F			*/ \
	and		rT1,a,b;	/* 2: F' = B and C 		*/ \
	add		e,e,rK;		/* 1: E = E + K			*/ \
	andc		rT2,c,a;	/* 2: F" = ~B and D 		*/ \
	add		d,d,rK;		/* 2: E = E + K			*/ \
	or		rT2,rT2,rT1;	/* 2: F = F' or F"		*/ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
	add		d,d,w1;		/* 2: E = E + W			*/ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
	add		d,d,rT0;	/* 2: E = E + A'		*/ \
	evmergelo	w1,w1,w0;	/*    mix W[0]/W[1]		*/ \
	add		d,d,rT2		/* 2: E = E + F			*/
129 
/*
 * R_16_19: two interleaved rounds of rounds 16..19, with the message
 * schedule expanded two words at a time in 64 bit SPE registers:
 *
 *   W = rotl(W[-16] xor W[-14] xor W[-8] xor W[-3], 1)
 *
 * w0 holds W[-16]/W[-15] on entry and is updated in place with the
 * two new schedule words; w1/w4/w6/w7 supply W[-14], W[-8] and (via
 * evmergelohi of w7/w6) W[-3]. evaddw computes both W+K lanes at
 * once; evmergehi moves the upper WK lane into the low (GPR) word of
 * rT1 so each of the two interleaved rounds can add its WK with a
 * plain GPR add. F(B,C,D) = (B and C) or ((not B) and D), as in
 * rounds 0..15. Comments 1:/2: mark the two interleaved rounds.
 */
#define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	and		rT2,b,c;	/* 1: F' = B and C 		*/ \
	evmergelohi	rT0,w7,w6;	/*    W[-3]			*/ \
	andc		rT1,d,b;	/* 1: F" = ~B and D 		*/ \
	evxor		w0,w0,rT0;	/*    W = W[-16] xor W[-3]	*/ \
	or		rT1,rT1,rT2;	/* 1: F = F' or F"		*/ \
	evxor		w0,w0,w4;	/*    W = W xor W[-8]		*/ \
	add		e,e,rT1;	/* 1: E = E + F			*/ \
	evxor		w0,w0,w1;	/*    W = W xor W[-14]		*/ \
	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
	evrlwi		w0,w0,1;	/*    W = W rotl 1		*/ \
	add		e,e,rT2;	/* 1: E = E + A'		*/ \
	evaddw		rT0,w0,rK;	/*    WK = W + K		*/ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
	LOAD_K##k##1							   \
	evmergehi	rT1,rT1,rT0;	/*    WK1/WK2			*/ \
	add		e,e,rT0;	/* 1: E = E + WK		*/ \
	add		d,d,rT1;	/* 2: E = E + WK		*/ \
	and		rT2,a,b;	/* 2: F' = B and C 		*/ \
	andc		rT1,c,a;	/* 2: F" = ~B and D 		*/ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
	or		rT1,rT1,rT2;	/* 2: F = F' or F"		*/ \
	add		d,d,rT0;	/* 2: E = E + A'		*/ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
	add		d,d,rT1		/* 2: E = E + F			*/
155 
/*
 * R_20_39: two interleaved rounds of rounds 20..39 (and, via the
 * R_60_79 alias below, rounds 60..79, which use the same parity
 * function). Message schedule expansion is identical to R_16_19:
 *
 *   W = rotl(W[-16] xor W[-14] xor W[-8] xor W[-3], 1)
 *
 * Round function here is the parity function
 * F(B,C,D) = B xor C xor D. Comments 1:/2: mark the two interleaved
 * rounds; SPE schedule instructions are mixed in to hide latencies.
 */
#define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	evmergelohi	rT0,w7,w6;	/*    W[-3]			*/ \
	xor		rT2,b,c;	/* 1: F' = B xor C		*/ \
	evxor		w0,w0,rT0;	/*    W = W[-16] xor W[-3]	*/ \
	xor		rT2,rT2,d;	/* 1: F = F' xor D		*/ \
	evxor		w0,w0,w4;	/*    W = W xor W[-8]		*/ \
	add		e,e,rT2;	/* 1: E = E + F			*/ \
	evxor		w0,w0,w1;	/*    W = W xor W[-14]		*/ \
	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
	evrlwi		w0,w0,1;	/*    W = W rotl 1		*/ \
	add		e,e,rT2;	/* 1: E = E + A'		*/ \
	evaddw		rT0,w0,rK;	/*    WK = W + K		*/ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
	LOAD_K##k##1							   \
	evmergehi	rT1,rT1,rT0;	/*    WK1/WK2			*/ \
	add		e,e,rT0;	/* 1: E = E + WK		*/ \
	xor		rT2,a,b;	/* 2: F' = B xor C		*/ \
	add		d,d,rT1;	/* 2: E = E + WK		*/ \
	xor		rT2,rT2,c;	/* 2: F = F' xor D		*/ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
	add		d,d,rT2;	/* 2: E = E + F			*/ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
	add		d,d,rT0		/* 2: E = E + A'		*/
179 
/*
 * R_40_59: two interleaved rounds of rounds 40..59. Message schedule
 * expansion as in R_16_19. Round function is the majority function,
 * computed here in the two-operation form
 *
 *   F(B,C,D) = (B and C) or (D and (B or C))
 *            = (B and C) or (B and D) or (C and D)
 *
 * Comments 1:/2: mark the two interleaved rounds.
 */
#define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	and		rT2,b,c;	/* 1: F' = B and C		*/ \
	evmergelohi	rT0,w7,w6;	/*    W[-3]			*/ \
	or		rT1,b,c;	/* 1: F" = B or C		*/ \
	evxor		w0,w0,rT0;	/*    W = W[-16] xor W[-3]	*/ \
	and		rT1,d,rT1;	/* 1: F" = F" and D		*/ \
	evxor		w0,w0,w4;	/*    W = W xor W[-8]		*/ \
	or		rT2,rT2,rT1;	/* 1: F = F' or F"		*/ \
	evxor		w0,w0,w1;	/*    W = W xor W[-14]		*/ \
	add		e,e,rT2;	/* 1: E = E + F			*/ \
	evrlwi		w0,w0,1;	/*    W = W rotl 1		*/ \
	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
	evaddw		rT0,w0,rK;	/*    WK = W + K		*/ \
	add		e,e,rT2;	/* 1: E = E + A'		*/ \
	LOAD_K##k##1							   \
	evmergehi	rT1,rT1,rT0;	/*    WK1/WK2			*/ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
	add		e,e,rT0;	/* 1: E = E + WK		*/ \
	and		rT2,a,b;	/* 2: F' = B and C		*/ \
	or		rT0,a,b;	/* 2: F" = B or C		*/ \
	add		d,d,rT1;	/* 2: E = E + WK		*/ \
	and		rT0,c,rT0;	/* 2: F" = F" and D		*/ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
	or		rT2,rT2,rT0;	/* 2: F = F' or F"		*/ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
	add		d,d,rT2;	/* 2: E = E + F			*/ \
	add		d,d,rT0		/* 2: E = E + A'		*/
207 
/*
 * R_60_79: rounds 60..79 use the same parity function (B xor C xor D)
 * as rounds 20..39, so simply reuse R_20_39 — only the constant (K4,
 * already splatted into rK by LOAD_K41) differs.
 */
#define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k)
210 
/*
 * ppc_spe_sha1_transform(hash, src, blocks)
 *
 *   r3 (rHP): pointer to the five 32 bit hash words
 *   r4 (rWP): pointer to the input data (64 byte blocks)
 *   r5:       number of blocks to process; moved into CTR for the
 *             main loop, after which r5 is reused as rKP, the pointer
 *             to the PPC_SPE_SHA1_K constant table
 *
 * Processes `blocks` consecutive 64 byte blocks, updating the hash
 * state in place. (Exact C prototype lives in the C glue code —
 * NOTE(review): confirm there.)
 */
_GLOBAL(ppc_spe_sha1_transform)
	INITIALIZE

	/* load hash state; CTR and the constant table pointer are set
	 * up in between to overlap load latencies. mtctr must consume
	 * r5 before lis/ori overwrite it with the table address.	*/
	lwz		rH0,0(rHP)
	lwz		rH1,4(rHP)
	mtctr		r5
	lwz		rH2,8(rHP)
	lis		rKP,PPC_SPE_SHA1_K@h
	lwz		rH3,12(rHP)
	ori		rKP,rKP,PPC_SPE_SHA1_K@l
	lwz		rH4,16(rHP)

ppc_spe_sha1_main:
	/* rounds 0-15: load and pair the 16 message words; the first
	 * invocation passes k=1 so K1 is splatted into rK up front.
	 * rT3 serves as a scratch low word for the last two pairs.	*/
	R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0)
	R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8)
	R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16)
	R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24)
	R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32)
	R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40)
	R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48)
	R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56)

	/* rounds 16-19: schedule expansion begins; k=2 prefetches K2 */
	R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0)
	R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2)

	/* rounds 20-39 (parity); k=3 on the last step prefetches K3 */
	R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0)
	R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0)
	R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0)
	R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0)
	R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0)
	R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0)
	R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0)
	R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0)
	R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0)
	R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3)

	/* rounds 40-59 (majority); k=4 on the last step prefetches K4 */
	R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0)
	R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0)
	R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0)
	R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0)
	R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0)
	R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0)
	R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0)
	R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0)
	R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0)
	R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4)

	/* rounds 60-79 (parity again, with K4). Toward the end the
	 * previous hash words are reloaded, interleaved with the last
	 * rounds to overlap the load latencies with computation.	*/
	R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0)
	R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0)
	R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0)
	R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0)
	R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0)
	R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0)
	R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0)
	lwz		rT3,0(rHP)
	R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0)
	lwz		rW1,4(rHP)
	R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0)
	lwz		rW2,8(rHP)
	R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0)
	lwz		rW3,12(rHP)
	NEXT_BLOCK
	lwz		rW4,16(rHP)

	/* add this block's result to the previous hash value and
	 * write the new state back				*/
	add		rH0,rH0,rT3
	stw		rH0,0(rHP)
	add		rH1,rH1,rW1
	stw		rH1,4(rHP)
	add		rH2,rH2,rW2
	stw		rH2,8(rHP)
	add		rH3,rH3,rW3
	stw		rH3,12(rHP)
	add		rH4,rH4,rW4
	stw		rH4,16(rHP)

	bdnz		ppc_spe_sha1_main	/* loop over all blocks (CTR) */

	FINALIZE
	blr
290 
.data
.align 4
/* the four SHA-1 round constants K1..K4 (rounds 0-19, 20-39, 40-59,
 * 60-79), indexed at offsets 0/4/8/12 by the LOAD_K*1 macros	*/
PPC_SPE_SHA1_K:
	.long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6
295