1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3  * Fast AES implementation for SPE instruction set (PPC)
4  *
5  * This code makes use of the SPE SIMD instruction set as defined in
6  * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
7  * Implementation is based on optimization guide notes from
8  * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
9  *
10  * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
11  */
12 
13 #include <asm/ppc_asm.h>
14 #include "aes-spe-regs.h"
15 
/*
 * EAD: select byte <bpos> of the low word of <in> (0 = least significant
 * byte ... 3 = most significant byte) and insert it into bits 20-27 of rT0,
 * i.e. rT0 = (rT0 & ~0xff0) | (byte << 4). All other bits of rT0 are kept,
 * so afterwards rT0 addresses a 16 byte wide slot selected by that byte.
 * Resulting rotate counts: bpos 0 -> 4, 1 -> 28, 2 -> 20, 3 -> 12.
 */
16 #define	EAD(in, bpos) \
17 	rlwimi		rT0,in,28-((bpos+3)%4)*8,20,27;
18 
/*
 * DAD: same byte selection as EAD, but the byte is inserted unscaled into
 * bits 24-31 of rT1, i.e. rT1 = (rT1 & ~0xff) | byte, so rT1 addresses a
 * single byte selected by that value.
 * Resulting rotate counts: bpos 0 -> 0, 1 -> 24, 2 -> 16, 3 -> 8.
 */
19 #define DAD(in, bpos) \
20 	rlwimi		rT1,in,24-((bpos+3)%4)*8,24,31;
21 
/*
 * LWH: load the 32 bit word at off(rT0) and splat it into both halves of
 * the 64 bit register <out>. The round code only relies on the upper half,
 * the lower half is normally replaced by a following LWL/LAL.
 */
22 #define LWH(out, off) \
23 	evlwwsplat	out,off(rT0);	/* load word high		*/
24 
/*
 * LWL: load the 32 bit word at off(rT0) into the lower half of <out>.
 * NOTE(review): the LAH + LAL pairs on one register assume that lwz keeps
 * the upper half that evlwwsplat wrote before - confirm for the target core.
 */
25 #define LWL(out, off) \
26 	lwz		out,off(rT0);	/* load word low		*/
27 
/* LBZ: load the single byte at off(tab) into <out> (base register <tab>). */
28 #define LBZ(out, tab, off) \
29 	lbz		out,off(tab);	/* load byte			*/
30 
/*
 * Combined "calculate address + load" helpers. All of them overwrite the
 * index bits of rT0 (LAH/LAL/LAE) or rT1 (LAD) as a side effect.
 *
 * LAH: EAD(in, bpos), then load the word at off(rT0) into the high half.
 */
31 #define LAH(out, in, bpos, off) \
32 	EAD(in, bpos)			/* calc addr + load word high	*/ \
33 	LWH(out, off)
34 
/* LAL: EAD(in, bpos), then load the word at off(rT0) into the low half. */
35 #define LAL(out, in, bpos, off) \
36 	EAD(in, bpos)			/* calc addr + load word low	*/ \
37 	LWL(out, off)
38 
/* LAE: EAD(in, bpos), then load the byte at offset 8 of the rT0 slot. */
39 #define LAE(out, in, bpos) \
40 	EAD(in, bpos)			/* calc addr + load enc byte	*/ \
41 	LBZ(out, rT0, 8)
42 
/* LBE: byte load only, uses the address left in rT0 by a previous EAD. */
43 #define LBE(out) \
44 	LBZ(out, rT0, 8)		/* load enc byte		*/
45 
/* LAD: DAD(in, bpos), then load the byte that rT1 points to. */
46 #define LAD(out, in, bpos) \
47 	DAD(in, bpos)			/* calc addr + load dec byte	*/ \
48 	LBZ(out, rT1, 0)
49 
/* LBD: byte load only, uses the address left in rT1 by a previous DAD. */
50 #define LBD(out) \
51 	LBZ(out, rT1, 0)
52 
53 /*
54  * ppc_encrypt_block: The central encryption function for a single 16 bytes
55  * block. It does no stack handling or register saving to support fast calls
56  * via bl/blr. It expects that caller has pre-xored input data with first
57  * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
58  * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
59  * and rW0-rW3 and caller must execute a final xor on the output registers.
60  * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
61  *
62  */
/*
 * Structure of the code below:
 *
 * - Every table round needs 16 word lookups through rT0. Three of them
 *   (rW4, rW6, rW3 high halves) are always issued ahead of time: before the
 *   loop for the first round and at the end of each round for the next one.
 * - A round result is key ^ rW0 ^ rW2 ^ rW4 ^ rW6 for the rD1 pair and
 *   key ^ rW1 ^ rW3 ^ rW5 ^ rW7 for the rD3 pair. evmergehi copies the high
 *   half of rD1/rD3 into the low half of rD0/rD2 so that all four state
 *   words can be reached by the 32 bit rlwimi inside EAD.
 * - One pass through the loop executes two rounds (keys at 16/24(rKP) and
 *   32/40(rKP)) and advances rKP by 32, so CTR is the number of round pairs.
 *   After the loop follow one more table round and the byte based last
 *   round, giving 2 * CTR + 2 rounds in total.
 * - On return rW0-rW3 hold the four words assembled from the byte lookups
 *   and rD0-rD3 hold the four words read from 32..44(rKP); rKP itself is
 *   left at its value after the loop.
 */
63 _GLOBAL(ppc_encrypt_block)
	/* lookups issued ahead for the first round */
64 	LAH(rW4, rD1, 2, 4)
65 	LAH(rW6, rD0, 3, 0)
66 	LAH(rW3, rD0, 1, 8)
67 ppc_encrypt_block_loop:
	/* first round of the pair: remaining lookups */
68 	LAH(rW0, rD3, 0, 12)
69 	LAL(rW0, rD0, 0, 12)
70 	LAH(rW1, rD1, 0, 12)
71 	LAH(rW2, rD2, 1, 8)
72 	LAL(rW2, rD3, 1, 8)
73 	LAL(rW3, rD1, 1, 8)
74 	LAL(rW4, rD2, 2, 4)
75 	LAL(rW6, rD1, 3, 0)
76 	LAH(rW5, rD3, 2, 4)
77 	LAL(rW5, rD0, 2, 4)
78 	LAH(rW7, rD2, 3, 0)
	/*
	 * Load round key, fold the lookups into the new state. The last two
	 * lookups of this round (rW7, rW1 low) and the three lookups issued
	 * ahead for the next round are interleaved with the SPE xors.
	 */
79 	evldw		rD1,16(rKP)
80 	EAD(rD3, 3)
81 	evxor		rW2,rW2,rW4
82 	LWL(rW7, 0)
83 	evxor		rW2,rW2,rW6
84 	EAD(rD2, 0)
85 	evxor		rD1,rD1,rW2
86 	LWL(rW1, 12)
87 	evxor		rD1,rD1,rW0
88 	evldw		rD3,24(rKP)
89 	evmergehi	rD0,rD0,rD1
90 	EAD(rD1, 2)
91 	evxor		rW3,rW3,rW5
92 	LWH(rW4, 4)
93 	evxor		rW3,rW3,rW7
94 	EAD(rD0, 3)
95 	evxor		rD3,rD3,rW3
96 	LWH(rW6, 0)
97 	evxor		rD3,rD3,rW1
98 	EAD(rD0, 1)
99 	evmergehi	rD2,rD2,rD3
100 	LWH(rW3, 8)
	/* second round of the pair: same sequence, keys at 32/40(rKP) */
101 	LAH(rW0, rD3, 0, 12)
102 	LAL(rW0, rD0, 0, 12)
103 	LAH(rW1, rD1, 0, 12)
104 	LAH(rW2, rD2, 1, 8)
105 	LAL(rW2, rD3, 1, 8)
106 	LAL(rW3, rD1, 1, 8)
107 	LAL(rW4, rD2, 2, 4)
108 	LAL(rW6, rD1, 3, 0)
109 	LAH(rW5, rD3, 2, 4)
110 	LAL(rW5, rD0, 2, 4)
111 	LAH(rW7, rD2, 3, 0)
112 	evldw		rD1,32(rKP)
113 	EAD(rD3, 3)
114 	evxor		rW2,rW2,rW4
115 	LWL(rW7, 0)
116 	evxor		rW2,rW2,rW6
117 	EAD(rD2, 0)
118 	evxor		rD1,rD1,rW2
119 	LWL(rW1, 12)
120 	evxor		rD1,rD1,rW0
121 	evldw		rD3,40(rKP)
122 	evmergehi	rD0,rD0,rD1
123 	EAD(rD1, 2)
124 	evxor		rW3,rW3,rW5
125 	LWH(rW4, 4)
126 	evxor		rW3,rW3,rW7
127 	EAD(rD0, 3)
128 	evxor		rD3,rD3,rW3
129 	LWH(rW6, 0)
130 	evxor		rD3,rD3,rW1
131 	EAD(rD0, 1)
132 	evmergehi	rD2,rD2,rD3
133 	LWH(rW3, 8)
134 	addi		rKP,rKP,32	/* step over the two keys used	*/
135 	bdnz		ppc_encrypt_block_loop
	/* one more table round after the loop, keys at 16/24(rKP) */
136 	LAH(rW0, rD3, 0, 12)
137 	LAL(rW0, rD0, 0, 12)
138 	LAH(rW1, rD1, 0, 12)
139 	LAH(rW2, rD2, 1, 8)
140 	LAL(rW2, rD3, 1, 8)
141 	LAL(rW3, rD1, 1, 8)
142 	LAL(rW4, rD2, 2, 4)
143 	LAH(rW5, rD3, 2, 4)
144 	LAL(rW6, rD1, 3, 0)
145 	LAL(rW5, rD0, 2, 4)
146 	LAH(rW7, rD2, 3, 0)
147 	evldw		rD1,16(rKP)
148 	EAD(rD3, 3)
149 	evxor		rW2,rW2,rW4
150 	LWL(rW7, 0)
151 	evxor		rW2,rW2,rW6
152 	EAD(rD2, 0)
153 	evxor		rD1,rD1,rW2
154 	LWL(rW1, 12)
155 	evxor		rD1,rD1,rW0
156 	evldw		rD3,24(rKP)
157 	evmergehi	rD0,rD0,rD1
	/*
	 * From here on the lookups are byte loads (offset 8 of the rT0 slot)
	 * for the last round. The first three are interleaved with the xors
	 * that finish the table round above. Byte sources per output word,
	 * written as (state register, bpos) from least to most significant
	 * byte:
	 *   rW0: (rD3,0) (rD2,1) (rD1,2) (rD0,3)
	 *   rW1: (rD0,0) (rD3,1) (rD2,2) (rD1,3)
	 *   rW2: (rD1,0) (rD0,1) (rD3,2) (rD2,3)
	 *   rW3: (rD2,0) (rD1,1) (rD0,2) (rD3,3)
	 * Each of the 16 state bytes is looked up exactly once.
	 */
158 	EAD(rD1, 0)
159 	evxor		rW3,rW3,rW5
160 	LBE(rW2)
161 	evxor		rW3,rW3,rW7
162 	EAD(rD0, 1)
163 	evxor		rD3,rD3,rW3
164 	LBE(rW6)
165 	evxor		rD3,rD3,rW1
166 	EAD(rD0, 0)
167 	evmergehi	rD2,rD2,rD3
168 	LBE(rW1)
169 	LAE(rW0, rD3, 0)
171 	LAE(rW4, rD2, 1)
172 	LAE(rW5, rD3, 1)
173 	LAE(rW3, rD2, 0)
174 	LAE(rW7, rD1, 1)
	/* merge the bpos 1 bytes into bits 16-23 of the output words */
175 	rlwimi		rW0,rW4,8,16,23
176 	rlwimi		rW1,rW5,8,16,23
177 	LAE(rW4, rD1, 2)
178 	LAE(rW5, rD2, 2)
179 	rlwimi		rW2,rW6,8,16,23
180 	rlwimi		rW3,rW7,8,16,23
181 	LAE(rW6, rD3, 2)
182 	LAE(rW7, rD0, 2)
	/* merge the bpos 2 bytes into bits 8-15 of the output words */
183 	rlwimi		rW0,rW4,16,8,15
184 	rlwimi		rW1,rW5,16,8,15
185 	LAE(rW4, rD0, 3)
186 	LAE(rW5, rD1, 3)
187 	rlwimi		rW2,rW6,16,8,15
	/*
	 * rD0/rD1 are no longer needed as lookup input once their bpos 3
	 * bytes are fetched, so they are reloaded with the words at
	 * 32/36(rKP); rD2/rD3 follow below with 40/44(rKP).
	 */
188 	lwz		rD0,32(rKP)
189 	rlwimi		rW3,rW7,16,8,15
190 	lwz		rD1,36(rKP)
191 	LAE(rW6, rD2, 3)
192 	LAE(rW7, rD3, 3)
	/* merge the bpos 3 bytes into bits 0-7 of the output words */
193 	rlwimi		rW0,rW4,24,0,7
194 	lwz		rD2,40(rKP)
195 	rlwimi		rW1,rW5,24,0,7
196 	lwz		rD3,44(rKP)
197 	rlwimi		rW2,rW6,24,0,7
198 	rlwimi		rW3,rW7,24,0,7
199 	blr
200 
201 /*
202  * ppc_decrypt_block: The central decryption function for a single 16 bytes
203  * block. It does no stack handling or register saving to support fast calls
204  * via bl/blr. It expects that caller has pre-xored input data with first
205  * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
206  * have also been set up before (rT0, rT1, rKP, CTR). Output is stored in
207  * rD0-rD3 and rW0-rW3 and caller must execute a final xor on the output
208  * registers. All working registers rD0-rD3 & rW0-rW7 are overwritten
209  * during processing.
210  */
/*
 * Structure of the code below:
 *
 * - Every table round needs 16 word lookups through rT0. Three of them
 *   (rW0, rW6, rW3 high halves) are always issued ahead of time: before the
 *   loop for the first round and at the end of each round for the next one.
 * - A round result is key ^ rW0 ^ rW2 ^ rW4 ^ rW6 for the rD1 pair and
 *   key ^ rW1 ^ rW3 ^ rW5 ^ rW7 for the rD3 pair. evmergehi copies the high
 *   half of rD1/rD3 into the low half of rD0/rD2 so that all four state
 *   words can be reached by the 32 bit rlwimi inside EAD/DAD.
 * - One pass through the loop executes two rounds (keys at 16/24(rKP) and
 *   32/40(rKP)) and advances rKP by 32, so CTR is the number of round pairs.
 *   After the loop follow one more table round and the byte based last
 *   round, giving 2 * CTR + 2 rounds in total.
 * - The last round reads its bytes through rT1 (DAD/LBD/LAD), not rT0.
 * - On return rW0-rW3 hold the four words assembled from the byte lookups
 *   and rD0-rD3 hold the four words read from 32..44(rKP).
 *
 * NOTE(review): the text above speaks of the "encryption key"; which key
 * schedule rKP points to is decided by the caller and cannot be seen here.
 */
211 _GLOBAL(ppc_decrypt_block)
	/* lookups issued ahead for the first round */
212 	LAH(rW0, rD1, 0, 12)
213 	LAH(rW6, rD0, 3, 0)
214 	LAH(rW3, rD0, 1, 8)
215 ppc_decrypt_block_loop:
	/* first round of the pair: remaining lookups */
216 	LAH(rW1, rD3, 0, 12)
217 	LAL(rW0, rD2, 0, 12)
218 	LAH(rW2, rD2, 1, 8)
219 	LAL(rW2, rD3, 1, 8)
220 	LAH(rW4, rD3, 2, 4)
221 	LAL(rW4, rD0, 2, 4)
222 	LAL(rW6, rD1, 3, 0)
223 	LAH(rW5, rD1, 2, 4)
224 	LAH(rW7, rD2, 3, 0)
225 	LAL(rW7, rD3, 3, 0)
226 	LAL(rW3, rD1, 1, 8)
	/*
	 * Load round key, fold the lookups into the new state. The last two
	 * lookups of this round (rW1, rW5 low) and the three lookups issued
	 * ahead for the next round are interleaved with the SPE xors.
	 */
227 	evldw		rD1,16(rKP)
228 	EAD(rD0, 0)
229 	evxor		rW4,rW4,rW6
230 	LWL(rW1, 12)
231 	evxor		rW0,rW0,rW4
232 	EAD(rD2, 2)
233 	evxor		rW0,rW0,rW2
234 	LWL(rW5, 4)
235 	evxor		rD1,rD1,rW0
236 	evldw		rD3,24(rKP)
237 	evmergehi	rD0,rD0,rD1
238 	EAD(rD1, 0)
239 	evxor		rW3,rW3,rW7
240 	LWH(rW0, 12)
241 	evxor		rW3,rW3,rW1
242 	EAD(rD0, 3)
243 	evxor		rD3,rD3,rW3
244 	LWH(rW6, 0)
245 	evxor		rD3,rD3,rW5
246 	EAD(rD0, 1)
247 	evmergehi	rD2,rD2,rD3
248 	LWH(rW3, 8)
	/* second round of the pair: same sequence, keys at 32/40(rKP) */
249 	LAH(rW1, rD3, 0, 12)
250 	LAL(rW0, rD2, 0, 12)
251 	LAH(rW2, rD2, 1, 8)
252 	LAL(rW2, rD3, 1, 8)
253 	LAH(rW4, rD3, 2, 4)
254 	LAL(rW4, rD0, 2, 4)
255 	LAL(rW6, rD1, 3, 0)
256 	LAH(rW5, rD1, 2, 4)
257 	LAH(rW7, rD2, 3, 0)
258 	LAL(rW7, rD3, 3, 0)
259 	LAL(rW3, rD1, 1, 8)
260 	evldw		 rD1,32(rKP)
261 	EAD(rD0, 0)
262 	evxor		rW4,rW4,rW6
263 	LWL(rW1, 12)
264 	evxor		rW0,rW0,rW4
265 	EAD(rD2, 2)
266 	evxor		rW0,rW0,rW2
267 	LWL(rW5, 4)
268 	evxor		rD1,rD1,rW0
269 	evldw		rD3,40(rKP)
270 	evmergehi	rD0,rD0,rD1
271 	EAD(rD1, 0)
272 	evxor		rW3,rW3,rW7
273 	LWH(rW0, 12)
274 	evxor		rW3,rW3,rW1
275 	EAD(rD0, 3)
276 	evxor		rD3,rD3,rW3
277 	LWH(rW6, 0)
278 	evxor		rD3,rD3,rW5
279 	EAD(rD0, 1)
280 	evmergehi	rD2,rD2,rD3
281 	LWH(rW3, 8)
282 	addi		rKP,rKP,32	/* step over the two keys used	*/
283 	bdnz		ppc_decrypt_block_loop
	/* one more table round after the loop, keys at 16/24(rKP) */
284 	LAH(rW1, rD3, 0, 12)
285 	LAL(rW0, rD2, 0, 12)
286 	LAH(rW2, rD2, 1, 8)
287 	LAL(rW2, rD3, 1, 8)
288 	LAH(rW4, rD3, 2, 4)
289 	LAL(rW4, rD0, 2, 4)
290 	LAL(rW6, rD1, 3, 0)
291 	LAH(rW5, rD1, 2, 4)
292 	LAH(rW7, rD2, 3, 0)
293 	LAL(rW7, rD3, 3, 0)
294 	LAL(rW3, rD1, 1, 8)
295 	evldw		 rD1,16(rKP)
296 	EAD(rD0, 0)
297 	evxor		rW4,rW4,rW6
298 	LWL(rW1, 12)
299 	evxor		rW0,rW0,rW4
300 	EAD(rD2, 2)
301 	evxor		rW0,rW0,rW2
302 	LWL(rW5, 4)
303 	evxor		rD1,rD1,rW0
304 	evldw		rD3,24(rKP)
305 	evmergehi	rD0,rD0,rD1
	/*
	 * From here on the lookups are byte loads through rT1 for the last
	 * round. The first three are interleaved with the xors that finish
	 * the table round above. Byte sources per output word, written as
	 * (state register, bpos) from least to most significant byte:
	 *   rW0: (rD1,0) (rD2,1) (rD3,2) (rD0,3)
	 *   rW1: (rD2,0) (rD3,1) (rD0,2) (rD1,3)
	 *   rW2: (rD3,0) (rD0,1) (rD1,2) (rD2,3)
	 *   rW3: (rD0,0) (rD1,1) (rD2,2) (rD3,3)
	 */
306 	DAD(rD1, 0)
307 	evxor		rW3,rW3,rW7
308 	LBD(rW0)
309 	evxor		rW3,rW3,rW1
310 	DAD(rD0, 1)
311 	evxor		rD3,rD3,rW3
312 	LBD(rW6)
313 	evxor		rD3,rD3,rW5
314 	DAD(rD0, 0)
315 	evmergehi	rD2,rD2,rD3
316 	LBD(rW3)
317 	LAD(rW2, rD3, 0)
318 	LAD(rW1, rD2, 0)
319 	LAD(rW4, rD2, 1)
320 	LAD(rW5, rD3, 1)
321 	LAD(rW7, rD1, 1)
	/* merge the bpos 1 bytes into bits 16-23 of the output words */
322 	rlwimi		rW0,rW4,8,16,23
323 	rlwimi		rW1,rW5,8,16,23
324 	LAD(rW4, rD3, 2)
325 	LAD(rW5, rD0, 2)
326 	rlwimi		rW2,rW6,8,16,23
327 	rlwimi		rW3,rW7,8,16,23
328 	LAD(rW6, rD1, 2)
329 	LAD(rW7, rD2, 2)
	/* merge the bpos 2 bytes into bits 8-15 of the output words */
330 	rlwimi		rW0,rW4,16,8,15
331 	rlwimi		rW1,rW5,16,8,15
332 	LAD(rW4, rD0, 3)
333 	LAD(rW5, rD1, 3)
334 	rlwimi		rW2,rW6,16,8,15
	/*
	 * rD0/rD1 are no longer needed as lookup input once their bpos 3
	 * bytes are fetched, so they are reloaded with the words at
	 * 32/36(rKP); rD2/rD3 follow below with 40/44(rKP).
	 */
335 	lwz		rD0,32(rKP)
336 	rlwimi		rW3,rW7,16,8,15
337 	lwz		rD1,36(rKP)
338 	LAD(rW6, rD2, 3)
339 	LAD(rW7, rD3, 3)
	/* merge the bpos 3 bytes into bits 0-7 of the output words */
340 	rlwimi		rW0,rW4,24,0,7
341 	lwz		rD2,40(rKP)
342 	rlwimi		rW1,rW5,24,0,7
343 	lwz		rD3,44(rKP)
344 	rlwimi		rW2,rW6,24,0,7
345 	rlwimi		rW3,rW7,24,0,7
346 	blr
347