/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/export.h>
#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
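/*
 * Illustrative C model of those semantics (a sketch only, not part of the
 * build; csum32_model is a hypothetical name, the halfword-at-a-time walk
 * ignores the doubleword access pattern used below, and the value returned
 * is equivalent to the assembly's result under 16-bit ones' complement
 * folding rather than bit-for-bit identical):
 *
 *	static u32 csum32_model(const u8 *buff, int len, u32 sum)
 *	{
 *		u64 acc = sum;
 *
 *		while (len > 1) {
 *			acc += *(const u16 *)buff;
 *			buff += 2;
 *			len -= 2;
 *		}
 *		if (len)
 *			acc += *buff;
 *		while (acc >> 32)
 *			acc = (acc & 0xffffffff) + (acc >> 32);
 *		return (u32)acc;
 *	}
 *
 * The trailing-byte handling shown here matches the little-endian case; on
 * big-endian the byte is padded into the high half of a 16-bit word, as in
 * the .Lcsum_tail_byte code below.
 */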
_GLOBAL(__csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b
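	/*
	 * Worked example of the alignment prologue above (illustrative):
	 * nhw = 4 - ((buff >> 1) & 0x3) halfwords are consumed, so a buffer
	 * at 2 mod 8 eats 3 halfwords (6 bytes), 4 mod 8 eats 2, and
	 * 6 mod 8 eats 1, leaving r3 doubleword aligned in every case.
	 */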

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6
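	/*
	 * Illustrative arithmetic: for len = 256 the size check above passes
	 * (256 >> 7 != 0) and ctr is set to 256/64 - 1 = 3, because the
	 * first chunk's loads are issued before the loop and the final
	 * chunk is finished in the exit limb after it.
	 */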

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif

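	/*
	 * The fold below adds the two 32-bit halves of the 64-bit running
	 * sum, with the end-around carry landing in the upper half before
	 * the shift. Illustrative C model (a sketch only, not part of the
	 * build):
	 *
	 *	static u32 fold64_model(u64 sum)
	 *	{
	 *		u64 rot = (sum << 32) | (sum >> 32);
	 *
	 *		return (u32)((sum + rot) >> 32);
	 *	}
	 */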
.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
EXPORT_SYMBOL(__csum_partial)


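/*
 * Exception table annotations for csum_partial_copy_generic below. The
 * source/dest macros mark accesses performed while the extra stack frame
 * (and saved r14-r16) is live, so their fixup at .Lerror restores those
 * registers before returning; srcnr/dstnr (read here as "no restore") mark
 * accesses made outside that window and branch straight to .Lerror_nr.
 */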
	.macro srcnr
100:
	EX_TABLE(100b,.Lerror_nr)
	.endm

	.macro source
150:
	EX_TABLE(150b,.Lerror)
	.endm

	.macro dstnr
200:
	EX_TABLE(200b,.Lerror_nr)
	.endm

	.macro dest
250:
	EX_TABLE(250b,.Lerror)
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in 0xffffffff (32-bit), while copying the block to dst.
 * If an access exception occurs, it returns 0.
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len)
 */
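/*
 * Rough C model of those semantics (illustrative only; csum32_model is the
 * hypothetical helper sketched near the top of this file, not a kernel
 * function):
 *
 *	static u32 csum_copy_model(const u8 *src, u8 *dst, int len)
 *	{
 *		memcpy(dst, src, len);
 *		return csum32_model(src, len, 0xffffffff);
 *	}
 *
 * with the additional rule that a faulting access to src or dst makes the
 * real routine return 0 instead, via the exception-table fixups above.
 */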
_GLOBAL(csum_partial_copy_generic)
	li	r6,-1
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lerror:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lerror_nr:
	li	r3,0
	blr

EXPORT_SYMBOL(csum_partial_copy_generic)

/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *			   const struct in6_addr *daddr,
 *			   __u32 len, __u8 proto, __wsum sum)
 */

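/*
 * Illustrative C reference for the pseudo-header sum computed below, in the
 * style of the kernel's generic C version rather than a transcription of
 * the 64-bit-load sequence that follows (the 64-bit accumulator and the
 * model's name are assumptions for illustration):
 *
 *	static __sum16 csum_ipv6_magic_model(const struct in6_addr *saddr,
 *					     const struct in6_addr *daddr,
 *					     __u32 len, __u8 proto,
 *					     __wsum csum)
 *	{
 *		u64 sum = (__force u32)csum;
 *		int i;
 *
 *		for (i = 0; i < 4; i++)
 *			sum += (__force u32)saddr->s6_addr32[i] +
 *			       (__force u32)daddr->s6_addr32[i];
 *		sum += (__force u32)htonl(len) + (__force u32)htonl(proto);
 *
 *		sum = (sum & 0xffffffff) + (sum >> 32);
 *		sum = (sum & 0xffffffff) + (sum >> 32);
 *		sum = (sum & 0xffff) + (sum >> 16);
 *		sum = (sum & 0xffff) + (sum >> 16);
 *		return (__force __sum16)~sum;
 *	}
 */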
_GLOBAL(csum_ipv6_magic)
	ld	r8, 0(r3)
	ld	r9, 8(r3)
	add	r5, r5, r6
	addc	r0, r8, r9
	ld	r10, 0(r4)
	ld	r11, 8(r4)
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	rotldi	r5, r5, 8
#endif
	adde	r0, r0, r10
	add	r5, r5, r7
	adde	r0, r0, r11
	adde	r0, r0, r5
	addze	r0, r0
	rotldi  r3, r0, 32		/* fold two 32 bit halves together */
	add	r3, r0, r3
	srdi	r0, r3, 32
	rotlwi	r3, r0, 16		/* fold two 16 bit halves together */
	add	r3, r0, r3
	not	r3, r3
	rlwinm	r3, r3, 16, 16, 31
	blr
EXPORT_SYMBOL(csum_ipv6_magic)