1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		IP/TCP/UDP checksumming routines
8  *
9  * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
10  *                  Optimized by Joe Taylor
11  */
12 
13 #include <linux/errno.h>
14 #include <linux/linkage.h>
15 #include <asm/asmmacro.h>
16 #include <asm/core.h>
17 
18 /*
19  * computes a partial checksum, e.g. for TCP/UDP fragments
20  */
21 
22 /*
23  * unsigned int csum_partial(const unsigned char *buf, int len,
24  *                           unsigned int sum);
25  *    a2 = buf
26  *    a3 = len
27  *    a4 = sum
28  *
 * This function is optimized for 2- or 4-byte alignment.  Odd addresses
 * are handled correctly, but much more slowly.
30  */
31 
/*
 * ONES_ADD(acc, term): acc += term in ones-complement (end-around carry)
 * arithmetic.  After the twos-complement add, an unsigned result that is
 * smaller than term means the add carried out of bit 31; the carry is
 * folded back in by adding 1.
 *
 * acc and term must be different registers.  The expansion defines the
 * numeric local label 99.
 */
#define ONES_ADD(acc, term)	  \
	add	acc, acc, term	; \
	bgeu	acc, term, 99f	; \
	addi	acc, acc, 1	; \
99:				;
38 
.text
ENTRY(csum_partial)

	/*
	 * Register usage:
	 *    a2    = buf cursor; holds the checksum on return
	 *    a3    = len (reduced by 2 if a leading halfword is consumed
	 *            to reach 4-byte alignment)
	 *    a4    = running 32-bit ones-complement sum
	 *    a5    = chunk count, or loop end address when the core has
	 *            no zero-overhead loops (!XCHAL_HAVE_LOOPS)
	 *    a6-a8 = scratch for loaded data
	 *
	 * Experiments with Ethernet and SLIP connections show that buf
	 * is aligned on either a 2-byte or 4-byte boundary.
	 */
	abi_entry_default
	extui	a5, a2, 0, 2
	bnez	a5, 8f		/* branch if not 4-byte aligned */
	/* Fall-through on common case, 4-byte alignment */
1:
	srli	a5, a3, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 5
	add	a5, a5, a2	/* a5 = end of last 32-byte chunk */
.Loop1:
#endif
	l32i	a6, a2, 0
	l32i	a7, a2, 4
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 8
	l32i	a7, a2, 12
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 16
	l32i	a7, a2, 20
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 24
	l32i	a7, a2, 28
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	addi	a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
	/*
	 * a2 advances in exact 32-byte steps and therefore lands on a5
	 * exactly.  Test for inequality rather than using a signed blt,
	 * which would leave the loop early for a buffer that straddles
	 * address 0x80000000.
	 */
	bne	a2, a5, .Loop1
#endif
2:
	extui	a5, a3, 2, 3	/* remaining 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 3f
#else
	beqz	a5, 3f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop2:
#endif
	l32i	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	bne	a2, a5, .Loop2	/* exact 4-byte steps; see .Loop1 */
#endif
3:
	_bbci.l	a3, 1, 5f	/* remaining 2-byte chunk */
	l16ui	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 2
5:
	_bbci.l	a3, 0, 7f	/* remaining 1-byte chunk */
6:	l8ui	a6, a2, 0
#ifdef __XTENSA_EB__
	slli	a6, a6, 8	/* load byte into bits 8..15 */
#endif
	ONES_ADD(a4, a6)
7:
	mov	a2, a4		/* return the accumulated sum */
	abi_ret_default

	/* uncommon case, buf is not 4-byte aligned */
8:
	beqz	a3, 7b		/* branch if len == 0 */
	beqi	a3, 1, 6b	/* branch if len == 1 */

	extui	a5, a2, 0, 1
	bnez	a5, 8f		/* branch if buf is at an odd address */

	l16ui	a6, a2, 0	/* common case, 2-byte aligned and len >= 2 */
	ONES_ADD(a4, a6)
	addi	a2, a2, 2	/* adjust buf */
	addi	a3, a3, -2	/* adjust len */
	j	1b		/* now buf is 4-byte aligned */

	/* case: odd-byte aligned, len > 1
	 * This case is dog slow, so don't give us an odd address.
	 * (I don't think this ever happens, but just in case.)
	 *
	 * Each 32-bit word is assembled from a byte, an (aligned)
	 * halfword and a byte, in memory order, so the value added is
	 * the same as an unaligned 32-bit load would have produced.
	 */
8:
	srli	a5, a3, 2	/* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop3:
#endif
	l8ui	a6, a2, 0	/* first byte: EB bits 24..31, EL bits 0..7 */
	l16ui	a7, a2, 1	/* middle halfword: bits 8..23 */
	l8ui	a8, a2, 3	/* last byte: EB bits 0..7, EL bits 24..31 */
#ifdef	__XTENSA_EB__
	slli	a6, a6, 24
#else
	slli	a8, a8, 24
#endif
	slli	a7, a7, 8
	or	a7, a7, a6
	or	a7, a7, a8
	ONES_ADD(a4, a7)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	bne	a2, a5, .Loop3	/* exact 4-byte steps; see .Loop1 */
#endif
2:
	_bbci.l	a3, 1, 3f	/* remaining 2-byte chunk, still odd addr */
	l8ui	a6, a2, 0
	l8ui	a7, a2, 1
#ifdef	__XTENSA_EB__
	slli	a6, a6, 8
#else
	slli	a7, a7, 8
#endif
	or	a7, a7, a6
	ONES_ADD(a4, a7)
	addi	a2, a2, 2
3:
	j	5b		/* branch to handle the remaining byte */

ENDPROC(csum_partial)
EXPORT_SYMBOL(csum_partial)
173 
174 /*
 * Copy from src to dst while checksumming, otherwise like csum_partial
176  */
177 
/*
 * unsigned int csum_partial_copy_generic(const char *src, char *dst, int len)
 *	a2  = src
 *	a3  = dst
 *	a4  = len
 *	a5  = sum (not an argument: initialized to -1 on entry)
 *	a8  = temp
 *	a9  = temp
 *	a10 = temp
 *
 * Copies len bytes from src to dst and returns the ones-complement sum
 * of the copied data in a2.  Every load and store is tagged EX(10f);
 * the code at fixup label 10 returns 0 instead.  Because the sum starts
 * at -1 and end-around-carry addition never turns a nonzero accumulator
 * into zero, a normal return is never 0.
 *
 * This function is optimized for 4-byte aligned addresses.  Other
 * alignments work, but not nearly as efficiently.
 */

ENTRY(csum_partial_copy_generic)

	abi_entry_default
	movi	a5, -1		/* initial sum; keeps a valid result nonzero */
	or	a10, a2, a3	/* low bits set if either pointer is misaligned */

	/* We optimize the following alignment tests for the 4-byte
	aligned case.  Two bbsi.l instructions might seem more optimal
	(commented out below).  However, both labels 5: and 3: are out
	of the imm8 range, so the assembler relaxes them into
	equivalent bbci.l, j combinations, which is actually
	slower. */

	extui	a9, a10, 0, 2
	beqz	a9, 1f		/* branch if both are 4-byte aligned */
	bbsi.l	a10, 0, 5f	/* branch if one address is odd */
	j	3f		/* one address is 2-byte aligned */

/*	_bbsi.l	a10, 0, 5f */	/* branch if odd address */
/*	_bbsi.l	a10, 1, 3f */	/* branch if 2-byte-aligned address */

1:
	/* src and dst are both 4-byte aligned */
	srli	a10, a4, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 2f
#else
	beqz	a10, 2f
	slli	a10, a10, 5
	add	a10, a10, a2	/* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
EX(10f)	l32i	a9, a2, 0
EX(10f)	l32i	a8, a2, 4
EX(10f)	s32i	a9, a3, 0
EX(10f)	s32i	a8, a3, 4
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
EX(10f)	l32i	a9, a2, 8
EX(10f)	l32i	a8, a2, 12
EX(10f)	s32i	a9, a3, 8
EX(10f)	s32i	a8, a3, 12
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
EX(10f)	l32i	a9, a2, 16
EX(10f)	l32i	a8, a2, 20
EX(10f)	s32i	a9, a3, 16
EX(10f)	s32i	a8, a3, 20
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
EX(10f)	l32i	a9, a2, 24
EX(10f)	l32i	a8, a2, 28
EX(10f)	s32i	a9, a3, 24
EX(10f)	s32i	a8, a3, 28
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
	addi	a2, a2, 32
	addi	a3, a3, 32
#if !XCHAL_HAVE_LOOPS
	/*
	 * a2 advances in exact 32-byte steps and therefore lands on a10
	 * exactly.  Test for inequality rather than using a signed blt,
	 * which would leave the loop early for a source buffer that
	 * straddles address 0x80000000.
	 */
	bne	a2, a10, .Loop5
#endif
2:
	extui	a10, a4, 2, 3	/* remaining 4-byte chunks */
	extui	a4, a4, 0, 2	/* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 3f
#else
	beqz	a10, 3f
	slli	a10, a10, 2
	add	a10, a10, a2	/* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
EX(10f)	l32i	a9, a2, 0
EX(10f)	s32i	a9, a3, 0
	ONES_ADD(a5, a9)
	addi	a2, a2, 4
	addi	a3, a3, 4
#if !XCHAL_HAVE_LOOPS
	bne	a2, a10, .Loop6	/* exact 4-byte steps; see .Loop5 */
#endif
3:
	/*
	Control comes to here in two cases: (1) It may fall through
	to here from the 4-byte alignment case to process, at most,
	one 2-byte chunk.  (2) It branches to here from above if
	either src or dst is 2-byte aligned, and we process all bytes
	here, except for perhaps a trailing odd byte.  It's
	inefficient, so align your addresses to 4-byte boundaries.

	a2 = src
	a3 = dst
	a4 = len
	a5 = sum
	*/
	srli	a10, a4, 1	/* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 4f
#else
	beqz	a10, 4f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
EX(10f)	l16ui	a9, a2, 0
EX(10f)	s16i	a9, a3, 0
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	bne	a2, a10, .Loop7	/* exact 2-byte steps; see .Loop5 */
#endif
4:
	/* This section processes a possible trailing odd byte. */
	_bbci.l	a4, 0, 8f	/* 1-byte chunk */
EX(10f)	l8ui	a9, a2, 0
EX(10f)	s8i	a9, a3, 0
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* shift byte to bits 8..15 */
#endif
	ONES_ADD(a5, a9)
8:
	mov	a2, a5		/* return the accumulated sum */
	abi_ret_default

5:
	/* Control branches to here when either src or dst is odd.  We
	process all bytes using 8-bit accesses.  Grossly inefficient,
	so don't feed us an odd address. */

	srli	a10, a4, 1	/* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 6f
#else
	beqz	a10, 6f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
EX(10f)	l8ui	a9, a2, 0
EX(10f)	l8ui	a8, a2, 1
EX(10f)	s8i	a9, a3, 0
EX(10f)	s8i	a8, a3, 1
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* combine into a single 16-bit value */
#else				/* for checksum computation */
	slli	a8, a8, 8
#endif
	or	a9, a9, a8
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	bne	a2, a10, .Loop8	/* exact 2-byte steps; see .Loop5 */
#endif
6:
	j	4b		/* process the possible trailing odd byte */

ENDPROC(csum_partial_copy_generic)
EXPORT_SYMBOL(csum_partial_copy_generic)
351 
352 
/*
 * Exception handler for csum_partial_copy_generic.
 *
 * Label 10 is the target named by every EX(10f)-tagged load and store
 * in that function (EX is defined outside this file; presumably it
 * records the access so that a fault resumes here).  The handler
 * discards the partial sum and returns 0 in a2 on behalf of the
 * interrupted function.
 */
.section .fixup, "ax"
10:
	movi	a2, 0
	abi_ret_default

.previous
360