1 /*
2  * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
3  * Copyright (C) 2008-2009 PetaLogix
4  * Copyright (C) 2008 Jim Law - Iris LP  All rights reserved.
5  *
6  * This file is subject to the terms and conditions of the GNU General
7  * Public License.  See the file COPYING in the main directory of this
8  * archive for more details.
9  *
10  * Written by Jim Law <jlaw@irispower.com>
11  *
12  * intended to replace:
13  *	memcpy in memcpy.c and
14  *	memmove in memmove.c
15  * ... in arch/microblaze/lib
16  *
17  *
18  * assly_fastcopy.S
19  *
20  * Attempt at quicker memcpy and memmove for MicroBlaze
21  *	Input :	Operand1 in Reg r5 - destination address
22  *		Operand2 in Reg r6 - source address
23  *		Operand3 in Reg r7 - number of bytes to transfer
 *	Output: Result in Reg r3 - starting destination address
25  *
26  *
27  * Explanation:
28  *	Perform (possibly unaligned) copy of a block of memory
29  *	between mem locations with size of xfer spec'd in bytes
30  */
31 
32 #include <linux/linkage.h>
	.text
	.globl	memcpy
	.type  memcpy, @function
	.ent	memcpy

/*
 * void *memcpy(void *d, const void *s, size_t c)  -- ascending copy
 * In:       r5 = d (destination), r6 = s (source), r7 = c (byte count)
 * Out:      r3 = original destination address
 * Clobbers: r4, r8, r9, r10, r11, r12, flags
 * Note: cmpu rD,rA,rB computes rD = rB - rA with the MSB set iff
 * rB < rA unsigned, so the blti after each cmpu tests an unsigned
 * "count below threshold" condition.
 * fast_memcpy_ascending is also the tail of memmove when d <= s.
 */
memcpy:
fast_memcpy_ascending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4, a_dalign_done
	/* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
	rsubi	r4, r4, 4
	rsub	r7, r4, r7		/* c = c - n adjust c */

a_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4, a_dalign_done
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	addi	r6, r6, 1		/* s++ */
	addi	r5, r5, 1		/* d++ */
	brid	a_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

a_dalign_done:
	/* dest is now word-aligned; try 32-byte block transfers */
	addi	r4, r0, 32		/* n = 32 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, a_block_done

a_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_block_unaligned

a_block_aligned:
	/* both src and dest word-aligned: copy 32 bytes per iteration */
	lwi	r9, r6, 0		/* t1 = *(s + 0) */
	lwi	r10, r6, 4		/* t2 = *(s + 4) */
	lwi	r11, r6, 8		/* t3 = *(s + 8) */
	lwi	r12, r6, 12		/* t4 = *(s + 12) */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	swi	r10, r5, 4		/* *(d + 4) = t2 */
	swi	r11, r5, 8		/* *(d + 8) = t3 */
	swi	r12, r5, 12		/* *(d + 12) = t4 */
	lwi	r9, r6, 16		/* t1 = *(s + 16) */
	lwi	r10, r6, 20		/* t2 = *(s + 20) */
	lwi	r11, r6, 24		/* t3 = *(s + 24) */
	lwi	r12, r6, 28		/* t4 = *(s + 28) */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	swi	r10, r5, 20		/* *(d + 20) = t2 */
	swi	r11, r5, 24		/* *(d + 24) = t3 */
	swi	r12, r5, 28		/* *(d + 28) = t4 */
	addi	r6, r6, 32		/* s = s + 32 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, a_block_aligned	/* while (n) loop */
	addi	r5, r5, 32		/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_unaligned:
	/*
	 * src not word-aligned: read whole words from the aligned
	 * address 'as' and shift/merge adjacent words into each
	 * destination word.  r9 (= s & 3) selects the 1-, 2- or
	 * 3-byte-offset merge loop below.
	 */
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	add	r6, r6, r4		/* s = s + n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9, a_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_block_u2		/* t1 was 2 => 2 byte offset */

a_block_u3:
	bslli	r11, r11, 24	/* h = h << 24 */
a_bu3_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu3_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_u1:
	bslli	r11, r11, 8	/* h = h << 8 */
a_bu1_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu1_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_u2:
	bslli	r11, r11, 16	/* h = h << 16 */
a_bu2_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu2_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */

a_block_done:
	/* fewer than 32 bytes remain; fall back to word transfers */
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */

a_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	addi	r10, r0, 0		/* offset = 0 */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_word_unaligned

a_word_aligned:
	lw	r9, r6, r10		/* t1 = *(s+offset) */
	sw	r9, r5, r10		/* *(d+offset) = t1 */
	addi	r4, r4,-4		/* n = n - 4 */
	bneid	r4, a_word_aligned	/* loop */
	addi	r10, r10, 4		/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_unaligned:
	/* same shift-and-merge technique as a_block_unaligned, one word at a time */
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lwi	r11, r8, 0		/* h = *(as + 0) */
	addi	r8, r8, 4		/* as = as + 4 */

	addi	r9, r9, -1
	beqi	r9, a_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_word_u2		/* t1 was 2 => 2 byte offset */

a_word_u3:
	bslli	r11, r11, 24	/* h = h << 24 */
a_wu3_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	addi	r4, r4,-4	/* n = n - 4 */
	bneid	r4, a_wu3_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u1:
	bslli	r11, r11, 8	/* h = h << 8 */
a_wu1_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	addi	r4, r4,-4	/* n = n - 4 */
	bneid	r4, a_wu1_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u2:
	bslli	r11, r11, 16	/* h = h << 16 */
a_wu2_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	addi	r4, r4,-4	/* n = n - 4 */
	bneid	r4, a_wu2_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

a_word_done:
	add	r5, r5, r10	/* d = d + offset */
	add	r6, r6, r10	/* s = s + offset */
	rsub	r7, r10, r7	/* c = c - offset */

a_xfer_end:
a_xfer_end_loop:
	/* copy the final 0~3 trailing bytes one at a time */
	beqi	r7, a_done		/* while (c) */
	lbui	r9, r6, 0		/* t1 = *s */
	addi	r6, r6, 1		/* s++ */
	sbi	r9, r5, 0		/* *d = t1 */
	addi	r7, r7, -1		/* c-- */
	brid	a_xfer_end_loop		/* loop */
	addi	r5, r5, 1		/* d++ (IN DELAY SLOT) */

a_done:
	rtsd	r15, 8
	nop

.size  memcpy, . - memcpy
.end memcpy
351 /*----------------------------------------------------------------------------*/
	.globl	memmove
	.type  memmove, @function
	.ent	memmove

/*
 * void *memmove(void *d, const void *s, size_t c)
 * In:       r5 = d (destination), r6 = s (source), r7 = c (byte count)
 * Out:      r3 = original destination address
 * Clobbers: r4, r8, r9, r10, r11, r12, flags
 * If s >= d the regions can be copied ascending, so control jumps to
 * memcpy's fast_memcpy_ascending; otherwise copy descending (from the
 * end of both buffers) so an overlapping destination is safe.
 */
memmove:
	cmpu	r4, r5, r6	/* n = s - d */
	bgei	r4,fast_memcpy_ascending

fast_memcpy_descending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	add	r5, r5, r7	/* d = d + c */
	add	r6, r6, r7	/* s = s + c */

	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4,d_xfer_end	/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4,d_dalign_done
	rsub	r7, r4, r7		/* c = c - n adjust c */

d_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4,d_dalign_done
	addi	r6, r6, -1		/* s-- */
	addi	r5, r5, -1		/* d-- */
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	brid	d_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

d_dalign_done:
	/* dest is now word-aligned; try 32-byte block transfers */
	addi	r4, r0, 32	/* n = 32 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, d_block_done

d_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_block_unaligned

d_block_aligned:
	/* both src and dest word-aligned: copy 32 bytes per iteration, descending */
	addi	r6, r6, -32		/* s = s - 32 */
	addi	r5, r5, -32		/* d = d - 32 */
	lwi	r9, r6, 28		/* t1 = *(s + 28) */
	lwi	r10, r6, 24		/* t2 = *(s + 24) */
	lwi	r11, r6, 20		/* t3 = *(s + 20) */
	lwi	r12, r6, 16		/* t4 = *(s + 16) */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	swi	r10, r5, 24		/* *(d + 24) = t2 */
	swi	r11, r5, 20		/* *(d + 20) = t3 */
	swi	r12, r5, 16		/* *(d + 16) = t4 */
	lwi	r9, r6, 12		/* t1 = *(s + 12) */
	lwi	r10, r6, 8		/* t2 = *(s + 8) */
	lwi	r11, r6, 4		/* t3 = *(s + 4) */
	lwi	r12, r6, 0		/* t4 = *(s + 0) */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	swi	r10, r5, 8		/* *(d + 8) = t2 */
	swi	r11, r5, 4		/* *(d + 4) = t3 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, d_block_aligned	/* while (n) loop */
	swi	r12, r5, 0		/* *(d + 0) = t4 (IN DELAY SLOT) */
	bri	d_block_done

d_block_unaligned:
	/*
	 * src not word-aligned: read whole words from the aligned
	 * address 'as' and shift/merge adjacent words into each
	 * destination word, walking downward.  r9 (= s & 3) selects
	 * the 1-, 2- or 3-byte-offset merge loop below.
	 */
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	rsub	r6, r4, r6		/* s = s - n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9,d_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9,d_block_u2		/* t1 was 2 => 2 byte offset */

d_block_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_bu3_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */
	bri	d_block_done

d_block_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_bu1_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */
	bri	d_block_done

d_block_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_bu2_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */

d_block_done:
	/* fewer than 32 bytes remain; fall back to word transfers */
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4,d_xfer_end	/* if n < 0, less than one word to transfer */

d_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	rsub	r5, r4, r5		/* d = d - n */
	rsub	r6, r4, r6		/* s = s - n */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_word_unaligned

d_word_aligned:
	addi	r4, r4,-4		/* n = n - 4 */
	lw	r9, r6, r4		/* t1 = *(s+n) */
	bneid	r4, d_word_aligned	/* loop */
	sw	r9, r5, r4		/* *(d+n) = t1 (IN DELAY SLOT) */

	bri	d_word_done

d_word_unaligned:
	/* same shift-and-merge technique as d_block_unaligned, one word at a time */
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lw	r11, r8, r4		/* h = *(as + n) */

	addi	r9, r9, -1
	beqi	r9,d_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9,d_word_u2		/* t1 was 2 => 2 byte offset */

d_word_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_wu3_loop:
	addi	r4, r4,-4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_wu1_loop:
	addi	r4, r4,-4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_wu2_loop:
	addi	r4, r4,-4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */

d_word_done:

d_xfer_end:
d_xfer_end_loop:
	/* copy the final 0~3 leading bytes one at a time; note this
	 * exits via a_done (memcpy's epilogue above), which is an
	 * identical rtsd/nop sequence to d_done below */
	beqi	r7, a_done		/* while (c) */
	addi	r6, r6, -1		/* s-- */
	lbui	r9, r6, 0		/* t1 = *s */
	addi	r5, r5, -1		/* d-- */
	sbi	r9, r5, 0		/* *d = t1 */
	brid	d_xfer_end_loop		/* loop */
	addi	r7, r7, -1		/* c-- (IN DELAY SLOT) */

d_done:
	rtsd	r15, 8
	nop

.size  memmove, . - memmove
.end memmove
667