/* SPDX-License-Identifier: GPL-2.0 */
/* NGmemcpy.S: Niagara optimized memcpy.
 *
 * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net)
 */

#ifdef __KERNEL__
#include <linux/linkage.h>
#include <asm/asi.h>
#include <asm/thread_info.h>
#define GLOBAL_SPARE	%g7
#define RESTORE_ASI(TMP)	\
	ldub	[%g6 + TI_CURRENT_DS], TMP;  \
	wr	TMP, 0x0, %asi;
#else
#define GLOBAL_SPARE	%g5
#define RESTORE_ASI(TMP)	\
	wr	%g0, ASI_PNF, %asi
#endif
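/* In-kernel builds restore %asi from the current thread's address
 * space setting (%g6 is current_thread_info), while the userland
 * build simply resets %asi to ASI_PNF (primary, no-fault).
 */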

#ifdef __sparc_v9__
#define SAVE_AMOUNT	128
#else
#define SAVE_AMOUNT	64
#endif

#ifndef STORE_ASI
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#endif
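/* The block-init ASI lets stxa allocate an L2 line without first
 * fetching its old contents from memory; see the comment ahead of
 * the block copy loops below for the whole-cacheline rule this
 * imposes on the stores.
 */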

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
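/* For plain memcpy, EX_LD/EX_ST expand to the bare instruction.
 * The copy_{to,from}_user wrappers redefine them to emit an
 * exception table entry, so a fault unwinds through the named
 * NG_ret_* stub, which computes the bytes-not-copied return value.
 */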

#ifndef LOAD
#ifndef MEMCPY_DEBUG
#define LOAD(type,addr,dest)	type [addr], dest
#else
#define LOAD(type,addr,dest)	type##a [addr] 0x80, dest
#endif
#endif

#ifndef LOAD_TWIN
#define LOAD_TWIN(addr_reg,dest0,dest1)	\
	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
#endif
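/* A quad-LDD ldda loads 16 bytes into an even/odd register pair;
 * dest1, the odd register of the pair, is written implicitly, which
 * is why it does not appear in the expansion.
 */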

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef STORE_INIT
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_INIT(src,addr)	stxa src, [addr] %asi
#else
#define STORE_INIT(src,addr)	stx src, [addr + 0x00]
#endif
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	NGmemcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif
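/* XCC selects the condition codes used for the length tests; it is
 * overridable, presumably so a 32-bit build can substitute icc.
 */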

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
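/* Fault return stubs: each one folds the state at its fault site
 * into a bytes-remaining count in %i0, exactly as its name says
 * (e.g. NG_ret_i2_plus_g1_minus_8 returns %i2 + %g1 - 8), then
 * tails into __restore_asi to put %asi back and return.
 */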
__restore_asi:
	wr	%g0, ASI_AIUS, %asi
	ret
	 restore
ENTRY(NG_ret_i2_plus_i4_plus_1)
	add	%i4, 1, %i4
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4_plus_1)
ENTRY(NG_ret_i2_plus_g1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1)
ENTRY(NG_ret_i2_plus_g1_minus_8)
	sub	%g1, 8, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_8)
ENTRY(NG_ret_i2_plus_g1_minus_16)
	sub	%g1, 16, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_16)
ENTRY(NG_ret_i2_plus_g1_minus_24)
	sub	%g1, 24, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_24)
ENTRY(NG_ret_i2_plus_g1_minus_32)
	sub	%g1, 32, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_32)
ENTRY(NG_ret_i2_plus_g1_minus_40)
	sub	%g1, 40, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_40)
ENTRY(NG_ret_i2_plus_g1_minus_48)
	sub	%g1, 48, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_48)
ENTRY(NG_ret_i2_plus_g1_minus_56)
	sub	%g1, 56, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_56)
ENTRY(NG_ret_i2_plus_i4)
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4)
ENTRY(NG_ret_i2_plus_i4_minus_8)
	sub	%i4, 8, %i4
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4_minus_8)
ENTRY(NG_ret_i2_plus_8)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 8, %i0
ENDPROC(NG_ret_i2_plus_8)
ENTRY(NG_ret_i2_plus_4)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 4, %i0
ENDPROC(NG_ret_i2_plus_4)
ENTRY(NG_ret_i2_plus_1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 1, %i0
ENDPROC(NG_ret_i2_plus_1)
ENTRY(NG_ret_i2_plus_g1_plus_1)
	add	%g1, 1, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_plus_1)
ENTRY(NG_ret_i2)
	ba,pt	%xcc, __restore_asi
	 mov	%i2, %i0
ENDPROC(NG_ret_i2)
ENTRY(NG_ret_i2_and_7_plus_i4)
	and	%i2, 7, %i2
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_and_7_plus_i4)
#endif

	.align		64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %i0=dst, %i1=src, %i2=len */
	PREAMBLE
	save		%sp, -SAVE_AMOUNT, %sp
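	/* Trap (software trap 5) on absurd lengths: any bit at or
	 * above bit 31 set in %i2 makes the srlx result non-zero.
	 */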
	srlx		%i2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
	mov		%i0, %o0
	cmp		%i2, 0
	be,pn		%XCC, 85f
	 or		%o0, %i1, %i3
	cmp		%i2, 16
	blu,a,pn	%XCC, 80f
	 or		%i3, %i2, %i3

	/* 2 blocks (128 bytes) is the minimum we can do the block
	 * copy with.  We need to ensure that we'll iterate at least
	 * once in the block copy loop.  At worst we'll need to align
	 * the destination to a 64-byte boundary which can chew up
	 * to (64 - 1) bytes from the length before we perform the
	 * block copy loop.
	 */
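	/* Worked out: with len >= 128, alignment eats at most 63
	 * bytes, leaving at least 65, so the 64-bytes-per-iteration
	 * block loop below always runs at least once.
	 */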
	cmp		%i2, (2 * 64)
	blu,pt		%XCC, 70f
	 andcc		%i3, 0x7, %g0

	/* %o0:	dst
	 * %i1:	src
	 * %i2:	len  (known to be >= 128)
	 *
	 * The block copy loops will use %i4/%i5,%g2/%g3 as
	 * temporaries while copying the data.
	 */

	LOAD(prefetch, %i1, #one_read)
	wr		%g0, STORE_ASI, %asi

	/* Align destination on 64-byte boundary.  */
	andcc		%o0, (64 - 1), %i4
	be,pt		%XCC, 2f
	 sub		%i4, 64, %i4
	sub		%g0, %i4, %i4	! bytes to align dst
	sub		%i2, %i4, %i2
1:	subcc		%i4, 1, %i4
	EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_i4_plus_1)
	EX_ST(STORE(stb, %g1, %o0), NG_ret_i2_plus_i4_plus_1)
	add		%i1, 1, %i1
	bne,pt		%XCC, 1b
	 add		%o0, 1, %o0
	/* If the source is on a 16-byte boundary we can do
	 * the direct block copy loop.  If it is 8-byte aligned
	 * we can do the 16-byte loads offset by -8 bytes and the
	 * init stores offset by one register.
	 *
	 * If the source is not even 8-byte aligned, we need to do
	 * shifting and masking (basically integer faligndata).
	 *
	 * The careful bit with init stores is that if we store
	 * to any part of the cache line, we have to store the whole
	 * cache line, else we can end up with corrupt L2 cache line
	 * contents.  Since the loop works on 64 bytes of 64-byte
	 * aligned store data at a time, this is easy to ensure.
	 */
2:
	andcc		%i1, (16 - 1), %i4
	andn		%i2, (64 - 1), %g1	! block copy loop iterator
	be,pt		%XCC, 50f
	 sub		%i2, %g1, %i2		! final sub-block copy bytes

	cmp		%i4, 8
	be,pt		%XCC, 10f
	 sub		%i1, %i4, %i1

	/* Neither 8-byte nor 16-byte aligned, shift and mask.  */
	and		%i4, 0x7, GLOBAL_SPARE
	sll		GLOBAL_SPARE, 3, GLOBAL_SPARE
	mov		64, %i5
	EX_LD(LOAD_TWIN(%i1, %g2, %g3), NG_ret_i2_plus_g1)
	sub		%i5, GLOBAL_SPARE, %i5
	mov		16, %o4
	mov		32, %o5
	mov		48, %o7
	mov		64, %i3

	bg,pn		%XCC, 9f
	 nop

#define MIX_THREE_WORDS(WORD1, WORD2, WORD3, PRE_SHIFT, POST_SHIFT, TMP) \
	sllx		WORD1, POST_SHIFT, WORD1; \
	srlx		WORD2, PRE_SHIFT, TMP; \
	sllx		WORD2, POST_SHIFT, WORD2; \
	or		WORD1, TMP, WORD1; \
	srlx		WORD3, PRE_SHIFT, TMP; \
	or		WORD2, TMP, WORD2;
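/* Integer faligndata: with the source misaligned by "off" bytes,
 * POST_SHIFT is 8*off bits and PRE_SHIFT is 64 - 8*off bits, so e.g.
 * off = 2 gives WORD1 = (WORD1 << 16) | (WORD2 >> 48).  Three input
 * words yield two realigned output words per invocation.
 */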

8:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g2, %o0 + 0x00), NG_ret_i2_plus_g1)
	EX_ST(STORE_INIT(%g3, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g2, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%g3, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 8b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, %i4, %i1

9:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g3, %o0 + 0x00), NG_ret_i2_plus_g1)
	EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%g2, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g3, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%g2, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 9b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, %i4, %i1

10:	/* Destination is 64-byte aligned; the source was only 8-byte
	 * aligned, so we backed it up by 8 and run one twin load
	 * ahead, then add the 8 back to the source pointer when we
	 * finish the loop.
	 */
	EX_LD(LOAD_TWIN(%i1, %o4, %o5), NG_ret_i2_plus_g1)
	mov	16, %o7
	mov	32, %g2
	mov	48, %g3
	mov	64, %o1
1:	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o5, %o0 + 0x00), NG_ret_i2_plus_g1)	! initializes cache line
	EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o4, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o5, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
	EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o4, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
	subcc		%g1, 64, %g1
	bne,pt		%XCC, 1b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, 0x8, %i1

50:	/* Destination is 64-byte aligned, and source is 16-byte
	 * aligned.
	 */
	mov	16, %o7
	mov	32, %g2
	mov	48, %g3
	mov	64, %o1
1:	EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5), NG_ret_i2_plus_g1)
	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o4, %o0 + 0x00), NG_ret_i2_plus_g1)	! initializes cache line
	EX_ST(STORE_INIT(%o5, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	add	%i1, 64, %i1
	EX_ST(STORE_INIT(%o4, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o5, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
	EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
	subcc	%g1, 64, %g1
	bne,pt	%XCC, 1b
	 add	%o0, 64, %o0
	/* fall through */

60:
	membar		#Sync

	/* %i2 holds any final bytes that still need to be copied.
	 * If anything is left, we copy it one byte at a time.
	 */
	RESTORE_ASI(%i3)
	brz,pt		%i2, 85f
	 sub		%o0, %i1, %i3
	ba,a,pt		%XCC, 90f
	 nop

	.align		64
70: /* 16 <= len < 128 */
	bne,pn		%XCC, 75f
	 sub		%o0, %i1, %i3

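	/* From here on the tail copies index everything off the
	 * source pointer: %i3 = dst - src, so [%i1 + %i3] is always
	 * the matching destination location and only %i1 advances.
	 */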
72:
	andn		%i2, 0xf, %i4
	and		%i2, 0xf, %i2
1:	subcc		%i4, 0x10, %i4
	EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_i4)
	add		%i1, 0x08, %i1
	EX_LD(LOAD(ldx, %i1, %g1), NG_ret_i2_plus_i4)
	sub		%i1, 0x08, %i1
	EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_i4)
	add		%i1, 0x8, %i1
	EX_ST(STORE(stx, %g1, %i1 + %i3), NG_ret_i2_plus_i4_minus_8)
	bgu,pt		%XCC, 1b
	 add		%i1, 0x8, %i1
73:	andcc		%i2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%i2, 0x8, %i2
	EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_8)
	EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_8)
	add		%i1, 0x8, %i1
1:	andcc		%i2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%i2, 0x4, %i2
	EX_LD(LOAD(lduw, %i1, %i5), NG_ret_i2_plus_4)
	EX_ST(STORE(stw, %i5, %i1 + %i3), NG_ret_i2_plus_4)
	add		%i1, 0x4, %i1
1:	cmp		%i2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f
	 nop

75:
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%i2, %g1, %i2

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %i1, %i5), NG_ret_i2_plus_g1_plus_1)
	EX_ST(STORE(stb, %i5, %i1 + %i3), NG_ret_i2_plus_g1_plus_1)
	bgu,pt		%icc, 1b
	 add		%i1, 1, %i1

2:	add		%i1, %i3, %o0
	andcc		%i1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%i2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

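	/* Tail version of the shift-and-mask copy: %g1 holds the
	 * source misalignment in bits, %i3 (reused here) holds 64
	 * minus that, and each 8-byte store merges the carried-over
	 * high part with the freshly loaded low part.
	 */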
8:	mov		64, %i3
	andn		%i1, 0x7, %i1
	EX_LD(LOAD(ldx, %i1, %g2), NG_ret_i2)
	sub		%i3, %g1, %i3
	andn		%i2, 0x7, %i4
	sllx		%g2, %g1, %g2
1:	add		%i1, 0x8, %i1
	EX_LD(LOAD(ldx, %i1, %g3), NG_ret_i2_and_7_plus_i4)
	subcc		%i4, 0x8, %i4
	srlx		%g3, %i3, %i5
	or		%i5, %g2, %i5
	EX_ST(STORE(stx, %i5, %o0), NG_ret_i2_and_7_plus_i4)
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1
	andcc		%i2, 0x7, %i2
	be,pn		%icc, 85f
	 add		%i1, %g1, %i1
	ba,pt		%xcc, 90f
	 sub		%o0, %i1, %i3

	.align		64
80: /* 0 < len < 16 */
	andcc		%i3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %i1, %i3

1:
	subcc		%i2, 4, %i2
	EX_LD(LOAD(lduw, %i1, %g1), NG_ret_i2_plus_4)
	EX_ST(STORE(stw, %g1, %i1 + %i3), NG_ret_i2_plus_4)
	bgu,pt		%XCC, 1b
	 add		%i1, 4, %i1

85:	ret
	 restore	EX_RETVAL(%i0), %g0, %o0

	.align		32
90:
	subcc		%i2, 1, %i2
	EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_1)
	EX_ST(STORE(stb, %g1, %i1 + %i3), NG_ret_i2_plus_1)
	bgu,pt		%XCC, 90b
	 add		%i1, 1, %i1
	ret
	 restore	EX_RETVAL(%i0), %g0, %o0

	.size		FUNC_NAME, .-FUNC_NAME