/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/export.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * arch/mips/include/asm/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contains the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */
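
/*
 * Example of how the invariants above are used: when a load faults, AT
 * still holds src_entry + len_entry (one byte past the end of the source)
 * and src has only been advanced past bytes that were actually copied, so
 * the l_exc handler below can report an upper bound on the uncopied bytes
 * simply as AT minus the faulting address, with no extra bookkeeping in
 * the fast path.
 */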

#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR	9b, handler;			\
	.previous
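
/*
 * For reference, a use such as
 *	EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
 * expands to roughly
 *	9:	ld	t0, 0(a1)
 *		.section __ex_table,"a"
 *		PTR	9b, l_exc
 *		.previous
 * i.e. the load itself plus an exception table entry that redirects a
 * fault on that load to the l_exc fixup code.
 */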

/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we are sharing code base with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the register definitions from
 * the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST	LOADL
#define STFIRST STORER
#define STREST	STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST	LOADR
#define STFIRST STOREL
#define STREST	STORER
#define SHIFT_DISCARD SRLV
#endif
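
/*
 * LDFIRST/LDREST (and STFIRST/STREST) pair the MIPS unaligned load/store
 * instructions so that one FIRST/REST couple transfers a full doubleword
 * that need not be naturally aligned.  For example, an unaligned source
 * doubleword is fetched with
 *	LDFIRST	t0, FIRST(0)(src)
 *	LDREST	t0, REST(0)(src)
 * where FIRST(0) is offset 0 and REST(0) is offset NBYTES-1, the two
 * offsets that bracket the doubleword.  Which of ldl/ldr supplies which
 * part depends on endianness, hence the two sets of definitions above.
 */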

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)

	.text
	.set	noreorder
	.set	noat

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
	.align	5
LEAF(memcpy)					/* a0=dst a1=src a2=len */
EXPORT_SYMBOL(memcpy)
	move	v0, dst				/* return value */
__memcpy:
FEXPORT(__copy_user)
EXPORT_SYMBOL(__copy_user)
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
	#
	# Octeon doesn't care if the destination is unaligned. The hardware
	# can fix it faster than we can special case the assembly.
	#
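	#
	# Reminder: .set noreorder is in effect, so the instruction written
	# with one extra leading space after each branch sits in the branch
	# delay slot and executes whether or not the branch is taken.  The
	# sltu/bnez cascade below relies on this to start the next length
	# test in the delay slot of the previous branch.
	#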
	pref	0, 0(src)
	sltu	t0, len, NBYTES		# Check if < 1 word
	bnez	t0, copy_bytes_checklen
	 and	t0, src, ADDRMASK	# Check if src unaligned
	bnez	t0, src_unaligned
	 sltu	t0, len, 4*NBYTES	# Check if < 4 words
	bnez	t0, less_than_4units
	 sltu	t0, len, 8*NBYTES	# Check if < 8 words
	bnez	t0, less_than_8units
	 sltu	t0, len, 16*NBYTES	# Check if < 16 words
	bnez	t0, cleanup_both_aligned
	 sltu	t0, len, 128+1		# Check if len < 129
	bnez	t0, 1f			# Skip prefetch if len is too short
	 sltu	t0, len, 256+1		# Check if len < 257
	bnez	t0, 1f			# Skip prefetch if len is too short
	 pref	0, 128(src)		# We must not prefetch invalid addresses
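	#
	# Main loop: each pass through 1: below copies 16*NBYTES (128) bytes,
	# with loads and stores interleaved four at a time so the stores of
	# one group overlap the loads of the next.  src and dst are advanced
	# in the middle of the body, so the second half uses negative UNIT()
	# offsets.  The pref at 2: keeps the prefetch stream 256 bytes ahead
	# for as long as len says that is safe.
	#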
	#
	# This is where we loop if there are more than 128 bytes left
2:	pref	0, 256(src)		# We must not prefetch invalid addresses
	#
	# This is where we loop if we can't prefetch anymore
1:
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 16*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p16u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p15u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p14u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p13u)
EXC(	LOAD	t0, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst),	s_exc_p12u)
EXC(	STORE	t1, UNIT(5)(dst),	s_exc_p11u)
EXC(	STORE	t2, UNIT(6)(dst),	s_exc_p10u)
	ADD	src, src, 16*NBYTES
EXC(	STORE	t3, UNIT(7)(dst),	s_exc_p9u)
	ADD	dst, dst, 16*NBYTES
EXC(	LOAD	t0, UNIT(-8)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t1, UNIT(-7)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t2, UNIT(-6)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t3, UNIT(-5)(src),	l_exc_copy_rewind16)
EXC(	STORE	t0, UNIT(-8)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(-7)(dst),	s_exc_p7u)
EXC(	STORE	t2, UNIT(-6)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(-5)(dst),	s_exc_p5u)
EXC(	LOAD	t0, UNIT(-4)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t1, UNIT(-3)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t2, UNIT(-2)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t3, UNIT(-1)(src),	l_exc_copy_rewind16)
EXC(	STORE	t0, UNIT(-4)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(-3)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(-2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(-1)(dst),	s_exc_p1u)
	sltu	t0, len, 256+1		# See if we can prefetch more
	beqz	t0, 2b
	 sltu	t0, len, 128		# See if we can loop more times
	beqz	t0, 1b
	 nop
	#
	# Jump here if there are less than 16*NBYTES left.
	#
cleanup_both_aligned:
	beqz	len, done
	 sltu	t0, len, 8*NBYTES
	bnez	t0, less_than_8units
	 nop
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 8*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p7u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p5u)
EXC(	LOAD	t0, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(5)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(6)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(7)(dst),	s_exc_p1u)
	ADD	src, src, 8*NBYTES
	beqz	len, done
	 ADD	dst, dst, 8*NBYTES
	#
	# Jump here if there are less than 8*NBYTES left.
	#
less_than_8units:
	sltu	t0, len, 4*NBYTES
	bnez	t0, less_than_4units
	 nop
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	ADD	src, src, 4*NBYTES
	beqz	len, done
	 ADD	dst, dst, 4*NBYTES
	#
	# Jump here if there are less than 4*NBYTES left. This means
	# we may need to copy up to 3 NBYTES words.
	#
less_than_4units:
	sltu	t0, len, 1*NBYTES
	bnez	t0, copy_bytes_checklen
	 nop
	#
	# 1) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, 8
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	 ADD	dst, dst, NBYTES
	#
	# 2) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, 8
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	 ADD	dst, dst, NBYTES
	#
	# 3) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	ADD	src, src, NBYTES
	ADD	dst, dst, NBYTES
	b copy_bytes_checklen
EXC(	 STORE	t0, -8(dst),		s_exc_p1u)

src_unaligned:
#define rem t8
	SRL	t0, len, LOG_NBYTES+2	 # +2 for 4 units/iter
	beqz	t0, cleanup_src_unaligned
	 and	rem, len, (4*NBYTES-1)	 # rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(	LDFIRST t0, FIRST(0)(src),	l_exc)
EXC(	LDFIRST t1, FIRST(1)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	l_exc_copy)
EXC(	LDFIRST t2, FIRST(2)(src),	l_exc_copy)
EXC(	LDFIRST t3, FIRST(3)(src),	l_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	l_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	l_exc_copy)
	ADD	src, src, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	bne	len, rem, 1b
	 ADD	dst, dst, 4*NBYTES

cleanup_src_unaligned:
	beqz	len, done
	 and	rem, len, NBYTES-1  # rem = len % NBYTES
	beq	rem, len, copy_bytes
	 nop
1:
EXC(	LDFIRST t0, FIRST(0)(src),	l_exc)
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bne	len, rem, 1b
	 ADD	dst, dst, NBYTES

copy_bytes_checklen:
	beqz	len, done
	 nop
copy_bytes:
	/* 0 < len < NBYTES  */
#define COPY_BYTE(N)			\
EXC(	lb	t0, N(src), l_exc);	\
	SUB	len, len, 1;		\
	beqz	len, done;		\
EXC(	 sb	t0, N(dst), s_exc_p1)
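
/*
 * Note: with .set noreorder the sb above sits in the delay slot of the
 * beqz, so the current byte is still stored before the branch to done
 * returns.  COPY_BYTE is expanded NBYTES-2 times below and the final
 * byte of the tail (which is at most NBYTES-1 bytes long) is copied
 * explicitly after that.
 */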

	COPY_BYTE(0)
	COPY_BYTE(1)
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
EXC(	lb	t0, NBYTES-2(src), l_exc)
	SUB	len, len, 1
	jr	ra
EXC(	 sb	t0, NBYTES-2(dst), s_exc_p1)
done:
	jr	ra
	 nop
	END(memcpy)

l_exc_copy_rewind16:
	/* Rewind src and dst by 16*NBYTES for l_exc_copy */
	SUB	src, src, 16*NBYTES
	SUB	dst, dst, 16*NBYTES
l_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lb	t1, 0(src),	l_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	bne	src, t0, 1b
	 ADD	dst, dst, 1
l_exc:
	LOAD	t0, TI_TASK($28)
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	SUB	len, AT, t0		# len is the number of uncopied bytes
	jr	ra
	 nop
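
/*
 * Worked example for l_exc: AT was set up to src_entry + len_entry (one
 * byte past the end of the source) and t0 is the first source address
 * that could not be read, so AT - t0 is the number of source bytes from
 * the fault to the end of the buffer, i.e. the upper bound on uncopied
 * bytes that __copy_user reports in len.
 */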


#define SEXC(n)				\
s_exc_p ## n ## u:			\
	jr	ra;			\
	 ADD	len, len, n*NBYTES
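
/*
 * s_exc_pNu is the fixup when a store faults after len has already been
 * decremented for the whole group of units: adding back N*NBYTES, the
 * number of units from the faulting store to the end of the group,
 * restores len to the count of uncopied bytes before returning.
 */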

SEXC(16)
SEXC(15)
SEXC(14)
SEXC(13)
SEXC(12)
SEXC(11)
SEXC(10)
SEXC(9)
SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

s_exc_p1:
	jr	ra
	 ADD	len, len, 1
s_exc:
	jr	ra
	 nop

	.align	5
LEAF(memmove)
EXPORT_SYMBOL(memmove)
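	#
	# Overlap test: the two sltu results below are non-zero when
	# src < dst + len and dst < src + len respectively.  Only when both
	# hold do the regions overlap; otherwise we tail-call __memcpy.
	#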
	ADD	t0, a0, a2
	ADD	t1, a1, a2
	sltu	t0, a1, t0			# dst + len <= src -> memcpy
	sltu	t1, a0, t1			# dst >= src + len -> memcpy
	and	t0, t1
	beqz	t0, __memcpy
	 move	v0, a0				/* return value */
	beqz	a2, r_out
	END(memmove)

	/* fall through to __rmemcpy */
LEAF(__rmemcpy)					/* a0=dst a1=src a2=len */
	 sltu	t0, a1, a0
	beqz	t0, r_end_bytes_up		# src >= dst
	 nop
	ADD	a0, a2				# dst = dst + len
	ADD	a1, a2				# src = src + len
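	#
	# Copy backwards, one byte at a time, starting from the end of both
	# buffers; this is the safe direction when dst lies above src in an
	# overlapping region.
	#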

r_end_bytes:
	lb	t0, -1(a1)
	SUB	a2, a2, 0x1
	sb	t0, -1(a0)
	SUB	a1, a1, 0x1
	bnez	a2, r_end_bytes
	 SUB	a0, a0, 0x1

r_out:
	jr	ra
	 move	a2, zero

r_end_bytes_up:
	lb	t0, (a1)
	SUB	a2, a2, 0x1
	sb	t0, (a0)
	ADD	a1, a1, 0x1
	bnez	a2, r_end_bytes_up
	 ADD	a0, a0, 0x1

	jr	ra
	 move	a2, zero
	END(__rmemcpy)