1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3  * arch/alpha/lib/ev6-memset.S
4  *
5  * This is an efficient (and relatively small) implementation of the C library
6  * "memset()" function for the 21264 implementation of Alpha.
7  *
8  * 21264 version  contributed by Rick Gorton <rick.gorton@alpha-processor.com>
9  *
10  * Much of the information about 21264 scheduling/coding comes from:
11  *	Compiler Writer's Guide for the Alpha 21264
12  *	abbreviated as 'CWG' in other comments here
13  *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
14  * Scheduling notation:
15  *	E	- either cluster
16  *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
17  *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
18  * The algorithm for the leading and trailing quadwords remains the same,
19  * however the loop has been unrolled to enable better memory throughput,
20  * and the code has been replicated for each of the entry points: __memset
21  * and __memset16 to permit better scheduling to eliminate the stalling
22  * encountered during the mask replication.
23  * A future enhancement might be to put in a byte store loop for really
24  * small (say < 32 bytes) memset()s.  Whether or not that change would be
25  * a win in the kernel would depend upon the contextual usage.
26  * WARNING: Maintaining this is going to be more work than the above version,
27  * as fixes will need to be made in multiple places.  The performance gain
28  * is worth it.
29  */
30 #include <asm/export.h>
31 	.set noat
32 	.set noreorder
33 .text
34 	.globl memset
35 	.globl __memset
36 	.globl ___memset
37 	.globl __memset16
38 	.globl __constant_c_memset
39 
40 	.ent ___memset
41 .align 5
42 ___memset:
43 	.frame $30,0,$26,0
44 	.prologue 0
45 
46 	/*
47 	 * Serious stalling happens.  The only way to mitigate this is to
48 	 * undertake a major re-write to interleave the constant materialization
49 	 * with other parts of the fall-through code.  This is important, even
50 	 * though it makes maintenance tougher.
51 	 * Do this later.
52 	 */
53 	and $17,255,$1		# E : 00000000000000ch
54 	insbl $17,1,$2		# U : 000000000000ch00
55 	bis $16,$16,$0		# E : return value
56 	ble $18,end_b		# U : zero length requested?
57 
58 	addq $18,$16,$6		# E : max address to write to
59 	bis	$1,$2,$17	# E : 000000000000chch
60 	insbl	$1,2,$3		# U : 0000000000ch0000
61 	insbl	$1,3,$4		# U : 00000000ch000000
62 
63 	or	$3,$4,$3	# E : 00000000chch0000
64 	inswl	$17,4,$5	# U : 0000chch00000000
65 	xor	$16,$6,$1	# E : will complete write be within one quadword?
66 	inswl	$17,6,$2	# U : chch000000000000
67 
68 	or	$17,$3,$17	# E : 00000000chchchch
69 	or	$2,$5,$2	# E : chchchch00000000
70 	bic	$1,7,$1		# E : fit within a single quadword?
71 	and	$16,7,$3	# E : Target addr misalignment
72 
73 	or	$17,$2,$17	# E : chchchchchchchch
74 	beq	$1,within_quad_b # U :
75 	nop			# E :
76 	beq	$3,aligned_b	# U : target is 0mod8
77 
78 	/*
79 	 * Target address is misaligned, and won't fit within a quadword
80 	 */
81 	ldq_u $4,0($16)		# L : Fetch first partial
82 	bis $16,$16,$5		# E : Save the address
83 	insql $17,$16,$2	# U : Insert new bytes
84 	subq $3,8,$3		# E : Invert (for addressing uses)
85 
86 	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
87 	mskql $4,$16,$4		# U : clear relevant parts of the quad
88 	subq $16,$3,$16		# E : $16 is new aligned destination
89 	bis $2,$4,$1		# E : Final bytes
90 
91 	nop
92 	stq_u $1,0($5)		# L : Store result
93 	nop
94 	nop
95 
96 .align 4
97 aligned_b:
98 	/*
99 	 * We are now guaranteed to be quad aligned, with at least
100 	 * one partial quad to write.
101 	 */
102 
103 	sra $18,3,$3		# U : Number of remaining quads to write
104 	and $18,7,$18		# E : Number of trailing bytes to write
105 	bis $16,$16,$5		# E : Save dest address
106 	beq $3,no_quad_b	# U : tail stuff only
107 
108 	/*
109 	 * it's worth the effort to unroll this and use wh64 if possible
110 	 * Lifted a bunch of code from clear_user.S
111 	 * At this point, entry values are:
112 	 * $16	Current destination address
113 	 * $5	A copy of $16
114 	 * $6	The max quadword address to write to
115 	 * $18	Number trailer bytes
116 	 * $3	Number quads to write
117 	 */
118 
119 	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
120 	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
121 	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
122 	blt	$4, loop_b	# U :
123 
124 	/*
125 	 * We know we've got at least 16 quads, minimum of one trip
126 	 * through unrolled loop.  Do a quad at a time to get us 0mod64
127 	 * aligned.
128 	 */
129 
130 	nop			# E :
131 	nop			# E :
132 	nop			# E :
133 	beq	$1, $bigalign_b	# U :
134 
135 $alignmod64_b:
136 	stq	$17, 0($5)	# L :
137 	subq	$3, 1, $3	# E : For consistency later
138 	addq	$1, 8, $1	# E : Increment towards zero for alignment
139 	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)
140 
141 	nop
142 	nop
143 	addq	$5, 8, $5	# E : Inc address
144 	blt	$1, $alignmod64_b # U :
145 
146 $bigalign_b:
147 	/*
148 	 * $3 - number quads left to go
149 	 * $5 - target address (aligned 0mod64)
150 	 * $17 - mask of stuff to store
151 	 * Scratch registers available: $7, $2, $4, $1
152 	 * we know that we'll be taking a minimum of one trip through
153  	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
154 	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
155 	 * The wh64 is issued on for the starting destination address for trip +2
156 	 * through the loop, and if there are less than two trips left, the target
157 	 * address will be for the current trip.
158 	 */
159 
160 $do_wh64_b:
161 	wh64	($4)		# L1 : memory subsystem write hint
162 	subq	$3, 24, $2	# E : For determining future wh64 addresses
163 	stq	$17, 0($5)	# L :
164 	nop			# E :
165 
166 	addq	$5, 128, $4	# E : speculative target of next wh64
167 	stq	$17, 8($5)	# L :
168 	stq	$17, 16($5)	# L :
169 	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)
170 
171 	stq	$17, 24($5)	# L :
172 	stq	$17, 32($5)	# L :
173 	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
174 	nop
175 
176 	stq	$17, 40($5)	# L :
177 	stq	$17, 48($5)	# L :
178 	subq	$3, 16, $2	# E : Repeat the loop at least once more?
179 	nop
180 
181 	stq	$17, 56($5)	# L :
182 	addq	$5, 64, $5	# E :
183 	subq	$3, 8, $3	# E :
184 	bge	$2, $do_wh64_b	# U :
185 
186 	nop
187 	nop
188 	nop
189 	beq	$3, no_quad_b	# U : Might have finished already
190 
191 .align 4
192 	/*
193 	 * Simple loop for trailing quadwords, or for small amounts
194 	 * of data (where we can't use an unrolled loop and wh64)
195 	 */
196 loop_b:
197 	stq $17,0($5)		# L :
198 	subq $3,1,$3		# E : Decrement number quads left
199 	addq $5,8,$5		# E : Inc address
200 	bne $3,loop_b		# U : more?
201 
202 no_quad_b:
203 	/*
204 	 * Write 0..7 trailing bytes.
205 	 */
206 	nop			# E :
207 	beq $18,end_b		# U : All done?
208 	ldq $7,0($5)		# L :
209 	mskqh $7,$6,$2		# U : Mask final quad
210 
211 	insqh $17,$6,$4		# U : New bits
212 	bis $2,$4,$1		# E : Put it all together
213 	stq $1,0($5)		# L : And back to memory
214 	ret $31,($26),1		# L0 :
215 
216 within_quad_b:
217 	ldq_u $1,0($16)		# L :
218 	insql $17,$16,$2	# U : New bits
219 	mskql $1,$16,$4		# U : Clear old
220 	bis $2,$4,$2		# E : New result
221 
222 	mskql $2,$6,$4		# U :
223 	mskqh $1,$6,$2		# U :
224 	bis $2,$4,$1		# E :
225 	stq_u $1,0($16)		# L :
226 
227 end_b:
228 	nop
229 	nop
230 	nop
231 	ret $31,($26),1		# L0 :
232 	.end ___memset
233 	EXPORT_SYMBOL(___memset)
234 
235 	/*
236 	 * This is the original body of code, prior to replication and
237 	 * rescheduling.  Leave it here, as there may be calls to this
238 	 * entry point.
239 	 */
240 .align 4
241 	.ent __constant_c_memset
242 __constant_c_memset:
243 	.frame $30,0,$26,0
244 	.prologue 0
245 
246 	addq $18,$16,$6		# E : max address to write to
247 	bis $16,$16,$0		# E : return value
248 	xor $16,$6,$1		# E : will complete write be within one quadword?
249 	ble $18,end		# U : zero length requested?
250 
251 	bic $1,7,$1		# E : fit within a single quadword
252 	beq $1,within_one_quad	# U :
253 	and $16,7,$3		# E : Target addr misalignment
254 	beq $3,aligned		# U : target is 0mod8
255 
256 	/*
257 	 * Target address is misaligned, and won't fit within a quadword
258 	 */
259 	ldq_u $4,0($16)		# L : Fetch first partial
260 	bis $16,$16,$5		# E : Save the address
261 	insql $17,$16,$2	# U : Insert new bytes
262 	subq $3,8,$3		# E : Invert (for addressing uses)
263 
264 	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
265 	mskql $4,$16,$4		# U : clear relevant parts of the quad
266 	subq $16,$3,$16		# E : $16 is new aligned destination
267 	bis $2,$4,$1		# E : Final bytes
268 
269 	nop
270 	stq_u $1,0($5)		# L : Store result
271 	nop
272 	nop
273 
274 .align 4
275 aligned:
276 	/*
277 	 * We are now guaranteed to be quad aligned, with at least
278 	 * one partial quad to write.
279 	 */
280 
281 	sra $18,3,$3		# U : Number of remaining quads to write
282 	and $18,7,$18		# E : Number of trailing bytes to write
283 	bis $16,$16,$5		# E : Save dest address
284 	beq $3,no_quad		# U : tail stuff only
285 
286 	/*
287 	 * it's worth the effort to unroll this and use wh64 if possible
288 	 * Lifted a bunch of code from clear_user.S
289 	 * At this point, entry values are:
290 	 * $16	Current destination address
291 	 * $5	A copy of $16
292 	 * $6	The max quadword address to write to
293 	 * $18	Number trailer bytes
294 	 * $3	Number quads to write
295 	 */
296 
297 	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
298 	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
299 	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
300 	blt	$4, loop	# U :
301 
302 	/*
303 	 * We know we've got at least 16 quads, minimum of one trip
304 	 * through unrolled loop.  Do a quad at a time to get us 0mod64
305 	 * aligned.
306 	 */
307 
308 	nop			# E :
309 	nop			# E :
310 	nop			# E :
311 	beq	$1, $bigalign	# U :
312 
313 $alignmod64:
314 	stq	$17, 0($5)	# L :
315 	subq	$3, 1, $3	# E : For consistency later
316 	addq	$1, 8, $1	# E : Increment towards zero for alignment
317 	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)
318 
319 	nop
320 	nop
321 	addq	$5, 8, $5	# E : Inc address
322 	blt	$1, $alignmod64	# U :
323 
324 $bigalign:
325 	/*
326 	 * $3 - number quads left to go
327 	 * $5 - target address (aligned 0mod64)
328 	 * $17 - mask of stuff to store
329 	 * Scratch registers available: $7, $2, $4, $1
330 	 * we know that we'll be taking a minimum of one trip through
331  	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
332 	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
333 	 * The wh64 is issued on for the starting destination address for trip +2
334 	 * through the loop, and if there are less than two trips left, the target
335 	 * address will be for the current trip.
336 	 */
337 
338 $do_wh64:
339 	wh64	($4)		# L1 : memory subsystem write hint
340 	subq	$3, 24, $2	# E : For determining future wh64 addresses
341 	stq	$17, 0($5)	# L :
342 	nop			# E :
343 
344 	addq	$5, 128, $4	# E : speculative target of next wh64
345 	stq	$17, 8($5)	# L :
346 	stq	$17, 16($5)	# L :
347 	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)
348 
349 	stq	$17, 24($5)	# L :
350 	stq	$17, 32($5)	# L :
351 	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
352 	nop
353 
354 	stq	$17, 40($5)	# L :
355 	stq	$17, 48($5)	# L :
356 	subq	$3, 16, $2	# E : Repeat the loop at least once more?
357 	nop
358 
359 	stq	$17, 56($5)	# L :
360 	addq	$5, 64, $5	# E :
361 	subq	$3, 8, $3	# E :
362 	bge	$2, $do_wh64	# U :
363 
364 	nop
365 	nop
366 	nop
367 	beq	$3, no_quad	# U : Might have finished already
368 
369 .align 4
370 	/*
371 	 * Simple loop for trailing quadwords, or for small amounts
372 	 * of data (where we can't use an unrolled loop and wh64)
373 	 */
374 loop:
375 	stq $17,0($5)		# L :
376 	subq $3,1,$3		# E : Decrement number quads left
377 	addq $5,8,$5		# E : Inc address
378 	bne $3,loop		# U : more?
379 
380 no_quad:
381 	/*
382 	 * Write 0..7 trailing bytes.
383 	 */
384 	nop			# E :
385 	beq $18,end		# U : All done?
386 	ldq $7,0($5)		# L :
387 	mskqh $7,$6,$2		# U : Mask final quad
388 
389 	insqh $17,$6,$4		# U : New bits
390 	bis $2,$4,$1		# E : Put it all together
391 	stq $1,0($5)		# L : And back to memory
392 	ret $31,($26),1		# L0 :
393 
394 within_one_quad:
395 	ldq_u $1,0($16)		# L :
396 	insql $17,$16,$2	# U : New bits
397 	mskql $1,$16,$4		# U : Clear old
398 	bis $2,$4,$2		# E : New result
399 
400 	mskql $2,$6,$4		# U :
401 	mskqh $1,$6,$2		# U :
402 	bis $2,$4,$1		# E :
403 	stq_u $1,0($16)		# L :
404 
405 end:
406 	nop
407 	nop
408 	nop
409 	ret $31,($26),1		# L0 :
410 	.end __constant_c_memset
411 	EXPORT_SYMBOL(__constant_c_memset)
412 
413 	/*
414 	 * This is a replicant of the __constant_c_memset code, rescheduled
415 	 * to mask stalls.  Note that entry point names also had to change
416 	 */
417 	.align 5
418 	.ent __memset16
419 
420 __memset16:
421 	.frame $30,0,$26,0
422 	.prologue 0
423 
424 	inswl $17,0,$5		# U : 000000000000c1c2
425 	inswl $17,2,$2		# U : 00000000c1c20000
426 	bis $16,$16,$0		# E : return value
427 	addq	$18,$16,$6	# E : max address to write to
428 
429 	ble $18, end_w		# U : zero length requested?
430 	inswl	$17,4,$3	# U : 0000c1c200000000
431 	inswl	$17,6,$4	# U : c1c2000000000000
432 	xor	$16,$6,$1	# E : will complete write be within one quadword?
433 
434 	or	$2,$5,$2	# E : 00000000c1c2c1c2
435 	or	$3,$4,$17	# E : c1c2c1c200000000
436 	bic	$1,7,$1		# E : fit within a single quadword
437 	and	$16,7,$3	# E : Target addr misalignment
438 
439 	or	$17,$2,$17	# E : c1c2c1c2c1c2c1c2
440 	beq $1,within_quad_w	# U :
441 	nop
442 	beq $3,aligned_w	# U : target is 0mod8
443 
444 	/*
445 	 * Target address is misaligned, and won't fit within a quadword
446 	 */
447 	ldq_u $4,0($16)		# L : Fetch first partial
448 	bis $16,$16,$5		# E : Save the address
449 	insql $17,$16,$2	# U : Insert new bytes
450 	subq $3,8,$3		# E : Invert (for addressing uses)
451 
452 	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
453 	mskql $4,$16,$4		# U : clear relevant parts of the quad
454 	subq $16,$3,$16		# E : $16 is new aligned destination
455 	bis $2,$4,$1		# E : Final bytes
456 
457 	nop
458 	stq_u $1,0($5)		# L : Store result
459 	nop
460 	nop
461 
462 .align 4
463 aligned_w:
464 	/*
465 	 * We are now guaranteed to be quad aligned, with at least
466 	 * one partial quad to write.
467 	 */
468 
469 	sra $18,3,$3		# U : Number of remaining quads to write
470 	and $18,7,$18		# E : Number of trailing bytes to write
471 	bis $16,$16,$5		# E : Save dest address
472 	beq $3,no_quad_w	# U : tail stuff only
473 
474 	/*
475 	 * it's worth the effort to unroll this and use wh64 if possible
476 	 * Lifted a bunch of code from clear_user.S
477 	 * At this point, entry values are:
478 	 * $16	Current destination address
479 	 * $5	A copy of $16
480 	 * $6	The max quadword address to write to
481 	 * $18	Number trailer bytes
482 	 * $3	Number quads to write
483 	 */
484 
485 	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
486 	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
487 	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
488 	blt	$4, loop_w	# U :
489 
490 	/*
491 	 * We know we've got at least 16 quads, minimum of one trip
492 	 * through unrolled loop.  Do a quad at a time to get us 0mod64
493 	 * aligned.
494 	 */
495 
496 	nop			# E :
497 	nop			# E :
498 	nop			# E :
499 	beq	$1, $bigalign_w	# U :
500 
501 $alignmod64_w:
502 	stq	$17, 0($5)	# L :
503 	subq	$3, 1, $3	# E : For consistency later
504 	addq	$1, 8, $1	# E : Increment towards zero for alignment
505 	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)
506 
507 	nop
508 	nop
509 	addq	$5, 8, $5	# E : Inc address
510 	blt	$1, $alignmod64_w	# U :
511 
512 $bigalign_w:
513 	/*
514 	 * $3 - number quads left to go
515 	 * $5 - target address (aligned 0mod64)
516 	 * $17 - mask of stuff to store
517 	 * Scratch registers available: $7, $2, $4, $1
518 	 * we know that we'll be taking a minimum of one trip through
519  	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
520 	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
521 	 * The wh64 is issued on for the starting destination address for trip +2
522 	 * through the loop, and if there are less than two trips left, the target
523 	 * address will be for the current trip.
524 	 */
525 
526 $do_wh64_w:
527 	wh64	($4)		# L1 : memory subsystem write hint
528 	subq	$3, 24, $2	# E : For determining future wh64 addresses
529 	stq	$17, 0($5)	# L :
530 	nop			# E :
531 
532 	addq	$5, 128, $4	# E : speculative target of next wh64
533 	stq	$17, 8($5)	# L :
534 	stq	$17, 16($5)	# L :
535 	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)
536 
537 	stq	$17, 24($5)	# L :
538 	stq	$17, 32($5)	# L :
539 	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
540 	nop
541 
542 	stq	$17, 40($5)	# L :
543 	stq	$17, 48($5)	# L :
544 	subq	$3, 16, $2	# E : Repeat the loop at least once more?
545 	nop
546 
547 	stq	$17, 56($5)	# L :
548 	addq	$5, 64, $5	# E :
549 	subq	$3, 8, $3	# E :
550 	bge	$2, $do_wh64_w	# U :
551 
552 	nop
553 	nop
554 	nop
555 	beq	$3, no_quad_w	# U : Might have finished already
556 
557 .align 4
558 	/*
559 	 * Simple loop for trailing quadwords, or for small amounts
560 	 * of data (where we can't use an unrolled loop and wh64)
561 	 */
562 loop_w:
563 	stq $17,0($5)		# L :
564 	subq $3,1,$3		# E : Decrement number quads left
565 	addq $5,8,$5		# E : Inc address
566 	bne $3,loop_w		# U : more?
567 
568 no_quad_w:
569 	/*
570 	 * Write 0..7 trailing bytes.
571 	 */
572 	nop			# E :
573 	beq $18,end_w		# U : All done?
574 	ldq $7,0($5)		# L :
575 	mskqh $7,$6,$2		# U : Mask final quad
576 
577 	insqh $17,$6,$4		# U : New bits
578 	bis $2,$4,$1		# E : Put it all together
579 	stq $1,0($5)		# L : And back to memory
580 	ret $31,($26),1		# L0 :
581 
582 within_quad_w:
583 	ldq_u $1,0($16)		# L :
584 	insql $17,$16,$2	# U : New bits
585 	mskql $1,$16,$4		# U : Clear old
586 	bis $2,$4,$2		# E : New result
587 
588 	mskql $2,$6,$4		# U :
589 	mskqh $1,$6,$2		# U :
590 	bis $2,$4,$1		# E :
591 	stq_u $1,0($16)		# L :
592 
593 end_w:
594 	nop
595 	nop
596 	nop
597 	ret $31,($26),1		# L0 :
598 
599 	.end __memset16
600 	EXPORT_SYMBOL(__memset16)
601 
602 memset = ___memset
603 __memset = ___memset
604 	EXPORT_SYMBOL(memset)
605 	EXPORT_SYMBOL(__memset)
606