/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-copy_page.S
 *
 * Copy an entire page.
 */

/* The following comparison of this routine vs the normal copy_page.S
   was written by an unnamed ev6 hardware designer and forwarded to me
   via Steven Hobbs <hobbs@steven.zko.dec.com>.

   First Problem: STQ overflows.
   -----------------------------

	It would be nice if EV6 handled every resource overflow
	efficiently, but for some it doesn't, including store queue
	overflows: an overflow causes a trap and a restart of the pipe.

	To get around this we sometimes use (to borrow a term from a VSSAD
	researcher) "aeration".  The idea is to slow the rate at which the
	processor receives valid instructions by inserting nops in the fetch
	path.  In doing so, you can prevent the overflow and actually make
	the code run faster.  You can, of course, take advantage of the fact
	that the processor can fetch at most 4 aligned instructions per cycle.

	I inserted enough nops to force it to take 10 cycles to fetch the
	loop code.  In theory, EV6 should be able to execute this loop in
	9 cycles, but I was not able to get it to run that fast -- the
	initial conditions were such that I could not reach this optimum
	rate on (chaotic) EV6.  I wrote the code such that everything would
	issue in order.
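
	As an illustration, here is one aerated fetch quad from the loop
	below -- two real loads padded with two unops, so the aligned
	four-instruction fetch block still costs a full fetch cycle while
	carrying only two instructions of real work:

		ldq	$0,0($17)
		ldq	$1,8($17)
		unop
		unop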

   Second Problem: Dcache index matches.
   -------------------------------------

	If you are going to use this routine on random aligned pages, there
	is a 25% chance that the pages will sit at the same dcache indices:
	EV6's 64KB two-way dcache gives each way room for four 8KB pages,
	so two random pages share an index set one time in four.  Without
	care, this results in many nasty memory traps.

	The solution is to schedule the prefetches to avoid the memory
	conflicts.  I schedule the wh64 prefetches farther ahead of the
	read prefetches to avoid this problem.
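
	Concretely, the loop below keeps the two streams at different
	distances: the read prefetch runs 5 cache lines (320 bytes) ahead
	of the copy, while the wh64 runs 10 lines (640 bytes) ahead, so
	even when the two pages collide in the dcache the write hint and
	the read prefetch of a conflicting index are well separated in
	time.  The two instructions (from different fetch quads below):

		ldl	$31,320($17)	# read stream, 5 lines ahead
		...
		wh64	($19)		# write stream, 10 lines ahead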

   Third Problem: Needs more prefetching.
   --------------------------------------

	In order to improve the code I added deeper prefetching to take the
	most advantage of EV6's bandwidth.

	I also prefetched the read stream.  Note that adding the read
	prefetch forced me to add another cycle to the inner-most kernel --
	up to 11 from the original 8 cycles per iteration.  (The loop below
	is 44 instructions, i.e. 11 aligned fetch quads, hence a minimum of
	11 cycles just to fetch it.)  We could improve performance further
	by unrolling the loop and doing multiple prefetches per cycle.
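
	On Alpha a read prefetch is just an ordinary load whose destination
	is the zero register: the line is pulled into the dcache and the
	register result is discarded.  That is what the "ldl $31,..."
	instructions below are:

		ldl	$31,320($17)	# prefetch the source line 5 ahead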

   I think that the code below will be very robust and fast for the
   purposes of copying aligned pages.  It is slower when both source and
   destination pages are in the dcache, but it is my guess that this is
   less important than the dcache miss case.  */
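
/* For reference, a minimal C sketch of what this routine implements,
   assuming the usual Alpha PAGE_SIZE of 8KB ($16 carries "to" and
   $17 carries "from" per the calling convention):

	void copy_page(void *to, void *from)
	{
		unsigned long *d = to;
		unsigned long *s = from;
		int i;

		for (i = 0; i < 1024; i++)	// 8192 bytes / 8 per ldq/stq
			d[i] = s[i];
	}
*/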

#include <asm/export.h>
	.text
	.align 4
	.global copy_page
	.ent copy_page
copy_page:
	.prologue 0
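	/* On entry, per the Alpha calling convention:
	     $16 = "to", the destination page
	     $17 = "from", the source page
	   $18 will count loop iterations; $19 will track the next
	   destination cache line to write-hint.  */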

	/* Prefetch 5 read cachelines; write-hint 10 cache lines.  */
	wh64	($16)
	ldl	$31,0($17)
	ldl	$31,64($17)
	lda	$1,1*64($16)

	wh64	($1)
	ldl	$31,128($17)
	ldl	$31,192($17)
	lda	$1,2*64($16)

	wh64	($1)
	ldl	$31,256($17)
	lda	$18,118		/* 128 lines/page - 10 done in cleanup */
	lda	$1,3*64($16)

	wh64	($1)
	nop
	lda	$1,4*64($16)
	lda	$2,5*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,6*64($16)
	lda	$2,7*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,8*64($16)
	lda	$2,9*64($16)

	wh64	($1)
	wh64	($2)
	lda	$19,10*64($16)	/* next line to write-hint */
	nop

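	/* Setup is complete: source lines 0-4 are prefetched, destination
	   lines 0-9 are write-hinted, $18 holds the 118 main-loop
	   iterations, and $19 points 10 lines past the destination.  */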
	/* Main prefetching/write-hinting loop.  */
1:	ldq	$0,0($17)
	ldq	$1,8($17)
	unop
	unop

	unop
	unop
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	unop
	unop

	unop
	unop
	ldq	$6,48($17)
	ldq	$7,56($17)

	ldl	$31,320($17)	/* read prefetch, 5 lines ahead */
	unop
	unop
	unop

	/* This gives the extra cycle of aeration above the minimum.  */
	unop
	unop
	unop
	unop

	wh64	($19)		/* write-hint, 10 lines ahead */
	unop
	unop
	unop

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	unop

	unop
	stq	$2,16($16)
	addq	$17,64,$17
	stq	$3,24($16)

	stq	$4,32($16)
	stq	$5,40($16)
	addq	$19,64,$19
	unop

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 1b

	/* Prefetch the final 5 cache lines of the read stream; the main
	   loop's read prefetches ran only 5 lines ahead, so these are
	   the lines it never reached.  */
	lda	$18,10		/* 10 cache lines remain */
	ldl	$31,320($17)
	ldl	$31,384($17)
	ldl	$31,448($17)

	ldl	$31,512($17)
	ldl	$31,576($17)
	nop
	nop

	/* Non-prefetching, non-write-hinting cleanup loop for the
	   final 10 cache lines; everything has already been prefetched
	   and write-hinted above.  */
2:	ldq	$0,0($17)
	ldq	$1,8($17)
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	ldq	$6,48($17)
	ldq	$7,56($17)

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	addq	$17,64,$17

	stq	$2,16($16)
	stq	$3,24($16)
	stq	$4,32($16)
	stq	$5,40($16)

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 2b

	ret
	nop			/* pad out the final fetch quad */
	unop
	nop

	.end copy_page
	EXPORT_SYMBOL(copy_page)