1// SPDX-License-Identifier: GPL-2.0 2/* 3 * MMX 3DNow! library helper functions 4 * 5 * To do: 6 * We can use MMX just for prefetch in IRQ's. This may be a win. 7 * (reported so on K6-III) 8 * We should use a better code neutral filler for the short jump 9 * leal ebx. [ebx] is apparently best for K6-2, but Cyrix ?? 10 * We also want to clobber the filler register so we don't get any 11 * register forwarding stalls on the filler. 12 * 13 * Add *user handling. Checksums are not a win with MMX on any CPU 14 * tested so far for any MMX solution figured. 15 * 16 * 22/09/2000 - Arjan van de Ven 17 * Improved for non-egineering-sample Athlons 18 * 19 */ 20#include <linux/hardirq.h> 21#include <linux/string.h> 22#include <linux/export.h> 23#include <linux/sched.h> 24#include <linux/types.h> 25 26#include <asm/fpu/api.h> 27#include <asm/asm.h> 28 29/* 30 * Use KFPU_387. MMX instructions are not affected by MXCSR, 31 * but both AMD and Intel documentation states that even integer MMX 32 * operations will result in #MF if an exception is pending in FCW. 33 * 34 * EMMS is not needed afterwards because, after calling kernel_fpu_end(), 35 * any subsequent user of the 387 stack will reinitialize it using 36 * KFPU_387. 37 */ 38 39void *_mmx_memcpy(void *to, const void *from, size_t len) 40{ 41 void *p; 42 int i; 43 44 if (unlikely(in_interrupt())) 45 return __memcpy(to, from, len); 46 47 p = to; 48 i = len >> 6; /* len/64 */ 49 50 kernel_fpu_begin_mask(KFPU_387); 51 52 __asm__ __volatile__ ( 53 "1: prefetch (%0)\n" /* This set is 28 bytes */ 54 " prefetch 64(%0)\n" 55 " prefetch 128(%0)\n" 56 " prefetch 192(%0)\n" 57 " prefetch 256(%0)\n" 58 "2: \n" 59 ".section .fixup, \"ax\"\n" 60 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ 61 " jmp 2b\n" 62 ".previous\n" 63 _ASM_EXTABLE(1b, 3b) 64 : : "r" (from)); 65 66 for ( ; i > 5; i--) { 67 __asm__ __volatile__ ( 68 "1: prefetch 320(%0)\n" 69 "2: movq (%0), %%mm0\n" 70 " movq 8(%0), %%mm1\n" 71 " movq 16(%0), %%mm2\n" 72 " movq 24(%0), %%mm3\n" 73 " movq %%mm0, (%1)\n" 74 " movq %%mm1, 8(%1)\n" 75 " movq %%mm2, 16(%1)\n" 76 " movq %%mm3, 24(%1)\n" 77 " movq 32(%0), %%mm0\n" 78 " movq 40(%0), %%mm1\n" 79 " movq 48(%0), %%mm2\n" 80 " movq 56(%0), %%mm3\n" 81 " movq %%mm0, 32(%1)\n" 82 " movq %%mm1, 40(%1)\n" 83 " movq %%mm2, 48(%1)\n" 84 " movq %%mm3, 56(%1)\n" 85 ".section .fixup, \"ax\"\n" 86 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ 87 " jmp 2b\n" 88 ".previous\n" 89 _ASM_EXTABLE(1b, 3b) 90 : : "r" (from), "r" (to) : "memory"); 91 92 from += 64; 93 to += 64; 94 } 95 96 for ( ; i > 0; i--) { 97 __asm__ __volatile__ ( 98 " movq (%0), %%mm0\n" 99 " movq 8(%0), %%mm1\n" 100 " movq 16(%0), %%mm2\n" 101 " movq 24(%0), %%mm3\n" 102 " movq %%mm0, (%1)\n" 103 " movq %%mm1, 8(%1)\n" 104 " movq %%mm2, 16(%1)\n" 105 " movq %%mm3, 24(%1)\n" 106 " movq 32(%0), %%mm0\n" 107 " movq 40(%0), %%mm1\n" 108 " movq 48(%0), %%mm2\n" 109 " movq 56(%0), %%mm3\n" 110 " movq %%mm0, 32(%1)\n" 111 " movq %%mm1, 40(%1)\n" 112 " movq %%mm2, 48(%1)\n" 113 " movq %%mm3, 56(%1)\n" 114 : : "r" (from), "r" (to) : "memory"); 115 116 from += 64; 117 to += 64; 118 } 119 /* 120 * Now do the tail of the block: 121 */ 122 __memcpy(to, from, len & 63); 123 kernel_fpu_end(); 124 125 return p; 126} 127EXPORT_SYMBOL(_mmx_memcpy); 128 129#ifdef CONFIG_MK7 130 131/* 132 * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and 133 * other MMX using processors do not. 134 */ 135 136static void fast_clear_page(void *page) 137{ 138 int i; 139 140 kernel_fpu_begin_mask(KFPU_387); 141 142 __asm__ __volatile__ ( 143 " pxor %%mm0, %%mm0\n" : : 144 ); 145 146 for (i = 0; i < 4096/64; i++) { 147 __asm__ __volatile__ ( 148 " movntq %%mm0, (%0)\n" 149 " movntq %%mm0, 8(%0)\n" 150 " movntq %%mm0, 16(%0)\n" 151 " movntq %%mm0, 24(%0)\n" 152 " movntq %%mm0, 32(%0)\n" 153 " movntq %%mm0, 40(%0)\n" 154 " movntq %%mm0, 48(%0)\n" 155 " movntq %%mm0, 56(%0)\n" 156 : : "r" (page) : "memory"); 157 page += 64; 158 } 159 160 /* 161 * Since movntq is weakly-ordered, a "sfence" is needed to become 162 * ordered again: 163 */ 164 __asm__ __volatile__("sfence\n"::); 165 166 kernel_fpu_end(); 167} 168 169static void fast_copy_page(void *to, void *from) 170{ 171 int i; 172 173 kernel_fpu_begin_mask(KFPU_387); 174 175 /* 176 * maybe the prefetch stuff can go before the expensive fnsave... 177 * but that is for later. -AV 178 */ 179 __asm__ __volatile__( 180 "1: prefetch (%0)\n" 181 " prefetch 64(%0)\n" 182 " prefetch 128(%0)\n" 183 " prefetch 192(%0)\n" 184 " prefetch 256(%0)\n" 185 "2: \n" 186 ".section .fixup, \"ax\"\n" 187 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ 188 " jmp 2b\n" 189 ".previous\n" 190 _ASM_EXTABLE(1b, 3b) : : "r" (from)); 191 192 for (i = 0; i < (4096-320)/64; i++) { 193 __asm__ __volatile__ ( 194 "1: prefetch 320(%0)\n" 195 "2: movq (%0), %%mm0\n" 196 " movntq %%mm0, (%1)\n" 197 " movq 8(%0), %%mm1\n" 198 " movntq %%mm1, 8(%1)\n" 199 " movq 16(%0), %%mm2\n" 200 " movntq %%mm2, 16(%1)\n" 201 " movq 24(%0), %%mm3\n" 202 " movntq %%mm3, 24(%1)\n" 203 " movq 32(%0), %%mm4\n" 204 " movntq %%mm4, 32(%1)\n" 205 " movq 40(%0), %%mm5\n" 206 " movntq %%mm5, 40(%1)\n" 207 " movq 48(%0), %%mm6\n" 208 " movntq %%mm6, 48(%1)\n" 209 " movq 56(%0), %%mm7\n" 210 " movntq %%mm7, 56(%1)\n" 211 ".section .fixup, \"ax\"\n" 212 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ 213 " jmp 2b\n" 214 ".previous\n" 215 _ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory"); 216 217 from += 64; 218 to += 64; 219 } 220 221 for (i = (4096-320)/64; i < 4096/64; i++) { 222 __asm__ __volatile__ ( 223 "2: movq (%0), %%mm0\n" 224 " movntq %%mm0, (%1)\n" 225 " movq 8(%0), %%mm1\n" 226 " movntq %%mm1, 8(%1)\n" 227 " movq 16(%0), %%mm2\n" 228 " movntq %%mm2, 16(%1)\n" 229 " movq 24(%0), %%mm3\n" 230 " movntq %%mm3, 24(%1)\n" 231 " movq 32(%0), %%mm4\n" 232 " movntq %%mm4, 32(%1)\n" 233 " movq 40(%0), %%mm5\n" 234 " movntq %%mm5, 40(%1)\n" 235 " movq 48(%0), %%mm6\n" 236 " movntq %%mm6, 48(%1)\n" 237 " movq 56(%0), %%mm7\n" 238 " movntq %%mm7, 56(%1)\n" 239 : : "r" (from), "r" (to) : "memory"); 240 from += 64; 241 to += 64; 242 } 243 /* 244 * Since movntq is weakly-ordered, a "sfence" is needed to become 245 * ordered again: 246 */ 247 __asm__ __volatile__("sfence \n"::); 248 kernel_fpu_end(); 249} 250 251#else /* CONFIG_MK7 */ 252 253/* 254 * Generic MMX implementation without K7 specific streaming 255 */ 256static void fast_clear_page(void *page) 257{ 258 int i; 259 260 kernel_fpu_begin_mask(KFPU_387); 261 262 __asm__ __volatile__ ( 263 " pxor %%mm0, %%mm0\n" : : 264 ); 265 266 for (i = 0; i < 4096/128; i++) { 267 __asm__ __volatile__ ( 268 " movq %%mm0, (%0)\n" 269 " movq %%mm0, 8(%0)\n" 270 " movq %%mm0, 16(%0)\n" 271 " movq %%mm0, 24(%0)\n" 272 " movq %%mm0, 32(%0)\n" 273 " movq %%mm0, 40(%0)\n" 274 " movq %%mm0, 48(%0)\n" 275 " movq %%mm0, 56(%0)\n" 276 " movq %%mm0, 64(%0)\n" 277 " movq %%mm0, 72(%0)\n" 278 " movq %%mm0, 80(%0)\n" 279 " movq %%mm0, 88(%0)\n" 280 " movq %%mm0, 96(%0)\n" 281 " movq %%mm0, 104(%0)\n" 282 " movq %%mm0, 112(%0)\n" 283 " movq %%mm0, 120(%0)\n" 284 : : "r" (page) : "memory"); 285 page += 128; 286 } 287 288 kernel_fpu_end(); 289} 290 291static void fast_copy_page(void *to, void *from) 292{ 293 int i; 294 295 kernel_fpu_begin_mask(KFPU_387); 296 297 __asm__ __volatile__ ( 298 "1: prefetch (%0)\n" 299 " prefetch 64(%0)\n" 300 " prefetch 128(%0)\n" 301 " prefetch 192(%0)\n" 302 " prefetch 256(%0)\n" 303 "2: \n" 304 ".section .fixup, \"ax\"\n" 305 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ 306 " jmp 2b\n" 307 ".previous\n" 308 _ASM_EXTABLE(1b, 3b) : : "r" (from)); 309 310 for (i = 0; i < 4096/64; i++) { 311 __asm__ __volatile__ ( 312 "1: prefetch 320(%0)\n" 313 "2: movq (%0), %%mm0\n" 314 " movq 8(%0), %%mm1\n" 315 " movq 16(%0), %%mm2\n" 316 " movq 24(%0), %%mm3\n" 317 " movq %%mm0, (%1)\n" 318 " movq %%mm1, 8(%1)\n" 319 " movq %%mm2, 16(%1)\n" 320 " movq %%mm3, 24(%1)\n" 321 " movq 32(%0), %%mm0\n" 322 " movq 40(%0), %%mm1\n" 323 " movq 48(%0), %%mm2\n" 324 " movq 56(%0), %%mm3\n" 325 " movq %%mm0, 32(%1)\n" 326 " movq %%mm1, 40(%1)\n" 327 " movq %%mm2, 48(%1)\n" 328 " movq %%mm3, 56(%1)\n" 329 ".section .fixup, \"ax\"\n" 330 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ 331 " jmp 2b\n" 332 ".previous\n" 333 _ASM_EXTABLE(1b, 3b) 334 : : "r" (from), "r" (to) : "memory"); 335 336 from += 64; 337 to += 64; 338 } 339 kernel_fpu_end(); 340} 341 342#endif /* !CONFIG_MK7 */ 343 344/* 345 * Favour MMX for page clear and copy: 346 */ 347static void slow_zero_page(void *page) 348{ 349 int d0, d1; 350 351 __asm__ __volatile__( 352 "cld\n\t" 353 "rep ; stosl" 354 355 : "=&c" (d0), "=&D" (d1) 356 :"a" (0), "1" (page), "0" (1024) 357 :"memory"); 358} 359 360void mmx_clear_page(void *page) 361{ 362 if (unlikely(in_interrupt())) 363 slow_zero_page(page); 364 else 365 fast_clear_page(page); 366} 367EXPORT_SYMBOL(mmx_clear_page); 368 369static void slow_copy_page(void *to, void *from) 370{ 371 int d0, d1, d2; 372 373 __asm__ __volatile__( 374 "cld\n\t" 375 "rep ; movsl" 376 : "=&c" (d0), "=&D" (d1), "=&S" (d2) 377 : "0" (1024), "1" ((long) to), "2" ((long) from) 378 : "memory"); 379} 380 381void mmx_copy_page(void *to, void *from) 382{ 383 if (unlikely(in_interrupt())) 384 slow_copy_page(to, from); 385 else 386 fast_copy_page(to, from); 387} 388EXPORT_SYMBOL(mmx_copy_page); 389