18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * linux/arch/alpha/lib/memcpy.c 48c2ecf20Sopenharmony_ci * 58c2ecf20Sopenharmony_ci * Copyright (C) 1995 Linus Torvalds 68c2ecf20Sopenharmony_ci */ 78c2ecf20Sopenharmony_ci 88c2ecf20Sopenharmony_ci/* 98c2ecf20Sopenharmony_ci * This is a reasonably optimized memcpy() routine. 108c2ecf20Sopenharmony_ci */ 118c2ecf20Sopenharmony_ci 128c2ecf20Sopenharmony_ci/* 138c2ecf20Sopenharmony_ci * Note that the C code is written to be optimized into good assembly. However, 148c2ecf20Sopenharmony_ci * at this point gcc is unable to sanely compile "if (n >= 0)", resulting in a 158c2ecf20Sopenharmony_ci * explicit compare against 0 (instead of just using the proper "blt reg, xx" or 168c2ecf20Sopenharmony_ci * "bge reg, xx"). I hope alpha-gcc will be fixed to notice this eventually.. 178c2ecf20Sopenharmony_ci */ 188c2ecf20Sopenharmony_ci 198c2ecf20Sopenharmony_ci#include <linux/types.h> 208c2ecf20Sopenharmony_ci#include <linux/export.h> 218c2ecf20Sopenharmony_ci 228c2ecf20Sopenharmony_ci/* 238c2ecf20Sopenharmony_ci * This should be done in one go with ldq_u*2/mask/stq_u. Do it 248c2ecf20Sopenharmony_ci * with a macro so that we can fix it up later.. 258c2ecf20Sopenharmony_ci */ 268c2ecf20Sopenharmony_ci#define ALIGN_DEST_TO8_UP(d,s,n) \ 278c2ecf20Sopenharmony_ci while (d & 7) { \ 288c2ecf20Sopenharmony_ci if (n <= 0) return; \ 298c2ecf20Sopenharmony_ci n--; \ 308c2ecf20Sopenharmony_ci *(char *) d = *(char *) s; \ 318c2ecf20Sopenharmony_ci d++; s++; \ 328c2ecf20Sopenharmony_ci } 338c2ecf20Sopenharmony_ci#define ALIGN_DEST_TO8_DN(d,s,n) \ 348c2ecf20Sopenharmony_ci while (d & 7) { \ 358c2ecf20Sopenharmony_ci if (n <= 0) return; \ 368c2ecf20Sopenharmony_ci n--; \ 378c2ecf20Sopenharmony_ci d--; s--; \ 388c2ecf20Sopenharmony_ci *(char *) d = *(char *) s; \ 398c2ecf20Sopenharmony_ci } 408c2ecf20Sopenharmony_ci 418c2ecf20Sopenharmony_ci/* 428c2ecf20Sopenharmony_ci * This should similarly be done with ldq_u*2/mask/stq. The destination 438c2ecf20Sopenharmony_ci * is aligned, but we don't fill in a full quad-word 448c2ecf20Sopenharmony_ci */ 458c2ecf20Sopenharmony_ci#define DO_REST_UP(d,s,n) \ 468c2ecf20Sopenharmony_ci while (n > 0) { \ 478c2ecf20Sopenharmony_ci n--; \ 488c2ecf20Sopenharmony_ci *(char *) d = *(char *) s; \ 498c2ecf20Sopenharmony_ci d++; s++; \ 508c2ecf20Sopenharmony_ci } 518c2ecf20Sopenharmony_ci#define DO_REST_DN(d,s,n) \ 528c2ecf20Sopenharmony_ci while (n > 0) { \ 538c2ecf20Sopenharmony_ci n--; \ 548c2ecf20Sopenharmony_ci d--; s--; \ 558c2ecf20Sopenharmony_ci *(char *) d = *(char *) s; \ 568c2ecf20Sopenharmony_ci } 578c2ecf20Sopenharmony_ci 588c2ecf20Sopenharmony_ci/* 598c2ecf20Sopenharmony_ci * This should be done with ldq/mask/stq. The source and destination are 608c2ecf20Sopenharmony_ci * aligned, but we don't fill in a full quad-word 618c2ecf20Sopenharmony_ci */ 628c2ecf20Sopenharmony_ci#define DO_REST_ALIGNED_UP(d,s,n) DO_REST_UP(d,s,n) 638c2ecf20Sopenharmony_ci#define DO_REST_ALIGNED_DN(d,s,n) DO_REST_DN(d,s,n) 648c2ecf20Sopenharmony_ci 658c2ecf20Sopenharmony_ci/* 668c2ecf20Sopenharmony_ci * This does unaligned memory copies. We want to avoid storing to 678c2ecf20Sopenharmony_ci * an unaligned address, as that would do a read-modify-write cycle. 688c2ecf20Sopenharmony_ci * We also want to avoid double-reading the unaligned reads. 698c2ecf20Sopenharmony_ci * 708c2ecf20Sopenharmony_ci * Note the ordering to try to avoid load (and address generation) latencies. 718c2ecf20Sopenharmony_ci */ 728c2ecf20Sopenharmony_cistatic inline void __memcpy_unaligned_up (unsigned long d, unsigned long s, 738c2ecf20Sopenharmony_ci long n) 748c2ecf20Sopenharmony_ci{ 758c2ecf20Sopenharmony_ci ALIGN_DEST_TO8_UP(d,s,n); 768c2ecf20Sopenharmony_ci n -= 8; /* to avoid compare against 8 in the loop */ 778c2ecf20Sopenharmony_ci if (n >= 0) { 788c2ecf20Sopenharmony_ci unsigned long low_word, high_word; 798c2ecf20Sopenharmony_ci __asm__("ldq_u %0,%1":"=r" (low_word):"m" (*(unsigned long *) s)); 808c2ecf20Sopenharmony_ci do { 818c2ecf20Sopenharmony_ci unsigned long tmp; 828c2ecf20Sopenharmony_ci __asm__("ldq_u %0,%1":"=r" (high_word):"m" (*(unsigned long *)(s+8))); 838c2ecf20Sopenharmony_ci n -= 8; 848c2ecf20Sopenharmony_ci __asm__("extql %1,%2,%0" 858c2ecf20Sopenharmony_ci :"=r" (low_word) 868c2ecf20Sopenharmony_ci :"r" (low_word), "r" (s)); 878c2ecf20Sopenharmony_ci __asm__("extqh %1,%2,%0" 888c2ecf20Sopenharmony_ci :"=r" (tmp) 898c2ecf20Sopenharmony_ci :"r" (high_word), "r" (s)); 908c2ecf20Sopenharmony_ci s += 8; 918c2ecf20Sopenharmony_ci *(unsigned long *) d = low_word | tmp; 928c2ecf20Sopenharmony_ci d += 8; 938c2ecf20Sopenharmony_ci low_word = high_word; 948c2ecf20Sopenharmony_ci } while (n >= 0); 958c2ecf20Sopenharmony_ci } 968c2ecf20Sopenharmony_ci n += 8; 978c2ecf20Sopenharmony_ci DO_REST_UP(d,s,n); 988c2ecf20Sopenharmony_ci} 998c2ecf20Sopenharmony_ci 1008c2ecf20Sopenharmony_cistatic inline void __memcpy_unaligned_dn (unsigned long d, unsigned long s, 1018c2ecf20Sopenharmony_ci long n) 1028c2ecf20Sopenharmony_ci{ 1038c2ecf20Sopenharmony_ci /* I don't understand AXP assembler well enough for this. -Tim */ 1048c2ecf20Sopenharmony_ci s += n; 1058c2ecf20Sopenharmony_ci d += n; 1068c2ecf20Sopenharmony_ci while (n--) 1078c2ecf20Sopenharmony_ci * (char *) --d = * (char *) --s; 1088c2ecf20Sopenharmony_ci} 1098c2ecf20Sopenharmony_ci 1108c2ecf20Sopenharmony_ci/* 1118c2ecf20Sopenharmony_ci * Hmm.. Strange. The __asm__ here is there to make gcc use an integer register 1128c2ecf20Sopenharmony_ci * for the load-store. I don't know why, but it would seem that using a floating 1138c2ecf20Sopenharmony_ci * point register for the move seems to slow things down (very small difference, 1148c2ecf20Sopenharmony_ci * though). 1158c2ecf20Sopenharmony_ci * 1168c2ecf20Sopenharmony_ci * Note the ordering to try to avoid load (and address generation) latencies. 1178c2ecf20Sopenharmony_ci */ 1188c2ecf20Sopenharmony_cistatic inline void __memcpy_aligned_up (unsigned long d, unsigned long s, 1198c2ecf20Sopenharmony_ci long n) 1208c2ecf20Sopenharmony_ci{ 1218c2ecf20Sopenharmony_ci ALIGN_DEST_TO8_UP(d,s,n); 1228c2ecf20Sopenharmony_ci n -= 8; 1238c2ecf20Sopenharmony_ci while (n >= 0) { 1248c2ecf20Sopenharmony_ci unsigned long tmp; 1258c2ecf20Sopenharmony_ci __asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s)); 1268c2ecf20Sopenharmony_ci n -= 8; 1278c2ecf20Sopenharmony_ci s += 8; 1288c2ecf20Sopenharmony_ci *(unsigned long *) d = tmp; 1298c2ecf20Sopenharmony_ci d += 8; 1308c2ecf20Sopenharmony_ci } 1318c2ecf20Sopenharmony_ci n += 8; 1328c2ecf20Sopenharmony_ci DO_REST_ALIGNED_UP(d,s,n); 1338c2ecf20Sopenharmony_ci} 1348c2ecf20Sopenharmony_cistatic inline void __memcpy_aligned_dn (unsigned long d, unsigned long s, 1358c2ecf20Sopenharmony_ci long n) 1368c2ecf20Sopenharmony_ci{ 1378c2ecf20Sopenharmony_ci s += n; 1388c2ecf20Sopenharmony_ci d += n; 1398c2ecf20Sopenharmony_ci ALIGN_DEST_TO8_DN(d,s,n); 1408c2ecf20Sopenharmony_ci n -= 8; 1418c2ecf20Sopenharmony_ci while (n >= 0) { 1428c2ecf20Sopenharmony_ci unsigned long tmp; 1438c2ecf20Sopenharmony_ci s -= 8; 1448c2ecf20Sopenharmony_ci __asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s)); 1458c2ecf20Sopenharmony_ci n -= 8; 1468c2ecf20Sopenharmony_ci d -= 8; 1478c2ecf20Sopenharmony_ci *(unsigned long *) d = tmp; 1488c2ecf20Sopenharmony_ci } 1498c2ecf20Sopenharmony_ci n += 8; 1508c2ecf20Sopenharmony_ci DO_REST_ALIGNED_DN(d,s,n); 1518c2ecf20Sopenharmony_ci} 1528c2ecf20Sopenharmony_ci 1538c2ecf20Sopenharmony_civoid * memcpy(void * dest, const void *src, size_t n) 1548c2ecf20Sopenharmony_ci{ 1558c2ecf20Sopenharmony_ci if (!(((unsigned long) dest ^ (unsigned long) src) & 7)) { 1568c2ecf20Sopenharmony_ci __memcpy_aligned_up ((unsigned long) dest, (unsigned long) src, 1578c2ecf20Sopenharmony_ci n); 1588c2ecf20Sopenharmony_ci return dest; 1598c2ecf20Sopenharmony_ci } 1608c2ecf20Sopenharmony_ci __memcpy_unaligned_up ((unsigned long) dest, (unsigned long) src, n); 1618c2ecf20Sopenharmony_ci return dest; 1628c2ecf20Sopenharmony_ci} 1638c2ecf20Sopenharmony_ciEXPORT_SYMBOL(memcpy); 164