// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/arch/alpha/lib/memcpy.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 */

/*
 * This is a reasonably optimized memcpy() routine.
 */

/*
 * Note that the C code is written to be optimized into good assembly. However,
 * at this point gcc is unable to sanely compile "if (n >= 0)", resulting in an
 * explicit compare against 0 (instead of just using the proper "blt reg, xx" or
 * "bge reg, xx"). I hope alpha-gcc will be fixed to notice this eventually..
 */

#include <linux/types.h>
#include <linux/export.h>

/*
 * This should be done in one go with ldq_u*2/mask/stq_u. Do it
 * with a macro so that we can fix it up later..
 */
#define ALIGN_DEST_TO8_UP(d,s,n) \
	while (d & 7) { \
		if (n <= 0) return; \
		n--; \
		*(char *) d = *(char *) s; \
		d++; s++; \
	}
#define ALIGN_DEST_TO8_DN(d,s,n) \
	while (d & 7) { \
		if (n <= 0) return; \
		n--; \
		d--; s--; \
		*(char *) d = *(char *) s; \
	}

/*
 * This should similarly be done with ldq_u*2/mask/stq. The destination
 * is aligned, but we don't fill in a full quad-word.
 */
#define DO_REST_UP(d,s,n) \
	while (n > 0) { \
		n--; \
		*(char *) d = *(char *) s; \
		d++; s++; \
	}
#define DO_REST_DN(d,s,n) \
	while (n > 0) { \
		n--; \
		d--; s--; \
		*(char *) d = *(char *) s; \
	}

/*
 * This should be done with ldq/mask/stq. The source and destination are
 * aligned, but we don't fill in a full quad-word.
 */
#define DO_REST_ALIGNED_UP(d,s,n) DO_REST_UP(d,s,n)
#define DO_REST_ALIGNED_DN(d,s,n) DO_REST_DN(d,s,n)

/*
 * This does unaligned memory copies. We want to avoid storing to
 * an unaligned address, as that would do a read-modify-write cycle.
 * We also want to avoid double-reading the unaligned reads.
 *
 * Note the ordering to try to avoid load (and address generation) latencies.
 */
static inline void __memcpy_unaligned_up (unsigned long d, unsigned long s,
					  long n)
{
	ALIGN_DEST_TO8_UP(d,s,n);
	n -= 8;			/* to avoid compare against 8 in the loop */
	if (n >= 0) {
		unsigned long low_word, high_word;
		__asm__("ldq_u %0,%1":"=r" (low_word):"m" (*(unsigned long *) s));
		do {
			unsigned long tmp;
			__asm__("ldq_u %0,%1":"=r" (high_word):"m" (*(unsigned long *)(s+8)));
			n -= 8;
			__asm__("extql %1,%2,%0"
				:"=r" (low_word)
				:"r" (low_word), "r" (s));
			__asm__("extqh %1,%2,%0"
				:"=r" (tmp)
				:"r" (high_word), "r" (s));
			s += 8;
			*(unsigned long *) d = low_word | tmp;
			d += 8;
			low_word = high_word;
		} while (n >= 0);
	}
	n += 8;
	DO_REST_UP(d,s,n);
}

static inline void __memcpy_unaligned_dn (unsigned long d, unsigned long s,
					  long n)
{
	/* I don't understand AXP assembler well enough for this. -Tim */
	s += n;
	d += n;
	while (n--)
		* (char *) --d = * (char *) --s;
}

/*
 * Hmm.. Strange. The __asm__ here is there to make gcc use an integer register
 * for the load-store. I don't know why, but using a floating point register
 * for the move seems to slow things down (very small difference, though).
 *
 * Note the ordering to try to avoid load (and address generation) latencies.
 */
static inline void __memcpy_aligned_up (unsigned long d, unsigned long s,
					long n)
{
	ALIGN_DEST_TO8_UP(d,s,n);
	n -= 8;
	while (n >= 0) {
		unsigned long tmp;
		__asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s));
		n -= 8;
		s += 8;
		*(unsigned long *) d = tmp;
		d += 8;
	}
	n += 8;
	DO_REST_ALIGNED_UP(d,s,n);
}
static inline void __memcpy_aligned_dn (unsigned long d, unsigned long s,
					long n)
{
	s += n;
	d += n;
	ALIGN_DEST_TO8_DN(d,s,n);
	n -= 8;
	while (n >= 0) {
		unsigned long tmp;
		s -= 8;
		__asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s));
		n -= 8;
		d -= 8;
		*(unsigned long *) d = tmp;
	}
	n += 8;
	DO_REST_ALIGNED_DN(d,s,n);
}

void * memcpy(void * dest, const void *src, size_t n)
{
	if (!(((unsigned long) dest ^ (unsigned long) src) & 7)) {
		__memcpy_aligned_up ((unsigned long) dest, (unsigned long) src,
				     n);
		return dest;
	}
	__memcpy_unaligned_up ((unsigned long) dest, (unsigned long) src, n);
	return dest;
}
EXPORT_SYMBOL(memcpy);
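
/*
 * Purely illustrative, not used by the code above: the ldq_u/extql/extqh
 * sequence in __memcpy_unaligned_up merges two aligned quad-word loads
 * into one unaligned source quad-word without ever storing to an
 * unaligned address.  A rough portable-C sketch of that merge is given
 * below; the helper name and the explicit shift handling are ours, not
 * part of this file, and it assumes a little-endian 64-bit machine,
 * which is what the Alpha instructions rely on.  On the unaligned path
 * above the shift is never 0 (the source and destination differ in
 * alignment), but the sketch handles that case anyway.
 */
static inline unsigned long sketch_load_unaligned_quad(unsigned long s)
{
	unsigned long base = s & ~7UL;			  /* aligned address at/below s  */
	unsigned long lo = *(unsigned long *) base;	  /* roughly what ldq_u s does   */
	unsigned long hi = *(unsigned long *) (base + 8); /* roughly what ldq_u s+8 does */
	unsigned long shift = (s & 7) * 8;		  /* misalignment, in bits       */

	if (shift == 0)		/* already aligned: nothing to merge */
		return lo;
	/* extql keeps the upper bytes of 'lo', extqh the lower bytes of 'hi' */
	return (lo >> shift) | (hi << (64 - shift));
}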