18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
28c2ecf20Sopenharmony_ci/*
38c2ecf20Sopenharmony_ci * arch/openrisc/lib/memcpy.c
48c2ecf20Sopenharmony_ci *
 * Optimized memory copy routines for openrisc.  These are mostly copied
 * from other sources but slightly extended based on ideas discussed in
 * #openrisc.
88c2ecf20Sopenharmony_ci *
98c2ecf20Sopenharmony_ci * The word unroll implementation is an extension to the arm byte
108c2ecf20Sopenharmony_ci * unrolled implementation, but using word copies (if things are
118c2ecf20Sopenharmony_ci * properly aligned)
128c2ecf20Sopenharmony_ci *
138c2ecf20Sopenharmony_ci * The great arm loop unroll algorithm can be found at:
148c2ecf20Sopenharmony_ci *  arch/arm/boot/compressed/string.c
158c2ecf20Sopenharmony_ci */
168c2ecf20Sopenharmony_ci
178c2ecf20Sopenharmony_ci#include <linux/export.h>
188c2ecf20Sopenharmony_ci
198c2ecf20Sopenharmony_ci#include <linux/string.h>
208c2ecf20Sopenharmony_ci
218c2ecf20Sopenharmony_ci#ifdef CONFIG_OR1K_1200
/*
 * memcpy - copy @n bytes from @src to @dest using unrolled loops.
 *
 * Do memcpy with word copies and loop unrolling. This gives the
 * best performance on the OR1200 and MOR1KX architectures.
 *
 * When both @dest and @src are 4-byte aligned the bulk of the data is
 * moved as 32-bit words, 32 bytes per loop iteration; otherwise it is
 * moved byte by byte, 8 bytes per loop iteration.  In both cases the
 * remainder is handled by testing the low bits of @n one at a time, so
 * no further loops are needed.
 *
 * Data is copied forward, in ascending address order, and no attempt is
 * made to handle overlapping buffers.
 *
 * Returns @dest.
 */
void *memcpy(void *dest, __const void *src, __kernel_size_t n)
{
	const unsigned char *s;
	unsigned char *d;
	__kernel_size_t blocks;

	/* If both source and dest are word aligned copy words */
	if (!(((unsigned long)dest | (unsigned long)src) & 3)) {
		const uint32_t *src_w = src;
		uint32_t *dest_w = dest;

		/* Copy 32 bytes (eight words) per loop */
		for (blocks = n >> 5; blocks; blocks--) {
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
		}

		/* Bit 4 of n set: 16 more bytes */
		if (n & (1 << 4)) {
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
		}

		/* Bit 3 of n set: 8 more bytes */
		if (n & (1 << 3)) {
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
		}

		/* Bit 2 of n set: one last whole word */
		if (n & (1 << 2))
			*dest_w++ = *src_w++;

		/* Hand the cursors over to the byte-sized tail below */
		d = (unsigned char *)dest_w;
		s = (const unsigned char *)src_w;
	} else {
		d = dest;
		s = src;

		/* Copy 8 bytes per loop */
		for (blocks = n >> 3; blocks; blocks--) {
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
		}

		/* Bit 2 of n set: 4 more bytes */
		if (n & (1 << 2)) {
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
		}
	}

	/* Common tail: bits 1 and 0 of n cover the last 0-3 bytes */
	if (n & (1 << 1)) {
		*d++ = *s++;
		*d++ = *s++;
	}

	if (n & 1)
		*d = *s;

	return dest;
}
978c2ecf20Sopenharmony_ci#else
/*
 * memcpy - copy @n bytes from @src to @dest.
 *
 * Use word copies but no loop unrolling as we cannot assume there
 * will be benefits on the architecture.
 *
 * When both @dest and @src are 4-byte aligned, whole 32-bit words are
 * copied first; whatever is left (or everything, when either pointer is
 * unaligned) is copied one byte at a time.
 *
 * Data is copied forward, in ascending address order, and no attempt is
 * made to handle overlapping buffers.
 *
 * Returns @dest.
 */
void *memcpy(void *dest, __const void *src, __kernel_size_t n)
{
	const unsigned char *s = src;
	unsigned char *d = dest;

	/* If both source and dest are word aligned copy words */
	if (!(((unsigned long)dest | (unsigned long)src) & 3)) {
		const uint32_t *src_w = src;
		uint32_t *dest_w = dest;

		for (; n >= 4; n -= 4)
			*dest_w++ = *src_w++;

		/* Continue byte-wise from where the word copy stopped */
		d = (unsigned char *)dest_w;
		s = (const unsigned char *)src_w;
	}

	/* For remaining or if not aligned, copy bytes */
	for (; n; n--)
		*d++ = *s++;

	return dest;
}
1238c2ecf20Sopenharmony_ci#endif
1248c2ecf20Sopenharmony_ci
1258c2ecf20Sopenharmony_ciEXPORT_SYMBOL(memcpy);
126