/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2011, The Linux Foundation. All rights reserved.
 */


/* HEXAGON assembly optimized memset */
/* Replaces the standard library function memset */

/*
 * HEXAGON_OPT_FUNC_BEGIN name
 *
 * Opens a function: switches to .text, aligns to 2^4 = 16 bytes, exports the
 * symbol given as the argument, marks it as a function and emits its entry
 * label.  Pair it with HEXAGON_OPT_FUNC_FINISH after the last instruction.
 */
	.macro HEXAGON_OPT_FUNC_BEGIN name
	.text
	.p2align 4
	.globl \name
	.type  \name, @function
\name:
	.endm
/*
 * HEXAGON_OPT_FUNC_FINISH name
 *
 * Closes a function opened with HEXAGON_OPT_FUNC_BEGIN: records the symbol
 * size as the distance from the function's label to the current location, so
 * it must be placed directly after the function's last instruction.
 */
	.macro HEXAGON_OPT_FUNC_FINISH name
	.size  \name, . - \name
	.endm
/*
 * FUNCTION: memset (v2 version)
 *
 * Assembled only when __HEXAGON_ARCH__ < 3.
 *
 * In:   r0 = destination, r1 = fill value (only its low byte is stored),
 *       r2 = byte count
 * Out:  r0 = destination, unchanged
 * Uses: r3-r10, p0, p1 and the loop0 hardware-loop state
 *
 * Register roles:
 *   r8    running store pointer
 *   r4    fill byte replicated by vsplatb (r5:4 once the double loop starts)
 *   r9    bytes needed to reach the next 8-byte boundary (8 if already there)
 *   r3:2  remaining count, kept as a 64-bit pair with r3 == 0
 *   r7:6  64-bit step subtracted from r3:2 (r7 == 0, r6 = size just stored)
 *
 * Strategy: counts below 8 are written by a byte loop.  Larger counts use
 * byte, halfword and word stores to reach an 8-byte boundary, a hardware
 * loop of doubleword stores, then a word/halfword/byte tail.
 *
 * Scheduling note: a plain "if p0" / "if !p0" (no .new) tests the predicate
 * produced by an earlier packet, so most packets below branch on the previous
 * predicate while computing the one for the next decision.  In the same way,
 * a compare on r2 that shares a packet with the sub sees the count before
 * the subtraction.  See the Hexagon programmer's reference for the packet
 * execution rules this relies on.
 */
#if __HEXAGON_ARCH__ < 3
HEXAGON_OPT_FUNC_BEGIN memset
	{
		r6 = #8
		r7 = extractu(r0, #3 , #0)  /* low three bits of dest  */
		p0 = cmp.eq(r2, #0)
		p1 = cmp.gtu(r2, #7)        /* 8 or more bytes requested  */
	}
	{
		r4 = vsplatb(r1)  /* fill byte copied into every byte of r4  */
		r8 = r0           /* leave r0 intact for return val  */
		r9 = sub(r6, r7)  /* bytes until double alignment  */
		if p0 jumpr r31   /* count == 0, so return  */
	}
	{
		r3 = #0           /* high word of the r3:2 count pair  */
		r7 = #0           /* high word of the r7:6 step pair  */
		p0 = tstbit(r9, #0)  /* odd distance: a byte store is needed  */
		if p1 jump 2f /* skip byte loop */
	}

/* less than 8 bytes to set, so just set a byte at a time and return  */

		loop0(1f, r2) /* byte loop */
	.falign
1: /* byte loop */
	{
		memb(r8++#1) = r4
	}:endloop0
		jumpr r31
	.falign
2: /* skip byte loop */
	{
		r6 = #1
		p0 = tstbit(r9, #1)  /* decides the halfword store at 3:  */
		p1 = cmp.eq(r2, #1)  /* this store would finish the job  */
		if !p0 jump 3f /* skip initial byte store */
	}
	{
		memb(r8++#1) = r4
		r3:2 = sub(r3:2, r7:6)  /* count -= 1  */
		if p1 jumpr r31
	}
	.falign
3: /* skip initial byte store */
	{
		r6 = #2
		p0 = tstbit(r9, #2)  /* decides the word store at 4:  */
		p1 = cmp.eq(r2, #2)
		if !p0 jump 4f /* skip initial half store */
	}
	{
		memh(r8++#2) = r4
		r3:2 = sub(r3:2, r7:6)  /* count -= 2  */
		if p1 jumpr r31
	}
	.falign
4: /* skip initial half store */
	{
		r6 = #4
		p0 = cmp.gtu(r2, #7)  /* used at 5: when the word store is skipped  */
		p1 = cmp.eq(r2, #4)
		if !p0 jump 5f /* skip initial word store */
	}
	{
		memw(r8++#4) = r4
		r3:2 = sub(r3:2, r7:6)  /* count -= 4  */
		p0 = cmp.gtu(r2, #11) /* i.e. more than 7 left after this store  */
		if p1 jumpr r31
	}
	.falign
5: /* skip initial word store */
	{
		r10 = lsr(r2, #3)     /* number of whole doublewords left  */
		p1 = cmp.eq(r3, #1)   /* r3 is expected to be 0, so p1 = false  */
		if !p0 jump 7f /* skip double loop */
	}
	{
		r5 = r4               /* r5:4 = 8 copies of the fill byte  */
		r6 = #8
		loop0(6f, r10) /* double loop */
	}

/* set bytes a double word at a time  */

	.falign
6: /* double loop */
	{
		memd(r8++#8) = r5:4
		r3:2 = sub(r3:2, r7:6)  /* count -= 8  */
		p1 = cmp.eq(r2, #8)     /* true when this store was the last  */
	}:endloop0
	.falign
7: /* skip double loop */
	{
		p0 = tstbit(r2, #2)   /* a word remains in the tail  */
		if p1 jumpr r31       /* nothing left after the double loop  */
	}
	{
		r6 = #4
		p0 = tstbit(r2, #1)   /* a halfword remains; tested at 8:  */
		p1 = cmp.eq(r2, #4)
		if !p0 jump 8f /* skip final word store */
	}
	{
		memw(r8++#4) = r4
		r3:2 = sub(r3:2, r7:6)  /* count -= 4  */
		if p1 jumpr r31
	}
	.falign
8: /* skip final word store */
	{
		p1 = cmp.eq(r2, #2)
		if !p0 jump 9f /* skip final half store */
	}
	{
		/* r2 is not updated from here on; only p1 is still consulted  */
		memh(r8++#2) = r4
		if p1 jumpr r31
	}
	.falign
9: /* skip final half store */
	{
		memb(r8++#1) = r4     /* the one remaining byte  */
		jumpr r31
	}
HEXAGON_OPT_FUNC_FINISH memset
#endif
/*
 * FUNCTION: memset (v3 and higher version)
 *
 * Assembled only when __HEXAGON_ARCH__ >= 3.
 *
 * In:   r0 = destination, r1 = fill value (byte stores use its low byte),
 *       r2 = byte count
 * Out:  r0 = destination, unchanged
 * Uses: r3-r8, p0, p1 and the loop0 hardware-loop state
 *
 * Register roles:
 *   r6    running store pointer
 *   r7    fill byte replicated across a word by vsplatb
 *   r5:4  the same pattern as a doubleword
 *   r2    bytes still to be written
 *
 * Strategy:
 *   - count <= 8: plain byte loop.
 *   - otherwise byte/halfword/word stores bring r6 to an 8-byte boundary.
 *   - if more than 127 bytes remain, up to three doubleword stores bring r6
 *     to a 32-byte boundary and the bulk is written 32 bytes per iteration:
 *     dczeroa alone when r1 == 0, dczeroa followed by four doubleword
 *     stores otherwise.
 *   - remaining doublewords, then a word/halfword/byte tail.
 *
 * Within a braced packet a compare or loop0 reads the register values from
 * before the packet, while ".new" predicates use the value computed in the
 * same packet.  See the Hexagon programmer's reference for the exact rules.
 */
#if __HEXAGON_ARCH__ >= 3
HEXAGON_OPT_FUNC_BEGIN memset
	{
		r7=vsplatb(r1)
		r6 = r0
		if (r2==#0) jump:nt .L1  /* count == 0: return straight away */
	}
	{
		r5:4=combine(r7,r7)
		p0 = cmp.gtu(r2,#8)
		if (p0.new) jump:nt .L3  /* more than 8 bytes: aligned path */
	}
	{
		r3 = r0
		loop0(.L47,r2)           /* 1..8 bytes: one byte per iteration */
	}
	.falign
.L47:
	{
		memb(r3++#1) = r1
	}:endloop0 /* start=.L47 */
		jumpr r31
.L3:
	/* Align r6 to 2 bytes. */
	{
		p0 = tstbit(r0,#0)
		if (!p0.new) jump:nt .L8
		p1 = cmp.eq(r2, #1)
	}
	{
		r6 = add(r0, #1)
		r2 = add(r2,#-1)
		memb(r0) = r1
		if (p1) jump .L1
	}
.L8:
	/* Align r6 to 4 bytes. */
	{
		p0 = tstbit(r6,#1)
		if (!p0.new) jump:nt .L10
	}
	{
		r2 = add(r2,#-2)
		memh(r6++#2) = r7
		p0 = cmp.eq(r2, #2)      /* compares the count before the add */
		if (p0.new) jump:nt .L1
	}
.L10:
	/* Align r6 to 8 bytes. */
	{
		p0 = tstbit(r6,#2)
		if (!p0.new) jump:nt .L12
	}
	{
		r2 = add(r2,#-4)
		memw(r6++#4) = r7
		p0 = cmp.eq(r2, #4)      /* compares the count before the add */
		if (p0.new) jump:nt .L1
	}
.L12:
	/* 127 bytes or fewer left: skip the 32-byte chunk machinery. */
	{
		p0 = cmp.gtu(r2,#127)
		if (!p0.new) jump:nt .L14
	}
	/*
	 * r6 is 8-byte aligned here, so at most three doubleword stores are
	 * needed to reach a 32-byte boundary.
	 */
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
.L17:
	{
		r3 = lsr(r2,#5)          /* number of whole 32-byte chunks */
		if (r1!=#0) jump:nt .L18 /* r1 != 0: chunks need real stores */
	}
	/*
	 * loop0 takes the chunk count r3 held on entry to this packet.
	 * NOTE(review): the values written to r8 and r3 here are not read
	 * again on this path (r8 is rewritten at .L14); they look like
	 * leftover generated code and are kept as-is.
	 */
	{
		r8 = r3
		r3 = r6
		loop0(.L46,r3)
	}
	.falign
.L46:
	/* r1 == 0: the code relies on dczeroa clearing the 32 bytes at r6. */
	{
		dczeroa(r6)
		r6 = add(r6,#32)
		r2 = add(r2,#-32)
	}:endloop0 /* start=.L46 */
.L14:
	/* Remaining whole doublewords. */
	{
		p0 = cmp.gtu(r2,#7)
		if (!p0.new) jump:nt .L28
		r8 = lsr(r2,#3)
	}
		loop0(.L44,r8)
	.falign
.L44:
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}:endloop0 /* start=.L44 */
.L28:
	/* Tail: at most one word, one halfword and one byte. */
	{
		p0 = tstbit(r2,#2)
		if (!p0.new) jump:nt .L33
	}
	{
		r2 = add(r2,#-4)
		memw(r6++#4) = r7
	}
.L33:
	{
		p0 = tstbit(r2,#1)
		if (!p0.new) jump:nt .L35
	}
	{
		r2 = add(r2,#-2)
		memh(r6++#2) = r7
	}
.L35:
		p0 = cmp.eq(r2,#1)
		if (p0) memb(r6) = r1
.L1:
		jumpr r31
.L18:
	/*
	 * r1 != 0: each iteration issues dczeroa on the 32-byte chunk and then
	 * fills it with four doubleword stores.
	 * NOTE(review): the dczeroa is presumably there to allocate the cache
	 * line without fetching it first -- confirm against the Hexagon manual.
	 */
		loop0(.L45,r3)
	.falign
.L45:
		dczeroa(r6)
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-32)
	}
		memd(r6++#8) = r5:4
		memd(r6++#8) = r5:4
	{
		memd(r6++#8) = r5:4
	}:endloop0 /* start=.L45  */
		jump .L14
HEXAGON_OPT_FUNC_FINISH memset
#endif