xref: /kernel/linux/linux-6.6/arch/hexagon/lib/memset.S (revision 62306a36)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2011, The Linux Foundation. All rights reserved.
 */


/* HEXAGON assembly optimized memset */
/* Replaces the standard library function memset */


/*
 * HEXAGON_OPT_FUNC_BEGIN name
 *
 * Opens the definition of an exported function called \name:
 *   - switches to the .text section,
 *   - aligns the entry point to 2^4 = 16 bytes,
 *   - makes the symbol global and gives it ELF type @function,
 *   - emits the \name: label itself.
 *
 * Close the function body with HEXAGON_OPT_FUNC_FINISH \name so that the
 * symbol also gets a size.
 */
	.macro HEXAGON_OPT_FUNC_BEGIN name
	.text
	.p2align 4
	.globl \name
	.type  \name, @function
\name:
	.endm

/*
 * HEXAGON_OPT_FUNC_FINISH name
 *
 * Closes a function opened with HEXAGON_OPT_FUNC_BEGIN: records the symbol
 * size as the distance from the \name label to the current location.
 */
	.macro HEXAGON_OPT_FUNC_FINISH name
	.size  \name, . - \name
	.endm

/*
 * FUNCTION: memset (v2 version)
 *
 * C view:  void *memset(void *s, int c, size_t n)
 *
 * In:    r0 = s (destination), r1 = c (only the low byte is stored),
 *        r2 = n (byte count)
 * Out:   r0 = s, never modified
 * Uses:  r3-r10, p0, p1 and hardware loop 0
 *
 * Strategy:
 *   n == 0  -> return immediately
 *   n <  8  -> plain byte loop
 *   n >= 8  -> byte/half/word stores until the pointer is 8-byte aligned,
 *              then 8-byte stores, then word/half/byte stores for the tail
 *
 * Reading note: the instructions inside { } form one packet.  A plain
 * "if p0" / "if !p0" (no .new) sees the predicate as it was BEFORE the
 * packet, so most packets below branch on the predicate computed by the
 * previous packet while already computing the predicate for the next step.
 *
 * The remaining count is kept as the 64-bit pair r3:2 (r3 = 0) and is
 * decremented by the pair r7:6 (r7 = 0, r6 = size of the current store).
 */
#if __HEXAGON_ARCH__ < 3
HEXAGON_OPT_FUNC_BEGIN memset
	{
		r6 = #8
		r7 = extractu(r0, #3 , #0)  /* r7 = s & 7  */
		p0 = cmp.eq(r2, #0)         /* p0 = (n == 0)  */
		p1 = cmp.gtu(r2, #7)        /* p1 = (n >= 8)  */
	}
	{
		r4 = vsplatb(r1)  /* fill byte replicated into all 4 bytes  */
		r8 = r0           /* leave r0 intact for return val  */
		r9 = sub(r6, r7)  /* bytes until double alignment  */
		if p0 jumpr r31   /* count == 0, so return  */
	}
	{
		r3 = #0           /* high word of the count pair r3:2  */
		r7 = #0           /* high word of the step pair r7:6  */
		p0 = tstbit(r9, #0)  /* odd distance: a leading byte is needed  */
		if p1 jump 2f /* skip byte loop */
	}

/* less than 8 bytes to set, so just set a byte at a time and return  */

		loop0(1f, r2) /* byte loop */
	.falign
1: /* byte loop */
	{
		memb(r8++#1) = r4
	}:endloop0
		jumpr r31
	.falign
2: /* skip byte loop */
	{
		r6 = #1
		p0 = tstbit(r9, #1)   /* for step 3: is a leading half needed?  */
		p1 = cmp.eq(r2, #1)   /* would this store finish the job?  */
		if !p0 jump 3f /* skip initial byte store */
	}
	{
		memb(r8++#1) = r4
		r3:2 = sub(r3:2, r7:6)  /* count -= 1  */
		if p1 jumpr r31
	}
	.falign
3: /* skip initial byte store */
	{
		r6 = #2
		p0 = tstbit(r9, #2)   /* for step 4: is a leading word needed?  */
		p1 = cmp.eq(r2, #2)
		if !p0 jump 4f /* skip initial half store */
	}
	{
		memh(r8++#2) = r4
		r3:2 = sub(r3:2, r7:6)  /* count -= 2  */
		if p1 jumpr r31
	}
	.falign
4: /* skip initial half store */
	{
		r6 = #4
		p0 = cmp.gtu(r2, #7)  /* for step 5 when the word store is skipped  */
		p1 = cmp.eq(r2, #4)
		if !p0 jump 5f /* skip initial word store */
	}
	{
		memw(r8++#4) = r4
		r3:2 = sub(r3:2, r7:6)  /* count -= 4  */
		p0 = cmp.gtu(r2, #11) /* old count > 11, i.e. new count > 7  */
		if p1 jumpr r31
	}
	.falign
5: /* skip initial word store */
	{
		r10 = lsr(r2, #3)     /* number of 8-byte stores  */
		p1 = cmp.eq(r3, #1)   /* r3 is expected to be 0 here, so p1 = false  */
		if !p0 jump 7f /* skip double loop */
	}
	{
		r5 = r4               /* r5:4 = 8 copies of the fill byte  */
		r6 = #8
		loop0(6f, r10) /* double loop */
	}

/* set bytes a double word at a time  */

	.falign
6: /* double loop */
	{
		memd(r8++#8) = r5:4
		r3:2 = sub(r3:2, r7:6)  /* count -= 8  */
		p1 = cmp.eq(r2, #8)   /* old count == 8: nothing left afterwards  */
	}:endloop0
	.falign
7: /* skip double loop */
	{
		p0 = tstbit(r2, #2)   /* tail contains a word?  */
		if p1 jumpr r31       /* count reached 0 in the double loop  */
	}
	{
		r6 = #4
		p0 = tstbit(r2, #1)   /* tail contains a half? (used at 8:)  */
		p1 = cmp.eq(r2, #4)
		if !p0 jump 8f /* skip final word store */
	}
	{
		memw(r8++#4) = r4
		r3:2 = sub(r3:2, r7:6)  /* count -= 4  */
		if p1 jumpr r31
	}
	.falign
8: /* skip final word store */
	{
		p1 = cmp.eq(r2, #2)
		if !p0 jump 9f /* skip final half store */
	}
	{
		memh(r8++#2) = r4     /* count is not updated any more from here on  */
		if p1 jumpr r31
	}
	.falign
9: /* skip final half store */
	{
		memb(r8++#1) = r4     /* one odd byte is left  */
		jumpr r31
	}
HEXAGON_OPT_FUNC_FINISH memset
#endif


/*
 * FUNCTION: memset (v3 and higher version)
 *
 * C view:  void *memset(void *s, int c, size_t n)
 *
 * In:    r0 = s (destination), r1 = c (only the low byte is stored),
 *        r2 = n (byte count)
 * Out:   r0 = s, never modified
 * Uses:  r3-r8, p0, p1 and hardware loop 0
 *
 * Strategy:
 *   n == 0   -> return
 *   n <= 8   -> plain byte loop
 *   n >  8   -> byte/half/word stores until r6 is 8-byte aligned (.L3-.L10)
 *   n >  127 -> additionally step to a 32-byte boundary with 8-byte stores,
 *               then handle 32 bytes per iteration using dczeroa
 *               (.L46 when the whole of r1 is zero, .L45 otherwise)
 *   finally  -> 8-byte stores (.L44) and a word/half/byte tail (.L28-.L35)
 *
 * r6 is the running store pointer, r7 holds the fill byte replicated four
 * times and r5:4 holds it replicated eight times.
 */
#if __HEXAGON_ARCH__ >= 3
HEXAGON_OPT_FUNC_BEGIN memset
	{
		r7=vsplatb(r1)
		r6 = r0
		if (r2==#0) jump:nt .L1      /* nothing to do  */
	}
	{
		r5:4=combine(r7,r7)
		p0 = cmp.gtu(r2,#8)
		if (p0.new) jump:nt .L3      /* more than 8 bytes: aligned path  */
	}
	/* 1..8 bytes: one byte per iteration  */
	{
		r3 = r0
		loop0(.L47,r2)
	}
	.falign
.L47:
	{
		memb(r3++#1) = r1
	}:endloop0 /* start=.L47 */
		jumpr r31
.L3:
	/* leading byte if the address is odd  */
	{
		p0 = tstbit(r0,#0)
		if (!p0.new) jump:nt .L8
		p1 = cmp.eq(r2, #1)          /* r2 > 8 on this path, so p1 is false  */
	}
	{
		r6 = add(r0, #1)
		r2 = add(r2,#-1)
		memb(r0) = r1
		if (p1) jump .L1
	}
.L8:
	/* leading halfword if bit 1 of the address is set  */
	{
		p0 = tstbit(r6,#1)
		if (!p0.new) jump:nt .L10
	}
	{
		r2 = add(r2,#-2)
		memh(r6++#2) = r7
		p0 = cmp.eq(r2, #2)          /* compares the count before the decrement  */
		if (p0.new) jump:nt .L1
	}
.L10:
	/* leading word if bit 2 of the address is set  */
	{
		p0 = tstbit(r6,#2)
		if (!p0.new) jump:nt .L12
	}
	{
		r2 = add(r2,#-4)
		memw(r6++#4) = r7
		p0 = cmp.eq(r2, #4)          /* compares the count before the decrement  */
		if (p0.new) jump:nt .L1
	}
.L12:
	/* r6 is 8-byte aligned here; only large fills use the 32-byte path  */
	{
		p0 = cmp.gtu(r2,#127)
		if (!p0.new) jump:nt .L14
	}
	/* up to three 8-byte stores bring r6 to a 32-byte boundary  */
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
.L17:
	{
		r3 = lsr(r2,#5)              /* number of whole 32-byte chunks  */
		if (r1!=#0) jump:nt .L18     /* tests all of r1, not only its low byte  */
	}
	/*
	 * Zero fill: dczeroa alone clears each 32-byte chunk.
	 * loop0 reads r3 as it was before this packet (the chunk count).
	 * The new values of r8 and r3 are not read afterwards: r8 is
	 * rewritten at .L14 and r3 is not used again on this path.
	 */
	{
		r8 = r3
		r3 = r6
		loop0(.L46,r3)
	}
	.falign
.L46:
	{
		dczeroa(r6)
		r6 = add(r6,#32)
		r2 = add(r2,#-32)
	}:endloop0 /* start=.L46 */
.L14:
	/* remaining 8-byte stores  */
	{
		p0 = cmp.gtu(r2,#7)
		if (!p0.new) jump:nt .L28
		r8 = lsr(r2,#3)
	}
		loop0(.L44,r8)
	.falign
.L44:
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}:endloop0 /* start=.L44 */
.L28:
	/* tail: word, then halfword, then byte, driven by the bits of r2  */
	{
		p0 = tstbit(r2,#2)
		if (!p0.new) jump:nt .L33
	}
	{
		r2 = add(r2,#-4)
		memw(r6++#4) = r7
	}
.L33:
	{
		p0 = tstbit(r2,#1)
		if (!p0.new) jump:nt .L35
	}
	{
		r2 = add(r2,#-2)
		memh(r6++#2) = r7
	}
.L35:
		p0 = cmp.eq(r2,#1)
		if (p0) memb(r6) = r1
.L1:
		jumpr r31
.L18:
	/*
	 * Non-zero fill, 32 bytes per iteration: dczeroa on the chunk, then
	 * four 8-byte stores overwrite it with the pattern.
	 * NOTE(review): the dczeroa is presumably there to claim the cache
	 * line without first reading it from memory - confirm against the
	 * architecture manual.
	 */
		loop0(.L45,r3)
	.falign
.L45:
		dczeroa(r6)
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-32)
	}
		memd(r6++#8) = r5:4
		memd(r6++#8) = r5:4
	{
		memd(r6++#8) = r5:4
	}:endloop0 /* start=.L45  */
		jump .L14
HEXAGON_OPT_FUNC_FINISH memset
#endif
