/*
 * M7memset.S: SPARC M7 optimized memset.
 *
 * Copyright (c) 2016, Oracle and/or its affiliates.  All rights reserved.
 */

/*
 * M7memset.S: M7 optimized memset.
 *
 * char *memset(sp, c, n)
 *
 * Set an array of n chars starting at sp to the character c.
 * Return sp.
 *
 * Fast assembler language version of the following C-program for memset
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memset(void *sp1, int c, size_t n)
 *	{
 *	    if (n != 0) {
 *		char *sp = sp1;
 *		do {
 *		    *sp++ = (char)c;
 *		} while (--n != 0);
 *	    }
 *	    return (sp1);
 *	}
 *
 * The algorithm is as follows :
 *
 *	For small stores of 7 or fewer bytes, bytes will be stored.
 *
 *	For stores of less than 32 bytes, align the address on a 4 byte
 *	boundary. Then store as many 4-byte chunks as possible, followed by
 *	trailing bytes.
 *
 *	For sizes of 32 bytes or more, align the address on an 8 byte boundary.
 *	if (count >= 64) {
 *		store 8-byte chunks to align the address on a 64 byte boundary
 *		if (value to be set is zero && count >= MIN_ZERO) {
 *			Using BIS stores, set the first long word of each
 *			64-byte cache line to zero which will also clear the
 *			other seven long words of the cache line.
 *		}
 *		else if (count >= MIN_LOOP) {
 *			Using BIS stores, set the first long word of each of
 *			ST_CHUNK cache lines (64 bytes each) before the main
 *			loop is entered.
 *			In the main loop, continue pre-setting the first long
 *			word of each cache line ST_CHUNK lines in advance while
 *			setting the other seven long words (56 bytes) of each
 *			cache line until fewer than ST_CHUNK*64 bytes remain.
 *			Then set the remaining seven long words of each cache
 *			line that has already had its first long word set.
 *		}
 *		store remaining data in 64-byte chunks until less than
 *		64 bytes remain.
 *	}
 *	Store as many 8-byte chunks as possible, followed by trailing bytes.
 *
 * BIS = Block Init Store
 *   Doing the advance store of the first element of the cache line
 *   initiates the displacement of a cache line while only using a single
 *   instruction in the pipeline. That avoids various pipeline delays,
 *   such as filling the miss buffer. The performance effect is
 *   similar to prefetching for normal stores.
 *   The special case for zero fills runs faster and uses fewer instruction
 *   cycles than the normal memset loop.
 *
 * We only use BIS for memsets of at least MIN_LOOP bytes (MIN_ZERO bytes
 * for zero fills) because a sequence of BIS stores must be followed by a
 * membar #StoreStore. The benefit of the BIS store must be balanced against
 * the cost of the membar operation.
 */

/*
 * ASI_STBI_P marks the cache line as "least recently used"
 * which means if many threads are active, it has a high chance
 * of being pushed out of the cache between the first initializing
 * store and the final stores.
 * Thus, we use ASI_STBIMRU_P which marks the cache line as
 * "most recently used" for all but the last store to the cache line.
 */

#include <asm/asi.h>
#include <asm/page.h>

/* Short local names for the two block-init store ASIs used below. */
#define ASI_STBI_P      ASI_BLK_INIT_QUAD_LDD_P
#define ASI_STBIMRU_P   ASI_ST_BLKINIT_MRU_P

/*
 * Tuning constants.
 *	ST_CHUNK	number of cache lines whose first long word is set
 *			ahead of the stores that fill the rest of the line
 *	MIN_LOOP	fewest bytes of whole 64-byte blocks for which a
 *			non-zero fill takes the block-init store loop
 *	MIN_ZERO	the same threshold for a zero fill
 */
#define ST_CHUNK        24   /* multiple of 4 due to loop unrolling */
#define MIN_LOOP        16320
#define MIN_ZERO        512

	.section	".text"
	.align		32

/*
 * Define clear_page(dest) as memset(dest, 0, PAGE_SIZE)
 * (can create a more optimized version later.)
 *
 * In:	%o0 = dest
 *
 * Both entry points only load PAGE_SIZE into %o1 and then fall through
 * into M7bzero, which expects the byte count in that register.  Whatever
 * the caller passed in %o1 is overwritten.
 */
	.globl		M7clear_page
	.type		M7clear_page, #function
	.globl		M7clear_user_page
	.type		M7clear_user_page, #function
M7clear_page:		/* clear_page(dest) */
M7clear_user_page:
	set	PAGE_SIZE, %o1
	/* fall through into bzero code */

	.size		M7clear_page,.-M7clear_page
	.size		M7clear_user_page,.-M7clear_user_page

/*
 * Define bzero(dest, n) as memset(dest, 0, n)
 * (can create a more optimized version later.)
 *
 * In:	%o0 = dest, %o1 = n
 *
 * Rearranges the arguments into the memset layout (%o1 = fill value,
 * %o2 = count) and falls through into M7memset.
 */
	.globl		M7bzero
	.type		M7bzero, #function
M7bzero:		/* bzero(dest, size) */
	mov	%o1, %o2		/* count moves to the third argument */
	mov	0, %o1			/* fill value is zero */
	/* fall through into memset code */

	.size		M7bzero,.-M7bzero

1248c2ecf20Sopenharmony_ci	.global		M7memset
1258c2ecf20Sopenharmony_ci	.type		M7memset, #function
1268c2ecf20Sopenharmony_ci	.register	%g3, #scratch
1278c2ecf20Sopenharmony_ciM7memset:
1288c2ecf20Sopenharmony_ci	mov     %o0, %o5                ! copy sp1 before using it
1298c2ecf20Sopenharmony_ci	cmp     %o2, 7                  ! if small counts, just write bytes
1308c2ecf20Sopenharmony_ci	bleu,pn %xcc, .wrchar
1318c2ecf20Sopenharmony_ci	 and     %o1, 0xff, %o1          ! o1 is (char)c
1328c2ecf20Sopenharmony_ci
1338c2ecf20Sopenharmony_ci	sll     %o1, 8, %o3
1348c2ecf20Sopenharmony_ci	or      %o1, %o3, %o1           ! now o1 has 2 bytes of c
1358c2ecf20Sopenharmony_ci	sll     %o1, 16, %o3
1368c2ecf20Sopenharmony_ci	cmp     %o2, 32
1378c2ecf20Sopenharmony_ci	blu,pn  %xcc, .wdalign
1388c2ecf20Sopenharmony_ci	 or      %o1, %o3, %o1           ! now o1 has 4 bytes of c
1398c2ecf20Sopenharmony_ci
1408c2ecf20Sopenharmony_ci	sllx    %o1, 32, %o3
1418c2ecf20Sopenharmony_ci	or      %o1, %o3, %o1           ! now o1 has 8 bytes of c
1428c2ecf20Sopenharmony_ci
1438c2ecf20Sopenharmony_ci.dbalign:
1448c2ecf20Sopenharmony_ci	andcc   %o5, 7, %o3             ! is sp1 aligned on a 8 byte bound?
1458c2ecf20Sopenharmony_ci	bz,pt   %xcc, .blkalign         ! already long word aligned
1468c2ecf20Sopenharmony_ci	 sub     %o3, 8, %o3             ! -(bytes till long word aligned)
1478c2ecf20Sopenharmony_ci
1488c2ecf20Sopenharmony_ci	add     %o2, %o3, %o2           ! update o2 with new count
1498c2ecf20Sopenharmony_ci	! Set -(%o3) bytes till sp1 long word aligned
1508c2ecf20Sopenharmony_ci1:	stb     %o1, [%o5]              ! there is at least 1 byte to set
1518c2ecf20Sopenharmony_ci	inccc   %o3                     ! byte clearing loop
1528c2ecf20Sopenharmony_ci	bl,pt   %xcc, 1b
1538c2ecf20Sopenharmony_ci	 inc     %o5
1548c2ecf20Sopenharmony_ci
1558c2ecf20Sopenharmony_ci	! Now sp1 is long word aligned (sp1 is found in %o5)
1568c2ecf20Sopenharmony_ci.blkalign:
1578c2ecf20Sopenharmony_ci	cmp     %o2, 64                 ! check if there are 64 bytes to set
1588c2ecf20Sopenharmony_ci	blu,pn  %xcc, .wrshort
1598c2ecf20Sopenharmony_ci	 mov     %o2, %o3
1608c2ecf20Sopenharmony_ci
1618c2ecf20Sopenharmony_ci	andcc   %o5, 63, %o3            ! is sp1 block aligned?
1628c2ecf20Sopenharmony_ci	bz,pt   %xcc, .blkwr            ! now block aligned
1638c2ecf20Sopenharmony_ci	 sub     %o3, 64, %o3            ! o3 is -(bytes till block aligned)
1648c2ecf20Sopenharmony_ci	add     %o2, %o3, %o2           ! o2 is the remainder
1658c2ecf20Sopenharmony_ci
1668c2ecf20Sopenharmony_ci	! Store -(%o3) bytes till dst is block (64 byte) aligned.
1678c2ecf20Sopenharmony_ci	! Use long word stores.
1688c2ecf20Sopenharmony_ci	! Recall that dst is already long word aligned
1698c2ecf20Sopenharmony_ci1:
1708c2ecf20Sopenharmony_ci	addcc   %o3, 8, %o3
1718c2ecf20Sopenharmony_ci	stx     %o1, [%o5]
1728c2ecf20Sopenharmony_ci	bl,pt   %xcc, 1b
1738c2ecf20Sopenharmony_ci	 add     %o5, 8, %o5
1748c2ecf20Sopenharmony_ci
1758c2ecf20Sopenharmony_ci	! Now sp1 is block aligned
1768c2ecf20Sopenharmony_ci.blkwr:
1778c2ecf20Sopenharmony_ci	andn    %o2, 63, %o4            ! calculate size of blocks in bytes
1788c2ecf20Sopenharmony_ci	brz,pn  %o1, .wrzero            ! special case if c == 0
1798c2ecf20Sopenharmony_ci	 and     %o2, 63, %o3            ! %o3 = bytes left after blk stores.
1808c2ecf20Sopenharmony_ci
1818c2ecf20Sopenharmony_ci	set     MIN_LOOP, %g1
1828c2ecf20Sopenharmony_ci	cmp     %o4, %g1                ! check there are enough bytes to set
1838c2ecf20Sopenharmony_ci	blu,pn  %xcc, .short_set        ! to justify cost of membar
1848c2ecf20Sopenharmony_ci	                                ! must be > pre-cleared lines
1858c2ecf20Sopenharmony_ci	 nop
1868c2ecf20Sopenharmony_ci
1878c2ecf20Sopenharmony_ci	! initial cache-clearing stores
1888c2ecf20Sopenharmony_ci	! get store pipeline moving
1898c2ecf20Sopenharmony_ci	rd	%asi, %g3		! save %asi to be restored later
1908c2ecf20Sopenharmony_ci	wr     %g0, ASI_STBIMRU_P, %asi
1918c2ecf20Sopenharmony_ci
1928c2ecf20Sopenharmony_ci	! Primary memset loop for large memsets
1938c2ecf20Sopenharmony_ci.wr_loop:
1948c2ecf20Sopenharmony_ci	sub     %o5, 8, %o5		! adjust %o5 for ASI store alignment
1958c2ecf20Sopenharmony_ci	mov     ST_CHUNK, %g1
1968c2ecf20Sopenharmony_ci.wr_loop_start:
1978c2ecf20Sopenharmony_ci	stxa    %o1, [%o5+8]%asi
1988c2ecf20Sopenharmony_ci	subcc   %g1, 4, %g1
1998c2ecf20Sopenharmony_ci	stxa    %o1, [%o5+8+64]%asi
2008c2ecf20Sopenharmony_ci	add     %o5, 256, %o5
2018c2ecf20Sopenharmony_ci	stxa    %o1, [%o5+8-128]%asi
2028c2ecf20Sopenharmony_ci	bgu     %xcc, .wr_loop_start
2038c2ecf20Sopenharmony_ci	 stxa    %o1, [%o5+8-64]%asi
2048c2ecf20Sopenharmony_ci
2058c2ecf20Sopenharmony_ci	sub     %o5, ST_CHUNK*64, %o5	! reset %o5
2068c2ecf20Sopenharmony_ci	mov     ST_CHUNK, %g1
2078c2ecf20Sopenharmony_ci
2088c2ecf20Sopenharmony_ci.wr_loop_rest:
2098c2ecf20Sopenharmony_ci	stxa    %o1, [%o5+8+8]%asi
2108c2ecf20Sopenharmony_ci	sub     %o4, 64, %o4
2118c2ecf20Sopenharmony_ci	stxa    %o1, [%o5+16+8]%asi
2128c2ecf20Sopenharmony_ci	subcc   %g1, 1, %g1
2138c2ecf20Sopenharmony_ci	stxa    %o1, [%o5+24+8]%asi
2148c2ecf20Sopenharmony_ci	stxa    %o1, [%o5+32+8]%asi
2158c2ecf20Sopenharmony_ci	stxa    %o1, [%o5+40+8]%asi
2168c2ecf20Sopenharmony_ci	add     %o5, 64, %o5
2178c2ecf20Sopenharmony_ci	stxa    %o1, [%o5-8]%asi
2188c2ecf20Sopenharmony_ci	bgu     %xcc, .wr_loop_rest
2198c2ecf20Sopenharmony_ci	 stxa    %o1, [%o5]ASI_STBI_P
2208c2ecf20Sopenharmony_ci
2218c2ecf20Sopenharmony_ci	! If more than ST_CHUNK*64 bytes remain to set, continue
2228c2ecf20Sopenharmony_ci	! setting the first long word of each cache line in advance
2238c2ecf20Sopenharmony_ci	! to keep the store pipeline moving.
2248c2ecf20Sopenharmony_ci
2258c2ecf20Sopenharmony_ci	cmp     %o4, ST_CHUNK*64
2268c2ecf20Sopenharmony_ci	bge,pt  %xcc, .wr_loop_start
2278c2ecf20Sopenharmony_ci	 mov     ST_CHUNK, %g1
2288c2ecf20Sopenharmony_ci
2298c2ecf20Sopenharmony_ci	brz,a,pn %o4, .asi_done
2308c2ecf20Sopenharmony_ci	 add     %o5, 8, %o5             ! restore %o5 offset
2318c2ecf20Sopenharmony_ci
2328c2ecf20Sopenharmony_ci.wr_loop_small:
2338c2ecf20Sopenharmony_ci	stxa    %o1, [%o5+8]%asi
2348c2ecf20Sopenharmony_ci	stxa    %o1, [%o5+8+8]%asi
2358c2ecf20Sopenharmony_ci	stxa    %o1, [%o5+16+8]%asi
2368c2ecf20Sopenharmony_ci	stxa    %o1, [%o5+24+8]%asi
2378c2ecf20Sopenharmony_ci	stxa    %o1, [%o5+32+8]%asi
2388c2ecf20Sopenharmony_ci	subcc   %o4, 64, %o4
2398c2ecf20Sopenharmony_ci	stxa    %o1, [%o5+40+8]%asi
2408c2ecf20Sopenharmony_ci	add     %o5, 64, %o5
2418c2ecf20Sopenharmony_ci	stxa    %o1, [%o5-8]%asi
2428c2ecf20Sopenharmony_ci	bgu,pt  %xcc, .wr_loop_small
2438c2ecf20Sopenharmony_ci	 stxa    %o1, [%o5]ASI_STBI_P
2448c2ecf20Sopenharmony_ci
2458c2ecf20Sopenharmony_ci	ba      .asi_done
2468c2ecf20Sopenharmony_ci	 add     %o5, 8, %o5             ! restore %o5 offset
2478c2ecf20Sopenharmony_ci
2488c2ecf20Sopenharmony_ci	! Special case loop for zero fill memsets
2498c2ecf20Sopenharmony_ci	! For each 64 byte cache line, single STBI to first element
2508c2ecf20Sopenharmony_ci	! clears line
2518c2ecf20Sopenharmony_ci.wrzero:
2528c2ecf20Sopenharmony_ci	cmp     %o4, MIN_ZERO           ! check if enough bytes to set
2538c2ecf20Sopenharmony_ci					! to pay %asi + membar cost
2548c2ecf20Sopenharmony_ci	blu     %xcc, .short_set
2558c2ecf20Sopenharmony_ci	 nop
2568c2ecf20Sopenharmony_ci	sub     %o4, 256, %o4
2578c2ecf20Sopenharmony_ci
2588c2ecf20Sopenharmony_ci.wrzero_loop:
2598c2ecf20Sopenharmony_ci	mov     64, %g3
2608c2ecf20Sopenharmony_ci	stxa    %o1, [%o5]ASI_STBI_P
2618c2ecf20Sopenharmony_ci	subcc   %o4, 256, %o4
2628c2ecf20Sopenharmony_ci	stxa    %o1, [%o5+%g3]ASI_STBI_P
2638c2ecf20Sopenharmony_ci	add     %o5, 256, %o5
2648c2ecf20Sopenharmony_ci	sub     %g3, 192, %g3
2658c2ecf20Sopenharmony_ci	stxa    %o1, [%o5+%g3]ASI_STBI_P
2668c2ecf20Sopenharmony_ci	add %g3, 64, %g3
2678c2ecf20Sopenharmony_ci	bge,pt  %xcc, .wrzero_loop
2688c2ecf20Sopenharmony_ci	 stxa    %o1, [%o5+%g3]ASI_STBI_P
2698c2ecf20Sopenharmony_ci	add     %o4, 256, %o4
2708c2ecf20Sopenharmony_ci
2718c2ecf20Sopenharmony_ci	brz,pn  %o4, .bsi_done
2728c2ecf20Sopenharmony_ci	 nop
2738c2ecf20Sopenharmony_ci
2748c2ecf20Sopenharmony_ci.wrzero_small:
2758c2ecf20Sopenharmony_ci	stxa    %o1, [%o5]ASI_STBI_P
2768c2ecf20Sopenharmony_ci	subcc   %o4, 64, %o4
2778c2ecf20Sopenharmony_ci	bgu,pt  %xcc, .wrzero_small
2788c2ecf20Sopenharmony_ci	 add     %o5, 64, %o5
2798c2ecf20Sopenharmony_ci	ba,a	.bsi_done
2808c2ecf20Sopenharmony_ci
2818c2ecf20Sopenharmony_ci.asi_done:
2828c2ecf20Sopenharmony_ci	wr	%g3, 0x0, %asi		! restored saved %asi
2838c2ecf20Sopenharmony_ci.bsi_done:
2848c2ecf20Sopenharmony_ci	membar  #StoreStore             ! required by use of Block Store Init
2858c2ecf20Sopenharmony_ci
2868c2ecf20Sopenharmony_ci.short_set:
2878c2ecf20Sopenharmony_ci	cmp     %o4, 64                 ! check if 64 bytes to set
2888c2ecf20Sopenharmony_ci	blu     %xcc, 5f
2898c2ecf20Sopenharmony_ci	 nop
2908c2ecf20Sopenharmony_ci4:                                      ! set final blocks of 64 bytes
2918c2ecf20Sopenharmony_ci	stx     %o1, [%o5]
2928c2ecf20Sopenharmony_ci	stx     %o1, [%o5+8]
2938c2ecf20Sopenharmony_ci	stx     %o1, [%o5+16]
2948c2ecf20Sopenharmony_ci	stx     %o1, [%o5+24]
2958c2ecf20Sopenharmony_ci	subcc   %o4, 64, %o4
2968c2ecf20Sopenharmony_ci	stx     %o1, [%o5+32]
2978c2ecf20Sopenharmony_ci	stx     %o1, [%o5+40]
2988c2ecf20Sopenharmony_ci	add     %o5, 64, %o5
2998c2ecf20Sopenharmony_ci	stx     %o1, [%o5-16]
3008c2ecf20Sopenharmony_ci	bgu,pt  %xcc, 4b
3018c2ecf20Sopenharmony_ci	 stx     %o1, [%o5-8]
3028c2ecf20Sopenharmony_ci
3038c2ecf20Sopenharmony_ci5:
3048c2ecf20Sopenharmony_ci	! Set the remaining long words
3058c2ecf20Sopenharmony_ci.wrshort:
3068c2ecf20Sopenharmony_ci	subcc   %o3, 8, %o3             ! Can we store any long words?
3078c2ecf20Sopenharmony_ci	blu,pn  %xcc, .wrchars
3088c2ecf20Sopenharmony_ci	 and     %o2, 7, %o2             ! calc bytes left after long words
3098c2ecf20Sopenharmony_ci6:
3108c2ecf20Sopenharmony_ci	subcc   %o3, 8, %o3
3118c2ecf20Sopenharmony_ci	stx     %o1, [%o5]              ! store the long words
3128c2ecf20Sopenharmony_ci	bgeu,pt %xcc, 6b
3138c2ecf20Sopenharmony_ci	 add     %o5, 8, %o5
3148c2ecf20Sopenharmony_ci
3158c2ecf20Sopenharmony_ci.wrchars:                               ! check for extra chars
3168c2ecf20Sopenharmony_ci	brnz    %o2, .wrfin
3178c2ecf20Sopenharmony_ci	 nop
3188c2ecf20Sopenharmony_ci	retl
3198c2ecf20Sopenharmony_ci	 nop
3208c2ecf20Sopenharmony_ci
3218c2ecf20Sopenharmony_ci.wdalign:
3228c2ecf20Sopenharmony_ci	andcc   %o5, 3, %o3             ! is sp1 aligned on a word boundary
3238c2ecf20Sopenharmony_ci	bz,pn   %xcc, .wrword
3248c2ecf20Sopenharmony_ci	 andn    %o2, 3, %o3             ! create word sized count in %o3
3258c2ecf20Sopenharmony_ci
3268c2ecf20Sopenharmony_ci	dec     %o2                     ! decrement count
3278c2ecf20Sopenharmony_ci	stb     %o1, [%o5]              ! clear a byte
3288c2ecf20Sopenharmony_ci	b       .wdalign
3298c2ecf20Sopenharmony_ci	 inc     %o5                     ! next byte
3308c2ecf20Sopenharmony_ci
3318c2ecf20Sopenharmony_ci.wrword:
3328c2ecf20Sopenharmony_ci	subcc   %o3, 4, %o3
3338c2ecf20Sopenharmony_ci	st      %o1, [%o5]              ! 4-byte writing loop
3348c2ecf20Sopenharmony_ci	bnz,pt  %xcc, .wrword
3358c2ecf20Sopenharmony_ci	 add     %o5, 4, %o5
3368c2ecf20Sopenharmony_ci
3378c2ecf20Sopenharmony_ci	and     %o2, 3, %o2             ! leftover count, if any
3388c2ecf20Sopenharmony_ci
3398c2ecf20Sopenharmony_ci.wrchar:
3408c2ecf20Sopenharmony_ci	! Set the remaining bytes, if any
3418c2ecf20Sopenharmony_ci	brz     %o2, .exit
3428c2ecf20Sopenharmony_ci	 nop
3438c2ecf20Sopenharmony_ci.wrfin:
3448c2ecf20Sopenharmony_ci	deccc   %o2
3458c2ecf20Sopenharmony_ci	stb     %o1, [%o5]
3468c2ecf20Sopenharmony_ci	bgu,pt  %xcc, .wrfin
3478c2ecf20Sopenharmony_ci	 inc     %o5
3488c2ecf20Sopenharmony_ci.exit:
3498c2ecf20Sopenharmony_ci	retl                            ! %o0 was preserved
3508c2ecf20Sopenharmony_ci	 nop
3518c2ecf20Sopenharmony_ci
3528c2ecf20Sopenharmony_ci	.size		M7memset,.-M7memset
353