/*
 * M7memcpy: Optimized SPARC M7 memcpy
 *
 * Copyright (c) 2016, Oracle and/or its affiliates.  All rights reserved.
 */

	.file	"M7memcpy.S"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copying len bytes.
 * Note: this C code does not work for overlapped copies.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *		    char *s1 = s;
 *		    const char *s2 = s0;
 *		    do {
 *			*s1++ = *s2++;
 *		    } while (--n != 0);
 *		}
 *		return (s);
 *	}
 *
 *
 * SPARC T7/M7 Flow :
 *
 * if (count < SMALL_MAX) {
 *   if count < SHORTCOPY              (SHORTCOPY=3)
 *     copy bytes; exit with dst addr
 *   if src & dst aligned on word boundary but not long word boundary,
 *     copy with ldw/stw; branch to finish_up
 *   if src & dst aligned on long word boundary
 *     copy with ldx/stx; branch to finish_up
 *   if src & dst not aligned and length <= SHORTCHECK   (SHORTCHECK=14)
 *     copy bytes; exit with dst addr
 *   move enough bytes to get src to word boundary
 *   if dst now on word boundary
 * move_words:
 *     copy words; branch to finish_up
 *   if dst now on half word boundary
 *     load words, shift half words, store words; branch to finish_up
 *   if dst on byte 1
 *     load words, shift 3 bytes, store words; branch to finish_up
 *   if dst on byte 3
 *     load words, shift 1 byte, store words; branch to finish_up
 * finish_up:
 *     copy bytes; exit with dst addr
 * } else {                                         More than SMALL_MAX bytes
 *   move bytes until dst is on long word boundary
 *   if( src is on long word boundary ) {
 *     if (count < MED_MAX) {
 * finish_long:                                    src/dst aligned on 8 bytes
 *       copy with ldx/stx in 8-way unrolled loop;
 *       copy final 0-63 bytes; exit with dst addr
 *     } else {                              src/dst aligned; count > MED_MAX
 *       align dst on 64 byte boundary; for main data movement:
 *       prefetch src data to L2 cache; let HW prefetch move data to L1 cache
 *       Use BIS (block initializing store) to avoid copying store cache
 *       lines from memory. But pre-store first element of each cache line
 *       ST_CHUNK lines in advance of the rest of that cache line. That
 *       gives time for replacement cache lines to be written back without
 *       excess STQ and Miss Buffer filling. Repeat until near the end,
 *       then finish up storing before going to finish_long.
 *     }
 *   } else {                                   src/dst not aligned on 8 bytes
 *     if src is word aligned and count < MED_WMAX
 *       move words in 8-way unrolled loop
 *       move final 0-31 bytes; exit with dst addr
 *     if count < MED_UMAX
 *       use alignaddr/faligndata combined with ldd/std in 8-way
 *       unrolled loop to move data.
 *       go to unalign_done
 *     else
 *       setup alignaddr for faligndata instructions
 *       align dst on 64 byte boundary; prefetch src data to L1 cache
 *       loadx8, falign, block-store, prefetch loop
 *       (only use block-init-store when src/dst on 8 byte boundaries.)
 * unalign_done:
 *       move remaining bytes for unaligned cases. exit with dst addr.
 * }
 *
 */

#include <asm/visasm.h>
#include <asm/asi.h>

/*
 * Overridable build-time configuration.
 *
 * Every macro below that is wrapped in #ifndef is only a default: a file
 * that defines the macro before this point replaces the default.
 */

/*
 * NON_USER_COPY is defined only when neither EX_LD nor EX_ST was supplied
 * beforehand.  Further down, the unaligned path tests it to choose between
 * VISEntryHalfFast (with a failure label) and plain VISEntryHalf.
 */
#if !defined(EX_LD) && !defined(EX_ST)
#define NON_USER_COPY
#endif

/*
 * EX_LD / EX_LD_FP / EX_ST / EX_ST_FP wrap every load and store in the
 * routine.  The defaults expand to the instruction (x) alone and discard
 * the second argument (y), which at the call sites is a memcpy_retl_*
 * symbol.
 * NOTE(review): y is presumably a fault-fixup target used by variants that
 * override these macros -- confirm against the overriding files.
 */
#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif

/*
 * EX_RETVAL wraps the value moved into %o0 on return; the default passes
 * its argument through unchanged.
 */
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

/* LOAD(type,addr,dest) emits "type [addr], dest". */
#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

/* STORE(type,src,addr) emits "type src, [addr]". */
#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

/*
 * ASI_BLK_INIT_QUAD_LDD_P/ASI_BLK_INIT_QUAD_LDD_S marks the cache
 * line as "least recently used" which means if many threads are
 * active, it has a high probability of being pushed out of the cache
 * between the first initializing store and the final stores.
 * Thus, we use ASI_ST_BLKINIT_MRU_P/ASI_ST_BLKINIT_MRU_S which
 * marks the cache line as "most recently used" for all
 * but the last store to each cache line; that last store uses
 * STORE_ASI (the LRU variant).
 *
 * When SIMULATE_NIAGARA_ON_NON_NIAGARA is defined, both ASIs fall back
 * to 0x80 (ASI_P).
 */
#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80		/* ASI_P */
#endif
#endif

#ifndef STORE_MRU_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_MRU_ASI	ASI_ST_BLKINIT_MRU_P
#else
#define STORE_MRU_ASI	0x80		/* ASI_P */
#endif
#endif

/* STORE_INIT: 8-byte stxa through STORE_ASI. */
#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif

/* STORE_INIT_MRU: 8-byte stxa through STORE_MRU_ASI. */
#ifndef STORE_INIT_MRU
#define STORE_INIT_MRU(src,addr)	stxa src, [addr] STORE_MRU_ASI
#endif

/* Global symbol under which the routine is emitted. */
#ifndef FUNC_NAME
#define FUNC_NAME	M7memcpy
#endif

/*
 * PREAMBLE is expanded at function entry, right after the length sanity
 * trap; it is empty by default.
 */
#ifndef PREAMBLE
#define PREAMBLE
#endif

/*
 * Tuning constants.
 * BLOCK_SIZE is the 64-byte stride used by the block loops and the
 * prefetch-distance arithmetic.
 * NOTE(review): the entry code visible in this file compares the length
 * against the literals 3 and 19 rather than SHORTCOPY/SHORTCHECK -- confirm
 * where (or whether) SHORTCOPY, SHORTCHECK and SHORT_LONG are referenced.
 */
#define BLOCK_SIZE	64
#define SHORTCOPY	3
#define SHORTCHECK	14
#define SHORT_LONG	64	/* max copy for short longword-aligned case */
				/* must be at least 64 */
1708c2ecf20Sopenharmony_ci#define SMALL_MAX 128 1718c2ecf20Sopenharmony_ci#define MED_UMAX 1024 /* max copy for medium un-aligned case */ 1728c2ecf20Sopenharmony_ci#define MED_WMAX 1024 /* max copy for medium word-aligned case */ 1738c2ecf20Sopenharmony_ci#define MED_MAX 1024 /* max copy for medium longword-aligned case */ 1748c2ecf20Sopenharmony_ci#define ST_CHUNK 24 /* ST_CHUNK - block of values for BIS Store */ 1758c2ecf20Sopenharmony_ci#define ALIGN_PRE 24 /* distance for aligned prefetch loop */ 1768c2ecf20Sopenharmony_ci 1778c2ecf20Sopenharmony_ci .register %g2,#scratch 1788c2ecf20Sopenharmony_ci 1798c2ecf20Sopenharmony_ci .section ".text" 1808c2ecf20Sopenharmony_ci .global FUNC_NAME 1818c2ecf20Sopenharmony_ci .type FUNC_NAME, #function 1828c2ecf20Sopenharmony_ci .align 16 1838c2ecf20Sopenharmony_ciFUNC_NAME: 1848c2ecf20Sopenharmony_ci srlx %o2, 31, %g2 1858c2ecf20Sopenharmony_ci cmp %g2, 0 1868c2ecf20Sopenharmony_ci tne %xcc, 5 1878c2ecf20Sopenharmony_ci PREAMBLE 1888c2ecf20Sopenharmony_ci mov %o0, %g1 ! save %o0 1898c2ecf20Sopenharmony_ci brz,pn %o2, .Lsmallx 1908c2ecf20Sopenharmony_ci cmp %o2, 3 1918c2ecf20Sopenharmony_ci ble,pn %icc, .Ltiny_cp 1928c2ecf20Sopenharmony_ci cmp %o2, 19 1938c2ecf20Sopenharmony_ci ble,pn %icc, .Lsmall_cp 1948c2ecf20Sopenharmony_ci or %o0, %o1, %g2 1958c2ecf20Sopenharmony_ci cmp %o2, SMALL_MAX 1968c2ecf20Sopenharmony_ci bl,pn %icc, .Lmedium_cp 1978c2ecf20Sopenharmony_ci nop 1988c2ecf20Sopenharmony_ci 1998c2ecf20Sopenharmony_ci.Lmedium: 2008c2ecf20Sopenharmony_ci neg %o0, %o5 2018c2ecf20Sopenharmony_ci andcc %o5, 7, %o5 ! bytes till DST 8 byte aligned 2028c2ecf20Sopenharmony_ci brz,pt %o5, .Ldst_aligned_on_8 2038c2ecf20Sopenharmony_ci 2048c2ecf20Sopenharmony_ci ! %o5 has the bytes to be written in partial store. 2058c2ecf20Sopenharmony_ci sub %o2, %o5, %o2 2068c2ecf20Sopenharmony_ci sub %o1, %o0, %o1 ! %o1 gets the difference 2078c2ecf20Sopenharmony_ci7: ! 
dst aligning loop 2088c2ecf20Sopenharmony_ci add %o1, %o0, %o4 2098c2ecf20Sopenharmony_ci EX_LD(LOAD(ldub, %o4, %o4), memcpy_retl_o2_plus_o5) ! load one byte 2108c2ecf20Sopenharmony_ci subcc %o5, 1, %o5 2118c2ecf20Sopenharmony_ci EX_ST(STORE(stb, %o4, %o0), memcpy_retl_o2_plus_o5_plus_1) 2128c2ecf20Sopenharmony_ci bgu,pt %xcc, 7b 2138c2ecf20Sopenharmony_ci add %o0, 1, %o0 ! advance dst 2148c2ecf20Sopenharmony_ci add %o1, %o0, %o1 ! restore %o1 2158c2ecf20Sopenharmony_ci.Ldst_aligned_on_8: 2168c2ecf20Sopenharmony_ci andcc %o1, 7, %o5 2178c2ecf20Sopenharmony_ci brnz,pt %o5, .Lsrc_dst_unaligned_on_8 2188c2ecf20Sopenharmony_ci nop 2198c2ecf20Sopenharmony_ci 2208c2ecf20Sopenharmony_ci.Lsrc_dst_aligned_on_8: 2218c2ecf20Sopenharmony_ci ! check if we are copying MED_MAX or more bytes 2228c2ecf20Sopenharmony_ci set MED_MAX, %o3 2238c2ecf20Sopenharmony_ci cmp %o2, %o3 ! limit to store buffer size 2248c2ecf20Sopenharmony_ci bgu,pn %xcc, .Llarge_align8_copy 2258c2ecf20Sopenharmony_ci nop 2268c2ecf20Sopenharmony_ci 2278c2ecf20Sopenharmony_ci/* 2288c2ecf20Sopenharmony_ci * Special case for handling when src and dest are both long word aligned 2298c2ecf20Sopenharmony_ci * and total data to move is less than MED_MAX bytes 2308c2ecf20Sopenharmony_ci */ 2318c2ecf20Sopenharmony_ci.Lmedlong: 2328c2ecf20Sopenharmony_ci subcc %o2, 63, %o2 ! adjust length to allow cc test 2338c2ecf20Sopenharmony_ci ble,pn %xcc, .Lmedl63 ! skip big loop if less than 64 bytes 2348c2ecf20Sopenharmony_ci nop 2358c2ecf20Sopenharmony_ci.Lmedl64: 2368c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_63) ! load 2378c2ecf20Sopenharmony_ci subcc %o2, 64, %o2 ! decrement length count 2388c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_63_64) ! and store 2398c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_63_56) ! 
a block of 64 2408c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_63_56) 2418c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_63_48) 2428c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_63_48) 2438c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1+24, %o3), memcpy_retl_o2_plus_63_40) 2448c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o3, %o0+24), memcpy_retl_o2_plus_63_40) 2458c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_63_32)! load and store 2468c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o4, %o0+32), memcpy_retl_o2_plus_63_32) 2478c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1+40, %o3), memcpy_retl_o2_plus_63_24)! a block of 64 2488c2ecf20Sopenharmony_ci add %o1, 64, %o1 ! increase src ptr by 64 2498c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o3, %o0+40), memcpy_retl_o2_plus_63_24) 2508c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_63_16) 2518c2ecf20Sopenharmony_ci add %o0, 64, %o0 ! increase dst ptr by 64 2528c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_63_16) 2538c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_63_8) 2548c2ecf20Sopenharmony_ci bgu,pt %xcc, .Lmedl64 ! repeat if at least 64 bytes left 2558c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_63_8) 2568c2ecf20Sopenharmony_ci.Lmedl63: 2578c2ecf20Sopenharmony_ci addcc %o2, 32, %o2 ! adjust remaining count 2588c2ecf20Sopenharmony_ci ble,pt %xcc, .Lmedl31 ! to skip if 31 or fewer bytes left 2598c2ecf20Sopenharmony_ci nop 2608c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_31) ! load 2618c2ecf20Sopenharmony_ci sub %o2, 32, %o2 ! decrement length count 2628c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_31_32) ! and store 2638c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_31_24) ! a block of 32 2648c2ecf20Sopenharmony_ci add %o1, 32, %o1 ! 
increase src ptr by 32 2658c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_31_24) 2668c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_31_16) 2678c2ecf20Sopenharmony_ci add %o0, 32, %o0 ! increase dst ptr by 32 2688c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_31_16) 2698c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_31_8) 2708c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_31_8) 2718c2ecf20Sopenharmony_ci.Lmedl31: 2728c2ecf20Sopenharmony_ci addcc %o2, 16, %o2 ! adjust remaining count 2738c2ecf20Sopenharmony_ci ble,pt %xcc, .Lmedl15 ! skip if 15 or fewer bytes left 2748c2ecf20Sopenharmony_ci nop ! 2758c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_15) 2768c2ecf20Sopenharmony_ci add %o1, 16, %o1 ! increase src ptr by 16 2778c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_15) 2788c2ecf20Sopenharmony_ci sub %o2, 16, %o2 ! decrease count by 16 2798c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_15_8) 2808c2ecf20Sopenharmony_ci add %o0, 16, %o0 ! increase dst ptr by 16 2818c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_15_8) 2828c2ecf20Sopenharmony_ci.Lmedl15: 2838c2ecf20Sopenharmony_ci addcc %o2, 15, %o2 ! restore count 2848c2ecf20Sopenharmony_ci bz,pt %xcc, .Lsmallx ! exit if finished 2858c2ecf20Sopenharmony_ci cmp %o2, 8 2868c2ecf20Sopenharmony_ci blt,pt %xcc, .Lmedw7 ! skip if 7 or fewer bytes left 2878c2ecf20Sopenharmony_ci tst %o2 2888c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2) ! load 8 bytes 2898c2ecf20Sopenharmony_ci add %o1, 8, %o1 ! increase src ptr by 8 2908c2ecf20Sopenharmony_ci add %o0, 8, %o0 ! increase dst ptr by 8 2918c2ecf20Sopenharmony_ci subcc %o2, 8, %o2 ! decrease count by 8 2928c2ecf20Sopenharmony_ci bnz,pn %xcc, .Lmedw7 2938c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8) ! 
and store 8 2948c2ecf20Sopenharmony_ci retl 2958c2ecf20Sopenharmony_ci mov EX_RETVAL(%g1), %o0 ! restore %o0 2968c2ecf20Sopenharmony_ci 2978c2ecf20Sopenharmony_ci .align 16 2988c2ecf20Sopenharmony_ci.Lsrc_dst_unaligned_on_8: 2998c2ecf20Sopenharmony_ci ! DST is 8-byte aligned, src is not 3008c2ecf20Sopenharmony_ci2: 3018c2ecf20Sopenharmony_ci andcc %o1, 0x3, %o5 ! test word alignment 3028c2ecf20Sopenharmony_ci bnz,pt %xcc, .Lunalignsetup ! branch to skip if not word aligned 3038c2ecf20Sopenharmony_ci nop 3048c2ecf20Sopenharmony_ci 3058c2ecf20Sopenharmony_ci/* 3068c2ecf20Sopenharmony_ci * Handle all cases where src and dest are aligned on word 3078c2ecf20Sopenharmony_ci * boundaries. Use unrolled loops for better performance. 3088c2ecf20Sopenharmony_ci * This option wins over standard large data move when 3098c2ecf20Sopenharmony_ci * source and destination is in cache for.Lmedium 3108c2ecf20Sopenharmony_ci * to short data moves. 3118c2ecf20Sopenharmony_ci */ 3128c2ecf20Sopenharmony_ci set MED_WMAX, %o3 3138c2ecf20Sopenharmony_ci cmp %o2, %o3 ! limit to store buffer size 3148c2ecf20Sopenharmony_ci bge,pt %xcc, .Lunalignrejoin ! otherwise rejoin main loop 3158c2ecf20Sopenharmony_ci nop 3168c2ecf20Sopenharmony_ci 3178c2ecf20Sopenharmony_ci subcc %o2, 31, %o2 ! adjust length to allow cc test 3188c2ecf20Sopenharmony_ci ! for end of loop 3198c2ecf20Sopenharmony_ci ble,pt %xcc, .Lmedw31 ! skip big loop if less than 16 3208c2ecf20Sopenharmony_ci.Lmedw32: 3218c2ecf20Sopenharmony_ci EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_31)! move a block of 32 3228c2ecf20Sopenharmony_ci sllx %o4, 32, %o5 3238c2ecf20Sopenharmony_ci EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_31) 3248c2ecf20Sopenharmony_ci or %o4, %o5, %o5 3258c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_31) 3268c2ecf20Sopenharmony_ci subcc %o2, 32, %o2 ! 
decrement length count 3278c2ecf20Sopenharmony_ci EX_LD(LOAD(ld, %o1+8, %o4), memcpy_retl_o2_plus_31_24) 3288c2ecf20Sopenharmony_ci sllx %o4, 32, %o5 3298c2ecf20Sopenharmony_ci EX_LD(LOAD(ld, %o1+12, %o4), memcpy_retl_o2_plus_31_24) 3308c2ecf20Sopenharmony_ci or %o4, %o5, %o5 3318c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o5, %o0+8), memcpy_retl_o2_plus_31_24) 3328c2ecf20Sopenharmony_ci add %o1, 32, %o1 ! increase src ptr by 32 3338c2ecf20Sopenharmony_ci EX_LD(LOAD(ld, %o1-16, %o4), memcpy_retl_o2_plus_31_16) 3348c2ecf20Sopenharmony_ci sllx %o4, 32, %o5 3358c2ecf20Sopenharmony_ci EX_LD(LOAD(ld, %o1-12, %o4), memcpy_retl_o2_plus_31_16) 3368c2ecf20Sopenharmony_ci or %o4, %o5, %o5 3378c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o5, %o0+16), memcpy_retl_o2_plus_31_16) 3388c2ecf20Sopenharmony_ci add %o0, 32, %o0 ! increase dst ptr by 32 3398c2ecf20Sopenharmony_ci EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_31_8) 3408c2ecf20Sopenharmony_ci sllx %o4, 32, %o5 3418c2ecf20Sopenharmony_ci EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_31_8) 3428c2ecf20Sopenharmony_ci or %o4, %o5, %o5 3438c2ecf20Sopenharmony_ci bgu,pt %xcc, .Lmedw32 ! repeat if at least 32 bytes left 3448c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_31_8) 3458c2ecf20Sopenharmony_ci.Lmedw31: 3468c2ecf20Sopenharmony_ci addcc %o2, 31, %o2 ! restore count 3478c2ecf20Sopenharmony_ci 3488c2ecf20Sopenharmony_ci bz,pt %xcc, .Lsmallx ! exit if finished 3498c2ecf20Sopenharmony_ci nop 3508c2ecf20Sopenharmony_ci cmp %o2, 16 3518c2ecf20Sopenharmony_ci blt,pt %xcc, .Lmedw15 3528c2ecf20Sopenharmony_ci nop 3538c2ecf20Sopenharmony_ci EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)! move a block of 16 bytes 3548c2ecf20Sopenharmony_ci sllx %o4, 32, %o5 3558c2ecf20Sopenharmony_ci subcc %o2, 16, %o2 ! 
decrement length count 3568c2ecf20Sopenharmony_ci EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_16) 3578c2ecf20Sopenharmony_ci or %o4, %o5, %o5 3588c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_16) 3598c2ecf20Sopenharmony_ci add %o1, 16, %o1 ! increase src ptr by 16 3608c2ecf20Sopenharmony_ci EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_8) 3618c2ecf20Sopenharmony_ci add %o0, 16, %o0 ! increase dst ptr by 16 3628c2ecf20Sopenharmony_ci sllx %o4, 32, %o5 3638c2ecf20Sopenharmony_ci EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_8) 3648c2ecf20Sopenharmony_ci or %o4, %o5, %o5 3658c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_8) 3668c2ecf20Sopenharmony_ci.Lmedw15: 3678c2ecf20Sopenharmony_ci bz,pt %xcc, .Lsmallx ! exit if finished 3688c2ecf20Sopenharmony_ci cmp %o2, 8 3698c2ecf20Sopenharmony_ci blt,pn %xcc, .Lmedw7 ! skip if 7 or fewer bytes left 3708c2ecf20Sopenharmony_ci tst %o2 3718c2ecf20Sopenharmony_ci EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2) ! load 4 bytes 3728c2ecf20Sopenharmony_ci subcc %o2, 8, %o2 ! decrease count by 8 3738c2ecf20Sopenharmony_ci EX_ST(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_8)! and store 4 bytes 3748c2ecf20Sopenharmony_ci add %o1, 8, %o1 ! increase src ptr by 8 3758c2ecf20Sopenharmony_ci EX_LD(LOAD(ld, %o1-4, %o3), memcpy_retl_o2_plus_4) ! load 4 bytes 3768c2ecf20Sopenharmony_ci add %o0, 8, %o0 ! increase dst ptr by 8 3778c2ecf20Sopenharmony_ci EX_ST(STORE(stw, %o3, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes 3788c2ecf20Sopenharmony_ci bz,pt %xcc, .Lsmallx ! exit if finished 3798c2ecf20Sopenharmony_ci.Lmedw7: ! count is ge 1, less than 8 3808c2ecf20Sopenharmony_ci cmp %o2, 4 ! check for 4 bytes left 3818c2ecf20Sopenharmony_ci blt,pn %xcc, .Lsmallleft3 ! skip if 3 or fewer bytes left 3828c2ecf20Sopenharmony_ci nop ! 3838c2ecf20Sopenharmony_ci EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2) ! load 4 bytes 3848c2ecf20Sopenharmony_ci add %o1, 4, %o1 ! 
increase src ptr by 4 3858c2ecf20Sopenharmony_ci add %o0, 4, %o0 ! increase dst ptr by 4 3868c2ecf20Sopenharmony_ci subcc %o2, 4, %o2 ! decrease count by 4 3878c2ecf20Sopenharmony_ci bnz .Lsmallleft3 3888c2ecf20Sopenharmony_ci EX_ST(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes 3898c2ecf20Sopenharmony_ci retl 3908c2ecf20Sopenharmony_ci mov EX_RETVAL(%g1), %o0 3918c2ecf20Sopenharmony_ci 3928c2ecf20Sopenharmony_ci .align 16 3938c2ecf20Sopenharmony_ci.Llarge_align8_copy: ! Src and dst share 8 byte alignment 3948c2ecf20Sopenharmony_ci ! align dst to 64 byte boundary 3958c2ecf20Sopenharmony_ci andcc %o0, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned 3968c2ecf20Sopenharmony_ci brz,pn %o3, .Laligned_to_64 3978c2ecf20Sopenharmony_ci andcc %o0, 8, %o3 ! odd long words to move? 3988c2ecf20Sopenharmony_ci brz,pt %o3, .Laligned_to_16 3998c2ecf20Sopenharmony_ci nop 4008c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2) 4018c2ecf20Sopenharmony_ci sub %o2, 8, %o2 4028c2ecf20Sopenharmony_ci add %o1, 8, %o1 ! increment src ptr 4038c2ecf20Sopenharmony_ci add %o0, 8, %o0 ! increment dst ptr 4048c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8) 4058c2ecf20Sopenharmony_ci.Laligned_to_16: 4068c2ecf20Sopenharmony_ci andcc %o0, 16, %o3 ! pair of long words to move? 4078c2ecf20Sopenharmony_ci brz,pt %o3, .Laligned_to_32 4088c2ecf20Sopenharmony_ci nop 4098c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2) 4108c2ecf20Sopenharmony_ci sub %o2, 16, %o2 4118c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_16) 4128c2ecf20Sopenharmony_ci add %o1, 16, %o1 ! increment src ptr 4138c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8) 4148c2ecf20Sopenharmony_ci add %o0, 16, %o0 ! increment dst ptr 4158c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8) 4168c2ecf20Sopenharmony_ci.Laligned_to_32: 4178c2ecf20Sopenharmony_ci andcc %o0, 32, %o3 ! 
four long words to move? 4188c2ecf20Sopenharmony_ci brz,pt %o3, .Laligned_to_64 4198c2ecf20Sopenharmony_ci nop 4208c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2) 4218c2ecf20Sopenharmony_ci sub %o2, 32, %o2 4228c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_32) 4238c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_24) 4248c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_24) 4258c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_16) 4268c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_16) 4278c2ecf20Sopenharmony_ci add %o1, 32, %o1 ! increment src ptr 4288c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8) 4298c2ecf20Sopenharmony_ci add %o0, 32, %o0 ! increment dst ptr 4308c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8) 4318c2ecf20Sopenharmony_ci.Laligned_to_64: 4328c2ecf20Sopenharmony_ci! 4338c2ecf20Sopenharmony_ci! Using block init store (BIS) instructions to avoid fetching cache 4348c2ecf20Sopenharmony_ci! lines from memory. Use ST_CHUNK stores to first element of each cache 4358c2ecf20Sopenharmony_ci! line (similar to prefetching) to avoid overfilling STQ or miss buffers. 4368c2ecf20Sopenharmony_ci! Gives existing cache lines time to be moved out of L1/L2/L3 cache. 4378c2ecf20Sopenharmony_ci! Initial stores using MRU version of BIS to keep cache line in 4388c2ecf20Sopenharmony_ci! cache until we are ready to store final element of cache line. 4398c2ecf20Sopenharmony_ci! Then store last element using the LRU version of BIS. 4408c2ecf20Sopenharmony_ci! 4418c2ecf20Sopenharmony_ci andn %o2, 0x3f, %o5 ! %o5 is multiple of block size 4428c2ecf20Sopenharmony_ci and %o2, 0x3f, %o2 ! residue bytes in %o2 4438c2ecf20Sopenharmony_ci! 4448c2ecf20Sopenharmony_ci! We use STORE_MRU_ASI for the first seven stores to each cache line 4458c2ecf20Sopenharmony_ci! 
followed by STORE_ASI (mark as LRU) for the last store. That 4468c2ecf20Sopenharmony_ci! mixed approach reduces the probability that the cache line is removed 4478c2ecf20Sopenharmony_ci! before we finish setting it, while minimizing the effects on 4488c2ecf20Sopenharmony_ci! other cached values during a large memcpy 4498c2ecf20Sopenharmony_ci! 4508c2ecf20Sopenharmony_ci! ST_CHUNK batches up initial BIS operations for several cache lines 4518c2ecf20Sopenharmony_ci! to allow multiple requests to not be blocked by overflowing the 4528c2ecf20Sopenharmony_ci! the store miss buffer. Then the matching stores for all those 4538c2ecf20Sopenharmony_ci! BIS operations are executed. 4548c2ecf20Sopenharmony_ci! 4558c2ecf20Sopenharmony_ci 4568c2ecf20Sopenharmony_ci sub %o0, 8, %o0 ! adjust %o0 for ASI alignment 4578c2ecf20Sopenharmony_ci.Lalign_loop: 4588c2ecf20Sopenharmony_ci cmp %o5, ST_CHUNK*64 4598c2ecf20Sopenharmony_ci blu,pt %xcc, .Lalign_loop_fin 4608c2ecf20Sopenharmony_ci mov ST_CHUNK,%o3 4618c2ecf20Sopenharmony_ci.Lalign_loop_start: 4628c2ecf20Sopenharmony_ci prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21 4638c2ecf20Sopenharmony_ci subcc %o3, 1, %o3 4648c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5) 4658c2ecf20Sopenharmony_ci add %o1, 64, %o1 4668c2ecf20Sopenharmony_ci add %o0, 8, %o0 4678c2ecf20Sopenharmony_ci EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5) 4688c2ecf20Sopenharmony_ci bgu %xcc,.Lalign_loop_start 4698c2ecf20Sopenharmony_ci add %o0, 56, %o0 4708c2ecf20Sopenharmony_ci 4718c2ecf20Sopenharmony_ci mov ST_CHUNK,%o3 4728c2ecf20Sopenharmony_ci sllx %o3, 6, %o4 ! ST_CHUNK*64 4738c2ecf20Sopenharmony_ci sub %o1, %o4, %o1 ! reset %o1 4748c2ecf20Sopenharmony_ci sub %o0, %o4, %o0 ! 
reset %o0 4758c2ecf20Sopenharmony_ci 4768c2ecf20Sopenharmony_ci.Lalign_loop_rest: 4778c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5) 4788c2ecf20Sopenharmony_ci add %o0, 16, %o0 4798c2ecf20Sopenharmony_ci EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5) 4808c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5) 4818c2ecf20Sopenharmony_ci add %o0, 8, %o0 4828c2ecf20Sopenharmony_ci EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5) 4838c2ecf20Sopenharmony_ci subcc %o3, 1, %o3 4848c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5) 4858c2ecf20Sopenharmony_ci add %o0, 8, %o0 4868c2ecf20Sopenharmony_ci EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5) 4878c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5) 4888c2ecf20Sopenharmony_ci add %o0, 8, %o0 4898c2ecf20Sopenharmony_ci EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5) 4908c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5) 4918c2ecf20Sopenharmony_ci add %o0, 8, %o0 4928c2ecf20Sopenharmony_ci EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5) 4938c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5) 4948c2ecf20Sopenharmony_ci add %o1, 64, %o1 4958c2ecf20Sopenharmony_ci add %o0, 8, %o0 4968c2ecf20Sopenharmony_ci EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5) 4978c2ecf20Sopenharmony_ci add %o0, 8, %o0 4988c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5) 4998c2ecf20Sopenharmony_ci sub %o5, 64, %o5 5008c2ecf20Sopenharmony_ci bgu %xcc,.Lalign_loop_rest 5018c2ecf20Sopenharmony_ci ! 
mark cache line as LRU 5028c2ecf20Sopenharmony_ci EX_ST(STORE_INIT(%o4, %o0), memcpy_retl_o2_plus_o5_plus_64) 5038c2ecf20Sopenharmony_ci 5048c2ecf20Sopenharmony_ci cmp %o5, ST_CHUNK*64 5058c2ecf20Sopenharmony_ci bgu,pt %xcc, .Lalign_loop_start 5068c2ecf20Sopenharmony_ci mov ST_CHUNK,%o3 5078c2ecf20Sopenharmony_ci 5088c2ecf20Sopenharmony_ci cmp %o5, 0 5098c2ecf20Sopenharmony_ci beq .Lalign_done 5108c2ecf20Sopenharmony_ci nop 5118c2ecf20Sopenharmony_ci.Lalign_loop_fin: 5128c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5) 5138c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_o5) 5148c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5) 5158c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o4, %o0+8+8), memcpy_retl_o2_plus_o5) 5168c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5) 5178c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o4, %o0+8+16), memcpy_retl_o2_plus_o5) 5188c2ecf20Sopenharmony_ci subcc %o5, 64, %o5 5198c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5_64) 5208c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o4, %o0+8+24), memcpy_retl_o2_plus_o5_64) 5218c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5_64) 5228c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o4, %o0+8+32), memcpy_retl_o2_plus_o5_64) 5238c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5_64) 5248c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o4, %o0+8+40), memcpy_retl_o2_plus_o5_64) 5258c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5_64) 5268c2ecf20Sopenharmony_ci add %o1, 64, %o1 5278c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o4, %o0+8+48), memcpy_retl_o2_plus_o5_64) 5288c2ecf20Sopenharmony_ci add %o0, 64, %o0 5298c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5_64) 5308c2ecf20Sopenharmony_ci bgu %xcc,.Lalign_loop_fin 5318c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_o5_64) 
5328c2ecf20Sopenharmony_ci 5338c2ecf20Sopenharmony_ci.Lalign_done: 5348c2ecf20Sopenharmony_ci add %o0, 8, %o0 ! restore %o0 from ASI alignment 5358c2ecf20Sopenharmony_ci membar #StoreStore 5368c2ecf20Sopenharmony_ci sub %o2, 63, %o2 ! adjust length to allow cc test 5378c2ecf20Sopenharmony_ci ba .Lmedl63 ! in .Lmedl63 5388c2ecf20Sopenharmony_ci nop 5398c2ecf20Sopenharmony_ci 5408c2ecf20Sopenharmony_ci .align 16 5418c2ecf20Sopenharmony_ci ! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX 5428c2ecf20Sopenharmony_ci.Lunalignsetup: 5438c2ecf20Sopenharmony_ci.Lunalignrejoin: 5448c2ecf20Sopenharmony_ci mov %g1, %o3 ! save %g1 as VISEntryHalf clobbers it 5458c2ecf20Sopenharmony_ci#ifdef NON_USER_COPY 5468c2ecf20Sopenharmony_ci VISEntryHalfFast(.Lmedium_vis_entry_fail_cp) 5478c2ecf20Sopenharmony_ci#else 5488c2ecf20Sopenharmony_ci VISEntryHalf 5498c2ecf20Sopenharmony_ci#endif 5508c2ecf20Sopenharmony_ci mov %o3, %g1 ! restore %g1 5518c2ecf20Sopenharmony_ci 5528c2ecf20Sopenharmony_ci set MED_UMAX, %o3 5538c2ecf20Sopenharmony_ci cmp %o2, %o3 ! check for.Lmedium unaligned limit 5548c2ecf20Sopenharmony_ci bge,pt %xcc,.Lunalign_large 5558c2ecf20Sopenharmony_ci prefetch [%o1 + (4 * BLOCK_SIZE)], 20 5568c2ecf20Sopenharmony_ci andn %o2, 0x3f, %o5 ! %o5 is multiple of block size 5578c2ecf20Sopenharmony_ci and %o2, 0x3f, %o2 ! residue bytes in %o2 5588c2ecf20Sopenharmony_ci cmp %o2, 8 ! Insure we do not load beyond 5598c2ecf20Sopenharmony_ci bgt .Lunalign_adjust ! end of source buffer 5608c2ecf20Sopenharmony_ci andn %o1, 0x7, %o4 ! %o4 has long word aligned src address 5618c2ecf20Sopenharmony_ci add %o2, 64, %o2 ! adjust to leave loop 5628c2ecf20Sopenharmony_ci sub %o5, 64, %o5 ! early if necessary 5638c2ecf20Sopenharmony_ci.Lunalign_adjust: 5648c2ecf20Sopenharmony_ci alignaddr %o1, %g0, %g0 ! generate %gsr 5658c2ecf20Sopenharmony_ci add %o1, %o5, %o1 ! 
advance %o1 to after blocks 5668c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5) 5678c2ecf20Sopenharmony_ci.Lunalign_loop: 5688c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5) 5698c2ecf20Sopenharmony_ci faligndata %f0, %f2, %f16 5708c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(ldd, %o4+16, %f4), memcpy_retl_o2_plus_o5) 5718c2ecf20Sopenharmony_ci subcc %o5, BLOCK_SIZE, %o5 5728c2ecf20Sopenharmony_ci EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5_plus_64) 5738c2ecf20Sopenharmony_ci faligndata %f2, %f4, %f18 5748c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(ldd, %o4+24, %f6), memcpy_retl_o2_plus_o5_plus_56) 5758c2ecf20Sopenharmony_ci EX_ST_FP(STORE(std, %f18, %o0+8), memcpy_retl_o2_plus_o5_plus_56) 5768c2ecf20Sopenharmony_ci faligndata %f4, %f6, %f20 5778c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(ldd, %o4+32, %f8), memcpy_retl_o2_plus_o5_plus_48) 5788c2ecf20Sopenharmony_ci EX_ST_FP(STORE(std, %f20, %o0+16), memcpy_retl_o2_plus_o5_plus_48) 5798c2ecf20Sopenharmony_ci faligndata %f6, %f8, %f22 5808c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(ldd, %o4+40, %f10), memcpy_retl_o2_plus_o5_plus_40) 5818c2ecf20Sopenharmony_ci EX_ST_FP(STORE(std, %f22, %o0+24), memcpy_retl_o2_plus_o5_plus_40) 5828c2ecf20Sopenharmony_ci faligndata %f8, %f10, %f24 5838c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(ldd, %o4+48, %f12), memcpy_retl_o2_plus_o5_plus_32) 5848c2ecf20Sopenharmony_ci EX_ST_FP(STORE(std, %f24, %o0+32), memcpy_retl_o2_plus_o5_plus_32) 5858c2ecf20Sopenharmony_ci faligndata %f10, %f12, %f26 5868c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(ldd, %o4+56, %f14), memcpy_retl_o2_plus_o5_plus_24) 5878c2ecf20Sopenharmony_ci add %o4, BLOCK_SIZE, %o4 5888c2ecf20Sopenharmony_ci EX_ST_FP(STORE(std, %f26, %o0+40), memcpy_retl_o2_plus_o5_plus_24) 5898c2ecf20Sopenharmony_ci faligndata %f12, %f14, %f28 5908c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5_plus_16) 5918c2ecf20Sopenharmony_ci EX_ST_FP(STORE(std, %f28, %o0+48), 
memcpy_retl_o2_plus_o5_plus_16) 5928c2ecf20Sopenharmony_ci faligndata %f14, %f0, %f30 5938c2ecf20Sopenharmony_ci EX_ST_FP(STORE(std, %f30, %o0+56), memcpy_retl_o2_plus_o5_plus_8) 5948c2ecf20Sopenharmony_ci add %o0, BLOCK_SIZE, %o0 5958c2ecf20Sopenharmony_ci bgu,pt %xcc, .Lunalign_loop 5968c2ecf20Sopenharmony_ci prefetch [%o4 + (5 * BLOCK_SIZE)], 20 5978c2ecf20Sopenharmony_ci ba .Lunalign_done 5988c2ecf20Sopenharmony_ci nop 5998c2ecf20Sopenharmony_ci 6008c2ecf20Sopenharmony_ci.Lunalign_large: 6018c2ecf20Sopenharmony_ci andcc %o0, 0x3f, %o3 ! is dst 64-byte block aligned? 6028c2ecf20Sopenharmony_ci bz %xcc, .Lunalignsrc 6038c2ecf20Sopenharmony_ci sub %o3, 64, %o3 ! %o3 will be multiple of 8 6048c2ecf20Sopenharmony_ci neg %o3 ! bytes until dest is 64 byte aligned 6058c2ecf20Sopenharmony_ci sub %o2, %o3, %o2 ! update cnt with bytes to be moved 6068c2ecf20Sopenharmony_ci ! Move bytes according to source alignment 6078c2ecf20Sopenharmony_ci andcc %o1, 0x1, %o5 6088c2ecf20Sopenharmony_ci bnz %xcc, .Lunalignbyte ! check for byte alignment 6098c2ecf20Sopenharmony_ci nop 6108c2ecf20Sopenharmony_ci andcc %o1, 2, %o5 ! check for half word alignment 6118c2ecf20Sopenharmony_ci bnz %xcc, .Lunalignhalf 6128c2ecf20Sopenharmony_ci nop 6138c2ecf20Sopenharmony_ci ! Src is word aligned 6148c2ecf20Sopenharmony_ci.Lunalignword: 6158c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_o3) ! load 4 bytes 6168c2ecf20Sopenharmony_ci add %o1, 8, %o1 ! increase src ptr by 8 6178c2ecf20Sopenharmony_ci EX_ST_FP(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_o3) ! and store 4 6188c2ecf20Sopenharmony_ci subcc %o3, 8, %o3 ! decrease count by 8 6198c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_o3_plus_4)! load 4 6208c2ecf20Sopenharmony_ci add %o0, 8, %o0 ! 
increase dst ptr by 8 6218c2ecf20Sopenharmony_ci bnz %xcc, .Lunalignword 6228c2ecf20Sopenharmony_ci EX_ST_FP(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_o3_plus_4) 6238c2ecf20Sopenharmony_ci ba .Lunalignsrc 6248c2ecf20Sopenharmony_ci nop 6258c2ecf20Sopenharmony_ci 6268c2ecf20Sopenharmony_ci ! Src is half-word aligned 6278c2ecf20Sopenharmony_ci.Lunalignhalf: 6288c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(lduh, %o1, %o4), memcpy_retl_o2_plus_o3) ! load 2 bytes 6298c2ecf20Sopenharmony_ci sllx %o4, 32, %o5 ! shift left 6308c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(lduw, %o1+2, %o4), memcpy_retl_o2_plus_o3) 6318c2ecf20Sopenharmony_ci or %o4, %o5, %o5 6328c2ecf20Sopenharmony_ci sllx %o5, 16, %o5 6338c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(lduh, %o1+6, %o4), memcpy_retl_o2_plus_o3) 6348c2ecf20Sopenharmony_ci or %o4, %o5, %o5 6358c2ecf20Sopenharmony_ci EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3) 6368c2ecf20Sopenharmony_ci add %o1, 8, %o1 6378c2ecf20Sopenharmony_ci subcc %o3, 8, %o3 6388c2ecf20Sopenharmony_ci bnz %xcc, .Lunalignhalf 6398c2ecf20Sopenharmony_ci add %o0, 8, %o0 6408c2ecf20Sopenharmony_ci ba .Lunalignsrc 6418c2ecf20Sopenharmony_ci nop 6428c2ecf20Sopenharmony_ci 6438c2ecf20Sopenharmony_ci ! Src is Byte aligned 6448c2ecf20Sopenharmony_ci.Lunalignbyte: 6458c2ecf20Sopenharmony_ci sub %o0, %o1, %o0 ! 
share pointer advance 6468c2ecf20Sopenharmony_ci.Lunalignbyte_loop: 6478c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(ldub, %o1, %o4), memcpy_retl_o2_plus_o3) 6488c2ecf20Sopenharmony_ci sllx %o4, 56, %o5 6498c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(lduh, %o1+1, %o4), memcpy_retl_o2_plus_o3) 6508c2ecf20Sopenharmony_ci sllx %o4, 40, %o4 6518c2ecf20Sopenharmony_ci or %o4, %o5, %o5 6528c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(lduh, %o1+3, %o4), memcpy_retl_o2_plus_o3) 6538c2ecf20Sopenharmony_ci sllx %o4, 24, %o4 6548c2ecf20Sopenharmony_ci or %o4, %o5, %o5 6558c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(lduh, %o1+5, %o4), memcpy_retl_o2_plus_o3) 6568c2ecf20Sopenharmony_ci sllx %o4, 8, %o4 6578c2ecf20Sopenharmony_ci or %o4, %o5, %o5 6588c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(ldub, %o1+7, %o4), memcpy_retl_o2_plus_o3) 6598c2ecf20Sopenharmony_ci or %o4, %o5, %o5 6608c2ecf20Sopenharmony_ci add %o0, %o1, %o0 6618c2ecf20Sopenharmony_ci EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3) 6628c2ecf20Sopenharmony_ci sub %o0, %o1, %o0 6638c2ecf20Sopenharmony_ci subcc %o3, 8, %o3 6648c2ecf20Sopenharmony_ci bnz %xcc, .Lunalignbyte_loop 6658c2ecf20Sopenharmony_ci add %o1, 8, %o1 6668c2ecf20Sopenharmony_ci add %o0,%o1, %o0 ! restore pointer 6678c2ecf20Sopenharmony_ci 6688c2ecf20Sopenharmony_ci ! Destination is now block (64 byte aligned) 6698c2ecf20Sopenharmony_ci.Lunalignsrc: 6708c2ecf20Sopenharmony_ci andn %o2, 0x3f, %o5 ! %o5 is multiple of block size 6718c2ecf20Sopenharmony_ci and %o2, 0x3f, %o2 ! residue bytes in %o2 6728c2ecf20Sopenharmony_ci add %o2, 64, %o2 ! Insure we do not load beyond 6738c2ecf20Sopenharmony_ci sub %o5, 64, %o5 ! end of source buffer 6748c2ecf20Sopenharmony_ci 6758c2ecf20Sopenharmony_ci andn %o1, 0x7, %o4 ! %o4 has long word aligned src address 6768c2ecf20Sopenharmony_ci alignaddr %o1, %g0, %g0 ! generate %gsr 6778c2ecf20Sopenharmony_ci add %o1, %o5, %o1 ! 
advance %o1 to after blocks 6788c2ecf20Sopenharmony_ci 6798c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(ldd, %o4, %f14), memcpy_retl_o2_plus_o5) 6808c2ecf20Sopenharmony_ci add %o4, 8, %o4 6818c2ecf20Sopenharmony_ci.Lunalign_sloop: 6828c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(ldd, %o4, %f16), memcpy_retl_o2_plus_o5) 6838c2ecf20Sopenharmony_ci faligndata %f14, %f16, %f0 6848c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(ldd, %o4+8, %f18), memcpy_retl_o2_plus_o5) 6858c2ecf20Sopenharmony_ci faligndata %f16, %f18, %f2 6868c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(ldd, %o4+16, %f20), memcpy_retl_o2_plus_o5) 6878c2ecf20Sopenharmony_ci faligndata %f18, %f20, %f4 6888c2ecf20Sopenharmony_ci EX_ST_FP(STORE(std, %f0, %o0), memcpy_retl_o2_plus_o5) 6898c2ecf20Sopenharmony_ci subcc %o5, 64, %o5 6908c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(ldd, %o4+24, %f22), memcpy_retl_o2_plus_o5_plus_56) 6918c2ecf20Sopenharmony_ci faligndata %f20, %f22, %f6 6928c2ecf20Sopenharmony_ci EX_ST_FP(STORE(std, %f2, %o0+8), memcpy_retl_o2_plus_o5_plus_56) 6938c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(ldd, %o4+32, %f24), memcpy_retl_o2_plus_o5_plus_48) 6948c2ecf20Sopenharmony_ci faligndata %f22, %f24, %f8 6958c2ecf20Sopenharmony_ci EX_ST_FP(STORE(std, %f4, %o0+16), memcpy_retl_o2_plus_o5_plus_48) 6968c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(ldd, %o4+40, %f26), memcpy_retl_o2_plus_o5_plus_40) 6978c2ecf20Sopenharmony_ci faligndata %f24, %f26, %f10 6988c2ecf20Sopenharmony_ci EX_ST_FP(STORE(std, %f6, %o0+24), memcpy_retl_o2_plus_o5_plus_40) 6998c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(ldd, %o4+48, %f28), memcpy_retl_o2_plus_o5_plus_40) 7008c2ecf20Sopenharmony_ci faligndata %f26, %f28, %f12 7018c2ecf20Sopenharmony_ci EX_ST_FP(STORE(std, %f8, %o0+32), memcpy_retl_o2_plus_o5_plus_40) 7028c2ecf20Sopenharmony_ci add %o4, 64, %o4 7038c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(ldd, %o4-8, %f30), memcpy_retl_o2_plus_o5_plus_40) 7048c2ecf20Sopenharmony_ci faligndata %f28, %f30, %f14 7058c2ecf20Sopenharmony_ci EX_ST_FP(STORE(std, %f10, %o0+40), 
memcpy_retl_o2_plus_o5_plus_40) 7068c2ecf20Sopenharmony_ci EX_ST_FP(STORE(std, %f12, %o0+48), memcpy_retl_o2_plus_o5_plus_40) 7078c2ecf20Sopenharmony_ci add %o0, 64, %o0 7088c2ecf20Sopenharmony_ci EX_ST_FP(STORE(std, %f14, %o0-8), memcpy_retl_o2_plus_o5_plus_40) 7098c2ecf20Sopenharmony_ci fsrc2 %f30, %f14 7108c2ecf20Sopenharmony_ci bgu,pt %xcc, .Lunalign_sloop 7118c2ecf20Sopenharmony_ci prefetch [%o4 + (8 * BLOCK_SIZE)], 20 7128c2ecf20Sopenharmony_ci 7138c2ecf20Sopenharmony_ci.Lunalign_done: 7148c2ecf20Sopenharmony_ci ! Handle trailing bytes, 64 to 127 7158c2ecf20Sopenharmony_ci ! Dest long word aligned, Src not long word aligned 7168c2ecf20Sopenharmony_ci cmp %o2, 15 7178c2ecf20Sopenharmony_ci bleu %xcc, .Lunalign_short 7188c2ecf20Sopenharmony_ci 7198c2ecf20Sopenharmony_ci andn %o2, 0x7, %o5 ! %o5 is multiple of 8 7208c2ecf20Sopenharmony_ci and %o2, 0x7, %o2 ! residue bytes in %o2 7218c2ecf20Sopenharmony_ci add %o2, 8, %o2 7228c2ecf20Sopenharmony_ci sub %o5, 8, %o5 ! insure we do not load past end of src 7238c2ecf20Sopenharmony_ci andn %o1, 0x7, %o4 ! %o4 has long word aligned src address 7248c2ecf20Sopenharmony_ci add %o1, %o5, %o1 ! advance %o1 to after multiple of 8 7258c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)! 
fetch partialword 7268c2ecf20Sopenharmony_ci.Lunalign_by8: 7278c2ecf20Sopenharmony_ci EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5) 7288c2ecf20Sopenharmony_ci add %o4, 8, %o4 7298c2ecf20Sopenharmony_ci faligndata %f0, %f2, %f16 7308c2ecf20Sopenharmony_ci subcc %o5, 8, %o5 7318c2ecf20Sopenharmony_ci EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5) 7328c2ecf20Sopenharmony_ci fsrc2 %f2, %f0 7338c2ecf20Sopenharmony_ci bgu,pt %xcc, .Lunalign_by8 7348c2ecf20Sopenharmony_ci add %o0, 8, %o0 7358c2ecf20Sopenharmony_ci 7368c2ecf20Sopenharmony_ci.Lunalign_short: 7378c2ecf20Sopenharmony_ci#ifdef NON_USER_COPY 7388c2ecf20Sopenharmony_ci VISExitHalfFast 7398c2ecf20Sopenharmony_ci#else 7408c2ecf20Sopenharmony_ci VISExitHalf 7418c2ecf20Sopenharmony_ci#endif 7428c2ecf20Sopenharmony_ci ba .Lsmallrest 7438c2ecf20Sopenharmony_ci nop 7448c2ecf20Sopenharmony_ci 7458c2ecf20Sopenharmony_ci/* 7468c2ecf20Sopenharmony_ci * This is a special case of nested memcpy. This can happen when kernel 7478c2ecf20Sopenharmony_ci * calls unaligned memcpy back to back without saving FP registers. We need 7488c2ecf20Sopenharmony_ci * traps(context switch) to save/restore FP registers. If the kernel calls 7498c2ecf20Sopenharmony_ci * memcpy without this trap sequence we will hit FP corruption. Let's use 7508c2ecf20Sopenharmony_ci * the normal integer load/store method in this case. 
7518c2ecf20Sopenharmony_ci */ 7528c2ecf20Sopenharmony_ci 7538c2ecf20Sopenharmony_ci#ifdef NON_USER_COPY 7548c2ecf20Sopenharmony_ci.Lmedium_vis_entry_fail_cp: 7558c2ecf20Sopenharmony_ci or %o0, %o1, %g2 7568c2ecf20Sopenharmony_ci#endif 7578c2ecf20Sopenharmony_ci.Lmedium_cp: 7588c2ecf20Sopenharmony_ci LOAD(prefetch, %o1 + 0x40, #n_reads_strong) 7598c2ecf20Sopenharmony_ci andcc %g2, 0x7, %g0 7608c2ecf20Sopenharmony_ci bne,pn %xcc, .Lmedium_unaligned_cp 7618c2ecf20Sopenharmony_ci nop 7628c2ecf20Sopenharmony_ci 7638c2ecf20Sopenharmony_ci.Lmedium_noprefetch_cp: 7648c2ecf20Sopenharmony_ci andncc %o2, 0x20 - 1, %o5 7658c2ecf20Sopenharmony_ci be,pn %xcc, 2f 7668c2ecf20Sopenharmony_ci sub %o2, %o5, %o2 7678c2ecf20Sopenharmony_ci1: EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5) 7688c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1 + 0x08, %g2), memcpy_retl_o2_plus_o5) 7698c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1 + 0x10, %g7), memcpy_retl_o2_plus_o5) 7708c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1 + 0x18, %o4), memcpy_retl_o2_plus_o5) 7718c2ecf20Sopenharmony_ci add %o1, 0x20, %o1 7728c2ecf20Sopenharmony_ci subcc %o5, 0x20, %o5 7738c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o3, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_32) 7748c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %g2, %o0 + 0x08), memcpy_retl_o2_plus_o5_plus_24) 7758c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %g7, %o0 + 0x10), memcpy_retl_o2_plus_o5_plus_24) 7768c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o4, %o0 + 0x18), memcpy_retl_o2_plus_o5_plus_8) 7778c2ecf20Sopenharmony_ci bne,pt %xcc, 1b 7788c2ecf20Sopenharmony_ci add %o0, 0x20, %o0 7798c2ecf20Sopenharmony_ci2: andcc %o2, 0x18, %o5 7808c2ecf20Sopenharmony_ci be,pt %xcc, 3f 7818c2ecf20Sopenharmony_ci sub %o2, %o5, %o2 7828c2ecf20Sopenharmony_ci1: EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5) 7838c2ecf20Sopenharmony_ci add %o1, 0x08, %o1 7848c2ecf20Sopenharmony_ci add %o0, 0x08, %o0 7858c2ecf20Sopenharmony_ci subcc %o5, 0x08, %o5 7868c2ecf20Sopenharmony_ci bne,pt 
%xcc, 1b 7878c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %o3, %o0 - 0x08), memcpy_retl_o2_plus_o5_plus_8) 7888c2ecf20Sopenharmony_ci3: brz,pt %o2, .Lexit_cp 7898c2ecf20Sopenharmony_ci cmp %o2, 0x04 7908c2ecf20Sopenharmony_ci bl,pn %xcc, .Ltiny_cp 7918c2ecf20Sopenharmony_ci nop 7928c2ecf20Sopenharmony_ci EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2) 7938c2ecf20Sopenharmony_ci add %o1, 0x04, %o1 7948c2ecf20Sopenharmony_ci add %o0, 0x04, %o0 7958c2ecf20Sopenharmony_ci subcc %o2, 0x04, %o2 7968c2ecf20Sopenharmony_ci bne,pn %xcc, .Ltiny_cp 7978c2ecf20Sopenharmony_ci EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_4) 7988c2ecf20Sopenharmony_ci ba,a,pt %xcc, .Lexit_cp 7998c2ecf20Sopenharmony_ci 8008c2ecf20Sopenharmony_ci.Lmedium_unaligned_cp: 8018c2ecf20Sopenharmony_ci /* First get dest 8 byte aligned. */ 8028c2ecf20Sopenharmony_ci sub %g0, %o0, %o3 8038c2ecf20Sopenharmony_ci and %o3, 0x7, %o3 8048c2ecf20Sopenharmony_ci brz,pt %o3, 2f 8058c2ecf20Sopenharmony_ci sub %o2, %o3, %o2 8068c2ecf20Sopenharmony_ci 8078c2ecf20Sopenharmony_ci1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1) 8088c2ecf20Sopenharmony_ci add %o1, 1, %o1 8098c2ecf20Sopenharmony_ci subcc %o3, 1, %o3 8108c2ecf20Sopenharmony_ci add %o0, 1, %o0 8118c2ecf20Sopenharmony_ci bne,pt %xcc, 1b 8128c2ecf20Sopenharmony_ci EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1) 8138c2ecf20Sopenharmony_ci2: 8148c2ecf20Sopenharmony_ci and %o1, 0x7, %o3 8158c2ecf20Sopenharmony_ci brz,pn %o3, .Lmedium_noprefetch_cp 8168c2ecf20Sopenharmony_ci sll %o3, 3, %o3 8178c2ecf20Sopenharmony_ci mov 64, %g2 8188c2ecf20Sopenharmony_ci sub %g2, %o3, %g2 8198c2ecf20Sopenharmony_ci andn %o1, 0x7, %o1 8208c2ecf20Sopenharmony_ci EX_LD(LOAD(ldx, %o1 + 0x00, %o4), memcpy_retl_o2) 8218c2ecf20Sopenharmony_ci sllx %o4, %o3, %o4 8228c2ecf20Sopenharmony_ci andn %o2, 0x08 - 1, %o5 8238c2ecf20Sopenharmony_ci sub %o2, %o5, %o2 8248c2ecf20Sopenharmony_ci 8258c2ecf20Sopenharmony_ci1: EX_LD(LOAD(ldx, %o1 + 0x08, %g3), 
memcpy_retl_o2_plus_o5) 8268c2ecf20Sopenharmony_ci add %o1, 0x08, %o1 8278c2ecf20Sopenharmony_ci subcc %o5, 0x08, %o5 8288c2ecf20Sopenharmony_ci srlx %g3, %g2, %g7 8298c2ecf20Sopenharmony_ci or %g7, %o4, %g7 8308c2ecf20Sopenharmony_ci EX_ST(STORE(stx, %g7, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_8) 8318c2ecf20Sopenharmony_ci add %o0, 0x08, %o0 8328c2ecf20Sopenharmony_ci bne,pt %xcc, 1b 8338c2ecf20Sopenharmony_ci sllx %g3, %o3, %o4 8348c2ecf20Sopenharmony_ci srl %o3, 3, %o3 8358c2ecf20Sopenharmony_ci add %o1, %o3, %o1 8368c2ecf20Sopenharmony_ci brz,pn %o2, .Lexit_cp 8378c2ecf20Sopenharmony_ci nop 8388c2ecf20Sopenharmony_ci ba,pt %xcc, .Lsmall_unaligned_cp 8398c2ecf20Sopenharmony_ci 8408c2ecf20Sopenharmony_ci.Ltiny_cp: 8418c2ecf20Sopenharmony_ci EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2) 8428c2ecf20Sopenharmony_ci subcc %o2, 1, %o2 8438c2ecf20Sopenharmony_ci be,pn %xcc, .Lexit_cp 8448c2ecf20Sopenharmony_ci EX_ST(STORE(stb, %o3, %o0 + 0x00), memcpy_retl_o2_plus_1) 8458c2ecf20Sopenharmony_ci EX_LD(LOAD(ldub, %o1 + 0x01, %o3), memcpy_retl_o2) 8468c2ecf20Sopenharmony_ci subcc %o2, 1, %o2 8478c2ecf20Sopenharmony_ci be,pn %xcc, .Lexit_cp 8488c2ecf20Sopenharmony_ci EX_ST(STORE(stb, %o3, %o0 + 0x01), memcpy_retl_o2_plus_1) 8498c2ecf20Sopenharmony_ci EX_LD(LOAD(ldub, %o1 + 0x02, %o3), memcpy_retl_o2) 8508c2ecf20Sopenharmony_ci ba,pt %xcc, .Lexit_cp 8518c2ecf20Sopenharmony_ci EX_ST(STORE(stb, %o3, %o0 + 0x02), memcpy_retl_o2) 8528c2ecf20Sopenharmony_ci 8538c2ecf20Sopenharmony_ci.Lsmall_cp: 8548c2ecf20Sopenharmony_ci andcc %g2, 0x3, %g0 8558c2ecf20Sopenharmony_ci bne,pn %xcc, .Lsmall_unaligned_cp 8568c2ecf20Sopenharmony_ci andn %o2, 0x4 - 1, %o5 8578c2ecf20Sopenharmony_ci sub %o2, %o5, %o2 8588c2ecf20Sopenharmony_ci1: 8598c2ecf20Sopenharmony_ci EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5) 8608c2ecf20Sopenharmony_ci add %o1, 0x04, %o1 8618c2ecf20Sopenharmony_ci subcc %o5, 0x04, %o5 8628c2ecf20Sopenharmony_ci add %o0, 0x04, %o0 8638c2ecf20Sopenharmony_ci 
bne,pt %xcc, 1b 8648c2ecf20Sopenharmony_ci EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_o5_plus_4) 8658c2ecf20Sopenharmony_ci brz,pt %o2, .Lexit_cp 8668c2ecf20Sopenharmony_ci nop 8678c2ecf20Sopenharmony_ci ba,a,pt %xcc, .Ltiny_cp 8688c2ecf20Sopenharmony_ci 8698c2ecf20Sopenharmony_ci.Lsmall_unaligned_cp: 8708c2ecf20Sopenharmony_ci1: EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2) 8718c2ecf20Sopenharmony_ci add %o1, 1, %o1 8728c2ecf20Sopenharmony_ci add %o0, 1, %o0 8738c2ecf20Sopenharmony_ci subcc %o2, 1, %o2 8748c2ecf20Sopenharmony_ci bne,pt %xcc, 1b 8758c2ecf20Sopenharmony_ci EX_ST(STORE(stb, %o3, %o0 - 0x01), memcpy_retl_o2_plus_1) 8768c2ecf20Sopenharmony_ci ba,a,pt %xcc, .Lexit_cp 8778c2ecf20Sopenharmony_ci 8788c2ecf20Sopenharmony_ci.Lsmallrest: 8798c2ecf20Sopenharmony_ci tst %o2 8808c2ecf20Sopenharmony_ci bz,pt %xcc, .Lsmallx 8818c2ecf20Sopenharmony_ci cmp %o2, 4 8828c2ecf20Sopenharmony_ci blt,pn %xcc, .Lsmallleft3 8838c2ecf20Sopenharmony_ci nop 8848c2ecf20Sopenharmony_ci sub %o2, 3, %o2 8858c2ecf20Sopenharmony_ci.Lsmallnotalign4: 8868c2ecf20Sopenharmony_ci EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_3)! read byte 8878c2ecf20Sopenharmony_ci subcc %o2, 4, %o2 ! reduce count by 4 8888c2ecf20Sopenharmony_ci EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_7)! write byte & repeat 8898c2ecf20Sopenharmony_ci EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2_plus_6)! for total of 4 8908c2ecf20Sopenharmony_ci add %o1, 4, %o1 ! advance SRC by 4 8918c2ecf20Sopenharmony_ci EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_6) 8928c2ecf20Sopenharmony_ci EX_LD(LOAD(ldub, %o1-2, %o3), memcpy_retl_o2_plus_5) 8938c2ecf20Sopenharmony_ci add %o0, 4, %o0 ! advance DST by 4 8948c2ecf20Sopenharmony_ci EX_ST(STORE(stb, %o3, %o0-2), memcpy_retl_o2_plus_5) 8958c2ecf20Sopenharmony_ci EX_LD(LOAD(ldub, %o1-1, %o3), memcpy_retl_o2_plus_4) 8968c2ecf20Sopenharmony_ci bgu,pt %xcc, .Lsmallnotalign4 ! 
loop til 3 or fewer bytes remain 8978c2ecf20Sopenharmony_ci EX_ST(STORE(stb, %o3, %o0-1), memcpy_retl_o2_plus_4) 8988c2ecf20Sopenharmony_ci addcc %o2, 3, %o2 ! restore count 8998c2ecf20Sopenharmony_ci bz,pt %xcc, .Lsmallx 9008c2ecf20Sopenharmony_ci.Lsmallleft3: ! 1, 2, or 3 bytes remain 9018c2ecf20Sopenharmony_ci subcc %o2, 1, %o2 9028c2ecf20Sopenharmony_ci EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_1) ! load one byte 9038c2ecf20Sopenharmony_ci bz,pt %xcc, .Lsmallx 9048c2ecf20Sopenharmony_ci EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_1) ! store one byte 9058c2ecf20Sopenharmony_ci EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2) ! load second byte 9068c2ecf20Sopenharmony_ci subcc %o2, 1, %o2 9078c2ecf20Sopenharmony_ci bz,pt %xcc, .Lsmallx 9088c2ecf20Sopenharmony_ci EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_1)! store second byte 9098c2ecf20Sopenharmony_ci EX_LD(LOAD(ldub, %o1+2, %o3), memcpy_retl_o2) ! load third byte 9108c2ecf20Sopenharmony_ci EX_ST(STORE(stb, %o3, %o0+2), memcpy_retl_o2) ! store third byte 9118c2ecf20Sopenharmony_ci.Lsmallx: 9128c2ecf20Sopenharmony_ci retl 9138c2ecf20Sopenharmony_ci mov EX_RETVAL(%g1), %o0 9148c2ecf20Sopenharmony_ci.Lsmallfin: 9158c2ecf20Sopenharmony_ci tst %o2 9168c2ecf20Sopenharmony_ci bnz,pn %xcc, .Lsmallleft3 9178c2ecf20Sopenharmony_ci nop 9188c2ecf20Sopenharmony_ci retl 9198c2ecf20Sopenharmony_ci mov EX_RETVAL(%g1), %o0 ! restore %o0 9208c2ecf20Sopenharmony_ci.Lexit_cp: 9218c2ecf20Sopenharmony_ci retl 9228c2ecf20Sopenharmony_ci mov EX_RETVAL(%g1), %o0 9238c2ecf20Sopenharmony_ci .size FUNC_NAME, .-FUNC_NAME 924