/* SPDX-License-Identifier: GPL-2.0 */
/*
 *
 * Optimized version of the standard do_csum() function
 *
 * Return: a 64bit quantity containing the 16bit Internet checksum
 *
 * Inputs:
 *	in0: address of buffer to checksum (char *)
 *	in1: length of the buffer (int)
 *
 * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
 *	Stephane Eranian <eranian@hpl.hp.com>
 *
 * 02/04/22	Ken Chen <kenneth.w.chen@intel.com>
 *		Data locality study on the checksum buffer.
 *		More optimization cleanup - remove excessive stop bits.
 * 02/04/08	David Mosberger <davidm@hpl.hp.com>
 *		More cleanup and tuning.
 * 01/04/18	Jun Nakajima <jun.nakajima@intel.com>
 *		Clean up and optimize the software pipeline, loading two
 *		back-to-back 8-byte words per loop. Clean up the initialization
 *		for the loop. Support the cases where load latency = 1 or 2.
 *		Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
 */

#include <asm/asmmacro.h>

//
// Theory of operations:
//	The goal is to go as quickly as possible to the point where
//	we can checksum 16 bytes/loop. Before reaching that point we must
//	take care of incorrect alignment of first byte.
//
//	The code hereafter also takes care of the "tail" part of the buffer
//	before entering the core loop, if any. The checksum is a sum so it
//	allows us to commute operations. So we do the "head" and "tail"
//	first to finish at full speed in the body. Once we get the head and
//	tail values, we feed them into the pipeline, very handy initialization.
//
//	Of course we deal with the special case where the whole buffer fits
//	into one 8 byte word. In this case we have only one entry in the pipeline.
//
//	We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
//	possible load latency and also to accommodate for head and tail.
//
//	The end of the function deals with folding the checksum from 64bits
//	down to 16bits taking care of the carry.
//
//	This version avoids synchronization in the core loop by also using a
//	pipeline for the accumulation of the checksum in resultx[] (x=1,2).
//
//	 wordx[] (x=1,2)
//	|---|
//      |   | 0			: new value loaded in pipeline
//	|---|
//      |   | -			: in transit data
//	|---|
//      |   | LOAD_LATENCY	: current value to add to checksum
//	|---|
//      |   | LOAD_LATENCY+1	: previous value added to checksum
//      |---|			(previous iteration)
//
//	resultx[] (x=1,2)
//	|---|
//      |   | 0			: initial value
//	|---|
//      |   | LOAD_LATENCY-1	: new checksum
//	|---|
//      |   | LOAD_LATENCY	: previous value of checksum
//	|---|
//      |   | LOAD_LATENCY+1	: final checksum when out of the loop
//      |---|
//
//
//	See RFC1071 "Computing the Internet Checksum" for various techniques for
//	calculating the Internet checksum.
//
// NOT YET DONE:
//	- Maybe another algorithm which would take care of the folding at the
//	  end in a different manner
//	- Work with people more knowledgeable than me on the network stack
//	  to figure out if we could not split the function depending on the
//	  type of packet or alignment we get. Like the ip_fast_csum() routine
//	  where we know we have at least 20bytes worth of data to checksum.
//	- Do a better job of handling small packets.
//	- Note on prefetching: it was found that under various load, i.e. ftp read/write,
//	  nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8%
//	  on the data that buffer points to (partly because the checksum is often preceded by
//	  a copy_from_user()).  This finding indicates that lfetch will not be beneficial since
//	  the data is already in the cache.
//

#define saved_pfs	r11
#define hmask		r16
#define tmask		r17
#define first1		r18
#define firstval	r19
#define firstoff	r20
#define last		r21
#define lastval		r22
#define lastoff		r23
#define saved_lc	r24
#define saved_pr	r25
#define tmp1		r26
#define tmp2		r27
#define tmp3		r28
#define carry1		r29
#define carry2		r30
#define first2		r31

#define buf		in0
#define len		in1

#define LOAD_LATENCY	2	// XXX fix me

#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
# error "Only 1 or 2 is supported/tested for LOAD_LATENCY."
#endif

#define PIPE_DEPTH			(LOAD_LATENCY+2)
#define ELD	p[LOAD_LATENCY]		// end of load
#define ELD_1	p[LOAD_LATENCY+1]	// and next stage

// unsigned long do_csum(unsigned char *buf,long len)
//
// In:	in0 (buf) = address of the first byte, any alignment
//	in1 (len) = number of bytes to sum
// Out:	ret0 = 64-bit sum folded down to 16 bits; 0 when len <= 0
//
// The buffer is read in whole, 8-byte aligned words: the word holding the
// first byte and the word holding the last byte are loaded in full and the
// bytes outside [buf, buf+len) are masked off with hmask/tmask.
//
// p15 remembers whether buf is odd; in that case the two bytes of the
// folded result are swapped just before returning.
//
// ar.pfs, ar.lc and the predicate registers are saved on entry
// (saved_pfs, saved_lc, saved_pr). ar.pfs, ar.lc and the rotating
// predicates p16-p63 are restored on the common exit path.

GLOBAL_ENTRY(do_csum)
	.prologue
	.save ar.pfs, saved_pfs
	alloc saved_pfs=ar.pfs,2,16,0,16	// 2 in, 16 local, 0 out, 16 rotating
	.rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
	.rotp p[PIPE_DEPTH], pC1[2], pC2[2]
	mov ret0=r0		// in case we have zero length
	cmp.lt p0,p6=r0,len	// check for zero length or negative (32bit len)
	;;
	add tmp1=buf,len	// address one past the last byte
	.save pr, saved_pr
	mov saved_pr=pr		// preserve predicates (rotation)
(p6)	br.ret.spnt.many rp	// return if zero or negative length

	mov hmask=-1		// initialize head mask
	tbit.nz p15,p0=buf,0	// is buf an odd address?
	and first1=-8,buf	// 8-byte align down address of first1 element

	and firstoff=7,buf	// how many bytes off for first1 element
	mov tmask=-1		// initialize tail mask

	;;
	adds tmp2=-1,tmp1	// address of the last byte
	and lastoff=7,tmp1	// how many bytes off for last element
	;;
	sub tmp1=8,lastoff	// complement to lastoff
	and last=-8,tmp2	// address of word containing last byte
	;;
	sub tmp3=last,first1	// tmp3=distance from first1 to last
	.save ar.lc, saved_lc
	mov saved_lc=ar.lc	// save lc
	cmp.eq p8,p9=last,first1	// everything fits in one word ?

	ld8 firstval=[first1],8	// load, ahead of time, "first1" word
	and tmp1=7, tmp1	// make sure that if tmp1==8 -> tmp1=0
	shl tmp2=firstoff,3	// number of bits
	;;
(p9)	ld8 lastval=[last]	// load, ahead of time, "last" word, if needed
	shl tmp1=tmp1,3		// number of bits
(p9)	adds tmp3=-8,tmp3	// effectively loaded
	;;
(p8)	mov lastval=r0		// we don't need lastval if first1==last
	shl hmask=hmask,tmp2	// build head mask, mask off [0,firstoff[
	shr.u tmask=tmask,tmp1	// build tail mask, mask off ]8,lastoff]
	;;
	.body
#define count tmp3

(p8)	and hmask=hmask,tmask	// apply tail mask to head mask if 1 word only
(p9)	and word2[0]=lastval,tmask	// mask last word as appropriate
	shr.u count=count,3	// how many 8-byte?
	;;
	// If count is odd, finish this 8-byte word so that we can
	// load two back-to-back 8-byte words per loop thereafter.
	and word1[0]=firstval,hmask	// and mask it as appropriate
	tbit.nz p10,p11=count,0		// if (count is odd)
	;;
(p8)	mov result1[0]=word1[0]
(p9)	add result1[0]=word1[0],word2[0]
	;;
	cmp.ltu p6,p0=result1[0],word1[0]	// check the carry
	cmp.eq.or.andcm p8,p0=0,count		// exit if zero 8-byte
	;;
(p6)	adds result1[0]=1,result1[0]
(p8)	br.cond.dptk .do_csum_exit	// if (within an 8-byte word)
(p11)	br.cond.dptk .do_csum16		// if (count is even)

	// Here count is odd.
	ld8 word1[1]=[first1],8		// load an 8-byte word
	cmp.eq p9,p10=1,count		// if (count == 1)
	adds count=-1,count		// loaded an 8-byte word
	;;
	add result1[0]=result1[0],word1[1]
	;;
	cmp.ltu p6,p0=result1[0],word1[1]
	;;
(p6)	adds result1[0]=1,result1[0]
(p9)	br.cond.sptk .do_csum_exit	// if (count == 1) exit
	// Fall through to calculate the checksum, feeding result1[0] as
	// the initial value in result1[0].
	//
	// Calculate the checksum loading two 8-byte words per loop.
	//
.do_csum16:
	add first2=8,first1
	shr.u count=count,1	// we do 16 bytes per loop
	;;
	adds count=-1,count
	mov carry1=r0
	mov carry2=r0
	brp.loop.imp 1f,2f
	;;
	mov ar.ec=PIPE_DEPTH
	mov ar.lc=count	// set lc
	mov pr.rot=1<<16	// only p16 (= p[0]) set: start with the load stage
	// result1[0] must be initialized in advance.
	mov result2[0]=r0
	;;
	.align 32
1:
(ELD_1)	cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
(pC1[1])adds carry1=1,carry1
(ELD_1)	cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
(pC2[1])adds carry2=1,carry2
(ELD)	add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
(ELD)	add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
2:
(p[0])	ld8 word1[0]=[first1],16
(p[0])	ld8 word2[0]=[first2],16
	br.ctop.sptk 1b
	;;
	// Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
(pC1[1])adds carry1=1,carry1	// since we miss the last one
(pC2[1])adds carry2=1,carry2
	;;
	add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1
	add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
	;;
	cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1
	cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2
	;;
(p6)	adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1]
(p7)	adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1]
	;;
	add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1]
	;;
	cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]
	;;
(p6)	adds result1[0]=1,result1[0]
	;;
.do_csum_exit:
	//
	// now fold 64 into 16 bits taking care of carry
	// that's not very good because it has lots of sequentiality
	//
	mov tmp3=0xffff
	zxt4 tmp1=result1[0]
	shr.u tmp2=result1[0],32
	;;
	add result1[0]=tmp1,tmp2
	;;
	and tmp1=result1[0],tmp3
	shr.u tmp2=result1[0],16
	;;
	add result1[0]=tmp1,tmp2
	;;
	and tmp1=result1[0],tmp3
	shr.u tmp2=result1[0],16
	;;
	add result1[0]=tmp1,tmp2
	;;
	and tmp1=result1[0],tmp3
	shr.u tmp2=result1[0],16
	;;
	add ret0=tmp1,tmp2
	mov pr=saved_pr,0xffffffffffff0000	// restore p16-p63 only; p15 stays live
	;;
	// if buf was odd then swap bytes
	mov ar.pfs=saved_pfs		// restore ar.ec
(p15)	mux1 ret0=ret0,@rev		// reverse word
	;;
	mov ar.lc=saved_lc
(p15)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
	br.ret.sptk.many rp

//	I (Jun Nakajima) wrote an equivalent code (see below), but it was
//	not much better than the original. So keep the original there so that
//	someone else can challenge.
//
//	shr.u word1[0]=result1[0],32
//	zxt4 result1[0]=result1[0]
//	;;
//	add result1[0]=result1[0],word1[0]
//	;;
//	zxt2 result2[0]=result1[0]
//	extr.u word1[0]=result1[0],16,16
//	shr.u carry1=result1[0],32
//	;;
//	add result2[0]=result2[0],word1[0]
//	;;
//	add result2[0]=result2[0],carry1
//	;;
//	extr.u ret0=result2[0],16,16
//	;;
//	add ret0=ret0,result2[0]
//	;;
//	zxt2 ret0=ret0
//	mov ar.pfs=saved_pfs		 // restore ar.ec
//	mov pr=saved_pr,0xffffffffffff0000
//	;;
//	// if buf was odd then swap bytes
//	mov ar.lc=saved_lc
//(p15)	mux1 ret0=ret0,@rev		// reverse word
//	;;
//(p15)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
//	br.ret.sptk.many rp

END(do_csum)