/* SPDX-License-Identifier: GPL-2.0 */
/*
 *
 * Optimized version of the standard do_csum() function
 *
 * Return: a 64bit quantity containing the 16bit Internet checksum
 *
 * Inputs:
 *	in0: address of buffer to checksum (char *)
 *	in1: length of the buffer (int)
 *
 * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
 *	Stephane Eranian <eranian@hpl.hp.com>
 *
 * 02/04/22	Ken Chen <kenneth.w.chen@intel.com>
 *		Data locality study on the checksum buffer.
 *		More optimization cleanup - remove excessive stop bits.
 * 02/04/08	David Mosberger <davidm@hpl.hp.com>
 *		More cleanup and tuning.
 * 01/04/18	Jun Nakajima <jun.nakajima@intel.com>
 *		Clean up and optimize the software pipeline, loading two
 *		back-to-back 8-byte words per loop. Clean up the initialization
 *		for the loop. Support the cases where load latency = 1 or 2.
 *		Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
 */

#include <asm/asmmacro.h>

//
// Theory of operations:
//	The goal is to go as quickly as possible to the point where
//	we can checksum 16 bytes/loop. Before reaching that point we must
//	take care of incorrect alignment of first byte.
//
//	The code hereafter also takes care of the "tail" part of the buffer
//	before entering the core loop, if any. The checksum is a sum so it
//	allows us to commute operations. So we do the "head" and "tail"
//	first to finish at full speed in the body. Once we get the head and
//	tail values, we feed them into the pipeline, very handy initialization.
//
//	Of course we deal with the special case where the whole buffer fits
//	into one 8 byte word. In this case we have only one entry in the pipeline.
//
//	We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
//	possible load latency and also to accommodate for head and tail.
//
//	The end of the function deals with folding the checksum from 64bits
//	down to 16bits taking care of the carry.
//
//	This version avoids synchronization in the core loop by also using a
//	pipeline for the accumulation of the checksum in resultx[] (x=1,2).
//
//	 wordx[] (x=1,2)
//	|---|
//	|   | 0			: new value loaded in pipeline
//	|---|
//	|   | -			: in transit data
//	|---|
//	|   | LOAD_LATENCY	: current value to add to checksum
//	|---|
//	|   | LOAD_LATENCY+1	: previous value added to checksum
//	|---|			(previous iteration)
//
//	resultx[] (x=1,2)
//	|---|
//	|   | 0			: initial value
//	|---|
//	|   | LOAD_LATENCY-1	: new checksum
//	|---|
//	|   | LOAD_LATENCY	: previous value of checksum
//	|---|
//	|   | LOAD_LATENCY+1	: final checksum when out of the loop
//	|---|
//
//
//	See RFC1071 "Computing the Internet Checksum" for various techniques for
//	calculating the Internet checksum.
//
// NOT YET DONE:
//	- Maybe another algorithm which would take care of the folding at the
//	  end in a different manner
//	- Work with people more knowledgeable than me on the network stack
//	  to figure out if we could not split the function depending on the
//	  type of packet or alignment we get. Like the ip_fast_csum() routine
//	  where we know we have at least 20bytes worth of data to checksum.
//	- Do a better job of handling small packets.
//	- Note on prefetching: it was found that under various load, i.e. ftp read/write,
//	  nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8%
//	  on the data that buffer points to (partly because the checksum is often preceded by
//	  a copy_from_user()).  This finding indicates that lfetch will not be beneficial since
//	  the data is already in the cache.
//

//
// Register roles (aliases defined below):
//	saved_pfs, saved_lc, saved_pr	: copies of ar.pfs, ar.lc and pr taken on
//					  entry and written back before the return
//					  at the end of the function
//	hmask / tmask			: byte masks applied to the first / last word
//	first1				: 8-byte aligned address of the first word,
//					  then the post-incremented load pointer
//	first2				: second load pointer (first1+8) for the
//					  16-bytes-per-iteration loop
//	firstval / lastval		: contents of the first / last 8-byte word
//	firstoff / lastoff		: buf & 7  /  (buf+len) & 7
//	last				: 8-byte aligned address of the word holding
//					  the last byte
//	tmp1, tmp2, tmp3		: scratch (tmp3 is later renamed "count")
//	carry1 / carry2			: carry counters for result1[] / result2[]
//

#define saved_pfs	r11
#define hmask		r16
#define tmask		r17
#define first1		r18
#define firstval	r19
#define firstoff	r20
#define last		r21
#define lastval		r22
#define lastoff		r23
#define saved_lc	r24
#define saved_pr	r25
#define tmp1		r26
#define tmp2		r27
#define tmp3		r28
#define carry1		r29
#define carry2		r30
#define first2		r31

#define buf		in0
#define len		in1

#define LOAD_LATENCY	2	// XXX fix me

#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
# error "Only 1 or 2 is supported/tested for LOAD_LATENCY."
#endif

#define PIPE_DEPTH			(LOAD_LATENCY+2)
#define ELD	p[LOAD_LATENCY]		// end of load
#define ELD_1	p[LOAD_LATENCY+1]	// and next stage

//
// unsigned long do_csum(unsigned char *buf,long len)
//
// Returns 0 right away when len is zero or negative. Otherwise the buffer is
// summed as 8-byte words (head and tail words masked to the bytes that belong
// to the buffer) with the carry added back in, the 64-bit sum is folded down
// as described at .do_csum_exit, and if buf is an odd address (p15) the two
// bytes of the folded value are swapped before returning it in ret0.
//
// The instruction order and the stop bits (";;") below are significant on
// IA-64; do not reorder instructions when editing comments.
//

GLOBAL_ENTRY(do_csum)
	.prologue
	.save ar.pfs, saved_pfs
	alloc saved_pfs=ar.pfs,2,16,0,16
	.rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
	.rotp p[PIPE_DEPTH], pC1[2], pC2[2]
	mov ret0=r0		// in case we have zero length
	cmp.lt p0,p6=r0,len	// check for zero length or negative (32bit len)
	;;
	add tmp1=buf,len	// last byte's address
	.save pr, saved_pr
	mov saved_pr=pr		// preserve predicates (rotation)
(p6)	br.ret.spnt.many rp	// return if zero or negative length

	mov hmask=-1		// initialize head mask
	tbit.nz p15,p0=buf,0	// is buf an odd address?
	and first1=-8,buf	// 8-byte align down address of first1 element

	and firstoff=7,buf	// how many bytes off for first1 element
	mov tmask=-1		// initialize tail mask

	;;
	adds tmp2=-1,tmp1	// last-1
	and lastoff=7,tmp1	// how many bytes off for last element
	;;
	sub tmp1=8,lastoff	// complement to lastoff
	and last=-8,tmp2	// address of word containing last byte
	;;
	sub tmp3=last,first1	// tmp3=distance from first1 to last
	.save ar.lc, saved_lc
	mov saved_lc=ar.lc	// save lc
	cmp.eq p8,p9=last,first1	// everything fits in one word ?

	ld8 firstval=[first1],8	// load, ahead of time, "first1" word
	and tmp1=7, tmp1	// make sure that if tmp1==8 -> tmp1=0
	shl tmp2=firstoff,3	// number of bits
	;;
(p9)	ld8 lastval=[last]	// load, ahead of time, "last" word, if needed
	shl tmp1=tmp1,3		// number of bits
(p9)	adds tmp3=-8,tmp3	// effectively loaded
	;;
(p8)	mov lastval=r0		// we don't need lastval if first1==last
	shl hmask=hmask,tmp2	// build head mask, mask off [0,first1off[
	shr.u tmask=tmask,tmp1	// build tail mask, mask off ]8,lastoff]
	;;
	.body
#define count tmp3

(p8)	and hmask=hmask,tmask	// apply tail mask to head mask if 1 word only
(p9)	and word2[0]=lastval,tmask	// mask last word as appropriate
	shr.u count=count,3	// how many 8-byte?
	;;
	// If count is odd, finish this 8-byte word so that we can
	// load two back-to-back 8-byte words per loop thereafter.
	and word1[0]=firstval,hmask	// and mask it as appropriate
	tbit.nz p10,p11=count,0		// if (count is odd)
	;;
(p8)	mov result1[0]=word1[0]
(p9)	add result1[0]=word1[0],word2[0]
	;;
	cmp.ltu p6,p0=result1[0],word1[0]	// check the carry
	cmp.eq.or.andcm p8,p0=0,count		// exit if zero 8-byte
	;;
(p6)	adds result1[0]=1,result1[0]
(p8)	br.cond.dptk .do_csum_exit	// if (within an 8-byte word)
(p11)	br.cond.dptk .do_csum16		// if (count is even)

	// Here count is odd.
	ld8 word1[1]=[first1],8		// load an 8-byte word
	cmp.eq p9,p10=1,count		// if (count == 1)
	adds count=-1,count		// loaded an 8-byte word
	;;
	add result1[0]=result1[0],word1[1]
	;;
	cmp.ltu p6,p0=result1[0],word1[1]
	;;
(p6)	adds result1[0]=1,result1[0]
(p9)	br.cond.sptk .do_csum_exit	// if (count == 1) exit
	// Fall through to calculate the checksum, feeding result1[0] as
	// the initial value in result1[0].
	//
	// Calculate the checksum loading two 8-byte words per loop.
	//
.do_csum16:
	add first2=8,first1
	shr.u count=count,1	// we do 16 bytes per loop
	;;
	adds count=-1,count
	mov carry1=r0
	mov carry2=r0
	brp.loop.imp 1f,2f
	;;
	mov ar.ec=PIPE_DEPTH
	mov ar.lc=count	// set lc
	mov pr.rot=1<<16
	// result1[0] must be initialized in advance.
	mov result2[0]=r0
	;;
	.align 32
1:
(ELD_1)	cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
(pC1[1])adds carry1=1,carry1
(ELD_1)	cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
(pC2[1])adds carry2=1,carry2
(ELD)	add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
(ELD)	add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
2:
(p[0])	ld8 word1[0]=[first1],16
(p[0])	ld8 word2[0]=[first2],16
	br.ctop.sptk 1b
	;;
	// Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
(pC1[1])adds carry1=1,carry1	// since we miss the last one
(pC2[1])adds carry2=1,carry2
	;;
	add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1
	add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
	;;
	cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1
	cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2
	;;
(p6)	adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1]
(p7)	adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1]
	;;
	add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1]
	;;
	cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]
	;;
(p6)	adds result1[0]=1,result1[0]
	;;
.do_csum_exit:
	//
	// now fold 64 into 16 bits taking care of carry
	// that's not very good because it has lots of sequentiality
	//
	mov tmp3=0xffff
	zxt4 tmp1=result1[0]
	shr.u tmp2=result1[0],32
	;;
	add result1[0]=tmp1,tmp2
	;;
	and tmp1=result1[0],tmp3
	shr.u tmp2=result1[0],16
	;;
	add result1[0]=tmp1,tmp2
	;;
	and tmp1=result1[0],tmp3
	shr.u tmp2=result1[0],16
	;;
	add result1[0]=tmp1,tmp2
	;;
	and tmp1=result1[0],tmp3
	shr.u tmp2=result1[0],16
	;;
	add ret0=tmp1,tmp2
	mov pr=saved_pr,0xffffffffffff0000
	;;
	// if buf was odd then swap bytes
	mov ar.pfs=saved_pfs		// restore ar.ec
(p15)	mux1 ret0=ret0,@rev		// reverse word
	;;
	mov ar.lc=saved_lc
(p15)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
	br.ret.sptk.many rp

//	I (Jun Nakajima) wrote an equivalent code (see below), but it was
//	not much better than the original. So keep the original there so that
//	someone else can challenge.
//
//	shr.u word1[0]=result1[0],32
//	zxt4 result1[0]=result1[0]
//	;;
//	add result1[0]=result1[0],word1[0]
//	;;
//	zxt2 result2[0]=result1[0]
//	extr.u word1[0]=result1[0],16,16
//	shr.u carry1=result1[0],32
//	;;
//	add result2[0]=result2[0],word1[0]
//	;;
//	add result2[0]=result2[0],carry1
//	;;
//	extr.u ret0=result2[0],16,16
//	;;
//	add ret0=ret0,result2[0]
//	;;
//	zxt2 ret0=ret0
//	mov ar.pfs=saved_pfs		 // restore ar.ec
//	mov pr=saved_pr,0xffffffffffff0000
//	;;
//	// if buf was odd then swap bytes
//	mov ar.lc=saved_lc
//(p15)	mux1 ret0=ret0,@rev		// reverse word
//	;;
//(p15)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
//	br.ret.sptk.many rp

END(do_csum)