/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-csum_ipv6_magic.S
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * unsigned short csum_ipv6_magic(struct in6_addr *saddr,
 *                                struct in6_addr *daddr,
 *                                __u32 len,
 *                                unsigned short proto,
 *                                unsigned int csum);
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 * Determining actual stalls (other than slotting) doesn't appear to be easy
 * to do.
 *
 * Register usage in the code below:
 *	$16 = saddr, $17 = daddr, $18 = len, $19 = proto, $20 = csum (inputs)
 *	$0  = result (folded and complemented 16-bit checksum)
 *	$0-$7, $18-$20, $22 and $23 are overwritten as scratch.
 *	No stack frame is used (.frame size 0) and memory is only read.
 *
 * Swap <proto> (takes form 0xaabb)
 * Then shift it left by 48, so result is:
 *	0xbbaa0000 00000000
 * Then turn it back into a sign extended 32-bit item
 *	0xbbaa0000
 * (The code below builds that 32-bit value with two shifts, an addl and a
 * zap; see the per-instruction comments.)
 *
 * Swap <len> (an unsigned int) using Mike Burrows' 7-instruction sequence
 * (we can't hide the 3-cycle latency of the unpkbw in the 6-instruction
 * sequence)
 * Assume input takes form 0xAABBCCDD
 *
 * The 64-bit words are accumulated in $20.  After every addq a cmpult
 * against the value just added records the carry out of that add; the
 * carries are summed separately and merged back in before folding.
 *
 * Finally, original 'folding' approach is to split the long into 4 unsigned
 * shorts
 *	add 4 ushorts, resulting in ushort/carry
 *	add carry bits + ushort --> ushort
 *	add carry bits + ushort --> ushort (in case the carry results in an
 *	overflow)
 *	Truncate to a ushort.  (took 13 instructions)
 * From doing some testing, using the approach in checksum.c:from64to16()
 * results in the same outcome:
 *	split into 2 uints, add those, generating a ulong
 *	add the 3 low ushorts together, generating a uint
 *	a final add of the 2 lower ushorts
 *	truncating the result.
 *
 * Misalignment handling added by Ivan Kokshaysky <ink@jurassic.park.msu.ru>
 * The cost is 16 instructions (~8 cycles), including two extra loads which
 * may cause additional delay in rare cases (load-load replay traps).
 *
 * Each address is fetched with three ldq_u loads (offsets 0, 8 and 15) and
 * the two 64-bit words are assembled with extql/extqh pairs.  For the first
 * word the extqh result is forced to zero with cmoveq when the pointer is
 * 8-byte aligned.  The second word needs no such fix-up: with an aligned
 * pointer the ldq_u at offset 15 reads the same quadword as the one at
 * offset 8, so or-ing the two extracted parts leaves the word unchanged.
 */

#include <linux/export.h>
	.globl csum_ipv6_magic
	.align 4
	.ent csum_ipv6_magic
	.frame $30,0,$26,0
csum_ipv6_magic:
	.prologue 0

	/*
	 * Instructions are grouped four at a time; the "U L U L"-style
	 * annotation on the last instruction of a group gives the intended
	 * subcluster slotting for that group.
	 */

	/* Start the unaligned loads early and swap len/proto meanwhile. */
	ldq_u	$0,0($16)	# L : Latency: 3
	inslh	$18,7,$4	# U : 0000000000AABBCC
	ldq_u	$1,8($16)	# L : Latency: 3
	sll	$19,8,$7	# U : U L U L : 0x00000000 00aabb00

	and	$16,7,$6	# E : src misalignment
	ldq_u	$5,15($16)	# L : Latency: 3
	zapnot	$20,15,$20	# U : zero extend incoming csum
	ldq_u	$2,0($17)	# L : U L U L : Latency: 3

	extql	$0,$6,$0	# U : low part of 1st src word
	extqh	$1,$6,$22	# U : high part of 1st src word
	ldq_u	$3,8($17)	# L : Latency: 3
	sll	$19,24,$19	# U : U U L U : 0x000000aa bb000000

	cmoveq	$6,$31,$22	# E : src aligned? then no high part
	ldq_u	$23,15($17)	# L : Latency: 3
	inswl	$18,3,$18	# U : 000000CCDD000000
	addl	$19,$7,$19	# E : U L U L : <sign bits>bbaabb00

	or	$0,$22,$0	# E : 1st src word complete
	extql	$1,$6,$1	# U : low part of 2nd src word
	or	$18,$4,$18	# E : 000000CCDDAABBCC
	extqh	$5,$6,$5	# U : L U L U : high part of 2nd src word

	and	$17,7,$6	# E : dst misalignment
	extql	$2,$6,$2	# U : low part of 1st dst word
	or	$1,$5,$1	# E : 2nd src word complete
	extqh	$3,$6,$22	# U : L U L U : high part of 1st dst word

	cmoveq	$6,$31,$22	# E : dst aligned? then no high part
	extql	$3,$6,$3	# U : low part of 2nd dst word
	addq	$20,$0,$20	# E : begin summing the words
	extqh	$23,$6,$23	# U : L U L U : high part of 2nd dst word

	srl	$18,16,$4	# U : 0000000000CCDDAA
	or	$2,$22,$2	# E : 1st dst word complete
	zap	$19,0x3,$19	# U : <sign bits>bbaa0000
	or	$3,$23,$3	# E : U L U L : 2nd dst word complete

	/* Each cmpult leaves the carry of the preceding addq in the addend's register. */
	cmpult	$20,$0,$0	# E : carry from 1st src word
	addq	$20,$1,$20	# E : add 2nd src word
	zapnot	$18,0xa,$18	# U : 00000000DD00BB00
	zap	$4,0xa,$4	# U : U U L L : 0000000000CC00AA

	or	$18,$4,$18	# E : 00000000DDCCBBAA
	nop			# E :
	cmpult	$20,$1,$1	# E : carry from 2nd src word
	addq	$20,$2,$20	# E : U L U L : add 1st dst word

	cmpult	$20,$2,$2	# E : carry from 1st dst word
	addq	$20,$3,$20	# E : add 2nd dst word
	cmpult	$20,$3,$3	# E : (1 cycle stall on $20)
	addq	$20,$18,$20	# E : U L U L (1 cycle stall on $20)

	cmpult	$20,$18,$18	# E : carry from swapped len
	addq	$20,$19,$20	# E : (1 cycle stall on $20)
	addq	$0,$1,$0	# E : merge the carries back into the csum
	addq	$2,$3,$2	# E :

	cmpult	$20,$19,$19	# E : carry from swapped proto
	addq	$18,$19,$18	# E : (1 cycle stall on $19)
	addq	$0,$2,$0	# E :
	addq	$20,$18,$20	# E : U L U L :
	/* (1 cycle stall on $18, 2 cycles on $20) */

	addq	$0,$20,$0	# E : $0 = sum plus all carries
	zapnot	$0,15,$1	# U : Start folding output (1 cycle stall on $0)
	nop			# E :
	srl	$0,32,$0	# U : U L U L : (1 cycle stall on $0)

	addq	$1,$0,$1	# E : Finished generating ulong
	extwl	$1,2,$2		# U : ushort[1] (1 cycle stall on $1)
	zapnot	$1,3,$0		# U : ushort[0] (1 cycle stall on $1)
	extwl	$1,4,$1		# U : ushort[2] (1 cycle stall on $1)

	addq	$0,$2,$0	# E
	addq	$0,$1,$3	# E : Finished generating uint
	/* (1 cycle stall on $0) */
	extwl	$3,2,$1		# U : ushort[1] (1 cycle stall on $3)
	nop			# E : L U L U

	addq	$1,$3,$0	# E : Final carry
	not	$0,$4		# E : complement (1 cycle stall on $0)
	zapnot	$4,3,$0		# U : clear upper garbage bits
	/* (1 cycle stall on $4) */
	ret			# L0 : L U L U

	.end csum_ipv6_magic
	EXPORT_SYMBOL(csum_ipv6_magic)