/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-divide.S
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Alpha division..
 */

/*
 * The alpha chip doesn't provide hardware division, so we have to do it
 * by hand.  The compiler expects the functions
 *
 *	__divqu: 64-bit unsigned long divide
 *	__remqu: 64-bit unsigned long remainder
 *	__divqs/__remqs: signed 64-bit
 *	__divlu/__remlu: unsigned 32-bit
 *	__divls/__remls: signed 32-bit
 *
 * Note: the func() macro below emits the signed entry points as
 * __divq/__remq (64-bit) and __divl/__reml (32-bit), i.e. without the
 * trailing "s" used in the list above.
 *
 * These are not normal C functions: instead of the normal
 * calling sequence, these expect their arguments in registers
 * $24 and $25, and return the result in $27. Register $28 may
 * be clobbered (assembly temporary), anything else must be saved.
 *
 * In short: painful.
 *
 * This is a rather simple bit-at-a-time algorithm: it's very good
 * at dividing random 64-bit numbers, but the more usual case where
 * the divisor is small is handled better by the DEC algorithm
 * using lookup tables. This uses much less memory, though, and is
 * nicer on the cache.. Besides, I don't know the copyright status
 * of the DEC code.
 */

/*
 * My temporaries:
 *	$0 - current bit
 *	$1 - shifted divisor
 *	$2 - modulus/quotient
 *	$3 - tmp1: candidate modulus (modulus - divisor)
 *	$4 - tmp2: candidate quotient (quotient + mask), DIV builds only
 *
 *	$23 - return address
 *	$24 - dividend
 *	$25 - divisor
 *
 *	$27 - quotient/modulus
 *	$28 - compare status
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 */

#include <linux/export.h>
/* Not referenced anywhere below; kept as in the original source. */
#define halt .long 0

/*
 * Select function type and registers
 *
 * Build-time selectors (tested with #ifdef below):
 *	DIV     - defined: build the divide routines, result is the quotient
 *	          undefined: build the remainder routines, result is the modulus
 *	INTSIZE - defined: build the 32-bit variants
 *	          undefined: build the 64-bit variants
 *
 * Whichever of quotient/modulus is the result lives in $27; the other
 * one lives in $2, which is saved and restored around the computation.
 */
#define mask	$0
#define divisor	$1
#define compare $28
#define tmp1	$3
#define tmp2	$4

#ifdef DIV
#define DIV_ONLY(x,y...) x,##y
#define MOD_ONLY(x,y...)
#define func(x) __div##x
#define modulus $2
#define quotient $27
/* Sign of a quotient: negative iff the operand signs differ. */
#define GETSIGN(x) xor $24,$25,x
/* Five 8-byte save slots are used (0..32); the frame is 48 bytes. */
#define STACK 48
#else
#define DIV_ONLY(x,y...)
#define MOD_ONLY(x,y...) x,##y
#define func(x) __rem##x
#define modulus $27
#define quotient $2
/* Sign of a remainder: follows the sign of the dividend. */
#define GETSIGN(x) bis $24,$24,x
/* Four 8-byte save slots are used (0..24). */
#define STACK 32
#endif

/*
 * For 32-bit operations, we need to extend to 64-bit
 *
 * LONGIFY  keeps the low four bytes and clears the rest (zapnot ...,15).
 * SLONGIFY sign-extends the low 32 bits (addl x,0,x).
 * Both expand to nothing in the 64-bit build.
 */
#ifdef INTSIZE
#define ufunction func(lu)
#define sfunction func(l)
#define LONGIFY(x) zapnot x,15,x
#define SLONGIFY(x) addl x,0,x
#else
#define ufunction func(qu)
#define sfunction func(q)
#define LONGIFY(x)
#define SLONGIFY(x)
#endif

/*
 * ufunction: unsigned divide (DIV) or remainder (!DIV).
 *
 * In:	$24 = dividend, $25 = divisor, $23 = return address
 * Out:	$27 = quotient (DIV) or remainder (!DIV)
 *
 * $0, $1, $2, $3 (and $4 in DIV builds) are saved on entry and restored
 * on exit; $28 is used as scratch and not restored.
 *
 * Stack frame (STACK bytes at $30):
 *	 0($30)	saved $1
 *	 8($30)	saved $2
 *	16($30)	saved $0
 *	24($30)	saved tmp1 ($3)
 *	32($30)	saved tmp2 ($4), DIV builds only
 *
 * Division by zero does not trap: the routine skips both loops and
 * returns with the initial values, i.e. 0 for a divide and the
 * (LONGIFY'd) dividend for a remainder.
 */
.set noat
.align	4
.globl	ufunction
.ent	ufunction
ufunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0

	/*
	 * Label 7 is also the entry point used by sfunction (bge ...,7b)
	 * when both operands are non-negative; sfunction has already
	 * allocated an identical STACK-sized frame at that point.
	 *
	 * Each register is stored before the instruction that overwrites it.
	 */
7:	stq	$1, 0($30)		# L :
	bis	$25,$25,divisor		# E :
	stq	$2, 8($30)		# L : L U L U

	bis	$24,$24,modulus		# E :
	stq	$0,16($30)		# L :
	bis	$31,$31,quotient	# E :
	LONGIFY(divisor)		# E : U L L U

	stq	tmp1,24($30)		# L :
	LONGIFY(modulus)		# E :
	bis	$31,1,mask		# E :
	DIV_ONLY(stq tmp2,32($30))	# L : L U U L

	beq	divisor, 9f			/* div by zero */
	/*
	 * In spite of the DIV_ONLY being either a non-instruction
	 * or an actual stq, the addition of the .align directive
	 * below ensures that label 1 is going to be nicely aligned
	 */

	.align 4
#ifdef INTSIZE
	/*
	 * shift divisor left, using 3-bit shifts for
	 * 32-bit divides as we can't overflow. Three-bit
	 * shifts will result in looping three times less
	 * here, but can result in two loops more later.
	 * Thus using a large shift isn't worth it (and
	 * s8add pairs better than a sll..)
	 */
1:	cmpult	divisor,modulus,compare	# E :
	s8addq	divisor,$31,divisor	# E :
	s8addq	mask,$31,mask		# E :
	bne	compare,1b		# U : U L U L
#else
	/*
	 * Shift divisor and mask left one bit at a time while the
	 * divisor is still below the modulus.  Leave early (blt) once
	 * the divisor has its top bit set, before it is doubled again.
	 */
1:	cmpult	divisor,modulus,compare	# E :
	nop				# E :
	nop				# E :
	blt     divisor, 2f		# U : U L U L

	addq	divisor,divisor,divisor	# E :
	addq	mask,mask,mask		# E :
	unop				# E :
	bne	compare,1b		# U : U L U L
#endif

	/* ok, start to go right again.. */
	/*
	 * Per iteration: if divisor <= modulus, subtract it from the
	 * modulus and (DIV builds) add the current mask bit to the
	 * quotient; then shift divisor and mask right by one.  The loop
	 * ends when the mask bit has been shifted out.
	 */
2:
	/*
	 * Keep things nicely bundled... use a nop instead of not
	 * having an instruction for DIV_ONLY
	 */
#ifdef DIV
	DIV_ONLY(addq quotient,mask,tmp2) # E :
#else
	nop				# E :
#endif
	srl	mask,1,mask		# U :
	cmpule	divisor,modulus,compare	# E :
	subq	modulus,divisor,tmp1	# E :

#ifdef DIV
	DIV_ONLY(cmovne compare,tmp2,quotient)	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	srl	divisor,1,divisor	# U :
	nop				# E : L U L U

	nop				# E :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L U L
#else
	srl	divisor,1,divisor	# U :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L L U
#endif

	/* Common exit: restore the saved registers, drop the frame, return. */
9:	ldq	$1, 0($30)		# L :
	ldq	$2, 8($30)		# L :
	nop				# E :
	nop				# E : U U L L

	ldq	$0,16($30)		# L :
	ldq	tmp1,24($30)		# L :
	nop				# E :
	nop				# E :

#ifdef DIV
	DIV_ONLY(ldq tmp2,32($30))	# L :
#else
	nop				# E :
#endif
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	ufunction
EXPORT_SYMBOL(ufunction)

/*
 * Uhh.. Ugly signed division. I'd rather not have it at all, but
 * it's needed in some circumstances. There are different ways to
 * handle this, really. This does:
 * 	-a / b = a / -b = -(a / b)
 *	-a % b = -(a % b)
 *	a % -b = a % b
 * which is probably not the best solution, but at least should
 * have the property that (x/y)*y + (x%y) = x.
 *
 * sfunction: signed divide (DIV) or remainder (!DIV).
 *
 * In:	$24 = dividend, $25 = divisor, $23 = return address
 * Out:	$27 = quotient (DIV) or remainder (!DIV)
 *
 * Fast path: if neither operand is negative, branch straight into
 * ufunction at label 7 and let it return to the caller.
 * Slow path: call ufunction on the absolute values, then negate the
 * result if GETSIGN says it should be negative.
 *
 * Stack frame on the slow path (STACK bytes at $30):
 *	 0($30)	original $24 (dividend)
 *	 8($30)	original $25 (divisor)
 *	16($30)	caller's return address ($23), clobbered by the bsr
 *	24($30)	saved tmp1 ($3)
 */
.align 4
.globl	sfunction
.ent	sfunction
sfunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0
	/* $28 = $24 | $25: its sign bit is set iff either operand is negative. */
	bis	$24,$25,$28		# E :
	SLONGIFY($28)			# E :
	bge	$28,7b			# U :

	stq	$24,0($30)		# L :
	subq	$31,$24,$28		# E :
	stq	$25,8($30)		# L :
	nop				# E : U L U L

	cmovlt	$24,$28,$24	/* abs($24) */	# E : Latency 2, extra map slot
	nop				# E : as part of the cmov
	stq	$23,16($30)		# L :
	subq	$31,$25,$28		# E : U L U L

	stq	tmp1,24($30)		# L :
	cmovlt	$25,$28,$25	/* abs($25) */	# E : Latency 2, extra map slot
	nop				# E :
	bsr	$23,ufunction		# L0: L U L U

	/*
	 * Reload the original operands so GETSIGN can look at their signs;
	 * tmp1 holds the negated unsigned result, selected below if needed.
	 */
	ldq	$24,0($30)		# L :
	ldq	$25,8($30)		# L :
	GETSIGN($28)			# E :
	subq	$31,$27,tmp1		# E : U U L L

	SLONGIFY($28)			# E :
	ldq	$23,16($30)		# L :
	cmovlt	$28,tmp1,$27		# E : Latency 2, extra map slot
	nop				# E : U L L U : as part of the cmov

	ldq	tmp1,24($30)		# L :
	nop				# E : as part of the cmov
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	sfunction
EXPORT_SYMBOL(sfunction)