/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-divide.S
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Alpha division..
 */

/*
 * The alpha chip doesn't provide hardware division, so we have to do it
 * by hand.  The compiler expects the functions
 *
 *	__divqu: 64-bit unsigned long divide
 *	__remqu: 64-bit unsigned long remainder
 *	__divq/__remq: signed 64-bit
 *	__divlu/__remlu: unsigned 32-bit
 *	__divl/__reml: signed 32-bit
 *
 * (These are the symbol names generated below by func(qu), func(q),
 * func(lu) and func(l) with the __div / __rem prefix.)
 *
 * These are not normal C functions: instead of the normal
 * calling sequence, these expect their arguments in registers
 * $24 and $25, and return the result in $27. Register $28 may
 * be clobbered (assembly temporary), anything else must be saved.
 * The return address is passed in $23.
 *
 * In short: painful.
 *
 * This is a rather simple bit-at-a-time algorithm: it's very good
 * at dividing random 64-bit numbers, but the more usual case where
 * the divisor is small is handled better by the DEC algorithm
 * using lookup tables. This uses much less memory, though, and is
 * nicer on the cache.. Besides, I don't know the copyright status
 * of the DEC code.
 */

/*
 * My temporaries:
 *	$0 - current bit
 *	$1 - shifted divisor
 *	$2 - modulus/quotient
 *	$3 - tmp1: candidate modulus (modulus - divisor)
 *	$4 - tmp2: candidate quotient (quotient + mask), DIV build only
 *
 *	$23 - return address
 *	$24 - dividend
 *	$25 - divisor
 *
 *	$27 - quotient/modulus
 *	$28 - compare status
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 */

#include <asm/export.h>
#define halt .long 0

/*
 * Select function type and registers
 *
 * Building with DIV defined produces the __div* entry points (result
 * register $27 holds the quotient); otherwise the __rem* entry points
 * are produced ($27 holds the modulus).  DIV_ONLY()/MOD_ONLY() expand
 * to their argument only in the matching build and to nothing in the
 * other one.
 */
#define mask	$0
#define divisor	$1
#define compare $28
#define tmp1	$3
#define tmp2	$4

#ifdef DIV
#define DIV_ONLY(x,y...)	x,##y
#define MOD_ONLY(x,y...)
#define func(x) __div##x
#define modulus $2
#define quotient $27
/* sign of the quotient: negative iff the operand signs differ */
#define GETSIGN(x) xor $24,$25,x
/* frame slots 0..32 are used (the extra one holds tmp2) */
#define STACK 48
#else
#define DIV_ONLY(x,y...)
#define MOD_ONLY(x,y...)	x,##y
#define func(x) __rem##x
#define modulus $27
#define quotient $2
/* sign of the remainder: follows the dividend */
#define GETSIGN(x) bis $24,$24,x
/* frame slots 0..24 are used */
#define STACK 32
#endif

/*
 * For 32-bit operations, we need to extend to 64-bit
 *
 * LONGIFY keeps the low four bytes (zero-extension); SLONGIFY
 * sign-extends the low 32 bits.  Both are empty for the 64-bit build.
 */
#ifdef INTSIZE
#define ufunction func(lu)
#define sfunction func(l)
#define LONGIFY(x) zapnot x,15,x
#define SLONGIFY(x) addl x,0,x
#else
#define ufunction func(qu)
#define sfunction func(q)
#define LONGIFY(x)
#define SLONGIFY(x)
#endif

/*
 * ufunction: unsigned divide/remainder.
 *
 *   in:   $24 = dividend, $25 = divisor, $23 = return address
 *   out:  $27 = quotient (DIV build) or remainder (otherwise)
 *   $0-$3 (and $4 in the DIV build) are saved in the frame and
 *   restored before returning; $28 is clobbered.
 *
 * A zero divisor does not trap: the loops are skipped, so the DIV
 * build returns 0 and the remainder build returns the dividend as
 * left by LONGIFY.
 *
 * Local label 7 is a second entry point: sfunction branches to it,
 * with its own frame already allocated, when neither operand is
 * negative.
 */
.set noat
.align	4
.globl	ufunction
.ent	ufunction
ufunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0

7:	stq	$1, 0($30)		# L :
	bis	$25,$25,divisor		# E :
	stq	$2, 8($30)		# L : L U L U

	bis	$24,$24,modulus		# E :
	stq	$0,16($30)		# L :
	bis	$31,$31,quotient	# E :
	LONGIFY(divisor)		# E : U L L U

	stq	tmp1,24($30)		# L :
	LONGIFY(modulus)		# E :
	bis	$31,1,mask		# E :
	DIV_ONLY(stq tmp2,32($30))	# L : L U U L

	beq	divisor, 9f			/* div by zero */
	/*
	 * In spite of the DIV_ONLY being either a non-instruction
	 * or an actual stq, the addition of the .align directive
	 * below ensures that label 1 is going to be nicely aligned
	 */

	.align	4
#ifdef INTSIZE
	/*
	 * shift divisor left, using 3-bit shifts for
	 * 32-bit divides as we can't overflow. Three-bit
	 * shifts will result in looping three times less
	 * here, but can result in two loops more later.
	 * Thus using a large shift isn't worth it (and
	 * s8add pairs better than a sll..)
	 */
1:	cmpult	divisor,modulus,compare	# E :
	s8addq	divisor,$31,divisor	# E :
	s8addq	mask,$31,mask		# E :
	bne	compare,1b		# U : U L U L
#else
	/*
	 * 64-bit: shift one bit at a time, and stop as soon as the
	 * top bit of the divisor is set (blt) so it cannot be
	 * shifted out.
	 */
1:	cmpult	divisor,modulus,compare	# E :
	nop				# E :
	nop				# E :
	blt     divisor, 2f		# U : U L U L

	addq	divisor,divisor,divisor	# E :
	addq	mask,mask,mask		# E :
	unop				# E :
	bne	compare,1b		# U : U L U L
#endif

	/* ok, start to go right again.. */
2:
	/*
	 * Keep things nicely bundled... use a nop instead of not
	 * having an instruction for DIV_ONLY
	 */
#ifdef DIV
	DIV_ONLY(addq quotient,mask,tmp2) # E :
#else
	nop				# E :
#endif
	srl	mask,1,mask		# U :
	cmpule	divisor,modulus,compare	# E :
	subq	modulus,divisor,tmp1	# E :

	/*
	 * If divisor <= modulus, commit the candidates computed above:
	 * quotient += mask (DIV build) and modulus -= divisor.
	 */
#ifdef DIV
	DIV_ONLY(cmovne compare,tmp2,quotient)	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	srl	divisor,1,divisor	# U :
	nop				# E : L U L U

	nop				# E :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L U L
#else
	srl	divisor,1,divisor	# U :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L L U
#endif

	/* restore the saved registers and return; the result stays in $27 */
9:	ldq	$1, 0($30)		# L :
	ldq	$2, 8($30)		# L :
	nop				# E :
	nop				# E : U U L L

	ldq	$0,16($30)		# L :
	ldq	tmp1,24($30)		# L :
	nop				# E :
	nop				# E :

#ifdef DIV
	DIV_ONLY(ldq tmp2,32($30))	# L :
#else
	nop				# E :
#endif
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	ufunction
EXPORT_SYMBOL(ufunction)

/*
 * Uhh.. Ugly signed division. I'd rather not have it at all, but
 * it's needed in some circumstances. There are different ways to
 * handle this, really. This does:
 * 	-a / b = a / -b = -(a / b)
 *	-a % b = -(a % b)
 *	a % -b = a % b
 * which is probably not the best solution, but at least should
 * have the property that (x/y)*y + (x%y) = x.
 *
 * sfunction: signed divide/remainder, same register interface as
 * ufunction.  When neither operand is negative it branches straight
 * into ufunction at label 7.  Otherwise it saves the original
 * operands, the return address and tmp1, calls ufunction on the
 * absolute values, and negates the result when GETSIGN() of the
 * original operands is negative.  $24 and $25 are reloaded with
 * their original values before returning.
 */
.align 4
.globl	sfunction
.ent	sfunction
sfunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0
	bis	$24,$25,$28		# E :
	SLONGIFY($28)			# E :
	bge	$28,7b			# U :

	stq	$24,0($30)		# L :
	subq	$31,$24,$28		# E :
	stq	$25,8($30)		# L :
	nop				# E : U L U L

	cmovlt	$24,$28,$24	/* abs($24) */	# E : Latency 2, extra map slot
	nop				# E : as part of the cmov
	stq	$23,16($30)		# L :
	subq	$31,$25,$28		# E : U L U L

	stq	tmp1,24($30)		# L :
	cmovlt	$25,$28,$25	/* abs($25) */	# E : Latency 2, extra map slot
	nop				# E :
	bsr	$23,ufunction		# L0: L U L U

	ldq	$24,0($30)		# L :
	ldq	$25,8($30)		# L :
	GETSIGN($28)			# E :
	subq	$31,$27,tmp1		# E : U U L L

	SLONGIFY($28)			# E :
	ldq	$23,16($30)		# L :
	cmovlt	$28,tmp1,$27		# E : Latency 2, extra map slot
	nop				# E : U L L U : as part of the cmov

	ldq	tmp1,24($30)		# L :
	nop				# E : as part of the cmov
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	sfunction
EXPORT_SYMBOL(sfunction)