162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0 */ 262306a36Sopenharmony_ci#ifndef _ASM_HASH_H 362306a36Sopenharmony_ci#define _ASM_HASH_H 462306a36Sopenharmony_ci 562306a36Sopenharmony_ci/* 662306a36Sopenharmony_ci * HP-PA only implements integer multiply in the FPU. However, for 762306a36Sopenharmony_ci * integer multiplies by constant, it has a number of shift-and-add 862306a36Sopenharmony_ci * (but no shift-and-subtract, sigh!) instructions that a compiler 962306a36Sopenharmony_ci * can synthesize a code sequence with. 1062306a36Sopenharmony_ci * 1162306a36Sopenharmony_ci * Unfortunately, GCC isn't very efficient at using them. For example 1262306a36Sopenharmony_ci * it uses three instructions for "x *= 21" when only two are needed. 1362306a36Sopenharmony_ci * But we can find a sequence manually. 1462306a36Sopenharmony_ci */ 1562306a36Sopenharmony_ci 1662306a36Sopenharmony_ci#define HAVE_ARCH__HASH_32 1 1762306a36Sopenharmony_ci 1862306a36Sopenharmony_ci/* 1962306a36Sopenharmony_ci * This is a multiply by GOLDEN_RATIO_32 = 0x61C88647 optimized for the 2062306a36Sopenharmony_ci * PA7100 pairing rules. This is an in-order 2-way superscalar processor. 2162306a36Sopenharmony_ci * Only one instruction in a pair may be a shift (by more than 3 bits), 2262306a36Sopenharmony_ci * but other than that, simple ALU ops (including shift-and-add by up 2362306a36Sopenharmony_ci * to 3 bits) may be paired arbitrarily. 2462306a36Sopenharmony_ci * 2562306a36Sopenharmony_ci * PA8xxx processors also dual-issue ALU instructions, although with 2662306a36Sopenharmony_ci * fewer constraints, so this schedule is good for them, too. 2762306a36Sopenharmony_ci * 2862306a36Sopenharmony_ci * This 6-step sequence was found by Yevgen Voronenko's implementation 2962306a36Sopenharmony_ci * of the Hcub algorithm at http://spiral.ece.cmu.edu/mcm/gen.html. 3062306a36Sopenharmony_ci */ 3162306a36Sopenharmony_cistatic inline u32 __attribute_const__ __hash_32(u32 x) 3262306a36Sopenharmony_ci{ 3362306a36Sopenharmony_ci u32 a, b, c; 3462306a36Sopenharmony_ci 3562306a36Sopenharmony_ci /* 3662306a36Sopenharmony_ci * Phase 1: Compute a = (x << 19) + x, 3762306a36Sopenharmony_ci * b = (x << 9) + a, c = (x << 23) + b. 3862306a36Sopenharmony_ci */ 3962306a36Sopenharmony_ci a = x << 19; /* Two shifts can't be paired */ 4062306a36Sopenharmony_ci b = x << 9; a += x; 4162306a36Sopenharmony_ci c = x << 23; b += a; 4262306a36Sopenharmony_ci c += b; 4362306a36Sopenharmony_ci /* Phase 2: Return (b<<11) + (c<<6) + (a<<3) - c */ 4462306a36Sopenharmony_ci b <<= 11; 4562306a36Sopenharmony_ci a += c << 3; b -= c; 4662306a36Sopenharmony_ci return (a << 3) + b; 4762306a36Sopenharmony_ci} 4862306a36Sopenharmony_ci 4962306a36Sopenharmony_ci#if BITS_PER_LONG == 64 5062306a36Sopenharmony_ci 5162306a36Sopenharmony_ci#define HAVE_ARCH_HASH_64 1 5262306a36Sopenharmony_ci 5362306a36Sopenharmony_ci/* 5462306a36Sopenharmony_ci * Finding a good shift-and-add chain for GOLDEN_RATIO_64 is tricky, 5562306a36Sopenharmony_ci * because available software for the purpose chokes on constants this 5662306a36Sopenharmony_ci * large. (It's mostly designed for compiling FIR filter coefficients 5762306a36Sopenharmony_ci * into FPGAs.) 5862306a36Sopenharmony_ci * 5962306a36Sopenharmony_ci * However, Jason Thong pointed out a work-around. The Hcub software 6062306a36Sopenharmony_ci * (http://spiral.ece.cmu.edu/mcm/gen.html) is designed for *multiple* 6162306a36Sopenharmony_ci * constant multiplication, and is good at finding shift-and-add chains 6262306a36Sopenharmony_ci * which share common terms. 6362306a36Sopenharmony_ci * 6462306a36Sopenharmony_ci * Looking at 0x0x61C8864680B583EB in binary: 6562306a36Sopenharmony_ci * 0110000111001000100001100100011010000000101101011000001111101011 6662306a36Sopenharmony_ci * \______________/ \__________/ \_______/ \________/ 6762306a36Sopenharmony_ci * \____________________________/ \____________________/ 6862306a36Sopenharmony_ci * you can see the non-zero bits are divided into several well-separated 6962306a36Sopenharmony_ci * blocks. Hcub can find algorithms for those terms separately, which 7062306a36Sopenharmony_ci * can then be shifted and added together. 7162306a36Sopenharmony_ci * 7262306a36Sopenharmony_ci * Dividing the input into 2, 3 or 4 blocks, Hcub can find solutions 7362306a36Sopenharmony_ci * with 10, 9 or 8 adds, respectively, making a total of 11 for the 7462306a36Sopenharmony_ci * whole number. 7562306a36Sopenharmony_ci * 7662306a36Sopenharmony_ci * Using just two large blocks, 0xC3910C8D << 31 in the high bits, 7762306a36Sopenharmony_ci * and 0xB583EB in the low bits, produces as good an algorithm as any, 7862306a36Sopenharmony_ci * and with one more small shift than alternatives. 7962306a36Sopenharmony_ci * 8062306a36Sopenharmony_ci * The high bits are a larger number and more work to compute, as well 8162306a36Sopenharmony_ci * as needing one extra cycle to shift left 31 bits before the final 8262306a36Sopenharmony_ci * addition, so they are the critical path for scheduling. The low bits 8362306a36Sopenharmony_ci * can fit into the scheduling slots left over. 8462306a36Sopenharmony_ci */ 8562306a36Sopenharmony_ci 8662306a36Sopenharmony_ci 8762306a36Sopenharmony_ci/* 8862306a36Sopenharmony_ci * This _ASSIGN(dst, src) macro performs "dst = src", but prevents GCC 8962306a36Sopenharmony_ci * from inferring anything about the value assigned to "dest". 9062306a36Sopenharmony_ci * 9162306a36Sopenharmony_ci * This prevents it from mis-optimizing certain sequences. 9262306a36Sopenharmony_ci * In particular, gcc is annoyingly eager to combine consecutive shifts. 9362306a36Sopenharmony_ci * Given "x <<= 19; y += x; z += x << 1;", GCC will turn this into 9462306a36Sopenharmony_ci * "y += x << 19; z += x << 20;" even though the latter sequence needs 9562306a36Sopenharmony_ci * an additional instruction and temporary register. 9662306a36Sopenharmony_ci * 9762306a36Sopenharmony_ci * Because no actual assembly code is generated, this construct is 9862306a36Sopenharmony_ci * usefully portable across all GCC platforms, and so can be test-compiled 9962306a36Sopenharmony_ci * on non-PA systems. 10062306a36Sopenharmony_ci * 10162306a36Sopenharmony_ci * In two places, additional unused input dependencies are added. This 10262306a36Sopenharmony_ci * forces GCC's scheduling so it does not rearrange instructions too much. 10362306a36Sopenharmony_ci * Because the PA-8xxx is out of order, I'm not sure how much this matters, 10462306a36Sopenharmony_ci * but why make it more difficult for the processor than necessary? 10562306a36Sopenharmony_ci */ 10662306a36Sopenharmony_ci#define _ASSIGN(dst, src, ...) asm("" : "=r" (dst) : "0" (src), ##__VA_ARGS__) 10762306a36Sopenharmony_ci 10862306a36Sopenharmony_ci/* 10962306a36Sopenharmony_ci * Multiply by GOLDEN_RATIO_64 = 0x0x61C8864680B583EB using a heavily 11062306a36Sopenharmony_ci * optimized shift-and-add sequence. 11162306a36Sopenharmony_ci * 11262306a36Sopenharmony_ci * Without the final shift, the multiply proper is 19 instructions, 11362306a36Sopenharmony_ci * 10 cycles and uses only 4 temporaries. Whew! 11462306a36Sopenharmony_ci * 11562306a36Sopenharmony_ci * You are not expected to understand this. 11662306a36Sopenharmony_ci */ 11762306a36Sopenharmony_cistatic __always_inline u32 __attribute_const__ 11862306a36Sopenharmony_cihash_64(u64 a, unsigned int bits) 11962306a36Sopenharmony_ci{ 12062306a36Sopenharmony_ci u64 b, c, d; 12162306a36Sopenharmony_ci 12262306a36Sopenharmony_ci /* 12362306a36Sopenharmony_ci * Encourage GCC to move a dynamic shift to %sar early, 12462306a36Sopenharmony_ci * thereby freeing up an additional temporary register. 12562306a36Sopenharmony_ci */ 12662306a36Sopenharmony_ci if (!__builtin_constant_p(bits)) 12762306a36Sopenharmony_ci asm("" : "=q" (bits) : "0" (64 - bits)); 12862306a36Sopenharmony_ci else 12962306a36Sopenharmony_ci bits = 64 - bits; 13062306a36Sopenharmony_ci 13162306a36Sopenharmony_ci _ASSIGN(b, a*5); c = a << 13; 13262306a36Sopenharmony_ci b = (b << 2) + a; _ASSIGN(d, a << 17); 13362306a36Sopenharmony_ci a = b + (a << 1); c += d; 13462306a36Sopenharmony_ci d = a << 10; _ASSIGN(a, a << 19); 13562306a36Sopenharmony_ci d = a - d; _ASSIGN(a, a << 4, "X" (d)); 13662306a36Sopenharmony_ci c += b; a += b; 13762306a36Sopenharmony_ci d -= c; c += a << 1; 13862306a36Sopenharmony_ci a += c << 3; _ASSIGN(b, b << (7+31), "X" (c), "X" (d)); 13962306a36Sopenharmony_ci a <<= 31; b += d; 14062306a36Sopenharmony_ci a += b; 14162306a36Sopenharmony_ci return a >> bits; 14262306a36Sopenharmony_ci} 14362306a36Sopenharmony_ci#undef _ASSIGN /* We're a widely-used header file, so don't litter! */ 14462306a36Sopenharmony_ci 14562306a36Sopenharmony_ci#endif /* BITS_PER_LONG == 64 */ 14662306a36Sopenharmony_ci 14762306a36Sopenharmony_ci#endif /* _ASM_HASH_H */ 148