1/* 2 * Copyright © 2018 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#ifndef FAST_IDIV_BY_CONST_H 25#define FAST_IDIV_BY_CONST_H 26 27/* Imported from: 28 * https://raw.githubusercontent.com/ridiculousfish/libdivide/master/divide_by_constants_codegen_reference.c 29 */ 30 31#include <inttypes.h> 32#include <limits.h> 33#include <assert.h> 34 35#ifdef __cplusplus 36extern "C" { 37#endif 38 39/* Computes "magic info" for performing signed division by a fixed integer D. 40 * The type 'sint_t' is assumed to be defined as a signed integer type large 41 * enough to hold both the dividend and the divisor. 42 * Here >> is arithmetic (signed) shift, and >>> is logical shift. 43 * 44 * To emit code for n/d, rounding towards zero, use the following sequence: 45 * 46 * m = compute_signed_magic_info(D) 47 * emit("result = (m.multiplier * n) >> SINT_BITS"); 48 * if d > 0 and m.multiplier < 0: emit("result += n") 49 * if d < 0 and m.multiplier > 0: emit("result -= n") 50 * if m.post_shift > 0: emit("result >>= m.shift") 51 * emit("result += (result < 0)") 52 * 53 * The shifts by SINT_BITS may be "free" if the high half of the full multiply 54 * is put in a separate register. 55 * 56 * The final add can of course be implemented via the sign bit, e.g. 57 * result += (result >>> (SINT_BITS - 1)) 58 * or 59 * result -= (result >> (SINT_BITS - 1)) 60 * 61 * This code is heavily indebted to Hacker's Delight by Henry Warren. 62 * See http://www.hackersdelight.org/HDcode/magic.c.txt 63 * Used with permission from http://www.hackersdelight.org/permissions.htm 64 */ 65 66struct util_fast_sdiv_info { 67 int64_t multiplier; /* the "magic number" multiplier */ 68 unsigned shift; /* shift for the dividend after multiplying */ 69}; 70 71struct util_fast_sdiv_info 72util_compute_fast_sdiv_info(int64_t D, unsigned SINT_BITS); 73 74/* Computes "magic info" for performing unsigned division by a fixed positive 75 * integer D. UINT_BITS is the bit size at which the final "magic" 76 * calculation will be performed; it is assumed to be large enough to hold 77 * both the dividand and the divisor. num_bits can be set appropriately if n 78 * is known to be smaller than calc_bits; if this is not known then UINT_BITS 79 * for num_bits. 80 * 81 * Assume we have a hardware register of width UINT_BITS, a known constant D 82 * which is not zero and not a power of 2, and a variable n of width num_bits 83 * (which may be up to UINT_BITS). To emit code for n/d, use one of the two 84 * following sequences (here >>> refers to a logical bitshift): 85 * 86 * m = compute_unsigned_magic_info(D, num_bits) 87 * if m.pre_shift > 0: emit("n >>>= m.pre_shift") 88 * if m.increment: emit("n = saturated_increment(n)") 89 * emit("result = (m.multiplier * n) >>> UINT_BITS") 90 * if m.post_shift > 0: emit("result >>>= m.post_shift") 91 * 92 * or 93 * 94 * m = compute_unsigned_magic_info(D, num_bits) 95 * if m.pre_shift > 0: emit("n >>>= m.pre_shift") 96 * emit("result = m.multiplier * n") 97 * if m.increment: emit("result = result + m.multiplier") 98 * emit("result >>>= UINT_BITS") 99 * if m.post_shift > 0: emit("result >>>= m.post_shift") 100 * 101 * This second version works even if D is 1. The shifts by UINT_BITS may be 102 * "free" if the high half of the full multiply is put in a separate register. 103 * 104 * saturated_increment(n) means "increment n unless it would wrap to 0," i.e. 105 * if n == (1 << UINT_BITS)-1: result = n 106 * else: result = n+1 107 * A common way to implement this is with the carry bit. For example, on x86: 108 * add 1 109 * sbb 0 110 * 111 * Some invariants: 112 * 1: At least one of pre_shift and increment is zero 113 * 2: multiplier is never zero 114 * 115 * This code incorporates the "round down" optimization per ridiculous_fish. 116 */ 117 118struct util_fast_udiv_info { 119 uint64_t multiplier; /* the "magic number" multiplier */ 120 unsigned pre_shift; /* shift for the dividend before multiplying */ 121 unsigned post_shift; /* shift for the dividend after multiplying */ 122 int increment; /* 0 or 1; if set then increment the numerator, using one of 123 the two strategies */ 124}; 125 126struct util_fast_udiv_info 127util_compute_fast_udiv_info(uint64_t D, unsigned num_bits, unsigned UINT_BITS); 128 129/* Below are possible options for dividing by a uniform in a shader where 130 * the divisor is constant but not known at compile time. 131 */ 132 133/* Full version. */ 134static inline uint32_t 135util_fast_udiv32(uint32_t n, struct util_fast_udiv_info info) 136{ 137 n = n >> info.pre_shift; 138 /* If the divisor is not 1, you can instead use a 32-bit ADD that clamps 139 * to UINT_MAX. Dividing by 1 needs the full 64-bit ADD. 140 * 141 * If you have unsigned 64-bit MAD with 32-bit inputs, you can do: 142 * increment = increment ? multiplier : 0; // on the CPU 143 * (n * multiplier + increment) // on the GPU using unsigned 64-bit MAD 144 */ 145 n = (((uint64_t)n + info.increment) * info.multiplier) >> 32; 146 n = n >> info.post_shift; 147 return n; 148} 149 150/* A little more efficient version if n != UINT_MAX, i.e. no unsigned 151 * wraparound in the computation. 152 */ 153static inline uint32_t 154util_fast_udiv32_nuw(uint32_t n, struct util_fast_udiv_info info) 155{ 156 assert(n != UINT32_MAX); 157 n = n >> info.pre_shift; 158 n = n + info.increment; 159 n = ((uint64_t)n * info.multiplier) >> 32; 160 n = n >> info.post_shift; 161 return n; 162} 163 164/* Even faster version but both operands must be 31-bit unsigned integers 165 * and the divisor must be greater than 1. 166 * 167 * info must be computed with num_bits == 31. 168 */ 169static inline uint32_t 170util_fast_udiv32_u31_d_not_one(uint32_t n, struct util_fast_udiv_info info) 171{ 172 assert(info.pre_shift == 0); 173 assert(info.increment == 0); 174 n = ((uint64_t)n * info.multiplier) >> 32; 175 n = n >> info.post_shift; 176 return n; 177} 178 179#ifdef __cplusplus 180} /* extern C */ 181#endif 182 183#endif /* FAST_IDIV_BY_CONST_H */ 184