/*
 * Copyright © 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef FAST_IDIV_BY_CONST_H
#define FAST_IDIV_BY_CONST_H

/* Imported from:
 *   https://raw.githubusercontent.com/ridiculousfish/libdivide/master/divide_by_constants_codegen_reference.c
 */

#include <inttypes.h>
#include <limits.h>
#include <assert.h>

#ifdef __cplusplus
extern "C" {
#endif

/* Computes "magic info" for performing signed division by a fixed integer D.
 * The type 'sint_t' is assumed to be defined as a signed integer type large
 * enough to hold both the dividend and the divisor; SINT_BITS is the bit
 * width used for the shifts in the sequence below.
 * Here >> is arithmetic (signed) shift, and >>> is logical shift.
 *
 * To emit code for n/d, rounding towards zero, use the following sequence:
 *
 *     m = util_compute_fast_sdiv_info(D, SINT_BITS)
 *     emit("result = (m.multiplier * n) >> SINT_BITS");
 *     if d > 0 and m.multiplier < 0: emit("result += n")
 *     if d < 0 and m.multiplier > 0: emit("result -= n")
 *     if m.shift > 0: emit("result >>= m.shift")
 *     emit("result += (result < 0)")
 *
 * The shifts by SINT_BITS may be "free" if the high half of the full multiply
 * is put in a separate register.
 *
 * The final add can of course be implemented via the sign bit, e.g.
 *    result += (result >>> (SINT_BITS - 1))
 * or
 *    result -= (result >> (SINT_BITS - 1))
 *
 * This code is heavily indebted to Hacker's Delight by Henry Warren.
 * See http://www.hackersdelight.org/HDcode/magic.c.txt
 * Used with permission from http://www.hackersdelight.org/permissions.htm
 */

struct util_fast_sdiv_info {
   int64_t multiplier; /* the "magic number" multiplier */
   unsigned shift; /* shift for the dividend after multiplying */
};

struct util_fast_sdiv_info
util_compute_fast_sdiv_info(int64_t D, unsigned SINT_BITS);

/* Computes "magic info" for performing unsigned division by a fixed positive
 * integer D.  UINT_BITS is the bit size at which the final "magic"
 * calculation will be performed; it is assumed to be large enough to hold
 * both the dividend and the divisor.  num_bits can be set appropriately if n
 * is known to be smaller than UINT_BITS; if this is not known then pass
 * UINT_BITS for num_bits.
 *
 * Assume we have a hardware register of width UINT_BITS, a known constant D
 * which is not zero and not a power of 2, and a variable n of width num_bits
 * (which may be up to UINT_BITS).  To emit code for n/d, use one of the two
 * following sequences (here >>> refers to a logical bitshift):
 *
 *     m = util_compute_fast_udiv_info(D, num_bits, UINT_BITS)
 *     if m.pre_shift > 0: emit("n >>>= m.pre_shift")
 *     if m.increment: emit("n = saturated_increment(n)")
 *     emit("result = (m.multiplier * n) >>> UINT_BITS")
 *     if m.post_shift > 0: emit("result >>>= m.post_shift")
 *
 * or
 *
 *     m = util_compute_fast_udiv_info(D, num_bits, UINT_BITS)
 *     if m.pre_shift > 0: emit("n >>>= m.pre_shift")
 *     emit("result = m.multiplier * n")
 *     if m.increment: emit("result = result + m.multiplier")
 *     emit("result >>>= UINT_BITS")
 *     if m.post_shift > 0: emit("result >>>= m.post_shift")
 *
 * This second version works even if D is 1.  The shifts by UINT_BITS may be
 * "free" if the high half of the full multiply is put in a separate register.
 *
 * saturated_increment(n) means "increment n unless it would wrap to 0," i.e.
 *     if n == (1 << UINT_BITS)-1: result = n
 *     else: result = n+1
 * A common way to implement this is with the carry bit.  For example, on x86:
 *     add 1
 *     sbb 0
 *
 * Some invariants:
 *  1: At least one of pre_shift and increment is zero
 *  2: multiplier is never zero
 *
 * This code incorporates the "round down" optimization per ridiculous_fish.
 */

struct util_fast_udiv_info {
   uint64_t multiplier; /* the "magic number" multiplier */
   unsigned pre_shift; /* shift for the dividend before multiplying */
   unsigned post_shift; /* shift for the dividend after multiplying */
   int increment; /* 0 or 1; if set then increment the numerator, using one of
                     the two strategies */
};

struct util_fast_udiv_info
util_compute_fast_udiv_info(uint64_t D, unsigned num_bits, unsigned UINT_BITS);

/* Below are possible options for dividing by a uniform in a shader where
 * the divisor is constant but not known at compile time.
 */

/* Full version.
 *
 * Applies the magic info to a 32-bit dividend n and returns the quotient.
 * The product is shifted right by 32, so info is expected to have been
 * computed for UINT_BITS == 32.
 */
static inline uint32_t
util_fast_udiv32(uint32_t n, struct util_fast_udiv_info info)
{
   n = n >> info.pre_shift;
   /* If the divisor is not 1, you can instead use a 32-bit ADD that clamps
    * to UINT_MAX. Dividing by 1 needs the full 64-bit ADD.
    *
    * If you have unsigned 64-bit MAD with 32-bit inputs, you can do:
    *    increment = increment ? multiplier : 0; // on the CPU
    *    (n * multiplier + increment) // on the GPU using unsigned 64-bit MAD
    */
   /* The increment is added in 64 bits so that n == UINT32_MAX cannot wrap. */
   n = (((uint64_t)n + info.increment) * info.multiplier) >> 32;
   n = n >> info.post_shift;
   return n;
}

/* A little more efficient version if n != UINT32_MAX, i.e. no unsigned
 * wraparound in the computation.
 *
 * The increment is applied with a plain 32-bit add, which is why the
 * caller must guarantee n != UINT32_MAX (checked by the assert).
 */
static inline uint32_t
util_fast_udiv32_nuw(uint32_t n, struct util_fast_udiv_info info)
{
   assert(n != UINT32_MAX);
   n = n >> info.pre_shift;
   n = n + info.increment;
   n = ((uint64_t)n * info.multiplier) >> 32;
   n = n >> info.post_shift;
   return n;
}

/* Even faster version but both operands must be 31-bit unsigned integers
 * and the divisor must be greater than 1.
 *
 * info must be computed with num_bits == 31.  The asserts check that such
 * an info needs neither a pre-shift nor an increment, so only the multiply
 * and the post-shift remain.
 */
static inline uint32_t
util_fast_udiv32_u31_d_not_one(uint32_t n, struct util_fast_udiv_info info)
{
   assert(info.pre_shift == 0);
   assert(info.increment == 0);
   n = ((uint64_t)n * info.multiplier) >> 32;
   n = n >> info.post_shift;
   return n;
}

#ifdef __cplusplus
} /* extern C */
#endif

#endif /* FAST_IDIV_BY_CONST_H */