1bf215546Sopenharmony_ci/*
2bf215546Sopenharmony_ci * Copyright © 2018 Advanced Micro Devices, Inc.
3bf215546Sopenharmony_ci *
4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the
9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions:
10bf215546Sopenharmony_ci *
11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next
12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
13bf215546Sopenharmony_ci * Software.
14bf215546Sopenharmony_ci *
15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21bf215546Sopenharmony_ci * IN THE SOFTWARE.
22bf215546Sopenharmony_ci */
23bf215546Sopenharmony_ci
24bf215546Sopenharmony_ci#ifndef FAST_IDIV_BY_CONST_H
25bf215546Sopenharmony_ci#define FAST_IDIV_BY_CONST_H
26bf215546Sopenharmony_ci
27bf215546Sopenharmony_ci/* Imported from:
28bf215546Sopenharmony_ci *   https://raw.githubusercontent.com/ridiculousfish/libdivide/master/divide_by_constants_codegen_reference.c
29bf215546Sopenharmony_ci */
30bf215546Sopenharmony_ci
31bf215546Sopenharmony_ci#include <inttypes.h>
32bf215546Sopenharmony_ci#include <limits.h>
33bf215546Sopenharmony_ci#include <assert.h>
34bf215546Sopenharmony_ci
35bf215546Sopenharmony_ci#ifdef __cplusplus
36bf215546Sopenharmony_ciextern "C" {
37bf215546Sopenharmony_ci#endif
38bf215546Sopenharmony_ci
39bf215546Sopenharmony_ci/* Computes "magic info" for performing signed division by a fixed integer D.
40bf215546Sopenharmony_ci * The type 'sint_t' is assumed to be defined as a signed integer type large
41bf215546Sopenharmony_ci * enough to hold both the dividend and the divisor.
42bf215546Sopenharmony_ci * Here >> is arithmetic (signed) shift, and >>> is logical shift.
43bf215546Sopenharmony_ci *
44bf215546Sopenharmony_ci * To emit code for n/d, rounding towards zero, use the following sequence:
45bf215546Sopenharmony_ci *
46bf215546Sopenharmony_ci *   m = compute_signed_magic_info(D)
47bf215546Sopenharmony_ci *   emit("result = (m.multiplier * n) >> SINT_BITS");
48bf215546Sopenharmony_ci *   if d > 0 and m.multiplier < 0: emit("result += n")
49bf215546Sopenharmony_ci *   if d < 0 and m.multiplier > 0: emit("result -= n")
 *   if m.shift > 0: emit("result >>= m.shift")
51bf215546Sopenharmony_ci *   emit("result += (result < 0)")
52bf215546Sopenharmony_ci *
53bf215546Sopenharmony_ci * The shifts by SINT_BITS may be "free" if the high half of the full multiply
54bf215546Sopenharmony_ci * is put in a separate register.
55bf215546Sopenharmony_ci *
56bf215546Sopenharmony_ci * The final add can of course be implemented via the sign bit, e.g.
57bf215546Sopenharmony_ci *    result += (result >>> (SINT_BITS - 1))
58bf215546Sopenharmony_ci * or
59bf215546Sopenharmony_ci *    result -= (result >> (SINT_BITS - 1))
60bf215546Sopenharmony_ci *
61bf215546Sopenharmony_ci * This code is heavily indebted to Hacker's Delight by Henry Warren.
62bf215546Sopenharmony_ci * See http://www.hackersdelight.org/HDcode/magic.c.txt
63bf215546Sopenharmony_ci * Used with permission from http://www.hackersdelight.org/permissions.htm
64bf215546Sopenharmony_ci */
65bf215546Sopenharmony_ci
/* Constants that replace a signed division by a fixed divisor with a
 * multiply followed by a shift.  Produced by util_compute_fast_sdiv_info().
 */
struct util_fast_sdiv_info {
   /* The "magic number" multiplier. */
   int64_t multiplier;

   /* Shift for the dividend after multiplying. */
   unsigned shift;
};

/* Computes the signed-division magic info for the fixed divisor D.
 *
 *   D          the constant divisor
 *   SINT_BITS  bit width at which the multiply/shift sequence is performed
 *
 * Only the declaration is provided by this header.
 */
struct util_fast_sdiv_info
util_compute_fast_sdiv_info(int64_t D, unsigned SINT_BITS);
73bf215546Sopenharmony_ci
74bf215546Sopenharmony_ci/* Computes "magic info" for performing unsigned division by a fixed positive
75bf215546Sopenharmony_ci * integer D.  UINT_BITS is the bit size at which the final "magic"
76bf215546Sopenharmony_ci * calculation will be performed; it is assumed to be large enough to hold
 * both the dividend and the divisor.  num_bits can be set appropriately if n
 * is known to be narrower than UINT_BITS; if this is not known then pass
 * UINT_BITS for num_bits.
80bf215546Sopenharmony_ci *
81bf215546Sopenharmony_ci * Assume we have a hardware register of width UINT_BITS, a known constant D
82bf215546Sopenharmony_ci * which is not zero and not a power of 2, and a variable n of width num_bits
83bf215546Sopenharmony_ci * (which may be up to UINT_BITS). To emit code for n/d, use one of the two
84bf215546Sopenharmony_ci * following sequences (here >>> refers to a logical bitshift):
85bf215546Sopenharmony_ci *
86bf215546Sopenharmony_ci *   m = compute_unsigned_magic_info(D, num_bits)
87bf215546Sopenharmony_ci *   if m.pre_shift > 0: emit("n >>>= m.pre_shift")
88bf215546Sopenharmony_ci *   if m.increment: emit("n = saturated_increment(n)")
89bf215546Sopenharmony_ci *   emit("result = (m.multiplier * n) >>> UINT_BITS")
90bf215546Sopenharmony_ci *   if m.post_shift > 0: emit("result >>>= m.post_shift")
91bf215546Sopenharmony_ci *
92bf215546Sopenharmony_ci * or
93bf215546Sopenharmony_ci *
94bf215546Sopenharmony_ci *   m = compute_unsigned_magic_info(D, num_bits)
95bf215546Sopenharmony_ci *   if m.pre_shift > 0: emit("n >>>= m.pre_shift")
96bf215546Sopenharmony_ci *   emit("result = m.multiplier * n")
97bf215546Sopenharmony_ci *   if m.increment: emit("result = result + m.multiplier")
98bf215546Sopenharmony_ci *   emit("result >>>= UINT_BITS")
99bf215546Sopenharmony_ci *   if m.post_shift > 0: emit("result >>>= m.post_shift")
100bf215546Sopenharmony_ci *
101bf215546Sopenharmony_ci * This second version works even if D is 1.  The shifts by UINT_BITS may be
102bf215546Sopenharmony_ci * "free" if the high half of the full multiply is put in a separate register.
103bf215546Sopenharmony_ci *
104bf215546Sopenharmony_ci * saturated_increment(n) means "increment n unless it would wrap to 0," i.e.
105bf215546Sopenharmony_ci *   if n == (1 << UINT_BITS)-1: result = n
106bf215546Sopenharmony_ci *   else: result = n+1
107bf215546Sopenharmony_ci * A common way to implement this is with the carry bit. For example, on x86:
108bf215546Sopenharmony_ci *   add 1
109bf215546Sopenharmony_ci *   sbb 0
110bf215546Sopenharmony_ci *
111bf215546Sopenharmony_ci * Some invariants:
112bf215546Sopenharmony_ci *   1: At least one of pre_shift and increment is zero
113bf215546Sopenharmony_ci *   2: multiplier is never zero
114bf215546Sopenharmony_ci *
115bf215546Sopenharmony_ci * This code incorporates the "round down" optimization per ridiculous_fish.
116bf215546Sopenharmony_ci */
117bf215546Sopenharmony_ci
/* Constants that replace an unsigned division by a fixed divisor with an
 * optional pre-shift, an optional increment, a multiply and a post-shift.
 * Produced by util_compute_fast_udiv_info().
 */
struct util_fast_udiv_info {
   /* The "magic number" multiplier. */
   uint64_t multiplier;

   /* Shift for the dividend before multiplying. */
   unsigned pre_shift;

   /* Shift for the dividend after multiplying. */
   unsigned post_shift;

   /* 0 or 1; if set then increment the numerator, using one of the two
    * strategies.
    */
   int increment;
};

/* Computes the unsigned-division magic info for the fixed divisor D.
 *
 *   D          the constant divisor
 *   num_bits   bit width of the dividend n
 *   UINT_BITS  bit width at which the final "magic" calculation is performed
 *
 * Only the declaration is provided by this header.
 */
struct util_fast_udiv_info
util_compute_fast_udiv_info(uint64_t D, unsigned num_bits, unsigned UINT_BITS);
128bf215546Sopenharmony_ci
129bf215546Sopenharmony_ci/* Below are possible options for dividing by a uniform in a shader where
130bf215546Sopenharmony_ci * the divisor is constant but not known at compile time.
131bf215546Sopenharmony_ci */
132bf215546Sopenharmony_ci
133bf215546Sopenharmony_ci/* Full version. */
134bf215546Sopenharmony_cistatic inline uint32_t
135bf215546Sopenharmony_ciutil_fast_udiv32(uint32_t n, struct util_fast_udiv_info info)
136bf215546Sopenharmony_ci{
137bf215546Sopenharmony_ci   n = n >> info.pre_shift;
138bf215546Sopenharmony_ci   /* If the divisor is not 1, you can instead use a 32-bit ADD that clamps
139bf215546Sopenharmony_ci    * to UINT_MAX. Dividing by 1 needs the full 64-bit ADD.
140bf215546Sopenharmony_ci    *
141bf215546Sopenharmony_ci    * If you have unsigned 64-bit MAD with 32-bit inputs, you can do:
142bf215546Sopenharmony_ci    *    increment = increment ? multiplier : 0; // on the CPU
143bf215546Sopenharmony_ci    *    (n * multiplier + increment) // on the GPU using unsigned 64-bit MAD
144bf215546Sopenharmony_ci    */
145bf215546Sopenharmony_ci   n = (((uint64_t)n + info.increment) * info.multiplier) >> 32;
146bf215546Sopenharmony_ci   n = n >> info.post_shift;
147bf215546Sopenharmony_ci   return n;
148bf215546Sopenharmony_ci}
149bf215546Sopenharmony_ci
150bf215546Sopenharmony_ci/* A little more efficient version if n != UINT_MAX, i.e. no unsigned
151bf215546Sopenharmony_ci * wraparound in the computation.
152bf215546Sopenharmony_ci */
153bf215546Sopenharmony_cistatic inline uint32_t
154bf215546Sopenharmony_ciutil_fast_udiv32_nuw(uint32_t n, struct util_fast_udiv_info info)
155bf215546Sopenharmony_ci{
156bf215546Sopenharmony_ci   assert(n != UINT32_MAX);
157bf215546Sopenharmony_ci   n = n >> info.pre_shift;
158bf215546Sopenharmony_ci   n = n + info.increment;
159bf215546Sopenharmony_ci   n = ((uint64_t)n * info.multiplier) >> 32;
160bf215546Sopenharmony_ci   n = n >> info.post_shift;
161bf215546Sopenharmony_ci   return n;
162bf215546Sopenharmony_ci}
163bf215546Sopenharmony_ci
164bf215546Sopenharmony_ci/* Even faster version but both operands must be 31-bit unsigned integers
165bf215546Sopenharmony_ci * and the divisor must be greater than 1.
166bf215546Sopenharmony_ci *
167bf215546Sopenharmony_ci * info must be computed with num_bits == 31.
168bf215546Sopenharmony_ci */
169bf215546Sopenharmony_cistatic inline uint32_t
170bf215546Sopenharmony_ciutil_fast_udiv32_u31_d_not_one(uint32_t n, struct util_fast_udiv_info info)
171bf215546Sopenharmony_ci{
172bf215546Sopenharmony_ci   assert(info.pre_shift == 0);
173bf215546Sopenharmony_ci   assert(info.increment == 0);
174bf215546Sopenharmony_ci   n = ((uint64_t)n * info.multiplier) >> 32;
175bf215546Sopenharmony_ci   n = n >> info.post_shift;
176bf215546Sopenharmony_ci   return n;
177bf215546Sopenharmony_ci}
178bf215546Sopenharmony_ci
179bf215546Sopenharmony_ci#ifdef __cplusplus
180bf215546Sopenharmony_ci} /* extern C */
181bf215546Sopenharmony_ci#endif
182bf215546Sopenharmony_ci
183bf215546Sopenharmony_ci#endif /* FAST_IDIV_BY_CONST_H */
184