1/* Bit and bytes utilities.
2
3   Bytes swap functions, reverse order of bytes:
4
5   - _Py_bswap16(uint16_t)
6   - _Py_bswap32(uint32_t)
7   - _Py_bswap64(uint64_t)
8*/
9
10#ifndef Py_INTERNAL_BITUTILS_H
11#define Py_INTERNAL_BITUTILS_H
12#ifdef __cplusplus
13extern "C" {
14#endif
15
16#ifndef Py_BUILD_CORE
17#  error "this header requires Py_BUILD_CORE define"
18#endif
19
20#if defined(__GNUC__) \
21      && ((__GNUC__ >= 5) || (__GNUC__ == 4) && (__GNUC_MINOR__ >= 8))
22   /* __builtin_bswap16() is available since GCC 4.8,
23      __builtin_bswap32() is available since GCC 4.3,
24      __builtin_bswap64() is available since GCC 4.3. */
25#  define _PY_HAVE_BUILTIN_BSWAP
26#endif
27
28#ifdef _MSC_VER
29   /* Get _byteswap_ushort(), _byteswap_ulong(), _byteswap_uint64() */
30#  include <intrin.h>
31#endif
32
33static inline uint16_t
34_Py_bswap16(uint16_t word)
35{
36#if defined(_PY_HAVE_BUILTIN_BSWAP) || _Py__has_builtin(__builtin_bswap16)
37    return __builtin_bswap16(word);
38#elif defined(_MSC_VER)
39    Py_BUILD_ASSERT(sizeof(word) == sizeof(unsigned short));
40    return _byteswap_ushort(word);
41#else
42    // Portable implementation which doesn't rely on circular bit shift
43    return ( ((word & UINT16_C(0x00FF)) << 8)
44           | ((word & UINT16_C(0xFF00)) >> 8));
45#endif
46}
47
48static inline uint32_t
49_Py_bswap32(uint32_t word)
50{
51#if defined(_PY_HAVE_BUILTIN_BSWAP) || _Py__has_builtin(__builtin_bswap32)
52    return __builtin_bswap32(word);
53#elif defined(_MSC_VER)
54    Py_BUILD_ASSERT(sizeof(word) == sizeof(unsigned long));
55    return _byteswap_ulong(word);
56#else
57    // Portable implementation which doesn't rely on circular bit shift
58    return ( ((word & UINT32_C(0x000000FF)) << 24)
59           | ((word & UINT32_C(0x0000FF00)) <<  8)
60           | ((word & UINT32_C(0x00FF0000)) >>  8)
61           | ((word & UINT32_C(0xFF000000)) >> 24));
62#endif
63}
64
65static inline uint64_t
66_Py_bswap64(uint64_t word)
67{
68#if defined(_PY_HAVE_BUILTIN_BSWAP) || _Py__has_builtin(__builtin_bswap64)
69    return __builtin_bswap64(word);
70#elif defined(_MSC_VER)
71    return _byteswap_uint64(word);
72#else
73    // Portable implementation which doesn't rely on circular bit shift
74    return ( ((word & UINT64_C(0x00000000000000FF)) << 56)
75           | ((word & UINT64_C(0x000000000000FF00)) << 40)
76           | ((word & UINT64_C(0x0000000000FF0000)) << 24)
77           | ((word & UINT64_C(0x00000000FF000000)) <<  8)
78           | ((word & UINT64_C(0x000000FF00000000)) >>  8)
79           | ((word & UINT64_C(0x0000FF0000000000)) >> 24)
80           | ((word & UINT64_C(0x00FF000000000000)) >> 40)
81           | ((word & UINT64_C(0xFF00000000000000)) >> 56));
82#endif
83}
84
85
86// Population count: count the number of 1's in 'x'
87// (number of bits set to 1), also known as the hamming weight.
88//
89// Implementation note. CPUID is not used, to test if x86 POPCNT instruction
90// can be used, to keep the implementation simple. For example, Visual Studio
91// __popcnt() is not used this reason. The clang and GCC builtin function can
92// use the x86 POPCNT instruction if the target architecture has SSE4a or
93// newer.
94static inline int
95_Py_popcount32(uint32_t x)
96{
97#if (defined(__clang__) || defined(__GNUC__))
98
99#if SIZEOF_INT >= 4
100    Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned int));
101    return __builtin_popcount(x);
102#else
103    // The C standard guarantees that unsigned long will always be big enough
104    // to hold a uint32_t value without losing information.
105    Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned long));
106    return __builtin_popcountl(x);
107#endif
108
109#else
110    // 32-bit SWAR (SIMD Within A Register) popcount
111
112    // Binary: 0 1 0 1 ...
113    const uint32_t M1 = 0x55555555;
114    // Binary: 00 11 00 11. ..
115    const uint32_t M2 = 0x33333333;
116    // Binary: 0000 1111 0000 1111 ...
117    const uint32_t M4 = 0x0F0F0F0F;
118
119    // Put count of each 2 bits into those 2 bits
120    x = x - ((x >> 1) & M1);
121    // Put count of each 4 bits into those 4 bits
122    x = (x & M2) + ((x >> 2) & M2);
123    // Put count of each 8 bits into those 8 bits
124    x = (x + (x >> 4)) & M4;
125    // Sum of the 4 byte counts.
126    // Take care when considering changes to the next line. Portability and
127    // correctness are delicate here, thanks to C's "integer promotions" (C99
128    // §6.3.1.1p2). On machines where the `int` type has width greater than 32
129    // bits, `x` will be promoted to an `int`, and following C's "usual
130    // arithmetic conversions" (C99 §6.3.1.8), the multiplication will be
131    // performed as a multiplication of two `unsigned int` operands. In this
132    // case it's critical that we cast back to `uint32_t` in order to keep only
133    // the least significant 32 bits. On machines where the `int` type has
134    // width no greater than 32, the multiplication is of two 32-bit unsigned
135    // integer types, and the (uint32_t) cast is a no-op. In both cases, we
136    // avoid the risk of undefined behaviour due to overflow of a
137    // multiplication of signed integer types.
138    return (uint32_t)(x * 0x01010101U) >> 24;
139#endif
140}
141
142
143// Return the index of the most significant 1 bit in 'x'. This is the smallest
144// integer k such that x < 2**k. Equivalent to floor(log2(x)) + 1 for x != 0.
145static inline int
146_Py_bit_length(unsigned long x)
147{
148#if (defined(__clang__) || defined(__GNUC__))
149    if (x != 0) {
150        // __builtin_clzl() is available since GCC 3.4.
151        // Undefined behavior for x == 0.
152        return (int)sizeof(unsigned long) * 8 - __builtin_clzl(x);
153    }
154    else {
155        return 0;
156    }
157#elif defined(_MSC_VER)
158    // _BitScanReverse() is documented to search 32 bits.
159    Py_BUILD_ASSERT(sizeof(unsigned long) <= 4);
160    unsigned long msb;
161    if (_BitScanReverse(&msb, x)) {
162        return (int)msb + 1;
163    }
164    else {
165        return 0;
166    }
167#else
168    const int BIT_LENGTH_TABLE[32] = {
169        0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
170        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5
171    };
172    int msb = 0;
173    while (x >= 32) {
174        msb += 6;
175        x >>= 6;
176    }
177    msb += BIT_LENGTH_TABLE[x];
178    return msb;
179#endif
180}
181
182
183#ifdef __cplusplus
184}
185#endif
186#endif /* !Py_INTERNAL_BITUTILS_H */
187