1 /**
2  * Copyright (c) 2021-2022 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 // This is the murmur3 32 bit hash implementation
16 // The main structure and constants were taken from Austin Appleby.
17 // From his gitlab (https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp):
18 // - MurmurHash3 was written by Austin Appleby, and is placed in the public
19 // - domain. The author hereby disclaims copyright to this source code.
20 
21 #ifndef LIBPANDABASE_UTILS_MURMUR3_HASH_H
22 #define LIBPANDABASE_UTILS_MURMUR3_HASH_H
23 
24 #include "hash_base.h"
25 #include "logger.h"
26 
27 namespace panda {
28 
29 struct MurmurHash32Const {
30     // Here are the main constants for hash counting:
31     // Many of them looks like some kind of magic
32     static constexpr uint32_t C1 = 0xCC9E2D51U;
33     static constexpr uint32_t C2 = 0x1B873593U;
34     static constexpr uint32_t MAX_BITS = 32;
35     static constexpr uint32_t FINALIZE_FIRST_SHIFT = 16;
36     static constexpr uint32_t FINALIZE_SECOND_SHIFT = 13;
37     static constexpr uint32_t FINALIZE_THIRD_SHIFT = 16;
38     static constexpr uint32_t FINALIZE_FIRST_MULTIPLICATOR = 0x85EBCA6BU;
39     static constexpr uint32_t FINALIZE_SECOND_MULTIPLICATOR = 0xC2BAE35U;
40     static constexpr uint32_t MAIN_FIRST_SHIFT = 15;
41     static constexpr uint32_t MAIN_SECOND_SHIFT = 13;
42     static constexpr uint32_t MAIN_CONSTANT = 0xE6546B64U;
43     static constexpr uint32_t MAIN_MULTIPLICATOR = 5;
44     static constexpr uint32_t TAIL_SHIFT = 8;
45     static constexpr uint32_t TAIL_LAST_SHIFT = 15;
46     static constexpr uint32_t BLOCK_SIZE = 4;
47 };
48 
49 // In general murmur hash looks like that:
50 // key = |....|....|....|.|.|.|
51 //         32   32   32  8 8 8
52 // Firstly, we proceed each 32 bits block from key;
53 // Secondly, we proceed last 8 bits block which were not covered in previous step.
54 template <uint32_t seed_value>
55 class MurmurHash32 final : public HashBase<MurmurHash32<seed_value>>, MurmurHash32Const {
56 public:
GetHash32WithSeedImpl(const uint8_t *key, size_t len, uint32_t seed)57     static uint32_t GetHash32WithSeedImpl(const uint8_t *key, size_t len, uint32_t seed)
58     {
59         return MurmurHash3(key, len, seed);
60     }
GetHash32Impl(const uint8_t *key, size_t len)61     static uint32_t GetHash32Impl(const uint8_t *key, size_t len)
62     {
63         return GetHash32WithSeedImpl(key, len, seed_value);
64     }
GetHash32StringImpl(const uint8_t *mutf8_string)65     static uint32_t GetHash32StringImpl(const uint8_t *mutf8_string)
66     {
67         return MurmurHash3String(mutf8_string, seed_value);
68     }
GetHash32StringWithSeedImpl(const uint8_t *mutf8_string, uint32_t seed)69     static uint32_t GetHash32StringWithSeedImpl(const uint8_t *mutf8_string, uint32_t seed)
70     {
71         return MurmurHash3String(mutf8_string, seed);
72     }
73 
74 private:
Rotl(uint32_t word, uint8_t shift)75     static uint32_t Rotl(uint32_t word, uint8_t shift)
76     {
77         return (word << shift) | (word >> (MAX_BITS - shift));
78     }
79 
80     // Finilize the result of the hash function
Finalize(uint32_t h)81     static uint32_t Finalize(uint32_t h)
82     {
83         h ^= h >> FINALIZE_FIRST_SHIFT;
84         h *= FINALIZE_FIRST_MULTIPLICATOR;
85         h ^= h >> FINALIZE_SECOND_SHIFT;
86         h *= FINALIZE_SECOND_MULTIPLICATOR;
87         h ^= h >> FINALIZE_THIRD_SHIFT;
88         return h;
89     }
90 
MurmurHash3(const uint8_t *key, size_t len, uint32_t seed)91     static uint32_t MurmurHash3(const uint8_t *key, size_t len, uint32_t seed)
92     {
93         // We start hashing from the seed
94         uint32_t hash = seed;
95 
96         // Do the main part:
97         // Iterate for each 32bits
98         auto blocks = reinterpret_cast<uintptr_t>(key);
99         std::array<uint8_t, BLOCK_SIZE> memblock = {'\0', '\0', '\0', '\0'};
100         for (size_t i = len / BLOCK_SIZE; i != 0; i--) {
101             for (auto &j : memblock) {
102                 j = *reinterpret_cast<uint8_t *>(blocks);
103                 blocks += sizeof(uint8_t);
104             }
105             // Do this because we don't want to dispatch Big/Little endianness.
106             uint32_t k1 = *reinterpret_cast<uint32_t *>(memblock.data());
107 
108             k1 *= C1;
109             k1 = Rotl(k1, MAIN_FIRST_SHIFT);
110             k1 *= C2;
111 
112             hash ^= k1;
113             hash = Rotl(hash, MAIN_SECOND_SHIFT);
114             hash = hash * MAIN_MULTIPLICATOR + MAIN_CONSTANT;
115         }
116 
117         // Proceed the tail:
118         // blocks is a pointer to the end of 32bits section
119         auto tail = blocks;
120         uint32_t k1 = 0;
121         for (size_t i = len & 3U; i > 0; i--) {
122             // Get ((uint8_t*)tail)[i - 1]:
123             uintptr_t block_pointer = tail + sizeof(uint8_t) * (i - 1);
124             uint8_t block = *reinterpret_cast<uint8_t *>(block_pointer);
125             uint32_t temp = static_cast<uint32_t>(block) << (TAIL_SHIFT * (i - 1U));
126             k1 ^= temp;
127             if (i == 1) {
128                 k1 = Rotl(k1, TAIL_LAST_SHIFT);
129                 k1 *= C2;
130                 hash ^= k1;
131             }
132         }
133 
134         // Finalize the result
135         hash ^= len;
136         hash = Finalize(hash);
137 
138         return hash;
139     }
140 
MurmurHash3String(const uint8_t *mutf8_string, uint32_t seed)141     static uint32_t MurmurHash3String(const uint8_t *mutf8_string, uint32_t seed)
142     {
143         // We start hashing from the seed
144         uint32_t hash = seed;
145         // We should still compute length of the string, we will need it later
146         size_t mutf8_length = 0;
147         // Do the main part:
148         // Iterate for each 32bits
149         auto blocks = reinterpret_cast<uintptr_t>(mutf8_string);
150         std::array<uint8_t, BLOCK_SIZE> memblock = {'\0', '\0', '\0', '\0'};
151         size_t tail_len = 0;
152         while (true) {
153             tail_len = 0;
154             for (unsigned char &i : memblock) {
155                 i = *reinterpret_cast<uint8_t *>(blocks);
156                 blocks += sizeof(uint8_t);
157                 if (i == '\0') {
158                     break;
159                 }
160                 tail_len++;
161             }
162             if (tail_len != BLOCK_SIZE) {
163                 // We couldn't read four 8bytes value
164                 break;
165             }
166             // Do this because we don't want to dispatch Big/Little endianness.
167             uint32_t k1 = *reinterpret_cast<uint32_t *>(memblock.data());
168 
169             k1 *= C1;
170             k1 = Rotl(k1, MAIN_FIRST_SHIFT);
171             k1 *= C2;
172 
173             hash ^= k1;
174             hash = Rotl(hash, MAIN_SECOND_SHIFT);
175             hash = hash * MAIN_MULTIPLICATOR + MAIN_CONSTANT;
176             mutf8_length += BLOCK_SIZE;
177         }
178 
179         // Proceed the tail:
180 
181         mutf8_length += tail_len;
182         uint32_t k1 = 0;
183         for (size_t i = tail_len; i > 0; i--) {
184             uint8_t block = memblock[i - 1U];
185             uint32_t temp = static_cast<uint32_t>(block) << (TAIL_SHIFT * (i - 1U));
186             k1 ^= temp;
187             if (i == 1) {
188                 k1 = Rotl(k1, TAIL_LAST_SHIFT);
189                 k1 *= C2;
190                 hash ^= k1;
191             }
192         }
193 
194         // Finalize the result
195         hash ^= mutf8_length;
196         hash = Finalize(hash);
197 
198         return hash;
199     }
200 };
201 
202 }  // namespace panda
203 
204 #endif  // LIBPANDABASE_UTILS_MURMUR3_HASH_H
205