xref: /third_party/node/deps/brotli/c/enc/hash.h (revision 1cb0ef41)
1/* Copyright 2010 Google Inc. All Rights Reserved.
2
3   Distributed under MIT license.
4   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5*/
6
7/* A (forgetful) hash table to the data seen by the compressor, to
8   help create backward references to previous data. */
9
10#ifndef BROTLI_ENC_HASH_H_
11#define BROTLI_ENC_HASH_H_
12
13#include <string.h>  /* memcmp, memset */
14
15#include "../common/constants.h"
16#include "../common/dictionary.h"
17#include "../common/platform.h"
18#include <brotli/types.h>
19#include "./encoder_dict.h"
20#include "./fast_log.h"
21#include "./find_match_length.h"
22#include "./memory.h"
23#include "./quality.h"
24#include "./static_dict.h"
25
26#if defined(__cplusplus) || defined(c_plusplus)
27extern "C" {
28#endif
29
30typedef struct {
31  /* Dynamically allocated area; first member for quickest access. */
32  void* extra;
33
34  size_t dict_num_lookups;
35  size_t dict_num_matches;
36
37  BrotliHasherParams params;
38
39  /* False if hasher needs to be "prepared" before use. */
40  BROTLI_BOOL is_prepared_;
41} HasherCommon;
42
43#define score_t size_t
44
45static const uint32_t kCutoffTransformsCount = 10;
46/*   0,  12,   27,    23,    42,    63,    56,    48,    59,    64 */
47/* 0+0, 4+8, 8+19, 12+11, 16+26, 20+43, 24+32, 28+20, 32+27, 36+28 */
48static const uint64_t kCutoffTransforms =
49    BROTLI_MAKE_UINT64_T(0x071B520A, 0xDA2D3200);
50
51typedef struct HasherSearchResult {
52  size_t len;
53  size_t distance;
54  score_t score;
55  int len_code_delta; /* == len_code - len */
56} HasherSearchResult;
57
58/* kHashMul32 multiplier has these properties:
59   * The multiplier must be odd. Otherwise we may lose the highest bit.
60   * No long streaks of ones or zeros.
61   * There is no effort to ensure that it is a prime, the oddity is enough
62     for this use.
63   * The number has been tuned heuristically against compression benchmarks. */
64static const uint32_t kHashMul32 = 0x1E35A7BD;
65static const uint64_t kHashMul64 = BROTLI_MAKE_UINT64_T(0x1E35A7BD, 0x1E35A7BD);
66static const uint64_t kHashMul64Long =
67    BROTLI_MAKE_UINT64_T(0x1FE35A7Bu, 0xD3579BD3u);
68
69static BROTLI_INLINE uint32_t Hash14(const uint8_t* data) {
70  uint32_t h = BROTLI_UNALIGNED_LOAD32LE(data) * kHashMul32;
71  /* The higher bits contain more mixture from the multiplication,
72     so we take our results from there. */
73  return h >> (32 - 14);
74}
75
76static BROTLI_INLINE void PrepareDistanceCache(
77    int* BROTLI_RESTRICT distance_cache, const int num_distances) {
78  if (num_distances > 4) {
79    int last_distance = distance_cache[0];
80    distance_cache[4] = last_distance - 1;
81    distance_cache[5] = last_distance + 1;
82    distance_cache[6] = last_distance - 2;
83    distance_cache[7] = last_distance + 2;
84    distance_cache[8] = last_distance - 3;
85    distance_cache[9] = last_distance + 3;
86    if (num_distances > 10) {
87      int next_last_distance = distance_cache[1];
88      distance_cache[10] = next_last_distance - 1;
89      distance_cache[11] = next_last_distance + 1;
90      distance_cache[12] = next_last_distance - 2;
91      distance_cache[13] = next_last_distance + 2;
92      distance_cache[14] = next_last_distance - 3;
93      distance_cache[15] = next_last_distance + 3;
94    }
95  }
96}
97
98#define BROTLI_LITERAL_BYTE_SCORE 135
99#define BROTLI_DISTANCE_BIT_PENALTY 30
100/* Score must be positive after applying maximal penalty. */
101#define BROTLI_SCORE_BASE (BROTLI_DISTANCE_BIT_PENALTY * 8 * sizeof(size_t))
102
103/* Usually, we always choose the longest backward reference. This function
104   allows for the exception of that rule.
105
106   If we choose a backward reference that is further away, it will
107   usually be coded with more bits. We approximate this by assuming
108   log2(distance). If the distance can be expressed in terms of the
109   last four distances, we use some heuristic constants to estimate
110   the bits cost. For the first up to four literals we use the bit
111   cost of the literals from the literal cost model, after that we
112   use the average bit cost of the cost model.
113
114   This function is used to sometimes discard a longer backward reference
115   when it is not much longer and the bit cost for encoding it is more
116   than the saved literals.
117
118   backward_reference_offset MUST be positive. */
119static BROTLI_INLINE score_t BackwardReferenceScore(
120    size_t copy_length, size_t backward_reference_offset) {
121  return BROTLI_SCORE_BASE + BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length -
122      BROTLI_DISTANCE_BIT_PENALTY * Log2FloorNonZero(backward_reference_offset);
123}
124
125static BROTLI_INLINE score_t BackwardReferenceScoreUsingLastDistance(
126    size_t copy_length) {
127  return BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length +
128      BROTLI_SCORE_BASE + 15;
129}
130
131static BROTLI_INLINE score_t BackwardReferencePenaltyUsingLastDistance(
132    size_t distance_short_code) {
133  return (score_t)39 + ((0x1CA10 >> (distance_short_code & 0xE)) & 0xE);
134}
135
136static BROTLI_INLINE BROTLI_BOOL TestStaticDictionaryItem(
137    const BrotliEncoderDictionary* dictionary, size_t len, size_t word_idx,
138    const uint8_t* data, size_t max_length, size_t max_backward,
139    size_t max_distance, HasherSearchResult* out) {
140  size_t offset;
141  size_t matchlen;
142  size_t backward;
143  score_t score;
144  offset = dictionary->words->offsets_by_length[len] + len * word_idx;
145  if (len > max_length) {
146    return BROTLI_FALSE;
147  }
148
149  matchlen =
150      FindMatchLengthWithLimit(data, &dictionary->words->data[offset], len);
151  if (matchlen + dictionary->cutoffTransformsCount <= len || matchlen == 0) {
152    return BROTLI_FALSE;
153  }
154  {
155    size_t cut = len - matchlen;
156    size_t transform_id = (cut << 2) +
157        (size_t)((dictionary->cutoffTransforms >> (cut * 6)) & 0x3F);
158    backward = max_backward + 1 + word_idx +
159        (transform_id << dictionary->words->size_bits_by_length[len]);
160  }
161  if (backward > max_distance) {
162    return BROTLI_FALSE;
163  }
164  score = BackwardReferenceScore(matchlen, backward);
165  if (score < out->score) {
166    return BROTLI_FALSE;
167  }
168  out->len = matchlen;
169  out->len_code_delta = (int)len - (int)matchlen;
170  out->distance = backward;
171  out->score = score;
172  return BROTLI_TRUE;
173}
174
175static BROTLI_INLINE void SearchInStaticDictionary(
176    const BrotliEncoderDictionary* dictionary,
177    HasherCommon* common, const uint8_t* data, size_t max_length,
178    size_t max_backward, size_t max_distance,
179    HasherSearchResult* out, BROTLI_BOOL shallow) {
180  size_t key;
181  size_t i;
182  if (common->dict_num_matches < (common->dict_num_lookups >> 7)) {
183    return;
184  }
185  key = Hash14(data) << 1;
186  for (i = 0; i < (shallow ? 1u : 2u); ++i, ++key) {
187    common->dict_num_lookups++;
188    if (dictionary->hash_table_lengths[key] != 0) {
189      BROTLI_BOOL item_matches = TestStaticDictionaryItem(
190          dictionary, dictionary->hash_table_lengths[key],
191          dictionary->hash_table_words[key], data,
192          max_length, max_backward, max_distance, out);
193      if (item_matches) {
194        common->dict_num_matches++;
195      }
196    }
197  }
198}
199
200typedef struct BackwardMatch {
201  uint32_t distance;
202  uint32_t length_and_code;
203} BackwardMatch;
204
205static BROTLI_INLINE void InitBackwardMatch(BackwardMatch* self,
206    size_t dist, size_t len) {
207  self->distance = (uint32_t)dist;
208  self->length_and_code = (uint32_t)(len << 5);
209}
210
211static BROTLI_INLINE void InitDictionaryBackwardMatch(BackwardMatch* self,
212    size_t dist, size_t len, size_t len_code) {
213  self->distance = (uint32_t)dist;
214  self->length_and_code =
215      (uint32_t)((len << 5) | (len == len_code ? 0 : len_code));
216}
217
218static BROTLI_INLINE size_t BackwardMatchLength(const BackwardMatch* self) {
219  return self->length_and_code >> 5;
220}
221
222static BROTLI_INLINE size_t BackwardMatchLengthCode(const BackwardMatch* self) {
223  size_t code = self->length_and_code & 31;
224  return code ? code : BackwardMatchLength(self);
225}
226
227#define EXPAND_CAT(a, b) CAT(a, b)
228#define CAT(a, b) a ## b
229#define FN(X) EXPAND_CAT(X, HASHER())
230
231#define HASHER() H10
232#define BUCKET_BITS 17
233#define MAX_TREE_SEARCH_DEPTH 64
234#define MAX_TREE_COMP_LENGTH 128
235#include "./hash_to_binary_tree_inc.h"  /* NOLINT(build/include) */
236#undef MAX_TREE_SEARCH_DEPTH
237#undef MAX_TREE_COMP_LENGTH
238#undef BUCKET_BITS
239#undef HASHER
240/* MAX_NUM_MATCHES == 64 + MAX_TREE_SEARCH_DEPTH */
241#define MAX_NUM_MATCHES_H10 128
242
243/* For BUCKET_SWEEP_BITS == 0, enabling the dictionary lookup makes compression
244   a little faster (0.5% - 1%) and it compresses 0.15% better on small text
245   and HTML inputs. */
246
247#define HASHER() H2
248#define BUCKET_BITS 16
249#define BUCKET_SWEEP_BITS 0
250#define HASH_LEN 5
251#define USE_DICTIONARY 1
252#include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
253#undef BUCKET_SWEEP_BITS
254#undef USE_DICTIONARY
255#undef HASHER
256
257#define HASHER() H3
258#define BUCKET_SWEEP_BITS 1
259#define USE_DICTIONARY 0
260#include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
261#undef USE_DICTIONARY
262#undef BUCKET_SWEEP_BITS
263#undef BUCKET_BITS
264#undef HASHER
265
266#define HASHER() H4
267#define BUCKET_BITS 17
268#define BUCKET_SWEEP_BITS 2
269#define USE_DICTIONARY 1
270#include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
271#undef USE_DICTIONARY
272#undef HASH_LEN
273#undef BUCKET_SWEEP_BITS
274#undef BUCKET_BITS
275#undef HASHER
276
277#define HASHER() H5
278#include "./hash_longest_match_inc.h"  /* NOLINT(build/include) */
279#undef HASHER
280
281#define HASHER() H6
282#include "./hash_longest_match64_inc.h"  /* NOLINT(build/include) */
283#undef HASHER
284
285#define BUCKET_BITS 15
286
287#define NUM_LAST_DISTANCES_TO_CHECK 4
288#define NUM_BANKS 1
289#define BANK_BITS 16
290#define HASHER() H40
291#include "./hash_forgetful_chain_inc.h"  /* NOLINT(build/include) */
292#undef HASHER
293#undef NUM_LAST_DISTANCES_TO_CHECK
294
295#define NUM_LAST_DISTANCES_TO_CHECK 10
296#define HASHER() H41
297#include "./hash_forgetful_chain_inc.h"  /* NOLINT(build/include) */
298#undef HASHER
299#undef NUM_LAST_DISTANCES_TO_CHECK
300#undef NUM_BANKS
301#undef BANK_BITS
302
303#define NUM_LAST_DISTANCES_TO_CHECK 16
304#define NUM_BANKS 512
305#define BANK_BITS 9
306#define HASHER() H42
307#include "./hash_forgetful_chain_inc.h"  /* NOLINT(build/include) */
308#undef HASHER
309#undef NUM_LAST_DISTANCES_TO_CHECK
310#undef NUM_BANKS
311#undef BANK_BITS
312
313#undef BUCKET_BITS
314
315#define HASHER() H54
316#define BUCKET_BITS 20
317#define BUCKET_SWEEP_BITS 2
318#define HASH_LEN 7
319#define USE_DICTIONARY 0
320#include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
321#undef USE_DICTIONARY
322#undef HASH_LEN
323#undef BUCKET_SWEEP_BITS
324#undef BUCKET_BITS
325#undef HASHER
326
327/* fast large window hashers */
328
329#define HASHER() HROLLING_FAST
330#define CHUNKLEN 32
331#define JUMP 4
332#define NUMBUCKETS 16777216
333#define MASK ((NUMBUCKETS * 64) - 1)
334#include "./hash_rolling_inc.h"  /* NOLINT(build/include) */
335#undef JUMP
336#undef HASHER
337
338
339#define HASHER() HROLLING
340#define JUMP 1
341#include "./hash_rolling_inc.h"  /* NOLINT(build/include) */
342#undef MASK
343#undef NUMBUCKETS
344#undef JUMP
345#undef CHUNKLEN
346#undef HASHER
347
348#define HASHER() H35
349#define HASHER_A H3
350#define HASHER_B HROLLING_FAST
351#include "./hash_composite_inc.h"  /* NOLINT(build/include) */
352#undef HASHER_A
353#undef HASHER_B
354#undef HASHER
355
356#define HASHER() H55
357#define HASHER_A H54
358#define HASHER_B HROLLING_FAST
359#include "./hash_composite_inc.h"  /* NOLINT(build/include) */
360#undef HASHER_A
361#undef HASHER_B
362#undef HASHER
363
364#define HASHER() H65
365#define HASHER_A H6
366#define HASHER_B HROLLING
367#include "./hash_composite_inc.h"  /* NOLINT(build/include) */
368#undef HASHER_A
369#undef HASHER_B
370#undef HASHER
371
372#undef FN
373#undef CAT
374#undef EXPAND_CAT
375
376#define FOR_SIMPLE_HASHERS(H) H(2) H(3) H(4) H(5) H(6) H(40) H(41) H(42) H(54)
377#define FOR_COMPOSITE_HASHERS(H) H(35) H(55) H(65)
378#define FOR_GENERIC_HASHERS(H) FOR_SIMPLE_HASHERS(H) FOR_COMPOSITE_HASHERS(H)
379#define FOR_ALL_HASHERS(H) FOR_GENERIC_HASHERS(H) H(10)
380
381typedef struct {
382  HasherCommon common;
383
384  union {
385#define MEMBER_(N) \
386    H ## N _H ## N;
387    FOR_ALL_HASHERS(MEMBER_)
388#undef MEMBER_
389  } privat;
390} Hasher;
391
392/* MUST be invoked before any other method. */
393static BROTLI_INLINE void HasherInit(Hasher* hasher) {
394  hasher->common.extra = NULL;
395}
396
397static BROTLI_INLINE void DestroyHasher(MemoryManager* m, Hasher* hasher) {
398  if (hasher->common.extra == NULL) return;
399  BROTLI_FREE(m, hasher->common.extra);
400}
401
402static BROTLI_INLINE void HasherReset(Hasher* hasher) {
403  hasher->common.is_prepared_ = BROTLI_FALSE;
404}
405
406static BROTLI_INLINE size_t HasherSize(const BrotliEncoderParams* params,
407    BROTLI_BOOL one_shot, const size_t input_size) {
408  switch (params->hasher.type) {
409#define SIZE_(N)                                                      \
410    case N:                                                           \
411      return HashMemAllocInBytesH ## N(params, one_shot, input_size);
412    FOR_ALL_HASHERS(SIZE_)
413#undef SIZE_
414    default:
415      break;
416  }
417  return 0;  /* Default case. */
418}
419
420static BROTLI_INLINE void HasherSetup(MemoryManager* m, Hasher* hasher,
421    BrotliEncoderParams* params, const uint8_t* data, size_t position,
422    size_t input_size, BROTLI_BOOL is_last) {
423  BROTLI_BOOL one_shot = (position == 0 && is_last);
424  if (hasher->common.extra == NULL) {
425    size_t alloc_size;
426    ChooseHasher(params, &params->hasher);
427    alloc_size = HasherSize(params, one_shot, input_size);
428    hasher->common.extra = BROTLI_ALLOC(m, uint8_t, alloc_size);
429    if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(hasher->common.extra)) return;
430    hasher->common.params = params->hasher;
431    switch (hasher->common.params.type) {
432#define INITIALIZE_(N)                        \
433      case N:                                 \
434        InitializeH ## N(&hasher->common,     \
435            &hasher->privat._H ## N, params); \
436        break;
437      FOR_ALL_HASHERS(INITIALIZE_);
438#undef INITIALIZE_
439      default:
440        break;
441    }
442    HasherReset(hasher);
443  }
444
445  if (!hasher->common.is_prepared_) {
446    switch (hasher->common.params.type) {
447#define PREPARE_(N)                      \
448      case N:                            \
449        PrepareH ## N(                   \
450            &hasher->privat._H ## N,     \
451            one_shot, input_size, data); \
452        break;
453      FOR_ALL_HASHERS(PREPARE_)
454#undef PREPARE_
455      default: break;
456    }
457    if (position == 0) {
458      hasher->common.dict_num_lookups = 0;
459      hasher->common.dict_num_matches = 0;
460    }
461    hasher->common.is_prepared_ = BROTLI_TRUE;
462  }
463}
464
465static BROTLI_INLINE void InitOrStitchToPreviousBlock(
466    MemoryManager* m, Hasher* hasher, const uint8_t* data, size_t mask,
467    BrotliEncoderParams* params, size_t position, size_t input_size,
468    BROTLI_BOOL is_last) {
469  HasherSetup(m, hasher, params, data, position, input_size, is_last);
470  if (BROTLI_IS_OOM(m)) return;
471  switch (hasher->common.params.type) {
472#define INIT_(N)                             \
473    case N:                                  \
474      StitchToPreviousBlockH ## N(           \
475          &hasher->privat._H ## N,           \
476          input_size, position, data, mask); \
477    break;
478    FOR_ALL_HASHERS(INIT_)
479#undef INIT_
480    default: break;
481  }
482}
483
484#if defined(__cplusplus) || defined(c_plusplus)
485}  /* extern "C" */
486#endif
487
488#endif  /* BROTLI_ENC_HASH_H_ */
489