xref: /third_party/node/deps/brotli/c/enc/static_dict.c (revision 1cb0ef41)
1/* Copyright 2013 Google Inc. All Rights Reserved.
2
3   Distributed under MIT license.
4   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5*/
6
7#include "./static_dict.h"
8
9#include "../common/dictionary.h"
10#include "../common/platform.h"
11#include "../common/transform.h"
12#include "./encoder_dict.h"
13#include "./find_match_length.h"
14
15#if defined(__cplusplus) || defined(c_plusplus)
16extern "C" {
17#endif
18
19static BROTLI_INLINE uint32_t Hash(const uint8_t* data) {
20  uint32_t h = BROTLI_UNALIGNED_LOAD32LE(data) * kDictHashMul32;
21  /* The higher bits contain more mixture from the multiplication,
22     so we take our results from there. */
23  return h >> (32 - kDictNumBits);
24}
25
26static BROTLI_INLINE void AddMatch(size_t distance, size_t len, size_t len_code,
27                                   uint32_t* matches) {
28  uint32_t match = (uint32_t)((distance << 5) + len_code);
29  matches[len] = BROTLI_MIN(uint32_t, matches[len], match);
30}
31
32static BROTLI_INLINE size_t DictMatchLength(const BrotliDictionary* dictionary,
33                                            const uint8_t* data,
34                                            size_t id,
35                                            size_t len,
36                                            size_t maxlen) {
37  const size_t offset = dictionary->offsets_by_length[len] + len * id;
38  return FindMatchLengthWithLimit(&dictionary->data[offset], data,
39                                  BROTLI_MIN(size_t, len, maxlen));
40}
41
42static BROTLI_INLINE BROTLI_BOOL IsMatch(const BrotliDictionary* dictionary,
43    DictWord w, const uint8_t* data, size_t max_length) {
44  if (w.len > max_length) {
45    return BROTLI_FALSE;
46  } else {
47    const size_t offset = dictionary->offsets_by_length[w.len] +
48        (size_t)w.len * (size_t)w.idx;
49    const uint8_t* dict = &dictionary->data[offset];
50    if (w.transform == 0) {
51      /* Match against base dictionary word. */
52      return
53          TO_BROTLI_BOOL(FindMatchLengthWithLimit(dict, data, w.len) == w.len);
54    } else if (w.transform == 10) {
55      /* Match against uppercase first transform.
56         Note that there are only ASCII uppercase words in the lookup table. */
57      return TO_BROTLI_BOOL(dict[0] >= 'a' && dict[0] <= 'z' &&
58              (dict[0] ^ 32) == data[0] &&
59              FindMatchLengthWithLimit(&dict[1], &data[1], w.len - 1u) ==
60              w.len - 1u);
61    } else {
62      /* Match against uppercase all transform.
63         Note that there are only ASCII uppercase words in the lookup table. */
64      size_t i;
65      for (i = 0; i < w.len; ++i) {
66        if (dict[i] >= 'a' && dict[i] <= 'z') {
67          if ((dict[i] ^ 32) != data[i]) return BROTLI_FALSE;
68        } else {
69          if (dict[i] != data[i]) return BROTLI_FALSE;
70        }
71      }
72      return BROTLI_TRUE;
73    }
74  }
75}
76
77BROTLI_BOOL BrotliFindAllStaticDictionaryMatches(
78    const BrotliEncoderDictionary* dictionary, const uint8_t* data,
79    size_t min_length, size_t max_length, uint32_t* matches) {
80  BROTLI_BOOL has_found_match = BROTLI_FALSE;
81  {
82    size_t offset = dictionary->buckets[Hash(data)];
83    BROTLI_BOOL end = !offset;
84    while (!end) {
85      DictWord w = dictionary->dict_words[offset++];
86      const size_t l = w.len & 0x1F;
87      const size_t n = (size_t)1 << dictionary->words->size_bits_by_length[l];
88      const size_t id = w.idx;
89      end = !!(w.len & 0x80);
90      w.len = (uint8_t)l;
91      if (w.transform == 0) {
92        const size_t matchlen =
93            DictMatchLength(dictionary->words, data, id, l, max_length);
94        const uint8_t* s;
95        size_t minlen;
96        size_t maxlen;
97        size_t len;
98        /* Transform "" + BROTLI_TRANSFORM_IDENTITY + "" */
99        if (matchlen == l) {
100          AddMatch(id, l, l, matches);
101          has_found_match = BROTLI_TRUE;
102        }
103        /* Transforms "" + BROTLI_TRANSFORM_OMIT_LAST_1 + "" and
104                      "" + BROTLI_TRANSFORM_OMIT_LAST_1 + "ing " */
105        if (matchlen >= l - 1) {
106          AddMatch(id + 12 * n, l - 1, l, matches);
107          if (l + 2 < max_length &&
108              data[l - 1] == 'i' && data[l] == 'n' && data[l + 1] == 'g' &&
109              data[l + 2] == ' ') {
110            AddMatch(id + 49 * n, l + 3, l, matches);
111          }
112          has_found_match = BROTLI_TRUE;
113        }
114        /* Transform "" + BROTLI_TRANSFORM_OMIT_LAST_# + "" (# = 2 .. 9) */
115        minlen = min_length;
116        if (l > 9) minlen = BROTLI_MAX(size_t, minlen, l - 9);
117        maxlen = BROTLI_MIN(size_t, matchlen, l - 2);
118        for (len = minlen; len <= maxlen; ++len) {
119          size_t cut = l - len;
120          size_t transform_id = (cut << 2) +
121              (size_t)((dictionary->cutoffTransforms >> (cut * 6)) & 0x3F);
122          AddMatch(id + transform_id * n, len, l, matches);
123          has_found_match = BROTLI_TRUE;
124        }
125        if (matchlen < l || l + 6 >= max_length) {
126          continue;
127        }
128        s = &data[l];
129        /* Transforms "" + BROTLI_TRANSFORM_IDENTITY + <suffix> */
130        if (s[0] == ' ') {
131          AddMatch(id + n, l + 1, l, matches);
132          if (s[1] == 'a') {
133            if (s[2] == ' ') {
134              AddMatch(id + 28 * n, l + 3, l, matches);
135            } else if (s[2] == 's') {
136              if (s[3] == ' ') AddMatch(id + 46 * n, l + 4, l, matches);
137            } else if (s[2] == 't') {
138              if (s[3] == ' ') AddMatch(id + 60 * n, l + 4, l, matches);
139            } else if (s[2] == 'n') {
140              if (s[3] == 'd' && s[4] == ' ') {
141                AddMatch(id + 10 * n, l + 5, l, matches);
142              }
143            }
144          } else if (s[1] == 'b') {
145            if (s[2] == 'y' && s[3] == ' ') {
146              AddMatch(id + 38 * n, l + 4, l, matches);
147            }
148          } else if (s[1] == 'i') {
149            if (s[2] == 'n') {
150              if (s[3] == ' ') AddMatch(id + 16 * n, l + 4, l, matches);
151            } else if (s[2] == 's') {
152              if (s[3] == ' ') AddMatch(id + 47 * n, l + 4, l, matches);
153            }
154          } else if (s[1] == 'f') {
155            if (s[2] == 'o') {
156              if (s[3] == 'r' && s[4] == ' ') {
157                AddMatch(id + 25 * n, l + 5, l, matches);
158              }
159            } else if (s[2] == 'r') {
160              if (s[3] == 'o' && s[4] == 'm' && s[5] == ' ') {
161                AddMatch(id + 37 * n, l + 6, l, matches);
162              }
163            }
164          } else if (s[1] == 'o') {
165            if (s[2] == 'f') {
166              if (s[3] == ' ') AddMatch(id + 8 * n, l + 4, l, matches);
167            } else if (s[2] == 'n') {
168              if (s[3] == ' ') AddMatch(id + 45 * n, l + 4, l, matches);
169            }
170          } else if (s[1] == 'n') {
171            if (s[2] == 'o' && s[3] == 't' && s[4] == ' ') {
172              AddMatch(id + 80 * n, l + 5, l, matches);
173            }
174          } else if (s[1] == 't') {
175            if (s[2] == 'h') {
176              if (s[3] == 'e') {
177                if (s[4] == ' ') AddMatch(id + 5 * n, l + 5, l, matches);
178              } else if (s[3] == 'a') {
179                if (s[4] == 't' && s[5] == ' ') {
180                  AddMatch(id + 29 * n, l + 6, l, matches);
181                }
182              }
183            } else if (s[2] == 'o') {
184              if (s[3] == ' ') AddMatch(id + 17 * n, l + 4, l, matches);
185            }
186          } else if (s[1] == 'w') {
187            if (s[2] == 'i' && s[3] == 't' && s[4] == 'h' && s[5] == ' ') {
188              AddMatch(id + 35 * n, l + 6, l, matches);
189            }
190          }
191        } else if (s[0] == '"') {
192          AddMatch(id + 19 * n, l + 1, l, matches);
193          if (s[1] == '>') {
194            AddMatch(id + 21 * n, l + 2, l, matches);
195          }
196        } else if (s[0] == '.') {
197          AddMatch(id + 20 * n, l + 1, l, matches);
198          if (s[1] == ' ') {
199            AddMatch(id + 31 * n, l + 2, l, matches);
200            if (s[2] == 'T' && s[3] == 'h') {
201              if (s[4] == 'e') {
202                if (s[5] == ' ') AddMatch(id + 43 * n, l + 6, l, matches);
203              } else if (s[4] == 'i') {
204                if (s[5] == 's' && s[6] == ' ') {
205                  AddMatch(id + 75 * n, l + 7, l, matches);
206                }
207              }
208            }
209          }
210        } else if (s[0] == ',') {
211          AddMatch(id + 76 * n, l + 1, l, matches);
212          if (s[1] == ' ') {
213            AddMatch(id + 14 * n, l + 2, l, matches);
214          }
215        } else if (s[0] == '\n') {
216          AddMatch(id + 22 * n, l + 1, l, matches);
217          if (s[1] == '\t') {
218            AddMatch(id + 50 * n, l + 2, l, matches);
219          }
220        } else if (s[0] == ']') {
221          AddMatch(id + 24 * n, l + 1, l, matches);
222        } else if (s[0] == '\'') {
223          AddMatch(id + 36 * n, l + 1, l, matches);
224        } else if (s[0] == ':') {
225          AddMatch(id + 51 * n, l + 1, l, matches);
226        } else if (s[0] == '(') {
227          AddMatch(id + 57 * n, l + 1, l, matches);
228        } else if (s[0] == '=') {
229          if (s[1] == '"') {
230            AddMatch(id + 70 * n, l + 2, l, matches);
231          } else if (s[1] == '\'') {
232            AddMatch(id + 86 * n, l + 2, l, matches);
233          }
234        } else if (s[0] == 'a') {
235          if (s[1] == 'l' && s[2] == ' ') {
236            AddMatch(id + 84 * n, l + 3, l, matches);
237          }
238        } else if (s[0] == 'e') {
239          if (s[1] == 'd') {
240            if (s[2] == ' ') AddMatch(id + 53 * n, l + 3, l, matches);
241          } else if (s[1] == 'r') {
242            if (s[2] == ' ') AddMatch(id + 82 * n, l + 3, l, matches);
243          } else if (s[1] == 's') {
244            if (s[2] == 't' && s[3] == ' ') {
245              AddMatch(id + 95 * n, l + 4, l, matches);
246            }
247          }
248        } else if (s[0] == 'f') {
249          if (s[1] == 'u' && s[2] == 'l' && s[3] == ' ') {
250            AddMatch(id + 90 * n, l + 4, l, matches);
251          }
252        } else if (s[0] == 'i') {
253          if (s[1] == 'v') {
254            if (s[2] == 'e' && s[3] == ' ') {
255              AddMatch(id + 92 * n, l + 4, l, matches);
256            }
257          } else if (s[1] == 'z') {
258            if (s[2] == 'e' && s[3] == ' ') {
259              AddMatch(id + 100 * n, l + 4, l, matches);
260            }
261          }
262        } else if (s[0] == 'l') {
263          if (s[1] == 'e') {
264            if (s[2] == 's' && s[3] == 's' && s[4] == ' ') {
265              AddMatch(id + 93 * n, l + 5, l, matches);
266            }
267          } else if (s[1] == 'y') {
268            if (s[2] == ' ') AddMatch(id + 61 * n, l + 3, l, matches);
269          }
270        } else if (s[0] == 'o') {
271          if (s[1] == 'u' && s[2] == 's' && s[3] == ' ') {
272            AddMatch(id + 106 * n, l + 4, l, matches);
273          }
274        }
275      } else {
276        /* Set is_all_caps=0 for BROTLI_TRANSFORM_UPPERCASE_FIRST and
277               is_all_caps=1 otherwise (BROTLI_TRANSFORM_UPPERCASE_ALL)
278           transform. */
279        const BROTLI_BOOL is_all_caps =
280            TO_BROTLI_BOOL(w.transform != BROTLI_TRANSFORM_UPPERCASE_FIRST);
281        const uint8_t* s;
282        if (!IsMatch(dictionary->words, w, data, max_length)) {
283          continue;
284        }
285        /* Transform "" + kUppercase{First,All} + "" */
286        AddMatch(id + (is_all_caps ? 44 : 9) * n, l, l, matches);
287        has_found_match = BROTLI_TRUE;
288        if (l + 1 >= max_length) {
289          continue;
290        }
291        /* Transforms "" + kUppercase{First,All} + <suffix> */
292        s = &data[l];
293        if (s[0] == ' ') {
294          AddMatch(id + (is_all_caps ? 68 : 4) * n, l + 1, l, matches);
295        } else if (s[0] == '"') {
296          AddMatch(id + (is_all_caps ? 87 : 66) * n, l + 1, l, matches);
297          if (s[1] == '>') {
298            AddMatch(id + (is_all_caps ? 97 : 69) * n, l + 2, l, matches);
299          }
300        } else if (s[0] == '.') {
301          AddMatch(id + (is_all_caps ? 101 : 79) * n, l + 1, l, matches);
302          if (s[1] == ' ') {
303            AddMatch(id + (is_all_caps ? 114 : 88) * n, l + 2, l, matches);
304          }
305        } else if (s[0] == ',') {
306          AddMatch(id + (is_all_caps ? 112 : 99) * n, l + 1, l, matches);
307          if (s[1] == ' ') {
308            AddMatch(id + (is_all_caps ? 107 : 58) * n, l + 2, l, matches);
309          }
310        } else if (s[0] == '\'') {
311          AddMatch(id + (is_all_caps ? 94 : 74) * n, l + 1, l, matches);
312        } else if (s[0] == '(') {
313          AddMatch(id + (is_all_caps ? 113 : 78) * n, l + 1, l, matches);
314        } else if (s[0] == '=') {
315          if (s[1] == '"') {
316            AddMatch(id + (is_all_caps ? 105 : 104) * n, l + 2, l, matches);
317          } else if (s[1] == '\'') {
318            AddMatch(id + (is_all_caps ? 116 : 108) * n, l + 2, l, matches);
319          }
320        }
321      }
322    }
323  }
324  /* Transforms with prefixes " " and "." */
325  if (max_length >= 5 && (data[0] == ' ' || data[0] == '.')) {
326    BROTLI_BOOL is_space = TO_BROTLI_BOOL(data[0] == ' ');
327    size_t offset = dictionary->buckets[Hash(&data[1])];
328    BROTLI_BOOL end = !offset;
329    while (!end) {
330      DictWord w = dictionary->dict_words[offset++];
331      const size_t l = w.len & 0x1F;
332      const size_t n = (size_t)1 << dictionary->words->size_bits_by_length[l];
333      const size_t id = w.idx;
334      end = !!(w.len & 0x80);
335      w.len = (uint8_t)l;
336      if (w.transform == 0) {
337        const uint8_t* s;
338        if (!IsMatch(dictionary->words, w, &data[1], max_length - 1)) {
339          continue;
340        }
341        /* Transforms " " + BROTLI_TRANSFORM_IDENTITY + "" and
342                      "." + BROTLI_TRANSFORM_IDENTITY + "" */
343        AddMatch(id + (is_space ? 6 : 32) * n, l + 1, l, matches);
344        has_found_match = BROTLI_TRUE;
345        if (l + 2 >= max_length) {
346          continue;
347        }
348        /* Transforms " " + BROTLI_TRANSFORM_IDENTITY + <suffix> and
349                      "." + BROTLI_TRANSFORM_IDENTITY + <suffix>
350        */
351        s = &data[l + 1];
352        if (s[0] == ' ') {
353          AddMatch(id + (is_space ? 2 : 77) * n, l + 2, l, matches);
354        } else if (s[0] == '(') {
355          AddMatch(id + (is_space ? 89 : 67) * n, l + 2, l, matches);
356        } else if (is_space) {
357          if (s[0] == ',') {
358            AddMatch(id + 103 * n, l + 2, l, matches);
359            if (s[1] == ' ') {
360              AddMatch(id + 33 * n, l + 3, l, matches);
361            }
362          } else if (s[0] == '.') {
363            AddMatch(id + 71 * n, l + 2, l, matches);
364            if (s[1] == ' ') {
365              AddMatch(id + 52 * n, l + 3, l, matches);
366            }
367          } else if (s[0] == '=') {
368            if (s[1] == '"') {
369              AddMatch(id + 81 * n, l + 3, l, matches);
370            } else if (s[1] == '\'') {
371              AddMatch(id + 98 * n, l + 3, l, matches);
372            }
373          }
374        }
375      } else if (is_space) {
376        /* Set is_all_caps=0 for BROTLI_TRANSFORM_UPPERCASE_FIRST and
377               is_all_caps=1 otherwise (BROTLI_TRANSFORM_UPPERCASE_ALL)
378           transform. */
379        const BROTLI_BOOL is_all_caps =
380            TO_BROTLI_BOOL(w.transform != BROTLI_TRANSFORM_UPPERCASE_FIRST);
381        const uint8_t* s;
382        if (!IsMatch(dictionary->words, w, &data[1], max_length - 1)) {
383          continue;
384        }
385        /* Transforms " " + kUppercase{First,All} + "" */
386        AddMatch(id + (is_all_caps ? 85 : 30) * n, l + 1, l, matches);
387        has_found_match = BROTLI_TRUE;
388        if (l + 2 >= max_length) {
389          continue;
390        }
391        /* Transforms " " + kUppercase{First,All} + <suffix> */
392        s = &data[l + 1];
393        if (s[0] == ' ') {
394          AddMatch(id + (is_all_caps ? 83 : 15) * n, l + 2, l, matches);
395        } else if (s[0] == ',') {
396          if (!is_all_caps) {
397            AddMatch(id + 109 * n, l + 2, l, matches);
398          }
399          if (s[1] == ' ') {
400            AddMatch(id + (is_all_caps ? 111 : 65) * n, l + 3, l, matches);
401          }
402        } else if (s[0] == '.') {
403          AddMatch(id + (is_all_caps ? 115 : 96) * n, l + 2, l, matches);
404          if (s[1] == ' ') {
405            AddMatch(id + (is_all_caps ? 117 : 91) * n, l + 3, l, matches);
406          }
407        } else if (s[0] == '=') {
408          if (s[1] == '"') {
409            AddMatch(id + (is_all_caps ? 110 : 118) * n, l + 3, l, matches);
410          } else if (s[1] == '\'') {
411            AddMatch(id + (is_all_caps ? 119 : 120) * n, l + 3, l, matches);
412          }
413        }
414      }
415    }
416  }
417  if (max_length >= 6) {
418    /* Transforms with prefixes "e ", "s ", ", " and "\xC2\xA0" */
419    if ((data[1] == ' ' &&
420         (data[0] == 'e' || data[0] == 's' || data[0] == ',')) ||
421        (data[0] == 0xC2 && data[1] == 0xA0)) {
422      size_t offset = dictionary->buckets[Hash(&data[2])];
423      BROTLI_BOOL end = !offset;
424      while (!end) {
425        DictWord w = dictionary->dict_words[offset++];
426        const size_t l = w.len & 0x1F;
427        const size_t n = (size_t)1 << dictionary->words->size_bits_by_length[l];
428        const size_t id = w.idx;
429        end = !!(w.len & 0x80);
430        w.len = (uint8_t)l;
431        if (w.transform == 0 &&
432            IsMatch(dictionary->words, w, &data[2], max_length - 2)) {
433          if (data[0] == 0xC2) {
434            AddMatch(id + 102 * n, l + 2, l, matches);
435            has_found_match = BROTLI_TRUE;
436          } else if (l + 2 < max_length && data[l + 2] == ' ') {
437            size_t t = data[0] == 'e' ? 18 : (data[0] == 's' ? 7 : 13);
438            AddMatch(id + t * n, l + 3, l, matches);
439            has_found_match = BROTLI_TRUE;
440          }
441        }
442      }
443    }
444  }
445  if (max_length >= 9) {
446    /* Transforms with prefixes " the " and ".com/" */
447    if ((data[0] == ' ' && data[1] == 't' && data[2] == 'h' &&
448         data[3] == 'e' && data[4] == ' ') ||
449        (data[0] == '.' && data[1] == 'c' && data[2] == 'o' &&
450         data[3] == 'm' && data[4] == '/')) {
451      size_t offset = dictionary->buckets[Hash(&data[5])];
452      BROTLI_BOOL end = !offset;
453      while (!end) {
454        DictWord w = dictionary->dict_words[offset++];
455        const size_t l = w.len & 0x1F;
456        const size_t n = (size_t)1 << dictionary->words->size_bits_by_length[l];
457        const size_t id = w.idx;
458        end = !!(w.len & 0x80);
459        w.len = (uint8_t)l;
460        if (w.transform == 0 &&
461            IsMatch(dictionary->words, w, &data[5], max_length - 5)) {
462          AddMatch(id + (data[0] == ' ' ? 41 : 72) * n, l + 5, l, matches);
463          has_found_match = BROTLI_TRUE;
464          if (l + 5 < max_length) {
465            const uint8_t* s = &data[l + 5];
466            if (data[0] == ' ') {
467              if (l + 8 < max_length &&
468                  s[0] == ' ' && s[1] == 'o' && s[2] == 'f' && s[3] == ' ') {
469                AddMatch(id + 62 * n, l + 9, l, matches);
470                if (l + 12 < max_length &&
471                    s[4] == 't' && s[5] == 'h' && s[6] == 'e' && s[7] == ' ') {
472                  AddMatch(id + 73 * n, l + 13, l, matches);
473                }
474              }
475            }
476          }
477        }
478      }
479    }
480  }
481  return has_found_match;
482}
483
484#if defined(__cplusplus) || defined(c_plusplus)
485}  /* extern "C" */
486#endif
487