1/* Copyright 2013 Google Inc. All Rights Reserved. 2 3 Distributed under MIT license. 4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT 5*/ 6 7#include "./static_dict.h" 8 9#include "../common/dictionary.h" 10#include "../common/platform.h" 11#include "../common/transform.h" 12#include "./encoder_dict.h" 13#include "./find_match_length.h" 14 15#if defined(__cplusplus) || defined(c_plusplus) 16extern "C" { 17#endif 18 19static BROTLI_INLINE uint32_t Hash(const uint8_t* data) { 20 uint32_t h = BROTLI_UNALIGNED_LOAD32LE(data) * kDictHashMul32; 21 /* The higher bits contain more mixture from the multiplication, 22 so we take our results from there. */ 23 return h >> (32 - kDictNumBits); 24} 25 26static BROTLI_INLINE void AddMatch(size_t distance, size_t len, size_t len_code, 27 uint32_t* matches) { 28 uint32_t match = (uint32_t)((distance << 5) + len_code); 29 matches[len] = BROTLI_MIN(uint32_t, matches[len], match); 30} 31 32static BROTLI_INLINE size_t DictMatchLength(const BrotliDictionary* dictionary, 33 const uint8_t* data, 34 size_t id, 35 size_t len, 36 size_t maxlen) { 37 const size_t offset = dictionary->offsets_by_length[len] + len * id; 38 return FindMatchLengthWithLimit(&dictionary->data[offset], data, 39 BROTLI_MIN(size_t, len, maxlen)); 40} 41 42static BROTLI_INLINE BROTLI_BOOL IsMatch(const BrotliDictionary* dictionary, 43 DictWord w, const uint8_t* data, size_t max_length) { 44 if (w.len > max_length) { 45 return BROTLI_FALSE; 46 } else { 47 const size_t offset = dictionary->offsets_by_length[w.len] + 48 (size_t)w.len * (size_t)w.idx; 49 const uint8_t* dict = &dictionary->data[offset]; 50 if (w.transform == 0) { 51 /* Match against base dictionary word. */ 52 return 53 TO_BROTLI_BOOL(FindMatchLengthWithLimit(dict, data, w.len) == w.len); 54 } else if (w.transform == 10) { 55 /* Match against uppercase first transform. 56 Note that there are only ASCII uppercase words in the lookup table. */ 57 return TO_BROTLI_BOOL(dict[0] >= 'a' && dict[0] <= 'z' && 58 (dict[0] ^ 32) == data[0] && 59 FindMatchLengthWithLimit(&dict[1], &data[1], w.len - 1u) == 60 w.len - 1u); 61 } else { 62 /* Match against uppercase all transform. 63 Note that there are only ASCII uppercase words in the lookup table. */ 64 size_t i; 65 for (i = 0; i < w.len; ++i) { 66 if (dict[i] >= 'a' && dict[i] <= 'z') { 67 if ((dict[i] ^ 32) != data[i]) return BROTLI_FALSE; 68 } else { 69 if (dict[i] != data[i]) return BROTLI_FALSE; 70 } 71 } 72 return BROTLI_TRUE; 73 } 74 } 75} 76 77BROTLI_BOOL BrotliFindAllStaticDictionaryMatches( 78 const BrotliEncoderDictionary* dictionary, const uint8_t* data, 79 size_t min_length, size_t max_length, uint32_t* matches) { 80 BROTLI_BOOL has_found_match = BROTLI_FALSE; 81 { 82 size_t offset = dictionary->buckets[Hash(data)]; 83 BROTLI_BOOL end = !offset; 84 while (!end) { 85 DictWord w = dictionary->dict_words[offset++]; 86 const size_t l = w.len & 0x1F; 87 const size_t n = (size_t)1 << dictionary->words->size_bits_by_length[l]; 88 const size_t id = w.idx; 89 end = !!(w.len & 0x80); 90 w.len = (uint8_t)l; 91 if (w.transform == 0) { 92 const size_t matchlen = 93 DictMatchLength(dictionary->words, data, id, l, max_length); 94 const uint8_t* s; 95 size_t minlen; 96 size_t maxlen; 97 size_t len; 98 /* Transform "" + BROTLI_TRANSFORM_IDENTITY + "" */ 99 if (matchlen == l) { 100 AddMatch(id, l, l, matches); 101 has_found_match = BROTLI_TRUE; 102 } 103 /* Transforms "" + BROTLI_TRANSFORM_OMIT_LAST_1 + "" and 104 "" + BROTLI_TRANSFORM_OMIT_LAST_1 + "ing " */ 105 if (matchlen >= l - 1) { 106 AddMatch(id + 12 * n, l - 1, l, matches); 107 if (l + 2 < max_length && 108 data[l - 1] == 'i' && data[l] == 'n' && data[l + 1] == 'g' && 109 data[l + 2] == ' ') { 110 AddMatch(id + 49 * n, l + 3, l, matches); 111 } 112 has_found_match = BROTLI_TRUE; 113 } 114 /* Transform "" + BROTLI_TRANSFORM_OMIT_LAST_# + "" (# = 2 .. 9) */ 115 minlen = min_length; 116 if (l > 9) minlen = BROTLI_MAX(size_t, minlen, l - 9); 117 maxlen = BROTLI_MIN(size_t, matchlen, l - 2); 118 for (len = minlen; len <= maxlen; ++len) { 119 size_t cut = l - len; 120 size_t transform_id = (cut << 2) + 121 (size_t)((dictionary->cutoffTransforms >> (cut * 6)) & 0x3F); 122 AddMatch(id + transform_id * n, len, l, matches); 123 has_found_match = BROTLI_TRUE; 124 } 125 if (matchlen < l || l + 6 >= max_length) { 126 continue; 127 } 128 s = &data[l]; 129 /* Transforms "" + BROTLI_TRANSFORM_IDENTITY + <suffix> */ 130 if (s[0] == ' ') { 131 AddMatch(id + n, l + 1, l, matches); 132 if (s[1] == 'a') { 133 if (s[2] == ' ') { 134 AddMatch(id + 28 * n, l + 3, l, matches); 135 } else if (s[2] == 's') { 136 if (s[3] == ' ') AddMatch(id + 46 * n, l + 4, l, matches); 137 } else if (s[2] == 't') { 138 if (s[3] == ' ') AddMatch(id + 60 * n, l + 4, l, matches); 139 } else if (s[2] == 'n') { 140 if (s[3] == 'd' && s[4] == ' ') { 141 AddMatch(id + 10 * n, l + 5, l, matches); 142 } 143 } 144 } else if (s[1] == 'b') { 145 if (s[2] == 'y' && s[3] == ' ') { 146 AddMatch(id + 38 * n, l + 4, l, matches); 147 } 148 } else if (s[1] == 'i') { 149 if (s[2] == 'n') { 150 if (s[3] == ' ') AddMatch(id + 16 * n, l + 4, l, matches); 151 } else if (s[2] == 's') { 152 if (s[3] == ' ') AddMatch(id + 47 * n, l + 4, l, matches); 153 } 154 } else if (s[1] == 'f') { 155 if (s[2] == 'o') { 156 if (s[3] == 'r' && s[4] == ' ') { 157 AddMatch(id + 25 * n, l + 5, l, matches); 158 } 159 } else if (s[2] == 'r') { 160 if (s[3] == 'o' && s[4] == 'm' && s[5] == ' ') { 161 AddMatch(id + 37 * n, l + 6, l, matches); 162 } 163 } 164 } else if (s[1] == 'o') { 165 if (s[2] == 'f') { 166 if (s[3] == ' ') AddMatch(id + 8 * n, l + 4, l, matches); 167 } else if (s[2] == 'n') { 168 if (s[3] == ' ') AddMatch(id + 45 * n, l + 4, l, matches); 169 } 170 } else if (s[1] == 'n') { 171 if (s[2] == 'o' && s[3] == 't' && s[4] == ' ') { 172 AddMatch(id + 80 * n, l + 5, l, matches); 173 } 174 } else if (s[1] == 't') { 175 if (s[2] == 'h') { 176 if (s[3] == 'e') { 177 if (s[4] == ' ') AddMatch(id + 5 * n, l + 5, l, matches); 178 } else if (s[3] == 'a') { 179 if (s[4] == 't' && s[5] == ' ') { 180 AddMatch(id + 29 * n, l + 6, l, matches); 181 } 182 } 183 } else if (s[2] == 'o') { 184 if (s[3] == ' ') AddMatch(id + 17 * n, l + 4, l, matches); 185 } 186 } else if (s[1] == 'w') { 187 if (s[2] == 'i' && s[3] == 't' && s[4] == 'h' && s[5] == ' ') { 188 AddMatch(id + 35 * n, l + 6, l, matches); 189 } 190 } 191 } else if (s[0] == '"') { 192 AddMatch(id + 19 * n, l + 1, l, matches); 193 if (s[1] == '>') { 194 AddMatch(id + 21 * n, l + 2, l, matches); 195 } 196 } else if (s[0] == '.') { 197 AddMatch(id + 20 * n, l + 1, l, matches); 198 if (s[1] == ' ') { 199 AddMatch(id + 31 * n, l + 2, l, matches); 200 if (s[2] == 'T' && s[3] == 'h') { 201 if (s[4] == 'e') { 202 if (s[5] == ' ') AddMatch(id + 43 * n, l + 6, l, matches); 203 } else if (s[4] == 'i') { 204 if (s[5] == 's' && s[6] == ' ') { 205 AddMatch(id + 75 * n, l + 7, l, matches); 206 } 207 } 208 } 209 } 210 } else if (s[0] == ',') { 211 AddMatch(id + 76 * n, l + 1, l, matches); 212 if (s[1] == ' ') { 213 AddMatch(id + 14 * n, l + 2, l, matches); 214 } 215 } else if (s[0] == '\n') { 216 AddMatch(id + 22 * n, l + 1, l, matches); 217 if (s[1] == '\t') { 218 AddMatch(id + 50 * n, l + 2, l, matches); 219 } 220 } else if (s[0] == ']') { 221 AddMatch(id + 24 * n, l + 1, l, matches); 222 } else if (s[0] == '\'') { 223 AddMatch(id + 36 * n, l + 1, l, matches); 224 } else if (s[0] == ':') { 225 AddMatch(id + 51 * n, l + 1, l, matches); 226 } else if (s[0] == '(') { 227 AddMatch(id + 57 * n, l + 1, l, matches); 228 } else if (s[0] == '=') { 229 if (s[1] == '"') { 230 AddMatch(id + 70 * n, l + 2, l, matches); 231 } else if (s[1] == '\'') { 232 AddMatch(id + 86 * n, l + 2, l, matches); 233 } 234 } else if (s[0] == 'a') { 235 if (s[1] == 'l' && s[2] == ' ') { 236 AddMatch(id + 84 * n, l + 3, l, matches); 237 } 238 } else if (s[0] == 'e') { 239 if (s[1] == 'd') { 240 if (s[2] == ' ') AddMatch(id + 53 * n, l + 3, l, matches); 241 } else if (s[1] == 'r') { 242 if (s[2] == ' ') AddMatch(id + 82 * n, l + 3, l, matches); 243 } else if (s[1] == 's') { 244 if (s[2] == 't' && s[3] == ' ') { 245 AddMatch(id + 95 * n, l + 4, l, matches); 246 } 247 } 248 } else if (s[0] == 'f') { 249 if (s[1] == 'u' && s[2] == 'l' && s[3] == ' ') { 250 AddMatch(id + 90 * n, l + 4, l, matches); 251 } 252 } else if (s[0] == 'i') { 253 if (s[1] == 'v') { 254 if (s[2] == 'e' && s[3] == ' ') { 255 AddMatch(id + 92 * n, l + 4, l, matches); 256 } 257 } else if (s[1] == 'z') { 258 if (s[2] == 'e' && s[3] == ' ') { 259 AddMatch(id + 100 * n, l + 4, l, matches); 260 } 261 } 262 } else if (s[0] == 'l') { 263 if (s[1] == 'e') { 264 if (s[2] == 's' && s[3] == 's' && s[4] == ' ') { 265 AddMatch(id + 93 * n, l + 5, l, matches); 266 } 267 } else if (s[1] == 'y') { 268 if (s[2] == ' ') AddMatch(id + 61 * n, l + 3, l, matches); 269 } 270 } else if (s[0] == 'o') { 271 if (s[1] == 'u' && s[2] == 's' && s[3] == ' ') { 272 AddMatch(id + 106 * n, l + 4, l, matches); 273 } 274 } 275 } else { 276 /* Set is_all_caps=0 for BROTLI_TRANSFORM_UPPERCASE_FIRST and 277 is_all_caps=1 otherwise (BROTLI_TRANSFORM_UPPERCASE_ALL) 278 transform. */ 279 const BROTLI_BOOL is_all_caps = 280 TO_BROTLI_BOOL(w.transform != BROTLI_TRANSFORM_UPPERCASE_FIRST); 281 const uint8_t* s; 282 if (!IsMatch(dictionary->words, w, data, max_length)) { 283 continue; 284 } 285 /* Transform "" + kUppercase{First,All} + "" */ 286 AddMatch(id + (is_all_caps ? 44 : 9) * n, l, l, matches); 287 has_found_match = BROTLI_TRUE; 288 if (l + 1 >= max_length) { 289 continue; 290 } 291 /* Transforms "" + kUppercase{First,All} + <suffix> */ 292 s = &data[l]; 293 if (s[0] == ' ') { 294 AddMatch(id + (is_all_caps ? 68 : 4) * n, l + 1, l, matches); 295 } else if (s[0] == '"') { 296 AddMatch(id + (is_all_caps ? 87 : 66) * n, l + 1, l, matches); 297 if (s[1] == '>') { 298 AddMatch(id + (is_all_caps ? 97 : 69) * n, l + 2, l, matches); 299 } 300 } else if (s[0] == '.') { 301 AddMatch(id + (is_all_caps ? 101 : 79) * n, l + 1, l, matches); 302 if (s[1] == ' ') { 303 AddMatch(id + (is_all_caps ? 114 : 88) * n, l + 2, l, matches); 304 } 305 } else if (s[0] == ',') { 306 AddMatch(id + (is_all_caps ? 112 : 99) * n, l + 1, l, matches); 307 if (s[1] == ' ') { 308 AddMatch(id + (is_all_caps ? 107 : 58) * n, l + 2, l, matches); 309 } 310 } else if (s[0] == '\'') { 311 AddMatch(id + (is_all_caps ? 94 : 74) * n, l + 1, l, matches); 312 } else if (s[0] == '(') { 313 AddMatch(id + (is_all_caps ? 113 : 78) * n, l + 1, l, matches); 314 } else if (s[0] == '=') { 315 if (s[1] == '"') { 316 AddMatch(id + (is_all_caps ? 105 : 104) * n, l + 2, l, matches); 317 } else if (s[1] == '\'') { 318 AddMatch(id + (is_all_caps ? 116 : 108) * n, l + 2, l, matches); 319 } 320 } 321 } 322 } 323 } 324 /* Transforms with prefixes " " and "." */ 325 if (max_length >= 5 && (data[0] == ' ' || data[0] == '.')) { 326 BROTLI_BOOL is_space = TO_BROTLI_BOOL(data[0] == ' '); 327 size_t offset = dictionary->buckets[Hash(&data[1])]; 328 BROTLI_BOOL end = !offset; 329 while (!end) { 330 DictWord w = dictionary->dict_words[offset++]; 331 const size_t l = w.len & 0x1F; 332 const size_t n = (size_t)1 << dictionary->words->size_bits_by_length[l]; 333 const size_t id = w.idx; 334 end = !!(w.len & 0x80); 335 w.len = (uint8_t)l; 336 if (w.transform == 0) { 337 const uint8_t* s; 338 if (!IsMatch(dictionary->words, w, &data[1], max_length - 1)) { 339 continue; 340 } 341 /* Transforms " " + BROTLI_TRANSFORM_IDENTITY + "" and 342 "." + BROTLI_TRANSFORM_IDENTITY + "" */ 343 AddMatch(id + (is_space ? 6 : 32) * n, l + 1, l, matches); 344 has_found_match = BROTLI_TRUE; 345 if (l + 2 >= max_length) { 346 continue; 347 } 348 /* Transforms " " + BROTLI_TRANSFORM_IDENTITY + <suffix> and 349 "." + BROTLI_TRANSFORM_IDENTITY + <suffix> 350 */ 351 s = &data[l + 1]; 352 if (s[0] == ' ') { 353 AddMatch(id + (is_space ? 2 : 77) * n, l + 2, l, matches); 354 } else if (s[0] == '(') { 355 AddMatch(id + (is_space ? 89 : 67) * n, l + 2, l, matches); 356 } else if (is_space) { 357 if (s[0] == ',') { 358 AddMatch(id + 103 * n, l + 2, l, matches); 359 if (s[1] == ' ') { 360 AddMatch(id + 33 * n, l + 3, l, matches); 361 } 362 } else if (s[0] == '.') { 363 AddMatch(id + 71 * n, l + 2, l, matches); 364 if (s[1] == ' ') { 365 AddMatch(id + 52 * n, l + 3, l, matches); 366 } 367 } else if (s[0] == '=') { 368 if (s[1] == '"') { 369 AddMatch(id + 81 * n, l + 3, l, matches); 370 } else if (s[1] == '\'') { 371 AddMatch(id + 98 * n, l + 3, l, matches); 372 } 373 } 374 } 375 } else if (is_space) { 376 /* Set is_all_caps=0 for BROTLI_TRANSFORM_UPPERCASE_FIRST and 377 is_all_caps=1 otherwise (BROTLI_TRANSFORM_UPPERCASE_ALL) 378 transform. */ 379 const BROTLI_BOOL is_all_caps = 380 TO_BROTLI_BOOL(w.transform != BROTLI_TRANSFORM_UPPERCASE_FIRST); 381 const uint8_t* s; 382 if (!IsMatch(dictionary->words, w, &data[1], max_length - 1)) { 383 continue; 384 } 385 /* Transforms " " + kUppercase{First,All} + "" */ 386 AddMatch(id + (is_all_caps ? 85 : 30) * n, l + 1, l, matches); 387 has_found_match = BROTLI_TRUE; 388 if (l + 2 >= max_length) { 389 continue; 390 } 391 /* Transforms " " + kUppercase{First,All} + <suffix> */ 392 s = &data[l + 1]; 393 if (s[0] == ' ') { 394 AddMatch(id + (is_all_caps ? 83 : 15) * n, l + 2, l, matches); 395 } else if (s[0] == ',') { 396 if (!is_all_caps) { 397 AddMatch(id + 109 * n, l + 2, l, matches); 398 } 399 if (s[1] == ' ') { 400 AddMatch(id + (is_all_caps ? 111 : 65) * n, l + 3, l, matches); 401 } 402 } else if (s[0] == '.') { 403 AddMatch(id + (is_all_caps ? 115 : 96) * n, l + 2, l, matches); 404 if (s[1] == ' ') { 405 AddMatch(id + (is_all_caps ? 117 : 91) * n, l + 3, l, matches); 406 } 407 } else if (s[0] == '=') { 408 if (s[1] == '"') { 409 AddMatch(id + (is_all_caps ? 110 : 118) * n, l + 3, l, matches); 410 } else if (s[1] == '\'') { 411 AddMatch(id + (is_all_caps ? 119 : 120) * n, l + 3, l, matches); 412 } 413 } 414 } 415 } 416 } 417 if (max_length >= 6) { 418 /* Transforms with prefixes "e ", "s ", ", " and "\xC2\xA0" */ 419 if ((data[1] == ' ' && 420 (data[0] == 'e' || data[0] == 's' || data[0] == ',')) || 421 (data[0] == 0xC2 && data[1] == 0xA0)) { 422 size_t offset = dictionary->buckets[Hash(&data[2])]; 423 BROTLI_BOOL end = !offset; 424 while (!end) { 425 DictWord w = dictionary->dict_words[offset++]; 426 const size_t l = w.len & 0x1F; 427 const size_t n = (size_t)1 << dictionary->words->size_bits_by_length[l]; 428 const size_t id = w.idx; 429 end = !!(w.len & 0x80); 430 w.len = (uint8_t)l; 431 if (w.transform == 0 && 432 IsMatch(dictionary->words, w, &data[2], max_length - 2)) { 433 if (data[0] == 0xC2) { 434 AddMatch(id + 102 * n, l + 2, l, matches); 435 has_found_match = BROTLI_TRUE; 436 } else if (l + 2 < max_length && data[l + 2] == ' ') { 437 size_t t = data[0] == 'e' ? 18 : (data[0] == 's' ? 7 : 13); 438 AddMatch(id + t * n, l + 3, l, matches); 439 has_found_match = BROTLI_TRUE; 440 } 441 } 442 } 443 } 444 } 445 if (max_length >= 9) { 446 /* Transforms with prefixes " the " and ".com/" */ 447 if ((data[0] == ' ' && data[1] == 't' && data[2] == 'h' && 448 data[3] == 'e' && data[4] == ' ') || 449 (data[0] == '.' && data[1] == 'c' && data[2] == 'o' && 450 data[3] == 'm' && data[4] == '/')) { 451 size_t offset = dictionary->buckets[Hash(&data[5])]; 452 BROTLI_BOOL end = !offset; 453 while (!end) { 454 DictWord w = dictionary->dict_words[offset++]; 455 const size_t l = w.len & 0x1F; 456 const size_t n = (size_t)1 << dictionary->words->size_bits_by_length[l]; 457 const size_t id = w.idx; 458 end = !!(w.len & 0x80); 459 w.len = (uint8_t)l; 460 if (w.transform == 0 && 461 IsMatch(dictionary->words, w, &data[5], max_length - 5)) { 462 AddMatch(id + (data[0] == ' ' ? 41 : 72) * n, l + 5, l, matches); 463 has_found_match = BROTLI_TRUE; 464 if (l + 5 < max_length) { 465 const uint8_t* s = &data[l + 5]; 466 if (data[0] == ' ') { 467 if (l + 8 < max_length && 468 s[0] == ' ' && s[1] == 'o' && s[2] == 'f' && s[3] == ' ') { 469 AddMatch(id + 62 * n, l + 9, l, matches); 470 if (l + 12 < max_length && 471 s[4] == 't' && s[5] == 'h' && s[6] == 'e' && s[7] == ' ') { 472 AddMatch(id + 73 * n, l + 13, l, matches); 473 } 474 } 475 } 476 } 477 } 478 } 479 } 480 } 481 return has_found_match; 482} 483 484#if defined(__cplusplus) || defined(c_plusplus) 485} /* extern "C" */ 486#endif 487