11cb0ef41Sopenharmony_ci/* Copyright 2013 Google Inc. All Rights Reserved. 21cb0ef41Sopenharmony_ci 31cb0ef41Sopenharmony_ci Distributed under MIT license. 41cb0ef41Sopenharmony_ci See file LICENSE for detail or copy at https://opensource.org/licenses/MIT 51cb0ef41Sopenharmony_ci*/ 61cb0ef41Sopenharmony_ci 71cb0ef41Sopenharmony_ci/* Heuristics for deciding about the UTF8-ness of strings. */ 81cb0ef41Sopenharmony_ci 91cb0ef41Sopenharmony_ci#include "./utf8_util.h" 101cb0ef41Sopenharmony_ci 111cb0ef41Sopenharmony_ci#include <brotli/types.h> 121cb0ef41Sopenharmony_ci 131cb0ef41Sopenharmony_ci#if defined(__cplusplus) || defined(c_plusplus) 141cb0ef41Sopenharmony_ciextern "C" { 151cb0ef41Sopenharmony_ci#endif 161cb0ef41Sopenharmony_ci 171cb0ef41Sopenharmony_cistatic size_t BrotliParseAsUTF8( 181cb0ef41Sopenharmony_ci int* symbol, const uint8_t* input, size_t size) { 191cb0ef41Sopenharmony_ci /* ASCII */ 201cb0ef41Sopenharmony_ci if ((input[0] & 0x80) == 0) { 211cb0ef41Sopenharmony_ci *symbol = input[0]; 221cb0ef41Sopenharmony_ci if (*symbol > 0) { 231cb0ef41Sopenharmony_ci return 1; 241cb0ef41Sopenharmony_ci } 251cb0ef41Sopenharmony_ci } 261cb0ef41Sopenharmony_ci /* 2-byte UTF8 */ 271cb0ef41Sopenharmony_ci if (size > 1u && 281cb0ef41Sopenharmony_ci (input[0] & 0xE0) == 0xC0 && 291cb0ef41Sopenharmony_ci (input[1] & 0xC0) == 0x80) { 301cb0ef41Sopenharmony_ci *symbol = (((input[0] & 0x1F) << 6) | 311cb0ef41Sopenharmony_ci (input[1] & 0x3F)); 321cb0ef41Sopenharmony_ci if (*symbol > 0x7F) { 331cb0ef41Sopenharmony_ci return 2; 341cb0ef41Sopenharmony_ci } 351cb0ef41Sopenharmony_ci } 361cb0ef41Sopenharmony_ci /* 3-byte UFT8 */ 371cb0ef41Sopenharmony_ci if (size > 2u && 381cb0ef41Sopenharmony_ci (input[0] & 0xF0) == 0xE0 && 391cb0ef41Sopenharmony_ci (input[1] & 0xC0) == 0x80 && 401cb0ef41Sopenharmony_ci (input[2] & 0xC0) == 0x80) { 411cb0ef41Sopenharmony_ci *symbol = (((input[0] & 0x0F) << 12) | 421cb0ef41Sopenharmony_ci ((input[1] & 0x3F) << 6) | 431cb0ef41Sopenharmony_ci (input[2] & 0x3F)); 441cb0ef41Sopenharmony_ci if (*symbol > 0x7FF) { 451cb0ef41Sopenharmony_ci return 3; 461cb0ef41Sopenharmony_ci } 471cb0ef41Sopenharmony_ci } 481cb0ef41Sopenharmony_ci /* 4-byte UFT8 */ 491cb0ef41Sopenharmony_ci if (size > 3u && 501cb0ef41Sopenharmony_ci (input[0] & 0xF8) == 0xF0 && 511cb0ef41Sopenharmony_ci (input[1] & 0xC0) == 0x80 && 521cb0ef41Sopenharmony_ci (input[2] & 0xC0) == 0x80 && 531cb0ef41Sopenharmony_ci (input[3] & 0xC0) == 0x80) { 541cb0ef41Sopenharmony_ci *symbol = (((input[0] & 0x07) << 18) | 551cb0ef41Sopenharmony_ci ((input[1] & 0x3F) << 12) | 561cb0ef41Sopenharmony_ci ((input[2] & 0x3F) << 6) | 571cb0ef41Sopenharmony_ci (input[3] & 0x3F)); 581cb0ef41Sopenharmony_ci if (*symbol > 0xFFFF && *symbol <= 0x10FFFF) { 591cb0ef41Sopenharmony_ci return 4; 601cb0ef41Sopenharmony_ci } 611cb0ef41Sopenharmony_ci } 621cb0ef41Sopenharmony_ci /* Not UTF8, emit a special symbol above the UTF8-code space */ 631cb0ef41Sopenharmony_ci *symbol = 0x110000 | input[0]; 641cb0ef41Sopenharmony_ci return 1; 651cb0ef41Sopenharmony_ci} 661cb0ef41Sopenharmony_ci 671cb0ef41Sopenharmony_ci/* Returns 1 if at least min_fraction of the data is UTF8-encoded.*/ 681cb0ef41Sopenharmony_ciBROTLI_BOOL BrotliIsMostlyUTF8( 691cb0ef41Sopenharmony_ci const uint8_t* data, const size_t pos, const size_t mask, 701cb0ef41Sopenharmony_ci const size_t length, const double min_fraction) { 711cb0ef41Sopenharmony_ci size_t size_utf8 = 0; 721cb0ef41Sopenharmony_ci size_t i = 0; 731cb0ef41Sopenharmony_ci while (i < length) { 741cb0ef41Sopenharmony_ci int symbol; 751cb0ef41Sopenharmony_ci size_t bytes_read = 761cb0ef41Sopenharmony_ci BrotliParseAsUTF8(&symbol, &data[(pos + i) & mask], length - i); 771cb0ef41Sopenharmony_ci i += bytes_read; 781cb0ef41Sopenharmony_ci if (symbol < 0x110000) size_utf8 += bytes_read; 791cb0ef41Sopenharmony_ci } 801cb0ef41Sopenharmony_ci return TO_BROTLI_BOOL((double)size_utf8 > min_fraction * (double)length); 811cb0ef41Sopenharmony_ci} 821cb0ef41Sopenharmony_ci 831cb0ef41Sopenharmony_ci#if defined(__cplusplus) || defined(c_plusplus) 841cb0ef41Sopenharmony_ci} /* extern "C" */ 851cb0ef41Sopenharmony_ci#endif 86