11cb0ef41Sopenharmony_ci/* Copyright 2013 Google Inc. All Rights Reserved.
21cb0ef41Sopenharmony_ci
31cb0ef41Sopenharmony_ci   Distributed under MIT license.
41cb0ef41Sopenharmony_ci   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
51cb0ef41Sopenharmony_ci*/
61cb0ef41Sopenharmony_ci
71cb0ef41Sopenharmony_ci/* Heuristics for deciding about the UTF8-ness of strings. */
81cb0ef41Sopenharmony_ci
91cb0ef41Sopenharmony_ci#include "./utf8_util.h"
101cb0ef41Sopenharmony_ci
111cb0ef41Sopenharmony_ci#include <brotli/types.h>
121cb0ef41Sopenharmony_ci
131cb0ef41Sopenharmony_ci#if defined(__cplusplus) || defined(c_plusplus)
141cb0ef41Sopenharmony_ciextern "C" {
151cb0ef41Sopenharmony_ci#endif
161cb0ef41Sopenharmony_ci
/* Decodes one symbol from the start of |input|, looking at no more than
   |size| bytes (input[0] is always read, so |size| must be at least 1).

   On a well-formed 1..4 byte UTF-8 sequence, stores the decoded code point in
   |*symbol| and returns the sequence length in bytes.

   Otherwise stores (0x110000 | input[0]) in |*symbol| -- a value above the
   Unicode code space -- and returns 1, so the caller skips a single byte.

   Rejected inputs: the NUL byte, overlong 2/3/4-byte encodings, sequences
   truncated by |size|, sequences with a bad continuation byte, 4-byte values
   above 0x10FFFF, and any lead byte that matches none of the four patterns.
   Note that 3-byte encodings of the range 0xD800..0xDFFF are NOT rejected. */
static size_t BrotliParseAsUTF8(
    int* symbol, const uint8_t* input, size_t size) {
  const int lead = input[0];
  int code;

  if (lead < 0x80) {
    /* 0xxxxxxx: plain ASCII. Zero is deliberately treated as non-text. */
    if (lead != 0) {
      *symbol = lead;
      return 1;
    }
  } else if ((lead >> 5) == 0x06) {
    /* 110xxxxx 10xxxxxx */
    if (size >= 2u && (input[1] >> 6) == 0x02) {
      code = ((lead & 0x1F) << 6) | (input[1] & 0x3F);
      /* Values below 0x80 would be an overlong encoding. */
      if (code >= 0x80) {
        *symbol = code;
        return 2;
      }
    }
  } else if ((lead >> 4) == 0x0E) {
    /* 1110xxxx 10xxxxxx 10xxxxxx */
    if (size >= 3u &&
        (input[1] >> 6) == 0x02 &&
        (input[2] >> 6) == 0x02) {
      code = ((lead & 0x0F) << 12) |
             ((input[1] & 0x3F) << 6) |
             (input[2] & 0x3F);
      /* Values below 0x800 would be an overlong encoding. */
      if (code >= 0x800) {
        *symbol = code;
        return 3;
      }
    }
  } else if ((lead >> 3) == 0x1E) {
    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    if (size >= 4u &&
        (input[1] >> 6) == 0x02 &&
        (input[2] >> 6) == 0x02 &&
        (input[3] >> 6) == 0x02) {
      code = ((lead & 0x07) << 18) |
             ((input[1] & 0x3F) << 12) |
             ((input[2] & 0x3F) << 6) |
             (input[3] & 0x3F);
      /* Must be outside the 3-byte range and inside the Unicode range. */
      if (code >= 0x10000 && code < 0x110000) {
        *symbol = code;
        return 4;
      }
    }
  }

  /* Not UTF-8: tag the raw byte with a marker above the code space. */
  *symbol = 0x110000 | lead;
  return 1;
}
661cb0ef41Sopenharmony_ci
671cb0ef41Sopenharmony_ci/* Returns 1 if at least min_fraction of the data is UTF8-encoded.*/
681cb0ef41Sopenharmony_ciBROTLI_BOOL BrotliIsMostlyUTF8(
691cb0ef41Sopenharmony_ci    const uint8_t* data, const size_t pos, const size_t mask,
701cb0ef41Sopenharmony_ci    const size_t length, const double min_fraction) {
711cb0ef41Sopenharmony_ci  size_t size_utf8 = 0;
721cb0ef41Sopenharmony_ci  size_t i = 0;
731cb0ef41Sopenharmony_ci  while (i < length) {
741cb0ef41Sopenharmony_ci    int symbol;
751cb0ef41Sopenharmony_ci    size_t bytes_read =
761cb0ef41Sopenharmony_ci        BrotliParseAsUTF8(&symbol, &data[(pos + i) & mask], length - i);
771cb0ef41Sopenharmony_ci    i += bytes_read;
781cb0ef41Sopenharmony_ci    if (symbol < 0x110000) size_utf8 += bytes_read;
791cb0ef41Sopenharmony_ci  }
801cb0ef41Sopenharmony_ci  return TO_BROTLI_BOOL((double)size_utf8 > min_fraction * (double)length);
811cb0ef41Sopenharmony_ci}
821cb0ef41Sopenharmony_ci
831cb0ef41Sopenharmony_ci#if defined(__cplusplus) || defined(c_plusplus)
841cb0ef41Sopenharmony_ci}  /* extern "C" */
851cb0ef41Sopenharmony_ci#endif
86