16d528ed9Sopenharmony_ci// Copyright (c) 2018 The Chromium Authors. All rights reserved.
26d528ed9Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be
36d528ed9Sopenharmony_ci// found in the LICENSE file.
46d528ed9Sopenharmony_ci
56d528ed9Sopenharmony_ci#include "base/strings/utf_string_conversions.h"
66d528ed9Sopenharmony_ci
76d528ed9Sopenharmony_ci#include <stdint.h>
86d528ed9Sopenharmony_ci
96d528ed9Sopenharmony_ci#include <string_view>
106d528ed9Sopenharmony_ci
116d528ed9Sopenharmony_ci#include "base/strings/string_util.h"
126d528ed9Sopenharmony_ci#include "base/strings/utf_string_conversion_utils.h"
136d528ed9Sopenharmony_ci#include "base/third_party/icu/icu_utf.h"
146d528ed9Sopenharmony_ci#include "util/build_config.h"
156d528ed9Sopenharmony_ci
166d528ed9Sopenharmony_cinamespace base {
176d528ed9Sopenharmony_ci
186d528ed9Sopenharmony_cinamespace {
196d528ed9Sopenharmony_ci
// U+FFFD: the code point emitted in place of any input that fails validation
// during conversion.
constexpr int32_t kErrorCodePoint{0xFFFD};
216d528ed9Sopenharmony_ci
226d528ed9Sopenharmony_ci// Size coefficient ----------------------------------------------------------
236d528ed9Sopenharmony_ci// The maximum number of codeunits in the destination encoding corresponding to
246d528ed9Sopenharmony_ci// one codeunit in the source encoding.
256d528ed9Sopenharmony_ci
// Primary template: handles conversions where the destination code unit is
// strictly wider than the source code unit. Any other direction must provide
// its own specialization, otherwise the static_assert below fires.
template <typename SrcChar, typename DestChar>
struct SizeCoefficient {
  static_assert(sizeof(DestChar) > sizeof(SrcChar),
                "Default case: from a smaller encoding to the bigger one");

  // One source code unit corresponds to at most one destination code unit
  // when going from the narrower encoding to the wider one.
  static constexpr int value{1};
};

// UTF-16 -> UTF-8: a single UTF-16 code unit may need up to three UTF-8 code
// units.
template <>
struct SizeCoefficient<char16_t, char> {
  static constexpr int value{3};
};

// Convenience accessor. Both character types are decayed first, so
// cv-qualified or reference types select the same coefficient as the plain
// character types.
template <typename SrcChar, typename DestChar>
constexpr int size_coefficient_v =
    SizeCoefficient<std::decay_t<SrcChar>, std::decay_t<DestChar>>::value;
446d528ed9Sopenharmony_ci
456d528ed9Sopenharmony_ci// UnicodeAppendUnsafe --------------------------------------------------------
466d528ed9Sopenharmony_ci// Function overloads that write code_point to the output string. Output string
476d528ed9Sopenharmony_ci// has to have enough space for the codepoint.
486d528ed9Sopenharmony_ci
496d528ed9Sopenharmony_civoid UnicodeAppendUnsafe(char* out, int32_t* size, uint32_t code_point) {
506d528ed9Sopenharmony_ci  CBU8_APPEND_UNSAFE(out, *size, code_point);
516d528ed9Sopenharmony_ci}
526d528ed9Sopenharmony_ci
536d528ed9Sopenharmony_civoid UnicodeAppendUnsafe(char16_t* out, int32_t* size, uint32_t code_point) {
546d528ed9Sopenharmony_ci  CBU16_APPEND_UNSAFE(out, *size, code_point);
556d528ed9Sopenharmony_ci}
566d528ed9Sopenharmony_ci
576d528ed9Sopenharmony_ci// DoUTFConversion ------------------------------------------------------------
586d528ed9Sopenharmony_ci// Main driver of UTFConversion specialized for different Src encodings.
596d528ed9Sopenharmony_ci// dest has to have enough room for the converted text.
606d528ed9Sopenharmony_ci
616d528ed9Sopenharmony_citemplate <typename DestChar>
626d528ed9Sopenharmony_cibool DoUTFConversion(const char* src,
636d528ed9Sopenharmony_ci                     int32_t src_len,
646d528ed9Sopenharmony_ci                     DestChar* dest,
656d528ed9Sopenharmony_ci                     int32_t* dest_len) {
666d528ed9Sopenharmony_ci  bool success = true;
676d528ed9Sopenharmony_ci
686d528ed9Sopenharmony_ci  for (int32_t i = 0; i < src_len;) {
696d528ed9Sopenharmony_ci    int32_t code_point;
706d528ed9Sopenharmony_ci    CBU8_NEXT(src, i, src_len, code_point);
716d528ed9Sopenharmony_ci
726d528ed9Sopenharmony_ci    if (!IsValidCodepoint(code_point)) {
736d528ed9Sopenharmony_ci      success = false;
746d528ed9Sopenharmony_ci      code_point = kErrorCodePoint;
756d528ed9Sopenharmony_ci    }
766d528ed9Sopenharmony_ci
776d528ed9Sopenharmony_ci    UnicodeAppendUnsafe(dest, dest_len, code_point);
786d528ed9Sopenharmony_ci  }
796d528ed9Sopenharmony_ci
806d528ed9Sopenharmony_ci  return success;
816d528ed9Sopenharmony_ci}
826d528ed9Sopenharmony_ci
836d528ed9Sopenharmony_citemplate <typename DestChar>
846d528ed9Sopenharmony_cibool DoUTFConversion(const char16_t* src,
856d528ed9Sopenharmony_ci                     int32_t src_len,
866d528ed9Sopenharmony_ci                     DestChar* dest,
876d528ed9Sopenharmony_ci                     int32_t* dest_len) {
886d528ed9Sopenharmony_ci  bool success = true;
896d528ed9Sopenharmony_ci
906d528ed9Sopenharmony_ci  auto ConvertSingleChar = [&success](char16_t in) -> int32_t {
916d528ed9Sopenharmony_ci    if (!CBU16_IS_SINGLE(in) || !IsValidCodepoint(in)) {
926d528ed9Sopenharmony_ci      success = false;
936d528ed9Sopenharmony_ci      return kErrorCodePoint;
946d528ed9Sopenharmony_ci    }
956d528ed9Sopenharmony_ci    return in;
966d528ed9Sopenharmony_ci  };
976d528ed9Sopenharmony_ci
986d528ed9Sopenharmony_ci  int32_t i = 0;
996d528ed9Sopenharmony_ci
1006d528ed9Sopenharmony_ci  // Always have another symbol in order to avoid checking boundaries in the
1016d528ed9Sopenharmony_ci  // middle of the surrogate pair.
1026d528ed9Sopenharmony_ci  while (i < src_len - 1) {
1036d528ed9Sopenharmony_ci    int32_t code_point;
1046d528ed9Sopenharmony_ci
1056d528ed9Sopenharmony_ci    if (CBU16_IS_LEAD(src[i]) && CBU16_IS_TRAIL(src[i + 1])) {
1066d528ed9Sopenharmony_ci      code_point = CBU16_GET_SUPPLEMENTARY(src[i], src[i + 1]);
1076d528ed9Sopenharmony_ci      if (!IsValidCodepoint(code_point)) {
1086d528ed9Sopenharmony_ci        code_point = kErrorCodePoint;
1096d528ed9Sopenharmony_ci        success = false;
1106d528ed9Sopenharmony_ci      }
1116d528ed9Sopenharmony_ci      i += 2;
1126d528ed9Sopenharmony_ci    } else {
1136d528ed9Sopenharmony_ci      code_point = ConvertSingleChar(src[i]);
1146d528ed9Sopenharmony_ci      ++i;
1156d528ed9Sopenharmony_ci    }
1166d528ed9Sopenharmony_ci
1176d528ed9Sopenharmony_ci    UnicodeAppendUnsafe(dest, dest_len, code_point);
1186d528ed9Sopenharmony_ci  }
1196d528ed9Sopenharmony_ci
1206d528ed9Sopenharmony_ci  if (i < src_len)
1216d528ed9Sopenharmony_ci    UnicodeAppendUnsafe(dest, dest_len, ConvertSingleChar(src[i]));
1226d528ed9Sopenharmony_ci
1236d528ed9Sopenharmony_ci  return success;
1246d528ed9Sopenharmony_ci}
1256d528ed9Sopenharmony_ci
1266d528ed9Sopenharmony_ci// UTFConversion --------------------------------------------------------------
1276d528ed9Sopenharmony_ci// Function template for generating all UTF conversions.
1286d528ed9Sopenharmony_ci
1296d528ed9Sopenharmony_citemplate <typename InputString, typename DestString>
1306d528ed9Sopenharmony_cibool UTFConversion(const InputString& src_str, DestString* dest_str) {
1316d528ed9Sopenharmony_ci  if (IsStringASCII(src_str)) {
1326d528ed9Sopenharmony_ci    dest_str->assign(src_str.begin(), src_str.end());
1336d528ed9Sopenharmony_ci    return true;
1346d528ed9Sopenharmony_ci  }
1356d528ed9Sopenharmony_ci
1366d528ed9Sopenharmony_ci  dest_str->resize(src_str.length() *
1376d528ed9Sopenharmony_ci                   size_coefficient_v<typename InputString::value_type,
1386d528ed9Sopenharmony_ci                                      typename DestString::value_type>);
1396d528ed9Sopenharmony_ci
1406d528ed9Sopenharmony_ci  // Empty string is ASCII => it OK to call operator[].
1416d528ed9Sopenharmony_ci  auto* dest = &(*dest_str)[0];
1426d528ed9Sopenharmony_ci
1436d528ed9Sopenharmony_ci  // ICU requires 32 bit numbers.
1446d528ed9Sopenharmony_ci  int32_t src_len32 = static_cast<int32_t>(src_str.length());
1456d528ed9Sopenharmony_ci  int32_t dest_len32 = 0;
1466d528ed9Sopenharmony_ci
1476d528ed9Sopenharmony_ci  bool res = DoUTFConversion(src_str.data(), src_len32, dest, &dest_len32);
1486d528ed9Sopenharmony_ci
1496d528ed9Sopenharmony_ci  dest_str->resize(dest_len32);
1506d528ed9Sopenharmony_ci  dest_str->shrink_to_fit();
1516d528ed9Sopenharmony_ci
1526d528ed9Sopenharmony_ci  return res;
1536d528ed9Sopenharmony_ci}
1546d528ed9Sopenharmony_ci
1556d528ed9Sopenharmony_ci}  // namespace
1566d528ed9Sopenharmony_ci
1576d528ed9Sopenharmony_ci// UTF16 <-> UTF8 --------------------------------------------------------------
1586d528ed9Sopenharmony_ci
1596d528ed9Sopenharmony_cibool UTF8ToUTF16(const char* src, size_t src_len, std::u16string* output) {
1606d528ed9Sopenharmony_ci  return UTFConversion(std::string_view(src, src_len), output);
1616d528ed9Sopenharmony_ci}
1626d528ed9Sopenharmony_ci
1636d528ed9Sopenharmony_cistd::u16string UTF8ToUTF16(std::string_view utf8) {
1646d528ed9Sopenharmony_ci  std::u16string ret;
1656d528ed9Sopenharmony_ci  // Ignore the success flag of this call, it will do the best it can for
1666d528ed9Sopenharmony_ci  // invalid input, which is what we want here.
1676d528ed9Sopenharmony_ci  UTF8ToUTF16(utf8.data(), utf8.size(), &ret);
1686d528ed9Sopenharmony_ci  return ret;
1696d528ed9Sopenharmony_ci}
1706d528ed9Sopenharmony_ci
1716d528ed9Sopenharmony_cibool UTF16ToUTF8(const char16_t* src, size_t src_len, std::string* output) {
1726d528ed9Sopenharmony_ci  return UTFConversion(std::u16string_view(src, src_len), output);
1736d528ed9Sopenharmony_ci}
1746d528ed9Sopenharmony_ci
1756d528ed9Sopenharmony_cistd::string UTF16ToUTF8(std::u16string_view utf16) {
1766d528ed9Sopenharmony_ci  std::string ret;
1776d528ed9Sopenharmony_ci  // Ignore the success flag of this call, it will do the best it can for
1786d528ed9Sopenharmony_ci  // invalid input, which is what we want here.
1796d528ed9Sopenharmony_ci  UTF16ToUTF8(utf16.data(), utf16.length(), &ret);
1806d528ed9Sopenharmony_ci  return ret;
1816d528ed9Sopenharmony_ci}
1826d528ed9Sopenharmony_ci
1836d528ed9Sopenharmony_ci// ASCII <-> UTF-16 -----------------------------------------------------------
1846d528ed9Sopenharmony_ci
1856d528ed9Sopenharmony_cistd::u16string ASCIIToUTF16(std::string_view ascii) {
1866d528ed9Sopenharmony_ci  DCHECK(IsStringASCII(ascii)) << ascii;
1876d528ed9Sopenharmony_ci  return std::u16string(ascii.begin(), ascii.end());
1886d528ed9Sopenharmony_ci}
1896d528ed9Sopenharmony_ci
1906d528ed9Sopenharmony_cistd::string UTF16ToASCII(std::u16string_view utf16) {
1916d528ed9Sopenharmony_ci  DCHECK(IsStringASCII(utf16)) << UTF16ToUTF8(utf16);
1926d528ed9Sopenharmony_ci  return std::string(utf16.begin(), utf16.end());
1936d528ed9Sopenharmony_ci}
1946d528ed9Sopenharmony_ci
1956d528ed9Sopenharmony_ci}  // namespace base
196