16d528ed9Sopenharmony_ci// Copyright (c) 2009 The Chromium Authors. All rights reserved.
26d528ed9Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be
36d528ed9Sopenharmony_ci// found in the LICENSE file.
46d528ed9Sopenharmony_ci
56d528ed9Sopenharmony_ci#include "base/strings/utf_string_conversion_utils.h"
66d528ed9Sopenharmony_ci
76d528ed9Sopenharmony_ci#include "base/third_party/icu/icu_utf.h"
86d528ed9Sopenharmony_ci#include "util/build_config.h"
96d528ed9Sopenharmony_ci
106d528ed9Sopenharmony_cinamespace base {
116d528ed9Sopenharmony_ci
126d528ed9Sopenharmony_ci// ReadUnicodeCharacter --------------------------------------------------------
136d528ed9Sopenharmony_ci
146d528ed9Sopenharmony_cibool ReadUnicodeCharacter(const char* src,
156d528ed9Sopenharmony_ci                          int32_t src_len,
166d528ed9Sopenharmony_ci                          int32_t* char_index,
176d528ed9Sopenharmony_ci                          uint32_t* code_point_out) {
186d528ed9Sopenharmony_ci  // U8_NEXT expects to be able to use -1 to signal an error, so we must
196d528ed9Sopenharmony_ci  // use a signed type for code_point.  But this function returns false
206d528ed9Sopenharmony_ci  // on error anyway, so code_point_out is unsigned.
216d528ed9Sopenharmony_ci  int32_t code_point;
226d528ed9Sopenharmony_ci  CBU8_NEXT(src, *char_index, src_len, code_point);
236d528ed9Sopenharmony_ci  *code_point_out = static_cast<uint32_t>(code_point);
246d528ed9Sopenharmony_ci
256d528ed9Sopenharmony_ci  // The ICU macro above moves to the next char, we want to point to the last
266d528ed9Sopenharmony_ci  // char consumed.
276d528ed9Sopenharmony_ci  (*char_index)--;
286d528ed9Sopenharmony_ci
296d528ed9Sopenharmony_ci  // Validate the decoded value.
306d528ed9Sopenharmony_ci  return IsValidCodepoint(code_point);
316d528ed9Sopenharmony_ci}
326d528ed9Sopenharmony_ci
336d528ed9Sopenharmony_cibool ReadUnicodeCharacter(const char16_t* src,
346d528ed9Sopenharmony_ci                          int32_t src_len,
356d528ed9Sopenharmony_ci                          int32_t* char_index,
366d528ed9Sopenharmony_ci                          uint32_t* code_point) {
376d528ed9Sopenharmony_ci  if (CBU16_IS_SURROGATE(src[*char_index])) {
386d528ed9Sopenharmony_ci    if (!CBU16_IS_SURROGATE_LEAD(src[*char_index]) ||
396d528ed9Sopenharmony_ci        *char_index + 1 >= src_len || !CBU16_IS_TRAIL(src[*char_index + 1])) {
406d528ed9Sopenharmony_ci      // Invalid surrogate pair.
416d528ed9Sopenharmony_ci      return false;
426d528ed9Sopenharmony_ci    }
436d528ed9Sopenharmony_ci
446d528ed9Sopenharmony_ci    // Valid surrogate pair.
456d528ed9Sopenharmony_ci    *code_point =
466d528ed9Sopenharmony_ci        CBU16_GET_SUPPLEMENTARY(src[*char_index], src[*char_index + 1]);
476d528ed9Sopenharmony_ci    (*char_index)++;
486d528ed9Sopenharmony_ci  } else {
496d528ed9Sopenharmony_ci    // Not a surrogate, just one 16-bit word.
506d528ed9Sopenharmony_ci    *code_point = src[*char_index];
516d528ed9Sopenharmony_ci  }
526d528ed9Sopenharmony_ci
536d528ed9Sopenharmony_ci  return IsValidCodepoint(*code_point);
546d528ed9Sopenharmony_ci}
556d528ed9Sopenharmony_ci
566d528ed9Sopenharmony_ci// WriteUnicodeCharacter -------------------------------------------------------
576d528ed9Sopenharmony_ci
586d528ed9Sopenharmony_cisize_t WriteUnicodeCharacter(uint32_t code_point, std::string* output) {
596d528ed9Sopenharmony_ci  if (code_point <= 0x7f) {
606d528ed9Sopenharmony_ci    // Fast path the common case of one byte.
616d528ed9Sopenharmony_ci    output->push_back(static_cast<char>(code_point));
626d528ed9Sopenharmony_ci    return 1;
636d528ed9Sopenharmony_ci  }
646d528ed9Sopenharmony_ci
656d528ed9Sopenharmony_ci  // CBU8_APPEND_UNSAFE can append up to 4 bytes.
666d528ed9Sopenharmony_ci  size_t char_offset = output->length();
676d528ed9Sopenharmony_ci  size_t original_char_offset = char_offset;
686d528ed9Sopenharmony_ci  output->resize(char_offset + CBU8_MAX_LENGTH);
696d528ed9Sopenharmony_ci
706d528ed9Sopenharmony_ci  CBU8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
716d528ed9Sopenharmony_ci
726d528ed9Sopenharmony_ci  // CBU8_APPEND_UNSAFE will advance our pointer past the inserted character, so
736d528ed9Sopenharmony_ci  // it will represent the new length of the string.
746d528ed9Sopenharmony_ci  output->resize(char_offset);
756d528ed9Sopenharmony_ci  return char_offset - original_char_offset;
766d528ed9Sopenharmony_ci}
776d528ed9Sopenharmony_ci
786d528ed9Sopenharmony_cisize_t WriteUnicodeCharacter(uint32_t code_point, std::u16string* output) {
796d528ed9Sopenharmony_ci  if (CBU16_LENGTH(code_point) == 1) {
806d528ed9Sopenharmony_ci    // Thie code point is in the Basic Multilingual Plane (BMP).
816d528ed9Sopenharmony_ci    output->push_back(static_cast<char16_t>(code_point));
826d528ed9Sopenharmony_ci    return 1;
836d528ed9Sopenharmony_ci  }
846d528ed9Sopenharmony_ci  // Non-BMP characters use a double-character encoding.
856d528ed9Sopenharmony_ci  size_t char_offset = output->length();
866d528ed9Sopenharmony_ci  output->resize(char_offset + CBU16_MAX_LENGTH);
876d528ed9Sopenharmony_ci  CBU16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
886d528ed9Sopenharmony_ci  return CBU16_MAX_LENGTH;
896d528ed9Sopenharmony_ci}
906d528ed9Sopenharmony_ci
916d528ed9Sopenharmony_ci// Generalized Unicode converter -----------------------------------------------
926d528ed9Sopenharmony_ci
// Empties |output| and reserves space for the UTF-8 conversion of the
// |src_len| code units at |src|. The reservation is only a guess based on the
// first code unit; it does not limit how much can be appended afterwards.
template <typename CHAR>
void PrepareForUTF8Output(const CHAR* src,
                          size_t src_len,
                          std::string* output) {
  output->clear();
  if (src_len != 0) {
    // An ASCII first unit suggests the whole input is ASCII (one byte per
    // unit); otherwise budget three bytes for every input unit.
    const bool starts_with_ascii = src[0] < 0x80;
    const size_t bytes_per_unit = starts_with_ascii ? 1 : 3;
    output->reserve(src_len * bytes_per_unit);
  }
}

// Explicit instantiation for the character type callers are known to use.
template void PrepareForUTF8Output(const char16_t*, size_t, std::string*);
1116d528ed9Sopenharmony_ci
// Empties |output| and reserves space for converting the |src_len| bytes of
// UTF-8 at |src| into a wider encoding. The reservation is only a guess based
// on the first byte; it does not limit how much can be appended afterwards.
template <typename STRING>
void PrepareForUTF16Or32Output(const char* src,
                               size_t src_len,
                               STRING* output) {
  output->clear();
  if (src_len == 0)
    return;

  // An ASCII first byte suggests an all-ASCII input, where each byte becomes
  // one output unit. Otherwise guess two input bytes per output character.
  const unsigned char lead_byte = static_cast<unsigned char>(src[0]);
  const size_t estimated_units = lead_byte < 0x80 ? src_len : src_len / 2;
  output->reserve(estimated_units);
}

// Explicit instantiation for the string type callers are known to use.
template void PrepareForUTF16Or32Output(const char*, size_t, std::u16string*);
1316d528ed9Sopenharmony_ci
1326d528ed9Sopenharmony_ci}  // namespace base
133