16d528ed9Sopenharmony_ci// Copyright (c) 2018 The Chromium Authors. All rights reserved. 26d528ed9Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be 36d528ed9Sopenharmony_ci// found in the LICENSE file. 46d528ed9Sopenharmony_ci 56d528ed9Sopenharmony_ci#include "base/strings/utf_string_conversions.h" 66d528ed9Sopenharmony_ci 76d528ed9Sopenharmony_ci#include <stdint.h> 86d528ed9Sopenharmony_ci 96d528ed9Sopenharmony_ci#include <string_view> 106d528ed9Sopenharmony_ci 116d528ed9Sopenharmony_ci#include "base/strings/string_util.h" 126d528ed9Sopenharmony_ci#include "base/strings/utf_string_conversion_utils.h" 136d528ed9Sopenharmony_ci#include "base/third_party/icu/icu_utf.h" 146d528ed9Sopenharmony_ci#include "util/build_config.h" 156d528ed9Sopenharmony_ci 166d528ed9Sopenharmony_cinamespace base { 176d528ed9Sopenharmony_ci 186d528ed9Sopenharmony_cinamespace { 196d528ed9Sopenharmony_ci 206d528ed9Sopenharmony_ciconstexpr int32_t kErrorCodePoint = 0xFFFD; 216d528ed9Sopenharmony_ci 226d528ed9Sopenharmony_ci// Size coefficient ---------------------------------------------------------- 236d528ed9Sopenharmony_ci// The maximum number of codeunits in the destination encoding corresponding to 246d528ed9Sopenharmony_ci// one codeunit in the source encoding. 256d528ed9Sopenharmony_ci 266d528ed9Sopenharmony_citemplate <typename SrcChar, typename DestChar> 276d528ed9Sopenharmony_cistruct SizeCoefficient { 286d528ed9Sopenharmony_ci static_assert(sizeof(SrcChar) < sizeof(DestChar), 296d528ed9Sopenharmony_ci "Default case: from a smaller encoding to the bigger one"); 306d528ed9Sopenharmony_ci 316d528ed9Sopenharmony_ci // ASCII symbols are encoded by one codeunit in all encodings. 326d528ed9Sopenharmony_ci static constexpr int value = 1; 336d528ed9Sopenharmony_ci}; 346d528ed9Sopenharmony_ci 356d528ed9Sopenharmony_citemplate <> 366d528ed9Sopenharmony_cistruct SizeCoefficient<char16_t, char> { 376d528ed9Sopenharmony_ci // One UTF-16 codeunit corresponds to at most 3 codeunits in UTF-8. 386d528ed9Sopenharmony_ci static constexpr int value = 3; 396d528ed9Sopenharmony_ci}; 406d528ed9Sopenharmony_ci 416d528ed9Sopenharmony_citemplate <typename SrcChar, typename DestChar> 426d528ed9Sopenharmony_ciconstexpr int size_coefficient_v = 436d528ed9Sopenharmony_ci SizeCoefficient<std::decay_t<SrcChar>, std::decay_t<DestChar>>::value; 446d528ed9Sopenharmony_ci 456d528ed9Sopenharmony_ci// UnicodeAppendUnsafe -------------------------------------------------------- 466d528ed9Sopenharmony_ci// Function overloads that write code_point to the output string. Output string 476d528ed9Sopenharmony_ci// has to have enough space for the codepoint. 486d528ed9Sopenharmony_ci 496d528ed9Sopenharmony_civoid UnicodeAppendUnsafe(char* out, int32_t* size, uint32_t code_point) { 506d528ed9Sopenharmony_ci CBU8_APPEND_UNSAFE(out, *size, code_point); 516d528ed9Sopenharmony_ci} 526d528ed9Sopenharmony_ci 536d528ed9Sopenharmony_civoid UnicodeAppendUnsafe(char16_t* out, int32_t* size, uint32_t code_point) { 546d528ed9Sopenharmony_ci CBU16_APPEND_UNSAFE(out, *size, code_point); 556d528ed9Sopenharmony_ci} 566d528ed9Sopenharmony_ci 576d528ed9Sopenharmony_ci// DoUTFConversion ------------------------------------------------------------ 586d528ed9Sopenharmony_ci// Main driver of UTFConversion specialized for different Src encodings. 596d528ed9Sopenharmony_ci// dest has to have enough room for the converted text. 606d528ed9Sopenharmony_ci 616d528ed9Sopenharmony_citemplate <typename DestChar> 626d528ed9Sopenharmony_cibool DoUTFConversion(const char* src, 636d528ed9Sopenharmony_ci int32_t src_len, 646d528ed9Sopenharmony_ci DestChar* dest, 656d528ed9Sopenharmony_ci int32_t* dest_len) { 666d528ed9Sopenharmony_ci bool success = true; 676d528ed9Sopenharmony_ci 686d528ed9Sopenharmony_ci for (int32_t i = 0; i < src_len;) { 696d528ed9Sopenharmony_ci int32_t code_point; 706d528ed9Sopenharmony_ci CBU8_NEXT(src, i, src_len, code_point); 716d528ed9Sopenharmony_ci 726d528ed9Sopenharmony_ci if (!IsValidCodepoint(code_point)) { 736d528ed9Sopenharmony_ci success = false; 746d528ed9Sopenharmony_ci code_point = kErrorCodePoint; 756d528ed9Sopenharmony_ci } 766d528ed9Sopenharmony_ci 776d528ed9Sopenharmony_ci UnicodeAppendUnsafe(dest, dest_len, code_point); 786d528ed9Sopenharmony_ci } 796d528ed9Sopenharmony_ci 806d528ed9Sopenharmony_ci return success; 816d528ed9Sopenharmony_ci} 826d528ed9Sopenharmony_ci 836d528ed9Sopenharmony_citemplate <typename DestChar> 846d528ed9Sopenharmony_cibool DoUTFConversion(const char16_t* src, 856d528ed9Sopenharmony_ci int32_t src_len, 866d528ed9Sopenharmony_ci DestChar* dest, 876d528ed9Sopenharmony_ci int32_t* dest_len) { 886d528ed9Sopenharmony_ci bool success = true; 896d528ed9Sopenharmony_ci 906d528ed9Sopenharmony_ci auto ConvertSingleChar = [&success](char16_t in) -> int32_t { 916d528ed9Sopenharmony_ci if (!CBU16_IS_SINGLE(in) || !IsValidCodepoint(in)) { 926d528ed9Sopenharmony_ci success = false; 936d528ed9Sopenharmony_ci return kErrorCodePoint; 946d528ed9Sopenharmony_ci } 956d528ed9Sopenharmony_ci return in; 966d528ed9Sopenharmony_ci }; 976d528ed9Sopenharmony_ci 986d528ed9Sopenharmony_ci int32_t i = 0; 996d528ed9Sopenharmony_ci 1006d528ed9Sopenharmony_ci // Always have another symbol in order to avoid checking boundaries in the 1016d528ed9Sopenharmony_ci // middle of the surrogate pair. 1026d528ed9Sopenharmony_ci while (i < src_len - 1) { 1036d528ed9Sopenharmony_ci int32_t code_point; 1046d528ed9Sopenharmony_ci 1056d528ed9Sopenharmony_ci if (CBU16_IS_LEAD(src[i]) && CBU16_IS_TRAIL(src[i + 1])) { 1066d528ed9Sopenharmony_ci code_point = CBU16_GET_SUPPLEMENTARY(src[i], src[i + 1]); 1076d528ed9Sopenharmony_ci if (!IsValidCodepoint(code_point)) { 1086d528ed9Sopenharmony_ci code_point = kErrorCodePoint; 1096d528ed9Sopenharmony_ci success = false; 1106d528ed9Sopenharmony_ci } 1116d528ed9Sopenharmony_ci i += 2; 1126d528ed9Sopenharmony_ci } else { 1136d528ed9Sopenharmony_ci code_point = ConvertSingleChar(src[i]); 1146d528ed9Sopenharmony_ci ++i; 1156d528ed9Sopenharmony_ci } 1166d528ed9Sopenharmony_ci 1176d528ed9Sopenharmony_ci UnicodeAppendUnsafe(dest, dest_len, code_point); 1186d528ed9Sopenharmony_ci } 1196d528ed9Sopenharmony_ci 1206d528ed9Sopenharmony_ci if (i < src_len) 1216d528ed9Sopenharmony_ci UnicodeAppendUnsafe(dest, dest_len, ConvertSingleChar(src[i])); 1226d528ed9Sopenharmony_ci 1236d528ed9Sopenharmony_ci return success; 1246d528ed9Sopenharmony_ci} 1256d528ed9Sopenharmony_ci 1266d528ed9Sopenharmony_ci// UTFConversion -------------------------------------------------------------- 1276d528ed9Sopenharmony_ci// Function template for generating all UTF conversions. 1286d528ed9Sopenharmony_ci 1296d528ed9Sopenharmony_citemplate <typename InputString, typename DestString> 1306d528ed9Sopenharmony_cibool UTFConversion(const InputString& src_str, DestString* dest_str) { 1316d528ed9Sopenharmony_ci if (IsStringASCII(src_str)) { 1326d528ed9Sopenharmony_ci dest_str->assign(src_str.begin(), src_str.end()); 1336d528ed9Sopenharmony_ci return true; 1346d528ed9Sopenharmony_ci } 1356d528ed9Sopenharmony_ci 1366d528ed9Sopenharmony_ci dest_str->resize(src_str.length() * 1376d528ed9Sopenharmony_ci size_coefficient_v<typename InputString::value_type, 1386d528ed9Sopenharmony_ci typename DestString::value_type>); 1396d528ed9Sopenharmony_ci 1406d528ed9Sopenharmony_ci // Empty string is ASCII => it OK to call operator[]. 1416d528ed9Sopenharmony_ci auto* dest = &(*dest_str)[0]; 1426d528ed9Sopenharmony_ci 1436d528ed9Sopenharmony_ci // ICU requires 32 bit numbers. 1446d528ed9Sopenharmony_ci int32_t src_len32 = static_cast<int32_t>(src_str.length()); 1456d528ed9Sopenharmony_ci int32_t dest_len32 = 0; 1466d528ed9Sopenharmony_ci 1476d528ed9Sopenharmony_ci bool res = DoUTFConversion(src_str.data(), src_len32, dest, &dest_len32); 1486d528ed9Sopenharmony_ci 1496d528ed9Sopenharmony_ci dest_str->resize(dest_len32); 1506d528ed9Sopenharmony_ci dest_str->shrink_to_fit(); 1516d528ed9Sopenharmony_ci 1526d528ed9Sopenharmony_ci return res; 1536d528ed9Sopenharmony_ci} 1546d528ed9Sopenharmony_ci 1556d528ed9Sopenharmony_ci} // namespace 1566d528ed9Sopenharmony_ci 1576d528ed9Sopenharmony_ci// UTF16 <-> UTF8 -------------------------------------------------------------- 1586d528ed9Sopenharmony_ci 1596d528ed9Sopenharmony_cibool UTF8ToUTF16(const char* src, size_t src_len, std::u16string* output) { 1606d528ed9Sopenharmony_ci return UTFConversion(std::string_view(src, src_len), output); 1616d528ed9Sopenharmony_ci} 1626d528ed9Sopenharmony_ci 1636d528ed9Sopenharmony_cistd::u16string UTF8ToUTF16(std::string_view utf8) { 1646d528ed9Sopenharmony_ci std::u16string ret; 1656d528ed9Sopenharmony_ci // Ignore the success flag of this call, it will do the best it can for 1666d528ed9Sopenharmony_ci // invalid input, which is what we want here. 1676d528ed9Sopenharmony_ci UTF8ToUTF16(utf8.data(), utf8.size(), &ret); 1686d528ed9Sopenharmony_ci return ret; 1696d528ed9Sopenharmony_ci} 1706d528ed9Sopenharmony_ci 1716d528ed9Sopenharmony_cibool UTF16ToUTF8(const char16_t* src, size_t src_len, std::string* output) { 1726d528ed9Sopenharmony_ci return UTFConversion(std::u16string_view(src, src_len), output); 1736d528ed9Sopenharmony_ci} 1746d528ed9Sopenharmony_ci 1756d528ed9Sopenharmony_cistd::string UTF16ToUTF8(std::u16string_view utf16) { 1766d528ed9Sopenharmony_ci std::string ret; 1776d528ed9Sopenharmony_ci // Ignore the success flag of this call, it will do the best it can for 1786d528ed9Sopenharmony_ci // invalid input, which is what we want here. 1796d528ed9Sopenharmony_ci UTF16ToUTF8(utf16.data(), utf16.length(), &ret); 1806d528ed9Sopenharmony_ci return ret; 1816d528ed9Sopenharmony_ci} 1826d528ed9Sopenharmony_ci 1836d528ed9Sopenharmony_ci// ASCII <-> UTF-16 ----------------------------------------------------------- 1846d528ed9Sopenharmony_ci 1856d528ed9Sopenharmony_cistd::u16string ASCIIToUTF16(std::string_view ascii) { 1866d528ed9Sopenharmony_ci DCHECK(IsStringASCII(ascii)) << ascii; 1876d528ed9Sopenharmony_ci return std::u16string(ascii.begin(), ascii.end()); 1886d528ed9Sopenharmony_ci} 1896d528ed9Sopenharmony_ci 1906d528ed9Sopenharmony_cistd::string UTF16ToASCII(std::u16string_view utf16) { 1916d528ed9Sopenharmony_ci DCHECK(IsStringASCII(utf16)) << UTF16ToUTF8(utf16); 1926d528ed9Sopenharmony_ci return std::string(utf16.begin(), utf16.end()); 1936d528ed9Sopenharmony_ci} 1946d528ed9Sopenharmony_ci 1956d528ed9Sopenharmony_ci} // namespace base 196