16d528ed9Sopenharmony_ci// Copyright (c) 2011 The Chromium Authors. All rights reserved. 26d528ed9Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be 36d528ed9Sopenharmony_ci// found in the LICENSE file. 46d528ed9Sopenharmony_ci 56d528ed9Sopenharmony_ci#include "base/strings/utf_offset_string_conversions.h" 66d528ed9Sopenharmony_ci 76d528ed9Sopenharmony_ci#include <stdint.h> 86d528ed9Sopenharmony_ci 96d528ed9Sopenharmony_ci#include <algorithm> 106d528ed9Sopenharmony_ci#include <memory> 116d528ed9Sopenharmony_ci#include <string_view> 126d528ed9Sopenharmony_ci 136d528ed9Sopenharmony_ci#include "base/logging.h" 146d528ed9Sopenharmony_ci#include "base/strings/utf_string_conversion_utils.h" 156d528ed9Sopenharmony_ci 166d528ed9Sopenharmony_cinamespace base { 176d528ed9Sopenharmony_ci 186d528ed9Sopenharmony_ciOffsetAdjuster::Adjustment::Adjustment(size_t original_offset, 196d528ed9Sopenharmony_ci size_t original_length, 206d528ed9Sopenharmony_ci size_t output_length) 216d528ed9Sopenharmony_ci : original_offset(original_offset), 226d528ed9Sopenharmony_ci original_length(original_length), 236d528ed9Sopenharmony_ci output_length(output_length) {} 246d528ed9Sopenharmony_ci 256d528ed9Sopenharmony_ci// static 266d528ed9Sopenharmony_civoid OffsetAdjuster::AdjustOffsets(const Adjustments& adjustments, 276d528ed9Sopenharmony_ci std::vector<size_t>* offsets_for_adjustment, 286d528ed9Sopenharmony_ci size_t limit) { 296d528ed9Sopenharmony_ci DCHECK(offsets_for_adjustment); 306d528ed9Sopenharmony_ci for (std::vector<size_t>::iterator i(offsets_for_adjustment->begin()); 316d528ed9Sopenharmony_ci i != offsets_for_adjustment->end(); ++i) 326d528ed9Sopenharmony_ci AdjustOffset(adjustments, &(*i), limit); 336d528ed9Sopenharmony_ci} 346d528ed9Sopenharmony_ci 356d528ed9Sopenharmony_ci// static 366d528ed9Sopenharmony_civoid OffsetAdjuster::AdjustOffset(const Adjustments& adjustments, 376d528ed9Sopenharmony_ci size_t* offset, 386d528ed9Sopenharmony_ci size_t limit) { 396d528ed9Sopenharmony_ci DCHECK(offset); 406d528ed9Sopenharmony_ci if (*offset == std::u16string::npos) 416d528ed9Sopenharmony_ci return; 426d528ed9Sopenharmony_ci int adjustment = 0; 436d528ed9Sopenharmony_ci for (Adjustments::const_iterator i = adjustments.begin(); 446d528ed9Sopenharmony_ci i != adjustments.end(); ++i) { 456d528ed9Sopenharmony_ci if (*offset <= i->original_offset) 466d528ed9Sopenharmony_ci break; 476d528ed9Sopenharmony_ci if (*offset < (i->original_offset + i->original_length)) { 486d528ed9Sopenharmony_ci *offset = std::u16string::npos; 496d528ed9Sopenharmony_ci return; 506d528ed9Sopenharmony_ci } 516d528ed9Sopenharmony_ci adjustment += static_cast<int>(i->original_length - i->output_length); 526d528ed9Sopenharmony_ci } 536d528ed9Sopenharmony_ci *offset -= adjustment; 546d528ed9Sopenharmony_ci 556d528ed9Sopenharmony_ci if (*offset > limit) 566d528ed9Sopenharmony_ci *offset = std::u16string::npos; 576d528ed9Sopenharmony_ci} 586d528ed9Sopenharmony_ci 596d528ed9Sopenharmony_ci// static 606d528ed9Sopenharmony_civoid OffsetAdjuster::UnadjustOffsets( 616d528ed9Sopenharmony_ci const Adjustments& adjustments, 626d528ed9Sopenharmony_ci std::vector<size_t>* offsets_for_unadjustment) { 636d528ed9Sopenharmony_ci if (!offsets_for_unadjustment || adjustments.empty()) 646d528ed9Sopenharmony_ci return; 656d528ed9Sopenharmony_ci for (std::vector<size_t>::iterator i(offsets_for_unadjustment->begin()); 666d528ed9Sopenharmony_ci i != offsets_for_unadjustment->end(); ++i) 676d528ed9Sopenharmony_ci UnadjustOffset(adjustments, &(*i)); 686d528ed9Sopenharmony_ci} 696d528ed9Sopenharmony_ci 706d528ed9Sopenharmony_ci// static 716d528ed9Sopenharmony_civoid OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments, 726d528ed9Sopenharmony_ci size_t* offset) { 736d528ed9Sopenharmony_ci if (*offset == std::u16string::npos) 746d528ed9Sopenharmony_ci return; 756d528ed9Sopenharmony_ci int adjustment = 0; 766d528ed9Sopenharmony_ci for (Adjustments::const_iterator i = adjustments.begin(); 776d528ed9Sopenharmony_ci i != adjustments.end(); ++i) { 786d528ed9Sopenharmony_ci if (*offset + adjustment <= i->original_offset) 796d528ed9Sopenharmony_ci break; 806d528ed9Sopenharmony_ci adjustment += static_cast<int>(i->original_length - i->output_length); 816d528ed9Sopenharmony_ci if ((*offset + adjustment) < (i->original_offset + i->original_length)) { 826d528ed9Sopenharmony_ci *offset = std::u16string::npos; 836d528ed9Sopenharmony_ci return; 846d528ed9Sopenharmony_ci } 856d528ed9Sopenharmony_ci } 866d528ed9Sopenharmony_ci *offset += adjustment; 876d528ed9Sopenharmony_ci} 886d528ed9Sopenharmony_ci 896d528ed9Sopenharmony_ci// static 906d528ed9Sopenharmony_civoid OffsetAdjuster::MergeSequentialAdjustments( 916d528ed9Sopenharmony_ci const Adjustments& first_adjustments, 926d528ed9Sopenharmony_ci Adjustments* adjustments_on_adjusted_string) { 936d528ed9Sopenharmony_ci Adjustments::iterator adjusted_iter = adjustments_on_adjusted_string->begin(); 946d528ed9Sopenharmony_ci Adjustments::const_iterator first_iter = first_adjustments.begin(); 956d528ed9Sopenharmony_ci // Simultaneously iterate over all |adjustments_on_adjusted_string| and 966d528ed9Sopenharmony_ci // |first_adjustments|, adding adjustments to or correcting the adjustments 976d528ed9Sopenharmony_ci // in |adjustments_on_adjusted_string| as we go. |shift| keeps track of the 986d528ed9Sopenharmony_ci // current number of characters collapsed by |first_adjustments| up to this 996d528ed9Sopenharmony_ci // point. |currently_collapsing| keeps track of the number of characters 1006d528ed9Sopenharmony_ci // collapsed by |first_adjustments| into the current |adjusted_iter|'s 1016d528ed9Sopenharmony_ci // length. These are characters that will change |shift| as soon as we're 1026d528ed9Sopenharmony_ci // done processing the current |adjusted_iter|; they are not yet reflected in 1036d528ed9Sopenharmony_ci // |shift|. 1046d528ed9Sopenharmony_ci size_t shift = 0; 1056d528ed9Sopenharmony_ci size_t currently_collapsing = 0; 1066d528ed9Sopenharmony_ci while (adjusted_iter != adjustments_on_adjusted_string->end()) { 1076d528ed9Sopenharmony_ci if ((first_iter == first_adjustments.end()) || 1086d528ed9Sopenharmony_ci ((adjusted_iter->original_offset + shift + 1096d528ed9Sopenharmony_ci adjusted_iter->original_length) <= first_iter->original_offset)) { 1106d528ed9Sopenharmony_ci // Entire |adjusted_iter| (accounting for its shift and including its 1116d528ed9Sopenharmony_ci // whole original length) comes before |first_iter|. 1126d528ed9Sopenharmony_ci // 1136d528ed9Sopenharmony_ci // Correct the offset at |adjusted_iter| and move onto the next 1146d528ed9Sopenharmony_ci // adjustment that needs revising. 1156d528ed9Sopenharmony_ci adjusted_iter->original_offset += shift; 1166d528ed9Sopenharmony_ci shift += currently_collapsing; 1176d528ed9Sopenharmony_ci currently_collapsing = 0; 1186d528ed9Sopenharmony_ci ++adjusted_iter; 1196d528ed9Sopenharmony_ci } else if ((adjusted_iter->original_offset + shift) > 1206d528ed9Sopenharmony_ci first_iter->original_offset) { 1216d528ed9Sopenharmony_ci // |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|). 1226d528ed9Sopenharmony_ci 1236d528ed9Sopenharmony_ci // It's not possible for the adjustments to overlap. (It shouldn't 1246d528ed9Sopenharmony_ci // be possible that we have an |adjusted_iter->original_offset| that, 1256d528ed9Sopenharmony_ci // when adjusted by the computed |shift|, is in the middle of 1266d528ed9Sopenharmony_ci // |first_iter|'s output's length. After all, that would mean the 1276d528ed9Sopenharmony_ci // current adjustment_on_adjusted_string somehow points to an offset 1286d528ed9Sopenharmony_ci // that was supposed to have been eliminated by the first set of 1296d528ed9Sopenharmony_ci // adjustments.) 1306d528ed9Sopenharmony_ci DCHECK_LE(first_iter->original_offset + first_iter->output_length, 1316d528ed9Sopenharmony_ci adjusted_iter->original_offset + shift); 1326d528ed9Sopenharmony_ci 1336d528ed9Sopenharmony_ci // Add the |first_adjustment_iter| to the full set of adjustments while 1346d528ed9Sopenharmony_ci // making sure |adjusted_iter| continues pointing to the same element. 1356d528ed9Sopenharmony_ci // We do this by inserting the |first_adjustment_iter| right before 1366d528ed9Sopenharmony_ci // |adjusted_iter|, then incrementing |adjusted_iter| so it points to 1376d528ed9Sopenharmony_ci // the following element. 1386d528ed9Sopenharmony_ci shift += first_iter->original_length - first_iter->output_length; 1396d528ed9Sopenharmony_ci adjusted_iter = 1406d528ed9Sopenharmony_ci adjustments_on_adjusted_string->insert(adjusted_iter, *first_iter); 1416d528ed9Sopenharmony_ci ++adjusted_iter; 1426d528ed9Sopenharmony_ci ++first_iter; 1436d528ed9Sopenharmony_ci } else { 1446d528ed9Sopenharmony_ci // The first adjustment adjusted something that then got further adjusted 1456d528ed9Sopenharmony_ci // by the second set of adjustments. In other words, |first_iter| points 1466d528ed9Sopenharmony_ci // to something in the range covered by |adjusted_iter|'s length (after 1476d528ed9Sopenharmony_ci // accounting for |shift|). Precisely, 1486d528ed9Sopenharmony_ci // adjusted_iter->original_offset + shift 1496d528ed9Sopenharmony_ci // <= 1506d528ed9Sopenharmony_ci // first_iter->original_offset 1516d528ed9Sopenharmony_ci // <= 1526d528ed9Sopenharmony_ci // adjusted_iter->original_offset + shift + 1536d528ed9Sopenharmony_ci // adjusted_iter->original_length 1546d528ed9Sopenharmony_ci 1556d528ed9Sopenharmony_ci // Modify the current |adjusted_iter| to include whatever collapsing 1566d528ed9Sopenharmony_ci // happened in |first_iter|, then advance to the next |first_adjustments| 1576d528ed9Sopenharmony_ci // because we dealt with the current one. 1586d528ed9Sopenharmony_ci const int collapse = static_cast<int>(first_iter->original_length) - 1596d528ed9Sopenharmony_ci static_cast<int>(first_iter->output_length); 1606d528ed9Sopenharmony_ci // This function does not know how to deal with a string that expands and 1616d528ed9Sopenharmony_ci // then gets modified, only strings that collapse and then get modified. 1626d528ed9Sopenharmony_ci DCHECK_GT(collapse, 0); 1636d528ed9Sopenharmony_ci adjusted_iter->original_length += collapse; 1646d528ed9Sopenharmony_ci currently_collapsing += collapse; 1656d528ed9Sopenharmony_ci ++first_iter; 1666d528ed9Sopenharmony_ci } 1676d528ed9Sopenharmony_ci } 1686d528ed9Sopenharmony_ci DCHECK_EQ(0u, currently_collapsing); 1696d528ed9Sopenharmony_ci if (first_iter != first_adjustments.end()) { 1706d528ed9Sopenharmony_ci // Only first adjustments are left. These do not need to be modified. 1716d528ed9Sopenharmony_ci // (Their offsets are already correct with respect to the original string.) 1726d528ed9Sopenharmony_ci // Append them all. 1736d528ed9Sopenharmony_ci DCHECK(adjusted_iter == adjustments_on_adjusted_string->end()); 1746d528ed9Sopenharmony_ci adjustments_on_adjusted_string->insert( 1756d528ed9Sopenharmony_ci adjustments_on_adjusted_string->end(), first_iter, 1766d528ed9Sopenharmony_ci first_adjustments.end()); 1776d528ed9Sopenharmony_ci } 1786d528ed9Sopenharmony_ci} 1796d528ed9Sopenharmony_ci 1806d528ed9Sopenharmony_ci// Converts the given source Unicode character type to the given destination 1816d528ed9Sopenharmony_ci// Unicode character type as a STL string. The given input buffer and size 1826d528ed9Sopenharmony_ci// determine the source, and the given output STL string will be replaced by 1836d528ed9Sopenharmony_ci// the result. If non-NULL, |adjustments| is set to reflect the all the 1846d528ed9Sopenharmony_ci// alterations to the string that are not one-character-to-one-character. 1856d528ed9Sopenharmony_ci// It will always be sorted by increasing offset. 1866d528ed9Sopenharmony_citemplate <typename SrcChar, typename DestStdString> 1876d528ed9Sopenharmony_cibool ConvertUnicode(const SrcChar* src, 1886d528ed9Sopenharmony_ci size_t src_len, 1896d528ed9Sopenharmony_ci DestStdString* output, 1906d528ed9Sopenharmony_ci OffsetAdjuster::Adjustments* adjustments) { 1916d528ed9Sopenharmony_ci if (adjustments) 1926d528ed9Sopenharmony_ci adjustments->clear(); 1936d528ed9Sopenharmony_ci // ICU requires 32-bit numbers. 1946d528ed9Sopenharmony_ci bool success = true; 1956d528ed9Sopenharmony_ci int32_t src_len32 = static_cast<int32_t>(src_len); 1966d528ed9Sopenharmony_ci for (int32_t i = 0; i < src_len32; i++) { 1976d528ed9Sopenharmony_ci uint32_t code_point; 1986d528ed9Sopenharmony_ci size_t original_i = i; 1996d528ed9Sopenharmony_ci size_t chars_written = 0; 2006d528ed9Sopenharmony_ci if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { 2016d528ed9Sopenharmony_ci chars_written = WriteUnicodeCharacter(code_point, output); 2026d528ed9Sopenharmony_ci } else { 2036d528ed9Sopenharmony_ci chars_written = WriteUnicodeCharacter(0xFFFD, output); 2046d528ed9Sopenharmony_ci success = false; 2056d528ed9Sopenharmony_ci } 2066d528ed9Sopenharmony_ci 2076d528ed9Sopenharmony_ci // Only bother writing an adjustment if this modification changed the 2086d528ed9Sopenharmony_ci // length of this character. 2096d528ed9Sopenharmony_ci // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last 2106d528ed9Sopenharmony_ci // character read, not after it (so that incrementing it in the loop 2116d528ed9Sopenharmony_ci // increment will place it at the right location), so we need to account 2126d528ed9Sopenharmony_ci // for that in determining the amount that was read. 2136d528ed9Sopenharmony_ci if (adjustments && ((i - original_i + 1) != chars_written)) { 2146d528ed9Sopenharmony_ci adjustments->push_back(OffsetAdjuster::Adjustment( 2156d528ed9Sopenharmony_ci original_i, i - original_i + 1, chars_written)); 2166d528ed9Sopenharmony_ci } 2176d528ed9Sopenharmony_ci } 2186d528ed9Sopenharmony_ci return success; 2196d528ed9Sopenharmony_ci} 2206d528ed9Sopenharmony_ci 2216d528ed9Sopenharmony_cibool UTF8ToUTF16WithAdjustments( 2226d528ed9Sopenharmony_ci const char* src, 2236d528ed9Sopenharmony_ci size_t src_len, 2246d528ed9Sopenharmony_ci std::u16string* output, 2256d528ed9Sopenharmony_ci base::OffsetAdjuster::Adjustments* adjustments) { 2266d528ed9Sopenharmony_ci PrepareForUTF16Or32Output(src, src_len, output); 2276d528ed9Sopenharmony_ci return ConvertUnicode(src, src_len, output, adjustments); 2286d528ed9Sopenharmony_ci} 2296d528ed9Sopenharmony_ci 2306d528ed9Sopenharmony_cistd::u16string UTF8ToUTF16WithAdjustments( 2316d528ed9Sopenharmony_ci std::string_view utf8, 2326d528ed9Sopenharmony_ci base::OffsetAdjuster::Adjustments* adjustments) { 2336d528ed9Sopenharmony_ci std::u16string result; 2346d528ed9Sopenharmony_ci UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments); 2356d528ed9Sopenharmony_ci return result; 2366d528ed9Sopenharmony_ci} 2376d528ed9Sopenharmony_ci 2386d528ed9Sopenharmony_cistd::u16string UTF8ToUTF16AndAdjustOffsets( 2396d528ed9Sopenharmony_ci std::string_view utf8, 2406d528ed9Sopenharmony_ci std::vector<size_t>* offsets_for_adjustment) { 2416d528ed9Sopenharmony_ci for (size_t& offset : *offsets_for_adjustment) { 2426d528ed9Sopenharmony_ci if (offset > utf8.length()) 2436d528ed9Sopenharmony_ci offset = std::u16string::npos; 2446d528ed9Sopenharmony_ci } 2456d528ed9Sopenharmony_ci OffsetAdjuster::Adjustments adjustments; 2466d528ed9Sopenharmony_ci std::u16string result = UTF8ToUTF16WithAdjustments(utf8, &adjustments); 2476d528ed9Sopenharmony_ci OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment); 2486d528ed9Sopenharmony_ci return result; 2496d528ed9Sopenharmony_ci} 2506d528ed9Sopenharmony_ci 2516d528ed9Sopenharmony_cistd::string UTF16ToUTF8AndAdjustOffsets( 2526d528ed9Sopenharmony_ci std::u16string_view utf16, 2536d528ed9Sopenharmony_ci std::vector<size_t>* offsets_for_adjustment) { 2546d528ed9Sopenharmony_ci for (size_t& offset : *offsets_for_adjustment) { 2556d528ed9Sopenharmony_ci if (offset > utf16.length()) 2566d528ed9Sopenharmony_ci offset = std::u16string::npos; 2576d528ed9Sopenharmony_ci } 2586d528ed9Sopenharmony_ci std::string result; 2596d528ed9Sopenharmony_ci PrepareForUTF8Output(utf16.data(), utf16.length(), &result); 2606d528ed9Sopenharmony_ci OffsetAdjuster::Adjustments adjustments; 2616d528ed9Sopenharmony_ci ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments); 2626d528ed9Sopenharmony_ci OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment); 2636d528ed9Sopenharmony_ci return result; 2646d528ed9Sopenharmony_ci} 2656d528ed9Sopenharmony_ci 2666d528ed9Sopenharmony_ci} // namespace base 267