16d528ed9Sopenharmony_ci// Copyright (c) 2011 The Chromium Authors. All rights reserved.
26d528ed9Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be
36d528ed9Sopenharmony_ci// found in the LICENSE file.
46d528ed9Sopenharmony_ci
56d528ed9Sopenharmony_ci#include "base/strings/utf_offset_string_conversions.h"
66d528ed9Sopenharmony_ci
76d528ed9Sopenharmony_ci#include <stdint.h>
86d528ed9Sopenharmony_ci
96d528ed9Sopenharmony_ci#include <algorithm>
106d528ed9Sopenharmony_ci#include <memory>
116d528ed9Sopenharmony_ci#include <string_view>
126d528ed9Sopenharmony_ci
136d528ed9Sopenharmony_ci#include "base/logging.h"
146d528ed9Sopenharmony_ci#include "base/strings/utf_string_conversion_utils.h"
156d528ed9Sopenharmony_ci
166d528ed9Sopenharmony_cinamespace base {
176d528ed9Sopenharmony_ci
186d528ed9Sopenharmony_ciOffsetAdjuster::Adjustment::Adjustment(size_t original_offset,
196d528ed9Sopenharmony_ci                                       size_t original_length,
206d528ed9Sopenharmony_ci                                       size_t output_length)
216d528ed9Sopenharmony_ci    : original_offset(original_offset),
226d528ed9Sopenharmony_ci      original_length(original_length),
236d528ed9Sopenharmony_ci      output_length(output_length) {}
246d528ed9Sopenharmony_ci
256d528ed9Sopenharmony_ci// static
266d528ed9Sopenharmony_civoid OffsetAdjuster::AdjustOffsets(const Adjustments& adjustments,
276d528ed9Sopenharmony_ci                                   std::vector<size_t>* offsets_for_adjustment,
286d528ed9Sopenharmony_ci                                   size_t limit) {
296d528ed9Sopenharmony_ci  DCHECK(offsets_for_adjustment);
306d528ed9Sopenharmony_ci  for (std::vector<size_t>::iterator i(offsets_for_adjustment->begin());
316d528ed9Sopenharmony_ci       i != offsets_for_adjustment->end(); ++i)
326d528ed9Sopenharmony_ci    AdjustOffset(adjustments, &(*i), limit);
336d528ed9Sopenharmony_ci}
346d528ed9Sopenharmony_ci
356d528ed9Sopenharmony_ci// static
366d528ed9Sopenharmony_civoid OffsetAdjuster::AdjustOffset(const Adjustments& adjustments,
376d528ed9Sopenharmony_ci                                  size_t* offset,
386d528ed9Sopenharmony_ci                                  size_t limit) {
396d528ed9Sopenharmony_ci  DCHECK(offset);
406d528ed9Sopenharmony_ci  if (*offset == std::u16string::npos)
416d528ed9Sopenharmony_ci    return;
426d528ed9Sopenharmony_ci  int adjustment = 0;
436d528ed9Sopenharmony_ci  for (Adjustments::const_iterator i = adjustments.begin();
446d528ed9Sopenharmony_ci       i != adjustments.end(); ++i) {
456d528ed9Sopenharmony_ci    if (*offset <= i->original_offset)
466d528ed9Sopenharmony_ci      break;
476d528ed9Sopenharmony_ci    if (*offset < (i->original_offset + i->original_length)) {
486d528ed9Sopenharmony_ci      *offset = std::u16string::npos;
496d528ed9Sopenharmony_ci      return;
506d528ed9Sopenharmony_ci    }
516d528ed9Sopenharmony_ci    adjustment += static_cast<int>(i->original_length - i->output_length);
526d528ed9Sopenharmony_ci  }
536d528ed9Sopenharmony_ci  *offset -= adjustment;
546d528ed9Sopenharmony_ci
556d528ed9Sopenharmony_ci  if (*offset > limit)
566d528ed9Sopenharmony_ci    *offset = std::u16string::npos;
576d528ed9Sopenharmony_ci}
586d528ed9Sopenharmony_ci
596d528ed9Sopenharmony_ci// static
606d528ed9Sopenharmony_civoid OffsetAdjuster::UnadjustOffsets(
616d528ed9Sopenharmony_ci    const Adjustments& adjustments,
626d528ed9Sopenharmony_ci    std::vector<size_t>* offsets_for_unadjustment) {
636d528ed9Sopenharmony_ci  if (!offsets_for_unadjustment || adjustments.empty())
646d528ed9Sopenharmony_ci    return;
656d528ed9Sopenharmony_ci  for (std::vector<size_t>::iterator i(offsets_for_unadjustment->begin());
666d528ed9Sopenharmony_ci       i != offsets_for_unadjustment->end(); ++i)
676d528ed9Sopenharmony_ci    UnadjustOffset(adjustments, &(*i));
686d528ed9Sopenharmony_ci}
696d528ed9Sopenharmony_ci
706d528ed9Sopenharmony_ci// static
716d528ed9Sopenharmony_civoid OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments,
726d528ed9Sopenharmony_ci                                    size_t* offset) {
736d528ed9Sopenharmony_ci  if (*offset == std::u16string::npos)
746d528ed9Sopenharmony_ci    return;
756d528ed9Sopenharmony_ci  int adjustment = 0;
766d528ed9Sopenharmony_ci  for (Adjustments::const_iterator i = adjustments.begin();
776d528ed9Sopenharmony_ci       i != adjustments.end(); ++i) {
786d528ed9Sopenharmony_ci    if (*offset + adjustment <= i->original_offset)
796d528ed9Sopenharmony_ci      break;
806d528ed9Sopenharmony_ci    adjustment += static_cast<int>(i->original_length - i->output_length);
816d528ed9Sopenharmony_ci    if ((*offset + adjustment) < (i->original_offset + i->original_length)) {
826d528ed9Sopenharmony_ci      *offset = std::u16string::npos;
836d528ed9Sopenharmony_ci      return;
846d528ed9Sopenharmony_ci    }
856d528ed9Sopenharmony_ci  }
866d528ed9Sopenharmony_ci  *offset += adjustment;
876d528ed9Sopenharmony_ci}
886d528ed9Sopenharmony_ci
896d528ed9Sopenharmony_ci// static
906d528ed9Sopenharmony_civoid OffsetAdjuster::MergeSequentialAdjustments(
916d528ed9Sopenharmony_ci    const Adjustments& first_adjustments,
926d528ed9Sopenharmony_ci    Adjustments* adjustments_on_adjusted_string) {
936d528ed9Sopenharmony_ci  Adjustments::iterator adjusted_iter = adjustments_on_adjusted_string->begin();
946d528ed9Sopenharmony_ci  Adjustments::const_iterator first_iter = first_adjustments.begin();
956d528ed9Sopenharmony_ci  // Simultaneously iterate over all |adjustments_on_adjusted_string| and
966d528ed9Sopenharmony_ci  // |first_adjustments|, adding adjustments to or correcting the adjustments
976d528ed9Sopenharmony_ci  // in |adjustments_on_adjusted_string| as we go.  |shift| keeps track of the
986d528ed9Sopenharmony_ci  // current number of characters collapsed by |first_adjustments| up to this
996d528ed9Sopenharmony_ci  // point.  |currently_collapsing| keeps track of the number of characters
1006d528ed9Sopenharmony_ci  // collapsed by |first_adjustments| into the current |adjusted_iter|'s
1016d528ed9Sopenharmony_ci  // length.  These are characters that will change |shift| as soon as we're
1026d528ed9Sopenharmony_ci  // done processing the current |adjusted_iter|; they are not yet reflected in
1036d528ed9Sopenharmony_ci  // |shift|.
1046d528ed9Sopenharmony_ci  size_t shift = 0;
1056d528ed9Sopenharmony_ci  size_t currently_collapsing = 0;
1066d528ed9Sopenharmony_ci  while (adjusted_iter != adjustments_on_adjusted_string->end()) {
1076d528ed9Sopenharmony_ci    if ((first_iter == first_adjustments.end()) ||
1086d528ed9Sopenharmony_ci        ((adjusted_iter->original_offset + shift +
1096d528ed9Sopenharmony_ci          adjusted_iter->original_length) <= first_iter->original_offset)) {
1106d528ed9Sopenharmony_ci      // Entire |adjusted_iter| (accounting for its shift and including its
1116d528ed9Sopenharmony_ci      // whole original length) comes before |first_iter|.
1126d528ed9Sopenharmony_ci      //
1136d528ed9Sopenharmony_ci      // Correct the offset at |adjusted_iter| and move onto the next
1146d528ed9Sopenharmony_ci      // adjustment that needs revising.
1156d528ed9Sopenharmony_ci      adjusted_iter->original_offset += shift;
1166d528ed9Sopenharmony_ci      shift += currently_collapsing;
1176d528ed9Sopenharmony_ci      currently_collapsing = 0;
1186d528ed9Sopenharmony_ci      ++adjusted_iter;
1196d528ed9Sopenharmony_ci    } else if ((adjusted_iter->original_offset + shift) >
1206d528ed9Sopenharmony_ci               first_iter->original_offset) {
1216d528ed9Sopenharmony_ci      // |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|).
1226d528ed9Sopenharmony_ci
1236d528ed9Sopenharmony_ci      // It's not possible for the adjustments to overlap.  (It shouldn't
1246d528ed9Sopenharmony_ci      // be possible that we have an |adjusted_iter->original_offset| that,
1256d528ed9Sopenharmony_ci      // when adjusted by the computed |shift|, is in the middle of
1266d528ed9Sopenharmony_ci      // |first_iter|'s output's length.  After all, that would mean the
1276d528ed9Sopenharmony_ci      // current adjustment_on_adjusted_string somehow points to an offset
1286d528ed9Sopenharmony_ci      // that was supposed to have been eliminated by the first set of
1296d528ed9Sopenharmony_ci      // adjustments.)
1306d528ed9Sopenharmony_ci      DCHECK_LE(first_iter->original_offset + first_iter->output_length,
1316d528ed9Sopenharmony_ci                adjusted_iter->original_offset + shift);
1326d528ed9Sopenharmony_ci
1336d528ed9Sopenharmony_ci      // Add the |first_adjustment_iter| to the full set of adjustments while
1346d528ed9Sopenharmony_ci      // making sure |adjusted_iter| continues pointing to the same element.
1356d528ed9Sopenharmony_ci      // We do this by inserting the |first_adjustment_iter| right before
1366d528ed9Sopenharmony_ci      // |adjusted_iter|, then incrementing |adjusted_iter| so it points to
1376d528ed9Sopenharmony_ci      // the following element.
1386d528ed9Sopenharmony_ci      shift += first_iter->original_length - first_iter->output_length;
1396d528ed9Sopenharmony_ci      adjusted_iter =
1406d528ed9Sopenharmony_ci          adjustments_on_adjusted_string->insert(adjusted_iter, *first_iter);
1416d528ed9Sopenharmony_ci      ++adjusted_iter;
1426d528ed9Sopenharmony_ci      ++first_iter;
1436d528ed9Sopenharmony_ci    } else {
1446d528ed9Sopenharmony_ci      // The first adjustment adjusted something that then got further adjusted
1456d528ed9Sopenharmony_ci      // by the second set of adjustments.  In other words, |first_iter| points
1466d528ed9Sopenharmony_ci      // to something in the range covered by |adjusted_iter|'s length (after
1476d528ed9Sopenharmony_ci      // accounting for |shift|).  Precisely,
1486d528ed9Sopenharmony_ci      //   adjusted_iter->original_offset + shift
1496d528ed9Sopenharmony_ci      //   <=
1506d528ed9Sopenharmony_ci      //   first_iter->original_offset
1516d528ed9Sopenharmony_ci      //   <=
1526d528ed9Sopenharmony_ci      //   adjusted_iter->original_offset + shift +
1536d528ed9Sopenharmony_ci      //       adjusted_iter->original_length
1546d528ed9Sopenharmony_ci
1556d528ed9Sopenharmony_ci      // Modify the current |adjusted_iter| to include whatever collapsing
1566d528ed9Sopenharmony_ci      // happened in |first_iter|, then advance to the next |first_adjustments|
1576d528ed9Sopenharmony_ci      // because we dealt with the current one.
1586d528ed9Sopenharmony_ci      const int collapse = static_cast<int>(first_iter->original_length) -
1596d528ed9Sopenharmony_ci                           static_cast<int>(first_iter->output_length);
1606d528ed9Sopenharmony_ci      // This function does not know how to deal with a string that expands and
1616d528ed9Sopenharmony_ci      // then gets modified, only strings that collapse and then get modified.
1626d528ed9Sopenharmony_ci      DCHECK_GT(collapse, 0);
1636d528ed9Sopenharmony_ci      adjusted_iter->original_length += collapse;
1646d528ed9Sopenharmony_ci      currently_collapsing += collapse;
1656d528ed9Sopenharmony_ci      ++first_iter;
1666d528ed9Sopenharmony_ci    }
1676d528ed9Sopenharmony_ci  }
1686d528ed9Sopenharmony_ci  DCHECK_EQ(0u, currently_collapsing);
1696d528ed9Sopenharmony_ci  if (first_iter != first_adjustments.end()) {
1706d528ed9Sopenharmony_ci    // Only first adjustments are left.  These do not need to be modified.
1716d528ed9Sopenharmony_ci    // (Their offsets are already correct with respect to the original string.)
1726d528ed9Sopenharmony_ci    // Append them all.
1736d528ed9Sopenharmony_ci    DCHECK(adjusted_iter == adjustments_on_adjusted_string->end());
1746d528ed9Sopenharmony_ci    adjustments_on_adjusted_string->insert(
1756d528ed9Sopenharmony_ci        adjustments_on_adjusted_string->end(), first_iter,
1766d528ed9Sopenharmony_ci        first_adjustments.end());
1776d528ed9Sopenharmony_ci  }
1786d528ed9Sopenharmony_ci}
1796d528ed9Sopenharmony_ci
1806d528ed9Sopenharmony_ci// Converts the given source Unicode character type to the given destination
1816d528ed9Sopenharmony_ci// Unicode character type as a STL string. The given input buffer and size
1826d528ed9Sopenharmony_ci// determine the source, and the given output STL string will be replaced by
1836d528ed9Sopenharmony_ci// the result.  If non-NULL, |adjustments| is set to reflect the all the
1846d528ed9Sopenharmony_ci// alterations to the string that are not one-character-to-one-character.
1856d528ed9Sopenharmony_ci// It will always be sorted by increasing offset.
1866d528ed9Sopenharmony_citemplate <typename SrcChar, typename DestStdString>
1876d528ed9Sopenharmony_cibool ConvertUnicode(const SrcChar* src,
1886d528ed9Sopenharmony_ci                    size_t src_len,
1896d528ed9Sopenharmony_ci                    DestStdString* output,
1906d528ed9Sopenharmony_ci                    OffsetAdjuster::Adjustments* adjustments) {
1916d528ed9Sopenharmony_ci  if (adjustments)
1926d528ed9Sopenharmony_ci    adjustments->clear();
1936d528ed9Sopenharmony_ci  // ICU requires 32-bit numbers.
1946d528ed9Sopenharmony_ci  bool success = true;
1956d528ed9Sopenharmony_ci  int32_t src_len32 = static_cast<int32_t>(src_len);
1966d528ed9Sopenharmony_ci  for (int32_t i = 0; i < src_len32; i++) {
1976d528ed9Sopenharmony_ci    uint32_t code_point;
1986d528ed9Sopenharmony_ci    size_t original_i = i;
1996d528ed9Sopenharmony_ci    size_t chars_written = 0;
2006d528ed9Sopenharmony_ci    if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
2016d528ed9Sopenharmony_ci      chars_written = WriteUnicodeCharacter(code_point, output);
2026d528ed9Sopenharmony_ci    } else {
2036d528ed9Sopenharmony_ci      chars_written = WriteUnicodeCharacter(0xFFFD, output);
2046d528ed9Sopenharmony_ci      success = false;
2056d528ed9Sopenharmony_ci    }
2066d528ed9Sopenharmony_ci
2076d528ed9Sopenharmony_ci    // Only bother writing an adjustment if this modification changed the
2086d528ed9Sopenharmony_ci    // length of this character.
2096d528ed9Sopenharmony_ci    // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last
2106d528ed9Sopenharmony_ci    // character read, not after it (so that incrementing it in the loop
2116d528ed9Sopenharmony_ci    // increment will place it at the right location), so we need to account
2126d528ed9Sopenharmony_ci    // for that in determining the amount that was read.
2136d528ed9Sopenharmony_ci    if (adjustments && ((i - original_i + 1) != chars_written)) {
2146d528ed9Sopenharmony_ci      adjustments->push_back(OffsetAdjuster::Adjustment(
2156d528ed9Sopenharmony_ci          original_i, i - original_i + 1, chars_written));
2166d528ed9Sopenharmony_ci    }
2176d528ed9Sopenharmony_ci  }
2186d528ed9Sopenharmony_ci  return success;
2196d528ed9Sopenharmony_ci}
2206d528ed9Sopenharmony_ci
2216d528ed9Sopenharmony_cibool UTF8ToUTF16WithAdjustments(
2226d528ed9Sopenharmony_ci    const char* src,
2236d528ed9Sopenharmony_ci    size_t src_len,
2246d528ed9Sopenharmony_ci    std::u16string* output,
2256d528ed9Sopenharmony_ci    base::OffsetAdjuster::Adjustments* adjustments) {
2266d528ed9Sopenharmony_ci  PrepareForUTF16Or32Output(src, src_len, output);
2276d528ed9Sopenharmony_ci  return ConvertUnicode(src, src_len, output, adjustments);
2286d528ed9Sopenharmony_ci}
2296d528ed9Sopenharmony_ci
2306d528ed9Sopenharmony_cistd::u16string UTF8ToUTF16WithAdjustments(
2316d528ed9Sopenharmony_ci    std::string_view utf8,
2326d528ed9Sopenharmony_ci    base::OffsetAdjuster::Adjustments* adjustments) {
2336d528ed9Sopenharmony_ci  std::u16string result;
2346d528ed9Sopenharmony_ci  UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments);
2356d528ed9Sopenharmony_ci  return result;
2366d528ed9Sopenharmony_ci}
2376d528ed9Sopenharmony_ci
2386d528ed9Sopenharmony_cistd::u16string UTF8ToUTF16AndAdjustOffsets(
2396d528ed9Sopenharmony_ci    std::string_view utf8,
2406d528ed9Sopenharmony_ci    std::vector<size_t>* offsets_for_adjustment) {
2416d528ed9Sopenharmony_ci  for (size_t& offset : *offsets_for_adjustment) {
2426d528ed9Sopenharmony_ci    if (offset > utf8.length())
2436d528ed9Sopenharmony_ci      offset = std::u16string::npos;
2446d528ed9Sopenharmony_ci  }
2456d528ed9Sopenharmony_ci  OffsetAdjuster::Adjustments adjustments;
2466d528ed9Sopenharmony_ci  std::u16string result = UTF8ToUTF16WithAdjustments(utf8, &adjustments);
2476d528ed9Sopenharmony_ci  OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
2486d528ed9Sopenharmony_ci  return result;
2496d528ed9Sopenharmony_ci}
2506d528ed9Sopenharmony_ci
2516d528ed9Sopenharmony_cistd::string UTF16ToUTF8AndAdjustOffsets(
2526d528ed9Sopenharmony_ci    std::u16string_view utf16,
2536d528ed9Sopenharmony_ci    std::vector<size_t>* offsets_for_adjustment) {
2546d528ed9Sopenharmony_ci  for (size_t& offset : *offsets_for_adjustment) {
2556d528ed9Sopenharmony_ci    if (offset > utf16.length())
2566d528ed9Sopenharmony_ci      offset = std::u16string::npos;
2576d528ed9Sopenharmony_ci  }
2586d528ed9Sopenharmony_ci  std::string result;
2596d528ed9Sopenharmony_ci  PrepareForUTF8Output(utf16.data(), utf16.length(), &result);
2606d528ed9Sopenharmony_ci  OffsetAdjuster::Adjustments adjustments;
2616d528ed9Sopenharmony_ci  ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments);
2626d528ed9Sopenharmony_ci  OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
2636d528ed9Sopenharmony_ci  return result;
2646d528ed9Sopenharmony_ci}
2656d528ed9Sopenharmony_ci
2666d528ed9Sopenharmony_ci}  // namespace base
267