1/* 2* Copyright 2020 Google Inc. 3* 4* Use of this source code is governed by a BSD-style license that can be 5* found in the LICENSE file. 6*/ 7 8#include "include/core/SkString.h" 9#include "include/core/SkTypes.h" 10#include "include/private/SkBitmaskEnum.h" 11#include "include/private/SkMutex.h" 12#include "include/private/SkOnce.h" 13#include "include/private/SkTArray.h" 14#include "include/private/SkTemplates.h" 15#include "include/private/SkTo.h" 16#include "modules/skunicode/include/SkUnicode.h" 17#include "modules/skunicode/src/SkUnicode_icu.h" 18#include "modules/skunicode/src/SkUnicode_icu_bidi.h" 19#include "src/utils/SkUTF.h" 20#include "include/private/SkTHash.h" 21#include <unicode/umachine.h> 22#include <functional> 23#include <string> 24#include <utility> 25#include <vector> 26 27#if defined(SK_USING_THIRD_PARTY_ICU) 28#include "SkLoadICU.h" 29#endif 30 31static const SkICULib* ICULib() { 32 static const auto gICU = SkLoadICULib(); 33 34 return gICU.get(); 35} 36 37// sk_* wrappers for ICU funcs 38#define SKICU_FUNC(funcname) \ 39 template <typename... Args> \ 40 auto sk_##funcname(Args&&... args) -> decltype(funcname(std::forward<Args>(args)...)) { \ 41 return ICULib()->f_##funcname(std::forward<Args>(args)...); \ 42 } \ 43 44SKICU_EMIT_FUNCS 45#undef SKICU_FUNC 46 47const char* SkUnicode_IcuBidi::errorName(UErrorCode status) { 48 return sk_u_errorName(status); 49} 50 51void SkUnicode_IcuBidi::bidi_close(UBiDi* bidi) { 52 sk_ubidi_close(bidi); 53} 54UBiDiDirection SkUnicode_IcuBidi::bidi_getDirection(const UBiDi* bidi) { 55 return sk_ubidi_getDirection(bidi); 56} 57SkBidiIterator::Position SkUnicode_IcuBidi::bidi_getLength(const UBiDi* bidi) { 58 return sk_ubidi_getLength(bidi); 59} 60SkBidiIterator::Level SkUnicode_IcuBidi::bidi_getLevelAt(const UBiDi* bidi, int pos) { 61 return sk_ubidi_getLevelAt(bidi, pos); 62} 63UBiDi* SkUnicode_IcuBidi::bidi_openSized(int32_t maxLength, int32_t maxRunCount, UErrorCode* pErrorCode) { 64 return sk_ubidi_openSized(maxLength, maxRunCount, pErrorCode); 65} 66void SkUnicode_IcuBidi::bidi_setPara(UBiDi* bidi, 67 const UChar* text, 68 int32_t length, 69 UBiDiLevel paraLevel, 70 UBiDiLevel* embeddingLevels, 71 UErrorCode* status) { 72 return sk_ubidi_setPara(bidi, text, length, paraLevel, embeddingLevels, status); 73} 74void SkUnicode_IcuBidi::bidi_reorderVisual(const SkUnicode::BidiLevel runLevels[], 75 int levelsCount, 76 int32_t logicalFromVisual[]) { 77 sk_ubidi_reorderVisual(runLevels, levelsCount, logicalFromVisual); 78} 79 80static inline UBreakIterator* sk_ubrk_clone(const UBreakIterator* bi, UErrorCode* status) { 81 const auto* icu = ICULib(); 82 SkASSERT(icu->f_ubrk_clone_ || icu->f_ubrk_safeClone_); 83 return icu->f_ubrk_clone_ 84 ? icu->f_ubrk_clone_(bi, status) 85 : icu->f_ubrk_safeClone_(bi, nullptr, nullptr, status); 86} 87 88static UText* utext_close_wrapper(UText* ut) { 89 return sk_utext_close(ut); 90} 91static void ubrk_close_wrapper(UBreakIterator* bi) { 92 sk_ubrk_close(bi); 93} 94 95using ICUUText = std::unique_ptr<UText, SkFunctionWrapper<decltype(utext_close), 96 utext_close_wrapper>>; 97using ICUBreakIterator = std::unique_ptr<UBreakIterator, SkFunctionWrapper<decltype(ubrk_close), 98 ubrk_close_wrapper>>; 99/** Replaces invalid utf-8 sequences with REPLACEMENT CHARACTER U+FFFD. */ 100static inline SkUnichar utf8_next(const char** ptr, const char* end) { 101 SkUnichar val = SkUTF::NextUTF8(ptr, end); 102 return val < 0 ? 0xFFFD : val; 103} 104 105static UBreakIteratorType convertType(SkUnicode::BreakType type) { 106 switch (type) { 107 case SkUnicode::BreakType::kLines: return UBRK_LINE; 108 case SkUnicode::BreakType::kGraphemes: return UBRK_CHARACTER; 109 case SkUnicode::BreakType::kWords: return UBRK_WORD; 110 default: 111 return UBRK_CHARACTER; 112 } 113} 114 115class SkBreakIterator_icu : public SkBreakIterator { 116 ICUBreakIterator fBreakIterator; 117 Position fLastResult; 118 public: 119 explicit SkBreakIterator_icu(ICUBreakIterator iter) 120 : fBreakIterator(std::move(iter)) 121 , fLastResult(0) {} 122 Position first() override { return fLastResult = sk_ubrk_first(fBreakIterator.get()); } 123 Position current() override { return fLastResult = sk_ubrk_current(fBreakIterator.get()); } 124 Position next() override { return fLastResult = sk_ubrk_next(fBreakIterator.get()); } 125 Status status() override { return sk_ubrk_getRuleStatus(fBreakIterator.get()); } 126 bool isDone() override { return fLastResult == UBRK_DONE; } 127 128 bool setText(const char utftext8[], int utf8Units) override { 129 UErrorCode status = U_ZERO_ERROR; 130 ICUUText text(sk_utext_openUTF8(nullptr, &utftext8[0], utf8Units, &status)); 131 132 if (U_FAILURE(status)) { 133 SkDEBUGF("Break error: %s", sk_u_errorName(status)); 134 return false; 135 } 136 SkASSERT(text); 137 sk_ubrk_setUText(fBreakIterator.get(), text.get(), &status); 138 if (U_FAILURE(status)) { 139 SkDEBUGF("Break error: %s", sk_u_errorName(status)); 140 return false; 141 } 142 fLastResult = 0; 143 return true; 144 } 145 bool setText(const char16_t utftext16[], int utf16Units) override { 146 UErrorCode status = U_ZERO_ERROR; 147 ICUUText text(sk_utext_openUChars(nullptr, reinterpret_cast<const UChar*>(&utftext16[0]), 148 utf16Units, &status)); 149 150 if (U_FAILURE(status)) { 151 SkDEBUGF("Break error: %s", sk_u_errorName(status)); 152 return false; 153 } 154 SkASSERT(text); 155 sk_ubrk_setUText(fBreakIterator.get(), text.get(), &status); 156 if (U_FAILURE(status)) { 157 SkDEBUGF("Break error: %s", sk_u_errorName(status)); 158 return false; 159 } 160 fLastResult = 0; 161 return true; 162 } 163}; 164 165class SkIcuBreakIteratorCache { 166 SkTHashMap<SkUnicode::BreakType, ICUBreakIterator> fBreakCache; 167 SkMutex fBreakCacheMutex; 168 169 public: 170 static SkIcuBreakIteratorCache& get() { 171 static SkIcuBreakIteratorCache instance; 172 return instance; 173 } 174 175 ICUBreakIterator makeBreakIterator(SkUnicode::BreakType type) { 176 UErrorCode status = U_ZERO_ERROR; 177 ICUBreakIterator* cachedIterator; 178 { 179 SkAutoMutexExclusive lock(fBreakCacheMutex); 180 cachedIterator = fBreakCache.find(type); 181 if (!cachedIterator) { 182 ICUBreakIterator newIterator(sk_ubrk_open(convertType(type), sk_uloc_getDefault(), 183 nullptr, 0, &status)); 184 if (U_FAILURE(status)) { 185 SkDEBUGF("Break error: %s", sk_u_errorName(status)); 186 } else { 187 cachedIterator = fBreakCache.set(type, std::move(newIterator)); 188 } 189 } 190 } 191 ICUBreakIterator iterator; 192 if (cachedIterator) { 193 iterator.reset(sk_ubrk_clone(cachedIterator->get(), &status)); 194 if (U_FAILURE(status)) { 195 SkDEBUGF("Break error: %s", sk_u_errorName(status)); 196 } 197 } 198 return iterator; 199 } 200}; 201 202class SkUnicode_icu : public SkUnicode { 203 204 std::unique_ptr<SkUnicode> copy() override { 205 return std::make_unique<SkUnicode_icu>(); 206 } 207 208 static bool extractWords(uint16_t utf16[], int utf16Units, const char* locale, std::vector<Position>* words) { 209 210 UErrorCode status = U_ZERO_ERROR; 211 212 ICUBreakIterator iterator = SkIcuBreakIteratorCache::get().makeBreakIterator(BreakType::kWords); 213 if (!iterator) { 214 SkDEBUGF("Break error: %s", sk_u_errorName(status)); 215 return false; 216 } 217 SkASSERT(iterator); 218 219 ICUUText utf16UText(sk_utext_openUChars(nullptr, (UChar*)utf16, utf16Units, &status)); 220 if (U_FAILURE(status)) { 221 SkDEBUGF("Break error: %s", sk_u_errorName(status)); 222 return false; 223 } 224 225 sk_ubrk_setUText(iterator.get(), utf16UText.get(), &status); 226 if (U_FAILURE(status)) { 227 SkDEBUGF("Break error: %s", sk_u_errorName(status)); 228 return false; 229 } 230 231 // Get the words 232 int32_t pos = sk_ubrk_first(iterator.get()); 233 while (pos != UBRK_DONE) { 234 words->emplace_back(pos); 235 pos = sk_ubrk_next(iterator.get()); 236 } 237 238 return true; 239 } 240 241 static bool extractPositions 242 (const char utf8[], int utf8Units, BreakType type, std::function<void(int, int)> setBreak) { 243 244 UErrorCode status = U_ZERO_ERROR; 245 ICUUText text(sk_utext_openUTF8(nullptr, &utf8[0], utf8Units, &status)); 246 247 if (U_FAILURE(status)) { 248 SkDEBUGF("Break error: %s", sk_u_errorName(status)); 249 return false; 250 } 251 SkASSERT(text); 252 253 ICUBreakIterator iterator = SkIcuBreakIteratorCache::get().makeBreakIterator(type); 254 if (!iterator) { 255 return false; 256 } 257 258 sk_ubrk_setUText(iterator.get(), text.get(), &status); 259 if (U_FAILURE(status)) { 260 SkDEBUGF("Break error: %s", sk_u_errorName(status)); 261 return false; 262 } 263 264 auto iter = iterator.get(); 265 int32_t pos = sk_ubrk_first(iter); 266 while (pos != UBRK_DONE) { 267 int s = type == SkUnicode::BreakType::kLines 268 ? UBRK_LINE_SOFT 269 : sk_ubrk_getRuleStatus(iter); 270 setBreak(pos, s); 271 pos = sk_ubrk_next(iter); 272 } 273 274 if (type == SkUnicode::BreakType::kLines) { 275 // This is a workaround for https://bugs.chromium.org/p/skia/issues/detail?id=10715 276 // (ICU line break iterator does not work correctly on Thai text with new lines) 277 // So, we only use the iterator to collect soft line breaks and 278 // scan the text for all hard line breaks ourselves 279 const char* end = utf8 + utf8Units; 280 const char* ch = utf8; 281 while (ch < end) { 282 auto unichar = utf8_next(&ch, end); 283 if (isHardLineBreak(unichar)) { 284 setBreak(ch - utf8, UBRK_LINE_HARD); 285 } 286 } 287 } 288 return true; 289 } 290 291 static bool isControl(SkUnichar utf8) { 292 return sk_u_iscntrl(utf8); 293 } 294 295 static bool isWhitespace(SkUnichar utf8) { 296 return sk_u_isWhitespace(utf8); 297 } 298 299 static bool isSpace(SkUnichar utf8) { 300 return sk_u_isspace(utf8); 301 } 302 303 static bool isTabulation(SkUnichar utf8) { 304 return utf8 == '\t'; 305 } 306 307 static bool isHardBreak(SkUnichar utf8) { 308 auto property = sk_u_getIntPropertyValue(utf8, UCHAR_LINE_BREAK); 309 return property == U_LB_LINE_FEED || property == U_LB_MANDATORY_BREAK; 310 } 311 312 static bool isIdeographic(SkUnichar unichar) { 313 return sk_u_hasBinaryProperty(unichar, UCHAR_IDEOGRAPHIC); 314 } 315 316public: 317 ~SkUnicode_icu() override { } 318 std::unique_ptr<SkBidiIterator> makeBidiIterator(const uint16_t text[], int count, 319 SkBidiIterator::Direction dir) override { 320 return SkUnicode::makeBidiIterator(text, count, dir); 321 } 322 std::unique_ptr<SkBidiIterator> makeBidiIterator(const char text[], 323 int count, 324 SkBidiIterator::Direction dir) override { 325 return SkUnicode::makeBidiIterator(text, count, dir); 326 } 327 std::unique_ptr<SkBreakIterator> makeBreakIterator(const char locale[], 328 BreakType breakType) override { 329 UErrorCode status = U_ZERO_ERROR; 330 ICUBreakIterator iterator(sk_ubrk_open(convertType(breakType), locale, nullptr, 0, 331 &status)); 332 if (U_FAILURE(status)) { 333 SkDEBUGF("Break error: %s", sk_u_errorName(status)); 334 return nullptr; 335 } 336 return std::unique_ptr<SkBreakIterator>(new SkBreakIterator_icu(std::move(iterator))); 337 } 338 std::unique_ptr<SkBreakIterator> makeBreakIterator(BreakType breakType) override { 339 return makeBreakIterator(sk_uloc_getDefault(), breakType); 340 } 341 342 static bool isHardLineBreak(SkUnichar utf8) { 343 auto property = sk_u_getIntPropertyValue(utf8, UCHAR_LINE_BREAK); 344 return property == U_LB_LINE_FEED || property == U_LB_MANDATORY_BREAK; 345 } 346 347 SkString toUpper(const SkString& str) override { 348 // Convert to UTF16 since that's what ICU wants. 349 auto str16 = SkUnicode::convertUtf8ToUtf16(str.c_str(), str.size()); 350 351 UErrorCode icu_err = U_ZERO_ERROR; 352 const auto upper16len = sk_u_strToUpper(nullptr, 0, (UChar*)(str16.c_str()), str16.size(), 353 nullptr, &icu_err); 354 if (icu_err != U_BUFFER_OVERFLOW_ERROR || upper16len <= 0) { 355 return SkString(); 356 } 357 358 SkAutoSTArray<128, uint16_t> upper16(upper16len); 359 icu_err = U_ZERO_ERROR; 360 sk_u_strToUpper((UChar*)(upper16.get()), SkToS32(upper16.size()), 361 (UChar*)(str16.c_str()), str16.size(), 362 nullptr, &icu_err); 363 SkASSERT(!U_FAILURE(icu_err)); 364 365 // ... and back to utf8 'cause that's what we want. 366 return convertUtf16ToUtf8((char16_t*)upper16.get(), upper16.size()); 367 } 368 369 bool getBidiRegions(const char utf8[], 370 int utf8Units, 371 TextDirection dir, 372 std::vector<BidiRegion>* results) override { 373 return SkUnicode::extractBidi(utf8, utf8Units, dir, results); 374 } 375 376 bool getWords(const char utf8[], int utf8Units, const char* locale, std::vector<Position>* results) override { 377 378 // Convert to UTF16 since we want the results in utf16 379 auto utf16 = convertUtf8ToUtf16(utf8, utf8Units); 380 return SkUnicode_icu::extractWords((uint16_t*)utf16.c_str(), utf16.size(), locale, results); 381 } 382 383 bool computeCodeUnitFlags(char utf8[], int utf8Units, bool replaceTabs, 384 SkTArray<SkUnicode::CodeUnitFlags, true>* results) override { 385 results->reset(); 386 results->push_back_n(utf8Units + 1, CodeUnitFlags::kNoCodeUnitFlag); 387 388 SkUnicode_icu::extractPositions(utf8, utf8Units, BreakType::kLines, [&](int pos, 389 int status) { 390 (*results)[pos] |= status == UBRK_LINE_HARD 391 ? CodeUnitFlags::kHardLineBreakBefore 392 : CodeUnitFlags::kSoftLineBreakBefore; 393 }); 394 395 SkUnicode_icu::extractPositions(utf8, utf8Units, BreakType::kGraphemes, [&](int pos, 396 int status) { 397 (*results)[pos] |= CodeUnitFlags::kGraphemeStart; 398 }); 399 400 const char* current = utf8; 401 const char* end = utf8 + utf8Units; 402 while (current < end) { 403 auto before = current - utf8; 404 SkUnichar unichar = SkUTF::NextUTF8(¤t, end); 405 if (unichar < 0) unichar = 0xFFFD; 406 auto after = current - utf8; 407 if (replaceTabs && SkUnicode_icu::isTabulation(unichar)) { 408 results->at(before) |= SkUnicode::kTabulation; 409 if (replaceTabs) { 410 unichar = ' '; 411 utf8[before] = ' '; 412 } 413 } 414 for (auto i = before; i < after; ++i) { 415 if (SkUnicode_icu::isSpace(unichar)) { 416 results->at(i) |= SkUnicode::kPartOfIntraWordBreak; 417 } 418 if (SkUnicode_icu::isWhitespace(unichar)) { 419 results->at(i) |= SkUnicode::kPartOfWhiteSpaceBreak; 420 } 421 if (SkUnicode_icu::isControl(unichar)) { 422 results->at(i) |= SkUnicode::kControl; 423 } 424 if (SkUnicode_icu::isIdeographic(unichar)) { 425 results->at(i) |= SkUnicode::kIdeographic; 426 } 427 } 428 } 429 430 return true; 431 } 432 433 bool computeCodeUnitFlags(char16_t utf16[], int utf16Units, bool replaceTabs, 434 SkTArray<SkUnicode::CodeUnitFlags, true>* results) override { 435 results->reset(); 436 results->push_back_n(utf16Units + 1, CodeUnitFlags::kNoCodeUnitFlag); 437 438 // Get white spaces 439 this->forEachCodepoint((char16_t*)&utf16[0], utf16Units, 440 [results, replaceTabs, &utf16](SkUnichar unichar, int32_t start, int32_t end) { 441 for (auto i = start; i < end; ++i) { 442 if (replaceTabs && SkUnicode_icu::isTabulation(unichar)) { 443 results->at(i) |= SkUnicode::kTabulation; 444 if (replaceTabs) { 445 unichar = ' '; 446 utf16[start] = ' '; 447 } 448 } 449 if (SkUnicode_icu::isSpace(unichar)) { 450 results->at(i) |= SkUnicode::kPartOfIntraWordBreak; 451 } 452 if (SkUnicode_icu::isWhitespace(unichar)) { 453 results->at(i) |= SkUnicode::kPartOfWhiteSpaceBreak; 454 } 455 if (SkUnicode_icu::isControl(unichar)) { 456 results->at(i) |= SkUnicode::kControl; 457 } 458 } 459 }); 460 // Get graphemes 461 this->forEachBreak((char16_t*)&utf16[0], 462 utf16Units, 463 SkUnicode::BreakType::kGraphemes, 464 [results](SkBreakIterator::Position pos, SkBreakIterator::Status) { 465 (*results)[pos] |= CodeUnitFlags::kGraphemeStart; 466 }); 467 // Get line breaks 468 this->forEachBreak( 469 (char16_t*)&utf16[0], 470 utf16Units, 471 SkUnicode::BreakType::kLines, 472 [results](SkBreakIterator::Position pos, SkBreakIterator::Status status) { 473 if (status == 474 (SkBreakIterator::Status)SkUnicode::LineBreakType::kHardLineBreak) { 475 // Hard line breaks clears off all the other flags 476 // TODO: Treat \n as a formatting mark and do not pass it to SkShaper 477 (*results)[pos-1] = CodeUnitFlags::kHardLineBreakBefore; 478 } else { 479 (*results)[pos] |= CodeUnitFlags::kSoftLineBreakBefore; 480 } 481 }); 482 483 return true; 484 } 485 486 void reorderVisual(const BidiLevel runLevels[], 487 int levelsCount, 488 int32_t logicalFromVisual[]) override { 489 SkUnicode_IcuBidi::bidi_reorderVisual(runLevels, levelsCount, logicalFromVisual); 490 } 491}; 492 493std::unique_ptr<SkUnicode> SkUnicode::MakeIcuBasedUnicode() { 494 #if defined(SK_USING_THIRD_PARTY_ICU) 495 if (!SkLoadICU()) { 496 static SkOnce once; 497 once([] { SkDEBUGF("SkLoadICU() failed!\n"); }); 498 return nullptr; 499 } 500 #endif 501 502 return ICULib() 503 ? std::make_unique<SkUnicode_icu>() 504 : nullptr; 505} 506