/**
 * Copyright 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Routines to do manipulation of Unicode characters or text
//
// The StructurallyValid routines accept buffers of arbitrary bytes.
// For CoerceToStructurallyValid(), the input buffer and output buffers may
// point to exactly the same memory.
//
// In all other cases, the UTF-8 string must be structurally valid and
// have all codepoints in the range U+0000 to U+D7FF or U+E000 to U+10FFFF.
// Debug builds take a fatal error for invalid UTF-8 input.
// The input and output buffers may not overlap at all.
//
// The char32 routines are here only for convenience; they convert to UTF-8
// internally and use the UTF-8 routines.
301767c5feSopenharmony_ci 311767c5feSopenharmony_ci#ifndef UTIL_UTF8_UNILIB_H__ 321767c5feSopenharmony_ci#define UTIL_UTF8_UNILIB_H__ 331767c5feSopenharmony_ci 341767c5feSopenharmony_ci#include <string> 351767c5feSopenharmony_ci#include "phonenumbers/base/basictypes.h" 361767c5feSopenharmony_ci 371767c5feSopenharmony_cinamespace i18n { 381767c5feSopenharmony_cinamespace phonenumbers { 391767c5feSopenharmony_cinamespace UniLib { 401767c5feSopenharmony_ci 411767c5feSopenharmony_ci// Returns true unless a surrogate code point 421767c5feSopenharmony_ciinline bool IsValidCodepoint(char32 c) { 431767c5feSopenharmony_ci // In the range [0, 0xD800) or [0xE000, 0x10FFFF] 441767c5feSopenharmony_ci return (static_cast<uint32>(c) < 0xD800) 451767c5feSopenharmony_ci || (c >= 0xE000 && c <= 0x10FFFF); 461767c5feSopenharmony_ci} 471767c5feSopenharmony_ci 481767c5feSopenharmony_ci// Table of UTF-8 character lengths, based on first byte 491767c5feSopenharmony_cistatic const unsigned char kUTF8LenTbl[256] = { 501767c5feSopenharmony_ci 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 511767c5feSopenharmony_ci 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 521767c5feSopenharmony_ci 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 531767c5feSopenharmony_ci 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 541767c5feSopenharmony_ci 551767c5feSopenharmony_ci 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 561767c5feSopenharmony_ci 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 571767c5feSopenharmony_ci 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 581767c5feSopenharmony_ci 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4 591767c5feSopenharmony_ci}; 601767c5feSopenharmony_ci 611767c5feSopenharmony_ci// Return length of a single UTF-8 source character 621767c5feSopenharmony_ciinline int OneCharLen(const char* src) { 
631767c5feSopenharmony_ci return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)]; 641767c5feSopenharmony_ci} 651767c5feSopenharmony_ci 661767c5feSopenharmony_ci// Return length of a single UTF-8 source character 671767c5feSopenharmony_ciinline int OneCharLen(const uint8* src) { 681767c5feSopenharmony_ci return kUTF8LenTbl[*src]; 691767c5feSopenharmony_ci} 701767c5feSopenharmony_ci 711767c5feSopenharmony_ci// Return true if this byte is a trailing UTF-8 byte (10xx xxxx) 721767c5feSopenharmony_ciinline bool IsTrailByte(char x) { 731767c5feSopenharmony_ci // return (x & 0xC0) == 0x80; 741767c5feSopenharmony_ci // Since trail bytes are always in [0x80, 0xBF], we can optimize: 751767c5feSopenharmony_ci return static_cast<signed char>(x) < -0x40; 761767c5feSopenharmony_ci} 771767c5feSopenharmony_ci 781767c5feSopenharmony_ci// Returns the length in bytes of the prefix of src that is all 791767c5feSopenharmony_ci// interchange valid UTF-8 801767c5feSopenharmony_ciint SpanInterchangeValid(const char* src, int byte_length); 811767c5feSopenharmony_ciinline int SpanInterchangeValid(const std::string& src) { 821767c5feSopenharmony_ci return SpanInterchangeValid(src.data(), static_cast<int>(src.size())); 831767c5feSopenharmony_ci} 841767c5feSopenharmony_ci 851767c5feSopenharmony_ci// Returns true if the source is all interchange valid UTF-8 861767c5feSopenharmony_ci// "Interchange valid" is a stronger than structurally valid -- 871767c5feSopenharmony_ci// no C0 or C1 control codes (other than CR LF HT FF) and no non-characters. 
881767c5feSopenharmony_ciinline bool IsInterchangeValid(const char* src, int byte_length) { 891767c5feSopenharmony_ci return (byte_length == SpanInterchangeValid(src, byte_length)); 901767c5feSopenharmony_ci} 911767c5feSopenharmony_ciinline bool IsInterchangeValid(const std::string& src) { 921767c5feSopenharmony_ci return IsInterchangeValid(src.data(), static_cast<int>(src.size())); 931767c5feSopenharmony_ci} 941767c5feSopenharmony_ci 951767c5feSopenharmony_ci} // namespace UniLib 961767c5feSopenharmony_ci} // namespace phonenumbers 971767c5feSopenharmony_ci} // namespace i18n 981767c5feSopenharmony_ci 991767c5feSopenharmony_ci#endif // UTIL_UTF8_PUBLIC_UNILIB_H_ 100