/**
 * Copyright 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
161767c5feSopenharmony_ci
// Routines to do manipulation of Unicode characters or text
//
// The StructurallyValid routines accept buffers of arbitrary bytes.
// For CoerceToStructurallyValid(), the input buffer and output buffers may
// point to exactly the same memory.
//
// In all other cases, the UTF-8 string must be structurally valid and
// have all codepoints in the range U+0000 to U+D7FF or U+E000 to U+10FFFF.
// Debug builds take a fatal error for invalid UTF-8 input.
// The input and output buffers may not overlap at all.
//
// The char32 routines are here only for convenience; they convert to UTF-8
// internally and use the UTF-8 routines.
301767c5feSopenharmony_ci
311767c5feSopenharmony_ci#ifndef UTIL_UTF8_UNILIB_H__
321767c5feSopenharmony_ci#define UTIL_UTF8_UNILIB_H__
331767c5feSopenharmony_ci
341767c5feSopenharmony_ci#include <string>
351767c5feSopenharmony_ci#include "phonenumbers/base/basictypes.h"
361767c5feSopenharmony_ci
371767c5feSopenharmony_cinamespace i18n {
381767c5feSopenharmony_cinamespace phonenumbers {
391767c5feSopenharmony_cinamespace UniLib {
401767c5feSopenharmony_ci
411767c5feSopenharmony_ci// Returns true unless a surrogate code point
421767c5feSopenharmony_ciinline bool IsValidCodepoint(char32 c) {
431767c5feSopenharmony_ci  // In the range [0, 0xD800) or [0xE000, 0x10FFFF]
441767c5feSopenharmony_ci  return (static_cast<uint32>(c) < 0xD800)
451767c5feSopenharmony_ci    || (c >= 0xE000 && c <= 0x10FFFF);
461767c5feSopenharmony_ci}
471767c5feSopenharmony_ci
481767c5feSopenharmony_ci// Table of UTF-8 character lengths, based on first byte
491767c5feSopenharmony_cistatic const unsigned char kUTF8LenTbl[256] = {
501767c5feSopenharmony_ci  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
511767c5feSopenharmony_ci  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
521767c5feSopenharmony_ci  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
531767c5feSopenharmony_ci  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
541767c5feSopenharmony_ci
551767c5feSopenharmony_ci  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
561767c5feSopenharmony_ci  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
571767c5feSopenharmony_ci  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
581767c5feSopenharmony_ci  3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
591767c5feSopenharmony_ci};
601767c5feSopenharmony_ci
611767c5feSopenharmony_ci// Return length of a single UTF-8 source character
621767c5feSopenharmony_ciinline int OneCharLen(const char* src) {
631767c5feSopenharmony_ci  return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)];
641767c5feSopenharmony_ci}
651767c5feSopenharmony_ci
661767c5feSopenharmony_ci// Return length of a single UTF-8 source character
671767c5feSopenharmony_ciinline int OneCharLen(const uint8* src) {
681767c5feSopenharmony_ci  return kUTF8LenTbl[*src];
691767c5feSopenharmony_ci}
701767c5feSopenharmony_ci
// Returns true if x is a trailing UTF-8 byte, i.e. has the bit pattern
// 10xx xxxx (0x80-0xBF).
inline bool IsTrailByte(char x) {
  // Equivalent to (x & 0xC0) == 0x80. Read as a signed char, the bytes
  // 0x80-0xBF are exactly the values -128..-65, so a single comparison
  // against -0x40 (-64) is enough.
  return static_cast<signed char>(x) < -0x40;
}
771767c5feSopenharmony_ci
// Returns the length in bytes of the prefix of src that is all
// interchange valid UTF-8. src points to byte_length bytes. This overload
// is only declared in this header.
int SpanInterchangeValid(const char* src, int byte_length);

// Convenience overload: same as above, applied to the bytes of a
// std::string. The string's size is converted to int before the call.
inline int SpanInterchangeValid(const std::string& src) {
  const int byte_length = static_cast<int>(src.size());
  return SpanInterchangeValid(src.data(), byte_length);
}
841767c5feSopenharmony_ci
851767c5feSopenharmony_ci// Returns true if the source is all interchange valid UTF-8
861767c5feSopenharmony_ci// "Interchange valid" is a stronger than structurally valid --
871767c5feSopenharmony_ci// no C0 or C1 control codes (other than CR LF HT FF) and no non-characters.
881767c5feSopenharmony_ciinline bool IsInterchangeValid(const char* src, int byte_length) {
891767c5feSopenharmony_ci  return (byte_length == SpanInterchangeValid(src, byte_length));
901767c5feSopenharmony_ci}
911767c5feSopenharmony_ciinline bool IsInterchangeValid(const std::string& src) {
921767c5feSopenharmony_ci  return IsInterchangeValid(src.data(), static_cast<int>(src.size()));
931767c5feSopenharmony_ci}
941767c5feSopenharmony_ci
951767c5feSopenharmony_ci}  // namespace UniLib
961767c5feSopenharmony_ci}  // namespace phonenumbers
971767c5feSopenharmony_ci}  // namespace i18n
981767c5feSopenharmony_ci
#endif  // UTIL_UTF8_UNILIB_H__
100