1// Copyright 2011 the V8 project authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#ifndef V8_STRINGS_UNICODE_H_ 6#define V8_STRINGS_UNICODE_H_ 7 8#include <sys/types.h> 9#include "src/base/bit-field.h" 10#include "src/common/globals.h" 11#include "src/third_party/utf8-decoder/utf8-decoder.h" 12/** 13 * \file 14 * Definitions and convenience functions for working with unicode. 15 */ 16 17namespace unibrow { 18 19using uchar = unsigned int; 20using byte = unsigned char; 21 22/** 23 * The max length of the result of converting the case of a single 24 * character. 25 */ 26const int kMaxMappingSize = 4; 27 28#ifndef V8_INTL_SUPPORT 29template <class T, int size = 256> 30class Predicate { 31 public: 32 inline Predicate() = default; 33 inline bool get(uchar c); 34 35 private: 36 friend class Test; 37 bool CalculateValue(uchar c); 38 class CacheEntry { 39 public: 40 inline CacheEntry() 41 : bit_field_(CodePointField::encode(0) | ValueField::encode(0)) {} 42 inline CacheEntry(uchar code_point, bool value) 43 : bit_field_( 44 CodePointField::encode(CodePointField::kMask & code_point) | 45 ValueField::encode(value)) { 46 DCHECK_IMPLIES((CodePointField::kMask & code_point) != code_point, 47 code_point == static_cast<uchar>(-1)); 48 } 49 50 uchar code_point() const { return CodePointField::decode(bit_field_); } 51 bool value() const { return ValueField::decode(bit_field_); } 52 53 private: 54 using CodePointField = v8::base::BitField<uchar, 0, 21>; 55 using ValueField = v8::base::BitField<bool, 21, 1>; 56 57 uint32_t bit_field_; 58 }; 59 static const int kSize = size; 60 static const int kMask = kSize - 1; 61 CacheEntry entries_[kSize]; 62}; 63 64// A cache used in case conversion. It caches the value for characters 65// that either have no mapping or map to a single character independent 66// of context. Characters that map to more than one character or that 67// map differently depending on context are always looked up. 68template <class T, int size = 256> 69class Mapping { 70 public: 71 inline Mapping() = default; 72 inline int get(uchar c, uchar n, uchar* result); 73 74 private: 75 friend class Test; 76 int CalculateValue(uchar c, uchar n, uchar* result); 77 struct CacheEntry { 78 inline CacheEntry() : code_point_(kNoChar), offset_(0) {} 79 inline CacheEntry(uchar code_point, signed offset) 80 : code_point_(code_point), offset_(offset) {} 81 uchar code_point_; 82 signed offset_; 83 static const int kNoChar = (1 << 21) - 1; 84 }; 85 static const int kSize = size; 86 static const int kMask = kSize - 1; 87 CacheEntry entries_[kSize]; 88}; 89 90class UnicodeData { 91 private: 92 friend class Test; 93 static int GetByteCount(); 94 static const uchar kMaxCodePoint; 95}; 96 97#endif // !V8_INTL_SUPPORT 98 99class Utf16 { 100 public: 101 static const int kNoPreviousCharacter = -1; 102 static inline bool IsSurrogatePair(int lead, int trail) { 103 return IsLeadSurrogate(lead) && IsTrailSurrogate(trail); 104 } 105 static inline bool IsLeadSurrogate(int code) { 106 return (code & 0xfc00) == 0xd800; 107 } 108 static inline bool IsTrailSurrogate(int code) { 109 return (code & 0xfc00) == 0xdc00; 110 } 111 112 static inline int CombineSurrogatePair(uchar lead, uchar trail) { 113 return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff); 114 } 115 static const uchar kMaxNonSurrogateCharCode = 0xffff; 116 // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes 117 // of UTF-8 data. The special case where the unit is a surrogate 118 // trail produces 1 byte net, because the encoding of the pair is 119 // 4 bytes and the 3 bytes that were used to encode the lead surrogate 120 // can be reclaimed. 121 static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3; 122 // One UTF-16 surrogate is endoded (illegally) as 3 UTF-8 bytes. 123 // The illegality stems from the surrogate not being part of a pair. 124 static const int kUtf8BytesToCodeASurrogate = 3; 125 static inline uint16_t LeadSurrogate(uint32_t char_code) { 126 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff); 127 } 128 static inline uint16_t TrailSurrogate(uint32_t char_code) { 129 return 0xdc00 + (char_code & 0x3ff); 130 } 131 static inline bool HasUnpairedSurrogate(const uint16_t* code_units, 132 size_t length); 133}; 134 135class Latin1 { 136 public: 137 static const uint16_t kMaxChar = 0xff; 138 // Convert the character to Latin-1 case equivalent if possible. 139 static inline uint16_t TryConvertToLatin1(uint16_t c) { 140 switch (c) { 141 // This are equivalent characters in unicode. 142 case 0x39c: 143 case 0x3bc: 144 return 0xb5; 145 // This is an uppercase of a Latin-1 character 146 // outside of Latin-1. 147 case 0x178: 148 return 0xff; 149 } 150 return c; 151 } 152}; 153 154class V8_EXPORT_PRIVATE Utf8 { 155 public: 156 using State = Utf8DfaDecoder::State; 157 158 static inline uchar Length(uchar chr, int previous); 159 static inline unsigned EncodeOneByte(char* out, uint8_t c); 160 static inline unsigned Encode(char* out, uchar c, int previous, 161 bool replace_invalid = false); 162 static uchar CalculateValue(const byte* str, size_t length, size_t* cursor); 163 164 // The unicode replacement character, used to signal invalid unicode 165 // sequences (e.g. an orphan surrogate) when converting to a UTF-8 encoding. 166 static const uchar kBadChar = 0xFFFD; 167 static const uchar kBufferEmpty = 0x0; 168 static const uchar kIncomplete = 0xFFFFFFFC; // any non-valid code point. 169 static const unsigned kMaxEncodedSize = 4; 170 static const unsigned kMaxOneByteChar = 0x7f; 171 static const unsigned kMaxTwoByteChar = 0x7ff; 172 static const unsigned kMaxThreeByteChar = 0xffff; 173 static const unsigned kMaxFourByteChar = 0x1fffff; 174 175 // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together 176 // that match are coded as a 4 byte UTF-8 sequence. 177 static const unsigned kBytesSavedByCombiningSurrogates = 2; 178 static const unsigned kSizeOfUnmatchedSurrogate = 3; 179 // The maximum size a single UTF-16 code unit may take up when encoded as 180 // UTF-8. 181 static const unsigned kMax16BitCodeUnitSize = 3; 182 static inline uchar ValueOf(const byte* str, size_t length, size_t* cursor); 183 184 using Utf8IncrementalBuffer = uint32_t; 185 static inline uchar ValueOfIncremental(const byte** cursor, State* state, 186 Utf8IncrementalBuffer* buffer); 187 static uchar ValueOfIncrementalFinish(State* state); 188 189 // Excludes non-characters from the set of valid code points. 190 static inline bool IsValidCharacter(uchar c); 191 192 // Validate if the input has a valid utf-8 encoding. Unlike JS source code 193 // this validation function will accept any unicode code point, including 194 // kBadChar and BOMs. 195 // 196 // This method checks for: 197 // - valid utf-8 endcoding (e.g. no over-long encodings), 198 // - absence of surrogates, 199 // - valid code point range. 200 static bool ValidateEncoding(const byte* str, size_t length); 201}; 202 203struct Uppercase { 204 static bool Is(uchar c); 205}; 206struct Letter { 207 static bool Is(uchar c); 208}; 209#ifndef V8_INTL_SUPPORT 210struct V8_EXPORT_PRIVATE ID_Start { 211 static bool Is(uchar c); 212}; 213struct V8_EXPORT_PRIVATE ID_Continue { 214 static bool Is(uchar c); 215}; 216struct V8_EXPORT_PRIVATE WhiteSpace { 217 static bool Is(uchar c); 218}; 219#endif // !V8_INTL_SUPPORT 220 221// LineTerminator: 'JS_Line_Terminator' in point.properties 222// ES#sec-line-terminators lists exactly 4 code points: 223// LF (U+000A), CR (U+000D), LS(U+2028), PS(U+2029) 224V8_INLINE bool IsLineTerminator(uchar c) { 225 return c == 0x000A || c == 0x000D || c == 0x2028 || c == 0x2029; 226} 227 228V8_INLINE bool IsStringLiteralLineTerminator(uchar c) { 229 return c == 0x000A || c == 0x000D; 230} 231 232#ifndef V8_INTL_SUPPORT 233struct V8_EXPORT_PRIVATE ToLowercase { 234 static const int kMaxWidth = 3; 235 static const bool kIsToLower = true; 236 static int Convert(uchar c, uchar n, uchar* result, bool* allow_caching_ptr); 237}; 238struct V8_EXPORT_PRIVATE ToUppercase { 239 static const int kMaxWidth = 3; 240 static const bool kIsToLower = false; 241 static int Convert(uchar c, uchar n, uchar* result, bool* allow_caching_ptr); 242}; 243struct V8_EXPORT_PRIVATE Ecma262Canonicalize { 244 static const int kMaxWidth = 1; 245 static int Convert(uchar c, uchar n, uchar* result, bool* allow_caching_ptr); 246}; 247struct V8_EXPORT_PRIVATE Ecma262UnCanonicalize { 248 static const int kMaxWidth = 4; 249 static int Convert(uchar c, uchar n, uchar* result, bool* allow_caching_ptr); 250}; 251struct V8_EXPORT_PRIVATE CanonicalizationRange { 252 static const int kMaxWidth = 1; 253 static int Convert(uchar c, uchar n, uchar* result, bool* allow_caching_ptr); 254}; 255#endif // !V8_INTL_SUPPORT 256 257} // namespace unibrow 258 259#endif // V8_STRINGS_UNICODE_H_ 260