1cb93a386Sopenharmony_ci// Copyright 2018 Google LLC. 2cb93a386Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be found in the LICENSE file. 3cb93a386Sopenharmony_ci 4cb93a386Sopenharmony_ci#include "include/private/SkTFitsIn.h" 5cb93a386Sopenharmony_ci#include "src/utils/SkUTF.h" 6cb93a386Sopenharmony_ci 7cb93a386Sopenharmony_ci#include <climits> 8cb93a386Sopenharmony_ci 9cb93a386Sopenharmony_cistatic constexpr inline int32_t left_shift(int32_t value, int32_t shift) { 10cb93a386Sopenharmony_ci return (int32_t) ((uint32_t) value << shift); 11cb93a386Sopenharmony_ci} 12cb93a386Sopenharmony_ci 13cb93a386Sopenharmony_citemplate <typename T> static constexpr bool is_align2(T x) { return 0 == (x & 1); } 14cb93a386Sopenharmony_ci 15cb93a386Sopenharmony_citemplate <typename T> static constexpr bool is_align4(T x) { return 0 == (x & 3); } 16cb93a386Sopenharmony_ci 17cb93a386Sopenharmony_cistatic constexpr inline bool utf16_is_high_surrogate(uint16_t c) { return (c & 0xFC00) == 0xD800; } 18cb93a386Sopenharmony_ci 19cb93a386Sopenharmony_cistatic constexpr inline bool utf16_is_low_surrogate(uint16_t c) { return (c & 0xFC00) == 0xDC00; } 20cb93a386Sopenharmony_ci 21cb93a386Sopenharmony_ci/** @returns -1 iff invalid UTF8 byte, 22cb93a386Sopenharmony_ci 0 iff UTF8 continuation byte, 23cb93a386Sopenharmony_ci 1 iff ASCII byte, 24cb93a386Sopenharmony_ci 2 iff leading byte of 2-byte sequence, 25cb93a386Sopenharmony_ci 3 iff leading byte of 3-byte sequence, and 26cb93a386Sopenharmony_ci 4 iff leading byte of 4-byte sequence. 27cb93a386Sopenharmony_ci I.e.: if return value > 0, then gives length of sequence. 28cb93a386Sopenharmony_ci*/ 29cb93a386Sopenharmony_cistatic int utf8_byte_type(uint8_t c) { 30cb93a386Sopenharmony_ci if (c < 0x80) { 31cb93a386Sopenharmony_ci return 1; 32cb93a386Sopenharmony_ci } else if (c < 0xC0) { 33cb93a386Sopenharmony_ci return 0; 34cb93a386Sopenharmony_ci } else if (c >= 0xF5 || (c & 0xFE) == 0xC0) { // "octet values c0, c1, f5 to ff never appear" 35cb93a386Sopenharmony_ci return -1; 36cb93a386Sopenharmony_ci } else { 37cb93a386Sopenharmony_ci int value = (((0xe5 << 24) >> ((unsigned)c >> 4 << 1)) & 3) + 1; 38cb93a386Sopenharmony_ci // assert(value >= 2 && value <=4); 39cb93a386Sopenharmony_ci return value; 40cb93a386Sopenharmony_ci } 41cb93a386Sopenharmony_ci} 42cb93a386Sopenharmony_cistatic bool utf8_type_is_valid_leading_byte(int type) { return type > 0; } 43cb93a386Sopenharmony_ci 44cb93a386Sopenharmony_cistatic bool utf8_byte_is_continuation(uint8_t c) { return utf8_byte_type(c) == 0; } 45cb93a386Sopenharmony_ci 46cb93a386Sopenharmony_ci//////////////////////////////////////////////////////////////////////////////// 47cb93a386Sopenharmony_ci 48cb93a386Sopenharmony_ciint SkUTF::CountUTF8(const char* utf8, size_t byteLength) { 49cb93a386Sopenharmony_ci if (!utf8) { 50cb93a386Sopenharmony_ci return -1; 51cb93a386Sopenharmony_ci } 52cb93a386Sopenharmony_ci int count = 0; 53cb93a386Sopenharmony_ci const char* stop = utf8 + byteLength; 54cb93a386Sopenharmony_ci while (utf8 < stop) { 55cb93a386Sopenharmony_ci int type = utf8_byte_type(*(const uint8_t*)utf8); 56cb93a386Sopenharmony_ci if (!utf8_type_is_valid_leading_byte(type) || utf8 + type > stop) { 57cb93a386Sopenharmony_ci return -1; // Sequence extends beyond end. 58cb93a386Sopenharmony_ci } 59cb93a386Sopenharmony_ci while(type-- > 1) { 60cb93a386Sopenharmony_ci ++utf8; 61cb93a386Sopenharmony_ci if (!utf8_byte_is_continuation(*(const uint8_t*)utf8)) { 62cb93a386Sopenharmony_ci return -1; 63cb93a386Sopenharmony_ci } 64cb93a386Sopenharmony_ci } 65cb93a386Sopenharmony_ci ++utf8; 66cb93a386Sopenharmony_ci ++count; 67cb93a386Sopenharmony_ci } 68cb93a386Sopenharmony_ci return count; 69cb93a386Sopenharmony_ci} 70cb93a386Sopenharmony_ci 71cb93a386Sopenharmony_ciint SkUTF::CountUTF16(const uint16_t* utf16, size_t byteLength) { 72cb93a386Sopenharmony_ci if (!utf16 || !is_align2(intptr_t(utf16)) || !is_align2(byteLength)) { 73cb93a386Sopenharmony_ci return -1; 74cb93a386Sopenharmony_ci } 75cb93a386Sopenharmony_ci const uint16_t* src = (const uint16_t*)utf16; 76cb93a386Sopenharmony_ci const uint16_t* stop = src + (byteLength >> 1); 77cb93a386Sopenharmony_ci int count = 0; 78cb93a386Sopenharmony_ci while (src < stop) { 79cb93a386Sopenharmony_ci unsigned c = *src++; 80cb93a386Sopenharmony_ci if (utf16_is_low_surrogate(c)) { 81cb93a386Sopenharmony_ci return -1; 82cb93a386Sopenharmony_ci } 83cb93a386Sopenharmony_ci if (utf16_is_high_surrogate(c)) { 84cb93a386Sopenharmony_ci if (src >= stop) { 85cb93a386Sopenharmony_ci return -1; 86cb93a386Sopenharmony_ci } 87cb93a386Sopenharmony_ci c = *src++; 88cb93a386Sopenharmony_ci if (!utf16_is_low_surrogate(c)) { 89cb93a386Sopenharmony_ci return -1; 90cb93a386Sopenharmony_ci } 91cb93a386Sopenharmony_ci } 92cb93a386Sopenharmony_ci count += 1; 93cb93a386Sopenharmony_ci } 94cb93a386Sopenharmony_ci return count; 95cb93a386Sopenharmony_ci} 96cb93a386Sopenharmony_ci 97cb93a386Sopenharmony_ciint SkUTF::CountUTF32(const int32_t* utf32, size_t byteLength) { 98cb93a386Sopenharmony_ci if (!is_align4(intptr_t(utf32)) || !is_align4(byteLength) || !SkTFitsIn<int>(byteLength >> 2)) { 99cb93a386Sopenharmony_ci return -1; 100cb93a386Sopenharmony_ci } 101cb93a386Sopenharmony_ci const uint32_t kInvalidUnicharMask = 0xFF000000; // unichar fits in 24 bits 102cb93a386Sopenharmony_ci const uint32_t* ptr = (const uint32_t*)utf32; 103cb93a386Sopenharmony_ci const uint32_t* stop = ptr + (byteLength >> 2); 104cb93a386Sopenharmony_ci while (ptr < stop) { 105cb93a386Sopenharmony_ci if (*ptr & kInvalidUnicharMask) { 106cb93a386Sopenharmony_ci return -1; 107cb93a386Sopenharmony_ci } 108cb93a386Sopenharmony_ci ptr += 1; 109cb93a386Sopenharmony_ci } 110cb93a386Sopenharmony_ci return (int)(byteLength >> 2); 111cb93a386Sopenharmony_ci} 112cb93a386Sopenharmony_ci 113cb93a386Sopenharmony_citemplate <typename T> 114cb93a386Sopenharmony_cistatic SkUnichar next_fail(const T** ptr, const T* end) { 115cb93a386Sopenharmony_ci *ptr = end; 116cb93a386Sopenharmony_ci return -1; 117cb93a386Sopenharmony_ci} 118cb93a386Sopenharmony_ci 119cb93a386Sopenharmony_ciSkUnichar SkUTF::NextUTF8(const char** ptr, const char* end) { 120cb93a386Sopenharmony_ci if (!ptr || !end ) { 121cb93a386Sopenharmony_ci return -1; 122cb93a386Sopenharmony_ci } 123cb93a386Sopenharmony_ci const uint8_t* p = (const uint8_t*)*ptr; 124cb93a386Sopenharmony_ci if (!p || p >= (const uint8_t*)end) { 125cb93a386Sopenharmony_ci return next_fail(ptr, end); 126cb93a386Sopenharmony_ci } 127cb93a386Sopenharmony_ci int c = *p; 128cb93a386Sopenharmony_ci int hic = c << 24; 129cb93a386Sopenharmony_ci 130cb93a386Sopenharmony_ci if (!utf8_type_is_valid_leading_byte(utf8_byte_type(c))) { 131cb93a386Sopenharmony_ci return next_fail(ptr, end); 132cb93a386Sopenharmony_ci } 133cb93a386Sopenharmony_ci if (hic < 0) { 134cb93a386Sopenharmony_ci uint32_t mask = (uint32_t)~0x3F; 135cb93a386Sopenharmony_ci hic = left_shift(hic, 1); 136cb93a386Sopenharmony_ci do { 137cb93a386Sopenharmony_ci ++p; 138cb93a386Sopenharmony_ci if (p >= (const uint8_t*)end) { 139cb93a386Sopenharmony_ci return next_fail(ptr, end); 140cb93a386Sopenharmony_ci } 141cb93a386Sopenharmony_ci // check before reading off end of array. 142cb93a386Sopenharmony_ci uint8_t nextByte = *p; 143cb93a386Sopenharmony_ci if (!utf8_byte_is_continuation(nextByte)) { 144cb93a386Sopenharmony_ci return next_fail(ptr, end); 145cb93a386Sopenharmony_ci } 146cb93a386Sopenharmony_ci c = (c << 6) | (nextByte & 0x3F); 147cb93a386Sopenharmony_ci mask <<= 5; 148cb93a386Sopenharmony_ci } while ((hic = left_shift(hic, 1)) < 0); 149cb93a386Sopenharmony_ci c &= ~mask; 150cb93a386Sopenharmony_ci } 151cb93a386Sopenharmony_ci *ptr = (char*)p + 1; 152cb93a386Sopenharmony_ci return c; 153cb93a386Sopenharmony_ci} 154cb93a386Sopenharmony_ci 155cb93a386Sopenharmony_ciSkUnichar SkUTF::NextUTF16(const uint16_t** ptr, const uint16_t* end) { 156cb93a386Sopenharmony_ci if (!ptr || !end ) { 157cb93a386Sopenharmony_ci return -1; 158cb93a386Sopenharmony_ci } 159cb93a386Sopenharmony_ci const uint16_t* src = *ptr; 160cb93a386Sopenharmony_ci if (!src || src + 1 > end || !is_align2(intptr_t(src))) { 161cb93a386Sopenharmony_ci return next_fail(ptr, end); 162cb93a386Sopenharmony_ci } 163cb93a386Sopenharmony_ci uint16_t c = *src++; 164cb93a386Sopenharmony_ci SkUnichar result = c; 165cb93a386Sopenharmony_ci if (utf16_is_low_surrogate(c)) { 166cb93a386Sopenharmony_ci return next_fail(ptr, end); // srcPtr should never point at low surrogate. 167cb93a386Sopenharmony_ci } 168cb93a386Sopenharmony_ci if (utf16_is_high_surrogate(c)) { 169cb93a386Sopenharmony_ci if (src + 1 > end) { 170cb93a386Sopenharmony_ci return next_fail(ptr, end); // Truncated string. 171cb93a386Sopenharmony_ci } 172cb93a386Sopenharmony_ci uint16_t low = *src++; 173cb93a386Sopenharmony_ci if (!utf16_is_low_surrogate(low)) { 174cb93a386Sopenharmony_ci return next_fail(ptr, end); 175cb93a386Sopenharmony_ci } 176cb93a386Sopenharmony_ci /* 177cb93a386Sopenharmony_ci [paraphrased from wikipedia] 178cb93a386Sopenharmony_ci Take the high surrogate and subtract 0xD800, then multiply by 0x400. 179cb93a386Sopenharmony_ci Take the low surrogate and subtract 0xDC00. Add these two results 180cb93a386Sopenharmony_ci together, and finally add 0x10000 to get the final decoded codepoint. 181cb93a386Sopenharmony_ci 182cb93a386Sopenharmony_ci unicode = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000 183cb93a386Sopenharmony_ci unicode = (high * 0x400) - (0xD800 * 0x400) + low - 0xDC00 + 0x10000 184cb93a386Sopenharmony_ci unicode = (high << 10) - (0xD800 << 10) + low - 0xDC00 + 0x10000 185cb93a386Sopenharmony_ci unicode = (high << 10) + low - ((0xD800 << 10) + 0xDC00 - 0x10000) 186cb93a386Sopenharmony_ci */ 187cb93a386Sopenharmony_ci result = (result << 10) + (SkUnichar)low - ((0xD800 << 10) + 0xDC00 - 0x10000); 188cb93a386Sopenharmony_ci } 189cb93a386Sopenharmony_ci *ptr = src; 190cb93a386Sopenharmony_ci return result; 191cb93a386Sopenharmony_ci} 192cb93a386Sopenharmony_ci 193cb93a386Sopenharmony_ciSkUnichar SkUTF::NextUTF32(const int32_t** ptr, const int32_t* end) { 194cb93a386Sopenharmony_ci if (!ptr || !end ) { 195cb93a386Sopenharmony_ci return -1; 196cb93a386Sopenharmony_ci } 197cb93a386Sopenharmony_ci const int32_t* s = *ptr; 198cb93a386Sopenharmony_ci if (!s || s + 1 > end || !is_align4(intptr_t(s))) { 199cb93a386Sopenharmony_ci return next_fail(ptr, end); 200cb93a386Sopenharmony_ci } 201cb93a386Sopenharmony_ci int32_t value = *s; 202cb93a386Sopenharmony_ci const uint32_t kInvalidUnicharMask = 0xFF000000; // unichar fits in 24 bits 203cb93a386Sopenharmony_ci if (value & kInvalidUnicharMask) { 204cb93a386Sopenharmony_ci return next_fail(ptr, end); 205cb93a386Sopenharmony_ci } 206cb93a386Sopenharmony_ci *ptr = s + 1; 207cb93a386Sopenharmony_ci return value; 208cb93a386Sopenharmony_ci} 209cb93a386Sopenharmony_ci 210cb93a386Sopenharmony_cisize_t SkUTF::ToUTF8(SkUnichar uni, char utf8[SkUTF::kMaxBytesInUTF8Sequence]) { 211cb93a386Sopenharmony_ci if ((uint32_t)uni > 0x10FFFF) { 212cb93a386Sopenharmony_ci return 0; 213cb93a386Sopenharmony_ci } 214cb93a386Sopenharmony_ci if (uni <= 127) { 215cb93a386Sopenharmony_ci if (utf8) { 216cb93a386Sopenharmony_ci *utf8 = (char)uni; 217cb93a386Sopenharmony_ci } 218cb93a386Sopenharmony_ci return 1; 219cb93a386Sopenharmony_ci } 220cb93a386Sopenharmony_ci char tmp[4]; 221cb93a386Sopenharmony_ci char* p = tmp; 222cb93a386Sopenharmony_ci size_t count = 1; 223cb93a386Sopenharmony_ci while (uni > 0x7F >> count) { 224cb93a386Sopenharmony_ci *p++ = (char)(0x80 | (uni & 0x3F)); 225cb93a386Sopenharmony_ci uni >>= 6; 226cb93a386Sopenharmony_ci count += 1; 227cb93a386Sopenharmony_ci } 228cb93a386Sopenharmony_ci if (utf8) { 229cb93a386Sopenharmony_ci p = tmp; 230cb93a386Sopenharmony_ci utf8 += count; 231cb93a386Sopenharmony_ci while (p < tmp + count - 1) { 232cb93a386Sopenharmony_ci *--utf8 = *p++; 233cb93a386Sopenharmony_ci } 234cb93a386Sopenharmony_ci *--utf8 = (char)(~(0xFF >> count) | uni); 235cb93a386Sopenharmony_ci } 236cb93a386Sopenharmony_ci return count; 237cb93a386Sopenharmony_ci} 238cb93a386Sopenharmony_ci 239cb93a386Sopenharmony_cisize_t SkUTF::ToUTF16(SkUnichar uni, uint16_t utf16[2]) { 240cb93a386Sopenharmony_ci if ((uint32_t)uni > 0x10FFFF) { 241cb93a386Sopenharmony_ci return 0; 242cb93a386Sopenharmony_ci } 243cb93a386Sopenharmony_ci int extra = (uni > 0xFFFF); 244cb93a386Sopenharmony_ci if (utf16) { 245cb93a386Sopenharmony_ci if (extra) { 246cb93a386Sopenharmony_ci utf16[0] = (uint16_t)((0xD800 - 64) + (uni >> 10)); 247cb93a386Sopenharmony_ci utf16[1] = (uint16_t)(0xDC00 | (uni & 0x3FF)); 248cb93a386Sopenharmony_ci } else { 249cb93a386Sopenharmony_ci utf16[0] = (uint16_t)uni; 250cb93a386Sopenharmony_ci } 251cb93a386Sopenharmony_ci } 252cb93a386Sopenharmony_ci return 1 + extra; 253cb93a386Sopenharmony_ci} 254cb93a386Sopenharmony_ci 255cb93a386Sopenharmony_ciint SkUTF::UTF8ToUTF16(uint16_t dst[], int dstCapacity, const char src[], size_t srcByteLength) { 256cb93a386Sopenharmony_ci if (!dst) { 257cb93a386Sopenharmony_ci dstCapacity = 0; 258cb93a386Sopenharmony_ci } 259cb93a386Sopenharmony_ci 260cb93a386Sopenharmony_ci int dstLength = 0; 261cb93a386Sopenharmony_ci uint16_t* endDst = dst + dstCapacity; 262cb93a386Sopenharmony_ci const char* endSrc = src + srcByteLength; 263cb93a386Sopenharmony_ci while (src < endSrc) { 264cb93a386Sopenharmony_ci SkUnichar uni = NextUTF8(&src, endSrc); 265cb93a386Sopenharmony_ci if (uni < 0) { 266cb93a386Sopenharmony_ci return -1; 267cb93a386Sopenharmony_ci } 268cb93a386Sopenharmony_ci 269cb93a386Sopenharmony_ci uint16_t utf16[2]; 270cb93a386Sopenharmony_ci size_t count = ToUTF16(uni, utf16); 271cb93a386Sopenharmony_ci if (count == 0) { 272cb93a386Sopenharmony_ci return -1; 273cb93a386Sopenharmony_ci } 274cb93a386Sopenharmony_ci dstLength += count; 275cb93a386Sopenharmony_ci 276cb93a386Sopenharmony_ci if (dst) { 277cb93a386Sopenharmony_ci uint16_t* elems = utf16; 278cb93a386Sopenharmony_ci while (dst < endDst && count > 0) { 279cb93a386Sopenharmony_ci *dst++ = *elems++; 280cb93a386Sopenharmony_ci count -= 1; 281cb93a386Sopenharmony_ci } 282cb93a386Sopenharmony_ci } 283cb93a386Sopenharmony_ci } 284cb93a386Sopenharmony_ci return dstLength; 285cb93a386Sopenharmony_ci} 286cb93a386Sopenharmony_ci 287cb93a386Sopenharmony_ciint SkUTF::UTF16ToUTF8(char dst[], int dstCapacity, const uint16_t src[], size_t srcLength) { 288cb93a386Sopenharmony_ci if (!dst) { 289cb93a386Sopenharmony_ci dstCapacity = 0; 290cb93a386Sopenharmony_ci } 291cb93a386Sopenharmony_ci 292cb93a386Sopenharmony_ci int dstLength = 0; 293cb93a386Sopenharmony_ci const char* endDst = dst + dstCapacity; 294cb93a386Sopenharmony_ci const uint16_t* endSrc = src + srcLength; 295cb93a386Sopenharmony_ci while (src < endSrc) { 296cb93a386Sopenharmony_ci SkUnichar uni = NextUTF16(&src, endSrc); 297cb93a386Sopenharmony_ci if (uni < 0) { 298cb93a386Sopenharmony_ci return -1; 299cb93a386Sopenharmony_ci } 300cb93a386Sopenharmony_ci 301cb93a386Sopenharmony_ci char utf8[SkUTF::kMaxBytesInUTF8Sequence]; 302cb93a386Sopenharmony_ci size_t count = ToUTF8(uni, utf8); 303cb93a386Sopenharmony_ci if (count == 0) { 304cb93a386Sopenharmony_ci return -1; 305cb93a386Sopenharmony_ci } 306cb93a386Sopenharmony_ci dstLength += count; 307cb93a386Sopenharmony_ci 308cb93a386Sopenharmony_ci if (dst) { 309cb93a386Sopenharmony_ci const char* elems = utf8; 310cb93a386Sopenharmony_ci while (dst < endDst && count > 0) { 311cb93a386Sopenharmony_ci *dst++ = *elems++; 312cb93a386Sopenharmony_ci count -= 1; 313cb93a386Sopenharmony_ci } 314cb93a386Sopenharmony_ci } 315cb93a386Sopenharmony_ci } 316cb93a386Sopenharmony_ci return dstLength; 317cb93a386Sopenharmony_ci} 318