1/* 2 * Copyright (c) 2021 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16#ifndef ECMASCRIPT_BASE_UTF_HELPER_H 17#define ECMASCRIPT_BASE_UTF_HELPER_H 18 19#include <cstdint> 20#include <vector> 21 22#include "libpandabase/utils/utf.h" 23#include "ecmascript/common.h" 24 25namespace panda::ecmascript::base::utf_helper { 26 27static constexpr size_t CONST_2 = 2; 28static constexpr size_t CONST_3 = 3; 29static constexpr size_t CONST_4 = 4; 30static constexpr size_t MASK1 = 0x80; 31static constexpr size_t MASK2 = 0x20; 32static constexpr size_t MASK3 = 0x10; 33static constexpr size_t LOW_3BITS = 0x7; 34static constexpr size_t LOW_4BITS = 0xF; 35static constexpr size_t LOW_5BITS = 0x1F; 36static constexpr size_t LOW_6BITS = 0x3F; 37static constexpr size_t L_SURROGATE_START = 0xDC00; 38static constexpr size_t H_SURROGATE_START = 0xD800; 39static constexpr size_t SURROGATE_RAIR_START = 0x10000; 40static constexpr size_t OFFSET_18POS = 18; 41static constexpr size_t OFFSET_12POS = 12; 42static constexpr size_t OFFSET_10POS = 10; 43static constexpr size_t OFFSET_6POS = 6; 44static constexpr uint16_t DECODE_LEAD_LOW = 0xD800; 45static constexpr uint16_t DECODE_LEAD_HIGH = 0xDBFF; 46static constexpr uint16_t DECODE_TRAIL_LOW = 0xDC00; 47static constexpr uint16_t DECODE_TRAIL_HIGH = 0xDFFF; 48static constexpr uint32_t DECODE_FIRST_FACTOR = 0x400; 49static constexpr uint32_t DECODE_SECOND_FACTOR = 0x10000; 50static constexpr uint32_t UTF8_OFFSET = 6; 51static constexpr uint32_t UTF16_OFFSET = 10; 52static constexpr uint16_t SURROGATE_MASK = 0xF800; 53static constexpr uint16_t UTF16_REPLACEMENT_CHARACTER = 0xFFFD; 54 55static constexpr uint8_t BIT_MASK_1 = 0x80; 56static constexpr uint8_t BIT_MASK_2 = 0xC0; 57static constexpr uint8_t BIT_MASK_3 = 0xE0; 58static constexpr uint8_t BIT_MASK_4 = 0xF0; 59static constexpr uint8_t BIT_MASK_5 = 0xF8; 60static constexpr uint8_t BIT_MASK_FF = 0xFF; 61static constexpr uint16_t BIT16_MASK = 0x3FF; 62 63static constexpr uint8_t UTF8_1B_MAX = 0x7f; 64 65static constexpr uint16_t UTF8_2B_MAX = 0x7ff; 66static constexpr uint8_t UTF8_2B_FIRST = 0xc0; 67static constexpr uint8_t UTF8_2B_SECOND = 0x80; 68static constexpr uint8_t UTF8_2B_THIRD = 0x3f; 69static constexpr uint8_t UTF8_2B_FIRST_MIN = 0xc2; // the minimum for 2 bytes is 128, which is 0xc280 70 71static constexpr uint16_t UTF8_3B_MAX = 0xffff; 72static constexpr uint8_t UTF8_3B_FIRST = 0xe0; 73static constexpr uint8_t UTF8_3B_SECOND = 0x80; 74static constexpr uint8_t UTF8_3B_THIRD = 0x80; 75static constexpr uint8_t UTF8_3B_SECOND_MIN = 0xa0; // the minimum for 3 bytes is 2048, which is 0xe0a080 76static constexpr uint8_t UTF8_3B_RESERVED_FIRST = 0xED; 77static constexpr uint8_t UTF8_3B_RESERVED_SECOND_MIN = 0xA0; 78static constexpr uint8_t UTF8_3B_RESERVED_SECOND_MAX = 0xBF; // U+D800~U+DFFF is reserved for UTF-16 surrogate pairs 79 80static constexpr uint8_t UTF8_4B_FIRST = 0xf0; 81static constexpr uint8_t UTF8_4B_SECOND_MIN = 0x90; // the minimum for 4 bytes is 65536, which is 0xf0908080 82static constexpr uint8_t UTF8_4B_FIRST_MAX = 0xF4; // the maximum for 4 bytes is 1114111, which is 0x10FFFF 83static constexpr uint8_t UTF8_4B_SECOND_MAX = 0x8F; 84 85static constexpr uint8_t byteMask = 0xbf; 86static constexpr uint8_t byteMark = 0x80; 87 88static constexpr uint8_t latin1Limit = 0xFF; 89 90static constexpr int32_t INVALID_UTF8 = -1; 91 92enum UtfLength : uint8_t { ONE = 1, TWO = 2, THREE = 3, FOUR = 4 }; 93enum UtfOffset : uint8_t { SIX = 6, TEN = 10, TWELVE = 12, EIGHTEEN = 18 }; 94 95static constexpr size_t MAX_BYTES = 4; 96struct Utf8Char { 97 size_t n; 98 std::array<uint8_t, MAX_BYTES> ch; 99}; 100 101static const unsigned char firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC}; 102 103uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index, bool cesu8 = false); 104 105size_t EncodeUTF8(uint32_t codepoint, uint8_t* utf8, size_t len, size_t index); 106 107uint32_t UTF16Decode(uint16_t lead, uint16_t trail); 108 109bool IsValidUTF8(const std::vector<uint8_t> &data); 110 111Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify, bool isWriteBuffer = false); 112 113size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify = true, 114 bool isGetBufferSize = false, bool cesu8 = false); 115 116size_t PUBLIC_API ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, 117 size_t utf8Len, size_t start, bool modify = true, 118 bool isWriteBuffer = false, bool cesu = false); 119 120size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len, 121 size_t start, bool modify = true, bool isWriteBuffer = false); 122 123uint32_t HandleAndDecodeInvalidUTF16(uint16_t const *utf16, size_t len, size_t *index); 124 125std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine = false); 126 127size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len); 128 129size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len); 130 131size_t ConvertRegionUtf16ToLatin1(const uint16_t *utf16In, uint8_t *latin1Out, size_t utf16Len, size_t latin1Len); 132 133static inline uint32_t CombineTwoU16(uint16_t d0, uint16_t d1) 134{ 135 uint32_t codePoint = d0 - utf::HI_SURROGATE_MIN; 136 codePoint <<= UtfOffset::TEN; 137 codePoint |= d1 - utf::LO_SURROGATE_MIN; 138 codePoint += utf::LO_SUPPLEMENTS_MIN; 139 return codePoint; 140} 141 142std::pair<int32_t, size_t> ConvertUtf8ToUnicodeChar(const uint8_t *utf8, size_t maxLen); 143 144static inline bool IsHexDigits(uint16_t ch) 145{ 146 return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f'); 147} 148 149} // namespace panda::ecmascript::base::utf_helper 150 151#endif // ECMASCRIPT_BASE_UTF_HELPER_H