/*
 * Copyright (c) 2022 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "ecmascript/base/utf_helper.h"
#include "ecmascript/tests/test_helper.h"

using namespace panda::ecmascript;
using namespace panda::ecmascript::base;
using namespace panda::ecmascript::base::utf_helper;

namespace panda::test {
// Fixture used by every utf_helper test case in this file; it adds no members of its own
// on top of BaseTestWithScope<false>.
class UtfHelperTest : public BaseTestWithScope<false> {
};

/*
* @tc.name: CombineTwoU16
* @tc.desc: Enter a pair of UTF16-encoded surrogate pair corresponding to the lead surrogates and trail surrogates,
*           and return the corresponding Unicode codepoint value.
31* @tc.type: FUNC 32*/ 33HWTEST_F_L0(UtfHelperTest, CombineTwoU16) 34{ 35 uint16_t leadSur = 0xD800; 36 uint16_t trailSur = 0xDC00; 37 uint32_t codePoint = static_cast<uint32_t>(((leadSur - 0xD800U) << 10)|(trailSur - 0xDc00U)) + 0x10000U; 38 uint32_t utfHelperCodePoint = CombineTwoU16(leadSur, trailSur); 39 EXPECT_EQ(codePoint, utfHelperCodePoint); 40 EXPECT_EQ(codePoint, static_cast<uint32_t>(0x10000)); 41 trailSur = 0xDFFF; 42 codePoint = static_cast<uint32_t>(((leadSur - 0xD800U) << 10) | (trailSur - 0xDC00U))+ 0x10000U; 43 utfHelperCodePoint = CombineTwoU16(leadSur, trailSur); 44 EXPECT_EQ(codePoint, utfHelperCodePoint); 45 EXPECT_EQ(codePoint, static_cast<uint32_t>(0x103FF)); 46 leadSur = 0xDBFF; 47 codePoint = static_cast<uint32_t>(((leadSur - 0xD800U) << 10) | (trailSur - 0xDC00U)) + 0x10000U; 48 utfHelperCodePoint = CombineTwoU16(leadSur, trailSur); 49 EXPECT_EQ(codePoint, utfHelperCodePoint); 50 EXPECT_EQ(codePoint, static_cast<uint32_t>(0x10FFFF)); 51 trailSur = 0xDC00; 52 codePoint = static_cast<uint32_t>(((leadSur - 0xD800U) << 10) | (trailSur - 0xDC00U)) + 0x10000U; 53 utfHelperCodePoint = CombineTwoU16(leadSur, trailSur); 54 EXPECT_EQ(codePoint, utfHelperCodePoint); 55 EXPECT_EQ(codePoint, static_cast<uint32_t>(0x10FC00)); 56 leadSur = 0xD950; 57 trailSur = 0xDF21; 58 codePoint = static_cast<uint32_t>(((leadSur - 0xD800U)<< 10) | (trailSur - 0xDC00U)) + 0x10000U; 59 utfHelperCodePoint = CombineTwoU16(leadSur, trailSur); 60 EXPECT_EQ(codePoint, utfHelperCodePoint); 61 EXPECT_EQ(codePoint, static_cast<uint32_t>(0x64321)); 62} 63 64/* 65* @tc.name: UTF16Decode 66* @tc.desc: Enter a pair of UTF16-encoded surrogate pair corresponding to the lead surrogates and trail surrogates, 67 Decodes them into corresponding Unicode codepoint values and returns. 
68* @tc.type: FUNC 69*/ 70HWTEST_F_L0(UtfHelperTest, UTF16Decode) 71{ 72 uint16_t lead = 0xD950; 73 uint16_t trail = 0xDF21; 74 EXPECT_TRUE(lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH); 75 EXPECT_TRUE(trail >= DECODE_TRAIL_LOW && trail <= DECODE_TRAIL_HIGH); 76 uint32_t codePoint = utf_helper::UTF16Decode(lead, trail); 77 EXPECT_EQ(codePoint, 0x64321U); 78 lead = 0xD85D; 79 trail = 0xDFCC; 80 EXPECT_TRUE(lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH); 81 EXPECT_TRUE(trail >= DECODE_TRAIL_LOW && trail <= DECODE_TRAIL_HIGH); 82 codePoint = utf_helper::UTF16Decode(lead, trail); 83 EXPECT_EQ(codePoint, 0x277CCU); 84} 85 86/* 87 * @tc.name: IsValidUTF8 88 * @tc.desc: Judge whether an input group of symbols is a valid UTF8 coding sequence. 89 * @tc.type: FUNC 90 */ 91HWTEST_F_L0(UtfHelperTest, IsValidUTF8) 92{ 93 // 0xxxxxxx, min:0, max:127 94 const std::vector<uint8_t> utfDataOneBitVaild1 = {0x00}; 95 const std::vector<uint8_t> utfDataOneBitVaild2 = {BIT_MASK_1 - 0x01}; 96 const std::vector<uint8_t> utfDataOneBitInvaild = {BIT_MASK_1}; 97 EXPECT_TRUE(utf_helper::IsValidUTF8(utfDataOneBitVaild1)); 98 EXPECT_TRUE(utf_helper::IsValidUTF8(utfDataOneBitVaild2)); 99 EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataOneBitInvaild)); 100 // 110xxxxx 10xxxxxx, min:128, max:2047 101 const std::vector<uint8_t> utfDataTwoBitVaild1 = {BIT_MASK_2 + 0x02, BIT_MASK_1}; 102 const std::vector<uint8_t> utfDataTwoBitVaild2 = {BIT_MASK_3 - 0x01, BIT_MASK_2 - 0x01}; 103 const std::vector<uint8_t> utfDataTwoBitInvaild1 = {BIT_MASK_2, BIT_MASK_2}; 104 const std::vector<uint8_t> utfDataTwoBitInvaild2 = {BIT_MASK_3, BIT_MASK_1}; 105 const std::vector<uint8_t> utfDataTwoBitInvaild3 = {BIT_MASK_2, BIT_MASK_1}; 106 EXPECT_TRUE(utf_helper::IsValidUTF8(utfDataTwoBitVaild1)); 107 EXPECT_TRUE(utf_helper::IsValidUTF8(utfDataTwoBitVaild2)); 108 EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataTwoBitInvaild1)); 109 EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataTwoBitInvaild2)); 110 
EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataTwoBitInvaild3)); 111 // 1110xxxx 10xxxxxx 10xxxxxx, min:2048, max:65535 112 const std::vector<uint8_t> utfDataThreeBitVaild1 = {BIT_MASK_3, BIT_MASK_1 + 0x20, BIT_MASK_1}; 113 const std::vector<uint8_t> utfDataThreeBitVaild2 = {BIT_MASK_4 - 0x01, BIT_MASK_2 - 0x01, BIT_MASK_2 - 0x01}; 114 const std::vector<uint8_t> utfDataThreeBitVaild3 = {BIT_MASK_3 + 0x01, BIT_MASK_1, BIT_MASK_1}; 115 const std::vector<uint8_t> utfDataThreeBitInvaild1 = {BIT_MASK_3, BIT_MASK_1, BIT_MASK_2}; 116 const std::vector<uint8_t> utfDataThreeBitInvaild2 = {BIT_MASK_3, BIT_MASK_2, BIT_MASK_1}; 117 const std::vector<uint8_t> utfDataThreeBitInvaild3 = {BIT_MASK_4, BIT_MASK_1, BIT_MASK_1}; 118 const std::vector<uint8_t> utfDataThreeBitInvaild4 = {BIT_MASK_4, BIT_MASK_2, BIT_MASK_2}; 119 const std::vector<uint8_t> utfDataThreeBitInvaild5 = {BIT_MASK_3, BIT_MASK_1, BIT_MASK_1}; 120 EXPECT_TRUE(utf_helper::IsValidUTF8(utfDataThreeBitVaild1)); 121 EXPECT_TRUE(utf_helper::IsValidUTF8(utfDataThreeBitVaild2)); 122 EXPECT_TRUE(utf_helper::IsValidUTF8(utfDataThreeBitVaild3)); 123 EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataThreeBitInvaild1)); 124 EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataThreeBitInvaild2)); 125 EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataThreeBitInvaild3)); 126 EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataThreeBitInvaild4)); 127 EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataThreeBitInvaild5)); 128 // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx, min:65536, max:1114111(0x10FFFF) 129 const std::vector<uint8_t> utfDataFourBitVaild1 = {BIT_MASK_4, BIT_MASK_1 + 0x10, BIT_MASK_1, BIT_MASK_1}; 130 const std::vector<uint8_t> utfDataFourBitVaild3 = {BIT_MASK_4 + 0x01, BIT_MASK_1, BIT_MASK_1, BIT_MASK_1}; 131 const std::vector<uint8_t> utfDataFourBitInvaild1 = {BIT_MASK_4, BIT_MASK_1, BIT_MASK_1, BIT_MASK_2}; 132 const std::vector<uint8_t> utfDataFourBitInvaild2 = {BIT_MASK_4, BIT_MASK_1, BIT_MASK_2, BIT_MASK_1}; 133 const std::vector<uint8_t> 
utfDataFourBitInvaild3 = {BIT_MASK_4, BIT_MASK_2, BIT_MASK_1, BIT_MASK_1}; 134 const std::vector<uint8_t> utfDataFourBitInvaild4 = {BIT_MASK_5, BIT_MASK_1, BIT_MASK_1, BIT_MASK_1}; 135 const std::vector<uint8_t> utfDataFourBitInvaild5 = {BIT_MASK_5, BIT_MASK_2, BIT_MASK_2, BIT_MASK_2}; 136 const std::vector<uint8_t> utfDataFourBitInvaild6 = {BIT_MASK_4, BIT_MASK_1, BIT_MASK_1, BIT_MASK_1}; 137 const std::vector<uint8_t> utfDataFourBitInvaild7 = 138 {BIT_MASK_5 - 0x01, BIT_MASK_2 - 0x01, BIT_MASK_2 - 0x01, BIT_MASK_2 - 0x01}; 139 EXPECT_TRUE(utf_helper::IsValidUTF8(utfDataFourBitVaild1)); 140 EXPECT_TRUE(utf_helper::IsValidUTF8(utfDataFourBitVaild3)); 141 EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataFourBitInvaild1)); 142 EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataFourBitInvaild2)); 143 EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataFourBitInvaild3)); 144 EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataFourBitInvaild4)); 145 EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataFourBitInvaild5)); 146 EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataFourBitInvaild6)); 147 EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataFourBitInvaild7)); 148} 149 150/* 151* @tc.name: ConvertUtf16ToUtf8 152* @tc.desc: Converts a UTF16 encoding sequence encoding a character into a UTF8 encoding sequence, 153* and returns the sequence and the byte length of the sequence. The parameter "modify" 154* indicates whether to perform special conversion for 0. 
155* @tc.type: FUNC 156*/ 157HWTEST_F_L0(UtfHelperTest, ConvertUtf16ToUtf8_001) 158{ 159 // codePoint lie in [0,0x7F]--->UTF-8(length:1) 160 { 161 uint16_t utf16Data0 = 0x00; 162 uint16_t utf16Data1 = 0x00; 163 Utf8Char utf8Char = ConvertUtf16ToUtf8(utf16Data0, utf16Data1, false); 164 Utf8Char utf8CharTemp = {0, {0x00U}}; 165 EXPECT_EQ(utf8Char.n, utf8CharTemp.n); 166 EXPECT_EQ(utf8Char.ch, utf8CharTemp.ch); 167 } 168 169 // special case for \u0000 ==> Co80- 1100'0000 1000'0000 170 { 171 uint16_t utf16Data0 = 0x00; 172 uint16_t utf16Data1 = 0x00; 173 Utf8Char utf8Char = ConvertUtf16ToUtf8(utf16Data0, utf16Data1, true); 174 Utf8Char utf8CharTemp = {2, {UTF8_2B_FIRST, UTF8_2B_SECOND}}; 175 EXPECT_EQ(utf8Char.n, utf8CharTemp.n); 176 EXPECT_EQ(utf8Char.ch, utf8CharTemp.ch); 177 utf16Data0 = 0x7F; 178 utf8Char = ConvertUtf16ToUtf8(utf16Data0, utf16Data1, false); 179 utf8CharTemp = {1, {0x7F}}; 180 EXPECT_EQ(utf8Char.n, utf8CharTemp.n); 181 EXPECT_EQ(utf8Char.ch, utf8CharTemp.ch); 182 183 // codePoint lie in [0x80,0x7FF]--> UTF-8(length:2) 184 utf16Data0 = 0x80; 185 utf8Char = ConvertUtf16ToUtf8(utf16Data0, utf16Data1, false); 186 utf8CharTemp = {2, {UTF8_2B_FIRST + 0x02U, UTF8_2B_SECOND}}; 187 EXPECT_EQ(utf8Char.n, utf8CharTemp.n); 188 EXPECT_EQ(utf8Char.ch, utf8CharTemp.ch); 189 utf16Data0 = 0x7FF; 190 utf8Char = ConvertUtf16ToUtf8(utf16Data0, utf16Data1, false); 191 utf8CharTemp = {2, {BIT_MASK_3 - 0x01, BIT_MASK_2 - 0x01}}; 192 EXPECT_EQ(utf8Char.n, utf8CharTemp.n); 193 EXPECT_EQ(utf8Char.ch, utf8CharTemp.ch); 194 } 195 196 // codePoint lie in [0xD800,0xDFFF]--> UTF-8(length:3) 197 { 198 uint16_t utf16Data0 = 0xD800; 199 uint16_t utf16Data1 = 0x00; 200 Utf8Char utf8Char = ConvertUtf16ToUtf8(utf16Data0, utf16Data1, false); 201 Utf8Char utf8CharTemp = {3, {UTF8_3B_FIRST | static_cast<uint8_t>(0xD800 >> 12), 202 UTF8_3B_SECOND | (static_cast<uint8_t>(0xD800 >> 6) & utf::MASK_6BIT), 203 UTF8_3B_THIRD | (static_cast<uint8_t>(0xD800) & utf::MASK_6BIT)}}; 204 
EXPECT_EQ(utf8Char.n, utf8CharTemp.n); 205 EXPECT_EQ(utf8Char.ch, utf8CharTemp.ch); 206 utf16Data0 = 0xDFFF; 207 utf8Char = ConvertUtf16ToUtf8(utf16Data0, utf16Data1, false); 208 utf8CharTemp = {3, {UTF8_3B_FIRST | static_cast<uint8_t>(0xDFFF >> 12), 209 UTF8_3B_SECOND | (static_cast<uint8_t>(0xDFFF >> 6) & utf::MASK_6BIT), 210 UTF8_3B_THIRD | (static_cast<uint8_t>(0xDFFF) & utf::MASK_6BIT)}}; 211 EXPECT_EQ(utf8Char.n, utf8CharTemp.n); 212 EXPECT_EQ(utf8Char.ch, utf8CharTemp.ch); 213 } 214} 215 216HWTEST_F_L0(UtfHelperTest, ConvertUtf16ToUtf8_002) 217{ 218 // codePoint lie in [0x800,0xD7FF]&&[0xE000,0xFFFF]-->UTF-8(length:3) 219 uint16_t utf16Data0 = 0x800; 220 uint16_t utf16Data1 = 0x00; 221 Utf8Char utf8Char = ConvertUtf16ToUtf8(utf16Data0, utf16Data1, false); 222 Utf8Char utf8CharTemp = {3, {UTF8_3B_FIRST | static_cast<uint8_t>(0x800 >> 12), 223 UTF8_3B_SECOND | (static_cast<uint8_t>(0x800 >> 6) & utf::MASK_6BIT), 224 UTF8_3B_THIRD | (static_cast<uint8_t>(0x800) & utf::MASK_6BIT)}}; 225 EXPECT_EQ(utf8Char.n, utf8CharTemp.n); 226 EXPECT_EQ(utf8Char.ch, utf8CharTemp.ch); 227 utf16Data0 = 0xD7FF; 228 utf8Char = ConvertUtf16ToUtf8(utf16Data0, utf16Data1, false); 229 utf8CharTemp = {3, {UTF8_3B_FIRST | static_cast<uint8_t>(0xD7FF>>12), 230 UTF8_3B_SECOND | (static_cast<uint8_t>(0xD7FF >> 6) & utf::MASK_6BIT), 231 UTF8_3B_THIRD | (static_cast<uint8_t>(0xD7FF) & utf::MASK_6BIT)}}; 232 EXPECT_EQ(utf8Char.n, utf8CharTemp.n); 233 EXPECT_EQ(utf8Char.ch, utf8CharTemp.ch); 234 utf16Data0 = 0xE000; 235 utf8Char = ConvertUtf16ToUtf8(utf16Data0, utf16Data1, false); 236 utf8CharTemp = {3, {UTF8_3B_FIRST | static_cast<uint8_t>(0xE000 >> 12), 237 UTF8_3B_SECOND | (static_cast<uint8_t>(0xE000 >> 6)& utf::MASK_6BIT), 238 UTF8_3B_THIRD | (static_cast<uint8_t>(0xE000) & utf::MASK_6BIT)}}; 239 EXPECT_EQ(utf8Char.n, utf8CharTemp.n); 240 EXPECT_EQ(utf8Char.ch, utf8CharTemp.ch); 241 utf16Data0 = 0xFFFF; 242 utf8Char = ConvertUtf16ToUtf8(utf16Data0, utf16Data1, false); 243 utf8CharTemp = 
{3, {UTF8_3B_FIRST | static_cast<uint8_t>(0xFFFF >> 12), 244 UTF8_3B_SECOND | (static_cast<uint8_t>(0xFFFF >> 6)& utf::MASK_6BIT), 245 UTF8_3B_THIRD | (static_cast<uint8_t>(0xFFFF) & utf::MASK_6BIT)}}; 246 EXPECT_EQ(utf8Char.n, utf8CharTemp.n); 247 EXPECT_EQ(utf8Char.ch, utf8CharTemp.ch); 248} 249 250HWTEST_F_L0(UtfHelperTest, ConvertUtf16ToUtf8_003) 251{ 252 // codePoint lie in [0x10000,0x10FFFF] --> UTF-8(length:4) 253 { 254 uint16_t utf16Data0 = 0xD800; 255 uint16_t utf16Data1 = 0xDC00; 256 Utf8Char utf8Char = ConvertUtf16ToUtf8(utf16Data0, utf16Data1, false); 257 uint32_t codePoint = CombineTwoU16(utf16Data0, utf16Data1); 258 Utf8Char utf8CharTemp = {4, {static_cast<uint8_t>((codePoint >> 18) | UTF8_4B_FIRST), 259 static_cast<uint8_t>(((codePoint >> 12) & utf::MASK_6BIT) | utf::MASK1), 260 static_cast<uint8_t>(((codePoint >> 6) & utf::MASK_6BIT) | utf::MASK1), 261 static_cast<uint8_t>((codePoint & utf::MASK_6BIT) | utf::MASK1)}}; 262 EXPECT_EQ(utf8Char.n, utf8CharTemp.n); 263 EXPECT_EQ(utf8Char.ch, utf8CharTemp.ch); 264 } 265 266 // 0xD950 0xDF21 --> 0x64321 --> 0xf1 0xa4 0x8c 0xa1 267 { 268 uint16_t utf16Data0 = 0xD950; 269 uint16_t utf16Data1 = 0xDF21; 270 Utf8Char utf8Char = ConvertUtf16ToUtf8(utf16Data0, utf16Data1, false); 271 uint32_t codePoint = CombineTwoU16(utf16Data0, utf16Data1); 272 Utf8Char utf8CharTemp = {4, {static_cast<uint8_t>((codePoint >> 18) | UTF8_4B_FIRST), 273 static_cast<uint8_t>(((codePoint >> 12)& utf::MASK_6BIT)| utf::MASK1), 274 static_cast<uint8_t>(((codePoint >> 6)& utf::MASK_6BIT) | utf::MASK1), 275 static_cast<uint8_t>((codePoint & utf::MASK_6BIT) | utf::MASK1)}}; 276 EXPECT_EQ(utf8Char.n, utf8CharTemp.n); 277 EXPECT_EQ(utf8Char.ch, utf8CharTemp.ch); 278 utf8CharTemp = {4, {0xf1, 0xa4, 0x8c, 0xa1}}; 279 EXPECT_EQ(utf8Char.n, utf8CharTemp.n); 280 EXPECT_EQ(utf8Char.ch, utf8CharTemp.ch); 281 } 282} 283 284/* 285* @tc.name: Utf16ToUtf8Size 286* @tc.desc: Enter a string of UTF16 coded sequences and return the length of the sequence 
converted into UTF8 coded 287* sequences. "length" indicates the length of the input UTF16 sequence, and "modify" indicates whether 288* to perform special conversion for. 289* @tc.type: FUNC 290*/ 291HWTEST_F_L0(UtfHelperTest, Utf16ToUtf8Size_001) 292{ 293 // when utf16 data length is only 1 and code in 0xd800-0xdfff, means that is a single code point, it needs to be 294 // represented by three UTF8 code. 295 uint32_t length = 0; 296 uint16_t utf16Value1[1] = {0xD800}; 297 const uint16_t *utf16ValuePtr1 = utf16Value1; 298 length = Utf16ToUtf8Size(utf16ValuePtr1, 1, false); 299 EXPECT_EQ(length - 1, UtfLength::THREE); 300 length = 1; 301 uint16_t utf16Value2[1] = {0xDFFF}; 302 const uint16_t *utf16ValuePtr2 = utf16Value2; 303 length = Utf16ToUtf8Size(utf16ValuePtr2, 1, false); 304 EXPECT_EQ(length - 1, UtfLength::THREE); 305 306 // special case for U+0000 => c0 80 307 uint16_t utf16Value3[1] = {0x00}; 308 const uint16_t *utf16ValuePtr3 = utf16Value3; 309 length = Utf16ToUtf8Size(utf16ValuePtr3, 1, false); 310 EXPECT_EQ(length - 1, 0U); 311 length = Utf16ToUtf8Size(utf16ValuePtr3, 1, true); 312 EXPECT_EQ(length - 1, 2U); 313 314 // if isGetBufferSize is true, special case for U+0000 => 00 315 uint16_t utf16Value12[1] = {0x00}; 316 const uint16_t *utf16ValuePtr12 = utf16Value12; 317 length = Utf16ToUtf8Size(utf16ValuePtr12, 1, false, true); 318 EXPECT_EQ(length - 1, 1U); 319 length = Utf16ToUtf8Size(utf16ValuePtr12, 1, true, true); 320 EXPECT_EQ(length - 1, 1U); 321 322 // code point lie in [0x00, 0x7F], it needs to be represented by one UTF8 code. 
323 uint16_t utf16Value4[1] = {0x00}; 324 uint16_t utf16Value5[1] = {0x7F}; 325 const uint16_t *utf16ValuePtr4 = utf16Value4; 326 const uint16_t *utf16ValuePtr5 = utf16Value5; 327 length = Utf16ToUtf8Size(utf16ValuePtr4, 1, false); 328 EXPECT_EQ(length - 1, 0U); 329 length = Utf16ToUtf8Size(utf16ValuePtr5, 1, false); 330 EXPECT_EQ(length - 1, 1U); 331 332 // code point lie in [0x80, 0x7FF], it needs to be represented by two UTF8 code, 333 uint16_t utf16Value6[1] = {0x80}; 334 uint16_t utf16Value7[1] = {0x7FF}; 335 const uint16_t *utf16ValuePtr6 = utf16Value6; 336 const uint16_t *utf16ValuePtr7 = utf16Value7; 337 length = Utf16ToUtf8Size(utf16ValuePtr6, 1, false); 338 EXPECT_EQ(length - 1, 2U); 339 length = Utf16ToUtf8Size(utf16ValuePtr7, 1, false); 340 EXPECT_EQ(length - 1, 2U); 341 // code point lie in [0x800, 0xD7FF] or [0xDCoo, 0xFFFF], it needs to be represented by three UTF8 code. 342 uint16_t utf16Value8[1] = {0x800}; 343 uint16_t utf16Value9[1] = {0xD7FF}; 344 uint16_t utf16Value10[1] = {0xDC00}; 345 uint16_t utf16Value11[1] = {0xFFFF}; 346 const uint16_t *utf16ValuePtr8 = utf16Value8; 347 const uint16_t *utf16ValuePtr9 = utf16Value9; 348 const uint16_t *utf16ValuePtr10 = utf16Value10; 349 const uint16_t *utf16ValuePtr11 = utf16Value11; 350 length = Utf16ToUtf8Size(utf16ValuePtr8, 1, false); 351 EXPECT_EQ(length - 1, 3U); 352 length = Utf16ToUtf8Size(utf16ValuePtr9, 1, false); 353 EXPECT_EQ(length - 1, 3U); 354 length = Utf16ToUtf8Size(utf16ValuePtr10, 1, false); 355 EXPECT_EQ(length-1, 3U); 356 length = Utf16ToUtf8Size(utf16ValuePtr11, 1, false); 357 EXPECT_EQ(length - 1, 3U); 358} 359 360HWTEST_F_L0(UtfHelperTest, Utf16ToUtf8Size_002) 361{ 362 // The trail value is valid, located in [0xDc00, 0xDFFF].It needs to be represented by four UTF8 code. 
363 uint16_t utf16Value12[2] = {0xD800, 0xDc00}; 364 uint16_t utf16Value13[2] = {0xD800, 0xDFFF}; 365 uint16_t utf16Value14[2] = {0xDBFF, 0xDC00}; 366 uint16_t utf16Value15[2] = {0xDBFF, 0xDFFF}; 367 const uint16_t *utf16ValuePtr12 = utf16Value12; 368 const uint16_t *utf16ValuePtr13 = utf16Value13; 369 const uint16_t *utf16ValuePtr14 = utf16Value14; 370 const uint16_t *utf16ValuePtr15 = utf16Value15; 371 uint32_t length = Utf16ToUtf8Size(utf16ValuePtr12, 2, false); 372 EXPECT_EQ(length - 1, 4U); 373 length = Utf16ToUtf8Size(utf16ValuePtr13, 2, false); 374 EXPECT_EQ(length- 1, 4U); 375 length = Utf16ToUtf8Size(utf16ValuePtr14, 2, false); 376 EXPECT_EQ(length - 1, 4U); 377 length = Utf16ToUtf8Size(utf16ValuePtr15, 2, false); 378 EXPECT_EQ(length - 1, 4U); 379 380 // The trail value of Bad sequence is invalid, not located in [0xDC00, 0xDFFF]. 381 // Need to return 6 bytes length 382 uint16_t utf16Value16[2] = {0xD800, 0xDBFF}; 383 uint16_t utf16Value17[2] = {0xDC00, 0xDFFF}; 384 const uint16_t *utf16ValuePtr16 = utf16Value16; 385 const uint16_t *utf16ValuePtr17 = utf16Value17; 386 length = Utf16ToUtf8Size(utf16ValuePtr16, 2, false); 387 EXPECT_EQ(length- 1, 6U); 388 length = Utf16ToUtf8Size(utf16ValuePtr17, 2, false); 389 EXPECT_EQ(length-1, 6U); 390 391 // 0(or 2)+ 1+ 2 + 3 + 4 = 10(or 12) 392 uint16_t utf16Value18[6] = {0x00, 0x7F, 0x80, 0x800, 0xD800, 0xDC00}; 393 const uint16_t *utf16ValuePtr18 = utf16Value18; 394 length = Utf16ToUtf8Size(utf16ValuePtr18, 6, false); 395 EXPECT_EQ(length - 1, 10U); 396 length = Utf16ToUtf8Size(utf16ValuePtr18, 6, true); 397 EXPECT_EQ(length - 1, 12U); 398} 399 400/* 401* @tc.name: ConvertUtf8ToUtf16Pair 402* @tc.desc: Converts a UTF8 encoding sequence encoding a character into a UTF16 encoding sequence, and returns the 403* sequence and the byte length of the UTF16 encoding sequence. The parameter "combine" identifies whether 404* to return a pr0xy pair of Unicode values in the secondary plane, or the Unicode value itself. 
405* @tc.type: FUNC 406*/ 407HWTEST_F_L0(UtfHelperTest, ConvertUtf8ToUtf16Pair) 408{ 409 // code point lie in [0x00, 0x7F], the length of utf8 code element byte is 1 410 uint8_t utf8Value1[1] = {0x00}; 411 uint8_t utf8Value2[1] = {UTF8_1B_MAX}; 412 const uint8_t *utf8ValuePtr1 = utf8Value1; 413 const uint8_t *utf8ValuePtr2 = utf8Value2; 414 std::pair<uint32_t, size_t> utf16Res = ConvertUtf8ToUtf16Pair(utf8ValuePtr1); 415 std::pair<uint32_t, size_t> utf16Value = {utf8Value1[0], 1}; 416 EXPECT_EQ(utf16Res, utf16Value); 417 utf16Res = ConvertUtf8ToUtf16Pair(utf8ValuePtr2); 418 utf16Value = {utf8Value2[0], 1}; 419 EXPECT_EQ(utf16Res, utf16Value); 420 // code point lie in [0x80, 0x7FF], the length of utf8 code element byte is 2 421 uint8_t utf8Value3[2] = {0xc2, 0x80}; // 0x80 422 uint8_t utf8Value4[2] = {0xDF, 0xBF}; // 0x7FF 423 const uint8_t *utf8ValuePtr3 = utf8Value3; 424 const uint8_t *utf8ValuePtr4 = utf8Value4; 425 utf16Res = ConvertUtf8ToUtf16Pair(utf8ValuePtr3); 426 utf16Value = {0x80, 2}; 427 EXPECT_EQ(utf16Res, utf16Value); 428 utf16Res = ConvertUtf8ToUtf16Pair(utf8ValuePtr4); 429 utf16Value = {0x7FF, 2}; 430 EXPECT_EQ(utf16Res, utf16Value); 431 432 // code point lie in [0x800, 0xD7FF] or [0xDC00,0xFFFF], the length of utf8 code element byte is 3. 433 // when code point lie in [0xD800, 0xDBFF], due to the use of UCS-2, it corresponds to 3 utf8 symbols. 434 uint8_t utf8Value5[3] = {0xE0, 0xA0, 0x80}; // 0x800 435 uint8_t utf8Value6[3] = {0xEF, 0xBF, 0xBF}; // 0xFFFF 436 const uint8_t *utf8ValuePtr5 = utf8Value5; 437 const uint8_t *utf8ValuePtr6 = utf8Value6; 438 utf16Res = ConvertUtf8ToUtf16Pair(utf8ValuePtr5); 439 utf16Value = {0x800, 3}; 440 EXPECT_EQ(utf16Res, utf16Value); 441 utf16Res = ConvertUtf8ToUtf16Pair(utf8ValuePtr6); 442 utf16Value = {0xFFFF, 3}; 443 EXPECT_EQ(utf16Res, utf16Value); 444 // code point lie in [0x10000, 0x10FFFF], the length of utf8 code element byte is 4. 
445 uint8_t utf8Value9[4] = {0xF0, 0x90, 0x80, 0x80}; // 0x10000 446 uint8_t utf8Value10[4] = {0xF4, 0x8F, 0xBF, 0xBF}; // 0x10FFFF 447 const uint8_t *utf8ValuePtr9 = utf8Value9; 448 const uint8_t *utf8ValuePtr10 = utf8Value10; 449 utf16Res = ConvertUtf8ToUtf16Pair(utf8ValuePtr9); 450 utf16Value = {0xD800 << 16 | 0xDC00U, 4}; 451 EXPECT_EQ(utf16Res, utf16Value); 452 utf16Res = ConvertUtf8ToUtf16Pair(utf8ValuePtr10); 453 utf16Value = {0xDBFF << 16 | 0xDFFF, 4}; 454 EXPECT_EQ(utf16Res, utf16Value); 455 utf16Res = ConvertUtf8ToUtf16Pair(utf8ValuePtr9, true); 456 utf16Value = {0x10000, 4}; 457 EXPECT_EQ(utf16Res, utf16Value); 458 utf16Res = ConvertUtf8ToUtf16Pair(utf8ValuePtr10, true); 459 utf16Value = {0x10FFFF, 4}; 460 EXPECT_EQ(utf16Res, utf16Value); 461} 462 463/* 464* @tc.name: Utf8ToUtf16Size 465* @tc.desc: Enter a string of UTF8 coded sequences and return the length of the sequence converted into UTF16 coded 466* sequences. 467* @tc.type: FUNC 468*/ 469HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size) 470{ 471 // when code point lie in (0x00, 0xFFFF], the required utf16 code element length is 1. 472 uint8_t utf8ValueOneByteMin[1] = {0x00}; 473 uint8_t utf8ValueOneByteMax[4] = {0xEF, 0xBF, 0xBF, 0x00}; // 0xFFFF 474 const uint8_t *utf8ValueOneByteMinPtr = utf8ValueOneByteMin; 475 const uint8_t *utf8ValueOneByteMaxPtr = utf8ValueOneByteMax; 476 size_t length = Utf8ToUtf16Size(utf8ValueOneByteMinPtr, sizeof(utf8ValueOneByteMin)); 477 EXPECT_EQ(length, 1U); 478 length = Utf8ToUtf16Size(utf8ValueOneByteMaxPtr, sizeof(utf8ValueOneByteMax)); 479 EXPECT_EQ(length, 2U); 480 // when code point lie in [0x10000, 0x10FFFF], the required utf16 code element length is 2. 
481 const uint8_t utf8ValueTwoBytesMin[5] = {0xF0, 0x90, 0x80, 0x80, 0x00}; // 0x10000 482 const uint8_t utf8ValueTwoBytesMax[5] = {0xF4, 0x8F, 0xBF, 0xBF, 0x00}; // 0x10FFFF 483 const uint8_t *utf8ValueTwoBytesMinPtr = utf8ValueTwoBytesMin; 484 const uint8_t *utf8ValueTwoBytesMaxPtr = utf8ValueTwoBytesMax; 485 length = Utf8ToUtf16Size(utf8ValueTwoBytesMinPtr, sizeof(utf8ValueTwoBytesMin)); 486 EXPECT_EQ(length, 3U); 487 length = Utf8ToUtf16Size(utf8ValueTwoBytesMaxPtr, sizeof(utf8ValueTwoBytesMax)); 488 EXPECT_EQ(length, 3U); 489 uint8_t utf8Value[12] = { 490 0xEF, 0xBF, 0xBF, 0xF0, 491 0x90, 0x80, 0x80, 0xF4, 492 0x8F, 0xBF, 0xBF, 0x00}; 493 const uint8_t *utf8ValuePtr = utf8Value; 494 length = Utf8ToUtf16Size(utf8ValuePtr, sizeof(utf8Value)); 495 EXPECT_EQ(length, 6U); 496} 497 498static void ConvertRegionUtf16ToUtf8Test(bool isDebugger = false) 499{ 500 size_t utf16Len = 8; 501 size_t utf8Len = 100; 502 size_t start = 0; 503 size_t utf8Pos = 0; 504 bool modify = false; 505 uint16_t utf16Value[8] = { 506 0x00, // 0 or 2 (special case for \u0000 ==> C080 - 1100'0000 1000'0000) 507 0x7F, // 1(0x00, 0x7F] 508 0x7FF, // 2 [0x80, 0x7FF] 509 0x800, // 3 [0x800, 0xD7FF] 510 0xD800, // 3 [0xD800, 0xDFFF] 511 0xFFFF, // 3 [0xE000, 0xFFFF] 512 0xD800, 0xDFFF}; // 4 [0x10000, 0x10FFFF] 513 const uint16_t *utf16ValuePtr = utf16Value; 514 uint8_t *utf8Out = (uint8_t*)malloc(utf8Len); 515 if (isDebugger) { 516 utf8Pos = DebuggerConvertRegionUtf16ToUtf8(utf16ValuePtr, utf8Out, utf16Len, utf8Len, start, modify); 517 } else { 518 utf8Pos = ConvertRegionUtf16ToUtf8(utf16ValuePtr, utf8Out, utf16Len, utf8Len, start, modify); 519 } 520 // 0 + 1 + 2 +(3 *3)+ 4= 16 521 EXPECT_EQ(utf8Pos, 16U); 522 // 2 + 1 + 2 +(3 * 3)+ 4 = 18 523 modify = true; 524 if (isDebugger) { 525 utf8Pos = DebuggerConvertRegionUtf16ToUtf8(utf16ValuePtr, utf8Out, utf16Len, utf8Len, start, modify); 526 } else { 527 utf8Pos = ConvertRegionUtf16ToUtf8(utf16ValuePtr, utf8Out, utf16Len, utf8Len, start, modify); 528 
} 529 EXPECT_EQ(utf8Pos, 18U); 530 free(utf8Out); 531} 532 533/* 534* @tc.name: ConvertRegionUtf16ToUtf8 535* @tc.desc: Input aUTF16-encoded sequence (thelength is "utf16Len"), convert part of the sequence into a UTF8-encoded 536* sequence, and save it to "utf8Out"(the maximum length is "utf8Len"). The start parameter indicates the 537* start position of the conversion. Whether to perform special processing for O in the "modify" parameter. 538* @tc.type: FUNC 539*/ 540HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf16ToUtf8) 541{ 542 ConvertRegionUtf16ToUtf8Test(); 543} 544 545HWTEST_F_L0(UtfHelperTest, DebuggerConvertRegionUtf16ToUtf8) 546{ 547 ConvertRegionUtf16ToUtf8Test(true); 548} 549 550/* 551* @tc.name: ConvertRegionUtf8ToUtf16 552* @tc.desc: Input a UTF8-encoded sequence, convert part of the sequence into a UTF8-encoded sequence, and save it to 553* "utf16Out"(the maximum length is "utf16Len"), The start parameter indicates the start position of the 554* conversion. 555* @tc.type: FUNC 556*/ 557HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16) 558{ 559 size_t utf16Len = 100; 560 uint8_t utf8Value[10] = { 561 0x7F, // 1-length UTF16 encoding 562 0xDF, 0xBF, // 1-length UTF16 encoding 563 0xEF, 0xBF, 0xBF, // 1-length UTF16 encoding 564 0xF4, 0x8F, 0xBF, 0xBF}; // 2-length UTF16 encoding 565 const uint8_t *utf8ValuePtr = utf8Value; 566 uint16_t *utf16Out = (uint16_t*)malloc(utf16Len); 567 size_t outPos = ConvertRegionUtf8ToUtf16(utf8ValuePtr, utf16Out, sizeof(utf8Value), utf16Len); 568 // 1 + 1 + 1 + 2 = 5s 569 EXPECT_EQ(outPos, 5U); 570 // 1 + 2 = 3 571 utf8ValuePtr = utf8Value + 3; 572 outPos = ConvertRegionUtf8ToUtf16(utf8ValuePtr, utf16Out, sizeof(utf8Value) - 3, utf16Len); 573 EXPECT_EQ(outPos, 3U); 574} 575 576/* 577* @tc.name: ConvertUtf8ToUnicodeChar 578* @tc.desc: Converts a UTF8 encoding sequence encoding a character into a unicode point, and returns the 579* unicode point and the byte length of the utf8 encoding sequence. 
580* @tc.type: FUNC 581*/ 582HWTEST_F_L0(UtfHelperTest, ConvertUtf8ToUnicodeChar) 583{ 584 std::pair<int32_t, size_t> invalidValue = {INVALID_UTF8, 0}; 585 // utf-8 is one byte, code point lie in [0x00, 0x7F] 586 uint8_t utf8Value1[1] = {0x00}; // 0x00 587 uint8_t utf8Value2[1] = {0x7F}; // 0x7F 588 const uint8_t *utf8ValuePtr1 = utf8Value1; 589 const uint8_t *utf8ValuePtr2 = utf8Value2; 590 std::pair<int32_t, size_t> unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr1, UtfLength::ONE); 591 std::pair<int32_t, size_t> unicodeValue = {0x00, 1}; 592 EXPECT_EQ(unicodeRes, unicodeValue); 593 unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr2, UtfLength::ONE); 594 unicodeValue = {0x7F, 1}; 595 EXPECT_EQ(unicodeRes, unicodeValue); 596 unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr2, 0); 597 EXPECT_EQ(unicodeRes, invalidValue); 598 599 // utf-8 is two bytes, code point lie in [0x80, 0x7FF] 600 uint8_t utf8Value3[2] = {0xC2, 0x80}; // 0x80 601 uint8_t utf8Value4[2] = {0xDF, 0xBF}; // 0x7FF 602 const uint8_t *utf8ValuePtr3 = utf8Value3; 603 const uint8_t *utf8ValuePtr4 = utf8Value4; 604 unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr3, UtfLength::TWO); 605 unicodeValue = {0x80, 2}; 606 EXPECT_EQ(unicodeRes, unicodeValue); 607 unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr4, UtfLength::TWO); 608 unicodeValue = {0x7FF, 2}; 609 EXPECT_EQ(unicodeRes, unicodeValue); 610 uint8_t utf8Value5[2] = {0xD0, 0x00}; // invalid 611 const uint8_t *utf8ValuePtr5 = utf8Value5; 612 unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr5, UtfLength::TWO); 613 EXPECT_EQ(unicodeRes, invalidValue); 614 unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr4, UtfLength::ONE); 615 EXPECT_EQ(unicodeRes, invalidValue); 616 617 // utf-8 is three bytes, code point lie in [0x800, 0xFFFF] 618 uint8_t utf8Value6[3] = {0xE0, 0xA0, 0x80}; // 0x800 619 uint8_t utf8Value7[3] = {0xED, 0x9F, 0xBF}; // 0xD7FF 620 const uint8_t *utf8ValuePtr6 = utf8Value6; 621 const uint8_t *utf8ValuePtr7 = utf8Value7; 622 
unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr6, UtfLength::THREE); 623 unicodeValue = {0x800, 3}; 624 EXPECT_EQ(unicodeRes, unicodeValue); 625 unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr7, UtfLength::THREE); 626 unicodeValue = {0xD7FF, 3}; 627 EXPECT_EQ(unicodeRes, unicodeValue); 628 uint8_t utf8Value8[3] = {0xEB, 0x80, 0x40}; // invalid 629 const uint8_t *utf8ValuePtr8 = utf8Value8; 630 unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr8, UtfLength::THREE); 631 EXPECT_EQ(unicodeRes, invalidValue); 632 unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr7, UtfLength::TWO); 633 EXPECT_EQ(unicodeRes, invalidValue); 634 635 // utf-8 is four bytes, code point lie in [0x10000, 0x10FFFF]. 636 uint8_t utf8Value9[4] = {0xF0, 0x90, 0x80, 0x80}; // 0x10000 637 uint8_t utf8Value10[4] = {0xF4, 0x8F, 0xBF, 0xBF}; // 0x10FFFF 638 const uint8_t *utf8ValuePtr9 = utf8Value9; 639 const uint8_t *utf8ValuePtr10 = utf8Value10; 640 unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr9, UtfLength::FOUR); 641 unicodeValue = {0x10000, 4}; 642 EXPECT_EQ(unicodeRes, unicodeValue); 643 unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr10, UtfLength::FOUR); 644 unicodeValue = {0x10FFFF, 4}; 645 EXPECT_EQ(unicodeRes, unicodeValue); 646 uint8_t utf8Value11[4] = {0xF4, 0x80, 0x80, 0x40}; // invalid 647 const uint8_t *utf8ValuePtr11 = utf8Value11; 648 unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr11, UtfLength::FOUR); 649 EXPECT_EQ(unicodeRes, invalidValue); 650 unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr10, UtfLength::THREE); 651 EXPECT_EQ(unicodeRes, invalidValue); 652 653 // other exception 654 uint8_t utf8Value12[2] = {0x90, 0x00}; // invalid 655 const uint8_t *utf8ValuePtr12 = utf8Value12; 656 unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr12, UtfLength::FOUR); 657 EXPECT_EQ(unicodeRes, invalidValue); 658 uint8_t utf8Value13[2] = {0xF8, 0x00}; // invalid 659 const uint8_t *utf8ValuePtr13 = utf8Value13; 660 unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr13, 
UtfLength::FOUR); 661 EXPECT_EQ(unicodeRes, invalidValue); 662} 663 664/* 665* @tc.name: Utf8ToUtf16Size 666* @tc.desc: Test single byte characters 667* @tc.type: FUNC 668*/ 669HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_001) { 670 std::string utf8 = "Hello"; 671 std::vector<uint16_t> expected_utf16 = {0x0, 0x0, 0x0, 0x0, 0x0}; // "Hello" 672 std::vector<uint16_t> utf16(10); 673 size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size()); 674 utf16.resize(converted); 675 EXPECT_EQ(utf16, expected_utf16); 676} 677 678/* 679* @tc.name: Utf8ToUtf16Size 680* @tc.desc: Test includes Chinese characters 681* @tc.type: FUNC 682*/ 683HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_002) { 684 std::string utf8 = "你好,世界!"; 685 std::vector<uint16_t> expected_utf16 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; // "你好,世界!" 686 std::vector<uint16_t> utf16(10); 687 size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size()); 688 utf16.resize(converted); 689 EXPECT_EQ(utf16, expected_utf16); 690} 691 692/* 693* @tc.name: Utf8ToUtf16Size 694* @tc.desc: Test empty string 695* @tc.type: FUNC 696*/ 697HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_003) { 698 std::string utf8 = ""; 699 std::vector<uint16_t> expected_utf16 = {}; // empty 700 std::vector<uint16_t> utf16(10); 701 size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size()); 702 utf16.resize(converted); 703 EXPECT_EQ(utf16, expected_utf16); 704} 705 706/* 707* @tc.name: Utf8ToUtf16Size 708* @tc.desc: Test section conversion 709* @tc.type: FUNC 710*/ 711HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_004) { 712 std::string utf8 = "Hello, 你好"; 713 std::vector<uint16_t> expected_utf16 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; // "Hello, 你" 714 std::vector<uint16_t> utf16(10); 715 size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size()); 716 utf16.resize(converted); 717 EXPECT_EQ(utf16, expected_utf16); 718} 
719 720/* 721* @tc.name: Utf8ToUtf16Size 722* @tc.desc: Test buffer length limit 723* @tc.type: FUNC 724*/ 725HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_005) { 726 std::string utf8 = "你好,世界!"; 727 std::vector<uint16_t> expected_utf16 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; // "你好" 728 std::vector<uint16_t> utf16(2); // Limit buffer length 729 size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size()); 730 utf16.resize(converted); 731 EXPECT_EQ(utf16, expected_utf16); 732} 733 734/* 735* @tc.name: Utf8ToUtf16Size 736* @tc.desc: Test for incorrect UTF-8 data 737* @tc.type: FUNC 738*/ 739HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_006) { 740 std::string utf8 = "\xF0\x28\x8C\x28"; 741 std::vector<uint16_t> expected_utf16 = {0x0, 0x0}; 742 std::vector<uint16_t> utf16(10); 743 size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size()); 744 utf16.resize(converted); 745 EXPECT_EQ(utf16, expected_utf16); 746} 747 748/* 749* @tc.name: Utf8ToUtf16Size 750* @tc.desc: Test single byte UTF-8 characters 751* @tc.type: FUNC 752*/ 753HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_007) { 754 std::string utf8 = "ABC"; // All are single byte characters 755 std::vector<uint16_t> expected_utf16 = {0x0, 0x0, 0x0}; // ASCII characters: A, B, C 756 std::vector<uint16_t> utf16(10); 757 size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size()); 758 utf16.resize(converted); 759 EXPECT_EQ(utf16, expected_utf16); 760} 761 762/* 763* @tc.name: Utf8ToUtf16Size 764* @tc.desc: Testing Double Byte UTF-8 Characters 765* @tc.type: FUNC 766*/ 767HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_008) { 768 std::string utf8 = "\xC2\xA2\xC3\xBC"; // They are ¢ and ü, respectively 769 std::vector<uint16_t> expected_utf16 = {0x0, 0x0}; // Unicode . 
770 std::vector<uint16_t> utf16(10); 771 size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size()); 772 utf16.resize(converted); 773 EXPECT_EQ(utf16, expected_utf16); 774} 775 776/* 777* @tc.name: Utf8ToUtf16Size 778* @tc.desc: Test three byte UTF-8 characters 779* @tc.type: FUNC 780*/ 781HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_009) { 782 std::string utf8 = "\xE2\x82\xAC"; // euro: € 783 std::vector<uint16_t> expected_utf16 = {0x0}; // Unicode . 784 std::vector<uint16_t> utf16(10); 785 size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size()); 786 utf16.resize(converted); 787 EXPECT_EQ(utf16, expected_utf16); 788} 789 790/* 791* @tc.name: Utf8ToUtf16Size 792* @tc.desc: Test four byte UTF-8 characters and proxy pairs 793* @tc.type: FUNC 794*/ 795HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_010) { 796 std::string utf8 = "\xF0\x9F\x98\x8E"; // Emoji 797 std::vector<uint16_t> expected_utf16 = {0x0, 0x0}; // surrogates 798 std::vector<uint16_t> utf16(10); 799 size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size()); 800 utf16.resize(converted); 801 EXPECT_EQ(utf16, expected_utf16); 802} 803 804/* 805* @tc.name: Utf8ToUtf16Size 806* @tc.desc: Test UTF-8 data containing zero bytes 807* @tc.type: FUNC 808*/ 809HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_011) { 810 std::string utf8 = "Hello\0World", utf8Nul = utf8 + '\0' + "World"; // Clearly including zero bytes 811 std::vector<uint16_t> expected_utf16 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; 812 std::vector<uint16_t> utf16(15); 813 size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8Nul.data()), utf8Nul.size()); 814 utf16.resize(converted); 815 EXPECT_EQ(utf16, expected_utf16); 816} 817 818/* 819* @tc.name: Utf8ToUtf16Size 820* @tc.desc: Test continuous illegal sequences 821* @tc.type: FUNC 822*/ 823HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_012) { 824 std::string utf8 = 
"\xC0\x80\xC0\x80"; // Continuous illegal sequence 825 std::vector<uint16_t> expected_utf16 = {0x0, 0x0}; 826 std::vector<uint16_t> utf16(10); 827 size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size()); 828 utf16.resize(converted); 829 EXPECT_EQ(utf16, expected_utf16); 830} 831 832/* 833* @tc.name: ConvertRegionUtf8ToUtf16 834* @tc.desc: Test single byte characters 835* @tc.type: FUNC 836*/ 837HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_001) { 838 std::string utf8 = "Hello"; 839 std::vector<uint16_t> expected_utf16 = {0x0048, 0x0065, 0x006C, 0x006C, 0x006F}; // "Hello" 840 std::vector<uint16_t> utf16(10); 841 size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()), 842 utf16.data(), utf8.size(), utf16.size()); 843 utf16.resize(converted); 844 EXPECT_EQ(utf16, expected_utf16); 845} 846 847/* 848* @tc.name: ConvertRegionUtf8ToUtf16 849* @tc.desc: Test includes Chinese characters 850* @tc.type: FUNC 851*/ 852HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_002) { 853 std::string utf8 = "你好,世界!"; 854 std::vector<uint16_t> expected_utf16 = {0x4F60, 0x597D, 0xFF0C, 0x4E16, 0x754C, 0xFF01}; // "你好,世界!" 
855 std::vector<uint16_t> utf16(10); 856 size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()), 857 utf16.data(), utf8.size(), utf16.size()); 858 utf16.resize(converted); 859 EXPECT_EQ(utf16, expected_utf16); 860} 861 862/* 863* @tc.name: ConvertRegionUtf8ToUtf16 864* @tc.desc: Test empty string 865* @tc.type: FUNC 866*/ 867HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_003) { 868 std::string utf8 = ""; 869 std::vector<uint16_t> expected_utf16 = {}; // Empty 870 std::vector<uint16_t> utf16(10); 871 size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()), 872 utf16.data(), utf8.size(), utf16.size()); 873 utf16.resize(converted); 874 EXPECT_EQ(utf16, expected_utf16); 875} 876 877/* 878* @tc.name: ConvertRegionUtf8ToUtf16 879* @tc.desc: Test section conversion 880* @tc.type: FUNC 881*/ 882HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_004) { 883 std::string utf8 = "Hello, 你好"; 884 std::vector<uint16_t> expected_utf16 = {0x0048, 0x0065, 0x006C, 0x006C, 0x006F, 0x002C, 0x20, 0x4F60}; 885 std::vector<uint16_t> utf16(10); 886 size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()), 887 utf16.data(), 10, utf16.size()); // Only process the first 9 bytes 888 utf16.resize(converted); 889 EXPECT_EQ(utf16, expected_utf16); 890} 891 892/* 893* @tc.name: ConvertRegionUtf8ToUtf16 894* @tc.desc: Test buffer length limit 895* @tc.type: FUNC 896*/ 897HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_005) { 898 std::string utf8 = "你好,世界!"; 899 std::vector<uint16_t> expected_utf16 = {0x4F60, 0x597D}; // "你好" 900 std::vector<uint16_t> utf16(2); // Limit buffer length 901 size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()), 902 utf16.data(), utf8.size(), utf16.size()); 903 utf16.resize(converted); 904 EXPECT_EQ(utf16, expected_utf16); 905} 906 907/* 908* @tc.name: ConvertRegionUtf8ToUtf16 909* @tc.desc: Test for incorrect UTF-8 data 
910* @tc.type: FUNC 911*/ 912HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_006) { 913 std::string utf8 = "\xF0\x28\x8C\x28"; 914 std::vector<uint16_t> expected_utf16 = {}; // Expected empty output, handling erroneous data 915 std::vector<uint16_t> utf16(10); 916 size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()), 917 utf16.data(), utf8.size(), utf16.size()); 918 utf16.resize(converted); 919 EXPECT_NE(utf16, expected_utf16); 920} 921 922/* 923* @tc.name: ConvertRegionUtf8ToUtf16 924* @tc.desc: Test single byte UTF-8 characters 925* @tc.type: FUNC 926*/ 927HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_007) { 928 std::string utf8 = "ABC"; // All are single byte characters 929 std::vector<uint16_t> expected_utf16 = {0x0041, 0x0042, 0x0043}; // ASCII characters: A, B, C 930 std::vector<uint16_t> utf16(10); 931 size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()), 932 utf16.data(), utf8.size(), utf16.size()); 933 utf16.resize(converted); 934 EXPECT_EQ(utf16, expected_utf16); 935} 936 937/* 938* @tc.name: ConvertRegionUtf8ToUtf16 939* @tc.desc: Testing Double Byte UTF-8 Characters 940* @tc.type: FUNC 941*/ 942HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_008) { 943 std::string utf8 = "\xC2\xA2\xC3\xBC"; // They are ¢ and ü, respectively 944 std::vector<uint16_t> expected_utf16 = {0x00A2, 0x00FC}; // Unicode . 945 std::vector<uint16_t> utf16(10); 946 size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()), 947 utf16.data(), utf8.size(), utf16.size()); 948 utf16.resize(converted); 949 EXPECT_EQ(utf16, expected_utf16); 950} 951 952/* 953* @tc.name: ConvertRegionUtf8ToUtf16 954* @tc.desc: Test three byte UTF-8 characters 955* @tc.type: FUNC 956*/ 957HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_009) { 958 std::string utf8 = "\xE2\x82\xAC"; // euro € 959 std::vector<uint16_t> expected_utf16 = {0x20AC}; // Unicode . 
960 std::vector<uint16_t> utf16(10); 961 size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()), 962 utf16.data(), utf8.size(), utf16.size()); 963 utf16.resize(converted); 964 EXPECT_EQ(utf16, expected_utf16); 965} 966 967/* 968* @tc.name: ConvertRegionUtf8ToUtf16 969* @tc.desc: Test four byte UTF-8 characters and proxy pairs 970* @tc.type: FUNC 971*/ 972HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_010) { 973 std::string utf8 = "\xF0\x9F\x98\x8E"; // Emoji 974 std::vector<uint16_t> expected_utf16 = {0xD83D, 0xDE0E}; // surrogates 975 std::vector<uint16_t> utf16(10); 976 size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()), 977 utf16.data(), utf8.size(), utf16.size()); 978 utf16.resize(converted); 979 EXPECT_EQ(utf16, expected_utf16); 980} 981 982/* 983* @tc.name: ConvertRegionUtf8ToUtf16 984* @tc.desc: Test UTF-8 data containing zero bytes 985* @tc.type: FUNC 986*/ 987HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_011) { 988 std::string utf8 = "Hello\0World", utf8Nul = utf8 + '\0' + "World"; // Clearly including zero bytes 989 std::vector<uint16_t> expected_utf16 = {0x0048, 0x0065, 0x006C, 0x006C, 0x006F, 990 0x0000, 0x0057, 0x006F, 0x0072, 0x006C, 0x0064}; // Including NULL characters 991 std::vector<uint16_t> utf16(15); 992 size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8Nul.data()), 993 utf16.data(), utf8Nul.size(), utf16.size()); 994 utf16.resize(converted); 995 EXPECT_EQ(utf16, expected_utf16); 996} 997 998/* 999* @tc.name: ConvertRegionUtf8ToUtf16 1000* @tc.desc: Test continuous illegal sequences 1001* @tc.type: FUNC 1002*/ 1003HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_012) { 1004 std::string utf8 = "\xC0\x80\xC0\x80"; // Continuous illegal sequence 1005 std::vector<uint16_t> expected_utf16 = {}; 1006 std::vector<uint16_t> utf16(10); 1007 size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()), 1008 
utf16.data(), utf8.size(), utf16.size()); 1009 utf16.resize(converted); 1010 EXPECT_NE(utf16, expected_utf16); 1011} 1012 1013/* 1014* @tc.name: ConvertRegionUtf8ToUtf16 1015* @tc.desc: Test four byte UTF-8 characters and proxy pairs 1016* @tc.type: FUNC 1017*/ 1018HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_013) { 1019 std::string utf8 = "\xF0\x9F\x98\x8E"; // Emoji 1020 std::vector<uint16_t> expected_utf16 = {0xD83D, 0xDE0E}; // surrogates 1021 std::vector<uint16_t> utf16(0); 1022 size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()), 1023 utf16.data(), utf8.size(), utf16.size()); 1024 utf16.resize(converted); 1025 EXPECT_EQ(converted, 0); 1026} 1027/* 1028* @tc.name: ConvertRegionUtf8ToUtf16 1029* @tc.desc: Test four byte UTF-8 characters and proxy pairs 1030* @tc.type: FUNC 1031*/ 1032HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_014) { 1033 std::string utf8 = "\xF0\x9F\x98\x8E"; // Emoji 1034 std::vector<uint16_t> expected_utf16 = {0xD83D, 0xDE0E}; // surrogates 1035 std::vector<uint16_t> utf16(1); 1036 size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()), 1037 utf16.data(), utf8.size(), utf16.size()); 1038 utf16.resize(converted); 1039 EXPECT_EQ(converted, 0); 1040} 1041} // namespace panda:test 1042