1/* 2 * Copyright (c) 2022 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16#include "converter.h" 17 18#include <climits> 19#include <codecvt> 20#include <locale> 21 22using namespace std; 23 24namespace OHOS::buffer { 25 26bool IsOneByte(uint8_t u8Char) 27{ 28 return (u8Char & 0x80) == 0; 29} 30 31void Utf8ToUtf16BEToData(const unsigned char *data, u16string &u16Str, string::size_type &index, uint8_t &c1) 32{ 33 uint8_t c2 = data[++index]; // The second byte 34 uint8_t c3 = data[++index]; // The third byte 35 uint8_t c4 = data[++index]; // The forth byte 36 // Calculate the UNICODE code point value (3 bits lower for the first byte, 6 bits for the other) 37 // 3 : shift left 3 times of UTF8_VALID_BITS 38 uint32_t codePoint = ((c1 & LOWER_3_BITS_MASK) << (3 * UTF8_VALID_BITS)) | 39 // 2 : shift left 2 times of UTF8_VALID_BITS 40 ((c2 & LOWER_6_BITS_MASK) << (2 * UTF8_VALID_BITS)) | 41 ((c3 & LOWER_6_BITS_MASK) << UTF8_VALID_BITS) | 42 (c4 & LOWER_6_BITS_MASK); 43 // In UTF-16, U+10000 to U+10FFFF represent surrogate pairs with two 16-bit units 44 if (codePoint >= UTF16_SPECIAL_VALUE) { 45 codePoint -= UTF16_SPECIAL_VALUE; 46 // 10 : a half of 20 , shift right 10 bits 47 u16Str.push_back(static_cast<char16_t>((codePoint >> 10) | HIGH_AGENT_MASK)); 48 u16Str.push_back(static_cast<char16_t>((codePoint & LOWER_10_BITS_MASK) | LOW_AGENT_MASK)); 49 } else { // In UTF-16, U+0000 to U+D7FF and U+E000 to U+FFFF are Unicode code point values 50 // U+D800 to U+DFFF are invalid characters, for simplicity, 51 // assume it does not exist (if any, not encoded) 52 u16Str.push_back(static_cast<char16_t>(codePoint)); 53 } 54} 55 56u16string Utf8ToUtf16BE(const string &u8Str, bool *ok) 57{ 58 u16string u16Str = u""; 59 u16Str.reserve(u8Str.size()); 60 string::size_type len = u8Str.length(); 61 const unsigned char *data = reinterpret_cast<const unsigned char *>(u8Str.data()); 62 bool isOk = true; 63 for (string::size_type i = 0; i < len; ++i) { 64 uint8_t c1 = data[i]; // The first byte 65 if (IsOneByte(c1)) { // only 1 byte represents the UNICODE code point 66 u16Str.push_back(static_cast<char16_t>(c1)); 67 continue; 68 } 69 switch (c1 & HIGER_4_BITS_MASK) { 70 case FOUR_BYTES_STYLE: { // 4 byte characters, from 0x10000 to 0x10FFFF 71 Utf8ToUtf16BEToData(data, u16Str, i, c1); 72 break; 73 } 74 case THREE_BYTES_STYLE: { // 3 byte characters, from 0x800 to 0xFFFF 75 uint8_t c2 = data[++i]; // The second byte 76 uint8_t c3 = data[++i]; // The third byte 77 // Calculates the UNICODE code point value 78 // (4 bits lower for the first byte, 6 bits lower for the other) 79 // 2 : shift left 2 times of UTF8_VALID_BITS 80 uint32_t codePoint = ((c1 & LOWER_4_BITS_MASK) << (2 * UTF8_VALID_BITS)) | 81 ((c2 & LOWER_6_BITS_MASK) << UTF8_VALID_BITS) | 82 (c3 & LOWER_6_BITS_MASK); 83 u16Str.push_back(static_cast<char16_t>(codePoint)); 84 break; 85 } 86 case TWO_BYTES_STYLE1: // 2 byte characters, from 0x80 to 0x7FF 87 case TWO_BYTES_STYLE2: { 88 uint8_t c2 = data[++i]; // The second byte 89 // Calculates the UNICODE code point value 90 // (5 bits lower for the first byte, 6 bits lower for the other) 91 uint32_t codePoint = ((c1 & LOWER_5_BITS_MASK) << UTF8_VALID_BITS) | 92 (c2 & LOWER_6_BITS_MASK); 93 u16Str.push_back(static_cast<char16_t>(codePoint)); 94 break; 95 } 96 default: { 97 isOk = false; 98 break; 99 } 100 } 101 } 102 if (ok != nullptr) { 103 *ok = isOk; 104 } 105 return u16Str; 106} 107 108u16string Utf16BEToLE(const u16string &wstr) 109{ 110 u16string str16 = u""; 111 const char16_t *data = wstr.data(); 112 for (unsigned int i = 0; i < wstr.length(); i++) { 113 char16_t wc = data[i]; 114 char16_t high = (wc >> 8) & 0x00FF; 115 char16_t low = wc & 0x00FF; 116 char16_t c16 = (low << 8) | high; 117 str16.push_back(c16); 118 } 119 return str16; 120} 121 122string Utf16BEToANSI(const u16string &wstr) 123{ 124 string ret = ""; 125 for (u16string::const_iterator it = wstr.begin(); it != wstr.end(); ++it) { 126 char16_t wc = (*it); 127 // get the lower bit from the UNICODE code point 128 char c = static_cast<char>(wc & LOWER_8_BITS_MASK); 129 ret.push_back(c); 130 } 131 return ret; 132} 133 134string Utf8ToUtf16BEToANSI(const string &str) 135{ 136 u16string u16Str = Utf8ToUtf16BE(str); 137 string ret = Utf16BEToANSI(u16Str); 138 return ret; 139} 140 141bool IsBase64Char(unsigned char c) 142{ 143 return (isalnum(c) || (c == '+') || (c == '/') || (c == '-') || (c == '_')); 144} 145 146/** 147* Base64Encode - Base64 encode 148* @src: Data to be encoded 149* @len: Length of the data to be encoded 150* Returns: Allocated buffer of outLen bytes of encoded data, 151* or empty string on failure 152*/ 153string Base64Encode(const unsigned char *src, size_t len, EncodingType type) 154{ 155 if (src == nullptr) { 156 return string(); 157 } 158 unsigned char *out = nullptr; 159 unsigned char *pos = nullptr; 160 const unsigned char *pEnd = nullptr; 161 const unsigned char *pStart = nullptr; 162 size_t outLen = 4 * ((len + 2) / 3); // 3-byte blocks to 4-byte 163 164 if (outLen < len) { 165 return string(); // integer overflow 166 } 167 168 string outStr = ""; 169 outStr.resize(outLen); 170 out = reinterpret_cast<unsigned char *>(&outStr[0]); 171 172 pEnd = src + len; 173 pStart = src; 174 pos = out; 175 176 string table = BASE64_TABLE; 177 if (type == BASE64URL) { 178 table = BASE64URL_TABLE; 179 } 180 // 3 : 3 bytes is just 24 bits which is 4 times of 6 bits 181 while (pEnd - pStart >= 3) { 182 // 2 : add two zeros in front of the first set of 6 bits to become a new 8 binary bits 183 *pos = table[pStart[0] >> 2]; 184 // 4 : add two zeros in front of the following second set of 6 bits to become the new 8 binary bits 185 *(pos + 1) = table[((pStart[0] & LOWER_2_BITS_MASK) << 4) | (pStart[1] >> 4)]; 186 // 2 : 4 : 6 : add two zeros in front of the following third set of 6 bits to become the new 8 binary bits 187 *(pos + 2) = table[((pStart[1] & LOWER_4_BITS_MASK) << 2) | (pStart[2] >> 6)]; 188 // 2 : 3 : add two zeros in front of the following forth set of 6 bits to become the new 8 binary bits 189 *(pos + 3) = table[pStart[2] & LOWER_6_BITS_MASK]; 190 // 4 : the pointer of pos scrolls off 4 bytes to point the next 4 bytes of encoded chars 191 pos += 4; 192 // 3 : the pointer of pStart scrolls off 3 bytes to point the next 3 bytes of which will be encoded chars 193 pStart += 3; 194 } 195 196 // process the last set of less than 3 bytes of data 197 if (pEnd - pStart > 0) { 198 // 2 : add two zeros in front of the first set of 6 bits to become a new 8 binary bits 199 *pos = table[pStart[0] >> 2]; 200 if (pEnd - pStart == 1) { // one byte remaining 201 // 4 : paddle the last two bits of the last byte with two zeros in front of it and four zeros after it 202 *(pos + 1) = table[(pStart[0] & LOWER_2_BITS_MASK) << 4]; 203 // 2 : fill in the missing bytes with '=' 204 *(pos + 2) = '='; 205 } else { // two bytes remaining 206 // 4 : add two zeros in front of the second set of 6 bits to become the new 8 binary bits 207 *(pos + 1) = table[((pStart[0] & LOWER_2_BITS_MASK) << 4) | (pStart[1] >> 4)]; 208 // 2 : paddle the last four bits of the last byte with two zeros in front of it and two zeros after it 209 *(pos + 2) = table[(pStart[1] & LOWER_4_BITS_MASK) << 2]; 210 } 211 // 3 : fill in the missing bytes with '=' 212 *(pos + 3) = '='; 213 } 214 215 if (type == BASE64URL) { 216 size_t poss = outStr.find_last_not_of('='); 217 if (poss != std::string::npos) { 218 outStr.erase(poss + 1); 219 } 220 } 221 return outStr; 222} 223 224string Base64Decode(string const& encodedStr, EncodingType type) 225{ 226 size_t len = encodedStr.size(); 227 unsigned int index = 0; 228 unsigned int cursor = 0; 229 unsigned char charArray4[4] = {0}; // an array to stage a group of indexes for encoded string 230 unsigned char charArray3[3] = {0}; // an array to stage a set of original string 231 string ret = ""; 232 string table = BASE64_TABLE; 233 234 if (type == BASE64URL) { 235 table = BASE64URL_TABLE; 236 } 237 while ((encodedStr[cursor] != '=') && IsBase64Char(encodedStr[cursor])) { 238 // stage a 4-byte string to charArray4 239 charArray4[index] = encodedStr[cursor]; 240 index++; 241 cursor++; 242 if (index == 4) { // 4 : after 4 chars is assigned to charArray4 243 // 4 : fill data into charArray4 244 for (index = 0; index < 4; index++) { 245 charArray4[index] = table.find(charArray4[index]) & LOWER_8_BITS_MASK; 246 } 247 // get the last six bits of the first byte of charArray4 and the first valid 248 // 2 : 4 : two bits(except two higer bits) of the second byte, combine them to a new byte 249 charArray3[0] = (charArray4[0] << 2) + ((charArray4[1] & 0x30) >> 4); 250 // get the last four bits of the second byte of charArray4 and the first valid 251 // 4 : 2 : four bits(except two higer bits) of the third byte, combine them to a new byte 252 charArray3[1] = ((charArray4[1] & LOWER_4_BITS_MASK) << 4) + ((charArray4[2] & MIDDLE_4_BITS_MASK) >> 2); 253 // get the last two bits of the third byte of charArray4 and the forth byte, 254 // 2 : 3 : 6 : combine them to a new byte 255 charArray3[2] = ((charArray4[2] & LOWER_2_BITS_MASK) << 6) + charArray4[3]; 256 // 3 : assigns the decoded string to the return value 257 for (index = 0; index < 3; index++) { 258 ret += charArray3[index]; 259 } 260 index = 0; 261 } 262 if (cursor > len - 1) { 263 break; 264 } 265 } 266 267 if (index != 0) { 268 // fill data into charArray4 269 for (unsigned int i = 0; i < index; i++) { 270 charArray4[i] = table.find(charArray4[i]) & LOWER_8_BITS_MASK; 271 } 272 // get the last six bits of the first byte of charArray4 and the first valid 273 // 2 : 4 : two bits(except two higer bits) of the second byte, combine them to a new byte 274 charArray3[0] = (charArray4[0] << 2) + ((charArray4[1] & 0x30) >> 4); 275 // get the last four bits of the second byte of charArray4 and the first valid 276 // 4 : 2 : four bits(except two higer bits) of the third byte, combine them to a new byte 277 charArray3[1] = ((charArray4[1] & LOWER_4_BITS_MASK) << 4) + ((charArray4[2] & LOWER_6_BITS_MASK) >> 2); 278 // assigns the decoded string to the return value 279 for (unsigned int i = 0; i < index - 1; i++) { 280 ret += charArray3[i]; 281 } 282 } 283 284 return ret; 285} 286 287bool IsValidHex(const string &hex) 288{ 289 bool isValid = false; 290 for (unsigned int i = 0; i < hex.size(); i++) { 291 char c = hex.at(i); 292 // 0 ~ 9, A ~ F, a ~ f 293 if ((c <= '9' && c >= '0') || (c <= 'F' && c >= 'A') || (c <= 'f' && c >= 'a')) { 294 isValid = true; 295 } else { 296 isValid = false; 297 break; 298 } 299 } 300 return isValid; 301} 302 303string HexDecode(const string &hexStr) 304{ 305 string nums = ""; 306 unsigned int arrSize = hexStr.size(); 307 308 // 2 : means a half length of hex str's size 309 for (unsigned int i = 0; i < arrSize / 2; i++) { 310 string hexStrTmp = ""; 311 int num = 0; 312 // 2 : offset is i * 2 313 hexStrTmp.push_back(hexStr[i * 2]); 314 // 2 : offset is i * 2 + 1 315 hexStrTmp.push_back(hexStr[i * 2 + 1]); 316 if (!IsValidHex(hexStrTmp)) { 317 break; 318 } 319 // 16 : the base is 16 320 num = stoi(hexStrTmp, nullptr, 16); 321 nums.push_back(static_cast<char>(num)); 322 } 323 324 return nums; 325} 326 327// Find the position of the last character in pat from patIndex 328int GetGoodSuffixLengthByLastChar(uint8_t *pat, int patIndex, int patLen) 329{ 330 int lastIndex = patLen - 1; 331 int index = -1; 332 while (patIndex >= 0) { 333 if (pat[patIndex] == pat[lastIndex]) { 334 index = patIndex; 335 break; 336 } else { 337 patIndex--; 338 } 339 } 340 return lastIndex - index; 341} 342// Find the position of the first character in pat from patIndex 343int GetGoodSuffixLengthByFirstChar(uint8_t *pat, int patIndex, int tarlen) 344{ 345 int indexOfNextFirstChar = tarlen; 346 for (int i = patIndex; i < tarlen; i++) { 347 if (pat[0] == pat[i]) { 348 indexOfNextFirstChar = i; 349 break; 350 } 351 } 352 return indexOfNextFirstChar; 353} 354 355// Match forward from patIndex to get the position of the singleChar in the pat 356// and the length of the bad character 357int GetBadCharLengthInReverseOrder(uint8_t *pat, char singleChar, int patIndex) 358{ 359 int index = -1; 360 for (int i = patIndex - 1; i >= 0; --i) { 361 if (pat[i] == singleChar) { 362 index = i; 363 break; 364 } 365 } 366 return patIndex - index; 367} 368 369// Get the position of character c in pat 370int GetBadCharLengthInSequence(uint8_t *pat, char singleChar, int patIndex, int tarlen) 371{ 372 int resIndex = tarlen; 373 for (int i = patIndex; i < tarlen; i++) { 374 if (singleChar == pat[i]) { 375 resIndex = i; 376 break; 377 } 378 } 379 return resIndex; 380} 381 382int FindLastIndex(uint8_t *source, uint8_t *target, int soulen, int tarlen) 383{ 384 if (source == nullptr || target == nullptr) { 385 return -1; 386 } 387 if (soulen < tarlen || tarlen == 0) { 388 return -1; 389 } 390 int i = soulen - tarlen; 391 int j = 0; 392 393 while (i >= 0) { 394 if (source[i] == target[j]) { 395 if (j == tarlen - 1) { 396 return i - (tarlen - 1); 397 } 398 i++; 399 j++; 400 } else { 401 if (j == 0) { 402 int badValue = GetBadCharLengthInSequence(target, source[i], j, tarlen); 403 i = i - badValue; 404 j = 0; 405 } else { 406 int badValue = GetBadCharLengthInSequence(target, source[i], j, tarlen); 407 int goodSuffix = GetGoodSuffixLengthByFirstChar(target, j, tarlen); 408 int distance = badValue > goodSuffix ? badValue : goodSuffix; 409 i = i - distance; 410 j = 0; 411 } 412 } 413 } 414 return -1; 415} 416 417bool FindIndexInner(uint8_t* target, uint8_t* source, int tarlen, int &indexI, int &indexJ) 418{ 419 if (indexJ == tarlen - 1) { 420 int badValue = GetBadCharLengthInReverseOrder(target, source[indexI], indexJ); 421 indexI = indexI + badValue; 422 } else { 423 int badValue = GetBadCharLengthInReverseOrder(target, source[indexI], indexJ); 424 int goodSuffix = GetGoodSuffixLengthByLastChar(target, indexJ, tarlen); 425 int distance = badValue > goodSuffix ? badValue : goodSuffix; 426 long addVal = static_cast<long>(indexI) + tarlen; 427 long addRst = addVal + distance; 428 if (abs(addVal) > INT_MAX || abs(addRst) > INT_MAX) { 429 return false; 430 } 431 indexI = indexI + tarlen - 1 - indexJ + distance; 432 indexJ = tarlen - 1; 433 } 434 return true; 435} 436 437int FindIndex(uint8_t* source, uint8_t* target, int soulen, int tarlen) 438{ 439 if (source == nullptr || target == nullptr) { 440 return -1; 441 } 442 if (soulen < tarlen || tarlen == 0) { 443 return -1; 444 } 445 int i = tarlen - 1; 446 int j = tarlen - 1; 447 while (i < soulen) { 448 if (source[i] == target[j]) { 449 if (j == 0) { 450 return i; 451 } 452 i--; 453 j--; 454 } else { 455 bool flag = FindIndexInner(target, source, tarlen, i, j); 456 if (!flag) { 457 return -1; 458 } 459 } 460 } 461 return -1; 462} 463} 464