1/*
2 * Copyright (c) 2022 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 *     http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16#include "converter.h"
17
18#include <climits>
19#include <codecvt>
20#include <locale>
21
22using namespace std;
23
24namespace OHOS::buffer {
25
// Tells whether a UTF-8 byte stands alone as a one-byte sequence,
// i.e. whether its most significant bit is clear.
bool IsOneByte(uint8_t u8Char)
{
    constexpr uint8_t highBit = 0x80;
    const bool highBitSet = (u8Char & highBit) != 0;
    return !highBitSet;
}
30
31void Utf8ToUtf16BEToData(const unsigned char *data, u16string &u16Str, string::size_type &index, uint8_t &c1)
32{
33    uint8_t c2 = data[++index]; // The second byte
34    uint8_t c3 = data[++index]; // The third byte
35    uint8_t c4 = data[++index]; // The forth byte
36    // Calculate the UNICODE code point value (3 bits lower for the first byte, 6 bits for the other)
37    // 3 : shift left 3 times of UTF8_VALID_BITS
38    uint32_t codePoint = ((c1 & LOWER_3_BITS_MASK) << (3 * UTF8_VALID_BITS)) |
39        // 2 : shift left 2 times of UTF8_VALID_BITS
40        ((c2 & LOWER_6_BITS_MASK) << (2 * UTF8_VALID_BITS)) |
41        ((c3 & LOWER_6_BITS_MASK) << UTF8_VALID_BITS) |
42        (c4 & LOWER_6_BITS_MASK);
43    // In UTF-16, U+10000 to U+10FFFF represent surrogate pairs with two 16-bit units
44    if (codePoint >= UTF16_SPECIAL_VALUE) {
45        codePoint -= UTF16_SPECIAL_VALUE;
46        // 10 : a half of 20 , shift right 10 bits
47        u16Str.push_back(static_cast<char16_t>((codePoint >> 10) | HIGH_AGENT_MASK));
48        u16Str.push_back(static_cast<char16_t>((codePoint & LOWER_10_BITS_MASK) | LOW_AGENT_MASK));
49    } else { // In UTF-16, U+0000 to U+D7FF and U+E000 to U+FFFF are Unicode code point values
50        // U+D800 to U+DFFF are invalid characters, for simplicity,
51        // assume it does not exist (if any, not encoded)
52        u16Str.push_back(static_cast<char16_t>(codePoint));
53    }
54}
55
56u16string Utf8ToUtf16BE(const string &u8Str, bool *ok)
57{
58    u16string u16Str = u"";
59    u16Str.reserve(u8Str.size());
60    string::size_type len = u8Str.length();
61    const unsigned char *data = reinterpret_cast<const unsigned char *>(u8Str.data());
62    bool isOk = true;
63    for (string::size_type i = 0; i < len; ++i) {
64        uint8_t c1 = data[i]; // The first byte
65        if (IsOneByte(c1)) { // only 1 byte represents the UNICODE code point
66            u16Str.push_back(static_cast<char16_t>(c1));
67            continue;
68        }
69        switch (c1 & HIGER_4_BITS_MASK) {
70            case FOUR_BYTES_STYLE: { // 4 byte characters, from 0x10000 to 0x10FFFF
71                Utf8ToUtf16BEToData(data, u16Str, i, c1);
72                break;
73            }
74            case THREE_BYTES_STYLE: { // 3 byte characters, from 0x800 to 0xFFFF
75                uint8_t c2 = data[++i]; // The second byte
76                uint8_t c3 = data[++i]; // The third byte
77                // Calculates the UNICODE code point value
78                // (4 bits lower for the first byte, 6 bits lower for the other)
79                // 2 : shift left 2 times of UTF8_VALID_BITS
80                uint32_t codePoint = ((c1 & LOWER_4_BITS_MASK) << (2 * UTF8_VALID_BITS)) |
81                    ((c2 & LOWER_6_BITS_MASK) << UTF8_VALID_BITS) |
82                    (c3 & LOWER_6_BITS_MASK);
83                u16Str.push_back(static_cast<char16_t>(codePoint));
84                break;
85            }
86            case TWO_BYTES_STYLE1: // 2 byte characters, from 0x80 to 0x7FF
87            case TWO_BYTES_STYLE2: {
88                uint8_t c2 = data[++i]; // The second byte
89                // Calculates the UNICODE code point value
90                // (5 bits lower for the first byte, 6 bits lower for the other)
91                uint32_t codePoint = ((c1 & LOWER_5_BITS_MASK) << UTF8_VALID_BITS) |
92                    (c2 & LOWER_6_BITS_MASK);
93                u16Str.push_back(static_cast<char16_t>(codePoint));
94                break;
95            }
96            default: {
97                isOk = false;
98                break;
99            }
100        }
101    }
102    if (ok != nullptr) {
103        *ok = isOk;
104    }
105    return u16Str;
106}
107
// Produces a copy of wstr in which the two bytes of every 16-bit unit are exchanged.
u16string Utf16BEToLE(const u16string &wstr)
{
    u16string swapped;
    for (const char16_t unit : wstr) {
        // 8 : one byte is 8 bits wide
        const char16_t upperByte = (unit >> 8) & 0x00FF;
        const char16_t lowerByte = unit & 0x00FF;
        // the former lower byte moves to the top, the former upper byte to the bottom
        swapped.push_back(static_cast<char16_t>((lowerByte << 8) | upperByte));
    }
    return swapped;
}
121
122string Utf16BEToANSI(const u16string &wstr)
123{
124    string ret = "";
125    for (u16string::const_iterator it = wstr.begin(); it != wstr.end(); ++it) {
126        char16_t wc = (*it);
127        // get the lower bit from the UNICODE code point
128        char c = static_cast<char>(wc & LOWER_8_BITS_MASK);
129        ret.push_back(c);
130    }
131    return ret;
132}
133
134string Utf8ToUtf16BEToANSI(const string &str)
135{
136    u16string u16Str = Utf8ToUtf16BE(str);
137    string ret = Utf16BEToANSI(u16Str);
138    return ret;
139}
140
// Accepts letters and digits plus the four symbols '+', '/', '-' and '_'.
// The padding character '=' is not accepted.
bool IsBase64Char(unsigned char c)
{
    switch (c) {
        case '+':
        case '/':
        case '-':
        case '_':
            return true;
        default:
            return isalnum(c) != 0;
    }
}
145
146/**
147* Base64Encode - Base64 encode
148* @src: Data to be encoded
149* @len: Length of the data to be encoded
150* Returns: Allocated buffer of outLen bytes of encoded data,
151* or empty string on failure
152*/
153string Base64Encode(const unsigned char *src, size_t len, EncodingType type)
154{
155    if (src == nullptr) {
156        return string();
157    }
158    unsigned char *out = nullptr;
159    unsigned char *pos = nullptr;
160    const unsigned char *pEnd = nullptr;
161    const unsigned char *pStart = nullptr;
162    size_t outLen = 4 * ((len + 2) / 3); // 3-byte blocks to 4-byte
163
164    if (outLen < len) {
165        return string(); // integer overflow
166    }
167
168    string outStr = "";
169    outStr.resize(outLen);
170    out = reinterpret_cast<unsigned char *>(&outStr[0]);
171
172    pEnd = src + len;
173    pStart = src;
174    pos = out;
175
176    string table = BASE64_TABLE;
177    if (type == BASE64URL) {
178        table = BASE64URL_TABLE;
179    }
180    // 3 : 3 bytes is just 24 bits which is 4 times of 6 bits
181    while (pEnd - pStart >= 3) {
182        // 2 : add two zeros in front of the first set of 6 bits to become a new 8 binary bits
183        *pos = table[pStart[0] >> 2];
184        // 4 : add two zeros in front of the following second set of 6 bits to become the new 8 binary bits
185        *(pos + 1) = table[((pStart[0] & LOWER_2_BITS_MASK) << 4) | (pStart[1] >> 4)];
186        // 2 : 4 : 6 : add two zeros in front of the following third set of 6 bits to become the new 8 binary bits
187        *(pos + 2) = table[((pStart[1] & LOWER_4_BITS_MASK) << 2) | (pStart[2] >> 6)];
188        // 2 : 3 : add two zeros in front of the following forth set of 6 bits to become the new 8 binary bits
189        *(pos + 3) = table[pStart[2] & LOWER_6_BITS_MASK];
190        // 4 : the pointer of pos scrolls off 4 bytes to point the next 4 bytes of encoded chars
191        pos += 4;
192        // 3 : the pointer of pStart scrolls off 3 bytes to point the next 3 bytes of which will be encoded chars
193        pStart += 3;
194    }
195
196    // process the last set of less than 3 bytes of data
197    if (pEnd - pStart > 0) {
198        // 2 : add two zeros in front of the first set of 6 bits to become a new 8 binary bits
199        *pos = table[pStart[0] >> 2];
200        if (pEnd - pStart == 1) { // one byte remaining
201            // 4 : paddle the last two bits of the last byte with two zeros in front of it and four zeros after it
202            *(pos + 1) = table[(pStart[0] & LOWER_2_BITS_MASK) << 4];
203            // 2 : fill in the missing bytes with '='
204            *(pos + 2) = '=';
205        } else { // two bytes remaining
206            // 4 : add two zeros in front of the second set of 6 bits to become the new 8 binary bits
207            *(pos + 1) = table[((pStart[0] & LOWER_2_BITS_MASK) << 4) | (pStart[1] >> 4)];
208            // 2 : paddle the last four bits of the last byte with two zeros in front of it and two zeros after it
209            *(pos + 2) = table[(pStart[1] & LOWER_4_BITS_MASK) << 2];
210        }
211        // 3 : fill in the missing bytes with '='
212        *(pos + 3) = '=';
213    }
214
215    if (type == BASE64URL) {
216        size_t poss = outStr.find_last_not_of('=');
217        if (poss != std::string::npos) {
218            outStr.erase(poss + 1);
219        }
220    }
221    return outStr;
222}
223
224string Base64Decode(string const& encodedStr, EncodingType type)
225{
226    size_t len = encodedStr.size();
227    unsigned int index = 0;
228    unsigned int cursor = 0;
229    unsigned char charArray4[4] = {0}; // an array to stage a group of indexes for encoded string
230    unsigned char charArray3[3] = {0}; // an array to stage a set of original string
231    string ret = "";
232    string table = BASE64_TABLE;
233
234    if (type == BASE64URL) {
235        table = BASE64URL_TABLE;
236    }
237    while ((encodedStr[cursor] != '=') && IsBase64Char(encodedStr[cursor])) {
238        // stage a 4-byte string to charArray4
239        charArray4[index] = encodedStr[cursor];
240        index++;
241        cursor++;
242        if (index == 4) { // 4 : after 4 chars is assigned to charArray4
243            // 4 : fill data into charArray4
244            for (index = 0; index < 4; index++) {
245                charArray4[index] = table.find(charArray4[index]) & LOWER_8_BITS_MASK;
246            }
247            // get the last six bits of the first byte of charArray4 and the first valid
248            // 2 : 4 : two bits(except two higer bits) of the second byte, combine them to a new byte
249            charArray3[0] = (charArray4[0] << 2) + ((charArray4[1] & 0x30) >> 4);
250            // get the last four bits of the second byte of charArray4 and the first valid
251            // 4 : 2 : four bits(except two higer bits) of the third byte, combine them to a new byte
252            charArray3[1] = ((charArray4[1] & LOWER_4_BITS_MASK) << 4) + ((charArray4[2] & MIDDLE_4_BITS_MASK) >> 2);
253            // get the last two bits of the third byte of charArray4 and the forth byte,
254            // 2 : 3 : 6 : combine them to a new byte
255            charArray3[2] = ((charArray4[2] & LOWER_2_BITS_MASK) << 6) + charArray4[3];
256            // 3 : assigns the decoded string to the return value
257            for (index = 0; index < 3; index++) {
258                ret += charArray3[index];
259            }
260            index = 0;
261        }
262        if (cursor > len - 1) {
263            break;
264        }
265    }
266
267    if (index != 0) {
268        // fill data into charArray4
269        for (unsigned int i = 0; i < index; i++) {
270            charArray4[i] = table.find(charArray4[i]) & LOWER_8_BITS_MASK;
271        }
272        // get the last six bits of the first byte of charArray4 and the first valid
273        // 2 : 4 : two bits(except two higer bits) of the second byte, combine them to a new byte
274        charArray3[0] = (charArray4[0] << 2) + ((charArray4[1] & 0x30) >> 4);
275        // get the last four bits of the second byte of charArray4 and the first valid
276        // 4 : 2 : four bits(except two higer bits) of the third byte, combine them to a new byte
277        charArray3[1] = ((charArray4[1] & LOWER_4_BITS_MASK) << 4) + ((charArray4[2] & LOWER_6_BITS_MASK) >> 2);
278        // assigns the decoded string to the return value
279        for (unsigned int i = 0; i < index - 1; i++) {
280            ret += charArray3[i];
281        }
282    }
283
284    return ret;
285}
286
// Returns true only when hex is non-empty and consists solely of the
// characters 0 ~ 9, A ~ F and a ~ f.
bool IsValidHex(const string &hex)
{
    if (hex.empty()) {
        return false;
    }
    for (const char digit : hex) {
        const bool isDecimal = (digit >= '0') && (digit <= '9');
        const bool isUpperHex = (digit >= 'A') && (digit <= 'F');
        const bool isLowerHex = (digit >= 'a') && (digit <= 'f');
        if (!(isDecimal || isUpperHex || isLowerHex)) {
            return false;
        }
    }
    return true;
}
302
303string HexDecode(const string &hexStr)
304{
305    string nums = "";
306    unsigned int arrSize = hexStr.size();
307
308    // 2 : means a half length of hex str's size
309    for (unsigned int i = 0; i < arrSize / 2; i++) {
310        string hexStrTmp = "";
311        int num = 0;
312        // 2 : offset is i * 2
313        hexStrTmp.push_back(hexStr[i * 2]);
314        // 2 : offset is i * 2 + 1
315        hexStrTmp.push_back(hexStr[i * 2 + 1]);
316        if (!IsValidHex(hexStrTmp)) {
317            break;
318        }
319        // 16 : the base is 16
320        num = stoi(hexStrTmp, nullptr, 16);
321        nums.push_back(static_cast<char>(num));
322    }
323
324    return nums;
325}
326
// Walks backwards from patIndex looking for a byte equal to the last byte of pat and
// returns the distance between that position and the last position of pat.
// When no such byte exists (or patIndex is negative) the result is patLen.
int GetGoodSuffixLengthByLastChar(uint8_t *pat, int patIndex, int patLen)
{
    const int tail = patLen - 1;
    for (int probe = patIndex; probe >= 0; --probe) {
        if (pat[probe] == pat[tail]) {
            return tail - probe;
        }
    }
    // nothing found: the distance is measured from the position just before the pattern
    return tail + 1;
}
// Scans forward from patIndex for a byte equal to the first byte of pat and
// returns its index, or tarlen when there is none before tarlen.
int GetGoodSuffixLengthByFirstChar(uint8_t *pat, int patIndex, int tarlen)
{
    int probe = patIndex;
    while (probe < tarlen) {
        if (pat[probe] == pat[0]) {
            return probe;
        }
        ++probe;
    }
    return tarlen;
}
354
// Searches pat backwards, starting just before patIndex, for singleChar and returns
// the distance between patIndex and the position found.
// When singleChar does not occur before patIndex the result is patIndex + 1.
// pat        : the pattern bytes
// singleChar : the byte to look for; it is compared as an unsigned byte so that values
//              of 0x80 and above are found even where plain char is signed
// patIndex   : the position in pat the search is relative to
int GetBadCharLengthInReverseOrder(uint8_t *pat, char singleChar, int patIndex)
{
    // Without this conversion a negative char would be promoted to a negative int
    // and could never equal a uint8_t element of pat.
    const uint8_t wanted = static_cast<uint8_t>(singleChar);
    int index = -1;
    for (int i = patIndex - 1; i >= 0; --i) {
        if (pat[i] == wanted) {
            index = i;
            break;
        }
    }
    return patIndex - index;
}
368
// Searches pat forwards from patIndex for singleChar and returns the index of the
// first occurrence, or tarlen when it does not occur before tarlen.
// pat        : the pattern bytes
// singleChar : the byte to look for; it is compared as an unsigned byte so that values
//              of 0x80 and above are found even where plain char is signed
// patIndex   : the first position of pat to inspect
// tarlen     : the number of bytes in pat
int GetBadCharLengthInSequence(uint8_t *pat, char singleChar, int patIndex, int tarlen)
{
    // Without this conversion a negative char would be promoted to a negative int
    // and could never equal a uint8_t element of pat.
    const uint8_t wanted = static_cast<uint8_t>(singleChar);
    int resIndex = tarlen;
    for (int i = patIndex; i < tarlen; i++) {
        if (pat[i] == wanted) {
            resIndex = i;
            break;
        }
    }
    return resIndex;
}
381
382int FindLastIndex(uint8_t *source, uint8_t *target, int soulen, int tarlen)
383{
384    if (source == nullptr || target == nullptr) {
385        return -1;
386    }
387    if (soulen < tarlen || tarlen == 0) {
388        return -1;
389    }
390    int i = soulen - tarlen;
391    int j = 0;
392
393    while (i >= 0) {
394        if (source[i] == target[j]) {
395            if (j == tarlen - 1) {
396                return i - (tarlen - 1);
397            }
398            i++;
399            j++;
400        } else {
401            if (j == 0) {
402                int badValue = GetBadCharLengthInSequence(target, source[i], j, tarlen);
403                i = i - badValue;
404                j = 0;
405            } else {
406                int badValue = GetBadCharLengthInSequence(target, source[i], j, tarlen);
407                int goodSuffix = GetGoodSuffixLengthByFirstChar(target, j, tarlen);
408                int distance = badValue > goodSuffix ? badValue : goodSuffix;
409                i = i - distance;
410                j = 0;
411            }
412        }
413    }
414    return -1;
415}
416
417bool FindIndexInner(uint8_t* target, uint8_t* source, int tarlen, int &indexI, int &indexJ)
418{
419    if (indexJ == tarlen - 1) {
420        int badValue = GetBadCharLengthInReverseOrder(target, source[indexI], indexJ);
421        indexI = indexI + badValue;
422    } else {
423        int badValue = GetBadCharLengthInReverseOrder(target, source[indexI], indexJ);
424        int goodSuffix = GetGoodSuffixLengthByLastChar(target, indexJ, tarlen);
425        int distance = badValue > goodSuffix ? badValue : goodSuffix;
426        long addVal = static_cast<long>(indexI) + tarlen;
427        long addRst = addVal + distance;
428        if (abs(addVal) > INT_MAX || abs(addRst) > INT_MAX) {
429            return false;
430        }
431        indexI = indexI + tarlen - 1 - indexJ + distance;
432        indexJ = tarlen - 1;
433    }
434    return true;
435}
436
437int FindIndex(uint8_t* source, uint8_t* target, int soulen, int tarlen)
438{
439    if (source == nullptr || target == nullptr) {
440        return -1;
441    }
442    if (soulen < tarlen || tarlen == 0) {
443        return -1;
444    }
445    int i = tarlen - 1;
446    int j = tarlen - 1;
447    while (i < soulen) {
448        if (source[i] == target[j]) {
449            if (j == 0) {
450                return i;
451            }
452            i--;
453            j--;
454        } else {
455            bool flag = FindIndexInner(target, source, tarlen, i, j);
456            if (!flag) {
457                return -1;
458            }
459        }
460    }
461    return -1;
462}
463}
464