xref: /ide/tools/previewer/util/StringHelper.h (revision 7c804472)
1/*
2 * Copyright (c) 2023 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 *     http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16#ifndef STRINGHELPER_H
17#define STRINGHELPER_H
18
19#pragma once
#include <cstddef>
#include <cstdint>
#include <string>
21
// Static helpers for guessing the text encoding of a raw byte buffer and for
// converting strings to and from UTF-8.
class StringHelper {
public:
    // Encodings reported by IsUtf8Data() and DetectEncode().
    enum Encode { ANSI = 1, UTF16_LE, UTF16_BE, UTF8_BOM, UTF8 };

    // Checks whether the buffer is structurally valid multi-byte UTF-8.
    //
    // data: pointer to the bytes to inspect (a null pointer is treated as an empty buffer).
    // size: number of bytes available at data.
    //
    // Returns Encode::UTF8 only when the buffer contains at least one multi-byte sequence
    // and every sequence has a valid lead byte followed by the right number of
    // continuation bytes (10XX_XXXX). Returns Encode::ANSI in every other case: empty or
    // pure 7-bit input, a stray continuation byte, an invalid lead byte, or a sequence
    // that is cut off at the end of the buffer.
    //
    // The legacy 5- and 6-byte forms are still accepted; overlong encodings and
    // surrogate code points are not rejected.
    static Encode IsUtf8Data(const uint8_t* data, size_t size)
    {
        if (data == nullptr) {
            return Encode::ANSI;
        }
        bool onlyAscii = true;
        int32_t pending = 0; // continuation bytes still owed by the current sequence
        // size_t index: avoids the signed/unsigned comparison and overflow on huge buffers.
        for (size_t i = 0; i < size; i++) {
            const uint8_t ch = data[i];
            if (pending > 0) {
                // Inside a sequence: only a continuation byte 10XX_XXXX is acceptable.
                if ((ch & 0xC0) != 0x80) {
                    return Encode::ANSI;
                }
                pending--;
                continue;
            }
            if (ch < 0x80) { // 0XXX_XXXX: plain 7-bit character
                continue;
            }
            onlyAscii = false;
            const int32_t length = SequenceLength(ch);
            if (length == 0) {
                return Encode::ANSI;
            }
            pending = length - 1; // the lead byte itself has been consumed
        }
        // A sequence truncated by the end of the buffer, or no multi-byte data at all.
        if (pending > 0 || onlyAscii) {
            return Encode::ANSI;
        }
        return Encode::UTF8;
    }

    // Guesses the encoding of a buffer from its byte-order mark, falling back to a
    // structural UTF-8 check when no mark is present.
    //
    // data: pointer to the bytes to inspect (a null pointer yields Encode::ANSI).
    // size: number of bytes available at data.
    //
    // Returns UTF16_LE for a leading FF FE, UTF16_BE for FE FF, UTF8_BOM for EF BB BF,
    // otherwise the result of IsUtf8Data(). A buffer that consists of nothing but the
    // mark is recognised as well.
    static Encode DetectEncode(const uint8_t* data, size_t size)
    {
        if (data == nullptr) {
            return Encode::ANSI;
        }
        constexpr size_t utf16BomLen = 2;
        constexpr size_t utf8BomLen = 3;
        // The size check comes first so the indexed reads below never leave the buffer.
        if (size >= utf16BomLen && data[0] == 0xFF && data[1] == 0xFE) {
            return Encode::UTF16_LE;
        }
        if (size >= utf16BomLen && data[0] == 0xFE && data[1] == 0xFF) {
            return Encode::UTF16_BE;
        }
        if (size >= utf8BomLen && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF) {
            return Encode::UTF8_BOM;
        }
        return IsUtf8Data(data, size);
    }

    static std::string StringToUtf8(const std::string& str);
    static std::string Utf8ToString(const std::string& str);

private:
    // Returns the total length in bytes of the sequence introduced by a non-ASCII lead
    // byte, or 0 when the byte cannot start a sequence.
    static int32_t SequenceLength(uint8_t lead)
    {
        if (lead >= 0xFE) {
            return 0; // 0xFE and 0xFF match none of the lead-byte patterns below
        }
        if (lead >= 0xFC) {
            return 6; // 1111_110X 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
        }
        if (lead >= 0xF8) {
            return 5; // 1111_10XX 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
        }
        if (lead >= 0xF0) {
            return 4; // 1111_0XXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
        }
        if (lead >= 0xE0) {
            return 3; // 1110_XXXX 10XX_XXXX 10XX_XXXX
        }
        if (lead >= 0xC0) {
            return 2; // 110X_XXXX 10XX_XXXX
        }
        return 0; // 10XX_XXXX: continuation byte with no lead byte before it
    }
};
91
92#endif // STRINGHELPER_H