/*
 * Copyright (c) 2023 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
157c804472Sopenharmony_ci
167c804472Sopenharmony_ci#ifndef STRINGHELPER_H
177c804472Sopenharmony_ci#define STRINGHELPER_H
187c804472Sopenharmony_ci
197c804472Sopenharmony_ci#pragma once
#include <cstddef>
#include <cstdint>
#include <string>
217c804472Sopenharmony_ci
/**
 * Static helpers for guessing the text encoding of a raw byte buffer and for
 * converting strings to and from UTF-8.
 */
class StringHelper {
public:
    /** Encodings that the detection helpers can report. */
    enum Encode { ANSI = 1, UTF16_LE, UTF16_BE, UTF8_BOM, UTF8 };

    /**
     * Checks whether a buffer without a byte-order mark is laid out like
     * multi-byte UTF-8 text.
     *
     * @param data Pointer to the first byte of the buffer; may be nullptr.
     * @param size Number of bytes readable through data.
     * @return Encode::UTF8 when the buffer contains at least one multi-byte
     *         sequence and every sequence is well formed (a lead byte followed
     *         by the right number of 10XX_XXXX continuation bytes);
     *         Encode::ANSI otherwise. That includes a null or empty buffer,
     *         a buffer made only of 7-bit bytes, a malformed sequence and a
     *         sequence that is cut off at the end of the buffer.
     */
    static Encode IsUtf8Data(const uint8_t* data, size_t size)
    {
        if (data == nullptr) {
            return Encode::ANSI;
        }

        bool onlySevenBit = true;
        int32_t pendingContinuations = 0; // continuation bytes still owed to the current sequence
        for (size_t i = 0; i < size; ++i) {
            const uint8_t ch = data[i];
            if (pendingContinuations > 0) {
                if ((ch & 0xC0) != 0x80) { // a continuation byte must look like 10XX_XXXX
                    return Encode::ANSI;
                }
                --pendingContinuations;
                continue;
            }
            if (ch < 0x80) { // single-byte character: 0XXX_XXXX
                continue;
            }
            onlySevenBit = false;
            const int32_t sequenceLength = LeadByteLength(ch);
            if (sequenceLength == 0) {
                return Encode::ANSI;
            }
            pendingContinuations = sequenceLength - 1;
        }

        // A sequence truncated by the end of the buffer is not valid UTF-8, and
        // pure 7-bit data is reported as ANSI rather than UTF8.
        if (pendingContinuations > 0 || onlySevenBit) {
            return Encode::ANSI;
        }
        return Encode::UTF8;
    }

    /**
     * Guesses the encoding of a buffer, looking for a byte-order mark first.
     *
     * A byte-order mark is only recognised when the buffer holds at least one
     * byte after it; otherwise the decision is delegated to IsUtf8Data().
     *
     * @param data Pointer to the first byte of the buffer; may be nullptr.
     * @param size Number of bytes readable through data.
     * @return UTF16_LE for a leading FF FE, UTF16_BE for a leading FE FF,
     *         UTF8_BOM for a leading EF BB BF, otherwise the result of
     *         IsUtf8Data(). A null buffer yields Encode::ANSI.
     */
    static Encode DetectEncode(const uint8_t* data, size_t size)
    {
        constexpr size_t utf16BomLen = 2;
        constexpr size_t utf8BomLen = 3;

        if (data == nullptr) {
            return Encode::ANSI;
        }
        if (size > utf16BomLen && data[0] == 0xFF && data[1] == 0xFE) {
            return Encode::UTF16_LE;
        }
        if (size > utf16BomLen && data[0] == 0xFE && data[1] == 0xFF) {
            return Encode::UTF16_BE;
        }
        if (size > utf8BomLen && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF) {
            return Encode::UTF8_BOM;
        }
        return IsUtf8Data(data, size);
    }

    static std::string StringToUtf8(const std::string& str);
    static std::string Utf8ToString(const std::string& str);

private:
    /**
     * Returns the total length in bytes of the multi-byte sequence introduced
     * by the given lead byte, or 0 when the byte cannot start a sequence.
     *
     * The legacy 5- and 6-byte forms are still accepted, as they were before.
     * 0xFE and 0xFF never occur in UTF-8 and are rejected.
     */
    static int32_t LeadByteLength(uint8_t ch)
    {
        if (ch >= 0xFE) {
            return 0; // 0xFE / 0xFF are not valid in any UTF-8 sequence
        }
        if (ch >= 0xFC) {
            return 6; // 1111_110X followed by five continuation bytes
        }
        if (ch >= 0xF8) {
            return 5; // 1111_10XX followed by four continuation bytes
        }
        if (ch >= 0xF0) {
            return 4; // 1111_0XXX followed by three continuation bytes
        }
        if (ch >= 0xE0) {
            return 3; // 1110_XXXX followed by two continuation bytes
        }
        if (ch >= 0xC0) {
            return 2; // 110X_XXXX followed by one continuation byte
        }
        return 0; // 10XX_XXXX is a continuation byte, not a lead byte
    }
};
917c804472Sopenharmony_ci
927c804472Sopenharmony_ci#endif // STRINGHELPER_H