xref: /third_party/icu/icu4c/source/i18n/csrmbcs.cpp (revision 2e5b6d6d)
12e5b6d6dSopenharmony_ci// © 2016 and later: Unicode, Inc. and others.
22e5b6d6dSopenharmony_ci// License & terms of use: http://www.unicode.org/copyright.html
32e5b6d6dSopenharmony_ci/*
42e5b6d6dSopenharmony_ci **********************************************************************
52e5b6d6dSopenharmony_ci *   Copyright (C) 2005-2016, International Business Machines
62e5b6d6dSopenharmony_ci *   Corporation and others.  All Rights Reserved.
72e5b6d6dSopenharmony_ci **********************************************************************
82e5b6d6dSopenharmony_ci */
92e5b6d6dSopenharmony_ci
102e5b6d6dSopenharmony_ci#include "unicode/utypes.h"
112e5b6d6dSopenharmony_ci
122e5b6d6dSopenharmony_ci#if !UCONFIG_NO_CONVERSION
132e5b6d6dSopenharmony_ci
142e5b6d6dSopenharmony_ci#include "cmemory.h"
152e5b6d6dSopenharmony_ci#include "csmatch.h"
162e5b6d6dSopenharmony_ci#include "csrmbcs.h"
172e5b6d6dSopenharmony_ci
182e5b6d6dSopenharmony_ci#include <math.h>
192e5b6d6dSopenharmony_ci
202e5b6d6dSopenharmony_ciU_NAMESPACE_BEGIN
212e5b6d6dSopenharmony_ci
222e5b6d6dSopenharmony_ci#define min(x,y) (((x)<(y))?(x):(y))
232e5b6d6dSopenharmony_ci
// Frequently occurring double-byte Shift_JIS code values.
// The table is kept in strictly ascending order because it is probed with
// binarySearch().
//
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_sjis[] = {
    0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
    0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
    0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
    0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
    0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
    0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa
};
342e5b6d6dSopenharmony_ci
// Frequently occurring double-byte EUC-JP code values.
// The table is kept in strictly ascending order because it is probed with
// binarySearch().
//
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_euc_jp[] = {
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
    0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
    0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
    0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
    0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
    0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
    0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
    0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
    0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
    0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1
};
492e5b6d6dSopenharmony_ci
// Frequently occurring double-byte EUC-KR code values.
// The table is kept in strictly ascending order because it is probed with
// binarySearch().
//
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_euc_kr[] = {
    0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
    0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
    0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
    0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
    0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
    0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
    0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
    0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
    0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
    0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad
};
642e5b6d6dSopenharmony_ci
// Frequently occurring double-byte Big5 code values.
// The table is kept in strictly ascending order because it is probed with
// binarySearch().
//
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_big5[] = {
    0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
    0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
    0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
    0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
    0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
    0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
    0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
    0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
    0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
    0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f
};
792e5b6d6dSopenharmony_ci
// Frequently occurring double-byte GB18030 code values.
// The table is kept in strictly ascending order because it is probed with
// binarySearch().
//
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_gb_18030[] = {
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
    0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
    0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
    0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
    0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
    0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
    0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
    0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
    0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
    0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0
};
942e5b6d6dSopenharmony_ci
/**
 * Binary search for a 16-bit value in an ascending-sorted table.
 *
 * @param array  table to search; must be sorted in ascending order.
 * @param len    number of entries in the table (0 is allowed).
 * @param value  value to look for.
 * @return the index of a matching entry, or -1 if the value is not present.
 */
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t low  = 0;
    int32_t high = len - 1;

    while (low <= high) {
        // low + (high - low) / 2 cannot overflow, unlike (low + high) / 2.
        const int32_t  mid   = low + (high - low) / 2;
        const uint16_t probe = array[mid];

        if (probe == value) {
            return mid;
        }

        if (probe < value) {
            low = mid + 1;
        } else {
            high = mid - 1;
        }
    }

    return -1;
}
1162e5b6d6dSopenharmony_ci
1172e5b6d6dSopenharmony_ciIteratedChar::IteratedChar() :
1182e5b6d6dSopenharmony_cicharValue(0), index(-1), nextIndex(0), error(false), done(false)
1192e5b6d6dSopenharmony_ci{
1202e5b6d6dSopenharmony_ci    // nothing else to do.
1212e5b6d6dSopenharmony_ci}
1222e5b6d6dSopenharmony_ci
1232e5b6d6dSopenharmony_ci/*void IteratedChar::reset()
1242e5b6d6dSopenharmony_ci{
1252e5b6d6dSopenharmony_ci    charValue = 0;
1262e5b6d6dSopenharmony_ci    index     = -1;
1272e5b6d6dSopenharmony_ci    nextIndex = 0;
1282e5b6d6dSopenharmony_ci    error     = false;
1292e5b6d6dSopenharmony_ci    done      = false;
1302e5b6d6dSopenharmony_ci}*/
1312e5b6d6dSopenharmony_ci
1322e5b6d6dSopenharmony_ciint32_t IteratedChar::nextByte(InputText *det)
1332e5b6d6dSopenharmony_ci{
1342e5b6d6dSopenharmony_ci    if (nextIndex >= det->fRawLength) {
1352e5b6d6dSopenharmony_ci        done = true;
1362e5b6d6dSopenharmony_ci
1372e5b6d6dSopenharmony_ci        return -1;
1382e5b6d6dSopenharmony_ci    }
1392e5b6d6dSopenharmony_ci
1402e5b6d6dSopenharmony_ci    return det->fRawInput[nextIndex++];
1412e5b6d6dSopenharmony_ci}
1422e5b6d6dSopenharmony_ci
1432e5b6d6dSopenharmony_ciCharsetRecog_mbcs::~CharsetRecog_mbcs()
1442e5b6d6dSopenharmony_ci{
1452e5b6d6dSopenharmony_ci    // nothing to do.
1462e5b6d6dSopenharmony_ci}
1472e5b6d6dSopenharmony_ci
1482e5b6d6dSopenharmony_ciint32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
1492e5b6d6dSopenharmony_ci    int32_t doubleByteCharCount = 0;
1502e5b6d6dSopenharmony_ci    int32_t commonCharCount     = 0;
1512e5b6d6dSopenharmony_ci    int32_t badCharCount        = 0;
1522e5b6d6dSopenharmony_ci    int32_t totalCharCount      = 0;
1532e5b6d6dSopenharmony_ci    int32_t confidence          = 0;
1542e5b6d6dSopenharmony_ci    IteratedChar iter;
1552e5b6d6dSopenharmony_ci
1562e5b6d6dSopenharmony_ci    while (nextChar(&iter, det)) {
1572e5b6d6dSopenharmony_ci        totalCharCount++;
1582e5b6d6dSopenharmony_ci
1592e5b6d6dSopenharmony_ci        if (iter.error) {
1602e5b6d6dSopenharmony_ci            badCharCount++;
1612e5b6d6dSopenharmony_ci        } else {
1622e5b6d6dSopenharmony_ci            if (iter.charValue > 0xFF) {
1632e5b6d6dSopenharmony_ci                doubleByteCharCount++;
1642e5b6d6dSopenharmony_ci
1652e5b6d6dSopenharmony_ci                if (commonChars != 0) {
1662e5b6d6dSopenharmony_ci                    if (binarySearch(commonChars, commonCharsLen, static_cast<uint16_t>(iter.charValue)) >= 0){
1672e5b6d6dSopenharmony_ci                        commonCharCount += 1;
1682e5b6d6dSopenharmony_ci                    }
1692e5b6d6dSopenharmony_ci                }
1702e5b6d6dSopenharmony_ci            }
1712e5b6d6dSopenharmony_ci        }
1722e5b6d6dSopenharmony_ci
1732e5b6d6dSopenharmony_ci
1742e5b6d6dSopenharmony_ci        if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
1752e5b6d6dSopenharmony_ci            // Bail out early if the byte data is not matching the encoding scheme.
1762e5b6d6dSopenharmony_ci            // break detectBlock;
1772e5b6d6dSopenharmony_ci            return confidence;
1782e5b6d6dSopenharmony_ci        }
1792e5b6d6dSopenharmony_ci    }
1802e5b6d6dSopenharmony_ci
1812e5b6d6dSopenharmony_ci    if (doubleByteCharCount <= 10 && badCharCount == 0) {
1822e5b6d6dSopenharmony_ci        // Not many multi-byte chars.
1832e5b6d6dSopenharmony_ci        if (doubleByteCharCount == 0 && totalCharCount < 10) {
1842e5b6d6dSopenharmony_ci            // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
1852e5b6d6dSopenharmony_ci            // We don't have enough data to have any confidence.
1862e5b6d6dSopenharmony_ci            // Statistical analysis of single byte non-ASCII characters would probably help here.
1872e5b6d6dSopenharmony_ci            confidence = 0;
1882e5b6d6dSopenharmony_ci        }
1892e5b6d6dSopenharmony_ci        else {
1902e5b6d6dSopenharmony_ci            //   ASCII or ISO file?  It's probably not our encoding,
1912e5b6d6dSopenharmony_ci            //   but is not incompatible with our encoding, so don't give it a zero.
1922e5b6d6dSopenharmony_ci            confidence = 10;
1932e5b6d6dSopenharmony_ci        }
1942e5b6d6dSopenharmony_ci
1952e5b6d6dSopenharmony_ci        return confidence;
1962e5b6d6dSopenharmony_ci    }
1972e5b6d6dSopenharmony_ci
1982e5b6d6dSopenharmony_ci    //
1992e5b6d6dSopenharmony_ci    //  No match if there are too many characters that don't fit the encoding scheme.
2002e5b6d6dSopenharmony_ci    //    (should we have zero tolerance for these?)
2012e5b6d6dSopenharmony_ci    //
2022e5b6d6dSopenharmony_ci    if (doubleByteCharCount < 20*badCharCount) {
2032e5b6d6dSopenharmony_ci        confidence = 0;
2042e5b6d6dSopenharmony_ci
2052e5b6d6dSopenharmony_ci        return confidence;
2062e5b6d6dSopenharmony_ci    }
2072e5b6d6dSopenharmony_ci
2082e5b6d6dSopenharmony_ci    if (commonChars == 0) {
2092e5b6d6dSopenharmony_ci        // We have no statistics on frequently occurring characters.
2102e5b6d6dSopenharmony_ci        //  Assess confidence purely on having a reasonable number of
2112e5b6d6dSopenharmony_ci        //  multi-byte characters (the more the better)
2122e5b6d6dSopenharmony_ci        confidence = 30 + doubleByteCharCount - 20*badCharCount;
2132e5b6d6dSopenharmony_ci
2142e5b6d6dSopenharmony_ci        if (confidence > 100) {
2152e5b6d6dSopenharmony_ci            confidence = 100;
2162e5b6d6dSopenharmony_ci        }
2172e5b6d6dSopenharmony_ci    } else {
2182e5b6d6dSopenharmony_ci        //
2192e5b6d6dSopenharmony_ci        // Frequency of occurrence statistics exist.
2202e5b6d6dSopenharmony_ci        //
2212e5b6d6dSopenharmony_ci
2222e5b6d6dSopenharmony_ci        double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
2232e5b6d6dSopenharmony_ci        double scaleFactor = 90.0 / maxVal;
2242e5b6d6dSopenharmony_ci        confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
2252e5b6d6dSopenharmony_ci
2262e5b6d6dSopenharmony_ci        confidence = min(confidence, 100);
2272e5b6d6dSopenharmony_ci    }
2282e5b6d6dSopenharmony_ci
2292e5b6d6dSopenharmony_ci    if (confidence < 0) {
2302e5b6d6dSopenharmony_ci        confidence = 0;
2312e5b6d6dSopenharmony_ci    }
2322e5b6d6dSopenharmony_ci
2332e5b6d6dSopenharmony_ci    return confidence;
2342e5b6d6dSopenharmony_ci}
2352e5b6d6dSopenharmony_ci
2362e5b6d6dSopenharmony_ciCharsetRecog_sjis::~CharsetRecog_sjis()
2372e5b6d6dSopenharmony_ci{
2382e5b6d6dSopenharmony_ci    // nothing to do
2392e5b6d6dSopenharmony_ci}
2402e5b6d6dSopenharmony_ci
2412e5b6d6dSopenharmony_ciUBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
2422e5b6d6dSopenharmony_ci    it->index = it->nextIndex;
2432e5b6d6dSopenharmony_ci    it->error = false;
2442e5b6d6dSopenharmony_ci
2452e5b6d6dSopenharmony_ci    int32_t firstByte = it->charValue = it->nextByte(det);
2462e5b6d6dSopenharmony_ci
2472e5b6d6dSopenharmony_ci    if (firstByte < 0) {
2482e5b6d6dSopenharmony_ci        return false;
2492e5b6d6dSopenharmony_ci    }
2502e5b6d6dSopenharmony_ci
2512e5b6d6dSopenharmony_ci    if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
2522e5b6d6dSopenharmony_ci        return true;
2532e5b6d6dSopenharmony_ci    }
2542e5b6d6dSopenharmony_ci
2552e5b6d6dSopenharmony_ci    int32_t secondByte = it->nextByte(det);
2562e5b6d6dSopenharmony_ci    if (secondByte >= 0) {
2572e5b6d6dSopenharmony_ci        it->charValue = (firstByte << 8) | secondByte;
2582e5b6d6dSopenharmony_ci    }
2592e5b6d6dSopenharmony_ci    // else we'll handle the error later.
2602e5b6d6dSopenharmony_ci
2612e5b6d6dSopenharmony_ci    if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
2622e5b6d6dSopenharmony_ci        // Illegal second byte value.
2632e5b6d6dSopenharmony_ci        it->error = true;
2642e5b6d6dSopenharmony_ci    }
2652e5b6d6dSopenharmony_ci
2662e5b6d6dSopenharmony_ci    return true;
2672e5b6d6dSopenharmony_ci}
2682e5b6d6dSopenharmony_ci
2692e5b6d6dSopenharmony_ciUBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
2702e5b6d6dSopenharmony_ci    int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));
2712e5b6d6dSopenharmony_ci    results->set(det, this, confidence);
2722e5b6d6dSopenharmony_ci    return (confidence > 0);
2732e5b6d6dSopenharmony_ci}
2742e5b6d6dSopenharmony_ci
2752e5b6d6dSopenharmony_ciconst char *CharsetRecog_sjis::getName() const
2762e5b6d6dSopenharmony_ci{
2772e5b6d6dSopenharmony_ci    return "Shift_JIS";
2782e5b6d6dSopenharmony_ci}
2792e5b6d6dSopenharmony_ci
2802e5b6d6dSopenharmony_ciconst char *CharsetRecog_sjis::getLanguage() const
2812e5b6d6dSopenharmony_ci{
2822e5b6d6dSopenharmony_ci    return "ja";
2832e5b6d6dSopenharmony_ci}
2842e5b6d6dSopenharmony_ci
2852e5b6d6dSopenharmony_ciCharsetRecog_euc::~CharsetRecog_euc()
2862e5b6d6dSopenharmony_ci{
2872e5b6d6dSopenharmony_ci    // nothing to do
2882e5b6d6dSopenharmony_ci}
2892e5b6d6dSopenharmony_ci
2902e5b6d6dSopenharmony_ciUBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
2912e5b6d6dSopenharmony_ci    int32_t firstByte  = 0;
2922e5b6d6dSopenharmony_ci    int32_t secondByte = 0;
2932e5b6d6dSopenharmony_ci    int32_t thirdByte  = 0;
2942e5b6d6dSopenharmony_ci
2952e5b6d6dSopenharmony_ci    it->index = it->nextIndex;
2962e5b6d6dSopenharmony_ci    it->error = false;
2972e5b6d6dSopenharmony_ci    firstByte = it->charValue = it->nextByte(det);
2982e5b6d6dSopenharmony_ci
2992e5b6d6dSopenharmony_ci    if (firstByte < 0) {
3002e5b6d6dSopenharmony_ci        // Ran off the end of the input data
3012e5b6d6dSopenharmony_ci        return false;
3022e5b6d6dSopenharmony_ci    }
3032e5b6d6dSopenharmony_ci
3042e5b6d6dSopenharmony_ci    if (firstByte <= 0x8D) {
3052e5b6d6dSopenharmony_ci        // single byte char
3062e5b6d6dSopenharmony_ci        return true;
3072e5b6d6dSopenharmony_ci    }
3082e5b6d6dSopenharmony_ci
3092e5b6d6dSopenharmony_ci    secondByte = it->nextByte(det);
3102e5b6d6dSopenharmony_ci    if (secondByte >= 0) {
3112e5b6d6dSopenharmony_ci        it->charValue = (it->charValue << 8) | secondByte;
3122e5b6d6dSopenharmony_ci    }
3132e5b6d6dSopenharmony_ci    // else we'll handle the error later.
3142e5b6d6dSopenharmony_ci
3152e5b6d6dSopenharmony_ci    if (firstByte >= 0xA1 && firstByte <= 0xFE) {
3162e5b6d6dSopenharmony_ci        // Two byte Char
3172e5b6d6dSopenharmony_ci        if (secondByte < 0xA1) {
3182e5b6d6dSopenharmony_ci            it->error = true;
3192e5b6d6dSopenharmony_ci        }
3202e5b6d6dSopenharmony_ci
3212e5b6d6dSopenharmony_ci        return true;
3222e5b6d6dSopenharmony_ci    }
3232e5b6d6dSopenharmony_ci
3242e5b6d6dSopenharmony_ci    if (firstByte == 0x8E) {
3252e5b6d6dSopenharmony_ci        // Code Set 2.
3262e5b6d6dSopenharmony_ci        //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
3272e5b6d6dSopenharmony_ci        //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
3282e5b6d6dSopenharmony_ci        // We don't know which we've got.
3292e5b6d6dSopenharmony_ci        // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
3302e5b6d6dSopenharmony_ci        //   bytes will look like a well formed 2 byte char.
3312e5b6d6dSopenharmony_ci        if (secondByte < 0xA1) {
3322e5b6d6dSopenharmony_ci            it->error = true;
3332e5b6d6dSopenharmony_ci        }
3342e5b6d6dSopenharmony_ci
3352e5b6d6dSopenharmony_ci        return true;
3362e5b6d6dSopenharmony_ci    }
3372e5b6d6dSopenharmony_ci
3382e5b6d6dSopenharmony_ci    if (firstByte == 0x8F) {
3392e5b6d6dSopenharmony_ci        // Code set 3.
3402e5b6d6dSopenharmony_ci        // Three byte total char size, two bytes of actual char value.
3412e5b6d6dSopenharmony_ci        thirdByte    = it->nextByte(det);
3422e5b6d6dSopenharmony_ci        it->charValue = (it->charValue << 8) | thirdByte;
3432e5b6d6dSopenharmony_ci
3442e5b6d6dSopenharmony_ci        if (thirdByte < 0xa1) {
3452e5b6d6dSopenharmony_ci            // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
3462e5b6d6dSopenharmony_ci            it->error = true;
3472e5b6d6dSopenharmony_ci        }
3482e5b6d6dSopenharmony_ci    }
3492e5b6d6dSopenharmony_ci
3502e5b6d6dSopenharmony_ci    return true;
3512e5b6d6dSopenharmony_ci
3522e5b6d6dSopenharmony_ci}
3532e5b6d6dSopenharmony_ci
3542e5b6d6dSopenharmony_ciCharsetRecog_euc_jp::~CharsetRecog_euc_jp()
3552e5b6d6dSopenharmony_ci{
3562e5b6d6dSopenharmony_ci    // nothing to do
3572e5b6d6dSopenharmony_ci}
3582e5b6d6dSopenharmony_ci
3592e5b6d6dSopenharmony_ciconst char *CharsetRecog_euc_jp::getName() const
3602e5b6d6dSopenharmony_ci{
3612e5b6d6dSopenharmony_ci    return "EUC-JP";
3622e5b6d6dSopenharmony_ci}
3632e5b6d6dSopenharmony_ci
3642e5b6d6dSopenharmony_ciconst char *CharsetRecog_euc_jp::getLanguage() const
3652e5b6d6dSopenharmony_ci{
3662e5b6d6dSopenharmony_ci    return "ja";
3672e5b6d6dSopenharmony_ci}
3682e5b6d6dSopenharmony_ci
3692e5b6d6dSopenharmony_ciUBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
3702e5b6d6dSopenharmony_ci{
3712e5b6d6dSopenharmony_ci    int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));
3722e5b6d6dSopenharmony_ci    results->set(det, this, confidence);
3732e5b6d6dSopenharmony_ci    return (confidence > 0);
3742e5b6d6dSopenharmony_ci}
3752e5b6d6dSopenharmony_ci
3762e5b6d6dSopenharmony_ciCharsetRecog_euc_kr::~CharsetRecog_euc_kr()
3772e5b6d6dSopenharmony_ci{
3782e5b6d6dSopenharmony_ci    // nothing to do
3792e5b6d6dSopenharmony_ci}
3802e5b6d6dSopenharmony_ci
3812e5b6d6dSopenharmony_ciconst char *CharsetRecog_euc_kr::getName() const
3822e5b6d6dSopenharmony_ci{
3832e5b6d6dSopenharmony_ci    return "EUC-KR";
3842e5b6d6dSopenharmony_ci}
3852e5b6d6dSopenharmony_ci
3862e5b6d6dSopenharmony_ciconst char *CharsetRecog_euc_kr::getLanguage() const
3872e5b6d6dSopenharmony_ci{
3882e5b6d6dSopenharmony_ci    return "ko";
3892e5b6d6dSopenharmony_ci}
3902e5b6d6dSopenharmony_ci
3912e5b6d6dSopenharmony_ciUBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
3922e5b6d6dSopenharmony_ci{
3932e5b6d6dSopenharmony_ci    int32_t confidence =  match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));
3942e5b6d6dSopenharmony_ci    results->set(det, this, confidence);
3952e5b6d6dSopenharmony_ci    return (confidence > 0);
3962e5b6d6dSopenharmony_ci}
3972e5b6d6dSopenharmony_ci
3982e5b6d6dSopenharmony_ciCharsetRecog_big5::~CharsetRecog_big5()
3992e5b6d6dSopenharmony_ci{
4002e5b6d6dSopenharmony_ci    // nothing to do
4012e5b6d6dSopenharmony_ci}
4022e5b6d6dSopenharmony_ci
4032e5b6d6dSopenharmony_ciUBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
4042e5b6d6dSopenharmony_ci{
4052e5b6d6dSopenharmony_ci    int32_t firstByte;
4062e5b6d6dSopenharmony_ci
4072e5b6d6dSopenharmony_ci    it->index = it->nextIndex;
4082e5b6d6dSopenharmony_ci    it->error = false;
4092e5b6d6dSopenharmony_ci    firstByte = it->charValue = it->nextByte(det);
4102e5b6d6dSopenharmony_ci
4112e5b6d6dSopenharmony_ci    if (firstByte < 0) {
4122e5b6d6dSopenharmony_ci        return false;
4132e5b6d6dSopenharmony_ci    }
4142e5b6d6dSopenharmony_ci
4152e5b6d6dSopenharmony_ci    if (firstByte <= 0x7F || firstByte == 0xFF) {
4162e5b6d6dSopenharmony_ci        // single byte character.
4172e5b6d6dSopenharmony_ci        return true;
4182e5b6d6dSopenharmony_ci    }
4192e5b6d6dSopenharmony_ci
4202e5b6d6dSopenharmony_ci    int32_t secondByte = it->nextByte(det);
4212e5b6d6dSopenharmony_ci    if (secondByte >= 0)  {
4222e5b6d6dSopenharmony_ci        it->charValue = (it->charValue << 8) | secondByte;
4232e5b6d6dSopenharmony_ci    }
4242e5b6d6dSopenharmony_ci    // else we'll handle the error later.
4252e5b6d6dSopenharmony_ci
4262e5b6d6dSopenharmony_ci    if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
4272e5b6d6dSopenharmony_ci        it->error = true;
4282e5b6d6dSopenharmony_ci    }
4292e5b6d6dSopenharmony_ci
4302e5b6d6dSopenharmony_ci    return true;
4312e5b6d6dSopenharmony_ci}
4322e5b6d6dSopenharmony_ci
4332e5b6d6dSopenharmony_ciconst char *CharsetRecog_big5::getName() const
4342e5b6d6dSopenharmony_ci{
4352e5b6d6dSopenharmony_ci    return "Big5";
4362e5b6d6dSopenharmony_ci}
4372e5b6d6dSopenharmony_ci
4382e5b6d6dSopenharmony_ciconst char *CharsetRecog_big5::getLanguage() const
4392e5b6d6dSopenharmony_ci{
4402e5b6d6dSopenharmony_ci    return "zh";
4412e5b6d6dSopenharmony_ci}
4422e5b6d6dSopenharmony_ci
4432e5b6d6dSopenharmony_ciUBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
4442e5b6d6dSopenharmony_ci{
4452e5b6d6dSopenharmony_ci    int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));
4462e5b6d6dSopenharmony_ci    results->set(det, this, confidence);
4472e5b6d6dSopenharmony_ci    return (confidence > 0);
4482e5b6d6dSopenharmony_ci}
4492e5b6d6dSopenharmony_ci
4502e5b6d6dSopenharmony_ciCharsetRecog_gb_18030::~CharsetRecog_gb_18030()
4512e5b6d6dSopenharmony_ci{
4522e5b6d6dSopenharmony_ci    // nothing to do
4532e5b6d6dSopenharmony_ci}
4542e5b6d6dSopenharmony_ci
4552e5b6d6dSopenharmony_ciUBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
4562e5b6d6dSopenharmony_ci    int32_t firstByte  = 0;
4572e5b6d6dSopenharmony_ci    int32_t secondByte = 0;
4582e5b6d6dSopenharmony_ci    int32_t thirdByte  = 0;
4592e5b6d6dSopenharmony_ci    int32_t fourthByte = 0;
4602e5b6d6dSopenharmony_ci
4612e5b6d6dSopenharmony_ci    it->index = it->nextIndex;
4622e5b6d6dSopenharmony_ci    it->error = false;
4632e5b6d6dSopenharmony_ci    firstByte = it->charValue = it->nextByte(det);
4642e5b6d6dSopenharmony_ci
4652e5b6d6dSopenharmony_ci    if (firstByte < 0) {
4662e5b6d6dSopenharmony_ci        // Ran off the end of the input data
4672e5b6d6dSopenharmony_ci        return false;
4682e5b6d6dSopenharmony_ci    }
4692e5b6d6dSopenharmony_ci
4702e5b6d6dSopenharmony_ci    if (firstByte <= 0x80) {
4712e5b6d6dSopenharmony_ci        // single byte char
4722e5b6d6dSopenharmony_ci        return true;
4732e5b6d6dSopenharmony_ci    }
4742e5b6d6dSopenharmony_ci
4752e5b6d6dSopenharmony_ci    secondByte = it->nextByte(det);
4762e5b6d6dSopenharmony_ci    if (secondByte >= 0) {
4772e5b6d6dSopenharmony_ci        it->charValue = (it->charValue << 8) | secondByte;
4782e5b6d6dSopenharmony_ci    }
4792e5b6d6dSopenharmony_ci    // else we'll handle the error later.
4802e5b6d6dSopenharmony_ci
4812e5b6d6dSopenharmony_ci    if (firstByte >= 0x81 && firstByte <= 0xFE) {
4822e5b6d6dSopenharmony_ci        // Two byte Char
4832e5b6d6dSopenharmony_ci        if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
4842e5b6d6dSopenharmony_ci            return true;
4852e5b6d6dSopenharmony_ci        }
4862e5b6d6dSopenharmony_ci
4872e5b6d6dSopenharmony_ci        // Four byte char
4882e5b6d6dSopenharmony_ci        if (secondByte >= 0x30 && secondByte <= 0x39) {
4892e5b6d6dSopenharmony_ci            thirdByte = it->nextByte(det);
4902e5b6d6dSopenharmony_ci
4912e5b6d6dSopenharmony_ci            if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
4922e5b6d6dSopenharmony_ci                fourthByte = it->nextByte(det);
4932e5b6d6dSopenharmony_ci
4942e5b6d6dSopenharmony_ci                if (fourthByte >= 0x30 && fourthByte <= 0x39) {
4952e5b6d6dSopenharmony_ci                    it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
4962e5b6d6dSopenharmony_ci
4972e5b6d6dSopenharmony_ci                    return true;
4982e5b6d6dSopenharmony_ci                }
4992e5b6d6dSopenharmony_ci            }
5002e5b6d6dSopenharmony_ci        }
5012e5b6d6dSopenharmony_ci
5022e5b6d6dSopenharmony_ci        // Something wasn't valid, or we ran out of data (-1).
5032e5b6d6dSopenharmony_ci        it->error = true;
5042e5b6d6dSopenharmony_ci    }
5052e5b6d6dSopenharmony_ci
5062e5b6d6dSopenharmony_ci    return true;
5072e5b6d6dSopenharmony_ci}
5082e5b6d6dSopenharmony_ci
5092e5b6d6dSopenharmony_ciconst char *CharsetRecog_gb_18030::getName() const
5102e5b6d6dSopenharmony_ci{
5112e5b6d6dSopenharmony_ci    return "GB18030";
5122e5b6d6dSopenharmony_ci}
5132e5b6d6dSopenharmony_ci
5142e5b6d6dSopenharmony_ciconst char *CharsetRecog_gb_18030::getLanguage() const
5152e5b6d6dSopenharmony_ci{
5162e5b6d6dSopenharmony_ci    return "zh";
5172e5b6d6dSopenharmony_ci}
5182e5b6d6dSopenharmony_ci
5192e5b6d6dSopenharmony_ciUBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
5202e5b6d6dSopenharmony_ci{
5212e5b6d6dSopenharmony_ci    int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));
5222e5b6d6dSopenharmony_ci    results->set(det, this, confidence);
5232e5b6d6dSopenharmony_ci    return (confidence > 0);
5242e5b6d6dSopenharmony_ci}
5252e5b6d6dSopenharmony_ci
5262e5b6d6dSopenharmony_ciU_NAMESPACE_END
5272e5b6d6dSopenharmony_ci#endif
528