12e5b6d6dSopenharmony_ci// © 2016 and later: Unicode, Inc. and others. 22e5b6d6dSopenharmony_ci// License & terms of use: http://www.unicode.org/copyright.html 32e5b6d6dSopenharmony_ci/* 42e5b6d6dSopenharmony_ci ********************************************************************** 52e5b6d6dSopenharmony_ci * Copyright (C) 2005-2016, International Business Machines 62e5b6d6dSopenharmony_ci * Corporation and others. All Rights Reserved. 72e5b6d6dSopenharmony_ci ********************************************************************** 82e5b6d6dSopenharmony_ci */ 92e5b6d6dSopenharmony_ci 102e5b6d6dSopenharmony_ci#include "unicode/utypes.h" 112e5b6d6dSopenharmony_ci 122e5b6d6dSopenharmony_ci#if !UCONFIG_NO_CONVERSION 132e5b6d6dSopenharmony_ci 142e5b6d6dSopenharmony_ci#include "cmemory.h" 152e5b6d6dSopenharmony_ci#include "csmatch.h" 162e5b6d6dSopenharmony_ci#include "csrmbcs.h" 172e5b6d6dSopenharmony_ci 182e5b6d6dSopenharmony_ci#include <math.h> 192e5b6d6dSopenharmony_ci 202e5b6d6dSopenharmony_ciU_NAMESPACE_BEGIN 212e5b6d6dSopenharmony_ci 222e5b6d6dSopenharmony_ci#define min(x,y) (((x)<(y))?(x):(y)) 232e5b6d6dSopenharmony_ci 242e5b6d6dSopenharmony_cistatic const uint16_t commonChars_sjis [] = { 252e5b6d6dSopenharmony_ci// TODO: This set of data comes from the character frequency- 262e5b6d6dSopenharmony_ci// of-occurrence analysis tool. The data needs to be moved 272e5b6d6dSopenharmony_ci// into a resource and loaded from there. 
282e5b6d6dSopenharmony_ci0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0, 292e5b6d6dSopenharmony_ci0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5, 302e5b6d6dSopenharmony_ci0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc, 312e5b6d6dSopenharmony_ci0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341, 322e5b6d6dSopenharmony_ci0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389, 332e5b6d6dSopenharmony_ci0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa}; 342e5b6d6dSopenharmony_ci 352e5b6d6dSopenharmony_cistatic const uint16_t commonChars_euc_jp[] = { 362e5b6d6dSopenharmony_ci// TODO: This set of data comes from the character frequency- 372e5b6d6dSopenharmony_ci// of-occurrence analysis tool. The data needs to be moved 382e5b6d6dSopenharmony_ci// into a resource and loaded from there. 392e5b6d6dSopenharmony_ci0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2, 402e5b6d6dSopenharmony_ci0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3, 412e5b6d6dSopenharmony_ci0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4, 422e5b6d6dSopenharmony_ci0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de, 432e5b6d6dSopenharmony_ci0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef, 442e5b6d6dSopenharmony_ci0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af, 452e5b6d6dSopenharmony_ci0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7, 462e5b6d6dSopenharmony_ci0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1, 472e5b6d6dSopenharmony_ci0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee, 482e5b6d6dSopenharmony_ci0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1}; 
// Frequently occurring double-byte character codes for EUC-KR text.
// Entries are listed in ascending order, which binarySearch() requires;
// keep the order when editing.
static const uint16_t commonChars_euc_kr[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};

// Frequently occurring double-byte character codes for Big5 text.
// Entries are listed in ascending order, which binarySearch() requires;
// keep the order when editing.
static const uint16_t commonChars_big5[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};

// Frequently occurring double-byte character codes for GB18030 text.
// Entries are listed in ascending order, which binarySearch() requires;
// keep the order when editing.
static const uint16_t commonChars_gb_18030[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};

/**
 * Looks up a value in an ascending-sorted table of 16-bit codes.
 *
 * @param array  table to search; must be sorted in ascending order.
 *               A null pointer is treated as an empty table.
 * @param len    number of entries in the table; values <= 0 mean "empty".
 * @param value  code to look for.
 * @return the index of an entry equal to value, or -1 if there is none.
 */
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    if (array == nullptr) {
        return -1;
    }

    int32_t start = 0, end = len-1;

    while(start <= end) {
        // start + (end-start)/2 cannot overflow, unlike (start+end)/2.
        const int32_t mid = start + (end-start)/2;

        if(array[mid] == value) {
            return mid;
        }

        if(array[mid] < value){
            start = mid+1;
        } else {
            end = mid-1;
        }
    }

    return -1;
}
1172e5b6d6dSopenharmony_ciIteratedChar::IteratedChar() : 1182e5b6d6dSopenharmony_cicharValue(0), index(-1), nextIndex(0), error(false), done(false) 1192e5b6d6dSopenharmony_ci{ 1202e5b6d6dSopenharmony_ci // nothing else to do. 1212e5b6d6dSopenharmony_ci} 1222e5b6d6dSopenharmony_ci 1232e5b6d6dSopenharmony_ci/*void IteratedChar::reset() 1242e5b6d6dSopenharmony_ci{ 1252e5b6d6dSopenharmony_ci charValue = 0; 1262e5b6d6dSopenharmony_ci index = -1; 1272e5b6d6dSopenharmony_ci nextIndex = 0; 1282e5b6d6dSopenharmony_ci error = false; 1292e5b6d6dSopenharmony_ci done = false; 1302e5b6d6dSopenharmony_ci}*/ 1312e5b6d6dSopenharmony_ci 1322e5b6d6dSopenharmony_ciint32_t IteratedChar::nextByte(InputText *det) 1332e5b6d6dSopenharmony_ci{ 1342e5b6d6dSopenharmony_ci if (nextIndex >= det->fRawLength) { 1352e5b6d6dSopenharmony_ci done = true; 1362e5b6d6dSopenharmony_ci 1372e5b6d6dSopenharmony_ci return -1; 1382e5b6d6dSopenharmony_ci } 1392e5b6d6dSopenharmony_ci 1402e5b6d6dSopenharmony_ci return det->fRawInput[nextIndex++]; 1412e5b6d6dSopenharmony_ci} 1422e5b6d6dSopenharmony_ci 1432e5b6d6dSopenharmony_ciCharsetRecog_mbcs::~CharsetRecog_mbcs() 1442e5b6d6dSopenharmony_ci{ 1452e5b6d6dSopenharmony_ci // nothing to do. 
1462e5b6d6dSopenharmony_ci} 1472e5b6d6dSopenharmony_ci 1482e5b6d6dSopenharmony_ciint32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const { 1492e5b6d6dSopenharmony_ci int32_t doubleByteCharCount = 0; 1502e5b6d6dSopenharmony_ci int32_t commonCharCount = 0; 1512e5b6d6dSopenharmony_ci int32_t badCharCount = 0; 1522e5b6d6dSopenharmony_ci int32_t totalCharCount = 0; 1532e5b6d6dSopenharmony_ci int32_t confidence = 0; 1542e5b6d6dSopenharmony_ci IteratedChar iter; 1552e5b6d6dSopenharmony_ci 1562e5b6d6dSopenharmony_ci while (nextChar(&iter, det)) { 1572e5b6d6dSopenharmony_ci totalCharCount++; 1582e5b6d6dSopenharmony_ci 1592e5b6d6dSopenharmony_ci if (iter.error) { 1602e5b6d6dSopenharmony_ci badCharCount++; 1612e5b6d6dSopenharmony_ci } else { 1622e5b6d6dSopenharmony_ci if (iter.charValue > 0xFF) { 1632e5b6d6dSopenharmony_ci doubleByteCharCount++; 1642e5b6d6dSopenharmony_ci 1652e5b6d6dSopenharmony_ci if (commonChars != 0) { 1662e5b6d6dSopenharmony_ci if (binarySearch(commonChars, commonCharsLen, static_cast<uint16_t>(iter.charValue)) >= 0){ 1672e5b6d6dSopenharmony_ci commonCharCount += 1; 1682e5b6d6dSopenharmony_ci } 1692e5b6d6dSopenharmony_ci } 1702e5b6d6dSopenharmony_ci } 1712e5b6d6dSopenharmony_ci } 1722e5b6d6dSopenharmony_ci 1732e5b6d6dSopenharmony_ci 1742e5b6d6dSopenharmony_ci if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) { 1752e5b6d6dSopenharmony_ci // Bail out early if the byte data is not matching the encoding scheme. 1762e5b6d6dSopenharmony_ci // break detectBlock; 1772e5b6d6dSopenharmony_ci return confidence; 1782e5b6d6dSopenharmony_ci } 1792e5b6d6dSopenharmony_ci } 1802e5b6d6dSopenharmony_ci 1812e5b6d6dSopenharmony_ci if (doubleByteCharCount <= 10 && badCharCount == 0) { 1822e5b6d6dSopenharmony_ci // Not many multi-byte chars. 
1832e5b6d6dSopenharmony_ci if (doubleByteCharCount == 0 && totalCharCount < 10) { 1842e5b6d6dSopenharmony_ci // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes. 1852e5b6d6dSopenharmony_ci // We don't have enough data to have any confidence. 1862e5b6d6dSopenharmony_ci // Statistical analysis of single byte non-ASCII characters would probably help here. 1872e5b6d6dSopenharmony_ci confidence = 0; 1882e5b6d6dSopenharmony_ci } 1892e5b6d6dSopenharmony_ci else { 1902e5b6d6dSopenharmony_ci // ASCII or ISO file? It's probably not our encoding, 1912e5b6d6dSopenharmony_ci // but is not incompatible with our encoding, so don't give it a zero. 1922e5b6d6dSopenharmony_ci confidence = 10; 1932e5b6d6dSopenharmony_ci } 1942e5b6d6dSopenharmony_ci 1952e5b6d6dSopenharmony_ci return confidence; 1962e5b6d6dSopenharmony_ci } 1972e5b6d6dSopenharmony_ci 1982e5b6d6dSopenharmony_ci // 1992e5b6d6dSopenharmony_ci // No match if there are too many characters that don't fit the encoding scheme. 2002e5b6d6dSopenharmony_ci // (should we have zero tolerance for these?) 2012e5b6d6dSopenharmony_ci // 2022e5b6d6dSopenharmony_ci if (doubleByteCharCount < 20*badCharCount) { 2032e5b6d6dSopenharmony_ci confidence = 0; 2042e5b6d6dSopenharmony_ci 2052e5b6d6dSopenharmony_ci return confidence; 2062e5b6d6dSopenharmony_ci } 2072e5b6d6dSopenharmony_ci 2082e5b6d6dSopenharmony_ci if (commonChars == 0) { 2092e5b6d6dSopenharmony_ci // We have no statistics on frequently occurring characters. 
2102e5b6d6dSopenharmony_ci // Assess confidence purely on having a reasonable number of 2112e5b6d6dSopenharmony_ci // multi-byte characters (the more the better) 2122e5b6d6dSopenharmony_ci confidence = 30 + doubleByteCharCount - 20*badCharCount; 2132e5b6d6dSopenharmony_ci 2142e5b6d6dSopenharmony_ci if (confidence > 100) { 2152e5b6d6dSopenharmony_ci confidence = 100; 2162e5b6d6dSopenharmony_ci } 2172e5b6d6dSopenharmony_ci } else { 2182e5b6d6dSopenharmony_ci // 2192e5b6d6dSopenharmony_ci // Frequency of occurrence statistics exist. 2202e5b6d6dSopenharmony_ci // 2212e5b6d6dSopenharmony_ci 2222e5b6d6dSopenharmony_ci double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/ 2232e5b6d6dSopenharmony_ci double scaleFactor = 90.0 / maxVal; 2242e5b6d6dSopenharmony_ci confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0); 2252e5b6d6dSopenharmony_ci 2262e5b6d6dSopenharmony_ci confidence = min(confidence, 100); 2272e5b6d6dSopenharmony_ci } 2282e5b6d6dSopenharmony_ci 2292e5b6d6dSopenharmony_ci if (confidence < 0) { 2302e5b6d6dSopenharmony_ci confidence = 0; 2312e5b6d6dSopenharmony_ci } 2322e5b6d6dSopenharmony_ci 2332e5b6d6dSopenharmony_ci return confidence; 2342e5b6d6dSopenharmony_ci} 2352e5b6d6dSopenharmony_ci 2362e5b6d6dSopenharmony_ciCharsetRecog_sjis::~CharsetRecog_sjis() 2372e5b6d6dSopenharmony_ci{ 2382e5b6d6dSopenharmony_ci // nothing to do 2392e5b6d6dSopenharmony_ci} 2402e5b6d6dSopenharmony_ci 2412e5b6d6dSopenharmony_ciUBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const { 2422e5b6d6dSopenharmony_ci it->index = it->nextIndex; 2432e5b6d6dSopenharmony_ci it->error = false; 2442e5b6d6dSopenharmony_ci 2452e5b6d6dSopenharmony_ci int32_t firstByte = it->charValue = it->nextByte(det); 2462e5b6d6dSopenharmony_ci 2472e5b6d6dSopenharmony_ci if (firstByte < 0) { 2482e5b6d6dSopenharmony_ci return false; 2492e5b6d6dSopenharmony_ci } 2502e5b6d6dSopenharmony_ci 2512e5b6d6dSopenharmony_ci if (firstByte <= 0x7F || (firstByte > 0xA0 && 
firstByte <= 0xDF)) { 2522e5b6d6dSopenharmony_ci return true; 2532e5b6d6dSopenharmony_ci } 2542e5b6d6dSopenharmony_ci 2552e5b6d6dSopenharmony_ci int32_t secondByte = it->nextByte(det); 2562e5b6d6dSopenharmony_ci if (secondByte >= 0) { 2572e5b6d6dSopenharmony_ci it->charValue = (firstByte << 8) | secondByte; 2582e5b6d6dSopenharmony_ci } 2592e5b6d6dSopenharmony_ci // else we'll handle the error later. 2602e5b6d6dSopenharmony_ci 2612e5b6d6dSopenharmony_ci if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) { 2622e5b6d6dSopenharmony_ci // Illegal second byte value. 2632e5b6d6dSopenharmony_ci it->error = true; 2642e5b6d6dSopenharmony_ci } 2652e5b6d6dSopenharmony_ci 2662e5b6d6dSopenharmony_ci return true; 2672e5b6d6dSopenharmony_ci} 2682e5b6d6dSopenharmony_ci 2692e5b6d6dSopenharmony_ciUBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const { 2702e5b6d6dSopenharmony_ci int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis)); 2712e5b6d6dSopenharmony_ci results->set(det, this, confidence); 2722e5b6d6dSopenharmony_ci return (confidence > 0); 2732e5b6d6dSopenharmony_ci} 2742e5b6d6dSopenharmony_ci 2752e5b6d6dSopenharmony_ciconst char *CharsetRecog_sjis::getName() const 2762e5b6d6dSopenharmony_ci{ 2772e5b6d6dSopenharmony_ci return "Shift_JIS"; 2782e5b6d6dSopenharmony_ci} 2792e5b6d6dSopenharmony_ci 2802e5b6d6dSopenharmony_ciconst char *CharsetRecog_sjis::getLanguage() const 2812e5b6d6dSopenharmony_ci{ 2822e5b6d6dSopenharmony_ci return "ja"; 2832e5b6d6dSopenharmony_ci} 2842e5b6d6dSopenharmony_ci 2852e5b6d6dSopenharmony_ciCharsetRecog_euc::~CharsetRecog_euc() 2862e5b6d6dSopenharmony_ci{ 2872e5b6d6dSopenharmony_ci // nothing to do 2882e5b6d6dSopenharmony_ci} 2892e5b6d6dSopenharmony_ci 2902e5b6d6dSopenharmony_ciUBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const { 2912e5b6d6dSopenharmony_ci int32_t firstByte = 0; 2922e5b6d6dSopenharmony_ci int32_t secondByte = 0; 
2932e5b6d6dSopenharmony_ci int32_t thirdByte = 0; 2942e5b6d6dSopenharmony_ci 2952e5b6d6dSopenharmony_ci it->index = it->nextIndex; 2962e5b6d6dSopenharmony_ci it->error = false; 2972e5b6d6dSopenharmony_ci firstByte = it->charValue = it->nextByte(det); 2982e5b6d6dSopenharmony_ci 2992e5b6d6dSopenharmony_ci if (firstByte < 0) { 3002e5b6d6dSopenharmony_ci // Ran off the end of the input data 3012e5b6d6dSopenharmony_ci return false; 3022e5b6d6dSopenharmony_ci } 3032e5b6d6dSopenharmony_ci 3042e5b6d6dSopenharmony_ci if (firstByte <= 0x8D) { 3052e5b6d6dSopenharmony_ci // single byte char 3062e5b6d6dSopenharmony_ci return true; 3072e5b6d6dSopenharmony_ci } 3082e5b6d6dSopenharmony_ci 3092e5b6d6dSopenharmony_ci secondByte = it->nextByte(det); 3102e5b6d6dSopenharmony_ci if (secondByte >= 0) { 3112e5b6d6dSopenharmony_ci it->charValue = (it->charValue << 8) | secondByte; 3122e5b6d6dSopenharmony_ci } 3132e5b6d6dSopenharmony_ci // else we'll handle the error later. 3142e5b6d6dSopenharmony_ci 3152e5b6d6dSopenharmony_ci if (firstByte >= 0xA1 && firstByte <= 0xFE) { 3162e5b6d6dSopenharmony_ci // Two byte Char 3172e5b6d6dSopenharmony_ci if (secondByte < 0xA1) { 3182e5b6d6dSopenharmony_ci it->error = true; 3192e5b6d6dSopenharmony_ci } 3202e5b6d6dSopenharmony_ci 3212e5b6d6dSopenharmony_ci return true; 3222e5b6d6dSopenharmony_ci } 3232e5b6d6dSopenharmony_ci 3242e5b6d6dSopenharmony_ci if (firstByte == 0x8E) { 3252e5b6d6dSopenharmony_ci // Code Set 2. 3262e5b6d6dSopenharmony_ci // In EUC-JP, total char size is 2 bytes, only one byte of actual char value. 3272e5b6d6dSopenharmony_ci // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value. 3282e5b6d6dSopenharmony_ci // We don't know which we've got. 3292e5b6d6dSopenharmony_ci // Treat it like EUC-JP. If the data really was EUC-TW, the following two 3302e5b6d6dSopenharmony_ci // bytes will look like a well formed 2 byte char. 
3312e5b6d6dSopenharmony_ci if (secondByte < 0xA1) { 3322e5b6d6dSopenharmony_ci it->error = true; 3332e5b6d6dSopenharmony_ci } 3342e5b6d6dSopenharmony_ci 3352e5b6d6dSopenharmony_ci return true; 3362e5b6d6dSopenharmony_ci } 3372e5b6d6dSopenharmony_ci 3382e5b6d6dSopenharmony_ci if (firstByte == 0x8F) { 3392e5b6d6dSopenharmony_ci // Code set 3. 3402e5b6d6dSopenharmony_ci // Three byte total char size, two bytes of actual char value. 3412e5b6d6dSopenharmony_ci thirdByte = it->nextByte(det); 3422e5b6d6dSopenharmony_ci it->charValue = (it->charValue << 8) | thirdByte; 3432e5b6d6dSopenharmony_ci 3442e5b6d6dSopenharmony_ci if (thirdByte < 0xa1) { 3452e5b6d6dSopenharmony_ci // Bad second byte or ran off the end of the input data with a non-ASCII first byte. 3462e5b6d6dSopenharmony_ci it->error = true; 3472e5b6d6dSopenharmony_ci } 3482e5b6d6dSopenharmony_ci } 3492e5b6d6dSopenharmony_ci 3502e5b6d6dSopenharmony_ci return true; 3512e5b6d6dSopenharmony_ci 3522e5b6d6dSopenharmony_ci} 3532e5b6d6dSopenharmony_ci 3542e5b6d6dSopenharmony_ciCharsetRecog_euc_jp::~CharsetRecog_euc_jp() 3552e5b6d6dSopenharmony_ci{ 3562e5b6d6dSopenharmony_ci // nothing to do 3572e5b6d6dSopenharmony_ci} 3582e5b6d6dSopenharmony_ci 3592e5b6d6dSopenharmony_ciconst char *CharsetRecog_euc_jp::getName() const 3602e5b6d6dSopenharmony_ci{ 3612e5b6d6dSopenharmony_ci return "EUC-JP"; 3622e5b6d6dSopenharmony_ci} 3632e5b6d6dSopenharmony_ci 3642e5b6d6dSopenharmony_ciconst char *CharsetRecog_euc_jp::getLanguage() const 3652e5b6d6dSopenharmony_ci{ 3662e5b6d6dSopenharmony_ci return "ja"; 3672e5b6d6dSopenharmony_ci} 3682e5b6d6dSopenharmony_ci 3692e5b6d6dSopenharmony_ciUBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const 3702e5b6d6dSopenharmony_ci{ 3712e5b6d6dSopenharmony_ci int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp)); 3722e5b6d6dSopenharmony_ci results->set(det, this, confidence); 3732e5b6d6dSopenharmony_ci return (confidence > 0); 
3742e5b6d6dSopenharmony_ci} 3752e5b6d6dSopenharmony_ci 3762e5b6d6dSopenharmony_ciCharsetRecog_euc_kr::~CharsetRecog_euc_kr() 3772e5b6d6dSopenharmony_ci{ 3782e5b6d6dSopenharmony_ci // nothing to do 3792e5b6d6dSopenharmony_ci} 3802e5b6d6dSopenharmony_ci 3812e5b6d6dSopenharmony_ciconst char *CharsetRecog_euc_kr::getName() const 3822e5b6d6dSopenharmony_ci{ 3832e5b6d6dSopenharmony_ci return "EUC-KR"; 3842e5b6d6dSopenharmony_ci} 3852e5b6d6dSopenharmony_ci 3862e5b6d6dSopenharmony_ciconst char *CharsetRecog_euc_kr::getLanguage() const 3872e5b6d6dSopenharmony_ci{ 3882e5b6d6dSopenharmony_ci return "ko"; 3892e5b6d6dSopenharmony_ci} 3902e5b6d6dSopenharmony_ci 3912e5b6d6dSopenharmony_ciUBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const 3922e5b6d6dSopenharmony_ci{ 3932e5b6d6dSopenharmony_ci int32_t confidence = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr)); 3942e5b6d6dSopenharmony_ci results->set(det, this, confidence); 3952e5b6d6dSopenharmony_ci return (confidence > 0); 3962e5b6d6dSopenharmony_ci} 3972e5b6d6dSopenharmony_ci 3982e5b6d6dSopenharmony_ciCharsetRecog_big5::~CharsetRecog_big5() 3992e5b6d6dSopenharmony_ci{ 4002e5b6d6dSopenharmony_ci // nothing to do 4012e5b6d6dSopenharmony_ci} 4022e5b6d6dSopenharmony_ci 4032e5b6d6dSopenharmony_ciUBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const 4042e5b6d6dSopenharmony_ci{ 4052e5b6d6dSopenharmony_ci int32_t firstByte; 4062e5b6d6dSopenharmony_ci 4072e5b6d6dSopenharmony_ci it->index = it->nextIndex; 4082e5b6d6dSopenharmony_ci it->error = false; 4092e5b6d6dSopenharmony_ci firstByte = it->charValue = it->nextByte(det); 4102e5b6d6dSopenharmony_ci 4112e5b6d6dSopenharmony_ci if (firstByte < 0) { 4122e5b6d6dSopenharmony_ci return false; 4132e5b6d6dSopenharmony_ci } 4142e5b6d6dSopenharmony_ci 4152e5b6d6dSopenharmony_ci if (firstByte <= 0x7F || firstByte == 0xFF) { 4162e5b6d6dSopenharmony_ci // single byte character. 
4172e5b6d6dSopenharmony_ci return true; 4182e5b6d6dSopenharmony_ci } 4192e5b6d6dSopenharmony_ci 4202e5b6d6dSopenharmony_ci int32_t secondByte = it->nextByte(det); 4212e5b6d6dSopenharmony_ci if (secondByte >= 0) { 4222e5b6d6dSopenharmony_ci it->charValue = (it->charValue << 8) | secondByte; 4232e5b6d6dSopenharmony_ci } 4242e5b6d6dSopenharmony_ci // else we'll handle the error later. 4252e5b6d6dSopenharmony_ci 4262e5b6d6dSopenharmony_ci if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) { 4272e5b6d6dSopenharmony_ci it->error = true; 4282e5b6d6dSopenharmony_ci } 4292e5b6d6dSopenharmony_ci 4302e5b6d6dSopenharmony_ci return true; 4312e5b6d6dSopenharmony_ci} 4322e5b6d6dSopenharmony_ci 4332e5b6d6dSopenharmony_ciconst char *CharsetRecog_big5::getName() const 4342e5b6d6dSopenharmony_ci{ 4352e5b6d6dSopenharmony_ci return "Big5"; 4362e5b6d6dSopenharmony_ci} 4372e5b6d6dSopenharmony_ci 4382e5b6d6dSopenharmony_ciconst char *CharsetRecog_big5::getLanguage() const 4392e5b6d6dSopenharmony_ci{ 4402e5b6d6dSopenharmony_ci return "zh"; 4412e5b6d6dSopenharmony_ci} 4422e5b6d6dSopenharmony_ci 4432e5b6d6dSopenharmony_ciUBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const 4442e5b6d6dSopenharmony_ci{ 4452e5b6d6dSopenharmony_ci int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5)); 4462e5b6d6dSopenharmony_ci results->set(det, this, confidence); 4472e5b6d6dSopenharmony_ci return (confidence > 0); 4482e5b6d6dSopenharmony_ci} 4492e5b6d6dSopenharmony_ci 4502e5b6d6dSopenharmony_ciCharsetRecog_gb_18030::~CharsetRecog_gb_18030() 4512e5b6d6dSopenharmony_ci{ 4522e5b6d6dSopenharmony_ci // nothing to do 4532e5b6d6dSopenharmony_ci} 4542e5b6d6dSopenharmony_ci 4552e5b6d6dSopenharmony_ciUBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const { 4562e5b6d6dSopenharmony_ci int32_t firstByte = 0; 4572e5b6d6dSopenharmony_ci int32_t secondByte = 0; 4582e5b6d6dSopenharmony_ci int32_t thirdByte = 0; 
4592e5b6d6dSopenharmony_ci int32_t fourthByte = 0; 4602e5b6d6dSopenharmony_ci 4612e5b6d6dSopenharmony_ci it->index = it->nextIndex; 4622e5b6d6dSopenharmony_ci it->error = false; 4632e5b6d6dSopenharmony_ci firstByte = it->charValue = it->nextByte(det); 4642e5b6d6dSopenharmony_ci 4652e5b6d6dSopenharmony_ci if (firstByte < 0) { 4662e5b6d6dSopenharmony_ci // Ran off the end of the input data 4672e5b6d6dSopenharmony_ci return false; 4682e5b6d6dSopenharmony_ci } 4692e5b6d6dSopenharmony_ci 4702e5b6d6dSopenharmony_ci if (firstByte <= 0x80) { 4712e5b6d6dSopenharmony_ci // single byte char 4722e5b6d6dSopenharmony_ci return true; 4732e5b6d6dSopenharmony_ci } 4742e5b6d6dSopenharmony_ci 4752e5b6d6dSopenharmony_ci secondByte = it->nextByte(det); 4762e5b6d6dSopenharmony_ci if (secondByte >= 0) { 4772e5b6d6dSopenharmony_ci it->charValue = (it->charValue << 8) | secondByte; 4782e5b6d6dSopenharmony_ci } 4792e5b6d6dSopenharmony_ci // else we'll handle the error later. 4802e5b6d6dSopenharmony_ci 4812e5b6d6dSopenharmony_ci if (firstByte >= 0x81 && firstByte <= 0xFE) { 4822e5b6d6dSopenharmony_ci // Two byte Char 4832e5b6d6dSopenharmony_ci if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) { 4842e5b6d6dSopenharmony_ci return true; 4852e5b6d6dSopenharmony_ci } 4862e5b6d6dSopenharmony_ci 4872e5b6d6dSopenharmony_ci // Four byte char 4882e5b6d6dSopenharmony_ci if (secondByte >= 0x30 && secondByte <= 0x39) { 4892e5b6d6dSopenharmony_ci thirdByte = it->nextByte(det); 4902e5b6d6dSopenharmony_ci 4912e5b6d6dSopenharmony_ci if (thirdByte >= 0x81 && thirdByte <= 0xFE) { 4922e5b6d6dSopenharmony_ci fourthByte = it->nextByte(det); 4932e5b6d6dSopenharmony_ci 4942e5b6d6dSopenharmony_ci if (fourthByte >= 0x30 && fourthByte <= 0x39) { 4952e5b6d6dSopenharmony_ci it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte; 4962e5b6d6dSopenharmony_ci 4972e5b6d6dSopenharmony_ci return true; 4982e5b6d6dSopenharmony_ci } 4992e5b6d6dSopenharmony_ci } 
5002e5b6d6dSopenharmony_ci } 5012e5b6d6dSopenharmony_ci 5022e5b6d6dSopenharmony_ci // Something wasn't valid, or we ran out of data (-1). 5032e5b6d6dSopenharmony_ci it->error = true; 5042e5b6d6dSopenharmony_ci } 5052e5b6d6dSopenharmony_ci 5062e5b6d6dSopenharmony_ci return true; 5072e5b6d6dSopenharmony_ci} 5082e5b6d6dSopenharmony_ci 5092e5b6d6dSopenharmony_ciconst char *CharsetRecog_gb_18030::getName() const 5102e5b6d6dSopenharmony_ci{ 5112e5b6d6dSopenharmony_ci return "GB18030"; 5122e5b6d6dSopenharmony_ci} 5132e5b6d6dSopenharmony_ci 5142e5b6d6dSopenharmony_ciconst char *CharsetRecog_gb_18030::getLanguage() const 5152e5b6d6dSopenharmony_ci{ 5162e5b6d6dSopenharmony_ci return "zh"; 5172e5b6d6dSopenharmony_ci} 5182e5b6d6dSopenharmony_ci 5192e5b6d6dSopenharmony_ciUBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const 5202e5b6d6dSopenharmony_ci{ 5212e5b6d6dSopenharmony_ci int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030)); 5222e5b6d6dSopenharmony_ci results->set(det, this, confidence); 5232e5b6d6dSopenharmony_ci return (confidence > 0); 5242e5b6d6dSopenharmony_ci} 5252e5b6d6dSopenharmony_ci 5262e5b6d6dSopenharmony_ciU_NAMESPACE_END 5272e5b6d6dSopenharmony_ci#endif 528