11cb0ef41Sopenharmony_ci// © 2016 and later: Unicode, Inc. and others. 21cb0ef41Sopenharmony_ci// License & terms of use: http://www.unicode.org/copyright.html 31cb0ef41Sopenharmony_ci/* 41cb0ef41Sopenharmony_ci********************************************************************** 51cb0ef41Sopenharmony_ci* Copyright (C) 2000-2016, International Business Machines 61cb0ef41Sopenharmony_ci* Corporation and others. All Rights Reserved. 71cb0ef41Sopenharmony_ci********************************************************************** 81cb0ef41Sopenharmony_ci* file name: ucnv_lmb.cpp 91cb0ef41Sopenharmony_ci* encoding: UTF-8 101cb0ef41Sopenharmony_ci* tab size: 4 (not used) 111cb0ef41Sopenharmony_ci* indentation:4 121cb0ef41Sopenharmony_ci* 131cb0ef41Sopenharmony_ci* created on: 2000feb09 141cb0ef41Sopenharmony_ci* created by: Brendan Murray 151cb0ef41Sopenharmony_ci* extensively hacked up by: Jim Snyder-Grant 161cb0ef41Sopenharmony_ci* 171cb0ef41Sopenharmony_ci* Modification History: 181cb0ef41Sopenharmony_ci* 191cb0ef41Sopenharmony_ci* Date Name Description 201cb0ef41Sopenharmony_ci* 211cb0ef41Sopenharmony_ci* 06/20/2000 helena OS/400 port changes; mostly typecast. 221cb0ef41Sopenharmony_ci* 06/27/2000 Jim Snyder-Grant Deal with partial characters and small buffers. 231cb0ef41Sopenharmony_ci* Add comments to document LMBCS format and implementation 241cb0ef41Sopenharmony_ci* restructured order & breakdown of functions 251cb0ef41Sopenharmony_ci* 06/28/2000 helena Major rewrite for the callback API changes. 
261cb0ef41Sopenharmony_ci*/ 271cb0ef41Sopenharmony_ci 281cb0ef41Sopenharmony_ci#include "unicode/utypes.h" 291cb0ef41Sopenharmony_ci 301cb0ef41Sopenharmony_ci#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION 311cb0ef41Sopenharmony_ci 321cb0ef41Sopenharmony_ci#include "unicode/ucnv_err.h" 331cb0ef41Sopenharmony_ci#include "unicode/ucnv.h" 341cb0ef41Sopenharmony_ci#include "unicode/uset.h" 351cb0ef41Sopenharmony_ci#include "cmemory.h" 361cb0ef41Sopenharmony_ci#include "cstring.h" 371cb0ef41Sopenharmony_ci#include "uassert.h" 381cb0ef41Sopenharmony_ci#include "ucnv_imp.h" 391cb0ef41Sopenharmony_ci#include "ucnv_bld.h" 401cb0ef41Sopenharmony_ci#include "ucnv_cnv.h" 411cb0ef41Sopenharmony_ci 421cb0ef41Sopenharmony_ci#ifdef EBCDIC_RTL 431cb0ef41Sopenharmony_ci #include "ascii_a.h" 441cb0ef41Sopenharmony_ci#endif 451cb0ef41Sopenharmony_ci 461cb0ef41Sopenharmony_ci/* 471cb0ef41Sopenharmony_ci LMBCS 481cb0ef41Sopenharmony_ci 491cb0ef41Sopenharmony_ci (Lotus Multi-Byte Character Set) 501cb0ef41Sopenharmony_ci 511cb0ef41Sopenharmony_ci LMBCS was invented in the late 1980's and is primarily used in Lotus Notes 521cb0ef41Sopenharmony_ci databases and in Lotus 1-2-3 files. Programmers who work with the APIs 531cb0ef41Sopenharmony_ci into these products will sometimes need to deal with strings in this format. 541cb0ef41Sopenharmony_ci 551cb0ef41Sopenharmony_ci The code in this file provides an implementation for an ICU converter of 561cb0ef41Sopenharmony_ci LMBCS to and from Unicode. 571cb0ef41Sopenharmony_ci 581cb0ef41Sopenharmony_ci Since the LMBCS character set is only sparsely documented in existing 591cb0ef41Sopenharmony_ci printed or online material, we have added extensive annotation to this 601cb0ef41Sopenharmony_ci file to serve as a guide to understanding LMBCS. 
611cb0ef41Sopenharmony_ci 621cb0ef41Sopenharmony_ci LMBCS was originally designed with these four sometimes-competing design goals: 631cb0ef41Sopenharmony_ci 641cb0ef41Sopenharmony_ci -Provide encodings for the characters in 12 existing national standards 651cb0ef41Sopenharmony_ci (plus a few other characters) 661cb0ef41Sopenharmony_ci -Minimal memory footprint 671cb0ef41Sopenharmony_ci -Maximal speed of conversion into the existing national character sets 681cb0ef41Sopenharmony_ci -No need to track a changing state as you interpret a string. 691cb0ef41Sopenharmony_ci 701cb0ef41Sopenharmony_ci 711cb0ef41Sopenharmony_ci All of the national character sets LMBCS was trying to encode are 'ANSI' 721cb0ef41Sopenharmony_ci based, in that the bytes from 0x20 - 0x7F are almost exactly the 731cb0ef41Sopenharmony_ci same common Latin unaccented characters and symbols in all character sets. 741cb0ef41Sopenharmony_ci 751cb0ef41Sopenharmony_ci So, in order to help meet the speed & memory design goals, the common ANSI 761cb0ef41Sopenharmony_ci bytes from 0x20-0x7F are represented by the same single-byte values in LMBCS. 771cb0ef41Sopenharmony_ci 781cb0ef41Sopenharmony_ci The general LMBCS code unit is from 1-3 bytes. We can describe the 3 bytes as 791cb0ef41Sopenharmony_ci follows: 801cb0ef41Sopenharmony_ci 811cb0ef41Sopenharmony_ci [G] D1 [D2] 821cb0ef41Sopenharmony_ci 831cb0ef41Sopenharmony_ci That is, a sometimes-optional 'group' byte, followed by 1 and sometimes 2 841cb0ef41Sopenharmony_ci data bytes. The maximum size of a LMBCS character is 3 bytes: 851cb0ef41Sopenharmony_ci*/ 861cb0ef41Sopenharmony_ci#define ULMBCS_CHARSIZE_MAX 3 871cb0ef41Sopenharmony_ci/* 881cb0ef41Sopenharmony_ci The single-byte values from 0x20 to 0x7F are examples of single D1 bytes. 
891cb0ef41Sopenharmony_ci We often have to figure out if byte values are below or above this, so we 901cb0ef41Sopenharmony_ci use the ANSI nomenclature 'C0' and 'C1' to refer to the range of control 911cb0ef41Sopenharmony_ci characters just above & below the common lower-ANSI range */ 921cb0ef41Sopenharmony_ci#define ULMBCS_C0END 0x1F 931cb0ef41Sopenharmony_ci#define ULMBCS_C1START 0x80 941cb0ef41Sopenharmony_ci/* 951cb0ef41Sopenharmony_ci Since LMBCS is always dealing in byte units. we create a local type here for 961cb0ef41Sopenharmony_ci dealing with these units of LMBCS code units: 971cb0ef41Sopenharmony_ci 981cb0ef41Sopenharmony_ci*/ 991cb0ef41Sopenharmony_citypedef uint8_t ulmbcs_byte_t; 1001cb0ef41Sopenharmony_ci 1011cb0ef41Sopenharmony_ci/* 1021cb0ef41Sopenharmony_ci Most of the values less than 0x20 are reserved in LMBCS to announce 1031cb0ef41Sopenharmony_ci which national character standard is being used for the 'D' bytes. 1041cb0ef41Sopenharmony_ci In the comments we show the common name and the IBM character-set ID 1051cb0ef41Sopenharmony_ci for these character-set announcers: 1061cb0ef41Sopenharmony_ci*/ 1071cb0ef41Sopenharmony_ci 1081cb0ef41Sopenharmony_ci#define ULMBCS_GRP_L1 0x01 /* Latin-1 :ibm-850 */ 1091cb0ef41Sopenharmony_ci#define ULMBCS_GRP_GR 0x02 /* Greek :ibm-851 */ 1101cb0ef41Sopenharmony_ci#define ULMBCS_GRP_HE 0x03 /* Hebrew :ibm-1255 */ 1111cb0ef41Sopenharmony_ci#define ULMBCS_GRP_AR 0x04 /* Arabic :ibm-1256 */ 1121cb0ef41Sopenharmony_ci#define ULMBCS_GRP_RU 0x05 /* Cyrillic :ibm-1251 */ 1131cb0ef41Sopenharmony_ci#define ULMBCS_GRP_L2 0x06 /* Latin-2 :ibm-852 */ 1141cb0ef41Sopenharmony_ci#define ULMBCS_GRP_TR 0x08 /* Turkish :ibm-1254 */ 1151cb0ef41Sopenharmony_ci#define ULMBCS_GRP_TH 0x0B /* Thai :ibm-874 */ 1161cb0ef41Sopenharmony_ci#define ULMBCS_GRP_JA 0x10 /* Japanese :ibm-943 */ 1171cb0ef41Sopenharmony_ci#define ULMBCS_GRP_KO 0x11 /* Korean :ibm-1261 */ 1181cb0ef41Sopenharmony_ci#define ULMBCS_GRP_TW 0x12 /* Chinese TC :ibm-950 
*/ 1191cb0ef41Sopenharmony_ci#define ULMBCS_GRP_CN 0x13 /* Chinese SC :ibm-1386 */ 1201cb0ef41Sopenharmony_ci 1211cb0ef41Sopenharmony_ci/* 1221cb0ef41Sopenharmony_ci So, the beginning of understanding LMBCS is that IF the first byte of a LMBCS 1231cb0ef41Sopenharmony_ci character is one of those 12 values, you can interpret the remaining bytes of 1241cb0ef41Sopenharmony_ci that character as coming from one of those character sets. Since the lower 1251cb0ef41Sopenharmony_ci ANSI bytes already are represented in single bytes, using one of the character 1261cb0ef41Sopenharmony_ci set announcers is used to announce a character that starts with a byte of 1271cb0ef41Sopenharmony_ci 0x80 or greater. 1281cb0ef41Sopenharmony_ci 1291cb0ef41Sopenharmony_ci The character sets are arranged so that the single byte sets all appear 1301cb0ef41Sopenharmony_ci before the multi-byte character sets. When we need to tell whether a 1311cb0ef41Sopenharmony_ci group byte is for a single byte char set or not we use this define: */ 1321cb0ef41Sopenharmony_ci 1331cb0ef41Sopenharmony_ci#define ULMBCS_DOUBLEOPTGROUP_START 0x10 1341cb0ef41Sopenharmony_ci 1351cb0ef41Sopenharmony_ci/* 1361cb0ef41Sopenharmony_ciHowever, to fully understand LMBCS, you must also understand a series of 1371cb0ef41Sopenharmony_ciexceptions & optimizations made in service of the design goals. 1381cb0ef41Sopenharmony_ci 1391cb0ef41Sopenharmony_ciFirst, those of you who are character set mavens may have noticed that 1401cb0ef41Sopenharmony_cithe 'double-byte' character sets are actually multi-byte character sets 1411cb0ef41Sopenharmony_cithat can have 1 or two bytes, even in the upper-ascii range.
To force 1421cb0ef41Sopenharmony_cieach group byte to introduce a fixed-width encoding (to make it faster to 1431cb0ef41Sopenharmony_cicount characters), we use a convention of doubling up on the group byte 1441cb0ef41Sopenharmony_cito introduce any single-byte character > 0x80 in an otherwise double-byte 1451cb0ef41Sopenharmony_cicharacter set. So, for example, the LMBCS sequence x10 x10 xAE is the 1461cb0ef41Sopenharmony_cisame as '0xAE' in the Japanese code page 943. 1471cb0ef41Sopenharmony_ci 1481cb0ef41Sopenharmony_ciNext, you will notice that the list of group bytes has some gaps. 1491cb0ef41Sopenharmony_ciThese are used in various ways. 1501cb0ef41Sopenharmony_ci 1511cb0ef41Sopenharmony_ciWe reserve a few special single byte values for common control 1521cb0ef41Sopenharmony_cicharacters. These are in the same place as their ANSI equivalents for speed. 1531cb0ef41Sopenharmony_ci*/ 1541cb0ef41Sopenharmony_ci 1551cb0ef41Sopenharmony_ci#define ULMBCS_HT 0x09 /* Fixed control char - Horizontal Tab */ 1561cb0ef41Sopenharmony_ci#define ULMBCS_LF 0x0A /* Fixed control char - Line Feed */ 1571cb0ef41Sopenharmony_ci#define ULMBCS_CR 0x0D /* Fixed control char - Carriage Return */ 1581cb0ef41Sopenharmony_ci 1591cb0ef41Sopenharmony_ci/* Then, 1-2-3 reserved a special single-byte character to put at the 1601cb0ef41Sopenharmony_cibeginning of internal 'system' range names: */ 1611cb0ef41Sopenharmony_ci 1621cb0ef41Sopenharmony_ci#define ULMBCS_123SYSTEMRANGE 0x19 1631cb0ef41Sopenharmony_ci 1641cb0ef41Sopenharmony_ci/* Then we needed a place to put all the other ansi control characters 1651cb0ef41Sopenharmony_cithat must be moved to different values because LMBCS reserves those 1661cb0ef41Sopenharmony_civalues for other purposes. 
To represent the control characters, we start 1671cb0ef41Sopenharmony_ciwith a first byte of 0xF & add the control character value as the 1681cb0ef41Sopenharmony_cisecond byte */ 1691cb0ef41Sopenharmony_ci#define ULMBCS_GRP_CTRL 0x0F 1701cb0ef41Sopenharmony_ci 1711cb0ef41Sopenharmony_ci/* For the C0 controls (less than 0x20), we add 0x20 to preserve the 1721cb0ef41Sopenharmony_ciuseful doctrine that any byte less than 0x20 in a LMBCS char must be 1731cb0ef41Sopenharmony_cithe first byte of a character:*/ 1741cb0ef41Sopenharmony_ci#define ULMBCS_CTRLOFFSET 0x20 1751cb0ef41Sopenharmony_ci 1761cb0ef41Sopenharmony_ci/* 1771cb0ef41Sopenharmony_ciWhere to put the characters that aren't part of any of the 12 national 1781cb0ef41Sopenharmony_cicharacter sets? The first thing that was done, in the earlier years of 1791cb0ef41Sopenharmony_ciLMBCS, was to use up the spaces of the form 1801cb0ef41Sopenharmony_ci 1811cb0ef41Sopenharmony_ci [G] D1, 1821cb0ef41Sopenharmony_ci 1831cb0ef41Sopenharmony_ci where 'G' was one of the single-byte character groups, and 1841cb0ef41Sopenharmony_ci D1 was less than 0x80. These sequences are gathered together 1851cb0ef41Sopenharmony_ci into a Lotus-invented doublebyte character set to represent a 1861cb0ef41Sopenharmony_ci lot of stray values. Internally, in this implementation, we track this 1871cb0ef41Sopenharmony_ci as group '0', as a place to tuck this exceptions list.*/ 1881cb0ef41Sopenharmony_ci 1891cb0ef41Sopenharmony_ci#define ULMBCS_GRP_EXCEPT 0x00 1901cb0ef41Sopenharmony_ci/* 1911cb0ef41Sopenharmony_ci Finally, as the durability and usefulness of UNICODE became clear, 1921cb0ef41Sopenharmony_ci LOTUS added a new group 0x14 to hold Unicode values not otherwise 1931cb0ef41Sopenharmony_ci represented in LMBCS: */ 1941cb0ef41Sopenharmony_ci#define ULMBCS_GRP_UNICODE 0x14 1951cb0ef41Sopenharmony_ci/* The two bytes appearing after a 0x14 are interpreted as UTF-16 BE 1961cb0ef41Sopenharmony_ci(Big-Endian) characters.
The exception comes when the UTF16 1971cb0ef41Sopenharmony_cirepresentation would have a zero as the second byte. In that case, 1981cb0ef41Sopenharmony_ci'F6' is used in its place, and the bytes are swapped. (This prevents 1991cb0ef41Sopenharmony_ciLMBCS from encoding any Unicode values of the form U+F6xx, but that's OK: 2001cb0ef41Sopenharmony_ci0xF6xx is in the middle of the Private Use Area.)*/ 2011cb0ef41Sopenharmony_ci#define ULMBCS_UNICOMPATZERO 0xF6 2021cb0ef41Sopenharmony_ci 2031cb0ef41Sopenharmony_ci/* It is also useful in our code to have a constant for the size of 2041cb0ef41Sopenharmony_cia LMBCS char that holds a literal Unicode value */ 2051cb0ef41Sopenharmony_ci#define ULMBCS_UNICODE_SIZE 3 2061cb0ef41Sopenharmony_ci 2071cb0ef41Sopenharmony_ci/* 2081cb0ef41Sopenharmony_ciTo squish the LMBCS representations down even further, and to make 2091cb0ef41Sopenharmony_citranslations even faster, sometimes the optimization group byte can be dropped 2101cb0ef41Sopenharmony_cifrom a LMBCS character. This is decided on a process-by-process basis. The 2111cb0ef41Sopenharmony_cigroup byte that is dropped is called the 'optimization group'. 2121cb0ef41Sopenharmony_ci 2131cb0ef41Sopenharmony_ciFor Notes, the optimization group is always 0x1.*/ 2141cb0ef41Sopenharmony_ci#define ULMBCS_DEFAULTOPTGROUP 0x1 2151cb0ef41Sopenharmony_ci/* For 1-2-3 files, the optimization group is stored in the header of the 1-2-3 2161cb0ef41Sopenharmony_cifile. 2171cb0ef41Sopenharmony_ci 2181cb0ef41Sopenharmony_ci In any case, when using ICU, you either pass in the 2191cb0ef41Sopenharmony_cioptimization group as part of the name of the converter (LMBCS-1, LMBCS-2, 2201cb0ef41Sopenharmony_cietc.). Using plain 'LMBCS' as the name of the converter will give you 2211cb0ef41Sopenharmony_ciLMBCS-1.
*** Implementation strategy ***


Because of the extensive use of other character sets, the LMBCS converter
keeps a mapping between optimization groups and IBM character sets, so that
ICU converters can be created and used as needed. */

/* As you can see, even though any byte below 0x20 could be an optimization
byte, only those at 0x13 or below can map to an actual converter. To limit
some loops and searches, we define a value for that last group converter:*/

#define ULMBCS_GRP_LAST       0x13   /* last LMBCS group that has a converter */

/* Converter name for each LMBCS group byte, indexed by the group byte value
   (0x00 through ULMBCS_GRP_LAST). A nullptr entry marks a byte value that has
   no converter name in this table: an unused value, one of the fixed control
   characters (HT, LF, CR), or the 0x0F control-character group, which is
   noted below as algorithmic. */
static const char * const OptGroupByteToCPName[ULMBCS_GRP_LAST + 1] = {
   /* 0x0000 */ "lmb-excp", /* internal home for the LOTUS exceptions list */
   /* 0x0001 */ "ibm-850",
   /* 0x0002 */ "ibm-851",
   /* 0x0003 */ "windows-1255",
   /* 0x0004 */ "windows-1256",
   /* 0x0005 */ "windows-1251",
   /* 0x0006 */ "ibm-852",
   /* 0x0007 */ nullptr,      /* Unused */
   /* 0x0008 */ "windows-1254",
   /* 0x0009 */ nullptr,      /* Control char HT */
   /* 0x000A */ nullptr,      /* Control char LF */
   /* 0x000B */ "windows-874",
   /* 0x000C */ nullptr,      /* Unused */
   /* 0x000D */ nullptr,      /* Control char CR */
   /* 0x000E */ nullptr,      /* Unused */
   /* 0x000F */ nullptr,      /* Control chars: 0x0F20 + C0/C1 character: algorithmic */
   /* 0x0010 */ "windows-932",
   /* 0x0011 */ "windows-949",
   /* 0x0012 */ "windows-950",
   /* 0x0013 */ "windows-936"

   /* The rest are null, including the 0x0014 Unicode compatibility region
   and 0x0019, the 1-2-3 system range control char */
};


/* That's approximately all the data that's needed for translating
  LMBCS to Unicode.


However, to translate Unicode to LMBCS, we need some more support.

That's because there are often more than one possible mappings from a Unicode
code point back into LMBCS. The first thing we do is look up into a table
to figure out if there are more than one possible mappings. This table,
arranged by Unicode values (including ranges) either lists which group
to use, or says that it could go into one or more of the SBCS sets, or
into one or more of the DBCS sets.  (If the character exists in both DBCS &
SBCS, the table will place it in the SBCS sets, to make the LMBCS code point
length as small as possible.
Here's the two special markers we use to indicate 2781cb0ef41Sopenharmony_ciambiguous mappings: */ 2791cb0ef41Sopenharmony_ci 2801cb0ef41Sopenharmony_ci#define ULMBCS_AMBIGUOUS_SBCS 0x80 /* could fit in more than one 2811cb0ef41Sopenharmony_ci LMBCS sbcs native encoding 2821cb0ef41Sopenharmony_ci (example: most accented latin) */ 2831cb0ef41Sopenharmony_ci#define ULMBCS_AMBIGUOUS_MBCS 0x81 /* could fit in more than one 2841cb0ef41Sopenharmony_ci LMBCS mbcs native encoding 2851cb0ef41Sopenharmony_ci (example: Unihan) */ 2861cb0ef41Sopenharmony_ci#define ULMBCS_AMBIGUOUS_ALL 0x82 2871cb0ef41Sopenharmony_ci/* And here's a simple way to see if a group falls in an appropriate range */ 2881cb0ef41Sopenharmony_ci#define ULMBCS_AMBIGUOUS_MATCH(agroup, xgroup) \ 2891cb0ef41Sopenharmony_ci ((((agroup) == ULMBCS_AMBIGUOUS_SBCS) && \ 2901cb0ef41Sopenharmony_ci (xgroup) < ULMBCS_DOUBLEOPTGROUP_START) || \ 2911cb0ef41Sopenharmony_ci (((agroup) == ULMBCS_AMBIGUOUS_MBCS) && \ 2921cb0ef41Sopenharmony_ci (xgroup) >= ULMBCS_DOUBLEOPTGROUP_START)) || \ 2931cb0ef41Sopenharmony_ci ((agroup) == ULMBCS_AMBIGUOUS_ALL) 2941cb0ef41Sopenharmony_ci 2951cb0ef41Sopenharmony_ci 2961cb0ef41Sopenharmony_ci/* The table & some code to use it: */ 2971cb0ef41Sopenharmony_ci 2981cb0ef41Sopenharmony_ci 2991cb0ef41Sopenharmony_cistatic const struct _UniLMBCSGrpMap 3001cb0ef41Sopenharmony_ci{ 3011cb0ef41Sopenharmony_ci const char16_t uniStartRange; 3021cb0ef41Sopenharmony_ci const char16_t uniEndRange; 3031cb0ef41Sopenharmony_ci const ulmbcs_byte_t GrpType; 3041cb0ef41Sopenharmony_ci} UniLMBCSGrpMap[] 3051cb0ef41Sopenharmony_ci= 3061cb0ef41Sopenharmony_ci{ 3071cb0ef41Sopenharmony_ci 3081cb0ef41Sopenharmony_ci {0x0001, 0x001F, ULMBCS_GRP_CTRL}, 3091cb0ef41Sopenharmony_ci {0x0080, 0x009F, ULMBCS_GRP_CTRL}, 3101cb0ef41Sopenharmony_ci {0x00A0, 0x00A6, ULMBCS_AMBIGUOUS_SBCS}, 3111cb0ef41Sopenharmony_ci {0x00A7, 0x00A8, ULMBCS_AMBIGUOUS_ALL}, 3121cb0ef41Sopenharmony_ci {0x00A9, 0x00AF, ULMBCS_AMBIGUOUS_SBCS}, 
3131cb0ef41Sopenharmony_ci {0x00B0, 0x00B1, ULMBCS_AMBIGUOUS_ALL}, 3141cb0ef41Sopenharmony_ci {0x00B2, 0x00B3, ULMBCS_AMBIGUOUS_SBCS}, 3151cb0ef41Sopenharmony_ci {0x00B4, 0x00B4, ULMBCS_AMBIGUOUS_ALL}, 3161cb0ef41Sopenharmony_ci {0x00B5, 0x00B5, ULMBCS_AMBIGUOUS_SBCS}, 3171cb0ef41Sopenharmony_ci {0x00B6, 0x00B6, ULMBCS_AMBIGUOUS_ALL}, 3181cb0ef41Sopenharmony_ci {0x00B7, 0x00D6, ULMBCS_AMBIGUOUS_SBCS}, 3191cb0ef41Sopenharmony_ci {0x00D7, 0x00D7, ULMBCS_AMBIGUOUS_ALL}, 3201cb0ef41Sopenharmony_ci {0x00D8, 0x00F6, ULMBCS_AMBIGUOUS_SBCS}, 3211cb0ef41Sopenharmony_ci {0x00F7, 0x00F7, ULMBCS_AMBIGUOUS_ALL}, 3221cb0ef41Sopenharmony_ci {0x00F8, 0x01CD, ULMBCS_AMBIGUOUS_SBCS}, 3231cb0ef41Sopenharmony_ci {0x01CE, 0x01CE, ULMBCS_GRP_TW }, 3241cb0ef41Sopenharmony_ci {0x01CF, 0x02B9, ULMBCS_AMBIGUOUS_SBCS}, 3251cb0ef41Sopenharmony_ci {0x02BA, 0x02BA, ULMBCS_GRP_CN}, 3261cb0ef41Sopenharmony_ci {0x02BC, 0x02C8, ULMBCS_AMBIGUOUS_SBCS}, 3271cb0ef41Sopenharmony_ci {0x02C9, 0x02D0, ULMBCS_AMBIGUOUS_MBCS}, 3281cb0ef41Sopenharmony_ci {0x02D8, 0x02DD, ULMBCS_AMBIGUOUS_SBCS}, 3291cb0ef41Sopenharmony_ci {0x0384, 0x0390, ULMBCS_AMBIGUOUS_SBCS}, 3301cb0ef41Sopenharmony_ci {0x0391, 0x03A9, ULMBCS_AMBIGUOUS_ALL}, 3311cb0ef41Sopenharmony_ci {0x03AA, 0x03B0, ULMBCS_AMBIGUOUS_SBCS}, 3321cb0ef41Sopenharmony_ci {0x03B1, 0x03C9, ULMBCS_AMBIGUOUS_ALL}, 3331cb0ef41Sopenharmony_ci {0x03CA, 0x03CE, ULMBCS_AMBIGUOUS_SBCS}, 3341cb0ef41Sopenharmony_ci {0x0400, 0x0400, ULMBCS_GRP_RU}, 3351cb0ef41Sopenharmony_ci {0x0401, 0x0401, ULMBCS_AMBIGUOUS_ALL}, 3361cb0ef41Sopenharmony_ci {0x0402, 0x040F, ULMBCS_GRP_RU}, 3371cb0ef41Sopenharmony_ci {0x0410, 0x0431, ULMBCS_AMBIGUOUS_ALL}, 3381cb0ef41Sopenharmony_ci {0x0432, 0x044E, ULMBCS_GRP_RU}, 3391cb0ef41Sopenharmony_ci {0x044F, 0x044F, ULMBCS_AMBIGUOUS_ALL}, 3401cb0ef41Sopenharmony_ci {0x0450, 0x0491, ULMBCS_GRP_RU}, 3411cb0ef41Sopenharmony_ci {0x05B0, 0x05F2, ULMBCS_GRP_HE}, 3421cb0ef41Sopenharmony_ci {0x060C, 0x06AF, ULMBCS_GRP_AR}, 3431cb0ef41Sopenharmony_ci 
{0x0E01, 0x0E5B, ULMBCS_GRP_TH}, 3441cb0ef41Sopenharmony_ci {0x200C, 0x200F, ULMBCS_AMBIGUOUS_SBCS}, 3451cb0ef41Sopenharmony_ci {0x2010, 0x2010, ULMBCS_AMBIGUOUS_MBCS}, 3461cb0ef41Sopenharmony_ci {0x2013, 0x2014, ULMBCS_AMBIGUOUS_SBCS}, 3471cb0ef41Sopenharmony_ci {0x2015, 0x2015, ULMBCS_AMBIGUOUS_MBCS}, 3481cb0ef41Sopenharmony_ci {0x2016, 0x2016, ULMBCS_AMBIGUOUS_MBCS}, 3491cb0ef41Sopenharmony_ci {0x2017, 0x2017, ULMBCS_AMBIGUOUS_SBCS}, 3501cb0ef41Sopenharmony_ci {0x2018, 0x2019, ULMBCS_AMBIGUOUS_ALL}, 3511cb0ef41Sopenharmony_ci {0x201A, 0x201B, ULMBCS_AMBIGUOUS_SBCS}, 3521cb0ef41Sopenharmony_ci {0x201C, 0x201D, ULMBCS_AMBIGUOUS_ALL}, 3531cb0ef41Sopenharmony_ci {0x201E, 0x201F, ULMBCS_AMBIGUOUS_SBCS}, 3541cb0ef41Sopenharmony_ci {0x2020, 0x2021, ULMBCS_AMBIGUOUS_ALL}, 3551cb0ef41Sopenharmony_ci {0x2022, 0x2024, ULMBCS_AMBIGUOUS_SBCS}, 3561cb0ef41Sopenharmony_ci {0x2025, 0x2025, ULMBCS_AMBIGUOUS_MBCS}, 3571cb0ef41Sopenharmony_ci {0x2026, 0x2026, ULMBCS_AMBIGUOUS_ALL}, 3581cb0ef41Sopenharmony_ci {0x2027, 0x2027, ULMBCS_GRP_TW}, 3591cb0ef41Sopenharmony_ci {0x2030, 0x2030, ULMBCS_AMBIGUOUS_ALL}, 3601cb0ef41Sopenharmony_ci {0x2031, 0x2031, ULMBCS_AMBIGUOUS_SBCS}, 3611cb0ef41Sopenharmony_ci {0x2032, 0x2033, ULMBCS_AMBIGUOUS_MBCS}, 3621cb0ef41Sopenharmony_ci {0x2035, 0x2035, ULMBCS_AMBIGUOUS_MBCS}, 3631cb0ef41Sopenharmony_ci {0x2039, 0x203A, ULMBCS_AMBIGUOUS_SBCS}, 3641cb0ef41Sopenharmony_ci {0x203B, 0x203B, ULMBCS_AMBIGUOUS_MBCS}, 3651cb0ef41Sopenharmony_ci {0x203C, 0x203C, ULMBCS_GRP_EXCEPT}, 3661cb0ef41Sopenharmony_ci {0x2074, 0x2074, ULMBCS_GRP_KO}, 3671cb0ef41Sopenharmony_ci {0x207F, 0x207F, ULMBCS_GRP_EXCEPT}, 3681cb0ef41Sopenharmony_ci {0x2081, 0x2084, ULMBCS_GRP_KO}, 3691cb0ef41Sopenharmony_ci {0x20A4, 0x20AC, ULMBCS_AMBIGUOUS_SBCS}, 3701cb0ef41Sopenharmony_ci {0x2103, 0x2109, ULMBCS_AMBIGUOUS_MBCS}, 3711cb0ef41Sopenharmony_ci {0x2111, 0x2120, ULMBCS_AMBIGUOUS_SBCS}, 3721cb0ef41Sopenharmony_ci /*zhujin: upgrade, for regressiont test, spr HKIA4YHTSU*/ 
3731cb0ef41Sopenharmony_ci {0x2121, 0x2121, ULMBCS_AMBIGUOUS_MBCS}, 3741cb0ef41Sopenharmony_ci {0x2122, 0x2126, ULMBCS_AMBIGUOUS_SBCS}, 3751cb0ef41Sopenharmony_ci {0x212B, 0x212B, ULMBCS_AMBIGUOUS_MBCS}, 3761cb0ef41Sopenharmony_ci {0x2135, 0x2135, ULMBCS_AMBIGUOUS_SBCS}, 3771cb0ef41Sopenharmony_ci {0x2153, 0x2154, ULMBCS_GRP_KO}, 3781cb0ef41Sopenharmony_ci {0x215B, 0x215E, ULMBCS_GRP_EXCEPT}, 3791cb0ef41Sopenharmony_ci {0x2160, 0x2179, ULMBCS_AMBIGUOUS_MBCS}, 3801cb0ef41Sopenharmony_ci {0x2190, 0x2193, ULMBCS_AMBIGUOUS_ALL}, 3811cb0ef41Sopenharmony_ci {0x2194, 0x2195, ULMBCS_GRP_EXCEPT}, 3821cb0ef41Sopenharmony_ci {0x2196, 0x2199, ULMBCS_AMBIGUOUS_MBCS}, 3831cb0ef41Sopenharmony_ci {0x21A8, 0x21A8, ULMBCS_GRP_EXCEPT}, 3841cb0ef41Sopenharmony_ci {0x21B8, 0x21B9, ULMBCS_GRP_CN}, 3851cb0ef41Sopenharmony_ci {0x21D0, 0x21D1, ULMBCS_GRP_EXCEPT}, 3861cb0ef41Sopenharmony_ci {0x21D2, 0x21D2, ULMBCS_AMBIGUOUS_MBCS}, 3871cb0ef41Sopenharmony_ci {0x21D3, 0x21D3, ULMBCS_GRP_EXCEPT}, 3881cb0ef41Sopenharmony_ci {0x21D4, 0x21D4, ULMBCS_AMBIGUOUS_MBCS}, 3891cb0ef41Sopenharmony_ci {0x21D5, 0x21D5, ULMBCS_GRP_EXCEPT}, 3901cb0ef41Sopenharmony_ci {0x21E7, 0x21E7, ULMBCS_GRP_CN}, 3911cb0ef41Sopenharmony_ci {0x2200, 0x2200, ULMBCS_AMBIGUOUS_MBCS}, 3921cb0ef41Sopenharmony_ci {0x2201, 0x2201, ULMBCS_GRP_EXCEPT}, 3931cb0ef41Sopenharmony_ci {0x2202, 0x2202, ULMBCS_AMBIGUOUS_MBCS}, 3941cb0ef41Sopenharmony_ci {0x2203, 0x2203, ULMBCS_AMBIGUOUS_MBCS}, 3951cb0ef41Sopenharmony_ci {0x2204, 0x2206, ULMBCS_GRP_EXCEPT}, 3961cb0ef41Sopenharmony_ci {0x2207, 0x2208, ULMBCS_AMBIGUOUS_MBCS}, 3971cb0ef41Sopenharmony_ci {0x2209, 0x220A, ULMBCS_GRP_EXCEPT}, 3981cb0ef41Sopenharmony_ci {0x220B, 0x220B, ULMBCS_AMBIGUOUS_MBCS}, 3991cb0ef41Sopenharmony_ci {0x220F, 0x2215, ULMBCS_AMBIGUOUS_MBCS}, 4001cb0ef41Sopenharmony_ci {0x2219, 0x2219, ULMBCS_GRP_EXCEPT}, 4011cb0ef41Sopenharmony_ci {0x221A, 0x221A, ULMBCS_AMBIGUOUS_MBCS}, 4021cb0ef41Sopenharmony_ci {0x221B, 0x221C, ULMBCS_GRP_EXCEPT}, 4031cb0ef41Sopenharmony_ci 
{0x221D, 0x221E, ULMBCS_AMBIGUOUS_MBCS}, 4041cb0ef41Sopenharmony_ci {0x221F, 0x221F, ULMBCS_GRP_EXCEPT}, 4051cb0ef41Sopenharmony_ci {0x2220, 0x2220, ULMBCS_AMBIGUOUS_MBCS}, 4061cb0ef41Sopenharmony_ci {0x2223, 0x222A, ULMBCS_AMBIGUOUS_MBCS}, 4071cb0ef41Sopenharmony_ci {0x222B, 0x223D, ULMBCS_AMBIGUOUS_MBCS}, 4081cb0ef41Sopenharmony_ci {0x2245, 0x2248, ULMBCS_GRP_EXCEPT}, 4091cb0ef41Sopenharmony_ci {0x224C, 0x224C, ULMBCS_GRP_TW}, 4101cb0ef41Sopenharmony_ci {0x2252, 0x2252, ULMBCS_AMBIGUOUS_MBCS}, 4111cb0ef41Sopenharmony_ci {0x2260, 0x2261, ULMBCS_AMBIGUOUS_MBCS}, 4121cb0ef41Sopenharmony_ci {0x2262, 0x2265, ULMBCS_GRP_EXCEPT}, 4131cb0ef41Sopenharmony_ci {0x2266, 0x226F, ULMBCS_AMBIGUOUS_MBCS}, 4141cb0ef41Sopenharmony_ci {0x2282, 0x2283, ULMBCS_AMBIGUOUS_MBCS}, 4151cb0ef41Sopenharmony_ci {0x2284, 0x2285, ULMBCS_GRP_EXCEPT}, 4161cb0ef41Sopenharmony_ci {0x2286, 0x2287, ULMBCS_AMBIGUOUS_MBCS}, 4171cb0ef41Sopenharmony_ci {0x2288, 0x2297, ULMBCS_GRP_EXCEPT}, 4181cb0ef41Sopenharmony_ci {0x2299, 0x22BF, ULMBCS_AMBIGUOUS_MBCS}, 4191cb0ef41Sopenharmony_ci {0x22C0, 0x22C0, ULMBCS_GRP_EXCEPT}, 4201cb0ef41Sopenharmony_ci {0x2310, 0x2310, ULMBCS_GRP_EXCEPT}, 4211cb0ef41Sopenharmony_ci {0x2312, 0x2312, ULMBCS_AMBIGUOUS_MBCS}, 4221cb0ef41Sopenharmony_ci {0x2318, 0x2321, ULMBCS_GRP_EXCEPT}, 4231cb0ef41Sopenharmony_ci {0x2318, 0x2321, ULMBCS_GRP_CN}, 4241cb0ef41Sopenharmony_ci {0x2460, 0x24E9, ULMBCS_AMBIGUOUS_MBCS}, 4251cb0ef41Sopenharmony_ci {0x2500, 0x2500, ULMBCS_AMBIGUOUS_SBCS}, 4261cb0ef41Sopenharmony_ci {0x2501, 0x2501, ULMBCS_AMBIGUOUS_MBCS}, 4271cb0ef41Sopenharmony_ci {0x2502, 0x2502, ULMBCS_AMBIGUOUS_ALL}, 4281cb0ef41Sopenharmony_ci {0x2503, 0x2503, ULMBCS_AMBIGUOUS_MBCS}, 4291cb0ef41Sopenharmony_ci {0x2504, 0x2505, ULMBCS_GRP_TW}, 4301cb0ef41Sopenharmony_ci {0x2506, 0x2665, ULMBCS_AMBIGUOUS_ALL}, 4311cb0ef41Sopenharmony_ci {0x2666, 0x2666, ULMBCS_GRP_EXCEPT}, 4321cb0ef41Sopenharmony_ci {0x2667, 0x2669, ULMBCS_AMBIGUOUS_SBCS}, 4331cb0ef41Sopenharmony_ci {0x266A, 0x266A, 
ULMBCS_AMBIGUOUS_ALL}, 4341cb0ef41Sopenharmony_ci {0x266B, 0x266C, ULMBCS_AMBIGUOUS_SBCS}, 4351cb0ef41Sopenharmony_ci {0x266D, 0x266D, ULMBCS_AMBIGUOUS_MBCS}, 4361cb0ef41Sopenharmony_ci {0x266E, 0x266E, ULMBCS_AMBIGUOUS_SBCS}, 4371cb0ef41Sopenharmony_ci {0x266F, 0x266F, ULMBCS_GRP_JA}, 4381cb0ef41Sopenharmony_ci {0x2670, 0x2E7F, ULMBCS_AMBIGUOUS_SBCS}, 4391cb0ef41Sopenharmony_ci {0x2E80, 0xF861, ULMBCS_AMBIGUOUS_MBCS}, 4401cb0ef41Sopenharmony_ci {0xF862, 0xF8FF, ULMBCS_GRP_EXCEPT}, 4411cb0ef41Sopenharmony_ci {0xF900, 0xFA2D, ULMBCS_AMBIGUOUS_MBCS}, 4421cb0ef41Sopenharmony_ci {0xFB00, 0xFEFF, ULMBCS_AMBIGUOUS_SBCS}, 4431cb0ef41Sopenharmony_ci {0xFF01, 0xFFEE, ULMBCS_AMBIGUOUS_MBCS}, 4441cb0ef41Sopenharmony_ci {0xFFFF, 0xFFFF, ULMBCS_GRP_UNICODE} 4451cb0ef41Sopenharmony_ci}; 4461cb0ef41Sopenharmony_ci 4471cb0ef41Sopenharmony_cistatic ulmbcs_byte_t 4481cb0ef41Sopenharmony_ciFindLMBCSUniRange(char16_t uniChar) 4491cb0ef41Sopenharmony_ci{ 4501cb0ef41Sopenharmony_ci const struct _UniLMBCSGrpMap * pTable = UniLMBCSGrpMap; 4511cb0ef41Sopenharmony_ci 4521cb0ef41Sopenharmony_ci while (uniChar > pTable->uniEndRange) 4531cb0ef41Sopenharmony_ci { 4541cb0ef41Sopenharmony_ci pTable++; 4551cb0ef41Sopenharmony_ci } 4561cb0ef41Sopenharmony_ci 4571cb0ef41Sopenharmony_ci if (uniChar >= pTable->uniStartRange) 4581cb0ef41Sopenharmony_ci { 4591cb0ef41Sopenharmony_ci return pTable->GrpType; 4601cb0ef41Sopenharmony_ci } 4611cb0ef41Sopenharmony_ci return ULMBCS_GRP_UNICODE; 4621cb0ef41Sopenharmony_ci} 4631cb0ef41Sopenharmony_ci 4641cb0ef41Sopenharmony_ci/* 4651cb0ef41Sopenharmony_ciWe also ask the creator of a converter to send in a preferred locale 4661cb0ef41Sopenharmony_cithat we can use in resolving ambiguous mappings. They send the locale 4671cb0ef41Sopenharmony_ciin as a string, and we map it, if possible, to one of the 4681cb0ef41Sopenharmony_ciLMBCS groups. 
We use this table, and the associated code, to 4691cb0ef41Sopenharmony_cido the lookup: */ 4701cb0ef41Sopenharmony_ci 4711cb0ef41Sopenharmony_ci/************************************************** 4721cb0ef41Sopenharmony_ci This table maps locale ID's to LMBCS opt groups. 4731cb0ef41Sopenharmony_ci The default return is group 0x01. Note that for 4741cb0ef41Sopenharmony_ci performance reasons, the table is sorted in 4751cb0ef41Sopenharmony_ci increasing alphabetic order, with the notable 4761cb0ef41Sopenharmony_ci exception of zhTW. This is to force the check 4771cb0ef41Sopenharmony_ci for Traditonal Chinese before dropping back to 4781cb0ef41Sopenharmony_ci Simplified. 4791cb0ef41Sopenharmony_ci 4801cb0ef41Sopenharmony_ci Note too that the Latin-1 groups have been 4811cb0ef41Sopenharmony_ci commented out because it's the default, and 4821cb0ef41Sopenharmony_ci this shortens the table, allowing a serial 4831cb0ef41Sopenharmony_ci search to go quickly. 4841cb0ef41Sopenharmony_ci *************************************************/ 4851cb0ef41Sopenharmony_ci 4861cb0ef41Sopenharmony_cistatic const struct _LocaleLMBCSGrpMap 4871cb0ef41Sopenharmony_ci{ 4881cb0ef41Sopenharmony_ci const char *LocaleID; 4891cb0ef41Sopenharmony_ci const ulmbcs_byte_t OptGroup; 4901cb0ef41Sopenharmony_ci} LocaleLMBCSGrpMap[] = 4911cb0ef41Sopenharmony_ci{ 4921cb0ef41Sopenharmony_ci {"ar", ULMBCS_GRP_AR}, 4931cb0ef41Sopenharmony_ci {"be", ULMBCS_GRP_RU}, 4941cb0ef41Sopenharmony_ci {"bg", ULMBCS_GRP_L2}, 4951cb0ef41Sopenharmony_ci /* {"ca", ULMBCS_GRP_L1}, */ 4961cb0ef41Sopenharmony_ci {"cs", ULMBCS_GRP_L2}, 4971cb0ef41Sopenharmony_ci /* {"da", ULMBCS_GRP_L1}, */ 4981cb0ef41Sopenharmony_ci /* {"de", ULMBCS_GRP_L1}, */ 4991cb0ef41Sopenharmony_ci {"el", ULMBCS_GRP_GR}, 5001cb0ef41Sopenharmony_ci /* {"en", ULMBCS_GRP_L1}, */ 5011cb0ef41Sopenharmony_ci /* {"es", ULMBCS_GRP_L1}, */ 5021cb0ef41Sopenharmony_ci /* {"et", ULMBCS_GRP_L1}, */ 5031cb0ef41Sopenharmony_ci /* {"fi", ULMBCS_GRP_L1}, */ 
5041cb0ef41Sopenharmony_ci /* {"fr", ULMBCS_GRP_L1}, */ 5051cb0ef41Sopenharmony_ci {"he", ULMBCS_GRP_HE}, 5061cb0ef41Sopenharmony_ci {"hu", ULMBCS_GRP_L2}, 5071cb0ef41Sopenharmony_ci /* {"is", ULMBCS_GRP_L1}, */ 5081cb0ef41Sopenharmony_ci /* {"it", ULMBCS_GRP_L1}, */ 5091cb0ef41Sopenharmony_ci {"iw", ULMBCS_GRP_HE}, 5101cb0ef41Sopenharmony_ci {"ja", ULMBCS_GRP_JA}, 5111cb0ef41Sopenharmony_ci {"ko", ULMBCS_GRP_KO}, 5121cb0ef41Sopenharmony_ci /* {"lt", ULMBCS_GRP_L1}, */ 5131cb0ef41Sopenharmony_ci /* {"lv", ULMBCS_GRP_L1}, */ 5141cb0ef41Sopenharmony_ci {"mk", ULMBCS_GRP_RU}, 5151cb0ef41Sopenharmony_ci /* {"nl", ULMBCS_GRP_L1}, */ 5161cb0ef41Sopenharmony_ci /* {"no", ULMBCS_GRP_L1}, */ 5171cb0ef41Sopenharmony_ci {"pl", ULMBCS_GRP_L2}, 5181cb0ef41Sopenharmony_ci /* {"pt", ULMBCS_GRP_L1}, */ 5191cb0ef41Sopenharmony_ci {"ro", ULMBCS_GRP_L2}, 5201cb0ef41Sopenharmony_ci {"ru", ULMBCS_GRP_RU}, 5211cb0ef41Sopenharmony_ci {"sh", ULMBCS_GRP_L2}, 5221cb0ef41Sopenharmony_ci {"sk", ULMBCS_GRP_L2}, 5231cb0ef41Sopenharmony_ci {"sl", ULMBCS_GRP_L2}, 5241cb0ef41Sopenharmony_ci {"sq", ULMBCS_GRP_L2}, 5251cb0ef41Sopenharmony_ci {"sr", ULMBCS_GRP_RU}, 5261cb0ef41Sopenharmony_ci /* {"sv", ULMBCS_GRP_L1}, */ 5271cb0ef41Sopenharmony_ci {"th", ULMBCS_GRP_TH}, 5281cb0ef41Sopenharmony_ci {"tr", ULMBCS_GRP_TR}, 5291cb0ef41Sopenharmony_ci {"uk", ULMBCS_GRP_RU}, 5301cb0ef41Sopenharmony_ci /* {"vi", ULMBCS_GRP_L1}, */ 5311cb0ef41Sopenharmony_ci {"zhTW", ULMBCS_GRP_TW}, 5321cb0ef41Sopenharmony_ci {"zh", ULMBCS_GRP_CN}, 5331cb0ef41Sopenharmony_ci {nullptr, ULMBCS_GRP_L1} 5341cb0ef41Sopenharmony_ci}; 5351cb0ef41Sopenharmony_ci 5361cb0ef41Sopenharmony_ci 5371cb0ef41Sopenharmony_cistatic ulmbcs_byte_t 5381cb0ef41Sopenharmony_ciFindLMBCSLocale(const char *LocaleID) 5391cb0ef41Sopenharmony_ci{ 5401cb0ef41Sopenharmony_ci const struct _LocaleLMBCSGrpMap *pTable = LocaleLMBCSGrpMap; 5411cb0ef41Sopenharmony_ci 5421cb0ef41Sopenharmony_ci if ((!LocaleID) || (!*LocaleID)) 5431cb0ef41Sopenharmony_ci { 
5441cb0ef41Sopenharmony_ci return 0; 5451cb0ef41Sopenharmony_ci } 5461cb0ef41Sopenharmony_ci 5471cb0ef41Sopenharmony_ci while (pTable->LocaleID) 5481cb0ef41Sopenharmony_ci { 5491cb0ef41Sopenharmony_ci if (*pTable->LocaleID == *LocaleID) /* Check only first char for speed */ 5501cb0ef41Sopenharmony_ci { 5511cb0ef41Sopenharmony_ci /* First char matches - check whole name, for entry-length */ 5521cb0ef41Sopenharmony_ci if (uprv_strncmp(pTable->LocaleID, LocaleID, strlen(pTable->LocaleID)) == 0) 5531cb0ef41Sopenharmony_ci return pTable->OptGroup; 5541cb0ef41Sopenharmony_ci } 5551cb0ef41Sopenharmony_ci else 5561cb0ef41Sopenharmony_ci if (*pTable->LocaleID > *LocaleID) /* Sorted alphabetically - exit */ 5571cb0ef41Sopenharmony_ci break; 5581cb0ef41Sopenharmony_ci pTable++; 5591cb0ef41Sopenharmony_ci } 5601cb0ef41Sopenharmony_ci return ULMBCS_GRP_L1; 5611cb0ef41Sopenharmony_ci} 5621cb0ef41Sopenharmony_ci 5631cb0ef41Sopenharmony_ci 5641cb0ef41Sopenharmony_ci/* 5651cb0ef41Sopenharmony_ci Before we get to the main body of code, here's how we hook up to the rest 5661cb0ef41Sopenharmony_ci of ICU. ICU converters are required to define a structure that includes 5671cb0ef41Sopenharmony_ci some function pointers, and some common data, in the style of a C++ 5681cb0ef41Sopenharmony_ci vtable. There is also room in there for converter-specific data. LMBCS 5691cb0ef41Sopenharmony_ci uses that converter-specific data to keep track of the 12 subconverters 5701cb0ef41Sopenharmony_ci we use, the optimization group, and the group (if any) that matches the 5711cb0ef41Sopenharmony_ci locale. We have one structure instantiated for each of the 12 possible 5721cb0ef41Sopenharmony_ci optimization groups. To avoid typos & to avoid boring the reader, we 5731cb0ef41Sopenharmony_ci put the declarations of these structures and functions into macros. 
To see 5741cb0ef41Sopenharmony_ci the definitions of these structures, see unicode\ucnv_bld.h 5751cb0ef41Sopenharmony_ci*/ 5761cb0ef41Sopenharmony_ci 5771cb0ef41Sopenharmony_citypedef struct 5781cb0ef41Sopenharmony_ci { 5791cb0ef41Sopenharmony_ci UConverterSharedData *OptGrpConverter[ULMBCS_GRP_LAST+1]; /* Converter per Opt. grp. */ 5801cb0ef41Sopenharmony_ci uint8_t OptGroup; /* default Opt. grp. for this LMBCS session */ 5811cb0ef41Sopenharmony_ci uint8_t localeConverterIndex; /* reasonable locale match for index */ 5821cb0ef41Sopenharmony_ci } 5831cb0ef41Sopenharmony_ciUConverterDataLMBCS; 5841cb0ef41Sopenharmony_ci 5851cb0ef41Sopenharmony_ciU_CDECL_BEGIN 5861cb0ef41Sopenharmony_cistatic void U_CALLCONV _LMBCSClose(UConverter * _this); 5871cb0ef41Sopenharmony_ciU_CDECL_END 5881cb0ef41Sopenharmony_ci 5891cb0ef41Sopenharmony_ci#define DECLARE_LMBCS_DATA(n) \ 5901cb0ef41Sopenharmony_cistatic const UConverterImpl _LMBCSImpl##n={\ 5911cb0ef41Sopenharmony_ci UCNV_LMBCS_##n,\ 5921cb0ef41Sopenharmony_ci nullptr,nullptr,\ 5931cb0ef41Sopenharmony_ci _LMBCSOpen##n,\ 5941cb0ef41Sopenharmony_ci _LMBCSClose,\ 5951cb0ef41Sopenharmony_ci nullptr,\ 5961cb0ef41Sopenharmony_ci _LMBCSToUnicodeWithOffsets,\ 5971cb0ef41Sopenharmony_ci _LMBCSToUnicodeWithOffsets,\ 5981cb0ef41Sopenharmony_ci _LMBCSFromUnicode,\ 5991cb0ef41Sopenharmony_ci _LMBCSFromUnicode,\ 6001cb0ef41Sopenharmony_ci nullptr,\ 6011cb0ef41Sopenharmony_ci nullptr,\ 6021cb0ef41Sopenharmony_ci nullptr,\ 6031cb0ef41Sopenharmony_ci nullptr,\ 6041cb0ef41Sopenharmony_ci _LMBCSSafeClone,\ 6051cb0ef41Sopenharmony_ci ucnv_getCompleteUnicodeSet,\ 6061cb0ef41Sopenharmony_ci nullptr,\ 6071cb0ef41Sopenharmony_ci nullptr\ 6081cb0ef41Sopenharmony_ci};\ 6091cb0ef41Sopenharmony_cistatic const UConverterStaticData _LMBCSStaticData##n={\ 6101cb0ef41Sopenharmony_ci sizeof(UConverterStaticData),\ 6111cb0ef41Sopenharmony_ci "LMBCS-" #n,\ 6121cb0ef41Sopenharmony_ci 0, UCNV_IBM, UCNV_LMBCS_##n, 1, 3,\ 6131cb0ef41Sopenharmony_ci { 0x3f, 0, 0, 0 
},1,false,false,0,0,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} \ 6141cb0ef41Sopenharmony_ci};\ 6151cb0ef41Sopenharmony_ciconst UConverterSharedData _LMBCSData##n= \ 6161cb0ef41Sopenharmony_ci UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_LMBCSStaticData##n, &_LMBCSImpl##n); 6171cb0ef41Sopenharmony_ci 6181cb0ef41Sopenharmony_ci /* The only function we needed to duplicate 12 times was the 'open' 6191cb0ef41Sopenharmony_cifunction, which will do basically the same thing except set a different 6201cb0ef41Sopenharmony_cioptimization group. So, we put the common stuff into a worker function, 6211cb0ef41Sopenharmony_ciand set up another macro to stamp out the 12 open functions:*/ 6221cb0ef41Sopenharmony_ci#define DEFINE_LMBCS_OPEN(n) \ 6231cb0ef41Sopenharmony_cistatic void U_CALLCONV \ 6241cb0ef41Sopenharmony_ci _LMBCSOpen##n(UConverter* _this, UConverterLoadArgs* pArgs, UErrorCode* err) \ 6251cb0ef41Sopenharmony_ci{ _LMBCSOpenWorker(_this, pArgs, err, n); } 6261cb0ef41Sopenharmony_ci 6271cb0ef41Sopenharmony_ci 6281cb0ef41Sopenharmony_ci 6291cb0ef41Sopenharmony_ci/* Here's the open worker & the common close function */ 6301cb0ef41Sopenharmony_cistatic void 6311cb0ef41Sopenharmony_ci_LMBCSOpenWorker(UConverter* _this, 6321cb0ef41Sopenharmony_ci UConverterLoadArgs *pArgs, 6331cb0ef41Sopenharmony_ci UErrorCode* err, 6341cb0ef41Sopenharmony_ci ulmbcs_byte_t OptGroup) 6351cb0ef41Sopenharmony_ci{ 6361cb0ef41Sopenharmony_ci UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS*)uprv_malloc (sizeof (UConverterDataLMBCS)); 6371cb0ef41Sopenharmony_ci _this->extraInfo = extraInfo; 6381cb0ef41Sopenharmony_ci if(extraInfo != nullptr) 6391cb0ef41Sopenharmony_ci { 6401cb0ef41Sopenharmony_ci UConverterNamePieces stackPieces; 6411cb0ef41Sopenharmony_ci UConverterLoadArgs stackArgs= UCNV_LOAD_ARGS_INITIALIZER; 6421cb0ef41Sopenharmony_ci ulmbcs_byte_t i; 6431cb0ef41Sopenharmony_ci 6441cb0ef41Sopenharmony_ci uprv_memset(extraInfo, 0, sizeof(UConverterDataLMBCS)); 6451cb0ef41Sopenharmony_ci 
6461cb0ef41Sopenharmony_ci stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable; 6471cb0ef41Sopenharmony_ci 6481cb0ef41Sopenharmony_ci for (i=0; i <= ULMBCS_GRP_LAST && U_SUCCESS(*err); i++) 6491cb0ef41Sopenharmony_ci { 6501cb0ef41Sopenharmony_ci if(OptGroupByteToCPName[i] != nullptr) { 6511cb0ef41Sopenharmony_ci extraInfo->OptGrpConverter[i] = ucnv_loadSharedData(OptGroupByteToCPName[i], &stackPieces, &stackArgs, err); 6521cb0ef41Sopenharmony_ci } 6531cb0ef41Sopenharmony_ci } 6541cb0ef41Sopenharmony_ci 6551cb0ef41Sopenharmony_ci if(U_FAILURE(*err) || pArgs->onlyTestIsLoadable) { 6561cb0ef41Sopenharmony_ci _LMBCSClose(_this); 6571cb0ef41Sopenharmony_ci return; 6581cb0ef41Sopenharmony_ci } 6591cb0ef41Sopenharmony_ci extraInfo->OptGroup = OptGroup; 6601cb0ef41Sopenharmony_ci extraInfo->localeConverterIndex = FindLMBCSLocale(pArgs->locale); 6611cb0ef41Sopenharmony_ci } 6621cb0ef41Sopenharmony_ci else 6631cb0ef41Sopenharmony_ci { 6641cb0ef41Sopenharmony_ci *err = U_MEMORY_ALLOCATION_ERROR; 6651cb0ef41Sopenharmony_ci } 6661cb0ef41Sopenharmony_ci} 6671cb0ef41Sopenharmony_ci 6681cb0ef41Sopenharmony_ciU_CDECL_BEGIN 6691cb0ef41Sopenharmony_cistatic void U_CALLCONV 6701cb0ef41Sopenharmony_ci_LMBCSClose(UConverter * _this) 6711cb0ef41Sopenharmony_ci{ 6721cb0ef41Sopenharmony_ci if (_this->extraInfo != nullptr) 6731cb0ef41Sopenharmony_ci { 6741cb0ef41Sopenharmony_ci ulmbcs_byte_t Ix; 6751cb0ef41Sopenharmony_ci UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS *) _this->extraInfo; 6761cb0ef41Sopenharmony_ci 6771cb0ef41Sopenharmony_ci for (Ix=0; Ix <= ULMBCS_GRP_LAST; Ix++) 6781cb0ef41Sopenharmony_ci { 6791cb0ef41Sopenharmony_ci if (extraInfo->OptGrpConverter[Ix] != nullptr) 6801cb0ef41Sopenharmony_ci ucnv_unloadSharedDataIfReady(extraInfo->OptGrpConverter[Ix]); 6811cb0ef41Sopenharmony_ci } 6821cb0ef41Sopenharmony_ci if (!_this->isExtraLocal) { 6831cb0ef41Sopenharmony_ci uprv_free (_this->extraInfo); 6841cb0ef41Sopenharmony_ci _this->extraInfo = nullptr; 
6851cb0ef41Sopenharmony_ci } 6861cb0ef41Sopenharmony_ci } 6871cb0ef41Sopenharmony_ci} 6881cb0ef41Sopenharmony_ci 6891cb0ef41Sopenharmony_citypedef struct LMBCSClone { 6901cb0ef41Sopenharmony_ci UConverter cnv; 6911cb0ef41Sopenharmony_ci UConverterDataLMBCS lmbcs; 6921cb0ef41Sopenharmony_ci} LMBCSClone; 6931cb0ef41Sopenharmony_ci 6941cb0ef41Sopenharmony_cistatic UConverter * U_CALLCONV 6951cb0ef41Sopenharmony_ci_LMBCSSafeClone(const UConverter *cnv, 6961cb0ef41Sopenharmony_ci void *stackBuffer, 6971cb0ef41Sopenharmony_ci int32_t *pBufferSize, 6981cb0ef41Sopenharmony_ci UErrorCode *status) { 6991cb0ef41Sopenharmony_ci (void)status; 7001cb0ef41Sopenharmony_ci LMBCSClone *newLMBCS; 7011cb0ef41Sopenharmony_ci UConverterDataLMBCS *extraInfo; 7021cb0ef41Sopenharmony_ci int32_t i; 7031cb0ef41Sopenharmony_ci 7041cb0ef41Sopenharmony_ci if(*pBufferSize<=0) { 7051cb0ef41Sopenharmony_ci *pBufferSize=(int32_t)sizeof(LMBCSClone); 7061cb0ef41Sopenharmony_ci return nullptr; 7071cb0ef41Sopenharmony_ci } 7081cb0ef41Sopenharmony_ci 7091cb0ef41Sopenharmony_ci extraInfo=(UConverterDataLMBCS *)cnv->extraInfo; 7101cb0ef41Sopenharmony_ci newLMBCS=(LMBCSClone *)stackBuffer; 7111cb0ef41Sopenharmony_ci 7121cb0ef41Sopenharmony_ci /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ 7131cb0ef41Sopenharmony_ci 7141cb0ef41Sopenharmony_ci uprv_memcpy(&newLMBCS->lmbcs, extraInfo, sizeof(UConverterDataLMBCS)); 7151cb0ef41Sopenharmony_ci 7161cb0ef41Sopenharmony_ci /* share the subconverters */ 7171cb0ef41Sopenharmony_ci for(i = 0; i <= ULMBCS_GRP_LAST; ++i) { 7181cb0ef41Sopenharmony_ci if(extraInfo->OptGrpConverter[i] != nullptr) { 7191cb0ef41Sopenharmony_ci ucnv_incrementRefCount(extraInfo->OptGrpConverter[i]); 7201cb0ef41Sopenharmony_ci } 7211cb0ef41Sopenharmony_ci } 7221cb0ef41Sopenharmony_ci 7231cb0ef41Sopenharmony_ci newLMBCS->cnv.extraInfo = &newLMBCS->lmbcs; 7241cb0ef41Sopenharmony_ci newLMBCS->cnv.isExtraLocal = true; 7251cb0ef41Sopenharmony_ci return &newLMBCS->cnv; 
7261cb0ef41Sopenharmony_ci} 7271cb0ef41Sopenharmony_ci 7281cb0ef41Sopenharmony_ci/* 7291cb0ef41Sopenharmony_ci * There used to be a _LMBCSGetUnicodeSet() function here (up to svn revision 20117) 7301cb0ef41Sopenharmony_ci * which added all code points except for U+F6xx 7311cb0ef41Sopenharmony_ci * because those cannot be represented in the Unicode group. 7321cb0ef41Sopenharmony_ci * However, it turns out that windows-950 has roundtrips for all of U+F6xx 7331cb0ef41Sopenharmony_ci * which means that LMBCS can convert all Unicode code points after all. 7341cb0ef41Sopenharmony_ci * We now simply use ucnv_getCompleteUnicodeSet(). 7351cb0ef41Sopenharmony_ci * 7361cb0ef41Sopenharmony_ci * This may need to be looked at again as Lotus uses _LMBCSGetUnicodeSet(). (091216) 7371cb0ef41Sopenharmony_ci */ 7381cb0ef41Sopenharmony_ci 7391cb0ef41Sopenharmony_ci/* 7401cb0ef41Sopenharmony_ci Here's the basic helper function that we use when converting from 7411cb0ef41Sopenharmony_ci Unicode to LMBCS, and we suspect that a Unicode character will fit into 7421cb0ef41Sopenharmony_ci one of the 12 groups. The return value is the number of bytes written 7431cb0ef41Sopenharmony_ci starting at pStartLMBCS (if any). 
7441cb0ef41Sopenharmony_ci*/ 7451cb0ef41Sopenharmony_ci 7461cb0ef41Sopenharmony_cistatic size_t 7471cb0ef41Sopenharmony_ciLMBCSConversionWorker ( 7481cb0ef41Sopenharmony_ci UConverterDataLMBCS * extraInfo, /* subconverters, opt & locale groups */ 7491cb0ef41Sopenharmony_ci ulmbcs_byte_t group, /* The group to try */ 7501cb0ef41Sopenharmony_ci ulmbcs_byte_t * pStartLMBCS, /* where to put the results */ 7511cb0ef41Sopenharmony_ci char16_t * pUniChar, /* The input unicode character */ 7521cb0ef41Sopenharmony_ci ulmbcs_byte_t * lastConverterIndex, /* output: track last successful group used */ 7531cb0ef41Sopenharmony_ci UBool * groups_tried /* output: track any unsuccessful groups */ 7541cb0ef41Sopenharmony_ci) 7551cb0ef41Sopenharmony_ci{ 7561cb0ef41Sopenharmony_ci ulmbcs_byte_t * pLMBCS = pStartLMBCS; 7571cb0ef41Sopenharmony_ci UConverterSharedData * xcnv = extraInfo->OptGrpConverter[group]; 7581cb0ef41Sopenharmony_ci 7591cb0ef41Sopenharmony_ci int bytesConverted; 7601cb0ef41Sopenharmony_ci uint32_t value; 7611cb0ef41Sopenharmony_ci ulmbcs_byte_t firstByte; 7621cb0ef41Sopenharmony_ci 7631cb0ef41Sopenharmony_ci U_ASSERT(xcnv); 7641cb0ef41Sopenharmony_ci U_ASSERT(group<ULMBCS_GRP_UNICODE); 7651cb0ef41Sopenharmony_ci 7661cb0ef41Sopenharmony_ci bytesConverted = ucnv_MBCSFromUChar32(xcnv, *pUniChar, &value, false); 7671cb0ef41Sopenharmony_ci 7681cb0ef41Sopenharmony_ci /* get the first result byte */ 7691cb0ef41Sopenharmony_ci if(bytesConverted > 0) { 7701cb0ef41Sopenharmony_ci firstByte = (ulmbcs_byte_t)(value >> ((bytesConverted - 1) * 8)); 7711cb0ef41Sopenharmony_ci } else { 7721cb0ef41Sopenharmony_ci /* most common failure mode is an unassigned character */ 7731cb0ef41Sopenharmony_ci groups_tried[group] = true; 7741cb0ef41Sopenharmony_ci return 0; 7751cb0ef41Sopenharmony_ci } 7761cb0ef41Sopenharmony_ci 7771cb0ef41Sopenharmony_ci *lastConverterIndex = group; 7781cb0ef41Sopenharmony_ci 7791cb0ef41Sopenharmony_ci /* All initial byte values in lower ascii range should have 
been caught by now, 7801cb0ef41Sopenharmony_ci except with the exception group. 7811cb0ef41Sopenharmony_ci */ 7821cb0ef41Sopenharmony_ci U_ASSERT((firstByte <= ULMBCS_C0END) || (firstByte >= ULMBCS_C1START) || (group == ULMBCS_GRP_EXCEPT)); 7831cb0ef41Sopenharmony_ci 7841cb0ef41Sopenharmony_ci /* use converted data: first write 0, 1 or two group bytes */ 7851cb0ef41Sopenharmony_ci if (group != ULMBCS_GRP_EXCEPT && extraInfo->OptGroup != group) 7861cb0ef41Sopenharmony_ci { 7871cb0ef41Sopenharmony_ci *pLMBCS++ = group; 7881cb0ef41Sopenharmony_ci if (bytesConverted == 1 && group >= ULMBCS_DOUBLEOPTGROUP_START) 7891cb0ef41Sopenharmony_ci { 7901cb0ef41Sopenharmony_ci *pLMBCS++ = group; 7911cb0ef41Sopenharmony_ci } 7921cb0ef41Sopenharmony_ci } 7931cb0ef41Sopenharmony_ci 7941cb0ef41Sopenharmony_ci /* don't emit control chars */ 7951cb0ef41Sopenharmony_ci if ( bytesConverted == 1 && firstByte < 0x20 ) 7961cb0ef41Sopenharmony_ci return 0; 7971cb0ef41Sopenharmony_ci 7981cb0ef41Sopenharmony_ci 7991cb0ef41Sopenharmony_ci /* then move over the converted data */ 8001cb0ef41Sopenharmony_ci switch(bytesConverted) 8011cb0ef41Sopenharmony_ci { 8021cb0ef41Sopenharmony_ci case 4: 8031cb0ef41Sopenharmony_ci *pLMBCS++ = (ulmbcs_byte_t)(value >> 24); 8041cb0ef41Sopenharmony_ci U_FALLTHROUGH; 8051cb0ef41Sopenharmony_ci case 3: 8061cb0ef41Sopenharmony_ci *pLMBCS++ = (ulmbcs_byte_t)(value >> 16); 8071cb0ef41Sopenharmony_ci U_FALLTHROUGH; 8081cb0ef41Sopenharmony_ci case 2: 8091cb0ef41Sopenharmony_ci *pLMBCS++ = (ulmbcs_byte_t)(value >> 8); 8101cb0ef41Sopenharmony_ci U_FALLTHROUGH; 8111cb0ef41Sopenharmony_ci case 1: 8121cb0ef41Sopenharmony_ci *pLMBCS++ = (ulmbcs_byte_t)value; 8131cb0ef41Sopenharmony_ci U_FALLTHROUGH; 8141cb0ef41Sopenharmony_ci default: 8151cb0ef41Sopenharmony_ci /* will never occur */ 8161cb0ef41Sopenharmony_ci break; 8171cb0ef41Sopenharmony_ci } 8181cb0ef41Sopenharmony_ci 8191cb0ef41Sopenharmony_ci return (pLMBCS - pStartLMBCS); 8201cb0ef41Sopenharmony_ci} 
8211cb0ef41Sopenharmony_ci 8221cb0ef41Sopenharmony_ci 8231cb0ef41Sopenharmony_ci/* This is a much simpler version of above, when we 8241cb0ef41Sopenharmony_ciknow we are writing LMBCS using the Unicode group 8251cb0ef41Sopenharmony_ci*/ 8261cb0ef41Sopenharmony_cistatic size_t 8271cb0ef41Sopenharmony_ciLMBCSConvertUni(ulmbcs_byte_t * pLMBCS, char16_t uniChar) 8281cb0ef41Sopenharmony_ci{ 8291cb0ef41Sopenharmony_ci /* encode into LMBCS Unicode range */ 8301cb0ef41Sopenharmony_ci uint8_t LowCh = (uint8_t)(uniChar & 0x00FF); 8311cb0ef41Sopenharmony_ci uint8_t HighCh = (uint8_t)(uniChar >> 8); 8321cb0ef41Sopenharmony_ci 8331cb0ef41Sopenharmony_ci *pLMBCS++ = ULMBCS_GRP_UNICODE; 8341cb0ef41Sopenharmony_ci 8351cb0ef41Sopenharmony_ci if (LowCh == 0) 8361cb0ef41Sopenharmony_ci { 8371cb0ef41Sopenharmony_ci *pLMBCS++ = ULMBCS_UNICOMPATZERO; 8381cb0ef41Sopenharmony_ci *pLMBCS++ = HighCh; 8391cb0ef41Sopenharmony_ci } 8401cb0ef41Sopenharmony_ci else 8411cb0ef41Sopenharmony_ci { 8421cb0ef41Sopenharmony_ci *pLMBCS++ = HighCh; 8431cb0ef41Sopenharmony_ci *pLMBCS++ = LowCh; 8441cb0ef41Sopenharmony_ci } 8451cb0ef41Sopenharmony_ci return ULMBCS_UNICODE_SIZE; 8461cb0ef41Sopenharmony_ci} 8471cb0ef41Sopenharmony_ci 8481cb0ef41Sopenharmony_ci 8491cb0ef41Sopenharmony_ci 8501cb0ef41Sopenharmony_ci/* The main Unicode to LMBCS conversion function */ 8511cb0ef41Sopenharmony_cistatic void U_CALLCONV 8521cb0ef41Sopenharmony_ci_LMBCSFromUnicode(UConverterFromUnicodeArgs* args, 8531cb0ef41Sopenharmony_ci UErrorCode* err) 8541cb0ef41Sopenharmony_ci{ 8551cb0ef41Sopenharmony_ci ulmbcs_byte_t lastConverterIndex = 0; 8561cb0ef41Sopenharmony_ci char16_t uniChar; 8571cb0ef41Sopenharmony_ci ulmbcs_byte_t LMBCS[ULMBCS_CHARSIZE_MAX]; 8581cb0ef41Sopenharmony_ci ulmbcs_byte_t * pLMBCS; 8591cb0ef41Sopenharmony_ci int32_t bytes_written; 8601cb0ef41Sopenharmony_ci UBool groups_tried[ULMBCS_GRP_LAST+1]; 8611cb0ef41Sopenharmony_ci UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo; 
8621cb0ef41Sopenharmony_ci int sourceIndex = 0; 8631cb0ef41Sopenharmony_ci 8641cb0ef41Sopenharmony_ci /* Basic strategy: attempt to fill in local LMBCS 1-char buffer.(LMBCS) 8651cb0ef41Sopenharmony_ci If that succeeds, see if it will all fit into the target & copy it over 8661cb0ef41Sopenharmony_ci if it does. 8671cb0ef41Sopenharmony_ci 8681cb0ef41Sopenharmony_ci We try conversions in the following order: 8691cb0ef41Sopenharmony_ci 8701cb0ef41Sopenharmony_ci 1. Single-byte ascii & special fixed control chars (&null) 8711cb0ef41Sopenharmony_ci 2. Look up group in table & try that (could be 8721cb0ef41Sopenharmony_ci A) Unicode group 8731cb0ef41Sopenharmony_ci B) control group, 8741cb0ef41Sopenharmony_ci C) national encoding, 8751cb0ef41Sopenharmony_ci or ambiguous SBCS or MBCS group (on to step 4...) 8761cb0ef41Sopenharmony_ci 8771cb0ef41Sopenharmony_ci 3. If its ambiguous, try this order: 8781cb0ef41Sopenharmony_ci A) The optimization group 8791cb0ef41Sopenharmony_ci B) The locale group 8801cb0ef41Sopenharmony_ci C) The last group that succeeded with this string. 8811cb0ef41Sopenharmony_ci D) every other group that's relevant (single or double) 8821cb0ef41Sopenharmony_ci E) If its single-byte ambiguous, try the exceptions group 8831cb0ef41Sopenharmony_ci 8841cb0ef41Sopenharmony_ci 4. 
And as a grand fallback: Unicode 8851cb0ef41Sopenharmony_ci */ 8861cb0ef41Sopenharmony_ci 8871cb0ef41Sopenharmony_ci /*Fix for SPR#DJOE66JFN3 (Lotus)*/ 8881cb0ef41Sopenharmony_ci ulmbcs_byte_t OldConverterIndex = 0; 8891cb0ef41Sopenharmony_ci 8901cb0ef41Sopenharmony_ci while (args->source < args->sourceLimit && !U_FAILURE(*err)) 8911cb0ef41Sopenharmony_ci { 8921cb0ef41Sopenharmony_ci /*Fix for SPR#DJOE66JFN3 (Lotus)*/ 8931cb0ef41Sopenharmony_ci OldConverterIndex = extraInfo->localeConverterIndex; 8941cb0ef41Sopenharmony_ci 8951cb0ef41Sopenharmony_ci if (args->target >= args->targetLimit) 8961cb0ef41Sopenharmony_ci { 8971cb0ef41Sopenharmony_ci *err = U_BUFFER_OVERFLOW_ERROR; 8981cb0ef41Sopenharmony_ci break; 8991cb0ef41Sopenharmony_ci } 9001cb0ef41Sopenharmony_ci uniChar = *(args->source); 9011cb0ef41Sopenharmony_ci bytes_written = 0; 9021cb0ef41Sopenharmony_ci pLMBCS = LMBCS; 9031cb0ef41Sopenharmony_ci 9041cb0ef41Sopenharmony_ci /* check cases in rough order of how common they are, for speed */ 9051cb0ef41Sopenharmony_ci 9061cb0ef41Sopenharmony_ci /* single byte matches: strategy 1 */ 9071cb0ef41Sopenharmony_ci /*Fix for SPR#DJOE66JFN3 (Lotus)*/ 9081cb0ef41Sopenharmony_ci if((uniChar>=0x80) && (uniChar<=0xff) 9091cb0ef41Sopenharmony_ci /*Fix for SPR#JUYA6XAERU and TSAO7GL5NK (Lotus)*/ &&(uniChar!=0xB1) &&(uniChar!=0xD7) &&(uniChar!=0xF7) 9101cb0ef41Sopenharmony_ci &&(uniChar!=0xB0) &&(uniChar!=0xB4) &&(uniChar!=0xB6) &&(uniChar!=0xA7) &&(uniChar!=0xA8)) 9111cb0ef41Sopenharmony_ci { 9121cb0ef41Sopenharmony_ci extraInfo->localeConverterIndex = ULMBCS_GRP_L1; 9131cb0ef41Sopenharmony_ci } 9141cb0ef41Sopenharmony_ci if (((uniChar > ULMBCS_C0END) && (uniChar < ULMBCS_C1START)) || 9151cb0ef41Sopenharmony_ci uniChar == 0 || uniChar == ULMBCS_HT || uniChar == ULMBCS_CR || 9161cb0ef41Sopenharmony_ci uniChar == ULMBCS_LF || uniChar == ULMBCS_123SYSTEMRANGE 9171cb0ef41Sopenharmony_ci ) 9181cb0ef41Sopenharmony_ci { 9191cb0ef41Sopenharmony_ci *pLMBCS++ = (ulmbcs_byte_t ) 
uniChar; 9201cb0ef41Sopenharmony_ci bytes_written = 1; 9211cb0ef41Sopenharmony_ci } 9221cb0ef41Sopenharmony_ci 9231cb0ef41Sopenharmony_ci 9241cb0ef41Sopenharmony_ci if (!bytes_written) 9251cb0ef41Sopenharmony_ci { 9261cb0ef41Sopenharmony_ci /* Check by UNICODE range (Strategy 2) */ 9271cb0ef41Sopenharmony_ci ulmbcs_byte_t group = FindLMBCSUniRange(uniChar); 9281cb0ef41Sopenharmony_ci 9291cb0ef41Sopenharmony_ci if (group == ULMBCS_GRP_UNICODE) /* (Strategy 2A) */ 9301cb0ef41Sopenharmony_ci { 9311cb0ef41Sopenharmony_ci pLMBCS += LMBCSConvertUni(pLMBCS,uniChar); 9321cb0ef41Sopenharmony_ci 9331cb0ef41Sopenharmony_ci bytes_written = (int32_t)(pLMBCS - LMBCS); 9341cb0ef41Sopenharmony_ci } 9351cb0ef41Sopenharmony_ci else if (group == ULMBCS_GRP_CTRL) /* (Strategy 2B) */ 9361cb0ef41Sopenharmony_ci { 9371cb0ef41Sopenharmony_ci /* Handle control characters here */ 9381cb0ef41Sopenharmony_ci if (uniChar <= ULMBCS_C0END) 9391cb0ef41Sopenharmony_ci { 9401cb0ef41Sopenharmony_ci *pLMBCS++ = ULMBCS_GRP_CTRL; 9411cb0ef41Sopenharmony_ci *pLMBCS++ = (ulmbcs_byte_t)(ULMBCS_CTRLOFFSET + uniChar); 9421cb0ef41Sopenharmony_ci } 9431cb0ef41Sopenharmony_ci else if (uniChar >= ULMBCS_C1START && uniChar <= ULMBCS_C1START + ULMBCS_CTRLOFFSET) 9441cb0ef41Sopenharmony_ci { 9451cb0ef41Sopenharmony_ci *pLMBCS++ = ULMBCS_GRP_CTRL; 9461cb0ef41Sopenharmony_ci *pLMBCS++ = (ulmbcs_byte_t ) (uniChar & 0x00FF); 9471cb0ef41Sopenharmony_ci } 9481cb0ef41Sopenharmony_ci bytes_written = (int32_t)(pLMBCS - LMBCS); 9491cb0ef41Sopenharmony_ci } 9501cb0ef41Sopenharmony_ci else if (group < ULMBCS_GRP_UNICODE) /* (Strategy 2C) */ 9511cb0ef41Sopenharmony_ci { 9521cb0ef41Sopenharmony_ci /* a specific converter has been identified - use it */ 9531cb0ef41Sopenharmony_ci bytes_written = (int32_t)LMBCSConversionWorker ( 9541cb0ef41Sopenharmony_ci extraInfo, group, pLMBCS, &uniChar, 9551cb0ef41Sopenharmony_ci &lastConverterIndex, groups_tried); 9561cb0ef41Sopenharmony_ci } 9571cb0ef41Sopenharmony_ci if (!bytes_written) /* 
the ambiguous group cases (Strategy 3) */ 9581cb0ef41Sopenharmony_ci { 9591cb0ef41Sopenharmony_ci uprv_memset(groups_tried, 0, sizeof(groups_tried)); 9601cb0ef41Sopenharmony_ci 9611cb0ef41Sopenharmony_ci /* check for non-default optimization group (Strategy 3A )*/ 9621cb0ef41Sopenharmony_ci if ((extraInfo->OptGroup != 1) && (ULMBCS_AMBIGUOUS_MATCH(group, extraInfo->OptGroup))) 9631cb0ef41Sopenharmony_ci { 9641cb0ef41Sopenharmony_ci /*zhujin: upgrade, merge #39299 here (Lotus) */ 9651cb0ef41Sopenharmony_ci /*To make R5 compatible translation, look for exceptional group first for non-DBCS*/ 9661cb0ef41Sopenharmony_ci 9671cb0ef41Sopenharmony_ci if(extraInfo->localeConverterIndex < ULMBCS_DOUBLEOPTGROUP_START) 9681cb0ef41Sopenharmony_ci { 9691cb0ef41Sopenharmony_ci bytes_written = (int32_t)LMBCSConversionWorker (extraInfo, 9701cb0ef41Sopenharmony_ci ULMBCS_GRP_L1, pLMBCS, &uniChar, 9711cb0ef41Sopenharmony_ci &lastConverterIndex, groups_tried); 9721cb0ef41Sopenharmony_ci 9731cb0ef41Sopenharmony_ci if(!bytes_written) 9741cb0ef41Sopenharmony_ci { 9751cb0ef41Sopenharmony_ci bytes_written = (int32_t)LMBCSConversionWorker (extraInfo, 9761cb0ef41Sopenharmony_ci ULMBCS_GRP_EXCEPT, pLMBCS, &uniChar, 9771cb0ef41Sopenharmony_ci &lastConverterIndex, groups_tried); 9781cb0ef41Sopenharmony_ci } 9791cb0ef41Sopenharmony_ci if(!bytes_written) 9801cb0ef41Sopenharmony_ci { 9811cb0ef41Sopenharmony_ci bytes_written = (int32_t)LMBCSConversionWorker (extraInfo, 9821cb0ef41Sopenharmony_ci extraInfo->localeConverterIndex, pLMBCS, &uniChar, 9831cb0ef41Sopenharmony_ci &lastConverterIndex, groups_tried); 9841cb0ef41Sopenharmony_ci } 9851cb0ef41Sopenharmony_ci } 9861cb0ef41Sopenharmony_ci else 9871cb0ef41Sopenharmony_ci { 9881cb0ef41Sopenharmony_ci bytes_written = (int32_t)LMBCSConversionWorker (extraInfo, 9891cb0ef41Sopenharmony_ci extraInfo->localeConverterIndex, pLMBCS, &uniChar, 9901cb0ef41Sopenharmony_ci &lastConverterIndex, groups_tried); 9911cb0ef41Sopenharmony_ci } 
9921cb0ef41Sopenharmony_ci } 9931cb0ef41Sopenharmony_ci /* check for locale optimization group (Strategy 3B) */ 9941cb0ef41Sopenharmony_ci if (!bytes_written && (extraInfo->localeConverterIndex) && (ULMBCS_AMBIGUOUS_MATCH(group, extraInfo->localeConverterIndex))) 9951cb0ef41Sopenharmony_ci { 9961cb0ef41Sopenharmony_ci bytes_written = (int32_t)LMBCSConversionWorker (extraInfo, 9971cb0ef41Sopenharmony_ci extraInfo->localeConverterIndex, pLMBCS, &uniChar, &lastConverterIndex, groups_tried); 9981cb0ef41Sopenharmony_ci } 9991cb0ef41Sopenharmony_ci /* check for last optimization group used for this string (Strategy 3C) */ 10001cb0ef41Sopenharmony_ci if (!bytes_written && (lastConverterIndex) && (ULMBCS_AMBIGUOUS_MATCH(group, lastConverterIndex))) 10011cb0ef41Sopenharmony_ci { 10021cb0ef41Sopenharmony_ci bytes_written = (int32_t)LMBCSConversionWorker (extraInfo, 10031cb0ef41Sopenharmony_ci lastConverterIndex, pLMBCS, &uniChar, &lastConverterIndex, groups_tried); 10041cb0ef41Sopenharmony_ci } 10051cb0ef41Sopenharmony_ci if (!bytes_written) 10061cb0ef41Sopenharmony_ci { 10071cb0ef41Sopenharmony_ci /* just check every possible matching converter (Strategy 3D) */ 10081cb0ef41Sopenharmony_ci ulmbcs_byte_t grp_start; 10091cb0ef41Sopenharmony_ci ulmbcs_byte_t grp_end; 10101cb0ef41Sopenharmony_ci ulmbcs_byte_t grp_ix; 10111cb0ef41Sopenharmony_ci grp_start = (ulmbcs_byte_t)((group == ULMBCS_AMBIGUOUS_MBCS) 10121cb0ef41Sopenharmony_ci ? ULMBCS_DOUBLEOPTGROUP_START 10131cb0ef41Sopenharmony_ci : ULMBCS_GRP_L1); 10141cb0ef41Sopenharmony_ci grp_end = (ulmbcs_byte_t)((group == ULMBCS_AMBIGUOUS_MBCS) 10151cb0ef41Sopenharmony_ci ? 
ULMBCS_GRP_LAST 10161cb0ef41Sopenharmony_ci : ULMBCS_GRP_TH); 10171cb0ef41Sopenharmony_ci if(group == ULMBCS_AMBIGUOUS_ALL) 10181cb0ef41Sopenharmony_ci { 10191cb0ef41Sopenharmony_ci grp_start = ULMBCS_GRP_L1; 10201cb0ef41Sopenharmony_ci grp_end = ULMBCS_GRP_LAST; 10211cb0ef41Sopenharmony_ci } 10221cb0ef41Sopenharmony_ci for (grp_ix = grp_start; 10231cb0ef41Sopenharmony_ci grp_ix <= grp_end && !bytes_written; 10241cb0ef41Sopenharmony_ci grp_ix++) 10251cb0ef41Sopenharmony_ci { 10261cb0ef41Sopenharmony_ci if (extraInfo->OptGrpConverter [grp_ix] && !groups_tried [grp_ix]) 10271cb0ef41Sopenharmony_ci { 10281cb0ef41Sopenharmony_ci bytes_written = (int32_t)LMBCSConversionWorker (extraInfo, 10291cb0ef41Sopenharmony_ci grp_ix, pLMBCS, &uniChar, 10301cb0ef41Sopenharmony_ci &lastConverterIndex, groups_tried); 10311cb0ef41Sopenharmony_ci } 10321cb0ef41Sopenharmony_ci } 10331cb0ef41Sopenharmony_ci /* a final conversion fallback to the exceptions group if its likely 10341cb0ef41Sopenharmony_ci to be single byte (Strategy 3E) */ 10351cb0ef41Sopenharmony_ci if (!bytes_written && grp_start == ULMBCS_GRP_L1) 10361cb0ef41Sopenharmony_ci { 10371cb0ef41Sopenharmony_ci bytes_written = (int32_t)LMBCSConversionWorker (extraInfo, 10381cb0ef41Sopenharmony_ci ULMBCS_GRP_EXCEPT, pLMBCS, &uniChar, 10391cb0ef41Sopenharmony_ci &lastConverterIndex, groups_tried); 10401cb0ef41Sopenharmony_ci } 10411cb0ef41Sopenharmony_ci } 10421cb0ef41Sopenharmony_ci /* all of our other strategies failed. Fallback to Unicode. (Strategy 4)*/ 10431cb0ef41Sopenharmony_ci if (!bytes_written) 10441cb0ef41Sopenharmony_ci { 10451cb0ef41Sopenharmony_ci 10461cb0ef41Sopenharmony_ci pLMBCS += LMBCSConvertUni(pLMBCS, uniChar); 10471cb0ef41Sopenharmony_ci bytes_written = (int32_t)(pLMBCS - LMBCS); 10481cb0ef41Sopenharmony_ci } 10491cb0ef41Sopenharmony_ci } 10501cb0ef41Sopenharmony_ci } 10511cb0ef41Sopenharmony_ci 10521cb0ef41Sopenharmony_ci /* we have a translation. 
increment source and write as much as possible to target */ 10531cb0ef41Sopenharmony_ci args->source++; 10541cb0ef41Sopenharmony_ci pLMBCS = LMBCS; 10551cb0ef41Sopenharmony_ci while (args->target < args->targetLimit && bytes_written--) 10561cb0ef41Sopenharmony_ci { 10571cb0ef41Sopenharmony_ci *(args->target)++ = *pLMBCS++; 10581cb0ef41Sopenharmony_ci if (args->offsets) 10591cb0ef41Sopenharmony_ci { 10601cb0ef41Sopenharmony_ci *(args->offsets)++ = sourceIndex; 10611cb0ef41Sopenharmony_ci } 10621cb0ef41Sopenharmony_ci } 10631cb0ef41Sopenharmony_ci sourceIndex++; 10641cb0ef41Sopenharmony_ci if (bytes_written > 0) 10651cb0ef41Sopenharmony_ci { 10661cb0ef41Sopenharmony_ci /* write any bytes that didn't fit in target to the error buffer, 10671cb0ef41Sopenharmony_ci common code will move this to target if we get called back with 10681cb0ef41Sopenharmony_ci enough target room 10691cb0ef41Sopenharmony_ci */ 10701cb0ef41Sopenharmony_ci uint8_t * pErrorBuffer = args->converter->charErrorBuffer; 10711cb0ef41Sopenharmony_ci *err = U_BUFFER_OVERFLOW_ERROR; 10721cb0ef41Sopenharmony_ci args->converter->charErrorBufferLength = (int8_t)bytes_written; 10731cb0ef41Sopenharmony_ci while (bytes_written--) 10741cb0ef41Sopenharmony_ci { 10751cb0ef41Sopenharmony_ci *pErrorBuffer++ = *pLMBCS++; 10761cb0ef41Sopenharmony_ci } 10771cb0ef41Sopenharmony_ci } 10781cb0ef41Sopenharmony_ci /*Fix for SPR#DJOE66JFN3 (Lotus)*/ 10791cb0ef41Sopenharmony_ci extraInfo->localeConverterIndex = OldConverterIndex; 10801cb0ef41Sopenharmony_ci } 10811cb0ef41Sopenharmony_ci} 10821cb0ef41Sopenharmony_ci 10831cb0ef41Sopenharmony_ci 10841cb0ef41Sopenharmony_ci/* Now, the Unicode from LMBCS section */ 10851cb0ef41Sopenharmony_ci 10861cb0ef41Sopenharmony_ci 10871cb0ef41Sopenharmony_ci/* A function to call when we are looking at the Unicode group byte in LMBCS */ 10881cb0ef41Sopenharmony_cistatic char16_t 10891cb0ef41Sopenharmony_ciGetUniFromLMBCSUni(char const ** ppLMBCSin) /* Called with LMBCS-style Unicode byte 
stream */ 10901cb0ef41Sopenharmony_ci{ 10911cb0ef41Sopenharmony_ci uint8_t HighCh = *(*ppLMBCSin)++; /* Big-endian Unicode in LMBCS compatibility group*/ 10921cb0ef41Sopenharmony_ci uint8_t LowCh = *(*ppLMBCSin)++; 10931cb0ef41Sopenharmony_ci 10941cb0ef41Sopenharmony_ci if (HighCh == ULMBCS_UNICOMPATZERO ) 10951cb0ef41Sopenharmony_ci { 10961cb0ef41Sopenharmony_ci HighCh = LowCh; 10971cb0ef41Sopenharmony_ci LowCh = 0; /* zero-byte in LSB special character */ 10981cb0ef41Sopenharmony_ci } 10991cb0ef41Sopenharmony_ci return (char16_t)((HighCh << 8) | LowCh); 11001cb0ef41Sopenharmony_ci} 11011cb0ef41Sopenharmony_ci 11021cb0ef41Sopenharmony_ci 11031cb0ef41Sopenharmony_ci 11041cb0ef41Sopenharmony_ci/* CHECK_SOURCE_LIMIT: Helper macro to verify that there are at least'index' 11051cb0ef41Sopenharmony_ci bytes left in source up to sourceLimit.Errors appropriately if not. 11061cb0ef41Sopenharmony_ci If we reach the limit, then update the source pointer to there to consume 11071cb0ef41Sopenharmony_ci all input as required by ICU converter semantics. 
11081cb0ef41Sopenharmony_ci*/ 11091cb0ef41Sopenharmony_ci 11101cb0ef41Sopenharmony_ci#define CHECK_SOURCE_LIMIT(index) UPRV_BLOCK_MACRO_BEGIN { \ 11111cb0ef41Sopenharmony_ci if (args->source+index > args->sourceLimit) { \ 11121cb0ef41Sopenharmony_ci *err = U_TRUNCATED_CHAR_FOUND; \ 11131cb0ef41Sopenharmony_ci args->source = args->sourceLimit; \ 11141cb0ef41Sopenharmony_ci return 0xffff; \ 11151cb0ef41Sopenharmony_ci } \ 11161cb0ef41Sopenharmony_ci} UPRV_BLOCK_MACRO_END 11171cb0ef41Sopenharmony_ci 11181cb0ef41Sopenharmony_ci/* Return the Unicode representation for the current LMBCS character */ 11191cb0ef41Sopenharmony_ci 11201cb0ef41Sopenharmony_cistatic UChar32 U_CALLCONV 11211cb0ef41Sopenharmony_ci_LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args, 11221cb0ef41Sopenharmony_ci UErrorCode* err) 11231cb0ef41Sopenharmony_ci{ 11241cb0ef41Sopenharmony_ci UChar32 uniChar = 0; /* an output UNICODE char */ 11251cb0ef41Sopenharmony_ci ulmbcs_byte_t CurByte; /* A byte from the input stream */ 11261cb0ef41Sopenharmony_ci 11271cb0ef41Sopenharmony_ci /* error check */ 11281cb0ef41Sopenharmony_ci if (args->source >= args->sourceLimit) 11291cb0ef41Sopenharmony_ci { 11301cb0ef41Sopenharmony_ci *err = U_ILLEGAL_ARGUMENT_ERROR; 11311cb0ef41Sopenharmony_ci return 0xffff; 11321cb0ef41Sopenharmony_ci } 11331cb0ef41Sopenharmony_ci /* Grab first byte & save address for error recovery */ 11341cb0ef41Sopenharmony_ci CurByte = *((ulmbcs_byte_t *) (args->source++)); 11351cb0ef41Sopenharmony_ci 11361cb0ef41Sopenharmony_ci /* 11371cb0ef41Sopenharmony_ci * at entry of each if clause: 11381cb0ef41Sopenharmony_ci * 1. 'CurByte' points at the first byte of a LMBCS character 11391cb0ef41Sopenharmony_ci * 2. '*source'points to the next byte of the source stream after 'CurByte' 11401cb0ef41Sopenharmony_ci * 11411cb0ef41Sopenharmony_ci * the job of each if clause is: 11421cb0ef41Sopenharmony_ci * 1. 
set '*source' to point at the beginning of next char (nop if LMBCS char is only 1 byte) 11431cb0ef41Sopenharmony_ci * 2. set 'uniChar' up with the right Unicode value, or set 'err' appropriately 11441cb0ef41Sopenharmony_ci */ 11451cb0ef41Sopenharmony_ci 11461cb0ef41Sopenharmony_ci /* First lets check the simple fixed values. */ 11471cb0ef41Sopenharmony_ci 11481cb0ef41Sopenharmony_ci if(((CurByte > ULMBCS_C0END) && (CurByte < ULMBCS_C1START)) /* ascii range */ 11491cb0ef41Sopenharmony_ci || (CurByte == 0) 11501cb0ef41Sopenharmony_ci || CurByte == ULMBCS_HT || CurByte == ULMBCS_CR 11511cb0ef41Sopenharmony_ci || CurByte == ULMBCS_LF || CurByte == ULMBCS_123SYSTEMRANGE) 11521cb0ef41Sopenharmony_ci { 11531cb0ef41Sopenharmony_ci uniChar = CurByte; 11541cb0ef41Sopenharmony_ci } 11551cb0ef41Sopenharmony_ci else 11561cb0ef41Sopenharmony_ci { 11571cb0ef41Sopenharmony_ci UConverterDataLMBCS * extraInfo; 11581cb0ef41Sopenharmony_ci ulmbcs_byte_t group; 11591cb0ef41Sopenharmony_ci UConverterSharedData *cnv; 11601cb0ef41Sopenharmony_ci 11611cb0ef41Sopenharmony_ci if (CurByte == ULMBCS_GRP_CTRL) /* Control character group - no opt group update */ 11621cb0ef41Sopenharmony_ci { 11631cb0ef41Sopenharmony_ci ulmbcs_byte_t C0C1byte; 11641cb0ef41Sopenharmony_ci CHECK_SOURCE_LIMIT(1); 11651cb0ef41Sopenharmony_ci C0C1byte = *(args->source)++; 11661cb0ef41Sopenharmony_ci uniChar = (C0C1byte < ULMBCS_C1START) ? 
C0C1byte - ULMBCS_CTRLOFFSET : C0C1byte; 11671cb0ef41Sopenharmony_ci } 11681cb0ef41Sopenharmony_ci else 11691cb0ef41Sopenharmony_ci if (CurByte == ULMBCS_GRP_UNICODE) /* Unicode compatibility group: BigEndian UTF16 */ 11701cb0ef41Sopenharmony_ci { 11711cb0ef41Sopenharmony_ci CHECK_SOURCE_LIMIT(2); 11721cb0ef41Sopenharmony_ci 11731cb0ef41Sopenharmony_ci /* don't check for error indicators fffe/ffff below */ 11741cb0ef41Sopenharmony_ci return GetUniFromLMBCSUni(&(args->source)); 11751cb0ef41Sopenharmony_ci } 11761cb0ef41Sopenharmony_ci else if (CurByte <= ULMBCS_CTRLOFFSET) 11771cb0ef41Sopenharmony_ci { 11781cb0ef41Sopenharmony_ci group = CurByte; /* group byte is in the source */ 11791cb0ef41Sopenharmony_ci extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo; 11801cb0ef41Sopenharmony_ci if (group > ULMBCS_GRP_LAST || (cnv = extraInfo->OptGrpConverter[group]) == nullptr) 11811cb0ef41Sopenharmony_ci { 11821cb0ef41Sopenharmony_ci /* this is not a valid group byte - no converter*/ 11831cb0ef41Sopenharmony_ci *err = U_INVALID_CHAR_FOUND; 11841cb0ef41Sopenharmony_ci } 11851cb0ef41Sopenharmony_ci else if (group >= ULMBCS_DOUBLEOPTGROUP_START) /* double byte conversion */ 11861cb0ef41Sopenharmony_ci { 11871cb0ef41Sopenharmony_ci 11881cb0ef41Sopenharmony_ci CHECK_SOURCE_LIMIT(2); 11891cb0ef41Sopenharmony_ci 11901cb0ef41Sopenharmony_ci /* check for LMBCS doubled-group-byte case */ 11911cb0ef41Sopenharmony_ci if (*args->source == group) { 11921cb0ef41Sopenharmony_ci /* single byte */ 11931cb0ef41Sopenharmony_ci ++args->source; 11941cb0ef41Sopenharmony_ci uniChar = ucnv_MBCSSimpleGetNextUChar(cnv, args->source, 1, false); 11951cb0ef41Sopenharmony_ci ++args->source; 11961cb0ef41Sopenharmony_ci } else { 11971cb0ef41Sopenharmony_ci /* double byte */ 11981cb0ef41Sopenharmony_ci uniChar = ucnv_MBCSSimpleGetNextUChar(cnv, args->source, 2, false); 11991cb0ef41Sopenharmony_ci args->source += 2; 12001cb0ef41Sopenharmony_ci } 12011cb0ef41Sopenharmony_ci } 
12021cb0ef41Sopenharmony_ci else { /* single byte conversion */ 12031cb0ef41Sopenharmony_ci CHECK_SOURCE_LIMIT(1); 12041cb0ef41Sopenharmony_ci CurByte = *(args->source)++; 12051cb0ef41Sopenharmony_ci 12061cb0ef41Sopenharmony_ci if (CurByte >= ULMBCS_C1START) 12071cb0ef41Sopenharmony_ci { 12081cb0ef41Sopenharmony_ci uniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(cnv, CurByte); 12091cb0ef41Sopenharmony_ci } 12101cb0ef41Sopenharmony_ci else 12111cb0ef41Sopenharmony_ci { 12121cb0ef41Sopenharmony_ci /* The non-optimizable oddballs where there is an explicit byte 12131cb0ef41Sopenharmony_ci * AND the second byte is not in the upper ascii range 12141cb0ef41Sopenharmony_ci */ 12151cb0ef41Sopenharmony_ci char bytes[2]; 12161cb0ef41Sopenharmony_ci 12171cb0ef41Sopenharmony_ci extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo; 12181cb0ef41Sopenharmony_ci cnv = extraInfo->OptGrpConverter [ULMBCS_GRP_EXCEPT]; 12191cb0ef41Sopenharmony_ci 12201cb0ef41Sopenharmony_ci /* Lookup value must include opt group */ 12211cb0ef41Sopenharmony_ci bytes[0] = group; 12221cb0ef41Sopenharmony_ci bytes[1] = CurByte; 12231cb0ef41Sopenharmony_ci uniChar = ucnv_MBCSSimpleGetNextUChar(cnv, bytes, 2, false); 12241cb0ef41Sopenharmony_ci } 12251cb0ef41Sopenharmony_ci } 12261cb0ef41Sopenharmony_ci } 12271cb0ef41Sopenharmony_ci else if (CurByte >= ULMBCS_C1START) /* group byte is implicit */ 12281cb0ef41Sopenharmony_ci { 12291cb0ef41Sopenharmony_ci extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo; 12301cb0ef41Sopenharmony_ci group = extraInfo->OptGroup; 12311cb0ef41Sopenharmony_ci cnv = extraInfo->OptGrpConverter[group]; 12321cb0ef41Sopenharmony_ci if (group >= ULMBCS_DOUBLEOPTGROUP_START) /* double byte conversion */ 12331cb0ef41Sopenharmony_ci { 12341cb0ef41Sopenharmony_ci if (!ucnv_MBCSIsLeadByte(cnv, CurByte)) 12351cb0ef41Sopenharmony_ci { 12361cb0ef41Sopenharmony_ci CHECK_SOURCE_LIMIT(0); 12371cb0ef41Sopenharmony_ci 12381cb0ef41Sopenharmony_ci /* let the MBCS conversion consume 
CurByte again */ 12391cb0ef41Sopenharmony_ci uniChar = ucnv_MBCSSimpleGetNextUChar(cnv, args->source - 1, 1, false); 12401cb0ef41Sopenharmony_ci } 12411cb0ef41Sopenharmony_ci else 12421cb0ef41Sopenharmony_ci { 12431cb0ef41Sopenharmony_ci CHECK_SOURCE_LIMIT(1); 12441cb0ef41Sopenharmony_ci /* let the MBCS conversion consume CurByte again */ 12451cb0ef41Sopenharmony_ci uniChar = ucnv_MBCSSimpleGetNextUChar(cnv, args->source - 1, 2, false); 12461cb0ef41Sopenharmony_ci ++args->source; 12471cb0ef41Sopenharmony_ci } 12481cb0ef41Sopenharmony_ci } 12491cb0ef41Sopenharmony_ci else /* single byte conversion */ 12501cb0ef41Sopenharmony_ci { 12511cb0ef41Sopenharmony_ci uniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(cnv, CurByte); 12521cb0ef41Sopenharmony_ci } 12531cb0ef41Sopenharmony_ci } 12541cb0ef41Sopenharmony_ci } 12551cb0ef41Sopenharmony_ci return uniChar; 12561cb0ef41Sopenharmony_ci} 12571cb0ef41Sopenharmony_ci 12581cb0ef41Sopenharmony_ci 12591cb0ef41Sopenharmony_ci/* The exported function that converts lmbcs to one or more 12601cb0ef41Sopenharmony_ci UChars - currently UTF-16 12611cb0ef41Sopenharmony_ci*/ 12621cb0ef41Sopenharmony_cistatic void U_CALLCONV 12631cb0ef41Sopenharmony_ci_LMBCSToUnicodeWithOffsets(UConverterToUnicodeArgs* args, 12641cb0ef41Sopenharmony_ci UErrorCode* err) 12651cb0ef41Sopenharmony_ci{ 12661cb0ef41Sopenharmony_ci char LMBCS [ULMBCS_CHARSIZE_MAX]; 12671cb0ef41Sopenharmony_ci char16_t uniChar; /* one output UNICODE char */ 12681cb0ef41Sopenharmony_ci const char * saveSource; /* beginning of current code point */ 12691cb0ef41Sopenharmony_ci const char * pStartLMBCS = args->source; /* beginning of whole string */ 12701cb0ef41Sopenharmony_ci const char * errSource = nullptr; /* pointer to actual input in case an error occurs */ 12711cb0ef41Sopenharmony_ci int8_t savebytes = 0; 12721cb0ef41Sopenharmony_ci 12731cb0ef41Sopenharmony_ci /* Process from source to limit, or until error */ 12741cb0ef41Sopenharmony_ci while (U_SUCCESS(*err) && args->sourceLimit > 
args->source && args->targetLimit > args->target) 12751cb0ef41Sopenharmony_ci { 12761cb0ef41Sopenharmony_ci saveSource = args->source; /* beginning of current code point */ 12771cb0ef41Sopenharmony_ci 12781cb0ef41Sopenharmony_ci if (args->converter->toULength) /* reassemble char from previous call */ 12791cb0ef41Sopenharmony_ci { 12801cb0ef41Sopenharmony_ci const char *saveSourceLimit; 12811cb0ef41Sopenharmony_ci size_t size_old = args->converter->toULength; 12821cb0ef41Sopenharmony_ci 12831cb0ef41Sopenharmony_ci /* limit from source is either remainder of temp buffer, or user limit on source */ 12841cb0ef41Sopenharmony_ci size_t size_new_maybe_1 = sizeof(LMBCS) - size_old; 12851cb0ef41Sopenharmony_ci size_t size_new_maybe_2 = args->sourceLimit - args->source; 12861cb0ef41Sopenharmony_ci size_t size_new = (size_new_maybe_1 < size_new_maybe_2) ? size_new_maybe_1 : size_new_maybe_2; 12871cb0ef41Sopenharmony_ci 12881cb0ef41Sopenharmony_ci 12891cb0ef41Sopenharmony_ci uprv_memcpy(LMBCS, args->converter->toUBytes, size_old); 12901cb0ef41Sopenharmony_ci uprv_memcpy(LMBCS + size_old, args->source, size_new); 12911cb0ef41Sopenharmony_ci saveSourceLimit = args->sourceLimit; 12921cb0ef41Sopenharmony_ci args->source = errSource = LMBCS; 12931cb0ef41Sopenharmony_ci args->sourceLimit = LMBCS+size_old+size_new; 12941cb0ef41Sopenharmony_ci savebytes = (int8_t)(size_old+size_new); 12951cb0ef41Sopenharmony_ci uniChar = (char16_t) _LMBCSGetNextUCharWorker(args, err); 12961cb0ef41Sopenharmony_ci args->source = saveSource + ((args->source - LMBCS) - size_old); 12971cb0ef41Sopenharmony_ci args->sourceLimit = saveSourceLimit; 12981cb0ef41Sopenharmony_ci 12991cb0ef41Sopenharmony_ci if (*err == U_TRUNCATED_CHAR_FOUND) 13001cb0ef41Sopenharmony_ci { 13011cb0ef41Sopenharmony_ci /* evil special case: source buffers so small a char spans more than 2 buffers */ 13021cb0ef41Sopenharmony_ci args->converter->toULength = savebytes; 13031cb0ef41Sopenharmony_ci uprv_memcpy(args->converter->toUBytes, 
LMBCS, savebytes); 13041cb0ef41Sopenharmony_ci args->source = args->sourceLimit; 13051cb0ef41Sopenharmony_ci *err = U_ZERO_ERROR; 13061cb0ef41Sopenharmony_ci return; 13071cb0ef41Sopenharmony_ci } 13081cb0ef41Sopenharmony_ci else 13091cb0ef41Sopenharmony_ci { 13101cb0ef41Sopenharmony_ci /* clear the partial-char marker */ 13111cb0ef41Sopenharmony_ci args->converter->toULength = 0; 13121cb0ef41Sopenharmony_ci } 13131cb0ef41Sopenharmony_ci } 13141cb0ef41Sopenharmony_ci else 13151cb0ef41Sopenharmony_ci { 13161cb0ef41Sopenharmony_ci errSource = saveSource; 13171cb0ef41Sopenharmony_ci uniChar = (char16_t) _LMBCSGetNextUCharWorker(args, err); 13181cb0ef41Sopenharmony_ci savebytes = (int8_t)(args->source - saveSource); 13191cb0ef41Sopenharmony_ci } 13201cb0ef41Sopenharmony_ci if (U_SUCCESS(*err)) 13211cb0ef41Sopenharmony_ci { 13221cb0ef41Sopenharmony_ci if (uniChar < 0xfffe) 13231cb0ef41Sopenharmony_ci { 13241cb0ef41Sopenharmony_ci *(args->target)++ = uniChar; 13251cb0ef41Sopenharmony_ci if(args->offsets) 13261cb0ef41Sopenharmony_ci { 13271cb0ef41Sopenharmony_ci *(args->offsets)++ = (int32_t)(saveSource - pStartLMBCS); 13281cb0ef41Sopenharmony_ci } 13291cb0ef41Sopenharmony_ci } 13301cb0ef41Sopenharmony_ci else if (uniChar == 0xfffe) 13311cb0ef41Sopenharmony_ci { 13321cb0ef41Sopenharmony_ci *err = U_INVALID_CHAR_FOUND; 13331cb0ef41Sopenharmony_ci } 13341cb0ef41Sopenharmony_ci else /* if (uniChar == 0xffff) */ 13351cb0ef41Sopenharmony_ci { 13361cb0ef41Sopenharmony_ci *err = U_ILLEGAL_CHAR_FOUND; 13371cb0ef41Sopenharmony_ci } 13381cb0ef41Sopenharmony_ci } 13391cb0ef41Sopenharmony_ci } 13401cb0ef41Sopenharmony_ci /* if target ran out before source, return U_BUFFER_OVERFLOW_ERROR */ 13411cb0ef41Sopenharmony_ci if (U_SUCCESS(*err) && args->sourceLimit > args->source && args->targetLimit <= args->target) 13421cb0ef41Sopenharmony_ci { 13431cb0ef41Sopenharmony_ci *err = U_BUFFER_OVERFLOW_ERROR; 13441cb0ef41Sopenharmony_ci } 13451cb0ef41Sopenharmony_ci else if (U_FAILURE(*err)) 
13461cb0ef41Sopenharmony_ci { 13471cb0ef41Sopenharmony_ci /* If character incomplete or unmappable/illegal, store it in toUBytes[] */ 13481cb0ef41Sopenharmony_ci args->converter->toULength = savebytes; 13491cb0ef41Sopenharmony_ci if (savebytes > 0) { 13501cb0ef41Sopenharmony_ci uprv_memcpy(args->converter->toUBytes, errSource, savebytes); 13511cb0ef41Sopenharmony_ci } 13521cb0ef41Sopenharmony_ci if (*err == U_TRUNCATED_CHAR_FOUND) { 13531cb0ef41Sopenharmony_ci *err = U_ZERO_ERROR; 13541cb0ef41Sopenharmony_ci } 13551cb0ef41Sopenharmony_ci } 13561cb0ef41Sopenharmony_ci} 13571cb0ef41Sopenharmony_ci 13581cb0ef41Sopenharmony_ci/* And now, the macroized declarations of data & functions: */ 13591cb0ef41Sopenharmony_ciDEFINE_LMBCS_OPEN(1) 13601cb0ef41Sopenharmony_ciDEFINE_LMBCS_OPEN(2) 13611cb0ef41Sopenharmony_ciDEFINE_LMBCS_OPEN(3) 13621cb0ef41Sopenharmony_ciDEFINE_LMBCS_OPEN(4) 13631cb0ef41Sopenharmony_ciDEFINE_LMBCS_OPEN(5) 13641cb0ef41Sopenharmony_ciDEFINE_LMBCS_OPEN(6) 13651cb0ef41Sopenharmony_ciDEFINE_LMBCS_OPEN(8) 13661cb0ef41Sopenharmony_ciDEFINE_LMBCS_OPEN(11) 13671cb0ef41Sopenharmony_ciDEFINE_LMBCS_OPEN(16) 13681cb0ef41Sopenharmony_ciDEFINE_LMBCS_OPEN(17) 13691cb0ef41Sopenharmony_ciDEFINE_LMBCS_OPEN(18) 13701cb0ef41Sopenharmony_ciDEFINE_LMBCS_OPEN(19) 13711cb0ef41Sopenharmony_ci 13721cb0ef41Sopenharmony_ci 13731cb0ef41Sopenharmony_ciDECLARE_LMBCS_DATA(1) 13741cb0ef41Sopenharmony_ciDECLARE_LMBCS_DATA(2) 13751cb0ef41Sopenharmony_ciDECLARE_LMBCS_DATA(3) 13761cb0ef41Sopenharmony_ciDECLARE_LMBCS_DATA(4) 13771cb0ef41Sopenharmony_ciDECLARE_LMBCS_DATA(5) 13781cb0ef41Sopenharmony_ciDECLARE_LMBCS_DATA(6) 13791cb0ef41Sopenharmony_ciDECLARE_LMBCS_DATA(8) 13801cb0ef41Sopenharmony_ciDECLARE_LMBCS_DATA(11) 13811cb0ef41Sopenharmony_ciDECLARE_LMBCS_DATA(16) 13821cb0ef41Sopenharmony_ciDECLARE_LMBCS_DATA(17) 13831cb0ef41Sopenharmony_ciDECLARE_LMBCS_DATA(18) 13841cb0ef41Sopenharmony_ciDECLARE_LMBCS_DATA(19) 13851cb0ef41Sopenharmony_ci 13861cb0ef41Sopenharmony_ciU_CDECL_END 
13871cb0ef41Sopenharmony_ci 13881cb0ef41Sopenharmony_ci#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ 1389