11767c5feSopenharmony_ci/* 21767c5feSopenharmony_ci * The authors of this software are Rob Pike and Ken Thompson. 31767c5feSopenharmony_ci * Copyright (c) 2002 by Lucent Technologies. 41767c5feSopenharmony_ci * Portions Copyright (c) 2009 The Go Authors. All rights reserved. 51767c5feSopenharmony_ci * Permission to use, copy, modify, and distribute this software for any 61767c5feSopenharmony_ci * purpose without fee is hereby granted, provided that this entire notice 71767c5feSopenharmony_ci * is included in all copies of any software which is or includes a copy 81767c5feSopenharmony_ci * or modification of this software and in all copies of the supporting 91767c5feSopenharmony_ci * documentation for such software. 101767c5feSopenharmony_ci * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED 111767c5feSopenharmony_ci * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY 121767c5feSopenharmony_ci * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY 131767c5feSopenharmony_ci * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. 141767c5feSopenharmony_ci */ 151767c5feSopenharmony_ci#include "phonenumbers/utf/utf.h" 161767c5feSopenharmony_ci#include "phonenumbers/utf/utfdef.h" 171767c5feSopenharmony_ci 181767c5feSopenharmony_cienum 191767c5feSopenharmony_ci{ 201767c5feSopenharmony_ci Bit1 = 7, 211767c5feSopenharmony_ci Bitx = 6, 221767c5feSopenharmony_ci Bit2 = 5, 231767c5feSopenharmony_ci Bit3 = 4, 241767c5feSopenharmony_ci Bit4 = 3, 251767c5feSopenharmony_ci Bit5 = 2, 261767c5feSopenharmony_ci 271767c5feSopenharmony_ci T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ 281767c5feSopenharmony_ci Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ 291767c5feSopenharmony_ci T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ 301767c5feSopenharmony_ci T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ 311767c5feSopenharmony_ci T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ 321767c5feSopenharmony_ci T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ 331767c5feSopenharmony_ci 341767c5feSopenharmony_ci Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ 351767c5feSopenharmony_ci Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ 361767c5feSopenharmony_ci Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ 371767c5feSopenharmony_ci Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */ 381767c5feSopenharmony_ci 391767c5feSopenharmony_ci Maskx = (1<<Bitx)-1, /* 0011 1111 */ 401767c5feSopenharmony_ci Testx = Maskx ^ 0xFF, /* 1100 0000 */ 411767c5feSopenharmony_ci 421767c5feSopenharmony_ci SurrogateMin = 0xD800, 431767c5feSopenharmony_ci SurrogateMax = 0xDFFF, 441767c5feSopenharmony_ci 451767c5feSopenharmony_ci Bad = Runeerror, 461767c5feSopenharmony_ci}; 471767c5feSopenharmony_ci 481767c5feSopenharmony_ci/* 491767c5feSopenharmony_ci * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24 501767c5feSopenharmony_ci * This is a slower but "safe" version of the old chartorune 511767c5feSopenharmony_ci * that works on strings that are not necessarily null-terminated. 521767c5feSopenharmony_ci * 531767c5feSopenharmony_ci * If you know for sure that your string is null-terminated, 541767c5feSopenharmony_ci * chartorune will be a bit faster. 551767c5feSopenharmony_ci * 561767c5feSopenharmony_ci * It is guaranteed not to attempt to access "length" 571767c5feSopenharmony_ci * past the incoming pointer. This is to avoid 581767c5feSopenharmony_ci * possible access violations. If the string appears to be 591767c5feSopenharmony_ci * well-formed but incomplete (i.e., to get the whole Rune 601767c5feSopenharmony_ci * we'd need to read past str+length) then we'll set the Rune 611767c5feSopenharmony_ci * to Bad and return 0. 621767c5feSopenharmony_ci * 631767c5feSopenharmony_ci * Note that if we have decoding problems for other 641767c5feSopenharmony_ci * reasons, we return 1 instead of 0. 651767c5feSopenharmony_ci */ 661767c5feSopenharmony_ciint 671767c5feSopenharmony_cicharntorune(Rune *rune, const char *str, int length) 681767c5feSopenharmony_ci{ 691767c5feSopenharmony_ci int c, c1, c2, c3; 701767c5feSopenharmony_ci long l; 711767c5feSopenharmony_ci 721767c5feSopenharmony_ci /* When we're not allowed to read anything */ 731767c5feSopenharmony_ci if(length <= 0) { 741767c5feSopenharmony_ci goto badlen; 751767c5feSopenharmony_ci } 761767c5feSopenharmony_ci 771767c5feSopenharmony_ci /* 781767c5feSopenharmony_ci * one character sequence (7-bit value) 791767c5feSopenharmony_ci * 00000-0007F => T1 801767c5feSopenharmony_ci */ 811767c5feSopenharmony_ci c = *(uchar*)str; 821767c5feSopenharmony_ci if(c < Tx) { 831767c5feSopenharmony_ci *rune = (Rune)c; 841767c5feSopenharmony_ci return 1; 851767c5feSopenharmony_ci } 861767c5feSopenharmony_ci 871767c5feSopenharmony_ci // If we can't read more than one character we must stop 881767c5feSopenharmony_ci if(length <= 1) { 891767c5feSopenharmony_ci goto badlen; 901767c5feSopenharmony_ci } 911767c5feSopenharmony_ci 921767c5feSopenharmony_ci /* 931767c5feSopenharmony_ci * two character sequence (11-bit value) 941767c5feSopenharmony_ci * 0080-07FF => T2 Tx 951767c5feSopenharmony_ci */ 961767c5feSopenharmony_ci c1 = *(uchar*)(str+1) ^ Tx; 971767c5feSopenharmony_ci if(c1 & Testx) 981767c5feSopenharmony_ci goto bad; 991767c5feSopenharmony_ci if(c < T3) { 1001767c5feSopenharmony_ci if(c < T2) 1011767c5feSopenharmony_ci goto bad; 1021767c5feSopenharmony_ci l = ((c << Bitx) | c1) & Rune2; 1031767c5feSopenharmony_ci if(l <= Rune1) 1041767c5feSopenharmony_ci goto bad; 1051767c5feSopenharmony_ci *rune = (Rune)l; 1061767c5feSopenharmony_ci return 2; 1071767c5feSopenharmony_ci } 1081767c5feSopenharmony_ci 1091767c5feSopenharmony_ci // If we can't read more than two characters we must stop 1101767c5feSopenharmony_ci if(length <= 2) { 1111767c5feSopenharmony_ci goto badlen; 1121767c5feSopenharmony_ci } 1131767c5feSopenharmony_ci 1141767c5feSopenharmony_ci /* 1151767c5feSopenharmony_ci * three character sequence (16-bit value) 1161767c5feSopenharmony_ci * 0800-FFFF => T3 Tx Tx 1171767c5feSopenharmony_ci */ 1181767c5feSopenharmony_ci c2 = *(uchar*)(str+2) ^ Tx; 1191767c5feSopenharmony_ci if(c2 & Testx) 1201767c5feSopenharmony_ci goto bad; 1211767c5feSopenharmony_ci if(c < T4) { 1221767c5feSopenharmony_ci l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; 1231767c5feSopenharmony_ci if(l <= Rune2) 1241767c5feSopenharmony_ci goto bad; 1251767c5feSopenharmony_ci if (SurrogateMin <= l && l <= SurrogateMax) 1261767c5feSopenharmony_ci goto bad; 1271767c5feSopenharmony_ci *rune = (Rune)l; 1281767c5feSopenharmony_ci return 3; 1291767c5feSopenharmony_ci } 1301767c5feSopenharmony_ci 1311767c5feSopenharmony_ci if (length <= 3) 1321767c5feSopenharmony_ci goto badlen; 1331767c5feSopenharmony_ci 1341767c5feSopenharmony_ci /* 1351767c5feSopenharmony_ci * four character sequence (21-bit value) 1361767c5feSopenharmony_ci * 10000-1FFFFF => T4 Tx Tx Tx 1371767c5feSopenharmony_ci */ 1381767c5feSopenharmony_ci c3 = *(uchar*)(str+3) ^ Tx; 1391767c5feSopenharmony_ci if (c3 & Testx) 1401767c5feSopenharmony_ci goto bad; 1411767c5feSopenharmony_ci if (c < T5) { 1421767c5feSopenharmony_ci l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; 1431767c5feSopenharmony_ci if (l <= Rune3 || l > Runemax) 1441767c5feSopenharmony_ci goto bad; 1451767c5feSopenharmony_ci *rune = (Rune)l; 1461767c5feSopenharmony_ci return 4; 1471767c5feSopenharmony_ci } 1481767c5feSopenharmony_ci 1491767c5feSopenharmony_ci // Support for 5-byte or longer UTF-8 would go here, but 1501767c5feSopenharmony_ci // since we don't have that, we'll just fall through to bad. 1511767c5feSopenharmony_ci 1521767c5feSopenharmony_ci /* 1531767c5feSopenharmony_ci * bad decoding 1541767c5feSopenharmony_ci */ 1551767c5feSopenharmony_cibad: 1561767c5feSopenharmony_ci *rune = Bad; 1571767c5feSopenharmony_ci return 1; 1581767c5feSopenharmony_cibadlen: 1591767c5feSopenharmony_ci *rune = Bad; 1601767c5feSopenharmony_ci return 0; 1611767c5feSopenharmony_ci 1621767c5feSopenharmony_ci} 1631767c5feSopenharmony_ci 1641767c5feSopenharmony_ci 1651767c5feSopenharmony_ci/* 1661767c5feSopenharmony_ci * This is the older "unsafe" version, which works fine on 1671767c5feSopenharmony_ci * null-terminated strings. 1681767c5feSopenharmony_ci */ 1691767c5feSopenharmony_ciint 1701767c5feSopenharmony_cichartorune(Rune *rune, const char *str) 1711767c5feSopenharmony_ci{ 1721767c5feSopenharmony_ci int c, c1, c2, c3; 1731767c5feSopenharmony_ci long l; 1741767c5feSopenharmony_ci 1751767c5feSopenharmony_ci /* 1761767c5feSopenharmony_ci * one character sequence 1771767c5feSopenharmony_ci * 00000-0007F => T1 1781767c5feSopenharmony_ci */ 1791767c5feSopenharmony_ci c = *(uchar*)str; 1801767c5feSopenharmony_ci if(c < Tx) { 1811767c5feSopenharmony_ci *rune = (Rune)c; 1821767c5feSopenharmony_ci return 1; 1831767c5feSopenharmony_ci } 1841767c5feSopenharmony_ci 1851767c5feSopenharmony_ci /* 1861767c5feSopenharmony_ci * two character sequence 1871767c5feSopenharmony_ci * 0080-07FF => T2 Tx 1881767c5feSopenharmony_ci */ 1891767c5feSopenharmony_ci c1 = *(uchar*)(str+1) ^ Tx; 1901767c5feSopenharmony_ci if(c1 & Testx) 1911767c5feSopenharmony_ci goto bad; 1921767c5feSopenharmony_ci if(c < T3) { 1931767c5feSopenharmony_ci if(c < T2) 1941767c5feSopenharmony_ci goto bad; 1951767c5feSopenharmony_ci l = ((c << Bitx) | c1) & Rune2; 1961767c5feSopenharmony_ci if(l <= Rune1) 1971767c5feSopenharmony_ci goto bad; 1981767c5feSopenharmony_ci *rune = (Rune)l; 1991767c5feSopenharmony_ci return 2; 2001767c5feSopenharmony_ci } 2011767c5feSopenharmony_ci 2021767c5feSopenharmony_ci /* 2031767c5feSopenharmony_ci * three character sequence 2041767c5feSopenharmony_ci * 0800-FFFF => T3 Tx Tx 2051767c5feSopenharmony_ci */ 2061767c5feSopenharmony_ci c2 = *(uchar*)(str+2) ^ Tx; 2071767c5feSopenharmony_ci if(c2 & Testx) 2081767c5feSopenharmony_ci goto bad; 2091767c5feSopenharmony_ci if(c < T4) { 2101767c5feSopenharmony_ci l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; 2111767c5feSopenharmony_ci if(l <= Rune2) 2121767c5feSopenharmony_ci goto bad; 2131767c5feSopenharmony_ci if (SurrogateMin <= l && l <= SurrogateMax) 2141767c5feSopenharmony_ci goto bad; 2151767c5feSopenharmony_ci *rune = (Rune)l; 2161767c5feSopenharmony_ci return 3; 2171767c5feSopenharmony_ci } 2181767c5feSopenharmony_ci 2191767c5feSopenharmony_ci /* 2201767c5feSopenharmony_ci * four character sequence (21-bit value) 2211767c5feSopenharmony_ci * 10000-1FFFFF => T4 Tx Tx Tx 2221767c5feSopenharmony_ci */ 2231767c5feSopenharmony_ci c3 = *(uchar*)(str+3) ^ Tx; 2241767c5feSopenharmony_ci if (c3 & Testx) 2251767c5feSopenharmony_ci goto bad; 2261767c5feSopenharmony_ci if (c < T5) { 2271767c5feSopenharmony_ci l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; 2281767c5feSopenharmony_ci if (l <= Rune3 || l > Runemax) 2291767c5feSopenharmony_ci goto bad; 2301767c5feSopenharmony_ci *rune = (Rune)l; 2311767c5feSopenharmony_ci return 4; 2321767c5feSopenharmony_ci } 2331767c5feSopenharmony_ci 2341767c5feSopenharmony_ci /* 2351767c5feSopenharmony_ci * Support for 5-byte or longer UTF-8 would go here, but 2361767c5feSopenharmony_ci * since we don't have that, we'll just fall through to bad. 2371767c5feSopenharmony_ci */ 2381767c5feSopenharmony_ci 2391767c5feSopenharmony_ci /* 2401767c5feSopenharmony_ci * bad decoding 2411767c5feSopenharmony_ci */ 2421767c5feSopenharmony_cibad: 2431767c5feSopenharmony_ci *rune = Bad; 2441767c5feSopenharmony_ci return 1; 2451767c5feSopenharmony_ci} 2461767c5feSopenharmony_ci 2471767c5feSopenharmony_ciint 2481767c5feSopenharmony_ciisvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) 2491767c5feSopenharmony_ci{ 2501767c5feSopenharmony_ci *consumed = charntorune(rune, str, length); 2511767c5feSopenharmony_ci return *rune != Runeerror || *consumed == 3; 2521767c5feSopenharmony_ci} 2531767c5feSopenharmony_ci 2541767c5feSopenharmony_ciint 2551767c5feSopenharmony_cirunetochar(char *str, const Rune *rune) 2561767c5feSopenharmony_ci{ 2571767c5feSopenharmony_ci /* Runes are signed, so convert to unsigned for range check. */ 2581767c5feSopenharmony_ci unsigned long c; 2591767c5feSopenharmony_ci 2601767c5feSopenharmony_ci /* 2611767c5feSopenharmony_ci * one character sequence 2621767c5feSopenharmony_ci * 00000-0007F => 00-7F 2631767c5feSopenharmony_ci */ 2641767c5feSopenharmony_ci c = *rune; 2651767c5feSopenharmony_ci if(c <= Rune1) { 2661767c5feSopenharmony_ci str[0] = (char)c; 2671767c5feSopenharmony_ci return 1; 2681767c5feSopenharmony_ci } 2691767c5feSopenharmony_ci 2701767c5feSopenharmony_ci /* 2711767c5feSopenharmony_ci * two character sequence 2721767c5feSopenharmony_ci * 0080-07FF => T2 Tx 2731767c5feSopenharmony_ci */ 2741767c5feSopenharmony_ci if(c <= Rune2) { 2751767c5feSopenharmony_ci str[0] = (char)(T2 | (c >> 1*Bitx)); 2761767c5feSopenharmony_ci str[1] = (char)(Tx | (c & Maskx)); 2771767c5feSopenharmony_ci return 2; 2781767c5feSopenharmony_ci } 2791767c5feSopenharmony_ci 2801767c5feSopenharmony_ci /* 2811767c5feSopenharmony_ci * If the Rune is out of range or a surrogate half, convert it to the error rune. 2821767c5feSopenharmony_ci * Do this test here because the error rune encodes to three bytes. 2831767c5feSopenharmony_ci * Doing it earlier would duplicate work, since an out of range 2841767c5feSopenharmony_ci * Rune wouldn't have fit in one or two bytes. 2851767c5feSopenharmony_ci */ 2861767c5feSopenharmony_ci if (c > Runemax) 2871767c5feSopenharmony_ci c = Runeerror; 2881767c5feSopenharmony_ci if (SurrogateMin <= c && c <= SurrogateMax) 2891767c5feSopenharmony_ci c = Runeerror; 2901767c5feSopenharmony_ci 2911767c5feSopenharmony_ci /* 2921767c5feSopenharmony_ci * three character sequence 2931767c5feSopenharmony_ci * 0800-FFFF => T3 Tx Tx 2941767c5feSopenharmony_ci */ 2951767c5feSopenharmony_ci if (c <= Rune3) { 2961767c5feSopenharmony_ci str[0] = (char)(T3 | (c >> 2*Bitx)); 2971767c5feSopenharmony_ci str[1] = (char)(Tx | ((c >> 1*Bitx) & Maskx)); 2981767c5feSopenharmony_ci str[2] = (char)(Tx | (c & Maskx)); 2991767c5feSopenharmony_ci return 3; 3001767c5feSopenharmony_ci } 3011767c5feSopenharmony_ci 3021767c5feSopenharmony_ci /* 3031767c5feSopenharmony_ci * four character sequence (21-bit value) 3041767c5feSopenharmony_ci * 10000-1FFFFF => T4 Tx Tx Tx 3051767c5feSopenharmony_ci */ 3061767c5feSopenharmony_ci str[0] = (char)(T4 | (c >> 3*Bitx)); 3071767c5feSopenharmony_ci str[1] = (char)(Tx | ((c >> 2*Bitx) & Maskx)); 3081767c5feSopenharmony_ci str[2] = (char)(Tx | ((c >> 1*Bitx) & Maskx)); 3091767c5feSopenharmony_ci str[3] = (char)(Tx | (c & Maskx)); 3101767c5feSopenharmony_ci return 4; 3111767c5feSopenharmony_ci} 3121767c5feSopenharmony_ci 3131767c5feSopenharmony_ciint 3141767c5feSopenharmony_cirunelen(Rune rune) 3151767c5feSopenharmony_ci{ 3161767c5feSopenharmony_ci char str[10]; 3171767c5feSopenharmony_ci 3181767c5feSopenharmony_ci return runetochar(str, &rune); 3191767c5feSopenharmony_ci} 3201767c5feSopenharmony_ci 3211767c5feSopenharmony_ciint 3221767c5feSopenharmony_cirunenlen(const Rune *r, int nrune) 3231767c5feSopenharmony_ci{ 3241767c5feSopenharmony_ci int nb, c; 3251767c5feSopenharmony_ci 3261767c5feSopenharmony_ci nb = 0; 3271767c5feSopenharmony_ci while(nrune--) { 3281767c5feSopenharmony_ci c = (int)*r++; 3291767c5feSopenharmony_ci if (c <= Rune1) 3301767c5feSopenharmony_ci nb++; 3311767c5feSopenharmony_ci else if (c <= Rune2) 3321767c5feSopenharmony_ci nb += 2; 3331767c5feSopenharmony_ci else if (c <= Rune3) 3341767c5feSopenharmony_ci nb += 3; 3351767c5feSopenharmony_ci else /* assert(c <= Rune4) */ 3361767c5feSopenharmony_ci nb += 4; 3371767c5feSopenharmony_ci } 3381767c5feSopenharmony_ci return nb; 3391767c5feSopenharmony_ci} 3401767c5feSopenharmony_ci 3411767c5feSopenharmony_ciint 3421767c5feSopenharmony_cifullrune(const char *str, int n) 3431767c5feSopenharmony_ci{ 3441767c5feSopenharmony_ci if (n > 0) { 3451767c5feSopenharmony_ci int c = *(uchar*)str; 3461767c5feSopenharmony_ci if (c < Tx) 3471767c5feSopenharmony_ci return 1; 3481767c5feSopenharmony_ci if (n > 1) { 3491767c5feSopenharmony_ci if (c < T3) 3501767c5feSopenharmony_ci return 1; 3511767c5feSopenharmony_ci if (n > 2) { 3521767c5feSopenharmony_ci if (c < T4 || n > 3) 3531767c5feSopenharmony_ci return 1; 3541767c5feSopenharmony_ci } 3551767c5feSopenharmony_ci } 3561767c5feSopenharmony_ci } 3571767c5feSopenharmony_ci return 0; 3581767c5feSopenharmony_ci} 359