11767c5feSopenharmony_ci/*
21767c5feSopenharmony_ci * The authors of this software are Rob Pike and Ken Thompson.
31767c5feSopenharmony_ci *              Copyright (c) 2002 by Lucent Technologies.
41767c5feSopenharmony_ci *              Portions Copyright (c) 2009 The Go Authors.  All rights reserved.
51767c5feSopenharmony_ci * Permission to use, copy, modify, and distribute this software for any
61767c5feSopenharmony_ci * purpose without fee is hereby granted, provided that this entire notice
71767c5feSopenharmony_ci * is included in all copies of any software which is or includes a copy
81767c5feSopenharmony_ci * or modification of this software and in all copies of the supporting
91767c5feSopenharmony_ci * documentation for such software.
101767c5feSopenharmony_ci * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
111767c5feSopenharmony_ci * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
121767c5feSopenharmony_ci * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
131767c5feSopenharmony_ci * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
141767c5feSopenharmony_ci */
151767c5feSopenharmony_ci#include "phonenumbers/utf/utf.h"
161767c5feSopenharmony_ci#include "phonenumbers/utf/utfdef.h"
171767c5feSopenharmony_ci
181767c5feSopenharmony_cienum
191767c5feSopenharmony_ci{
201767c5feSopenharmony_ci	Bit1	= 7,
211767c5feSopenharmony_ci	Bitx	= 6,
221767c5feSopenharmony_ci	Bit2	= 5,
231767c5feSopenharmony_ci	Bit3	= 4,
241767c5feSopenharmony_ci	Bit4	= 3,
251767c5feSopenharmony_ci	Bit5	= 2,
261767c5feSopenharmony_ci
271767c5feSopenharmony_ci	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
281767c5feSopenharmony_ci	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
291767c5feSopenharmony_ci	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
301767c5feSopenharmony_ci	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
311767c5feSopenharmony_ci	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
321767c5feSopenharmony_ci	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */
331767c5feSopenharmony_ci
341767c5feSopenharmony_ci	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */
351767c5feSopenharmony_ci	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */
361767c5feSopenharmony_ci	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */
371767c5feSopenharmony_ci	Rune4	= (1<<(Bit4+3*Bitx))-1,		/* 0001 1111 1111 1111 1111 1111 */
381767c5feSopenharmony_ci
391767c5feSopenharmony_ci	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
401767c5feSopenharmony_ci	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */
411767c5feSopenharmony_ci
421767c5feSopenharmony_ci	SurrogateMin	= 0xD800,
431767c5feSopenharmony_ci	SurrogateMax	= 0xDFFF,
441767c5feSopenharmony_ci
451767c5feSopenharmony_ci	Bad	= Runeerror,
461767c5feSopenharmony_ci};
471767c5feSopenharmony_ci
481767c5feSopenharmony_ci/*
491767c5feSopenharmony_ci * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
501767c5feSopenharmony_ci * This is a slower but "safe" version of the old chartorune
511767c5feSopenharmony_ci * that works on strings that are not necessarily null-terminated.
521767c5feSopenharmony_ci *
531767c5feSopenharmony_ci * If you know for sure that your string is null-terminated,
541767c5feSopenharmony_ci * chartorune will be a bit faster.
551767c5feSopenharmony_ci *
561767c5feSopenharmony_ci * It is guaranteed not to attempt to access "length"
571767c5feSopenharmony_ci * past the incoming pointer.  This is to avoid
581767c5feSopenharmony_ci * possible access violations.  If the string appears to be
591767c5feSopenharmony_ci * well-formed but incomplete (i.e., to get the whole Rune
601767c5feSopenharmony_ci * we'd need to read past str+length) then we'll set the Rune
611767c5feSopenharmony_ci * to Bad and return 0.
621767c5feSopenharmony_ci *
631767c5feSopenharmony_ci * Note that if we have decoding problems for other
641767c5feSopenharmony_ci * reasons, we return 1 instead of 0.
651767c5feSopenharmony_ci */
661767c5feSopenharmony_ciint
671767c5feSopenharmony_cicharntorune(Rune *rune, const char *str, int length)
681767c5feSopenharmony_ci{
691767c5feSopenharmony_ci	int c, c1, c2, c3;
701767c5feSopenharmony_ci	long l;
711767c5feSopenharmony_ci
721767c5feSopenharmony_ci	/* When we're not allowed to read anything */
731767c5feSopenharmony_ci	if(length <= 0) {
741767c5feSopenharmony_ci		goto badlen;
751767c5feSopenharmony_ci	}
761767c5feSopenharmony_ci
771767c5feSopenharmony_ci	/*
781767c5feSopenharmony_ci	 * one character sequence (7-bit value)
791767c5feSopenharmony_ci	 *	00000-0007F => T1
801767c5feSopenharmony_ci	 */
811767c5feSopenharmony_ci	c = *(uchar*)str;
821767c5feSopenharmony_ci	if(c < Tx) {
831767c5feSopenharmony_ci		*rune = (Rune)c;
841767c5feSopenharmony_ci		return 1;
851767c5feSopenharmony_ci	}
861767c5feSopenharmony_ci
871767c5feSopenharmony_ci	// If we can't read more than one character we must stop
881767c5feSopenharmony_ci	if(length <= 1) {
891767c5feSopenharmony_ci		goto badlen;
901767c5feSopenharmony_ci	}
911767c5feSopenharmony_ci
921767c5feSopenharmony_ci	/*
931767c5feSopenharmony_ci	 * two character sequence (11-bit value)
941767c5feSopenharmony_ci	 *	0080-07FF => T2 Tx
951767c5feSopenharmony_ci	 */
961767c5feSopenharmony_ci	c1 = *(uchar*)(str+1) ^ Tx;
971767c5feSopenharmony_ci	if(c1 & Testx)
981767c5feSopenharmony_ci		goto bad;
991767c5feSopenharmony_ci	if(c < T3) {
1001767c5feSopenharmony_ci		if(c < T2)
1011767c5feSopenharmony_ci			goto bad;
1021767c5feSopenharmony_ci		l = ((c << Bitx) | c1) & Rune2;
1031767c5feSopenharmony_ci		if(l <= Rune1)
1041767c5feSopenharmony_ci			goto bad;
1051767c5feSopenharmony_ci		*rune = (Rune)l;
1061767c5feSopenharmony_ci		return 2;
1071767c5feSopenharmony_ci	}
1081767c5feSopenharmony_ci
1091767c5feSopenharmony_ci	// If we can't read more than two characters we must stop
1101767c5feSopenharmony_ci	if(length <= 2) {
1111767c5feSopenharmony_ci		goto badlen;
1121767c5feSopenharmony_ci	}
1131767c5feSopenharmony_ci
1141767c5feSopenharmony_ci	/*
1151767c5feSopenharmony_ci	 * three character sequence (16-bit value)
1161767c5feSopenharmony_ci	 *	0800-FFFF => T3 Tx Tx
1171767c5feSopenharmony_ci	 */
1181767c5feSopenharmony_ci	c2 = *(uchar*)(str+2) ^ Tx;
1191767c5feSopenharmony_ci	if(c2 & Testx)
1201767c5feSopenharmony_ci		goto bad;
1211767c5feSopenharmony_ci	if(c < T4) {
1221767c5feSopenharmony_ci		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
1231767c5feSopenharmony_ci		if(l <= Rune2)
1241767c5feSopenharmony_ci			goto bad;
1251767c5feSopenharmony_ci		if (SurrogateMin <= l && l <= SurrogateMax)
1261767c5feSopenharmony_ci			goto bad;
1271767c5feSopenharmony_ci		*rune = (Rune)l;
1281767c5feSopenharmony_ci		return 3;
1291767c5feSopenharmony_ci	}
1301767c5feSopenharmony_ci
1311767c5feSopenharmony_ci	if (length <= 3)
1321767c5feSopenharmony_ci		goto badlen;
1331767c5feSopenharmony_ci
1341767c5feSopenharmony_ci	/*
1351767c5feSopenharmony_ci	 * four character sequence (21-bit value)
1361767c5feSopenharmony_ci	 *	10000-1FFFFF => T4 Tx Tx Tx
1371767c5feSopenharmony_ci	 */
1381767c5feSopenharmony_ci	c3 = *(uchar*)(str+3) ^ Tx;
1391767c5feSopenharmony_ci	if (c3 & Testx)
1401767c5feSopenharmony_ci		goto bad;
1411767c5feSopenharmony_ci	if (c < T5) {
1421767c5feSopenharmony_ci		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
1431767c5feSopenharmony_ci		if (l <= Rune3 || l > Runemax)
1441767c5feSopenharmony_ci			goto bad;
1451767c5feSopenharmony_ci		*rune = (Rune)l;
1461767c5feSopenharmony_ci		return 4;
1471767c5feSopenharmony_ci	}
1481767c5feSopenharmony_ci
1491767c5feSopenharmony_ci	// Support for 5-byte or longer UTF-8 would go here, but
1501767c5feSopenharmony_ci	// since we don't have that, we'll just fall through to bad.
1511767c5feSopenharmony_ci
1521767c5feSopenharmony_ci	/*
1531767c5feSopenharmony_ci	 * bad decoding
1541767c5feSopenharmony_ci	 */
1551767c5feSopenharmony_cibad:
1561767c5feSopenharmony_ci	*rune = Bad;
1571767c5feSopenharmony_ci	return 1;
1581767c5feSopenharmony_cibadlen:
1591767c5feSopenharmony_ci	*rune = Bad;
1601767c5feSopenharmony_ci	return 0;
1611767c5feSopenharmony_ci
1621767c5feSopenharmony_ci}
1631767c5feSopenharmony_ci
1641767c5feSopenharmony_ci
1651767c5feSopenharmony_ci/*
1661767c5feSopenharmony_ci * This is the older "unsafe" version, which works fine on
1671767c5feSopenharmony_ci * null-terminated strings.
1681767c5feSopenharmony_ci */
1691767c5feSopenharmony_ciint
1701767c5feSopenharmony_cichartorune(Rune *rune, const char *str)
1711767c5feSopenharmony_ci{
1721767c5feSopenharmony_ci	int c, c1, c2, c3;
1731767c5feSopenharmony_ci	long l;
1741767c5feSopenharmony_ci
1751767c5feSopenharmony_ci	/*
1761767c5feSopenharmony_ci	 * one character sequence
1771767c5feSopenharmony_ci	 *	00000-0007F => T1
1781767c5feSopenharmony_ci	 */
1791767c5feSopenharmony_ci	c = *(uchar*)str;
1801767c5feSopenharmony_ci	if(c < Tx) {
1811767c5feSopenharmony_ci		*rune = (Rune)c;
1821767c5feSopenharmony_ci		return 1;
1831767c5feSopenharmony_ci	}
1841767c5feSopenharmony_ci
1851767c5feSopenharmony_ci	/*
1861767c5feSopenharmony_ci	 * two character sequence
1871767c5feSopenharmony_ci	 *	0080-07FF => T2 Tx
1881767c5feSopenharmony_ci	 */
1891767c5feSopenharmony_ci	c1 = *(uchar*)(str+1) ^ Tx;
1901767c5feSopenharmony_ci	if(c1 & Testx)
1911767c5feSopenharmony_ci		goto bad;
1921767c5feSopenharmony_ci	if(c < T3) {
1931767c5feSopenharmony_ci		if(c < T2)
1941767c5feSopenharmony_ci			goto bad;
1951767c5feSopenharmony_ci		l = ((c << Bitx) | c1) & Rune2;
1961767c5feSopenharmony_ci		if(l <= Rune1)
1971767c5feSopenharmony_ci			goto bad;
1981767c5feSopenharmony_ci		*rune = (Rune)l;
1991767c5feSopenharmony_ci		return 2;
2001767c5feSopenharmony_ci	}
2011767c5feSopenharmony_ci
2021767c5feSopenharmony_ci	/*
2031767c5feSopenharmony_ci	 * three character sequence
2041767c5feSopenharmony_ci	 *	0800-FFFF => T3 Tx Tx
2051767c5feSopenharmony_ci	 */
2061767c5feSopenharmony_ci	c2 = *(uchar*)(str+2) ^ Tx;
2071767c5feSopenharmony_ci	if(c2 & Testx)
2081767c5feSopenharmony_ci		goto bad;
2091767c5feSopenharmony_ci	if(c < T4) {
2101767c5feSopenharmony_ci		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
2111767c5feSopenharmony_ci		if(l <= Rune2)
2121767c5feSopenharmony_ci			goto bad;
2131767c5feSopenharmony_ci		if (SurrogateMin <= l && l <= SurrogateMax)
2141767c5feSopenharmony_ci			goto bad;
2151767c5feSopenharmony_ci		*rune = (Rune)l;
2161767c5feSopenharmony_ci		return 3;
2171767c5feSopenharmony_ci	}
2181767c5feSopenharmony_ci
2191767c5feSopenharmony_ci	/*
2201767c5feSopenharmony_ci	 * four character sequence (21-bit value)
2211767c5feSopenharmony_ci	 *	10000-1FFFFF => T4 Tx Tx Tx
2221767c5feSopenharmony_ci	 */
2231767c5feSopenharmony_ci	c3 = *(uchar*)(str+3) ^ Tx;
2241767c5feSopenharmony_ci	if (c3 & Testx)
2251767c5feSopenharmony_ci		goto bad;
2261767c5feSopenharmony_ci	if (c < T5) {
2271767c5feSopenharmony_ci		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
2281767c5feSopenharmony_ci		if (l <= Rune3 || l > Runemax)
2291767c5feSopenharmony_ci			goto bad;
2301767c5feSopenharmony_ci		*rune = (Rune)l;
2311767c5feSopenharmony_ci		return 4;
2321767c5feSopenharmony_ci	}
2331767c5feSopenharmony_ci
2341767c5feSopenharmony_ci	/*
2351767c5feSopenharmony_ci	 * Support for 5-byte or longer UTF-8 would go here, but
2361767c5feSopenharmony_ci	 * since we don't have that, we'll just fall through to bad.
2371767c5feSopenharmony_ci	 */
2381767c5feSopenharmony_ci
2391767c5feSopenharmony_ci	/*
2401767c5feSopenharmony_ci	 * bad decoding
2411767c5feSopenharmony_ci	 */
2421767c5feSopenharmony_cibad:
2431767c5feSopenharmony_ci	*rune = Bad;
2441767c5feSopenharmony_ci	return 1;
2451767c5feSopenharmony_ci}
2461767c5feSopenharmony_ci
2471767c5feSopenharmony_ciint
2481767c5feSopenharmony_ciisvalidcharntorune(const char* str, int length, Rune* rune, int* consumed)
2491767c5feSopenharmony_ci{
2501767c5feSopenharmony_ci	*consumed = charntorune(rune, str, length);
2511767c5feSopenharmony_ci	return *rune != Runeerror || *consumed == 3;
2521767c5feSopenharmony_ci}
2531767c5feSopenharmony_ci
2541767c5feSopenharmony_ciint
2551767c5feSopenharmony_cirunetochar(char *str, const Rune *rune)
2561767c5feSopenharmony_ci{
2571767c5feSopenharmony_ci	/* Runes are signed, so convert to unsigned for range check. */
2581767c5feSopenharmony_ci	unsigned long c;
2591767c5feSopenharmony_ci
2601767c5feSopenharmony_ci	/*
2611767c5feSopenharmony_ci	 * one character sequence
2621767c5feSopenharmony_ci	 *	00000-0007F => 00-7F
2631767c5feSopenharmony_ci	 */
2641767c5feSopenharmony_ci	c = *rune;
2651767c5feSopenharmony_ci	if(c <= Rune1) {
2661767c5feSopenharmony_ci		str[0] = (char)c;
2671767c5feSopenharmony_ci		return 1;
2681767c5feSopenharmony_ci	}
2691767c5feSopenharmony_ci
2701767c5feSopenharmony_ci	/*
2711767c5feSopenharmony_ci	 * two character sequence
2721767c5feSopenharmony_ci	 *	0080-07FF => T2 Tx
2731767c5feSopenharmony_ci	 */
2741767c5feSopenharmony_ci	if(c <= Rune2) {
2751767c5feSopenharmony_ci		str[0] = (char)(T2 | (c >> 1*Bitx));
2761767c5feSopenharmony_ci		str[1] = (char)(Tx | (c & Maskx));
2771767c5feSopenharmony_ci		return 2;
2781767c5feSopenharmony_ci	}
2791767c5feSopenharmony_ci
2801767c5feSopenharmony_ci	/*
2811767c5feSopenharmony_ci	 * If the Rune is out of range or a surrogate half, convert it to the error rune.
2821767c5feSopenharmony_ci	 * Do this test here because the error rune encodes to three bytes.
2831767c5feSopenharmony_ci	 * Doing it earlier would duplicate work, since an out of range
2841767c5feSopenharmony_ci	 * Rune wouldn't have fit in one or two bytes.
2851767c5feSopenharmony_ci	 */
2861767c5feSopenharmony_ci	if (c > Runemax)
2871767c5feSopenharmony_ci		c = Runeerror;
2881767c5feSopenharmony_ci	if (SurrogateMin <= c && c <= SurrogateMax)
2891767c5feSopenharmony_ci		c = Runeerror;
2901767c5feSopenharmony_ci
2911767c5feSopenharmony_ci	/*
2921767c5feSopenharmony_ci	 * three character sequence
2931767c5feSopenharmony_ci	 *	0800-FFFF => T3 Tx Tx
2941767c5feSopenharmony_ci	 */
2951767c5feSopenharmony_ci	if (c <= Rune3) {
2961767c5feSopenharmony_ci		str[0] = (char)(T3 |  (c >> 2*Bitx));
2971767c5feSopenharmony_ci		str[1] = (char)(Tx | ((c >> 1*Bitx) & Maskx));
2981767c5feSopenharmony_ci		str[2] = (char)(Tx |  (c & Maskx));
2991767c5feSopenharmony_ci		return 3;
3001767c5feSopenharmony_ci	}
3011767c5feSopenharmony_ci
3021767c5feSopenharmony_ci	/*
3031767c5feSopenharmony_ci	 * four character sequence (21-bit value)
3041767c5feSopenharmony_ci	 *     10000-1FFFFF => T4 Tx Tx Tx
3051767c5feSopenharmony_ci	 */
3061767c5feSopenharmony_ci	str[0] = (char)(T4 | (c >> 3*Bitx));
3071767c5feSopenharmony_ci	str[1] = (char)(Tx | ((c >> 2*Bitx) & Maskx));
3081767c5feSopenharmony_ci	str[2] = (char)(Tx | ((c >> 1*Bitx) & Maskx));
3091767c5feSopenharmony_ci	str[3] = (char)(Tx | (c & Maskx));
3101767c5feSopenharmony_ci	return 4;
3111767c5feSopenharmony_ci}
3121767c5feSopenharmony_ci
3131767c5feSopenharmony_ciint
3141767c5feSopenharmony_cirunelen(Rune rune)
3151767c5feSopenharmony_ci{
3161767c5feSopenharmony_ci	char str[10];
3171767c5feSopenharmony_ci
3181767c5feSopenharmony_ci	return runetochar(str, &rune);
3191767c5feSopenharmony_ci}
3201767c5feSopenharmony_ci
3211767c5feSopenharmony_ciint
3221767c5feSopenharmony_cirunenlen(const Rune *r, int nrune)
3231767c5feSopenharmony_ci{
3241767c5feSopenharmony_ci	int nb, c;
3251767c5feSopenharmony_ci
3261767c5feSopenharmony_ci	nb = 0;
3271767c5feSopenharmony_ci	while(nrune--) {
3281767c5feSopenharmony_ci		c = (int)*r++;
3291767c5feSopenharmony_ci		if (c <= Rune1)
3301767c5feSopenharmony_ci			nb++;
3311767c5feSopenharmony_ci		else if (c <= Rune2)
3321767c5feSopenharmony_ci			nb += 2;
3331767c5feSopenharmony_ci		else if (c <= Rune3)
3341767c5feSopenharmony_ci			nb += 3;
3351767c5feSopenharmony_ci		else /* assert(c <= Rune4) */
3361767c5feSopenharmony_ci			nb += 4;
3371767c5feSopenharmony_ci	}
3381767c5feSopenharmony_ci	return nb;
3391767c5feSopenharmony_ci}
3401767c5feSopenharmony_ci
3411767c5feSopenharmony_ciint
3421767c5feSopenharmony_cifullrune(const char *str, int n)
3431767c5feSopenharmony_ci{
3441767c5feSopenharmony_ci	if (n > 0) {
3451767c5feSopenharmony_ci		int c = *(uchar*)str;
3461767c5feSopenharmony_ci		if (c < Tx)
3471767c5feSopenharmony_ci			return 1;
3481767c5feSopenharmony_ci		if (n > 1) {
3491767c5feSopenharmony_ci			if (c < T3)
3501767c5feSopenharmony_ci				return 1;
3511767c5feSopenharmony_ci			if (n > 2) {
3521767c5feSopenharmony_ci				if (c < T4 || n > 3)
3531767c5feSopenharmony_ci					return 1;
3541767c5feSopenharmony_ci			}
3551767c5feSopenharmony_ci		}
3561767c5feSopenharmony_ci	}
3571767c5feSopenharmony_ci	return 0;
3581767c5feSopenharmony_ci}
359