1/*
2 * The authors of this software are Rob Pike and Ken Thompson.
3 *              Copyright (c) 2002 by Lucent Technologies.
4 *              Portions Copyright (c) 2009 The Go Authors.  All rights reserved.
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose without fee is hereby granted, provided that this entire notice
7 * is included in all copies of any software which is or includes a copy
8 * or modification of this software and in all copies of the supporting
9 * documentation for such software.
10 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
11 * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
12 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
13 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
14 */
15#include "phonenumbers/utf/utf.h"
16#include "phonenumbers/utf/utfdef.h"
17
18enum
19{
20	Bit1	= 7,
21	Bitx	= 6,
22	Bit2	= 5,
23	Bit3	= 4,
24	Bit4	= 3,
25	Bit5	= 2,
26
27	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
28	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
29	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
30	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
31	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
32	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */
33
34	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */
35	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */
36	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */
37	Rune4	= (1<<(Bit4+3*Bitx))-1,		/* 0001 1111 1111 1111 1111 1111 */
38
39	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
40	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */
41
42	SurrogateMin	= 0xD800,
43	SurrogateMax	= 0xDFFF,
44
45	Bad	= Runeerror,
46};
47
48/*
49 * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
50 * This is a slower but "safe" version of the old chartorune
51 * that works on strings that are not necessarily null-terminated.
52 *
53 * If you know for sure that your string is null-terminated,
54 * chartorune will be a bit faster.
55 *
56 * It is guaranteed not to attempt to access "length"
57 * past the incoming pointer.  This is to avoid
58 * possible access violations.  If the string appears to be
59 * well-formed but incomplete (i.e., to get the whole Rune
60 * we'd need to read past str+length) then we'll set the Rune
61 * to Bad and return 0.
62 *
63 * Note that if we have decoding problems for other
64 * reasons, we return 1 instead of 0.
65 */
66int
67charntorune(Rune *rune, const char *str, int length)
68{
69	int c, c1, c2, c3;
70	long l;
71
72	/* When we're not allowed to read anything */
73	if(length <= 0) {
74		goto badlen;
75	}
76
77	/*
78	 * one character sequence (7-bit value)
79	 *	00000-0007F => T1
80	 */
81	c = *(uchar*)str;
82	if(c < Tx) {
83		*rune = (Rune)c;
84		return 1;
85	}
86
87	// If we can't read more than one character we must stop
88	if(length <= 1) {
89		goto badlen;
90	}
91
92	/*
93	 * two character sequence (11-bit value)
94	 *	0080-07FF => T2 Tx
95	 */
96	c1 = *(uchar*)(str+1) ^ Tx;
97	if(c1 & Testx)
98		goto bad;
99	if(c < T3) {
100		if(c < T2)
101			goto bad;
102		l = ((c << Bitx) | c1) & Rune2;
103		if(l <= Rune1)
104			goto bad;
105		*rune = (Rune)l;
106		return 2;
107	}
108
109	// If we can't read more than two characters we must stop
110	if(length <= 2) {
111		goto badlen;
112	}
113
114	/*
115	 * three character sequence (16-bit value)
116	 *	0800-FFFF => T3 Tx Tx
117	 */
118	c2 = *(uchar*)(str+2) ^ Tx;
119	if(c2 & Testx)
120		goto bad;
121	if(c < T4) {
122		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
123		if(l <= Rune2)
124			goto bad;
125		if (SurrogateMin <= l && l <= SurrogateMax)
126			goto bad;
127		*rune = (Rune)l;
128		return 3;
129	}
130
131	if (length <= 3)
132		goto badlen;
133
134	/*
135	 * four character sequence (21-bit value)
136	 *	10000-1FFFFF => T4 Tx Tx Tx
137	 */
138	c3 = *(uchar*)(str+3) ^ Tx;
139	if (c3 & Testx)
140		goto bad;
141	if (c < T5) {
142		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
143		if (l <= Rune3 || l > Runemax)
144			goto bad;
145		*rune = (Rune)l;
146		return 4;
147	}
148
149	// Support for 5-byte or longer UTF-8 would go here, but
150	// since we don't have that, we'll just fall through to bad.
151
152	/*
153	 * bad decoding
154	 */
155bad:
156	*rune = Bad;
157	return 1;
158badlen:
159	*rune = Bad;
160	return 0;
161
162}
163
164
165/*
166 * This is the older "unsafe" version, which works fine on
167 * null-terminated strings.
168 */
169int
170chartorune(Rune *rune, const char *str)
171{
172	int c, c1, c2, c3;
173	long l;
174
175	/*
176	 * one character sequence
177	 *	00000-0007F => T1
178	 */
179	c = *(uchar*)str;
180	if(c < Tx) {
181		*rune = (Rune)c;
182		return 1;
183	}
184
185	/*
186	 * two character sequence
187	 *	0080-07FF => T2 Tx
188	 */
189	c1 = *(uchar*)(str+1) ^ Tx;
190	if(c1 & Testx)
191		goto bad;
192	if(c < T3) {
193		if(c < T2)
194			goto bad;
195		l = ((c << Bitx) | c1) & Rune2;
196		if(l <= Rune1)
197			goto bad;
198		*rune = (Rune)l;
199		return 2;
200	}
201
202	/*
203	 * three character sequence
204	 *	0800-FFFF => T3 Tx Tx
205	 */
206	c2 = *(uchar*)(str+2) ^ Tx;
207	if(c2 & Testx)
208		goto bad;
209	if(c < T4) {
210		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
211		if(l <= Rune2)
212			goto bad;
213		if (SurrogateMin <= l && l <= SurrogateMax)
214			goto bad;
215		*rune = (Rune)l;
216		return 3;
217	}
218
219	/*
220	 * four character sequence (21-bit value)
221	 *	10000-1FFFFF => T4 Tx Tx Tx
222	 */
223	c3 = *(uchar*)(str+3) ^ Tx;
224	if (c3 & Testx)
225		goto bad;
226	if (c < T5) {
227		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
228		if (l <= Rune3 || l > Runemax)
229			goto bad;
230		*rune = (Rune)l;
231		return 4;
232	}
233
234	/*
235	 * Support for 5-byte or longer UTF-8 would go here, but
236	 * since we don't have that, we'll just fall through to bad.
237	 */
238
239	/*
240	 * bad decoding
241	 */
242bad:
243	*rune = Bad;
244	return 1;
245}
246
247int
248isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed)
249{
250	*consumed = charntorune(rune, str, length);
251	return *rune != Runeerror || *consumed == 3;
252}
253
254int
255runetochar(char *str, const Rune *rune)
256{
257	/* Runes are signed, so convert to unsigned for range check. */
258	unsigned long c;
259
260	/*
261	 * one character sequence
262	 *	00000-0007F => 00-7F
263	 */
264	c = *rune;
265	if(c <= Rune1) {
266		str[0] = (char)c;
267		return 1;
268	}
269
270	/*
271	 * two character sequence
272	 *	0080-07FF => T2 Tx
273	 */
274	if(c <= Rune2) {
275		str[0] = (char)(T2 | (c >> 1*Bitx));
276		str[1] = (char)(Tx | (c & Maskx));
277		return 2;
278	}
279
280	/*
281	 * If the Rune is out of range or a surrogate half, convert it to the error rune.
282	 * Do this test here because the error rune encodes to three bytes.
283	 * Doing it earlier would duplicate work, since an out of range
284	 * Rune wouldn't have fit in one or two bytes.
285	 */
286	if (c > Runemax)
287		c = Runeerror;
288	if (SurrogateMin <= c && c <= SurrogateMax)
289		c = Runeerror;
290
291	/*
292	 * three character sequence
293	 *	0800-FFFF => T3 Tx Tx
294	 */
295	if (c <= Rune3) {
296		str[0] = (char)(T3 |  (c >> 2*Bitx));
297		str[1] = (char)(Tx | ((c >> 1*Bitx) & Maskx));
298		str[2] = (char)(Tx |  (c & Maskx));
299		return 3;
300	}
301
302	/*
303	 * four character sequence (21-bit value)
304	 *     10000-1FFFFF => T4 Tx Tx Tx
305	 */
306	str[0] = (char)(T4 | (c >> 3*Bitx));
307	str[1] = (char)(Tx | ((c >> 2*Bitx) & Maskx));
308	str[2] = (char)(Tx | ((c >> 1*Bitx) & Maskx));
309	str[3] = (char)(Tx | (c & Maskx));
310	return 4;
311}
312
313int
314runelen(Rune rune)
315{
316	char str[10];
317
318	return runetochar(str, &rune);
319}
320
321int
322runenlen(const Rune *r, int nrune)
323{
324	int nb, c;
325
326	nb = 0;
327	while(nrune--) {
328		c = (int)*r++;
329		if (c <= Rune1)
330			nb++;
331		else if (c <= Rune2)
332			nb += 2;
333		else if (c <= Rune3)
334			nb += 3;
335		else /* assert(c <= Rune4) */
336			nb += 4;
337	}
338	return nb;
339}
340
341int
342fullrune(const char *str, int n)
343{
344	if (n > 0) {
345		int c = *(uchar*)str;
346		if (c < Tx)
347			return 1;
348		if (n > 1) {
349			if (c < T3)
350				return 1;
351			if (n > 2) {
352				if (c < T4 || n > 3)
353					return 1;
354			}
355		}
356	}
357	return 0;
358}
359