1/*
2 * Copyright (C) 2004-2005 Kay Sievers <kay.sievers@vrfy.org>
3 *
4 *	This program is free software; you can redistribute it and/or modify it
5 *	under the terms of the GNU General Public License as published by the
6 *	Free Software Foundation version 2 of the License.
7 *
8 *	This program is distributed in the hope that it will be useful, but
9 *	WITHOUT ANY WARRANTY; without even the implied warranty of
10 *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11 *	General Public License for more details.
12 *
13 *	You should have received a copy of the GNU General Public License along
14 *	with this program; if not, write to the Free Software Foundation, Inc.,
15 *	51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
16 *
17 */
18
19
/*
 * Strip every trailing occurrence of the character c from the
 * NUL-terminated string path, in place. A string consisting only of c
 * ends up empty; a string not ending in c is left untouched.
 */
static void remove_trailing_chars(char *path, char c)
{
	char *end = path + strlen(path);

	/* walk backwards from the terminator, turning each match into NUL */
	while (end != path && end[-1] == c) {
		end--;
		*end = '\0';
	}
}
28
/*
 * Number of bytes the UTF-8 sequence starting at str claims to occupy,
 * judged from its first byte alone: 1 for ascii, 2..6 for a lead byte
 * (the historical 5- and 6-byte forms are still recognized here), and 0
 * when the byte cannot start a sequence (a continuation byte, 0xfe or 0xff).
 */
static int utf8_encoded_expected_len(const char *str)
{
	/* lead byte patterns: (byte & mask) == bits  =>  sequence length */
	static const struct {
		unsigned char mask;
		unsigned char bits;
		int len;
	} lead[] = {
		{ 0x80, 0x00, 1 },
		{ 0xe0, 0xc0, 2 },
		{ 0xf0, 0xe0, 3 },
		{ 0xf8, 0xf0, 4 },
		{ 0xfc, 0xf8, 5 },
		{ 0xfe, 0xfc, 6 },
	};
	const unsigned char first = (unsigned char)str[0];
	int k;

	for (k = 0; k < (int)(sizeof(lead) / sizeof(lead[0])); k++) {
		if ((first & lead[k].mask) == lead[k].bits)
			return lead[k].len;
	}

	return 0;
}
48
/*
 * Decode the single UTF-8 sequence starting at str into its numeric value.
 *
 * Returns the decoded value, or -1 when the first byte is not a usable
 * lead byte or one of the following bytes is not a continuation byte
 * (10xxxxxx). No overlong or range checking is done here.
 */
static int utf8_encoded_to_unichar(const char *str)
{
	/* payload bits carried by the lead byte, indexed by sequence length */
	static const int lead_payload[7] = {
		0x00, 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01
	};
	const int nbytes = utf8_encoded_expected_len(str);
	int value;
	int pos;

	if (nbytes < 1 || nbytes > 6)
		return -1;

	/* single byte: the byte is the value */
	if (nbytes == 1)
		return (int)str[0];

	value = (int)str[0] & lead_payload[nbytes];

	/* each continuation byte contributes its low six bits */
	for (pos = 1; pos < nbytes; pos++) {
		const int byte = (int)str[pos];

		if ((byte & 0xc0) != 0x80)
			return -1;
		value = (value << 6) | (byte & 0x3f);
	}

	return value;
}
88
/*
 * Number of bytes the (historical, up to 6-byte) UTF-8 scheme needs to
 * encode unichar. Values below 0x80, including negative ones, yield 1.
 */
static int utf8_unichar_to_encoded_len(int unichar)
{
	/* limit[k] is the first value that no longer fits in k+1 bytes */
	static const int limit[] = {
		0x80, 0x800, 0x10000, 0x200000, 0x4000000
	};
	const int nlimits = (int)(sizeof(limit) / sizeof(limit[0]));
	int bytes = 1;

	while (bytes <= nlimits && unichar >= limit[bytes - 1])
		bytes++;

	return bytes;
}
104
/*
 * Check whether unichar lies in the range of acceptable Unicode values.
 *
 * Returns 1 when the value is acceptable, 0 otherwise. Rejected are:
 *   - anything outside U+0000..U+10FFFF, including every negative value
 *   - the UTF-16 surrogate range U+D800..U+DFFF
 *   - the noncharacters U+FDD0..U+FDEF
 *   - the two noncharacters at the end of each plane, U+nFFFE and U+nFFFF
 */
static int utf8_unichar_valid_range(int unichar)
{
	/* outside the Unicode code space */
	if (unichar < 0 || unichar > 0x10ffff)
		return 0;

	/* surrogates are reserved for UTF-16 and never valid on their own */
	if (unichar >= 0xd800 && unichar <= 0xdfff)
		return 0;

	/* contiguous noncharacter block */
	if (unichar >= 0xfdd0 && unichar <= 0xfdef)
		return 0;

	/* last two code points of every plane (xxFFFE and xxFFFF) */
	if ((unichar & 0xfffe) == 0xfffe)
		return 0;

	return 1;
}
118
/*
 * Validate the single encoded character starting at str.
 *
 * Returns its length in bytes when it is plain ascii or a well-formed,
 * shortest-form UTF-8 sequence whose value passes the range check;
 * returns -1 otherwise.
 */
static int utf8_encoded_valid_unichar(const char *str)
{
	const int expected = utf8_encoded_expected_len(str);
	int value;
	int pos = 0;

	/* first byte cannot start a sequence */
	if (expected == 0)
		return -1;

	/* a single ascii byte needs no further checks */
	if (expected == 1)
		return 1;

	/*
	 * Every byte of a multi-byte sequence has its top bit set. A NUL
	 * terminator fails this test, so a sequence cut short by the end of
	 * the string is rejected here before any later byte is looked at.
	 */
	while (pos < expected) {
		if (!(str[pos] & 0x80))
			return -1;
		pos++;
	}

	value = utf8_encoded_to_unichar(str);

	/*
	 * Reject overlong forms: the value must need exactly as many bytes
	 * as were used. A decode failure (-1) maps to length 1 and is
	 * therefore rejected by the same comparison, since expected >= 2.
	 */
	if (utf8_unichar_to_encoded_len(value) != expected)
		return -1;

	/* the decoded value itself must be acceptable */
	if (!utf8_unichar_valid_range(value))
		return -1;

	return expected;
}
151
/*
 * Sanitize str in place: digits, ascii letters, the punctuation listed
 * below and valid multi-byte UTF-8 sequences are kept; every other byte
 * is overwritten with '_'.
 *
 * Returns the number of bytes that were replaced.
 */
static int replace_untrusted_chars(char *str)
{
	static const char allowed_punct[] = " #$%+-./:=?@_,";
	char *p = str;
	int count = 0;

	while (*p != '\0') {
		const char ch = *p;
		int seq_len;

		if ((ch >= '0' && ch <= '9') ||
		    (ch >= 'A' && ch <= 'Z') ||
		    (ch >= 'a' && ch <= 'z') ||
		    strchr(allowed_punct, ch)) {
			/* whitelisted printable ascii */
			p++;
		} else if ((seq_len = utf8_encoded_valid_unichar(p)) > 1) {
			/* well-formed multi-byte utf8: skip it as a whole */
			p += seq_len;
		} else {
			/* anything else is replaced, one byte at a time */
			*p = '_';
			p++;
			count++;
		}
	}

	return count;
}
184