xref: /third_party/pulseaudio/src/pulse/utf8.c (revision 53a5a1b3)
/***
  This file is part of PulseAudio.

  Copyright 2006 Lennart Poettering
  Copyright 2006 Pierre Ossman <ossman@cendio.se> for Cendio AB

  PulseAudio is free software; you can redistribute it and/or modify
  it under the terms of the GNU Lesser General Public License as
  published by the Free Software Foundation; either version 2.1 of the
  License, or (at your option) any later version.

  PulseAudio is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with PulseAudio; if not, see <http://www.gnu.org/licenses/>.
***/
20
/* This file is based on the GLIB utf8 validation functions. The
 * original license text follows. */
23
/* gutf8.c - Operations on UTF-8 strings.
 *
 * Copyright (C) 1999 Tom Tromey
 * Copyright (C) 2000 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
42
43#ifdef HAVE_CONFIG_H
44#include <config.h>
45#endif
46
47#include <errno.h>
48#include <stdlib.h>
49#include <inttypes.h>
50#include <string.h>
51
52#ifdef HAVE_ICONV
53#include <iconv.h>
54#endif
55
56#include <pulse/xmalloc.h>
57#include <pulsecore/macro.h>
58
59#include "utf8.h"
60
/* Byte that utf8_validate() writes to its output buffer in place of each
 * input byte that does not start a valid UTF-8 sequence. */
#define FILTER_CHAR '_'
62
/*
 * Tell whether a decoded code point is acceptable in validated text.
 *
 * Returns false for values beyond U+10FFFF, for the UTF-16 surrogate range
 * U+D800..U+DFFF, for the noncharacter block U+FDD0..U+FDEF and for any code
 * point whose low 16 bits are FFFE or FFFF; returns true for everything else.
 */
static inline bool is_unicode_valid(uint32_t ch) {

    if (ch >= 0x110000) /* Past the end of the Unicode code space */
        return false;
    if ((ch & 0xFFFFF800) == 0xD800) /* U+D800..U+DFFF: UTF-16 surrogates */
        return false;
    if ((ch >= 0xFDD0) && (ch <= 0xFDEF)) /* Noncharacter block */
        return false;
    /* The last two code points of every plane (U+xxFFFE, U+xxFFFF) are
     * noncharacters. Note that this is not the BOM itself: U+FEFF passes. */
    if ((ch & 0xFFFE) == 0xFFFE)
        return false;

    return true;
}
76
/* Returns true when ch has the bit pattern 10xxxxxx, i.e. the form of a
 * UTF-8 continuation byte. A NUL byte never matches. */
static inline bool is_continuation_char(uint8_t ch) {
    return (ch & 0xc0) == 0x80;
}
82
/* Appends the six payload bits of continuation byte ch to the code point
 * being accumulated in *u_ch: the old value moves up by six bits and the low
 * six bits of ch fill the gap. The caller is expected to have checked ch with
 * is_continuation_char(); the two marker bits are simply masked off here. */
static inline void merge_continuation_char(uint32_t *u_ch, uint8_t ch) {
    *u_ch <<= 6;
    *u_ch |= ch & 0x3f;
}
87
88static char* utf8_validate(const char *str, char *output) {
89    uint32_t val = 0;
90    uint32_t min = 0;
91    const uint8_t *p, *last;
92    int size;
93    uint8_t *o;
94
95    pa_assert(str);
96
97    o = (uint8_t*) output;
98    for (p = (const uint8_t*) str; *p; p++) {
99        if (*p < 128) {
100            if (o)
101                *o = *p;
102        } else {
103            last = p;
104
105            if ((*p & 0xe0) == 0xc0) { /* 110xxxxx two-char seq. */
106                size = 2;
107                min = 128;
108                val = (uint32_t) (*p & 0x1e);
109                goto ONE_REMAINING;
110            } else if ((*p & 0xf0) == 0xe0) { /* 1110xxxx three-char seq.*/
111                size = 3;
112                min = (1 << 11);
113                val = (uint32_t) (*p & 0x0f);
114                goto TWO_REMAINING;
115            } else if ((*p & 0xf8) == 0xf0) { /* 11110xxx four-char seq */
116                size = 4;
117                min = (1 << 16);
118                val = (uint32_t) (*p & 0x07);
119            } else
120                goto error;
121
122            p++;
123            if (!is_continuation_char(*p))
124                goto error;
125            merge_continuation_char(&val, *p);
126
127TWO_REMAINING:
128            p++;
129            if (!is_continuation_char(*p))
130                goto error;
131            merge_continuation_char(&val, *p);
132
133ONE_REMAINING:
134            p++;
135            if (!is_continuation_char(*p))
136                goto error;
137            merge_continuation_char(&val, *p);
138
139            if (val < min)
140                goto error;
141
142            if (!is_unicode_valid(val))
143                goto error;
144
145            if (o) {
146                memcpy(o, last, (size_t) size);
147                o += size;
148            }
149
150            continue;
151
152error:
153            if (o) {
154                *o = FILTER_CHAR;
155                p = last; /* We retry at the next character */
156            } else
157                goto failure;
158        }
159
160        if (o)
161            o++;
162    }
163
164    if (o) {
165        *o = '\0';
166        return output;
167    }
168
169    return (char*) str;
170
171failure:
172    return NULL;
173}
174
/* Checks whether str is valid UTF-8 by running utf8_validate() without an
 * output buffer. Returns str itself (no copy is made) when it is valid and
 * NULL otherwise. str must not be NULL. */
char* pa_utf8_valid (const char *str) {
    return utf8_validate(str, NULL);
}
178
/* Returns a newly allocated copy of str in which every byte that does not
 * begin a valid UTF-8 sequence has been replaced by FILTER_CHAR. The buffer
 * comes from pa_xmalloc(); elsewhere in this file such buffers are released
 * with pa_xfree(). str must not be NULL. */
char* pa_utf8_filter (const char *str) {
    char *new_str;

    pa_assert(str);

    /* utf8_validate() emits at most one byte per input byte, so a buffer the
     * size of the input plus its terminator is always large enough. */
    new_str = pa_xmalloc(strlen(str) + 1);
    return utf8_validate(str, new_str);
}
186
187#ifdef HAVE_ICONV
188
189static char* iconv_simple(const char *str, const char *to, const char *from) {
190    char *new_str;
191    size_t len, inlen;
192    iconv_t cd;
193    ICONV_CONST char *inbuf;
194    char *outbuf;
195    size_t res, inbytes, outbytes;
196
197    pa_assert(str);
198    pa_assert(to);
199    pa_assert(from);
200
201    cd = iconv_open(to, from);
202    if (cd == (iconv_t)-1)
203        return NULL;
204
205    inlen = len = strlen(str) + 1;
206    new_str = pa_xmalloc(len);
207
208    for (;;) {
209        inbuf = (ICONV_CONST char*) str; /* Brain dead prototype for iconv() */
210        inbytes = inlen;
211        outbuf = new_str;
212        outbytes = len;
213
214        res = iconv(cd, &inbuf, &inbytes, &outbuf, &outbytes);
215
216        if (res != (size_t)-1)
217            break;
218
219        if (errno != E2BIG) {
220            pa_xfree(new_str);
221            new_str = NULL;
222            break;
223        }
224
225        pa_assert(inbytes != 0);
226
227        len += inbytes;
228        new_str = pa_xrealloc(new_str, len);
229    }
230
231    iconv_close(cd);
232
233    return new_str;
234}
235
/* Converts str from UTF-8 to the charset named "" via iconv_simple().
 * NOTE(review): "" is presumably iconv_open()'s name for the current locale's
 * charset -- confirm for the target platform.
 * Returns a newly allocated string, or NULL if the conversion fails. */
char* pa_utf8_to_locale (const char *str) {
    return iconv_simple(str, "", "UTF-8");
}
239
/* Converts str from the charset named "" to UTF-8 via iconv_simple().
 * NOTE(review): "" is presumably iconv_open()'s name for the current locale's
 * charset -- confirm for the target platform.
 * Returns a newly allocated string, or NULL if the conversion fails. */
char* pa_locale_to_utf8 (const char *str) {
    return iconv_simple(str, "UTF-8", "");
}
243
244#else
245
/* Variant used when iconv is not available (HAVE_ICONV undefined): no real
 * charset conversion is done. The result is the copy produced by
 * pa_ascii_filter(), i.e. str with every byte >= 128 removed. */
char* pa_utf8_to_locale (const char *str) {
    pa_assert(str);

    return pa_ascii_filter(str);
}
251
/* Variant used when iconv is not available (HAVE_ICONV undefined): no real
 * charset conversion is done. If str already is valid UTF-8 a duplicate made
 * with pa_xstrdup() is returned; otherwise the result is NULL. */
char* pa_locale_to_utf8 (const char *str) {
    pa_assert(str);

    if (!pa_utf8_valid(str))
        return NULL;

    return pa_xstrdup(str);
}
260
261#endif
262
/* Checks that str consists of 7-bit bytes only. Returns str itself (no copy
 * is made) if no byte is >= 128, and NULL as soon as one is found. str must
 * not be NULL. */
char *pa_ascii_valid(const char *str) {
    const char *p;
    pa_assert(str);

    for (p = str; *p; p++) {
        /* Compare as unsigned: plain char may be signed, which would make
         * bytes >= 128 look negative. */
        if ((unsigned char) *p >= 128)
            return NULL;
    }

    return (char*) str;
}
273
/* Returns a copy of str, made with pa_xstrdup(), from which every byte >= 128
 * has been removed (dropped, not replaced). str must not be NULL. */
char *pa_ascii_filter(const char *str) {
    char *r, *s, *d;
    pa_assert(str);

    r = pa_xstrdup(str);

    /* Compact the copy in place: s visits every byte, d trails behind and
     * only advances when a 7-bit byte is kept. */
    for (s = r, d = r; *s; s++) {
        if ((unsigned char) *s < 128)
            *(d++) = *s;
    }

    *d = 0;

    return r;
}
288