1/*** 2 This file is part of PulseAudio. 3 4 Copyright 2006 Lennart Poettering 5 Copyright 2006 Pierre Ossman <ossman@cendio.se> for Cendio AB 6 7 PulseAudio is free software; you can redistribute it and/or modify 8 it under the terms of the GNU Lesser General Public License as 9 published by the Free Software Foundation; either version 2.1 of the 10 License, or (at your option) any later version. 11 12 PulseAudio is distributed in the hope that it will be useful, but 13 WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 Lesser General Public License for more details. 16 17 You should have received a copy of the GNU Lesser General Public 18 License along with PulseAudio; if not, see <http://www.gnu.org/licenses/>. 19***/ 20 21/* This file is based on the GLIB utf8 validation functions. The 22 * original license text follows. */ 23 24/* gutf8.c - Operations on UTF-8 strings. 25 * 26 * Copyright (C) 1999 Tom Tromey 27 * Copyright (C) 2000 Red Hat, Inc. 28 * 29 * This library is free software; you can redistribute it and/or 30 * modify it under the terms of the GNU Lesser General Public 31 * License as published by the Free Software Foundation; either 32 * version 2 of the License, or (at your option) any later version. 33 * 34 * This library is distributed in the hope that it will be useful, 35 * but WITHOUT ANY WARRANTY; without even the implied warranty of 36 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 37 * Lesser General Public License for more details. 38 * 39 * You should have received a copy of the GNU Lesser General Public 40 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 41 */ 42 43#ifdef HAVE_CONFIG_H 44#include <config.h> 45#endif 46 47#include <errno.h> 48#include <stdlib.h> 49#include <inttypes.h> 50#include <string.h> 51 52#ifdef HAVE_ICONV 53#include <iconv.h> 54#endif 55 56#include <pulse/xmalloc.h> 57#include <pulsecore/macro.h> 58 59#include "utf8.h" 60 61#define FILTER_CHAR '_' 62 63static inline bool is_unicode_valid(uint32_t ch) { 64 65 if (ch >= 0x110000) /* End of unicode space */ 66 return false; 67 if ((ch & 0xFFFFF800) == 0xD800) /* Reserved area for UTF-16 */ 68 return false; 69 if ((ch >= 0xFDD0) && (ch <= 0xFDEF)) /* Reserved */ 70 return false; 71 if ((ch & 0xFFFE) == 0xFFFE) /* BOM (Byte Order Mark) */ 72 return false; 73 74 return true; 75} 76 77static inline bool is_continuation_char(uint8_t ch) { 78 if ((ch & 0xc0) != 0x80) /* 10xxxxxx */ 79 return false; 80 return true; 81} 82 83static inline void merge_continuation_char(uint32_t *u_ch, uint8_t ch) { 84 *u_ch <<= 6; 85 *u_ch |= ch & 0x3f; 86} 87 88static char* utf8_validate(const char *str, char *output) { 89 uint32_t val = 0; 90 uint32_t min = 0; 91 const uint8_t *p, *last; 92 int size; 93 uint8_t *o; 94 95 pa_assert(str); 96 97 o = (uint8_t*) output; 98 for (p = (const uint8_t*) str; *p; p++) { 99 if (*p < 128) { 100 if (o) 101 *o = *p; 102 } else { 103 last = p; 104 105 if ((*p & 0xe0) == 0xc0) { /* 110xxxxx two-char seq. */ 106 size = 2; 107 min = 128; 108 val = (uint32_t) (*p & 0x1e); 109 goto ONE_REMAINING; 110 } else if ((*p & 0xf0) == 0xe0) { /* 1110xxxx three-char seq.*/ 111 size = 3; 112 min = (1 << 11); 113 val = (uint32_t) (*p & 0x0f); 114 goto TWO_REMAINING; 115 } else if ((*p & 0xf8) == 0xf0) { /* 11110xxx four-char seq */ 116 size = 4; 117 min = (1 << 16); 118 val = (uint32_t) (*p & 0x07); 119 } else 120 goto error; 121 122 p++; 123 if (!is_continuation_char(*p)) 124 goto error; 125 merge_continuation_char(&val, *p); 126 127TWO_REMAINING: 128 p++; 129 if (!is_continuation_char(*p)) 130 goto error; 131 merge_continuation_char(&val, *p); 132 133ONE_REMAINING: 134 p++; 135 if (!is_continuation_char(*p)) 136 goto error; 137 merge_continuation_char(&val, *p); 138 139 if (val < min) 140 goto error; 141 142 if (!is_unicode_valid(val)) 143 goto error; 144 145 if (o) { 146 memcpy(o, last, (size_t) size); 147 o += size; 148 } 149 150 continue; 151 152error: 153 if (o) { 154 *o = FILTER_CHAR; 155 p = last; /* We retry at the next character */ 156 } else 157 goto failure; 158 } 159 160 if (o) 161 o++; 162 } 163 164 if (o) { 165 *o = '\0'; 166 return output; 167 } 168 169 return (char*) str; 170 171failure: 172 return NULL; 173} 174 175char* pa_utf8_valid (const char *str) { 176 return utf8_validate(str, NULL); 177} 178 179char* pa_utf8_filter (const char *str) { 180 char *new_str; 181 182 pa_assert(str); 183 new_str = pa_xmalloc(strlen(str) + 1); 184 return utf8_validate(str, new_str); 185} 186 187#ifdef HAVE_ICONV 188 189static char* iconv_simple(const char *str, const char *to, const char *from) { 190 char *new_str; 191 size_t len, inlen; 192 iconv_t cd; 193 ICONV_CONST char *inbuf; 194 char *outbuf; 195 size_t res, inbytes, outbytes; 196 197 pa_assert(str); 198 pa_assert(to); 199 pa_assert(from); 200 201 cd = iconv_open(to, from); 202 if (cd == (iconv_t)-1) 203 return NULL; 204 205 inlen = len = strlen(str) + 1; 206 new_str = pa_xmalloc(len); 207 208 for (;;) { 209 inbuf = (ICONV_CONST char*) str; /* Brain dead prototype for iconv() */ 210 inbytes = inlen; 211 outbuf = new_str; 212 outbytes = len; 213 214 res = iconv(cd, &inbuf, &inbytes, &outbuf, &outbytes); 215 216 if (res != (size_t)-1) 217 break; 218 219 if (errno != E2BIG) { 220 pa_xfree(new_str); 221 new_str = NULL; 222 break; 223 } 224 225 pa_assert(inbytes != 0); 226 227 len += inbytes; 228 new_str = pa_xrealloc(new_str, len); 229 } 230 231 iconv_close(cd); 232 233 return new_str; 234} 235 236char* pa_utf8_to_locale (const char *str) { 237 return iconv_simple(str, "", "UTF-8"); 238} 239 240char* pa_locale_to_utf8 (const char *str) { 241 return iconv_simple(str, "UTF-8", ""); 242} 243 244#else 245 246char* pa_utf8_to_locale (const char *str) { 247 pa_assert(str); 248 249 return pa_ascii_filter(str); 250} 251 252char* pa_locale_to_utf8 (const char *str) { 253 pa_assert(str); 254 255 if (pa_utf8_valid(str)) 256 return pa_xstrdup(str); 257 258 return NULL; 259} 260 261#endif 262 263char *pa_ascii_valid(const char *str) { 264 const char *p; 265 pa_assert(str); 266 267 for (p = str; *p; p++) 268 if ((unsigned char) *p >= 128) 269 return NULL; 270 271 return (char*) str; 272} 273 274char *pa_ascii_filter(const char *str) { 275 char *r, *s, *d; 276 pa_assert(str); 277 278 r = pa_xstrdup(str); 279 280 for (s = r, d = r; *s; s++) 281 if ((unsigned char) *s < 128) 282 *(d++) = *s; 283 284 *d = 0; 285 286 return r; 287} 288