1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1997-2011,2014-2015 International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * Date Name Description 9 * 06/21/00 aliu Creation. 10 ******************************************************************************* 11 */ 12 13 #ifndef UTRANS_H 14 #define UTRANS_H 15 16 #include "unicode/utypes.h" 17 18 #if !UCONFIG_NO_TRANSLITERATION 19 20 #include "unicode/parseerr.h" 21 #include "unicode/uenum.h" 22 #include "unicode/uset.h" 23 24 #if U_SHOW_CPLUSPLUS_API 25 #include "unicode/localpointer.h" 26 #endif // U_SHOW_CPLUSPLUS_API 27 28 /******************************************************************** 29 * General Notes 30 ******************************************************************** 31 */ 32 /** 33 * \file 34 * \brief C API: Transliterator 35 * 36 * <h2> Transliteration </h2> 37 * The data structures and functions described in this header provide 38 * transliteration services. Transliteration services are implemented 39 * as C++ classes. The comments and documentation in this header 40 * assume the reader is familiar with the C++ headers translit.h and 41 * associated documentation. 42 * 43 * A significant but incomplete subset of the C++ transliteration 44 * services are available to C code through this header. In order to 45 * access more complex transliteration services, refer to the C++ 46 * headers and documentation. 47 * 48 * There are two sets of functions for working with transliterator IDs: 49 * 50 * An old, deprecated set uses char * IDs, which works for true and pure 51 * identifiers that these APIs were designed for, 52 * for example "Cyrillic-Latin". 53 * It does not work when the ID contains filters ("[:Script=Cyrl:]") 54 * or even a complete set of rules because then the ID string contains more 55 * than just "invariant" characters (see utypes.h). 56 * 57 * A new set of functions replaces the old ones and uses UChar * IDs, 58 * paralleling the UnicodeString IDs in the C++ API. (New in ICU 2.8.) 59 */ 60 61 /******************************************************************** 62 * Data Structures 63 ********************************************************************/ 64 65 /** 66 * An opaque transliterator for use in C. Open with utrans_openxxx() 67 * and close with utrans_close() when done. Equivalent to the C++ class 68 * Transliterator and its subclasses. 69 * @see Transliterator 70 * @stable ICU 2.0 71 */ 72 typedef void* UTransliterator; 73 74 /** 75 * Direction constant indicating the direction in a transliterator, 76 * e.g., the forward or reverse rules of a RuleBasedTransliterator. 77 * Specified when a transliterator is opened. An "A-B" transliterator 78 * transliterates A to B when operating in the forward direction, and 79 * B to A when operating in the reverse direction. 80 * @stable ICU 2.0 81 */ 82 typedef enum UTransDirection { 83 84 /** 85 * UTRANS_FORWARD means from <source> to <target> for a 86 * transliterator with ID <source>-<target>. For a transliterator 87 * opened using a rule, it means forward direction rules, e.g., 88 * "A > B". 89 */ 90 UTRANS_FORWARD, 91 92 /** 93 * UTRANS_REVERSE means from <target> to <source> for a 94 * transliterator with ID <source>-<target>. For a transliterator 95 * opened using a rule, it means reverse direction rules, e.g., 96 * "A < B". 97 */ 98 UTRANS_REVERSE 99 100 } UTransDirection; 101 102 /** 103 * Position structure for utrans_transIncremental() incremental 104 * transliteration. This structure defines two substrings of the text 105 * being transliterated. The first region, [contextStart, 106 * contextLimit), defines what characters the transliterator will read 107 * as context. The second region, [start, limit), defines what 108 * characters will actually be transliterated. The second region 109 * should be a subset of the first. 110 * 111 * <p>After a transliteration operation, some of the indices in this 112 * structure will be modified. See the field descriptions for 113 * details. 114 * 115 * <p>contextStart <= start <= limit <= contextLimit 116 * 117 * <p>Note: All index values in this structure must be at code point 118 * boundaries. That is, none of them may occur between two code units 119 * of a surrogate pair. If any index does split a surrogate pair, 120 * results are unspecified. 121 * 122 * @stable ICU 2.0 123 */ 124 typedef struct UTransPosition { 125 126 /** 127 * Beginning index, inclusive, of the context to be considered for 128 * a transliteration operation. The transliterator will ignore 129 * anything before this index. INPUT/OUTPUT parameter: This parameter 130 * is updated by a transliteration operation to reflect the maximum 131 * amount of antecontext needed by a transliterator. 132 * @stable ICU 2.4 133 */ 134 int32_t contextStart; 135 136 /** 137 * Ending index, exclusive, of the context to be considered for a 138 * transliteration operation. The transliterator will ignore 139 * anything at or after this index. INPUT/OUTPUT parameter: This 140 * parameter is updated to reflect changes in the length of the 141 * text, but points to the same logical position in the text. 142 * @stable ICU 2.4 143 */ 144 int32_t contextLimit; 145 146 /** 147 * Beginning index, inclusive, of the text to be transliterated. 148 * INPUT/OUTPUT parameter: This parameter is advanced past 149 * characters that have already been transliterated by a 150 * transliteration operation. 151 * @stable ICU 2.4 152 */ 153 int32_t start; 154 155 /** 156 * Ending index, exclusive, of the text to be transliterated. 157 * INPUT/OUTPUT parameter: This parameter is updated to reflect 158 * changes in the length of the text, but points to the same 159 * logical position in the text. 160 * @stable ICU 2.4 161 */ 162 int32_t limit; 163 164 } UTransPosition; 165 166 /******************************************************************** 167 * General API 168 ********************************************************************/ 169 170 /** 171 * Open a custom transliterator, given a custom rules string 172 * OR 173 * a system transliterator, given its ID. 174 * Any non-NULL result from this function should later be closed with 175 * utrans_close(). 176 * 177 * @param id a valid transliterator ID 178 * @param idLength the length of the ID string, or -1 if NUL-terminated 179 * @param dir the desired direction 180 * @param rules the transliterator rules. See the C++ header rbt.h for 181 * rules syntax. If NULL then a system transliterator matching 182 * the ID is returned. 183 * @param rulesLength the length of the rules, or -1 if the rules 184 * are NUL-terminated. 185 * @param parseError a pointer to a UParseError struct to receive the details 186 * of any parsing errors. This parameter may be NULL if no 187 * parsing error details are desired. 188 * @param pErrorCode a pointer to the UErrorCode 189 * @return a transliterator pointer that may be passed to other 190 * utrans_xxx() functions, or NULL if the open call fails. 191 * @stable ICU 2.8 192 */ 193 U_CAPI UTransliterator* U_EXPORT2 194 utrans_openU(const UChar *id, 195 int32_t idLength, 196 UTransDirection dir, 197 const UChar *rules, 198 int32_t rulesLength, 199 UParseError *parseError, 200 UErrorCode *pErrorCode); 201 202 /** 203 * Open an inverse of an existing transliterator. For this to work, 204 * the inverse must be registered with the system. For example, if 205 * the Transliterator "A-B" is opened, and then its inverse is opened, 206 * the result is the Transliterator "B-A", if such a transliterator is 207 * registered with the system. Otherwise the result is NULL and a 208 * failing UErrorCode is set. Any non-NULL result from this function 209 * should later be closed with utrans_close(). 210 * 211 * @param trans the transliterator to open the inverse of. 212 * @param status a pointer to the UErrorCode 213 * @return a pointer to a newly-opened transliterator that is the 214 * inverse of trans, or NULL if the open call fails. 215 * @stable ICU 2.0 216 */ 217 U_CAPI UTransliterator* U_EXPORT2 218 utrans_openInverse(const UTransliterator* trans, 219 UErrorCode* status); 220 221 /** 222 * Create a copy of a transliterator. Any non-NULL result from this 223 * function should later be closed with utrans_close(). 224 * 225 * @param trans the transliterator to be copied. 226 * @param status a pointer to the UErrorCode 227 * @return a transliterator pointer that may be passed to other 228 * utrans_xxx() functions, or NULL if the clone call fails. 229 * @stable ICU 2.0 230 */ 231 U_CAPI UTransliterator* U_EXPORT2 232 utrans_clone(const UTransliterator* trans, 233 UErrorCode* status); 234 235 /** 236 * Close a transliterator. Any non-NULL pointer returned by 237 * utrans_openXxx() or utrans_clone() should eventually be closed. 238 * @param trans the transliterator to be closed. 239 * @stable ICU 2.0 240 */ 241 U_CAPI void U_EXPORT2 242 utrans_close(UTransliterator* trans); 243 244 #if U_SHOW_CPLUSPLUS_API 245 246 U_NAMESPACE_BEGIN 247 248 /** 249 * \class LocalUTransliteratorPointer 250 * "Smart pointer" class, closes a UTransliterator via utrans_close(). 251 * For most methods see the LocalPointerBase base class. 252 * 253 * @see LocalPointerBase 254 * @see LocalPointer 255 * @stable ICU 4.4 256 */ 257 U_DEFINE_LOCAL_OPEN_POINTER(LocalUTransliteratorPointer, UTransliterator, utrans_close); 258 259 U_NAMESPACE_END 260 261 #endif 262 263 /** 264 * Return the programmatic identifier for this transliterator. 265 * If this identifier is passed to utrans_openU(), it will open 266 * a transliterator equivalent to this one, if the ID has been 267 * registered. 268 * 269 * @param trans the transliterator to return the ID of. 270 * @param resultLength pointer to an output variable receiving the length 271 * of the ID string; can be NULL 272 * @return the NUL-terminated ID string. This pointer remains 273 * valid until utrans_close() is called on this transliterator. 274 * 275 * @stable ICU 2.8 276 */ 277 U_CAPI const UChar * U_EXPORT2 278 utrans_getUnicodeID(const UTransliterator *trans, 279 int32_t *resultLength); 280 281 /** 282 * Register an open transliterator with the system. When 283 * utrans_open() is called with an ID string that is equal to that 284 * returned by utrans_getID(adoptedTrans,...), then 285 * utrans_clone(adoptedTrans,...) is returned. 286 * 287 * <p>NOTE: After this call the system owns the adoptedTrans and will 288 * close it. The user must not call utrans_close() on adoptedTrans. 289 * 290 * @param adoptedTrans a transliterator, typically the result of 291 * utrans_openRules(), to be registered with the system. 292 * @param status a pointer to the UErrorCode 293 * @stable ICU 2.0 294 */ 295 U_CAPI void U_EXPORT2 296 utrans_register(UTransliterator* adoptedTrans, 297 UErrorCode* status); 298 299 /** 300 * Unregister a transliterator from the system. After this call the 301 * system will no longer recognize the given ID when passed to 302 * utrans_open(). If the ID is invalid then nothing is done. 303 * 304 * @param id an ID to unregister 305 * @param idLength the length of id, or -1 if id is zero-terminated 306 * @stable ICU 2.8 307 */ 308 U_CAPI void U_EXPORT2 309 utrans_unregisterID(const UChar* id, int32_t idLength); 310 311 /** 312 * Set the filter used by a transliterator. A filter can be used to 313 * make the transliterator pass certain characters through untouched. 314 * The filter is expressed using a UnicodeSet pattern. If the 315 * filterPattern is NULL or the empty string, then the transliterator 316 * will be reset to use no filter. 317 * 318 * @param trans the transliterator 319 * @param filterPattern a pattern string, in the form accepted by 320 * UnicodeSet, specifying which characters to apply the 321 * transliteration to. May be NULL or the empty string to indicate no 322 * filter. 323 * @param filterPatternLen the length of filterPattern, or -1 if 324 * filterPattern is zero-terminated 325 * @param status a pointer to the UErrorCode 326 * @see UnicodeSet 327 * @stable ICU 2.0 328 */ 329 U_CAPI void U_EXPORT2 330 utrans_setFilter(UTransliterator* trans, 331 const UChar* filterPattern, 332 int32_t filterPatternLen, 333 UErrorCode* status); 334 335 /** 336 * Return the number of system transliterators. 337 * It is recommended to use utrans_openIDs() instead. 338 * 339 * @return the number of system transliterators. 340 * @stable ICU 2.0 341 */ 342 U_CAPI int32_t U_EXPORT2 343 utrans_countAvailableIDs(void); 344 345 /** 346 * Return a UEnumeration for the available transliterators. 347 * 348 * @param pErrorCode Pointer to the UErrorCode in/out parameter. 349 * @return UEnumeration for the available transliterators. 350 * Close with uenum_close(). 351 * 352 * @stable ICU 2.8 353 */ 354 U_CAPI UEnumeration * U_EXPORT2 355 utrans_openIDs(UErrorCode *pErrorCode); 356 357 /******************************************************************** 358 * Transliteration API 359 ********************************************************************/ 360 361 /** 362 * Transliterate a segment of a UChar* string. The string is passed 363 * in in a UChar* buffer. The string is modified in place. If the 364 * result is longer than textCapacity, it is truncated. The actual 365 * length of the result is returned in *textLength, if textLength is 366 * non-NULL. *textLength may be greater than textCapacity, but only 367 * textCapacity UChars will be written to *text, including the zero 368 * terminator. 369 * 370 * @param trans the transliterator 371 * @param text a pointer to a buffer containing the text to be 372 * transliterated on input and the result text on output. 373 * @param textLength a pointer to the length of the string in text. 374 * If the length is -1 then the string is assumed to be 375 * zero-terminated. Upon return, the new length is stored in 376 * *textLength. If textLength is NULL then the string is assumed to 377 * be zero-terminated. 378 * @param textCapacity the length of the text buffer 379 * @param start the beginning index, inclusive; <code>0 <= start <= 380 * limit</code>. 381 * @param limit pointer to the ending index, exclusive; <code>start <= 382 * limit <= repFunc->length(rep)</code>. Upon return, *limit will 383 * contain the new limit index. The text previously occupying 384 * <code>[start, limit)</code> has been transliterated, possibly to a 385 * string of a different length, at <code>[start, 386 * </code><em>new-limit</em><code>)</code>, where <em>new-limit</em> 387 * is the return value. 388 * @param status a pointer to the UErrorCode 389 * @stable ICU 2.0 390 */ 391 U_CAPI void U_EXPORT2 392 utrans_transUChars(const UTransliterator* trans, 393 UChar* text, 394 int32_t* textLength, 395 int32_t textCapacity, 396 int32_t start, 397 int32_t* limit, 398 UErrorCode* status); 399 400 /** 401 * Transliterate the portion of the UChar* text buffer that can be 402 * transliterated unambiguously. See utrans_transIncremental(). The 403 * string is passed in in a UChar* buffer. The string is modified in 404 * place. If the result is longer than textCapacity, it is truncated. 405 * The actual length of the result is returned in *textLength, if 406 * textLength is non-NULL. *textLength may be greater than 407 * textCapacity, but only textCapacity UChars will be written to 408 * *text, including the zero terminator. See utrans_transIncremental() 409 * for usage details. 410 * 411 * @param trans the transliterator 412 * @param text a pointer to a buffer containing the text to be 413 * transliterated on input and the result text on output. 414 * @param textLength a pointer to the length of the string in text. 415 * If the length is -1 then the string is assumed to be 416 * zero-terminated. Upon return, the new length is stored in 417 * *textLength. If textLength is NULL then the string is assumed to 418 * be zero-terminated. 419 * @param textCapacity the length of the text buffer 420 * @param pos a struct containing the start and limit indices of the 421 * text to be read and the text to be transliterated 422 * @param status a pointer to the UErrorCode 423 * @see utrans_transIncremental 424 * @stable ICU 2.0 425 */ 426 U_CAPI void U_EXPORT2 427 utrans_transIncrementalUChars(const UTransliterator* trans, 428 UChar* text, 429 int32_t* textLength, 430 int32_t textCapacity, 431 UTransPosition* pos, 432 UErrorCode* status); 433 434 /** 435 * Create a rule string that can be passed to utrans_openU to recreate this 436 * transliterator. 437 * 438 * @param trans The transliterator 439 * @param escapeUnprintable if true then convert unprintable characters to their 440 * hex escape representations, \\uxxxx or \\Uxxxxxxxx. 441 * Unprintable characters are those other than 442 * U+000A, U+0020..U+007E. 443 * @param result A pointer to a buffer to receive the rules. 444 * @param resultLength The maximum size of result. 445 * @param status A pointer to the UErrorCode. In case of error status, the 446 * contents of result are undefined. 447 * @return int32_t The length of the rule string (may be greater than resultLength, 448 * in which case an error is returned). 449 * @stable ICU 53 450 */ 451 U_CAPI int32_t U_EXPORT2 452 utrans_toRules( const UTransliterator* trans, 453 UBool escapeUnprintable, 454 UChar* result, int32_t resultLength, 455 UErrorCode* status); 456 457 /** 458 * Returns the set of all characters that may be modified in the input text by 459 * this UTransliterator, optionally ignoring the transliterator's current filter. 460 * @param trans The transliterator. 461 * @param ignoreFilter If false, the returned set incorporates the 462 * UTransliterator's current filter; if the filter is changed, 463 * the return value of this function will change. If true, the 464 * returned set ignores the effect of the UTransliterator's 465 * current filter. 466 * @param fillIn Pointer to a USet object to receive the modifiable characters 467 * set. Previous contents of fillIn are lost. <em>If fillIn is 468 * NULL, then a new USet is created and returned. The caller 469 * owns the result and must dispose of it by calling uset_close.</em> 470 * @param status A pointer to the UErrorCode. 471 * @return USet* Either fillIn, or if fillIn is NULL, a pointer to a 472 * newly-allocated USet that the user must close. In case of 473 * error, NULL is returned. 474 * @stable ICU 53 475 */ 476 U_CAPI USet* U_EXPORT2 477 utrans_getSourceSet(const UTransliterator* trans, 478 UBool ignoreFilter, 479 USet* fillIn, 480 UErrorCode* status); 481 482 /* deprecated API ----------------------------------------------------------- */ 483 484 #ifndef U_HIDE_DEPRECATED_API 485 486 /* see utrans.h documentation for why these functions are deprecated */ 487 488 489 490 #endif /* U_HIDE_DEPRECATED_API */ 491 492 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 493 494 #endif 495