// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*   Copyright (C) 1997-2011,2014-2015 International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   Date        Name        Description
*   06/21/00    aliu        Creation.
*******************************************************************************
*/

13#ifndef UTRANS_H
14#define UTRANS_H
15
16#include "unicode/utypes.h"
17
18#if !UCONFIG_NO_TRANSLITERATION
19
20#include "unicode/parseerr.h"
21#include "unicode/uenum.h"
22#include "unicode/uset.h"
23
24#if U_SHOW_CPLUSPLUS_API
25#include "unicode/localpointer.h"
26#endif   // U_SHOW_CPLUSPLUS_API
27
/********************************************************************
 * General Notes
 ********************************************************************
 */
/**
 * \file
 * \brief C API: Transliterator
 *
 * <h2> Transliteration </h2>
 * The data structures and functions described in this header provide
 * transliteration services.  Transliteration services are implemented
 * as C++ classes.  The comments and documentation in this header
 * assume the reader is familiar with the C++ headers translit.h and
 * associated documentation.
 *
 * A significant but incomplete subset of the C++ transliteration
 * services is available to C code through this header.  In order to
 * access more complex transliteration services, refer to the C++
 * headers and documentation.
 *
 * There are two sets of functions for working with transliterator IDs:
 *
 * An old, deprecated set uses char * IDs, which works only for the
 * pure identifiers that these APIs were designed for,
 * for example "Cyrillic-Latin".
 * It does not work when the ID contains filters ("[:Script=Cyrl:]")
 * or even a complete set of rules because then the ID string contains more
 * than just "invariant" characters (see utypes.h).
 *
 * A new set of functions replaces the old ones and uses UChar * IDs,
 * paralleling the UnicodeString IDs in the C++ API. (New in ICU 2.8.)
 */

/********************************************************************
 * Data Structures
 ********************************************************************/

/**
 * An opaque transliterator for use in C.  Open with utrans_openXxx()
 * and close with utrans_close() when done.  Equivalent to the C++ class
 * Transliterator and its subclasses.
 * @see Transliterator
 * @stable ICU 2.0
 */
typedef void* UTransliterator;

/**
 * Direction constant indicating the direction in a transliterator,
 * e.g., the forward or reverse rules of a RuleBasedTransliterator.
 * Specified when a transliterator is opened.  An "A-B" transliterator
 * transliterates A to B when operating in the forward direction, and
 * B to A when operating in the reverse direction.
 * @stable ICU 2.0
 */
typedef enum UTransDirection {

    /**
     * UTRANS_FORWARD means from &lt;source&gt; to &lt;target&gt; for a
     * transliterator with ID &lt;source&gt;-&lt;target&gt;.  For a transliterator
     * opened using a rule, it means forward direction rules, e.g.,
     * "A > B".
     */
    UTRANS_FORWARD,

    /**
     * UTRANS_REVERSE means from &lt;target&gt; to &lt;source&gt; for a
     * transliterator with ID &lt;source&gt;-&lt;target&gt;.  For a transliterator
     * opened using a rule, it means reverse direction rules, e.g.,
     * "A < B".
     */
    UTRANS_REVERSE

} UTransDirection;

/**
 * Position structure for utrans_transIncremental() incremental
 * transliteration.  This structure defines two substrings of the text
 * being transliterated.  The first region, [contextStart,
 * contextLimit), defines what characters the transliterator will read
 * as context.  The second region, [start, limit), defines what
 * characters will actually be transliterated.  The second region
 * should be a subset of the first.
 *
 * <p>After a transliteration operation, some of the indices in this
 * structure will be modified.  See the field descriptions for
 * details.
 *
 * <p>contextStart <= start <= limit <= contextLimit
 *
 * <p>Note: All index values in this structure must be at code point
 * boundaries.  That is, none of them may occur between two code units
 * of a surrogate pair.  If any index does split a surrogate pair,
 * results are unspecified.
 *
 * @stable ICU 2.0
 */
typedef struct UTransPosition {

    /**
     * Beginning index, inclusive, of the context to be considered for
     * a transliteration operation.  The transliterator will ignore
     * anything before this index.  INPUT/OUTPUT parameter: This parameter
     * is updated by a transliteration operation to reflect the maximum
     * amount of antecontext needed by a transliterator.
     * @stable ICU 2.4
     */
    int32_t contextStart;

    /**
     * Ending index, exclusive, of the context to be considered for a
     * transliteration operation.  The transliterator will ignore
     * anything at or after this index.  INPUT/OUTPUT parameter: This
     * parameter is updated to reflect changes in the length of the
     * text, but points to the same logical position in the text.
     * @stable ICU 2.4
     */
    int32_t contextLimit;

    /**
     * Beginning index, inclusive, of the text to be transliterated.
     * INPUT/OUTPUT parameter: This parameter is advanced past
     * characters that have already been transliterated by a
     * transliteration operation.
     * @stable ICU 2.4
     */
    int32_t start;

    /**
     * Ending index, exclusive, of the text to be transliterated.
     * INPUT/OUTPUT parameter: This parameter is updated to reflect
     * changes in the length of the text, but points to the same
     * logical position in the text.
     * @stable ICU 2.4
     */
    int32_t limit;

} UTransPosition;

/********************************************************************
 * General API
 ********************************************************************/

170/**
171 * Open a custom transliterator, given a custom rules string
172 * OR
173 * a system transliterator, given its ID.
174 * Any non-NULL result from this function should later be closed with
175 * utrans_close().
176 *
177 * @param id a valid transliterator ID
178 * @param idLength the length of the ID string, or -1 if NUL-terminated
179 * @param dir the desired direction
180 * @param rules the transliterator rules.  See the C++ header rbt.h for
181 *              rules syntax. If NULL then a system transliterator matching
182 *              the ID is returned.
183 * @param rulesLength the length of the rules, or -1 if the rules
184 *                    are NUL-terminated.
185 * @param parseError a pointer to a UParseError struct to receive the details
186 *                   of any parsing errors. This parameter may be NULL if no
187 *                   parsing error details are desired.
188 * @param pErrorCode a pointer to the UErrorCode
189 * @return a transliterator pointer that may be passed to other
190 *         utrans_xxx() functions, or NULL if the open call fails.
191 * @stable ICU 2.8
192 */
193U_CAPI UTransliterator* U_EXPORT2
194utrans_openU(const UChar *id,
195             int32_t idLength,
196             UTransDirection dir,
197             const UChar *rules,
198             int32_t rulesLength,
199             UParseError *parseError,
200             UErrorCode *pErrorCode);
201
202/**
203 * Open an inverse of an existing transliterator.  For this to work,
204 * the inverse must be registered with the system.  For example, if
205 * the Transliterator "A-B" is opened, and then its inverse is opened,
206 * the result is the Transliterator "B-A", if such a transliterator is
207 * registered with the system.  Otherwise the result is NULL and a
208 * failing UErrorCode is set.  Any non-NULL result from this function
209 * should later be closed with utrans_close().
210 *
211 * @param trans the transliterator to open the inverse of.
212 * @param status a pointer to the UErrorCode
213 * @return a pointer to a newly-opened transliterator that is the
214 * inverse of trans, or NULL if the open call fails.
215 * @stable ICU 2.0
216 */
217U_CAPI UTransliterator* U_EXPORT2
218utrans_openInverse(const UTransliterator* trans,
219                   UErrorCode* status);
220
221/**
222 * Create a copy of a transliterator.  Any non-NULL result from this
223 * function should later be closed with utrans_close().
224 *
225 * @param trans the transliterator to be copied.
226 * @param status a pointer to the UErrorCode
227 * @return a transliterator pointer that may be passed to other
228 * utrans_xxx() functions, or NULL if the clone call fails.
229 * @stable ICU 2.0
230 */
231U_CAPI UTransliterator* U_EXPORT2
232utrans_clone(const UTransliterator* trans,
233             UErrorCode* status);
234
235/**
236 * Close a transliterator.  Any non-NULL pointer returned by
237 * utrans_openXxx() or utrans_clone() should eventually be closed.
238 * @param trans the transliterator to be closed.
239 * @stable ICU 2.0
240 */
241U_CAPI void U_EXPORT2
242utrans_close(UTransliterator* trans);
243
#if U_SHOW_CPLUSPLUS_API

U_NAMESPACE_BEGIN

/**
 * \class LocalUTransliteratorPointer
 * "Smart pointer" class, closes a UTransliterator via utrans_close().
 * For most methods see the LocalPointerBase base class.
 *
 * @see LocalPointerBase
 * @see LocalPointer
 * @stable ICU 4.4
 */
U_DEFINE_LOCAL_OPEN_POINTER(LocalUTransliteratorPointer, UTransliterator, utrans_close);

U_NAMESPACE_END

#endif   // U_SHOW_CPLUSPLUS_API

263/**
264 * Return the programmatic identifier for this transliterator.
265 * If this identifier is passed to utrans_openU(), it will open
266 * a transliterator equivalent to this one, if the ID has been
267 * registered.
268 *
269 * @param trans the transliterator to return the ID of.
270 * @param resultLength pointer to an output variable receiving the length
271 *        of the ID string; can be NULL
272 * @return the NUL-terminated ID string. This pointer remains
273 * valid until utrans_close() is called on this transliterator.
274 *
275 * @stable ICU 2.8
276 */
277U_CAPI const UChar * U_EXPORT2
278utrans_getUnicodeID(const UTransliterator *trans,
279                    int32_t *resultLength);
280
281/**
282 * Register an open transliterator with the system.  When
283 * utrans_open() is called with an ID string that is equal to that
284 * returned by utrans_getID(adoptedTrans,...), then
285 * utrans_clone(adoptedTrans,...) is returned.
286 *
287 * <p>NOTE: After this call the system owns the adoptedTrans and will
288 * close it.  The user must not call utrans_close() on adoptedTrans.
289 *
290 * @param adoptedTrans a transliterator, typically the result of
291 * utrans_openRules(), to be registered with the system.
292 * @param status a pointer to the UErrorCode
293 * @stable ICU 2.0
294 */
295U_CAPI void U_EXPORT2
296utrans_register(UTransliterator* adoptedTrans,
297                UErrorCode* status);
298
299/**
300 * Unregister a transliterator from the system.  After this call the
301 * system will no longer recognize the given ID when passed to
302 * utrans_open(). If the ID is invalid then nothing is done.
303 *
304 * @param id an ID to unregister
305 * @param idLength the length of id, or -1 if id is zero-terminated
306 * @stable ICU 2.8
307 */
308U_CAPI void U_EXPORT2
309utrans_unregisterID(const UChar* id, int32_t idLength);
310
311/**
312 * Set the filter used by a transliterator.  A filter can be used to
313 * make the transliterator pass certain characters through untouched.
314 * The filter is expressed using a UnicodeSet pattern.  If the
315 * filterPattern is NULL or the empty string, then the transliterator
316 * will be reset to use no filter.
317 *
318 * @param trans the transliterator
319 * @param filterPattern a pattern string, in the form accepted by
320 * UnicodeSet, specifying which characters to apply the
321 * transliteration to.  May be NULL or the empty string to indicate no
322 * filter.
323 * @param filterPatternLen the length of filterPattern, or -1 if
324 * filterPattern is zero-terminated
325 * @param status a pointer to the UErrorCode
326 * @see UnicodeSet
327 * @stable ICU 2.0
328 */
329U_CAPI void U_EXPORT2
330utrans_setFilter(UTransliterator* trans,
331                 const UChar* filterPattern,
332                 int32_t filterPatternLen,
333                 UErrorCode* status);
334
335/**
336 * Return the number of system transliterators.
337 * It is recommended to use utrans_openIDs() instead.
338 *
339 * @return the number of system transliterators.
340 * @stable ICU 2.0
341 */
342U_CAPI int32_t U_EXPORT2
343utrans_countAvailableIDs(void);
344
345/**
346 * Return a UEnumeration for the available transliterators.
347 *
348 * @param pErrorCode Pointer to the UErrorCode in/out parameter.
349 * @return UEnumeration for the available transliterators.
350 *         Close with uenum_close().
351 *
352 * @stable ICU 2.8
353 */
354U_CAPI UEnumeration * U_EXPORT2
355utrans_openIDs(UErrorCode *pErrorCode);
356
/********************************************************************
 * Transliteration API
 ********************************************************************/

361/**
362 * Transliterate a segment of a UChar* string.  The string is passed
363 * in in a UChar* buffer.  The string is modified in place.  If the
364 * result is longer than textCapacity, it is truncated.  The actual
365 * length of the result is returned in *textLength, if textLength is
366 * non-NULL. *textLength may be greater than textCapacity, but only
367 * textCapacity UChars will be written to *text, including the zero
368 * terminator.
369 *
370 * @param trans the transliterator
371 * @param text a pointer to a buffer containing the text to be
372 * transliterated on input and the result text on output.
373 * @param textLength a pointer to the length of the string in text.
374 * If the length is -1 then the string is assumed to be
375 * zero-terminated.  Upon return, the new length is stored in
376 * *textLength.  If textLength is NULL then the string is assumed to
377 * be zero-terminated.
378 * @param textCapacity the length of the text buffer
379 * @param start the beginning index, inclusive; <code>0 <= start <=
380 * limit</code>.
381 * @param limit pointer to the ending index, exclusive; <code>start <=
382 * limit <= repFunc->length(rep)</code>.  Upon return, *limit will
383 * contain the new limit index.  The text previously occupying
384 * <code>[start, limit)</code> has been transliterated, possibly to a
385 * string of a different length, at <code>[start,
386 * </code><em>new-limit</em><code>)</code>, where <em>new-limit</em>
387 * is the return value.
388 * @param status a pointer to the UErrorCode
389 * @stable ICU 2.0
390 */
391U_CAPI void U_EXPORT2
392utrans_transUChars(const UTransliterator* trans,
393                   UChar* text,
394                   int32_t* textLength,
395                   int32_t textCapacity,
396                   int32_t start,
397                   int32_t* limit,
398                   UErrorCode* status);
399
400/**
401 * Transliterate the portion of the UChar* text buffer that can be
402 * transliterated unambiguously.  See utrans_transIncremental().  The
403 * string is passed in in a UChar* buffer.  The string is modified in
404 * place.  If the result is longer than textCapacity, it is truncated.
405 * The actual length of the result is returned in *textLength, if
406 * textLength is non-NULL. *textLength may be greater than
407 * textCapacity, but only textCapacity UChars will be written to
408 * *text, including the zero terminator.  See utrans_transIncremental()
409 * for usage details.
410 *
411 * @param trans the transliterator
412 * @param text a pointer to a buffer containing the text to be
413 * transliterated on input and the result text on output.
414 * @param textLength a pointer to the length of the string in text.
415 * If the length is -1 then the string is assumed to be
416 * zero-terminated.  Upon return, the new length is stored in
417 * *textLength.  If textLength is NULL then the string is assumed to
418 * be zero-terminated.
419 * @param textCapacity the length of the text buffer
420 * @param pos a struct containing the start and limit indices of the
421 * text to be read and the text to be transliterated
422 * @param status a pointer to the UErrorCode
423 * @see utrans_transIncremental
424 * @stable ICU 2.0
425 */
426U_CAPI void U_EXPORT2
427utrans_transIncrementalUChars(const UTransliterator* trans,
428                              UChar* text,
429                              int32_t* textLength,
430                              int32_t textCapacity,
431                              UTransPosition* pos,
432                              UErrorCode* status);
433
434/**
435 * Create a rule string that can be passed to utrans_openU to recreate this
436 * transliterator.
437 *
438 * @param trans     The transliterator
439 * @param escapeUnprintable if true then convert unprintable characters to their
440 *                  hex escape representations, \\uxxxx or \\Uxxxxxxxx.
441 *                  Unprintable characters are those other than
442 *                  U+000A, U+0020..U+007E.
443 * @param result    A pointer to a buffer to receive the rules.
444 * @param resultLength The maximum size of result.
445 * @param status    A pointer to the UErrorCode. In case of error status, the
446 *                  contents of result are undefined.
447 * @return int32_t   The length of the rule string (may be greater than resultLength,
448 *                  in which case an error is returned).
449 * @stable ICU 53
450 */
451U_CAPI int32_t U_EXPORT2
452utrans_toRules(     const UTransliterator* trans,
453                    UBool escapeUnprintable,
454                    UChar* result, int32_t resultLength,
455                    UErrorCode* status);
456
457/**
458 * Returns the set of all characters that may be modified in the input text by
459 * this UTransliterator, optionally ignoring the transliterator's current filter.
460 * @param trans     The transliterator.
461 * @param ignoreFilter If false, the returned set incorporates the
462 *                  UTransliterator's current filter; if the filter is changed,
463 *                  the return value of this function will change. If true, the
464 *                  returned set ignores the effect of the UTransliterator's
465 *                  current filter.
466 * @param fillIn    Pointer to a USet object to receive the modifiable characters
467 *                  set. Previous contents of fillIn are lost. <em>If fillIn is
468 *                  NULL, then a new USet is created and returned. The caller
469 *                  owns the result and must dispose of it by calling uset_close.</em>
470 * @param status    A pointer to the UErrorCode.
471 * @return USet*    Either fillIn, or if fillIn is NULL, a pointer to a
472 *                  newly-allocated USet that the user must close. In case of
473 *                  error, NULL is returned.
474 * @stable ICU 53
475 */
476U_CAPI USet* U_EXPORT2
477utrans_getSourceSet(const UTransliterator* trans,
478                    UBool ignoreFilter,
479                    USet* fillIn,
480                    UErrorCode* status);
481
482/* deprecated API ----------------------------------------------------------- */
483
484#ifndef U_HIDE_DEPRECATED_API
485
486/* see utrans.h documentation for why these functions are deprecated */
487
488
489
490#endif  /* U_HIDE_DEPRECATED_API */
491
492#endif /* #if !UCONFIG_NO_TRANSLITERATION */
493
494#endif
495