xref: /third_party/icu/icu4c/source/i18n/anytrans.cpp (revision 2e5b6d6d)
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*****************************************************************
5* Copyright (c) 2002-2014, International Business Machines Corporation
6* and others.  All Rights Reserved.
7*****************************************************************
8* Date        Name        Description
9* 06/06/2002  aliu        Creation.
10*****************************************************************
11*/
12
13#include "unicode/utypes.h"
14
15#if !UCONFIG_NO_TRANSLITERATION
16
17#include "unicode/uobject.h"
18#include "unicode/uscript.h"
19
20#include "anytrans.h"
21#include "hash.h"
22#include "mutex.h"
23#include "nultrans.h"
24#include "putilimp.h"
25#include "tridpars.h"
26#include "uinvchar.h"
27#include "uvector.h"
28
29//------------------------------------------------------------
30// Constants
31
32static const UChar TARGET_SEP = 45; // '-'
33static const UChar VARIANT_SEP = 47; // '/'
34static const UChar ANY[] = {0x41,0x6E,0x79,0}; // "Any"
35static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null"
36static const UChar LATIN_PIVOT[] = {0x2D,0x4C,0x61,0x74,0x6E,0x3B,0x4C,0x61,0x74,0x6E,0x2D,0}; // "-Latn;Latn-"
37
38// initial size for an Any-XXXX transform's cache of script-XXXX transforms
39// (will grow as necessary, but we don't expect to have source text with more than 7 scripts)
40#define ANY_TRANS_CACHE_INIT_SIZE 7
41
42//------------------------------------------------------------
43
44U_CDECL_BEGIN
45/**
46 * Deleter function for Transliterator*.
47 */
48static void U_CALLCONV
49_deleteTransliterator(void *obj) {
50    delete (icu::Transliterator*) obj;
51}
52U_CDECL_END
53
54//------------------------------------------------------------
55
56U_NAMESPACE_BEGIN
57
58//------------------------------------------------------------
59// ScriptRunIterator
60
61/**
62 * Returns a series of ranges corresponding to scripts. They will be
63 * of the form:
64 *
65 * ccccSScSSccccTTcTcccc   - c = common, S = first script, T = second
66 * |            |          - first run (start, limit)
67 *          |           |  - second run (start, limit)
68 *
69 * That is, the runs will overlap. The reason for this is so that a
70 * transliterator can consider common characters both before and after
71 * the scripts.
72 */
73class ScriptRunIterator : public UMemory {
74private:
75    const Replaceable& text;
76    int32_t textStart;
77    int32_t textLimit;
78
79public:
80    /**
81     * The code of the current run, valid after next() returns.  May
82     * be USCRIPT_INVALID_CODE if and only if the entire text is
83     * COMMON/INHERITED.
84     */
85    UScriptCode scriptCode;
86
87    /**
88     * The start of the run, inclusive, valid after next() returns.
89     */
90    int32_t start;
91
92    /**
93     * The end of the run, exclusive, valid after next() returns.
94     */
95    int32_t limit;
96
97    /**
98     * Constructs a run iterator over the given text from start
99     * (inclusive) to limit (exclusive).
100     */
101    ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);
102
103    /**
104     * Returns true if there are any more runs.  true is always
105     * returned at least once.  Upon return, the caller should
106     * examine scriptCode, start, and limit.
107     */
108    UBool next();
109
110    /**
111     * Adjusts internal indices for a change in the limit index of the
112     * given delta.  A positive delta means the limit has increased.
113     */
114    void adjustLimit(int32_t delta);
115
116private:
117    ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class
118    ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class
119};
120
121ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
122                                     int32_t myStart, int32_t myLimit) :
123    text(theText)
124{
125    textStart = myStart;
126    textLimit = myLimit;
127    limit = myStart;
128}
129
130UBool ScriptRunIterator::next() {
131    UChar32 ch;
132    UScriptCode s;
133    UErrorCode ec = U_ZERO_ERROR;
134
135    scriptCode = USCRIPT_INVALID_CODE; // don't know script yet
136    start = limit;
137
138    // Are we done?
139    if (start == textLimit) {
140        return false;
141    }
142
143    // Move start back to include adjacent COMMON or INHERITED
144    // characters
145    while (start > textStart) {
146        ch = text.char32At(start - 1); // look back
147        s = uscript_getScript(ch, &ec);
148        if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) {
149            --start;
150        } else {
151            break;
152        }
153    }
154
155    // Move limit ahead to include COMMON, INHERITED, and characters
156    // of the current script.
157    while (limit < textLimit) {
158        ch = text.char32At(limit); // look ahead
159        s = uscript_getScript(ch, &ec);
160        if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) {
161            if (scriptCode == USCRIPT_INVALID_CODE) {
162                scriptCode = s;
163            } else if (s != scriptCode) {
164                break;
165            }
166        }
167        ++limit;
168    }
169
170    // Return true even if the entire text is COMMON / INHERITED, in
171    // which case scriptCode will be USCRIPT_INVALID_CODE.
172    return true;
173}
174
175void ScriptRunIterator::adjustLimit(int32_t delta) {
176    limit += delta;
177    textLimit += delta;
178}
179
180//------------------------------------------------------------
181// AnyTransliterator
182
183UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)
184
185AnyTransliterator::AnyTransliterator(const UnicodeString& id,
186                                     const UnicodeString& theTarget,
187                                     const UnicodeString& theVariant,
188                                     UScriptCode theTargetScript,
189                                     UErrorCode& ec) :
190    Transliterator(id, NULL),
191    targetScript(theTargetScript)
192{
193    cache = uhash_openSize(uhash_hashLong, uhash_compareLong, NULL, ANY_TRANS_CACHE_INIT_SIZE, &ec);
194    if (U_FAILURE(ec)) {
195        return;
196    }
197    uhash_setValueDeleter(cache, _deleteTransliterator);
198
199    target = theTarget;
200    if (theVariant.length() > 0) {
201        target.append(VARIANT_SEP).append(theVariant);
202    }
203}
204
205AnyTransliterator::~AnyTransliterator() {
206    uhash_close(cache);
207}
208
209/**
210 * Copy constructor.
211 */
212AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
213    Transliterator(o),
214    target(o.target),
215    targetScript(o.targetScript)
216{
217    // Don't copy the cache contents
218    UErrorCode ec = U_ZERO_ERROR;
219    cache = uhash_openSize(uhash_hashLong, uhash_compareLong, NULL, ANY_TRANS_CACHE_INIT_SIZE, &ec);
220    if (U_FAILURE(ec)) {
221        return;
222    }
223    uhash_setValueDeleter(cache, _deleteTransliterator);
224}
225
226/**
227 * Transliterator API.
228 */
229AnyTransliterator* AnyTransliterator::clone() const {
230    return new AnyTransliterator(*this);
231}
232
233/**
234 * Implements {@link Transliterator#handleTransliterate}.
235 */
236void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
237                                            UBool isIncremental) const {
238    int32_t allStart = pos.start;
239    int32_t allLimit = pos.limit;
240
241    ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);
242
243    while (it.next()) {
244        // Ignore runs in the ante context
245        if (it.limit <= allStart) continue;
246
247        // Try to instantiate transliterator from it.scriptCode to
248        // our target or target/variant
249        Transliterator* t = getTransliterator(it.scriptCode);
250
251        if (t == NULL) {
252            // We have no transliterator.  Do nothing, but keep
253            // pos.start up to date.
254            pos.start = it.limit;
255            continue;
256        }
257
258        // If the run end is before the transliteration limit, do
259        // a non-incremental transliteration.  Otherwise do an
260        // incremental one.
261        UBool incremental = isIncremental && (it.limit >= allLimit);
262
263        pos.start = uprv_max(allStart, it.start);
264        pos.limit = uprv_min(allLimit, it.limit);
265        int32_t limit = pos.limit;
266        t->filteredTransliterate(text, pos, incremental);
267        int32_t delta = pos.limit - limit;
268        allLimit += delta;
269        it.adjustLimit(delta);
270
271        // We're done if we enter the post context
272        if (it.limit >= allLimit) break;
273    }
274
275    // Restore limit.  pos.start is fine where the last transliterator
276    // left it, or at the end of the last run.
277    pos.limit = allLimit;
278}
279
280Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {
281
282    if (source == targetScript || source == USCRIPT_INVALID_CODE) {
283        return NULL;
284    }
285
286    Transliterator* t = NULL;
287    {
288        Mutex m(NULL);
289        t = (Transliterator*) uhash_iget(cache, (int32_t) source);
290    }
291    if (t == NULL) {
292        UErrorCode ec = U_ZERO_ERROR;
293        UnicodeString sourceName(uscript_getShortName(source), -1, US_INV);
294        UnicodeString id(sourceName);
295        id.append(TARGET_SEP).append(target);
296
297        t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
298        if (U_FAILURE(ec) || t == NULL) {
299            delete t;
300
301            // Try to pivot around Latin, our most common script
302            id = sourceName;
303            id.append(LATIN_PIVOT, -1).append(target);
304            t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
305            if (U_FAILURE(ec) || t == NULL) {
306                delete t;
307                t = NULL;
308            }
309        }
310
311        if (t != NULL) {
312            Transliterator *rt = NULL;
313            {
314                Mutex m(NULL);
315                rt = static_cast<Transliterator *> (uhash_iget(cache, (int32_t) source));
316                if (rt == NULL) {
317                    // Common case, no race to cache this new transliterator.
318                    uhash_iput(cache, (int32_t) source, t, &ec);
319                } else {
320                    // Race case, some other thread beat us to caching this transliterator.
321                    Transliterator *temp = rt;
322                    rt = t;    // Our newly created transliterator that lost the race & now needs deleting.
323                    t  = temp; // The transliterator from the cache that we will return.
324                }
325            }
326            delete rt;    // will be non-null only in case of races.
327        }
328    }
329    return t;
330}
331
332/**
333 * Return the script code for a given name, or -1 if not found.
334 */
335static UScriptCode scriptNameToCode(const UnicodeString& name) {
336    char buf[128];
337    UScriptCode code;
338    UErrorCode ec = U_ZERO_ERROR;
339    int32_t nameLen = name.length();
340    UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen);
341
342    if (isInvariant) {
343        name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV);
344        buf[127] = 0;   // Make sure that we NULL terminate the string.
345    }
346    if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec))
347    {
348        code = USCRIPT_INVALID_CODE;
349    }
350    return code;
351}
352
353/**
354 * Registers standard transliterators with the system.  Called by
355 * Transliterator during initialization.  Scan all current targets and
356 * register those that are scripts T as Any-T/V.
357 */
358void AnyTransliterator::registerIDs() {
359
360    UErrorCode ec = U_ZERO_ERROR;
361    Hashtable seen(true, ec);
362
363    int32_t sourceCount = Transliterator::_countAvailableSources();
364    for (int32_t s=0; s<sourceCount; ++s) {
365        UnicodeString source;
366        Transliterator::_getAvailableSource(s, source);
367
368        // Ignore the "Any" source
369        if (source.caseCompare(ANY, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;
370
371        int32_t targetCount = Transliterator::_countAvailableTargets(source);
372        for (int32_t t=0; t<targetCount; ++t) {
373            UnicodeString target;
374            Transliterator::_getAvailableTarget(t, source, target);
375
376            // Only process each target once
377            if (seen.geti(target) != 0) continue;
378            ec = U_ZERO_ERROR;
379            seen.puti(target, 1, ec);
380
381            // Get the script code for the target.  If not a script, ignore.
382            UScriptCode targetScript = scriptNameToCode(target);
383            if (targetScript == USCRIPT_INVALID_CODE) continue;
384
385            int32_t variantCount = Transliterator::_countAvailableVariants(source, target);
386            // assert(variantCount >= 1);
387            for (int32_t v=0; v<variantCount; ++v) {
388                UnicodeString variant;
389                Transliterator::_getAvailableVariant(v, source, target, variant);
390
391                UnicodeString id;
392                TransliteratorIDParser::STVtoID(UnicodeString(true, ANY, 3), target, variant, id);
393                ec = U_ZERO_ERROR;
394                AnyTransliterator* tl = new AnyTransliterator(id, target, variant,
395                                                             targetScript, ec);
396                if (U_FAILURE(ec)) {
397                    delete tl;
398                } else {
399                    Transliterator::_registerInstance(tl);
400                    Transliterator::_registerSpecialInverse(target, UnicodeString(true, NULL_ID, 4), false);
401                }
402            }
403        }
404    }
405}
406
407U_NAMESPACE_END
408
409#endif /* #if !UCONFIG_NO_TRANSLITERATION */
410
411//eof
412