1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6*   Copyright (C) 1997-2016, International Business Machines
7*   Corporation and others.  All Rights Reserved.
8*
9*******************************************************************************
10*   file name:  loclikely.cpp
11*   encoding:   UTF-8
12*   tab size:   8 (not used)
13*   indentation:4
14*
15*   created on: 2010feb25
16*   created by: Markus W. Scherer
17*
18*   Code for likely and minimized locale subtags, separated out from other .cpp files
19*   that then do not depend on resource bundle code and likely-subtags data.
20*/
21
22#include "unicode/bytestream.h"
23#include "unicode/utypes.h"
24#include "unicode/locid.h"
25#include "unicode/putil.h"
26#include "unicode/uchar.h"
27#include "unicode/uloc.h"
28#include "unicode/ures.h"
29#include "unicode/uscript.h"
30#include "bytesinkutil.h"
31#include "charstr.h"
32#include "cmemory.h"
33#include "cstring.h"
34#include "ulocimp.h"
35#include "ustr_imp.h"
36
/**
 * Canonical spellings of the "unknown" language, script and region subtags.
 *
 * Within this file they are used as follows:
 *  - unknownLanguage is the lookup key substituted for an empty locale ID,
 *    is prepended to IDs that start with '_', and is stripped again from the
 *    front of values read from the likelySubtags data (see findLikelySubtags).
 *  - unknownScript and unknownRegion are compared case-insensitively against
 *    parsed subtags so that those placeholders are reported as "absent"
 *    (see parseTagString).
 **/
static const char* const unknownLanguage = "und";
static const char* const unknownScript = "Zzzz";
static const char* const unknownRegion = "ZZ";
43
44/**
45 * This function looks for the localeID in the likelySubtags resource.
46 *
47 * @param localeID The tag to find.
48 * @param buffer A buffer to hold the matching entry
49 * @param bufferLength The length of the output buffer
50 * @return A pointer to "buffer" if found, or a null pointer if not.
51 */
52static const char*  U_CALLCONV
53findLikelySubtags(const char* localeID,
54                  char* buffer,
55                  int32_t bufferLength,
56                  UErrorCode* err) {
57    const char* result = NULL;
58
59    if (!U_FAILURE(*err)) {
60        int32_t resLen = 0;
61        const UChar* s = NULL;
62        UErrorCode tmpErr = U_ZERO_ERROR;
63        icu::LocalUResourceBundlePointer subtags(ures_openDirect(NULL, "likelySubtags", &tmpErr));
64        if (U_SUCCESS(tmpErr)) {
65            icu::CharString und;
66            if (localeID != NULL) {
67                if (*localeID == '\0') {
68                    localeID = unknownLanguage;
69                } else if (*localeID == '_') {
70                    und.append(unknownLanguage, *err);
71                    und.append(localeID, *err);
72                    if (U_FAILURE(*err)) {
73                        return NULL;
74                    }
75                    localeID = und.data();
76                }
77            }
78            s = ures_getStringByKey(subtags.getAlias(), localeID, &resLen, &tmpErr);
79
80            if (U_FAILURE(tmpErr)) {
81                /*
82                 * If a resource is missing, it's not really an error, it's
83                 * just that we don't have any data for that particular locale ID.
84                 */
85                if (tmpErr != U_MISSING_RESOURCE_ERROR) {
86                    *err = tmpErr;
87                }
88            }
89            else if (resLen >= bufferLength) {
90                /* The buffer should never overflow. */
91                *err = U_INTERNAL_PROGRAM_ERROR;
92            }
93            else {
94                u_UCharsToChars(s, buffer, resLen + 1);
95                if (resLen >= 3 &&
96                    uprv_strnicmp(buffer, unknownLanguage, 3) == 0 &&
97                    (resLen == 3 || buffer[3] == '_')) {
98                    uprv_memmove(buffer, buffer + 3, resLen - 3 + 1);
99                }
100                result = buffer;
101            }
102        } else {
103            *err = tmpErr;
104        }
105    }
106
107    return result;
108}
109
110/**
111 * Append a tag to a buffer, adding the separator if necessary.  The buffer
112 * must be large enough to contain the resulting tag plus any separator
113 * necessary. The tag must not be a zero-length string.
114 *
115 * @param tag The tag to add.
116 * @param tagLength The length of the tag.
117 * @param buffer The output buffer.
118 * @param bufferLength The length of the output buffer.  This is an input/output parameter.
119 **/
120static void U_CALLCONV
121appendTag(
122    const char* tag,
123    int32_t tagLength,
124    char* buffer,
125    int32_t* bufferLength,
126    UBool withSeparator) {
127
128    if (withSeparator) {
129        buffer[*bufferLength] = '_';
130        ++(*bufferLength);
131    }
132
133    uprv_memmove(
134        &buffer[*bufferLength],
135        tag,
136        tagLength);
137
138    *bufferLength += tagLength;
139}
140
141/**
142 * Create a tag string from the supplied parameters.  The lang, script and region
143 * parameters may be NULL pointers. If they are, their corresponding length parameters
144 * must be less than or equal to 0.
145 *
146 * If any of the language, script or region parameters are empty, and the alternateTags
147 * parameter is not NULL, it will be parsed for potential language, script and region tags
148 * to be used when constructing the new tag.  If the alternateTags parameter is NULL, or
149 * it contains no language tag, the default tag for the unknown language is used.
150 *
151 * If the length of the new string exceeds the capacity of the output buffer,
152 * the function copies as many bytes to the output buffer as it can, and returns
153 * the error U_BUFFER_OVERFLOW_ERROR.
154 *
155 * If an illegal argument is provided, the function returns the error
156 * U_ILLEGAL_ARGUMENT_ERROR.
157 *
158 * Note that this function can return the warning U_STRING_NOT_TERMINATED_WARNING if
159 * the tag string fits in the output buffer, but the null terminator doesn't.
160 *
161 * @param lang The language tag to use.
162 * @param langLength The length of the language tag.
163 * @param script The script tag to use.
164 * @param scriptLength The length of the script tag.
165 * @param region The region tag to use.
166 * @param regionLength The length of the region tag.
167 * @param trailing Any trailing data to append to the new tag.
168 * @param trailingLength The length of the trailing data.
169 * @param alternateTags A string containing any alternate tags.
170 * @param sink The output sink receiving the tag string.
171 * @param err A pointer to a UErrorCode for error reporting.
172 **/
173static void U_CALLCONV
174createTagStringWithAlternates(
175    const char* lang,
176    int32_t langLength,
177    const char* script,
178    int32_t scriptLength,
179    const char* region,
180    int32_t regionLength,
181    const char* trailing,
182    int32_t trailingLength,
183    const char* alternateTags,
184    icu::ByteSink& sink,
185    UErrorCode* err) {
186
187    if (U_FAILURE(*err)) {
188        goto error;
189    }
190    else if (langLength >= ULOC_LANG_CAPACITY ||
191             scriptLength >= ULOC_SCRIPT_CAPACITY ||
192             regionLength >= ULOC_COUNTRY_CAPACITY) {
193        goto error;
194    }
195    else {
196        /**
197         * ULOC_FULLNAME_CAPACITY will provide enough capacity
198         * that we can build a string that contains the language,
199         * script and region code without worrying about overrunning
200         * the user-supplied buffer.
201         **/
202        char tagBuffer[ULOC_FULLNAME_CAPACITY];
203        int32_t tagLength = 0;
204        UBool regionAppended = false;
205
206        if (langLength > 0) {
207            appendTag(
208                lang,
209                langLength,
210                tagBuffer,
211                &tagLength,
212                /*withSeparator=*/false);
213        }
214        else if (alternateTags == NULL) {
215            /*
216             * Use the empty string for an unknown language, if
217             * we found no language.
218             */
219        }
220        else {
221            /*
222             * Parse the alternateTags string for the language.
223             */
224            char alternateLang[ULOC_LANG_CAPACITY];
225            int32_t alternateLangLength = sizeof(alternateLang);
226
227            alternateLangLength =
228                uloc_getLanguage(
229                    alternateTags,
230                    alternateLang,
231                    alternateLangLength,
232                    err);
233            if(U_FAILURE(*err) ||
234                alternateLangLength >= ULOC_LANG_CAPACITY) {
235                goto error;
236            }
237            else if (alternateLangLength == 0) {
238                /*
239                 * Use the empty string for an unknown language, if
240                 * we found no language.
241                 */
242            }
243            else {
244                appendTag(
245                    alternateLang,
246                    alternateLangLength,
247                    tagBuffer,
248                    &tagLength,
249                    /*withSeparator=*/false);
250            }
251        }
252
253        if (scriptLength > 0) {
254            appendTag(
255                script,
256                scriptLength,
257                tagBuffer,
258                &tagLength,
259                /*withSeparator=*/true);
260        }
261        else if (alternateTags != NULL) {
262            /*
263             * Parse the alternateTags string for the script.
264             */
265            char alternateScript[ULOC_SCRIPT_CAPACITY];
266
267            const int32_t alternateScriptLength =
268                uloc_getScript(
269                    alternateTags,
270                    alternateScript,
271                    sizeof(alternateScript),
272                    err);
273
274            if (U_FAILURE(*err) ||
275                alternateScriptLength >= ULOC_SCRIPT_CAPACITY) {
276                goto error;
277            }
278            else if (alternateScriptLength > 0) {
279                appendTag(
280                    alternateScript,
281                    alternateScriptLength,
282                    tagBuffer,
283                    &tagLength,
284                    /*withSeparator=*/true);
285            }
286        }
287
288        if (regionLength > 0) {
289            appendTag(
290                region,
291                regionLength,
292                tagBuffer,
293                &tagLength,
294                /*withSeparator=*/true);
295
296            regionAppended = true;
297        }
298        else if (alternateTags != NULL) {
299            /*
300             * Parse the alternateTags string for the region.
301             */
302            char alternateRegion[ULOC_COUNTRY_CAPACITY];
303
304            const int32_t alternateRegionLength =
305                uloc_getCountry(
306                    alternateTags,
307                    alternateRegion,
308                    sizeof(alternateRegion),
309                    err);
310            if (U_FAILURE(*err) ||
311                alternateRegionLength >= ULOC_COUNTRY_CAPACITY) {
312                goto error;
313            }
314            else if (alternateRegionLength > 0) {
315                appendTag(
316                    alternateRegion,
317                    alternateRegionLength,
318                    tagBuffer,
319                    &tagLength,
320                    /*withSeparator=*/true);
321
322                regionAppended = true;
323            }
324        }
325
326        /**
327         * Copy the partial tag from our internal buffer to the supplied
328         * target.
329         **/
330        sink.Append(tagBuffer, tagLength);
331
332        if (trailingLength > 0) {
333            if (*trailing != '@') {
334                sink.Append("_", 1);
335                if (!regionAppended) {
336                    /* extra separator is required */
337                    sink.Append("_", 1);
338                }
339            }
340
341            /*
342             * Copy the trailing data into the supplied buffer.
343             */
344            sink.Append(trailing, trailingLength);
345        }
346
347        return;
348    }
349
350error:
351
352    /**
353     * An overflow indicates the locale ID passed in
354     * is ill-formed.  If we got here, and there was
355     * no previous error, it's an implicit overflow.
356     **/
357    if (*err ==  U_BUFFER_OVERFLOW_ERROR ||
358        U_SUCCESS(*err)) {
359        *err = U_ILLEGAL_ARGUMENT_ERROR;
360    }
361}
362
363/**
364 * Create a tag string from the supplied parameters.  The lang, script and region
365 * parameters may be NULL pointers. If they are, their corresponding length parameters
366 * must be less than or equal to 0.  If the lang parameter is an empty string, the
367 * default value for an unknown language is written to the output buffer.
368 *
369 * If the length of the new string exceeds the capacity of the output buffer,
370 * the function copies as many bytes to the output buffer as it can, and returns
371 * the error U_BUFFER_OVERFLOW_ERROR.
372 *
373 * If an illegal argument is provided, the function returns the error
374 * U_ILLEGAL_ARGUMENT_ERROR.
375 *
376 * @param lang The language tag to use.
377 * @param langLength The length of the language tag.
378 * @param script The script tag to use.
379 * @param scriptLength The length of the script tag.
380 * @param region The region tag to use.
381 * @param regionLength The length of the region tag.
382 * @param trailing Any trailing data to append to the new tag.
383 * @param trailingLength The length of the trailing data.
384 * @param sink The output sink receiving the tag string.
385 * @param err A pointer to a UErrorCode for error reporting.
386 **/
387static void U_CALLCONV
388createTagString(
389    const char* lang,
390    int32_t langLength,
391    const char* script,
392    int32_t scriptLength,
393    const char* region,
394    int32_t regionLength,
395    const char* trailing,
396    int32_t trailingLength,
397    icu::ByteSink& sink,
398    UErrorCode* err)
399{
400    createTagStringWithAlternates(
401                lang,
402                langLength,
403                script,
404                scriptLength,
405                region,
406                regionLength,
407                trailing,
408                trailingLength,
409                NULL,
410                sink,
411                err);
412}
413
414/**
415 * Parse the language, script, and region subtags from a tag string, and copy the
416 * results into the corresponding output parameters. The buffers are null-terminated,
417 * unless overflow occurs.
418 *
419 * The langLength, scriptLength, and regionLength parameters are input/output
420 * parameters, and must contain the capacity of their corresponding buffers on
421 * input.  On output, they will contain the actual length of the buffers, not
422 * including the null terminator.
423 *
424 * If the length of any of the output subtags exceeds the capacity of the corresponding
425 * buffer, the function copies as many bytes to the output buffer as it can, and returns
426 * the error U_BUFFER_OVERFLOW_ERROR.  It will not parse any more subtags once overflow
427 * occurs.
428 *
429 * If an illegal argument is provided, the function returns the error
430 * U_ILLEGAL_ARGUMENT_ERROR.
431 *
432 * @param localeID The locale ID to parse.
433 * @param lang The language tag buffer.
434 * @param langLength The length of the language tag.
435 * @param script The script tag buffer.
436 * @param scriptLength The length of the script tag.
437 * @param region The region tag buffer.
438 * @param regionLength The length of the region tag.
439 * @param err A pointer to a UErrorCode for error reporting.
440 * @return The number of chars of the localeID parameter consumed.
441 **/
442static int32_t U_CALLCONV
443parseTagString(
444    const char* localeID,
445    char* lang,
446    int32_t* langLength,
447    char* script,
448    int32_t* scriptLength,
449    char* region,
450    int32_t* regionLength,
451    UErrorCode* err)
452{
453    const char* position = localeID;
454    int32_t subtagLength = 0;
455
456    if(U_FAILURE(*err) ||
457       localeID == NULL ||
458       lang == NULL ||
459       langLength == NULL ||
460       script == NULL ||
461       scriptLength == NULL ||
462       region == NULL ||
463       regionLength == NULL) {
464        goto error;
465    }
466
467    subtagLength = ulocimp_getLanguage(position, &position, *err).extract(lang, *langLength, *err);
468
469    /*
470     * Note that we explicit consider U_STRING_NOT_TERMINATED_WARNING
471     * to be an error, because it indicates the user-supplied tag is
472     * not well-formed.
473     */
474    if(U_FAILURE(*err)) {
475        goto error;
476    }
477
478    *langLength = subtagLength;
479
480    /*
481     * If no language was present, use the empty string instead.
482     * Otherwise, move past any separator.
483     */
484    if (_isIDSeparator(*position)) {
485        ++position;
486    }
487
488    subtagLength = ulocimp_getScript(position, &position, *err).extract(script, *scriptLength, *err);
489
490    if(U_FAILURE(*err)) {
491        goto error;
492    }
493
494    *scriptLength = subtagLength;
495
496    if (*scriptLength > 0) {
497        if (uprv_strnicmp(script, unknownScript, *scriptLength) == 0) {
498            /**
499             * If the script part is the "unknown" script, then don't return it.
500             **/
501            *scriptLength = 0;
502        }
503
504        /*
505         * Move past any separator.
506         */
507        if (_isIDSeparator(*position)) {
508            ++position;
509        }
510    }
511
512    subtagLength = ulocimp_getCountry(position, &position, *err).extract(region, *regionLength, *err);
513
514    if(U_FAILURE(*err)) {
515        goto error;
516    }
517
518    *regionLength = subtagLength;
519
520    if (*regionLength > 0) {
521        if (uprv_strnicmp(region, unknownRegion, *regionLength) == 0) {
522            /**
523             * If the region part is the "unknown" region, then don't return it.
524             **/
525            *regionLength = 0;
526        }
527    } else if (*position != 0 && *position != '@') {
528        /* back up over consumed trailing separator */
529        --position;
530    }
531
532exit:
533
534    return (int32_t)(position - localeID);
535
536error:
537
538    /**
539     * If we get here, we have no explicit error, it's the result of an
540     * illegal argument.
541     **/
542    if (!U_FAILURE(*err)) {
543        *err = U_ILLEGAL_ARGUMENT_ERROR;
544    }
545
546    goto exit;
547}
548
549static UBool U_CALLCONV
550createLikelySubtagsString(
551    const char* lang,
552    int32_t langLength,
553    const char* script,
554    int32_t scriptLength,
555    const char* region,
556    int32_t regionLength,
557    const char* variants,
558    int32_t variantsLength,
559    icu::ByteSink& sink,
560    UErrorCode* err) {
561    /**
562     * ULOC_FULLNAME_CAPACITY will provide enough capacity
563     * that we can build a string that contains the language,
564     * script and region code without worrying about overrunning
565     * the user-supplied buffer.
566     **/
567    char likelySubtagsBuffer[ULOC_FULLNAME_CAPACITY];
568
569    if(U_FAILURE(*err)) {
570        goto error;
571    }
572
573    /**
574     * Try the language with the script and region first.
575     **/
576    if (scriptLength > 0 && regionLength > 0) {
577
578        const char* likelySubtags = NULL;
579
580        icu::CharString tagBuffer;
581        {
582            icu::CharStringByteSink sink(&tagBuffer);
583            createTagString(
584                lang,
585                langLength,
586                script,
587                scriptLength,
588                region,
589                regionLength,
590                NULL,
591                0,
592                sink,
593                err);
594        }
595        if(U_FAILURE(*err)) {
596            goto error;
597        }
598
599        likelySubtags =
600            findLikelySubtags(
601                tagBuffer.data(),
602                likelySubtagsBuffer,
603                sizeof(likelySubtagsBuffer),
604                err);
605        if(U_FAILURE(*err)) {
606            goto error;
607        }
608
609        if (likelySubtags != NULL) {
610            /* Always use the language tag from the
611               maximal string, since it may be more
612               specific than the one provided. */
613            createTagStringWithAlternates(
614                        NULL,
615                        0,
616                        NULL,
617                        0,
618                        NULL,
619                        0,
620                        variants,
621                        variantsLength,
622                        likelySubtags,
623                        sink,
624                        err);
625            return true;
626        }
627    }
628
629    /**
630     * Try the language with just the script.
631     **/
632    if (scriptLength > 0) {
633
634        const char* likelySubtags = NULL;
635
636        icu::CharString tagBuffer;
637        {
638            icu::CharStringByteSink sink(&tagBuffer);
639            createTagString(
640                lang,
641                langLength,
642                script,
643                scriptLength,
644                NULL,
645                0,
646                NULL,
647                0,
648                sink,
649                err);
650        }
651        if(U_FAILURE(*err)) {
652            goto error;
653        }
654
655        likelySubtags =
656            findLikelySubtags(
657                tagBuffer.data(),
658                likelySubtagsBuffer,
659                sizeof(likelySubtagsBuffer),
660                err);
661        if(U_FAILURE(*err)) {
662            goto error;
663        }
664
665        if (likelySubtags != NULL) {
666            /* Always use the language tag from the
667               maximal string, since it may be more
668               specific than the one provided. */
669            createTagStringWithAlternates(
670                        NULL,
671                        0,
672                        NULL,
673                        0,
674                        region,
675                        regionLength,
676                        variants,
677                        variantsLength,
678                        likelySubtags,
679                        sink,
680                        err);
681            return true;
682        }
683    }
684
685    /**
686     * Try the language with just the region.
687     **/
688    if (regionLength > 0) {
689
690        const char* likelySubtags = NULL;
691
692        icu::CharString tagBuffer;
693        {
694            icu::CharStringByteSink sink(&tagBuffer);
695            createTagString(
696                lang,
697                langLength,
698                NULL,
699                0,
700                region,
701                regionLength,
702                NULL,
703                0,
704                sink,
705                err);
706        }
707        if(U_FAILURE(*err)) {
708            goto error;
709        }
710
711        likelySubtags =
712            findLikelySubtags(
713                tagBuffer.data(),
714                likelySubtagsBuffer,
715                sizeof(likelySubtagsBuffer),
716                err);
717        if(U_FAILURE(*err)) {
718            goto error;
719        }
720
721        if (likelySubtags != NULL) {
722            /* Always use the language tag from the
723               maximal string, since it may be more
724               specific than the one provided. */
725            createTagStringWithAlternates(
726                        NULL,
727                        0,
728                        script,
729                        scriptLength,
730                        NULL,
731                        0,
732                        variants,
733                        variantsLength,
734                        likelySubtags,
735                        sink,
736                        err);
737            return true;
738        }
739    }
740
741    /**
742     * Finally, try just the language.
743     **/
744    {
745        const char* likelySubtags = NULL;
746
747        icu::CharString tagBuffer;
748        {
749            icu::CharStringByteSink sink(&tagBuffer);
750            createTagString(
751                lang,
752                langLength,
753                NULL,
754                0,
755                NULL,
756                0,
757                NULL,
758                0,
759                sink,
760                err);
761        }
762        if(U_FAILURE(*err)) {
763            goto error;
764        }
765
766        likelySubtags =
767            findLikelySubtags(
768                tagBuffer.data(),
769                likelySubtagsBuffer,
770                sizeof(likelySubtagsBuffer),
771                err);
772        if(U_FAILURE(*err)) {
773            goto error;
774        }
775
776        if (likelySubtags != NULL) {
777            /* Always use the language tag from the
778               maximal string, since it may be more
779               specific than the one provided. */
780            createTagStringWithAlternates(
781                        NULL,
782                        0,
783                        script,
784                        scriptLength,
785                        region,
786                        regionLength,
787                        variants,
788                        variantsLength,
789                        likelySubtags,
790                        sink,
791                        err);
792            return true;
793        }
794    }
795
796    return false;
797
798error:
799
800    if (!U_FAILURE(*err)) {
801        *err = U_ILLEGAL_ARGUMENT_ERROR;
802    }
803
804    return false;
805}
806
/**
 * Guards against over-long subtags in the trailing (variant) part of a
 * locale ID. Scans `trailing` up to `trailingLength` chars, or up to the
 * first '@' (start of keywords), counting the chars between '-' / '_'
 * separators. If a run gets too long, control jumps to a label named
 * `error`, which the enclosing function must provide.
 *
 * The counter is tested before it is incremented, so the jump happens on the
 * tenth consecutive non-separator char; runs of up to nine chars pass.
 * NOTE(review): if the intended limit is eight chars per subtag this is off
 * by one - behavior is deliberately left unchanged here; confirm before
 * tightening.
 *
 * Both arguments are evaluated more than once, so pass side-effect-free
 * expressions.
 *
 * @param trailing Pointer to the chars to scan.
 * @param trailingLength Number of chars available at `trailing`.
 */
#define CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t count = 0; \
    int32_t i; \
    for (i = 0; i < (trailingLength); i++) { \
        if ((trailing)[i] == '-' || (trailing)[i] == '_') { \
            count = 0; \
        } else if ((trailing)[i] == '@') { \
            break; \
        } else if (count > 8) { \
            goto error; \
        } else { \
            count++; \
        } \
    } \
} UPRV_BLOCK_MACRO_END
825
826static UBool
827_uloc_addLikelySubtags(const char* localeID,
828                       icu::ByteSink& sink,
829                       UErrorCode* err) {
830    char lang[ULOC_LANG_CAPACITY];
831    int32_t langLength = sizeof(lang);
832    char script[ULOC_SCRIPT_CAPACITY];
833    int32_t scriptLength = sizeof(script);
834    char region[ULOC_COUNTRY_CAPACITY];
835    int32_t regionLength = sizeof(region);
836    const char* trailing = "";
837    int32_t trailingLength = 0;
838    int32_t trailingIndex = 0;
839    UBool success = false;
840
841    if(U_FAILURE(*err)) {
842        goto error;
843    }
844    if (localeID == NULL) {
845        goto error;
846    }
847
848    trailingIndex = parseTagString(
849        localeID,
850        lang,
851        &langLength,
852        script,
853        &scriptLength,
854        region,
855        &regionLength,
856        err);
857    if(U_FAILURE(*err)) {
858        /* Overflow indicates an illegal argument error */
859        if (*err == U_BUFFER_OVERFLOW_ERROR) {
860            *err = U_ILLEGAL_ARGUMENT_ERROR;
861        }
862
863        goto error;
864    }
865
866    /* Find the length of the trailing portion. */
867    while (_isIDSeparator(localeID[trailingIndex])) {
868        trailingIndex++;
869    }
870    trailing = &localeID[trailingIndex];
871    trailingLength = (int32_t)uprv_strlen(trailing);
872
873    CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength);
874
875    success =
876        createLikelySubtagsString(
877            lang,
878            langLength,
879            script,
880            scriptLength,
881            region,
882            regionLength,
883            trailing,
884            trailingLength,
885            sink,
886            err);
887
888    if (!success) {
889        const int32_t localIDLength = (int32_t)uprv_strlen(localeID);
890
891        /*
892         * If we get here, we need to return localeID.
893         */
894        sink.Append(localeID, localIDLength);
895    }
896
897    return success;
898
899error:
900
901    if (!U_FAILURE(*err)) {
902        *err = U_ILLEGAL_ARGUMENT_ERROR;
903    }
904    return false;
905}
906
// Forward declaration, needed because _uloc_minimizeSubtags() below calls
// this function before its definition.
//
// Adds likely subtags and writes the result to the sink.
// Returns true if the value in the sink was produced by a match during the
// lookup.
// Returns false if the value in the sink is the same as the input because
// the lookup found no match.
static UBool _ulocimp_addLikelySubtags(const char*, icu::ByteSink&, UErrorCode*);
912
913static void
914_uloc_minimizeSubtags(const char* localeID,
915                      icu::ByteSink& sink,
916                      UErrorCode* err) {
917    icu::CharString maximizedTagBuffer;
918
919    char lang[ULOC_LANG_CAPACITY];
920    int32_t langLength = sizeof(lang);
921    char script[ULOC_SCRIPT_CAPACITY];
922    int32_t scriptLength = sizeof(script);
923    char region[ULOC_COUNTRY_CAPACITY];
924    int32_t regionLength = sizeof(region);
925    const char* trailing = "";
926    int32_t trailingLength = 0;
927    int32_t trailingIndex = 0;
928    UBool successGetMax = false;
929
930    if(U_FAILURE(*err)) {
931        goto error;
932    }
933    else if (localeID == NULL) {
934        goto error;
935    }
936
937    trailingIndex =
938        parseTagString(
939            localeID,
940            lang,
941            &langLength,
942            script,
943            &scriptLength,
944            region,
945            &regionLength,
946            err);
947    if(U_FAILURE(*err)) {
948
949        /* Overflow indicates an illegal argument error */
950        if (*err == U_BUFFER_OVERFLOW_ERROR) {
951            *err = U_ILLEGAL_ARGUMENT_ERROR;
952        }
953
954        goto error;
955    }
956
957    /* Find the spot where the variants or the keywords begin, if any. */
958    while (_isIDSeparator(localeID[trailingIndex])) {
959        trailingIndex++;
960    }
961    trailing = &localeID[trailingIndex];
962    trailingLength = (int32_t)uprv_strlen(trailing);
963
964    CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength);
965
966    {
967        icu::CharString base;
968        {
969            icu::CharStringByteSink baseSink(&base);
970            createTagString(
971                lang,
972                langLength,
973                script,
974                scriptLength,
975                region,
976                regionLength,
977                NULL,
978                0,
979                baseSink,
980                err);
981        }
982
983        /**
984         * First, we need to first get the maximization
985         * from AddLikelySubtags.
986         **/
987        {
988            icu::CharStringByteSink maxSink(&maximizedTagBuffer);
989            successGetMax = _ulocimp_addLikelySubtags(base.data(), maxSink, err);
990        }
991    }
992
993    if(U_FAILURE(*err)) {
994        goto error;
995    }
996
997    if (!successGetMax) {
998        /**
999         * If we got here, return the locale ID parameter unchanged.
1000         **/
1001        const int32_t localeIDLength = (int32_t)uprv_strlen(localeID);
1002        sink.Append(localeID, localeIDLength);
1003        return;
1004    }
1005
1006    // In the following, the lang, script, region are referring to those in
1007    // the maximizedTagBuffer, not the one in the localeID.
1008    langLength = sizeof(lang);
1009    scriptLength = sizeof(script);
1010    regionLength = sizeof(region);
1011    parseTagString(
1012        maximizedTagBuffer.data(),
1013        lang,
1014        &langLength,
1015        script,
1016        &scriptLength,
1017        region,
1018        &regionLength,
1019        err);
1020    if(U_FAILURE(*err)) {
1021        goto error;
1022    }
1023
1024    /**
1025     * Start first with just the language.
1026     **/
1027    {
1028        icu::CharString tagBuffer;
1029        {
1030            icu::CharStringByteSink tagSink(&tagBuffer);
1031            createLikelySubtagsString(
1032                lang,
1033                langLength,
1034                NULL,
1035                0,
1036                NULL,
1037                0,
1038                NULL,
1039                0,
1040                tagSink,
1041                err);
1042        }
1043
1044        if(U_FAILURE(*err)) {
1045            goto error;
1046        }
1047        else if (!tagBuffer.isEmpty() &&
1048                 uprv_strnicmp(
1049                    maximizedTagBuffer.data(),
1050                    tagBuffer.data(),
1051                    tagBuffer.length()) == 0) {
1052
1053            createTagString(
1054                        lang,
1055                        langLength,
1056                        NULL,
1057                        0,
1058                        NULL,
1059                        0,
1060                        trailing,
1061                        trailingLength,
1062                        sink,
1063                        err);
1064            return;
1065        }
1066    }
1067
1068    /**
1069     * Next, try the language and region.
1070     **/
1071    if (regionLength > 0) {
1072
1073        icu::CharString tagBuffer;
1074        {
1075            icu::CharStringByteSink tagSink(&tagBuffer);
1076            createLikelySubtagsString(
1077                lang,
1078                langLength,
1079                NULL,
1080                0,
1081                region,
1082                regionLength,
1083                NULL,
1084                0,
1085                tagSink,
1086                err);
1087        }
1088
1089        if(U_FAILURE(*err)) {
1090            goto error;
1091        }
1092        else if (!tagBuffer.isEmpty() &&
1093                 uprv_strnicmp(
1094                    maximizedTagBuffer.data(),
1095                    tagBuffer.data(),
1096                    tagBuffer.length()) == 0) {
1097
1098            createTagString(
1099                        lang,
1100                        langLength,
1101                        NULL,
1102                        0,
1103                        region,
1104                        regionLength,
1105                        trailing,
1106                        trailingLength,
1107                        sink,
1108                        err);
1109            return;
1110        }
1111    }
1112
1113    /**
1114     * Finally, try the language and script.  This is our last chance,
1115     * since trying with all three subtags would only yield the
1116     * maximal version that we already have.
1117     **/
1118    if (scriptLength > 0) {
1119        icu::CharString tagBuffer;
1120        {
1121            icu::CharStringByteSink tagSink(&tagBuffer);
1122            createLikelySubtagsString(
1123                lang,
1124                langLength,
1125                script,
1126                scriptLength,
1127                NULL,
1128                0,
1129                NULL,
1130                0,
1131                tagSink,
1132                err);
1133        }
1134
1135        if(U_FAILURE(*err)) {
1136            goto error;
1137        }
1138        else if (!tagBuffer.isEmpty() &&
1139                 uprv_strnicmp(
1140                    maximizedTagBuffer.data(),
1141                    tagBuffer.data(),
1142                    tagBuffer.length()) == 0) {
1143
1144            createTagString(
1145                        lang,
1146                        langLength,
1147                        script,
1148                        scriptLength,
1149                        NULL,
1150                        0,
1151                        trailing,
1152                        trailingLength,
1153                        sink,
1154                        err);
1155            return;
1156        }
1157    }
1158
1159    {
1160        /**
1161         * If we got here, return the max + trail.
1162         **/
1163        createTagString(
1164                    lang,
1165                    langLength,
1166                    script,
1167                    scriptLength,
1168                    region,
1169                    regionLength,
1170                    trailing,
1171                    trailingLength,
1172                    sink,
1173                    err);
1174        return;
1175    }
1176
1177error:
1178
1179    if (!U_FAILURE(*err)) {
1180        *err = U_ILLEGAL_ARGUMENT_ERROR;
1181    }
1182}
1183
1184static int32_t
1185do_canonicalize(const char*    localeID,
1186         char* buffer,
1187         int32_t bufferCapacity,
1188         UErrorCode* err)
1189{
1190    int32_t canonicalizedSize = uloc_canonicalize(
1191        localeID,
1192        buffer,
1193        bufferCapacity,
1194        err);
1195
1196    if (*err == U_STRING_NOT_TERMINATED_WARNING ||
1197        *err == U_BUFFER_OVERFLOW_ERROR) {
1198        return canonicalizedSize;
1199    }
1200    else if (U_FAILURE(*err)) {
1201
1202        return -1;
1203    }
1204    else {
1205        return canonicalizedSize;
1206    }
1207}
1208
1209U_CAPI int32_t U_EXPORT2
1210uloc_addLikelySubtags(const char* localeID,
1211                      char* maximizedLocaleID,
1212                      int32_t maximizedLocaleIDCapacity,
1213                      UErrorCode* status) {
1214    if (U_FAILURE(*status)) {
1215        return 0;
1216    }
1217
1218    icu::CheckedArrayByteSink sink(
1219            maximizedLocaleID, maximizedLocaleIDCapacity);
1220
1221    ulocimp_addLikelySubtags(localeID, sink, status);
1222    int32_t reslen = sink.NumberOfBytesAppended();
1223
1224    if (U_FAILURE(*status)) {
1225        return sink.Overflowed() ? reslen : -1;
1226    }
1227
1228    if (sink.Overflowed()) {
1229        *status = U_BUFFER_OVERFLOW_ERROR;
1230    } else {
1231        u_terminateChars(
1232                maximizedLocaleID, maximizedLocaleIDCapacity, reslen, status);
1233    }
1234
1235    return reslen;
1236}
1237
1238static UBool
1239_ulocimp_addLikelySubtags(const char* localeID,
1240                          icu::ByteSink& sink,
1241                          UErrorCode* status) {
1242    PreflightingLocaleIDBuffer localeBuffer;
1243    do {
1244        localeBuffer.requestedCapacity = do_canonicalize(localeID, localeBuffer.getBuffer(),
1245            localeBuffer.getCapacity(), status);
1246    } while (localeBuffer.needToTryAgain(status));
1247
1248    if (U_SUCCESS(*status)) {
1249        return _uloc_addLikelySubtags(localeBuffer.getBuffer(), sink, status);
1250    } else {
1251        return false;
1252    }
1253}
1254
1255U_CAPI void U_EXPORT2
1256ulocimp_addLikelySubtags(const char* localeID,
1257                         icu::ByteSink& sink,
1258                         UErrorCode* status) {
1259    _ulocimp_addLikelySubtags(localeID, sink, status);
1260}
1261
1262U_CAPI int32_t U_EXPORT2
1263uloc_minimizeSubtags(const char* localeID,
1264                     char* minimizedLocaleID,
1265                     int32_t minimizedLocaleIDCapacity,
1266                     UErrorCode* status) {
1267    if (U_FAILURE(*status)) {
1268        return 0;
1269    }
1270
1271    icu::CheckedArrayByteSink sink(
1272            minimizedLocaleID, minimizedLocaleIDCapacity);
1273
1274    ulocimp_minimizeSubtags(localeID, sink, status);
1275    int32_t reslen = sink.NumberOfBytesAppended();
1276
1277    if (U_FAILURE(*status)) {
1278        return sink.Overflowed() ? reslen : -1;
1279    }
1280
1281    if (sink.Overflowed()) {
1282        *status = U_BUFFER_OVERFLOW_ERROR;
1283    } else {
1284        u_terminateChars(
1285                minimizedLocaleID, minimizedLocaleIDCapacity, reslen, status);
1286    }
1287
1288    return reslen;
1289}
1290
1291U_CAPI void U_EXPORT2
1292ulocimp_minimizeSubtags(const char* localeID,
1293                        icu::ByteSink& sink,
1294                        UErrorCode* status) {
1295    PreflightingLocaleIDBuffer localeBuffer;
1296    do {
1297        localeBuffer.requestedCapacity = do_canonicalize(localeID, localeBuffer.getBuffer(),
1298            localeBuffer.getCapacity(), status);
1299    } while (localeBuffer.needToTryAgain(status));
1300
1301    _uloc_minimizeSubtags(localeBuffer.getBuffer(), sink, status);
1302}
1303
1304// Pairs of (language subtag, + or -) for finding out fast if common languages
1305// are LTR (minus) or RTL (plus).
1306static const char LANG_DIR_STRING[] =
1307        "root-en-es-pt-zh-ja-ko-de-fr-it-ar+he+fa+ru-nl-pl-th-tr-";
1308
1309// Implemented here because this calls ulocimp_addLikelySubtags().
1310U_CAPI UBool U_EXPORT2
1311uloc_isRightToLeft(const char *locale) {
1312    UErrorCode errorCode = U_ZERO_ERROR;
1313    char script[8];
1314    int32_t scriptLength = uloc_getScript(locale, script, UPRV_LENGTHOF(script), &errorCode);
1315    if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING ||
1316            scriptLength == 0) {
1317        // Fastpath: We know the likely scripts and their writing direction
1318        // for some common languages.
1319        errorCode = U_ZERO_ERROR;
1320        char lang[8];
1321        int32_t langLength = uloc_getLanguage(locale, lang, UPRV_LENGTHOF(lang), &errorCode);
1322        if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {
1323            return false;
1324        }
1325        if (langLength > 0) {
1326            const char* langPtr = uprv_strstr(LANG_DIR_STRING, lang);
1327            if (langPtr != NULL) {
1328                switch (langPtr[langLength]) {
1329                case '-': return false;
1330                case '+': return true;
1331                default: break;  // partial match of a longer code
1332                }
1333            }
1334        }
1335        // Otherwise, find the likely script.
1336        errorCode = U_ZERO_ERROR;
1337        icu::CharString likely;
1338        {
1339            icu::CharStringByteSink sink(&likely);
1340            ulocimp_addLikelySubtags(locale, sink, &errorCode);
1341        }
1342        if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {
1343            return false;
1344        }
1345        scriptLength = uloc_getScript(likely.data(), script, UPRV_LENGTHOF(script), &errorCode);
1346        if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING ||
1347                scriptLength == 0) {
1348            return false;
1349        }
1350    }
1351    UScriptCode scriptCode = (UScriptCode)u_getPropertyValueEnum(UCHAR_SCRIPT, script);
1352    return uscript_isRightToLeft(scriptCode);
1353}
1354
1355U_NAMESPACE_BEGIN
1356
1357UBool
1358Locale::isRightToLeft() const {
1359    return uloc_isRightToLeft(getBaseName());
1360}
1361
1362U_NAMESPACE_END
1363
1364// The following must at least allow for rg key value (6) plus terminator (1).
1365#define ULOC_RG_BUFLEN 8
1366
1367U_CAPI int32_t U_EXPORT2
1368ulocimp_getRegionForSupplementalData(const char *localeID, UBool inferRegion,
1369                                     char *region, int32_t regionCapacity, UErrorCode* status) {
1370    if (U_FAILURE(*status)) {
1371        return 0;
1372    }
1373    char rgBuf[ULOC_RG_BUFLEN];
1374    UErrorCode rgStatus = U_ZERO_ERROR;
1375
1376    // First check for rg keyword value
1377    int32_t rgLen = uloc_getKeywordValue(localeID, "rg", rgBuf, ULOC_RG_BUFLEN, &rgStatus);
1378    if (U_FAILURE(rgStatus) || rgLen != 6) {
1379        rgLen = 0;
1380    } else {
1381        // rgBuf guaranteed to be zero terminated here, with text len 6
1382        char *rgPtr = rgBuf;
1383        for (; *rgPtr!= 0; rgPtr++) {
1384            *rgPtr = uprv_toupper(*rgPtr);
1385        }
1386        rgLen = (uprv_strcmp(rgBuf+2, "ZZZZ") == 0)? 2: 0;
1387    }
1388
1389    if (rgLen == 0) {
1390        // No valid rg keyword value, try for unicode_region_subtag
1391        rgLen = uloc_getCountry(localeID, rgBuf, ULOC_RG_BUFLEN, status);
1392        if (U_FAILURE(*status)) {
1393            rgLen = 0;
1394        } else if (rgLen == 0 && inferRegion) {
1395            // no unicode_region_subtag but inferRegion true, try likely subtags
1396            rgStatus = U_ZERO_ERROR;
1397            icu::CharString locBuf;
1398            {
1399                icu::CharStringByteSink sink(&locBuf);
1400                ulocimp_addLikelySubtags(localeID, sink, &rgStatus);
1401            }
1402            if (U_SUCCESS(rgStatus)) {
1403                rgLen = uloc_getCountry(locBuf.data(), rgBuf, ULOC_RG_BUFLEN, status);
1404                if (U_FAILURE(*status)) {
1405                    rgLen = 0;
1406                }
1407            }
1408        }
1409    }
1410
1411    rgBuf[rgLen] = 0;
1412    uprv_strncpy(region, rgBuf, regionCapacity);
1413    return u_terminateChars(region, regionCapacity, rgLen, status);
1414}
1415
1416