// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html

// ucasemap_imp.h
// created: 2017feb08 Markus W. Scherer
6
7#ifndef __UCASEMAP_IMP_H__
8#define __UCASEMAP_IMP_H__
9
10#include "unicode/utypes.h"
11#include "unicode/ucasemap.h"
12#include "unicode/uchar.h"
13#include "ucase.h"
14
/**
 * Bit mask for the titlecasing iterator options bit field.
 * The mask 0xe0 selects bits 5..7, a 3-bit field with 8 possible values.
 * Currently only 3 out of those 8 values are used:
 * 0 (words), U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
 * See stringoptions.h.
 * @internal
 */
#define U_TITLECASE_ITERATOR_MASK 0xe0
23
/**
 * Bit mask for the titlecasing index adjustment options bit set.
 * The mask 0x600 covers bits 9 and 10. Currently two bits are defined:
 * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED.
 * ustrcase_checkTitleAdjustmentOptions() rejects options that set both.
 * See stringoptions.h.
 * @internal
 */
#define U_TITLECASE_ADJUSTMENT_MASK 0x600
32
33/**
34 * Internal API, used by u_strcasecmp() etc.
35 * Compare strings case-insensitively,
36 * in code point order or code unit order.
37 */
38U_CFUNC int32_t
39u_strcmpFold(const UChar *s1, int32_t length1,
40             const UChar *s2, int32_t length2,
41             uint32_t options,
42             UErrorCode *pErrorCode);
43
44/**
45 * Internal API, used for detecting length of
46 * shared prefix case-insensitively.
47 * @param s1            input string 1
48 * @param length1       length of string 1, or -1 (NULL terminated)
49 * @param s2            input string 2
50 * @param length2       length of string 2, or -1 (NULL terminated)
51 * @param options       compare options
52 * @param matchLen1     (output) length of partial prefix match in s1
53 * @param matchLen2     (output) length of partial prefix match in s2
54 * @param pErrorCode    receives error status
55 */
56U_CAPI void
57u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1,
58                             const UChar *s2, int32_t length2,
59                             uint32_t options,
60                             int32_t *matchLen1, int32_t *matchLen2,
61                             UErrorCode *pErrorCode);
62
63#ifdef __cplusplus
64
65U_NAMESPACE_BEGIN
66
67class BreakIterator;        // unicode/brkiter.h
68class ByteSink;
69class Locale;               // unicode/locid.h
70
71/** Returns true if the options are valid. Otherwise false, and sets an error. */
72inline UBool ustrcase_checkTitleAdjustmentOptions(uint32_t options, UErrorCode &errorCode) {
73    if (U_FAILURE(errorCode)) { return false; }
74    if ((options & U_TITLECASE_ADJUSTMENT_MASK) == U_TITLECASE_ADJUSTMENT_MASK) {
75        // Both options together.
76        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
77        return false;
78    }
79    return true;
80}
81
82inline UBool ustrcase_isLNS(UChar32 c) {
83    // Letter, number, symbol,
84    // or a private use code point because those are typically used as letters or numbers.
85    // Consider modifier letters only if they are cased.
86    const uint32_t LNS = (U_GC_L_MASK|U_GC_N_MASK|U_GC_S_MASK|U_GC_CO_MASK) & ~U_GC_LM_MASK;
87    int gc = u_charType(c);
88    return (U_MASK(gc) & LNS) != 0 || (gc == U_MODIFIER_LETTER && ucase_getType(c) != UCASE_NONE);
89}
90
91#if !UCONFIG_NO_BREAK_ITERATION
92
93/** Returns nullptr if error. Pass in either locale or locID, not both. */
94U_CFUNC
95BreakIterator *ustrcase_getTitleBreakIterator(
96        const Locale *locale, const char *locID, uint32_t options, BreakIterator *iter,
97        LocalPointer<BreakIterator> &ownedIter, UErrorCode &errorCode);
98
99#endif
100
101U_NAMESPACE_END
102
103#include "unicode/unistr.h"  // for UStringCaseMapper
104
/*
 * Internal string casing functions implementing
 * ustring.h/ustrcase.cpp and UnicodeString case mapping functions.
 */
109
110struct UCaseMap : public icu::UMemory {
111    /** Implements most of ucasemap_open(). */
112    UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode);
113    ~UCaseMap();
114
115#if !UCONFIG_NO_BREAK_ITERATION
116    icu::BreakIterator *iter;  /* We adopt the iterator, so we own it. */
117#endif
118    char locale[32];
119    int32_t caseLocale;
120    uint32_t options;
121};
122
/*
 * Helper macros for the optional BreakIterator parameter of the case mapping functions.
 * With UCONFIG_NO_BREAK_ITERATION they expand to nothing, so the parameter disappears
 * from declarations and calls; otherwise each expands to one parameter/argument
 * including its trailing comma:
 *   _PARAM  - named parameter in a declaration or definition
 *   _UNUSED - unnamed parameter
 *   _       - pass-through of the local variable `iter` in a call
 *   _NULL   - null iterator argument in a call
 */
#if UCONFIG_NO_BREAK_ITERATION
#   define UCASEMAP_BREAK_ITERATOR_PARAM
#   define UCASEMAP_BREAK_ITERATOR_UNUSED
#   define UCASEMAP_BREAK_ITERATOR
#   define UCASEMAP_BREAK_ITERATOR_NULL
#else
#   define UCASEMAP_BREAK_ITERATOR_PARAM icu::BreakIterator *iter,
#   define UCASEMAP_BREAK_ITERATOR_UNUSED icu::BreakIterator *,
#   define UCASEMAP_BREAK_ITERATOR iter,
#   define UCASEMAP_BREAK_ITERATOR_NULL nullptr,
#endif
134
135U_CFUNC int32_t
136ustrcase_getCaseLocale(const char *locale);
137
138// TODO: swap src / dest if approved for new public api
139/** Implements UStringCaseMapper. */
140U_CFUNC int32_t U_CALLCONV
141ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
142                         UChar *dest, int32_t destCapacity,
143                         const UChar *src, int32_t srcLength,
144                         icu::Edits *edits,
145                         UErrorCode &errorCode);
146
147/** Implements UStringCaseMapper. */
148U_CFUNC int32_t U_CALLCONV
149ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
150                         UChar *dest, int32_t destCapacity,
151                         const UChar *src, int32_t srcLength,
152                         icu::Edits *edits,
153                         UErrorCode &errorCode);
154
155#if !UCONFIG_NO_BREAK_ITERATION
156
157/** Implements UStringCaseMapper. */
158U_CFUNC int32_t U_CALLCONV
159ustrcase_internalToTitle(int32_t caseLocale, uint32_t options,
160                         icu::BreakIterator *iter,
161                         UChar *dest, int32_t destCapacity,
162                         const UChar *src, int32_t srcLength,
163                         icu::Edits *edits,
164                         UErrorCode &errorCode);
165
166#endif
167
168/** Implements UStringCaseMapper. */
169U_CFUNC int32_t U_CALLCONV
170ustrcase_internalFold(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
171                      UChar *dest, int32_t destCapacity,
172                      const UChar *src, int32_t srcLength,
173                      icu::Edits *edits,
174                      UErrorCode &errorCode);
175
176/**
177 * Common string case mapping implementation for ucasemap_toXyz() and UnicodeString::toXyz().
178 * Implements argument checking.
179 */
180U_CFUNC int32_t
181ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
182             UChar *dest, int32_t destCapacity,
183             const UChar *src, int32_t srcLength,
184             UStringCaseMapper *stringCaseMapper,
185             icu::Edits *edits,
186             UErrorCode &errorCode);
187
188/**
189 * Common string case mapping implementation for old-fashioned u_strToXyz() functions
190 * that allow the source string to overlap the destination buffer.
191 * Implements argument checking and internally works with an intermediate buffer if necessary.
192 */
193U_CFUNC int32_t
194ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
195                        UChar *dest, int32_t destCapacity,
196                        const UChar *src, int32_t srcLength,
197                        UStringCaseMapper *stringCaseMapper,
198                        UErrorCode &errorCode);
199
200/**
201 * UTF-8 string case mapping function type, used by ucasemap_mapUTF8().
202 * UTF-8 version of UStringCaseMapper.
203 * All error checking must be done.
204 * The UCaseMap must be fully initialized, with locale and/or iter set as needed.
205 */
206typedef void U_CALLCONV
207UTF8CaseMapper(int32_t caseLocale, uint32_t options,
208#if !UCONFIG_NO_BREAK_ITERATION
209               icu::BreakIterator *iter,
210#endif
211               const uint8_t *src, int32_t srcLength,
212               icu::ByteSink &sink, icu::Edits *edits,
213               UErrorCode &errorCode);
214
215#if !UCONFIG_NO_BREAK_ITERATION
216
217/** Implements UTF8CaseMapper. */
218U_CFUNC void U_CALLCONV
219ucasemap_internalUTF8ToTitle(int32_t caseLocale, uint32_t options,
220        icu::BreakIterator *iter,
221        const uint8_t *src, int32_t srcLength,
222        icu::ByteSink &sink, icu::Edits *edits,
223        UErrorCode &errorCode);
224
225#endif
226
227void
228ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
229                 const char *src, int32_t srcLength,
230                 UTF8CaseMapper *stringCaseMapper,
231                 icu::ByteSink &sink, icu::Edits *edits,
232                 UErrorCode &errorCode);
233
234/**
235 * Implements argument checking and buffer handling
236 * for UTF-8 string case mapping as a common function.
237 */
238int32_t
239ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
240                 char *dest, int32_t destCapacity,
241                 const char *src, int32_t srcLength,
242                 UTF8CaseMapper *stringCaseMapper,
243                 icu::Edits *edits,
244                 UErrorCode &errorCode);
245
246U_NAMESPACE_BEGIN
247namespace GreekUpper {
248
249// Data bits.
250static const uint32_t UPPER_MASK = 0x3ff;
251static const uint32_t HAS_VOWEL = 0x1000;
252static const uint32_t HAS_YPOGEGRAMMENI = 0x2000;
253static const uint32_t HAS_ACCENT = 0x4000;
254static const uint32_t HAS_DIALYTIKA = 0x8000;
255// Further bits during data building and processing, not stored in the data map.
256static const uint32_t HAS_COMBINING_DIALYTIKA = 0x10000;
257static const uint32_t HAS_OTHER_GREEK_DIACRITIC = 0x20000;
258
259static const uint32_t HAS_VOWEL_AND_ACCENT = HAS_VOWEL | HAS_ACCENT;
260static const uint32_t HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA =
261        HAS_VOWEL_AND_ACCENT | HAS_DIALYTIKA;
262static const uint32_t HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALYTIKA;
263
264// State bits.
265static const uint32_t AFTER_CASED = 1;
266static const uint32_t AFTER_VOWEL_WITH_ACCENT = 2;
267
268uint32_t getLetterData(UChar32 c);
269
270/**
271 * Returns a non-zero value for each of the Greek combining diacritics
272 * listed in The Unicode Standard, version 8, chapter 7.2 Greek,
273 * plus some perispomeni look-alikes.
274 */
275uint32_t getDiacriticData(UChar32 c);
276
277}  // namespace GreekUpper
278U_NAMESPACE_END
279
280#endif  // __cplusplus
281
282#endif  // __UCASEMAP_IMP_H__
283