1                                           /*
2* Copyright 2022 Google Inc.
3*
4* Use of this source code is governed by a BSD-style license that can be
5* found in the LICENSE file.
6*/
7#include "include/core/SkSpan.h"
8#include "include/core/SkString.h"
9#include "include/core/SkTypes.h"
10#include "include/private/SkBitmaskEnum.h"
11#include "include/private/SkTArray.h"
12#include "include/private/SkTo.h"
13#include "modules/skunicode/include/SkUnicode.h"
14#include "modules/skunicode/src/SkUnicode_client.h"
15#include "modules/skunicode/src/SkUnicode_icu_bidi.h"
16#include "src/utils/SkUTF.h"
17
18#include <algorithm>
19#include <cstdint>
20#include <memory>
21#include <string>
22#include <utility>
23#include <vector>
24#include <array>
25#include <unicode/ubidi.h>
26#include <unicode/ubrk.h>
27#include <unicode/uchar.h>
28#include <unicode/uloc.h>
29#include <unicode/uscript.h>
30#include <unicode/ustring.h>
31#include <unicode/utext.h>
32#include <unicode/utypes.h>
33
34
35#ifndef SK_UNICODE_ICU_IMPLEMENTATION
36
37const char* SkUnicode_IcuBidi::errorName(UErrorCode status) {
38    return u_errorName_skia(status);
39}
40void SkUnicode_IcuBidi::bidi_close(UBiDi* bidi) {
41    ubidi_close_skia(bidi);
42}
43UBiDiDirection SkUnicode_IcuBidi::bidi_getDirection(const UBiDi* bidi) {
44    return ubidi_getDirection_skia(bidi);
45}
46SkBidiIterator::Position SkUnicode_IcuBidi::bidi_getLength(const UBiDi* bidi) {
47    return ubidi_getLength_skia(bidi);
48}
49SkBidiIterator::Level SkUnicode_IcuBidi::bidi_getLevelAt(const UBiDi* bidi, int pos) {
50    return ubidi_getLevelAt_skia(bidi, pos);
51}
52UBiDi* SkUnicode_IcuBidi::bidi_openSized(int32_t maxLength, int32_t maxRunCount, UErrorCode* pErrorCode) {
53    return ubidi_openSized_skia(maxLength, maxRunCount, pErrorCode);
54}
55void SkUnicode_IcuBidi::bidi_setPara(UBiDi* bidi,
56                         const UChar* text,
57                         int32_t length,
58                         UBiDiLevel paraLevel,
59                         UBiDiLevel* embeddingLevels,
60                         UErrorCode* status) {
61    return ubidi_setPara_skia(bidi, text, length, paraLevel, embeddingLevels, status);
62}
63void SkUnicode_IcuBidi::bidi_reorderVisual(const SkUnicode::BidiLevel runLevels[],
64                               int levelsCount,
65                               int32_t logicalFromVisual[]) {
66    ubidi_reorderVisual_skia(runLevels, levelsCount, logicalFromVisual);
67}
68#endif
69
70class SkUnicode_client : public SkUnicode {
71public:
72    struct Data {
73        SkSpan<const char> fText8;
74        SkSpan<const char16_t> fText16;
75        std::vector<Position> fWords;
76        std::vector<SkUnicode::Position> fGraphemeBreaks;
77        std::vector<SkUnicode::LineBreakBefore> fLineBreaks;
78        Data(SkSpan<char> text,
79             std::vector<SkUnicode::Position> words,
80             std::vector<SkUnicode::Position> graphemeBreaks,
81             std::vector<SkUnicode::LineBreakBefore> lineBreaks)
82            : fText8(text)
83            , fText16(SkSpan<const char16_t>(nullptr, 0))
84            , fWords(std::move(words))
85            , fGraphemeBreaks(std::move(graphemeBreaks))
86            , fLineBreaks(std::move(lineBreaks)) {
87        }
88
89        void reset() {
90            fText8 = SkSpan<const char>(nullptr, 0);
91            fText16 = SkSpan<const char16_t>(nullptr, 0);
92            fGraphemeBreaks.clear();
93            fLineBreaks.clear();
94        }
95    };
96    SkUnicode_client() = delete;
97    SkUnicode_client(SkSpan<char> text,
98                     std::vector<SkUnicode::Position> words,
99                     std::vector<SkUnicode::Position> graphemeBreaks,
100                     std::vector<SkUnicode::LineBreakBefore> lineBreaks)
101            : fData(std::make_shared<Data>(text,
102                                           std::move(words),
103                                           std::move(graphemeBreaks),
104                                           std::move(lineBreaks))) { }
105    SkUnicode_client(const SkUnicode_client* origin)
106            : fData(origin->fData) {}
107
108
109    std::unique_ptr<SkUnicode> copy() override {
110        return std::make_unique<SkUnicode_client>(this);
111    }
112
113    ~SkUnicode_client() override = default;
114
115    void reset() { fData->reset(); }
116    // For SkShaper
117    std::unique_ptr<SkBidiIterator> makeBidiIterator(const uint16_t text[], int count,
118                                                     SkBidiIterator::Direction dir) override;
119    std::unique_ptr<SkBidiIterator> makeBidiIterator(const char text[],
120                                                     int count,
121                                                     SkBidiIterator::Direction dir) override;
122    std::unique_ptr<SkBreakIterator> makeBreakIterator(const char locale[],
123                                                       BreakType breakType) override;
124    std::unique_ptr<SkBreakIterator> makeBreakIterator(BreakType breakType) override;
125    // For SkParagraph
126    bool getBidiRegions(const char utf8[],
127                        int utf8Units,
128                        TextDirection dir,
129                        std::vector<BidiRegion>* results) override {
130        return SkUnicode::extractBidi(utf8, utf8Units, dir, results);
131    }
132
133    // TODO: Take if from the Client or hard code here?
134    static bool isControl(SkUnichar utf8) {
135        return (utf8 < ' ') || (utf8 >= 0x7f && utf8 <= 0x9f) ||
136               (utf8 >= 0x200D && utf8 <= 0x200F) ||
137               (utf8 >= 0x202A && utf8 <= 0x202E);
138    }
139
140    static bool isWhitespace(SkUnichar unichar) {
141        static constexpr std::array<SkUnichar, 21> whitespaces {
142                0x0009, // character tabulation
143                0x000A, // line feed
144                0x000B, // line tabulation
145                0x000C, // form feed
146                0x000D, // carriage return
147                0x0020, // space
148              //0x0085, // next line
149              //0x00A0, // no-break space
150                0x1680, // ogham space mark
151                0x2000, // en quad
152                0x2001, // em quad
153                0x2002, // en space
154                0x2003, // em space
155                0x2004, // three-per-em space
156                0x2005, // four-per-em space
157                0x2006, // six-per-em space
158              //0x2007, // figure space
159                0x2008, // punctuation space
160                0x2009, // thin space
161                0x200A, // hair space
162                0x2028, // line separator
163                0x2029, // paragraph separator
164              //0x202F, // narrow no-break space
165                0x205F, // medium mathematical space
166                0x3000};// ideographic space
167        return std::find(whitespaces.begin(), whitespaces.end(), unichar) != whitespaces.end();
168    }
169
170    static bool isSpace(SkUnichar unichar) {
171        static constexpr std::array<SkUnichar, 25> spaces {
172                0x0009, // character tabulation
173                0x000A, // line feed
174                0x000B, // line tabulation
175                0x000C, // form feed
176                0x000D, // carriage return
177                0x0020, // space
178                0x0085, // next line
179                0x00A0, // no-break space
180                0x1680, // ogham space mark
181                0x2000, // en quad
182                0x2001, // em quad
183                0x2002, // en space
184                0x2003, // em space
185                0x2004, // three-per-em space
186                0x2005, // four-per-em space
187                0x2006, // six-per-em space
188                0x2007, // figure space
189                0x2008, // punctuation space
190                0x2009, // thin space
191                0x200A, // hair space
192                0x2028, // line separator
193                0x2029, // paragraph separator
194                0x202F, // narrow no-break space
195                0x205F, // medium mathematical space
196                0x3000}; // ideographic space
197        return std::find(spaces.begin(), spaces.end(), unichar) != spaces.end();
198    }
199
200    static bool isTabulation(SkUnichar utf8) {
201        return utf8 == '\t';
202    }
203
204    static bool isHardBreak(SkUnichar utf8) {
205        return utf8 == '\n';
206    }
207
208    static bool isIdeographic(SkUnichar unichar) {
209        static constexpr std::array<std::pair<SkUnichar, SkUnichar>, 8> ranges {{
210            {4352,   4607}, // Hangul Jamo
211            {11904, 42191}, // CJK_Radicals
212            {43072, 43135}, // Phags_Pa
213            {44032, 55215}, // Hangul_Syllables
214            {63744, 64255}, // CJK_Compatibility_Ideographs
215            {65072, 65103}, // CJK_Compatibility_Forms
216            {65381, 65500}, // Katakana_Hangul_Halfwidth
217            {131072, 196607} // Supplementary_Ideographic_Plane
218        }};
219        for (auto range : ranges) {
220            if (range.first <= unichar && range.second > unichar) {
221                return true;
222            }
223        }
224        return false;
225    }
226
227    bool computeCodeUnitFlags(char utf8[],
228                              int utf8Units,
229                              bool replaceTabs,
230                              SkTArray<SkUnicode::CodeUnitFlags, true>* results) override {
231        results->clear();
232        results->push_back_n(utf8Units + 1, CodeUnitFlags::kNoCodeUnitFlag);
233        for (auto& lineBreak : fData->fLineBreaks) {
234            (*results)[lineBreak.pos] |=
235                lineBreak.breakType == LineBreakType::kHardLineBreak
236                    ? CodeUnitFlags::kHardLineBreakBefore
237                    : CodeUnitFlags::kSoftLineBreakBefore;
238        }
239        for (auto& grapheme : fData->fGraphemeBreaks) {
240            (*results)[grapheme] |= CodeUnitFlags::kGraphemeStart;
241        }
242        const char* current = utf8;
243        const char* end = utf8 + utf8Units;
244        while (current < end) {
245            auto before = current - utf8;
246            SkUnichar unichar = SkUTF::NextUTF8(&current, end);
247            if (unichar < 0) unichar = 0xFFFD;
248            auto after = current - utf8;
249            if (replaceTabs && SkUnicode_client::isTabulation(unichar)) {
250                results->at(before) |= SkUnicode::kTabulation;
251                if (replaceTabs) {
252                    unichar = ' ';
253                    utf8[before] = ' ';
254                }
255            }
256            for (auto i = before; i < after; ++i) {
257                if (SkUnicode_client::isSpace(unichar)) {
258                    results->at(i) |= SkUnicode::kPartOfIntraWordBreak;
259                }
260                if (SkUnicode_client::isWhitespace(unichar)) {
261                    results->at(i) |= SkUnicode::kPartOfWhiteSpaceBreak;
262                }
263                if (SkUnicode_client::isControl(unichar)) {
264                    results->at(i) |= SkUnicode::kControl;
265                }
266                if (SkUnicode_client::isIdeographic(unichar)) {
267                    results->at(i) |= SkUnicode::kIdeographic;
268                }
269            }
270        }
271        return true;
272    }
273
274    bool computeCodeUnitFlags(char16_t utf16[], int utf16Units, bool replaceTabs,
275                          SkTArray<SkUnicode::CodeUnitFlags, true>* results) override {
276        results->clear();
277        results->push_back_n(utf16Units + 1, CodeUnitFlags::kNoCodeUnitFlag);
278        for (auto& lineBreak : fData->fLineBreaks) {
279            (*results)[lineBreak.pos] |=
280                lineBreak.breakType == LineBreakType::kHardLineBreak
281                    ? CodeUnitFlags::kHardLineBreakBefore
282                    : CodeUnitFlags::kSoftLineBreakBefore;
283        }
284        for (auto& grapheme : fData->fGraphemeBreaks) {
285            (*results)[grapheme] |= CodeUnitFlags::kGraphemeStart;
286        }
287        return true;
288    }
289
290    bool getWords(const char utf8[], int utf8Units, const char* locale, std::vector<Position>* results) override {
291        *results = fData->fWords;
292        return true;
293    }
294
295    SkString toUpper(const SkString& str) override {
296        SkASSERT(false);
297        return SkString(fData->fText8.data(), fData->fText8.size());
298    }
299
300    void reorderVisual(const BidiLevel runLevels[],
301                       int levelsCount,
302                       int32_t logicalFromVisual[]) override {
303        SkUnicode_IcuBidi::bidi_reorderVisual(runLevels, levelsCount, logicalFromVisual);
304    }
305private:
306    friend class SkBreakIterator_client;
307
308    std::shared_ptr<Data> fData;
309};
310
311class SkBreakIterator_client: public SkBreakIterator {
312    std::shared_ptr<SkUnicode_client::Data> fData;
313    Position fLastResult;
314    Position fStart;
315    Position fEnd;
316public:
317    explicit SkBreakIterator_client(std::shared_ptr<SkUnicode_client::Data> data) : fData(data) { }
318    Position first() override
319      { return fData->fLineBreaks[fStart + (fLastResult = 0)].pos; }
320    Position current() override
321      { return fData->fLineBreaks[fStart + fLastResult].pos; }
322    Position next() override
323      { return fData->fLineBreaks[fStart + fLastResult + 1].pos; }
324    Status status() override {
325        return fData->fLineBreaks[fStart + fLastResult].breakType ==
326                       SkUnicode::LineBreakType::kHardLineBreak
327                       ? SkUnicode::CodeUnitFlags::kHardLineBreakBefore
328                       : SkUnicode::CodeUnitFlags::kSoftLineBreakBefore;
329    }
330    bool isDone() override { return fStart + fLastResult == fEnd; }
331    bool setText(const char utftext8[], int utf8Units) override {
332        SkASSERT(utftext8 >= fData->fText8.data() &&
333                 utf8Units <= SkToS16(fData->fText8.size()));
334        fStart = utftext8 - fData->fText8.data();
335        fEnd = fStart + utf8Units;
336        fLastResult = 0;
337        return true;
338    }
339    bool setText(const char16_t utftext16[], int utf16Units) override {
340        SkASSERT(utftext16 >= fData->fText16.data() &&
341                 utf16Units <= SkToS16(fData->fText16.size()));
342        fStart = utftext16 - fData->fText16.data();
343        fEnd = fStart + utf16Units;
344        fLastResult = 0;
345        return true;
346    }
347};
348std::unique_ptr<SkBidiIterator> SkUnicode_client::makeBidiIterator(const uint16_t text[], int count,
349                                                 SkBidiIterator::Direction dir) {
350    return SkUnicode::makeBidiIterator(text, count, dir);
351}
352std::unique_ptr<SkBidiIterator> SkUnicode_client::makeBidiIterator(const char text[],
353                                                 int count,
354                                                 SkBidiIterator::Direction dir) {
355    return SkUnicode::makeBidiIterator(text, count, dir);
356}
357std::unique_ptr<SkBreakIterator> SkUnicode_client::makeBreakIterator(const char locale[],
358                                                   BreakType breakType) {
359    return std::make_unique<SkBreakIterator_client>(fData);
360}
361std::unique_ptr<SkBreakIterator> SkUnicode_client::makeBreakIterator(BreakType breakType) {
362    return std::make_unique<SkBreakIterator_client>(fData);
363}
364
365std::unique_ptr<SkUnicode> SkUnicode::MakeClientBasedUnicode(
366        SkSpan<char> text,
367        std::vector<SkUnicode::Position> words,
368        std::vector<SkUnicode::Position> graphemeBreaks,
369        std::vector<SkUnicode::LineBreakBefore> lineBreaks) {
370    return std::make_unique<SkUnicode_client>(text, words, graphemeBreaks, lineBreaks);
371}
372
373