1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4 * Copyright (C) 2001-2011, International Business Machines Corporation
5 * and others. All Rights Reserved.
6 **********************************************************************
7 *   Date        Name        Description
8 *   07/23/01    aliu        Creation.
9 **********************************************************************
10 */
11#ifndef STRMATCH_H
12#define STRMATCH_H
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_TRANSLITERATION
17
18#include "unicode/unistr.h"
19#include "unicode/unifunct.h"
20#include "unicode/unimatch.h"
21#include "unicode/unirepl.h"
22
23U_NAMESPACE_BEGIN
24
25class TransliterationRuleData;
26
27/**
28 * An object that matches a fixed input string, implementing the
29 * UnicodeMatcher API.  This object also implements the
30 * UnicodeReplacer API, allowing it to emit the matched text as
31 * output.  Since the match text may contain flexible match elements,
32 * such as UnicodeSets, the emitted text is not the match pattern, but
33 * instead a substring of the actual matched text.  Following
34 * convention, the output text is the leftmost match seen up to this
35 * point.
36 *
37 * A StringMatcher may represent a segment, in which case it has a
38 * positive segment number.  This affects how the matcher converts
39 * itself to a pattern but does not otherwise affect its function.
40 *
41 * A StringMatcher that is not a segment should not be used as a
42 * UnicodeReplacer.
43 */
44class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
45
46 public:
47
48    /**
49     * Construct a matcher that matches the given pattern string.
50     * @param string the pattern to be matched, possibly containing
51     * stand-ins that represent nested UnicodeMatcher objects.
52     * @param start inclusive start index of text to be replaced
53     * @param limit exclusive end index of text to be replaced;
54     * must be greater than or equal to start
55     * @param segmentNum the segment number from 1..n, or 0 if this is
56     * not a segment.
57     * @param data context object mapping stand-ins to
58     * UnicodeMatcher objects.
59     */
60    StringMatcher(const UnicodeString& string,
61                  int32_t start,
62                  int32_t limit,
63                  int32_t segmentNum,
64                  const TransliterationRuleData& data);
65
66    /**
67     * Copy constructor
68     * @param o  the object to be copied.
69     */
70    StringMatcher(const StringMatcher& o);
71
72    /**
73     * Destructor
74     */
75    virtual ~StringMatcher();
76
77    /**
78     * Implement UnicodeFunctor
79     * @return a copy of the object.
80     */
81    virtual StringMatcher* clone() const override;
82
83    /**
84     * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
85     * and return the pointer.
86     * @return the UnicodeMatcher point.
87     */
88    virtual UnicodeMatcher* toMatcher() const override;
89
90    /**
91     * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
92     * and return the pointer.
93     * @return the UnicodeReplacer pointer.
94     */
95    virtual UnicodeReplacer* toReplacer() const override;
96
97    /**
98     * Implement UnicodeMatcher
99     * @param text the text to be matched
100     * @param offset on input, the index into text at which to begin
101     * matching.  On output, the limit of the matched text.  The
102     * number of matched characters is the output value of offset
103     * minus the input value.  Offset should always point to the
104     * HIGH SURROGATE (leading code unit) of a pair of surrogates,
105     * both on entry and upon return.
106     * @param limit the limit index of text to be matched.  Greater
107     * than offset for a forward direction match, less than offset for
108     * a backward direction match.  The last character to be
109     * considered for matching will be text.charAt(limit-1) in the
110     * forward direction or text.charAt(limit+1) in the backward
111     * direction.
112     * @param incremental  if true, then assume further characters may
113     * be inserted at limit and check for partial matching.  Otherwise
114     * assume the text as given is complete.
115     * @return a match degree value indicating a full match, a partial
116     * match, or a mismatch.  If incremental is false then
117     * U_PARTIAL_MATCH should never be returned.
118     */
119    virtual UMatchDegree matches(const Replaceable& text,
120                                 int32_t& offset,
121                                 int32_t limit,
122                                 UBool incremental) override;
123
124    /**
125     * Implement UnicodeMatcher
126     * @param result            Output param to receive the pattern.
127     * @param escapeUnprintable if True then escape the unprintable characters.
128     * @return                  A reference to 'result'.
129     */
130    virtual UnicodeString& toPattern(UnicodeString& result,
131                                     UBool escapeUnprintable = false) const override;
132
133    /**
134     * Implement UnicodeMatcher
135     * Returns true if this matcher will match a character c, where c
136     * & 0xFF == v, at offset, in the forward direction (with limit >
137     * offset).  This is used by <tt>RuleBasedTransliterator</tt> for
138     * indexing.
139     * @param v    the given value
140     * @return     true if this matcher will match a character c,
141     *             where c & 0xFF == v
142     */
143    virtual UBool matchesIndexValue(uint8_t v) const override;
144
145    /**
146     * Implement UnicodeMatcher
147     */
148    virtual void addMatchSetTo(UnicodeSet& toUnionTo) const override;
149
150    /**
151     * Implement UnicodeFunctor
152     */
153    virtual void setData(const TransliterationRuleData*) override;
154
155    /**
156     * Replace characters in 'text' from 'start' to 'limit' with the
157     * output text of this object.  Update the 'cursor' parameter to
158     * give the cursor position and return the length of the
159     * replacement text.
160     *
161     * @param text the text to be matched
162     * @param start inclusive start index of text to be replaced
163     * @param limit exclusive end index of text to be replaced;
164     * must be greater than or equal to start
165     * @param cursor output parameter for the cursor position.
166     * Not all replacer objects will update this, but in a complete
167     * tree of replacer objects, representing the entire output side
168     * of a transliteration rule, at least one must update it.
169     * @return the number of 16-bit code units in the text replacing
170     * the characters at offsets start..(limit-1) in text
171     */
172    virtual int32_t replace(Replaceable& text,
173                            int32_t start,
174                            int32_t limit,
175                            int32_t& cursor) override;
176
177    /**
178     * Returns a string representation of this replacer.  If the
179     * result of calling this function is passed to the appropriate
180     * parser, typically TransliteratorParser, it will produce another
181     * replacer that is equal to this one.
182     * @param result the string to receive the pattern.  Previous
183     * contents will be deleted.
184     * @param escapeUnprintable if true then convert unprintable
185     * character to their hex escape representations, \\uxxxx or
186     * \\Uxxxxxxxx.  Unprintable characters are defined by
187     * Utility.isUnprintable().
188     * @return a reference to 'result'.
189     */
190    virtual UnicodeString& toReplacerPattern(UnicodeString& result,
191                                             UBool escapeUnprintable) const override;
192
193    /**
194     * Remove any match data.  This must be called before performing a
195     * set of matches with this segment.
196     */
197    void resetMatch();
198
199    /**
200     * ICU "poor man's RTTI", returns a UClassID for the actual class.
201     */
202    virtual UClassID getDynamicClassID() const override;
203
204    /**
205     * ICU "poor man's RTTI", returns a UClassID for this class.
206     */
207    static UClassID U_EXPORT2 getStaticClassID();
208
209    /**
210     * Union the set of all characters that may output by this object
211     * into the given set.
212     * @param toUnionTo the set into which to union the output characters
213     */
214    virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const override;
215
216 private:
217
218    /**
219     * The text to be matched.
220     */
221    UnicodeString pattern;
222
223    /**
224     * Context object that maps stand-ins to matcher and replacer
225     * objects.
226     */
227    const TransliterationRuleData* data;
228
229    /**
230     * The segment number, 1-based, or 0 if not a segment.
231     */
232    int32_t segmentNumber;
233
234    /**
235     * Start offset, in the match text, of the <em>rightmost</em>
236     * match.
237     */
238    int32_t matchStart;
239
240    /**
241     * Limit offset, in the match text, of the <em>rightmost</em>
242     * match.
243     */
244    int32_t matchLimit;
245
246};
247
248U_NAMESPACE_END
249
250#endif /* #if !UCONFIG_NO_TRANSLITERATION */
251
252#endif
253