// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 * Copyright (C) 2001-2011, International Business Machines Corporation
 * and others. All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
 *   07/23/01    aliu        Creation.
 **********************************************************************
 */
111cb0ef41Sopenharmony_ci#ifndef STRMATCH_H
121cb0ef41Sopenharmony_ci#define STRMATCH_H
131cb0ef41Sopenharmony_ci
141cb0ef41Sopenharmony_ci#include "unicode/utypes.h"
151cb0ef41Sopenharmony_ci
161cb0ef41Sopenharmony_ci#if !UCONFIG_NO_TRANSLITERATION
171cb0ef41Sopenharmony_ci
181cb0ef41Sopenharmony_ci#include "unicode/unistr.h"
191cb0ef41Sopenharmony_ci#include "unicode/unifunct.h"
201cb0ef41Sopenharmony_ci#include "unicode/unimatch.h"
211cb0ef41Sopenharmony_ci#include "unicode/unirepl.h"
221cb0ef41Sopenharmony_ci
231cb0ef41Sopenharmony_ciU_NAMESPACE_BEGIN
241cb0ef41Sopenharmony_ci
251cb0ef41Sopenharmony_ciclass TransliterationRuleData;
261cb0ef41Sopenharmony_ci
271cb0ef41Sopenharmony_ci/**
281cb0ef41Sopenharmony_ci * An object that matches a fixed input string, implementing the
291cb0ef41Sopenharmony_ci * UnicodeMatcher API.  This object also implements the
301cb0ef41Sopenharmony_ci * UnicodeReplacer API, allowing it to emit the matched text as
311cb0ef41Sopenharmony_ci * output.  Since the match text may contain flexible match elements,
321cb0ef41Sopenharmony_ci * such as UnicodeSets, the emitted text is not the match pattern, but
331cb0ef41Sopenharmony_ci * instead a substring of the actual matched text.  Following
341cb0ef41Sopenharmony_ci * convention, the output text is the leftmost match seen up to this
351cb0ef41Sopenharmony_ci * point.
361cb0ef41Sopenharmony_ci *
371cb0ef41Sopenharmony_ci * A StringMatcher may represent a segment, in which case it has a
381cb0ef41Sopenharmony_ci * positive segment number.  This affects how the matcher converts
391cb0ef41Sopenharmony_ci * itself to a pattern but does not otherwise affect its function.
401cb0ef41Sopenharmony_ci *
411cb0ef41Sopenharmony_ci * A StringMatcher that is not a segment should not be used as a
421cb0ef41Sopenharmony_ci * UnicodeReplacer.
431cb0ef41Sopenharmony_ci */
441cb0ef41Sopenharmony_ciclass StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
451cb0ef41Sopenharmony_ci
461cb0ef41Sopenharmony_ci public:
471cb0ef41Sopenharmony_ci
481cb0ef41Sopenharmony_ci    /**
491cb0ef41Sopenharmony_ci     * Construct a matcher that matches the given pattern string.
501cb0ef41Sopenharmony_ci     * @param string the pattern to be matched, possibly containing
511cb0ef41Sopenharmony_ci     * stand-ins that represent nested UnicodeMatcher objects.
521cb0ef41Sopenharmony_ci     * @param start inclusive start index of text to be replaced
531cb0ef41Sopenharmony_ci     * @param limit exclusive end index of text to be replaced;
541cb0ef41Sopenharmony_ci     * must be greater than or equal to start
551cb0ef41Sopenharmony_ci     * @param segmentNum the segment number from 1..n, or 0 if this is
561cb0ef41Sopenharmony_ci     * not a segment.
571cb0ef41Sopenharmony_ci     * @param data context object mapping stand-ins to
581cb0ef41Sopenharmony_ci     * UnicodeMatcher objects.
591cb0ef41Sopenharmony_ci     */
601cb0ef41Sopenharmony_ci    StringMatcher(const UnicodeString& string,
611cb0ef41Sopenharmony_ci                  int32_t start,
621cb0ef41Sopenharmony_ci                  int32_t limit,
631cb0ef41Sopenharmony_ci                  int32_t segmentNum,
641cb0ef41Sopenharmony_ci                  const TransliterationRuleData& data);
651cb0ef41Sopenharmony_ci
661cb0ef41Sopenharmony_ci    /**
671cb0ef41Sopenharmony_ci     * Copy constructor
681cb0ef41Sopenharmony_ci     * @param o  the object to be copied.
691cb0ef41Sopenharmony_ci     */
701cb0ef41Sopenharmony_ci    StringMatcher(const StringMatcher& o);
711cb0ef41Sopenharmony_ci
721cb0ef41Sopenharmony_ci    /**
731cb0ef41Sopenharmony_ci     * Destructor
741cb0ef41Sopenharmony_ci     */
751cb0ef41Sopenharmony_ci    virtual ~StringMatcher();
761cb0ef41Sopenharmony_ci
771cb0ef41Sopenharmony_ci    /**
781cb0ef41Sopenharmony_ci     * Implement UnicodeFunctor
791cb0ef41Sopenharmony_ci     * @return a copy of the object.
801cb0ef41Sopenharmony_ci     */
811cb0ef41Sopenharmony_ci    virtual StringMatcher* clone() const override;
821cb0ef41Sopenharmony_ci
831cb0ef41Sopenharmony_ci    /**
841cb0ef41Sopenharmony_ci     * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
851cb0ef41Sopenharmony_ci     * and return the pointer.
861cb0ef41Sopenharmony_ci     * @return the UnicodeMatcher point.
871cb0ef41Sopenharmony_ci     */
881cb0ef41Sopenharmony_ci    virtual UnicodeMatcher* toMatcher() const override;
891cb0ef41Sopenharmony_ci
901cb0ef41Sopenharmony_ci    /**
911cb0ef41Sopenharmony_ci     * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
921cb0ef41Sopenharmony_ci     * and return the pointer.
931cb0ef41Sopenharmony_ci     * @return the UnicodeReplacer pointer.
941cb0ef41Sopenharmony_ci     */
951cb0ef41Sopenharmony_ci    virtual UnicodeReplacer* toReplacer() const override;
961cb0ef41Sopenharmony_ci
971cb0ef41Sopenharmony_ci    /**
981cb0ef41Sopenharmony_ci     * Implement UnicodeMatcher
991cb0ef41Sopenharmony_ci     * @param text the text to be matched
1001cb0ef41Sopenharmony_ci     * @param offset on input, the index into text at which to begin
1011cb0ef41Sopenharmony_ci     * matching.  On output, the limit of the matched text.  The
1021cb0ef41Sopenharmony_ci     * number of matched characters is the output value of offset
1031cb0ef41Sopenharmony_ci     * minus the input value.  Offset should always point to the
1041cb0ef41Sopenharmony_ci     * HIGH SURROGATE (leading code unit) of a pair of surrogates,
1051cb0ef41Sopenharmony_ci     * both on entry and upon return.
1061cb0ef41Sopenharmony_ci     * @param limit the limit index of text to be matched.  Greater
1071cb0ef41Sopenharmony_ci     * than offset for a forward direction match, less than offset for
1081cb0ef41Sopenharmony_ci     * a backward direction match.  The last character to be
1091cb0ef41Sopenharmony_ci     * considered for matching will be text.charAt(limit-1) in the
1101cb0ef41Sopenharmony_ci     * forward direction or text.charAt(limit+1) in the backward
1111cb0ef41Sopenharmony_ci     * direction.
1121cb0ef41Sopenharmony_ci     * @param incremental  if true, then assume further characters may
1131cb0ef41Sopenharmony_ci     * be inserted at limit and check for partial matching.  Otherwise
1141cb0ef41Sopenharmony_ci     * assume the text as given is complete.
1151cb0ef41Sopenharmony_ci     * @return a match degree value indicating a full match, a partial
1161cb0ef41Sopenharmony_ci     * match, or a mismatch.  If incremental is false then
1171cb0ef41Sopenharmony_ci     * U_PARTIAL_MATCH should never be returned.
1181cb0ef41Sopenharmony_ci     */
1191cb0ef41Sopenharmony_ci    virtual UMatchDegree matches(const Replaceable& text,
1201cb0ef41Sopenharmony_ci                                 int32_t& offset,
1211cb0ef41Sopenharmony_ci                                 int32_t limit,
1221cb0ef41Sopenharmony_ci                                 UBool incremental) override;
1231cb0ef41Sopenharmony_ci
1241cb0ef41Sopenharmony_ci    /**
1251cb0ef41Sopenharmony_ci     * Implement UnicodeMatcher
1261cb0ef41Sopenharmony_ci     * @param result            Output param to receive the pattern.
1271cb0ef41Sopenharmony_ci     * @param escapeUnprintable if True then escape the unprintable characters.
1281cb0ef41Sopenharmony_ci     * @return                  A reference to 'result'.
1291cb0ef41Sopenharmony_ci     */
1301cb0ef41Sopenharmony_ci    virtual UnicodeString& toPattern(UnicodeString& result,
1311cb0ef41Sopenharmony_ci                                     UBool escapeUnprintable = false) const override;
1321cb0ef41Sopenharmony_ci
1331cb0ef41Sopenharmony_ci    /**
1341cb0ef41Sopenharmony_ci     * Implement UnicodeMatcher
1351cb0ef41Sopenharmony_ci     * Returns true if this matcher will match a character c, where c
1361cb0ef41Sopenharmony_ci     * & 0xFF == v, at offset, in the forward direction (with limit >
1371cb0ef41Sopenharmony_ci     * offset).  This is used by <tt>RuleBasedTransliterator</tt> for
1381cb0ef41Sopenharmony_ci     * indexing.
1391cb0ef41Sopenharmony_ci     * @param v    the given value
1401cb0ef41Sopenharmony_ci     * @return     true if this matcher will match a character c,
1411cb0ef41Sopenharmony_ci     *             where c & 0xFF == v
1421cb0ef41Sopenharmony_ci     */
1431cb0ef41Sopenharmony_ci    virtual UBool matchesIndexValue(uint8_t v) const override;
1441cb0ef41Sopenharmony_ci
1451cb0ef41Sopenharmony_ci    /**
1461cb0ef41Sopenharmony_ci     * Implement UnicodeMatcher
1471cb0ef41Sopenharmony_ci     */
1481cb0ef41Sopenharmony_ci    virtual void addMatchSetTo(UnicodeSet& toUnionTo) const override;
1491cb0ef41Sopenharmony_ci
1501cb0ef41Sopenharmony_ci    /**
1511cb0ef41Sopenharmony_ci     * Implement UnicodeFunctor
1521cb0ef41Sopenharmony_ci     */
1531cb0ef41Sopenharmony_ci    virtual void setData(const TransliterationRuleData*) override;
1541cb0ef41Sopenharmony_ci
1551cb0ef41Sopenharmony_ci    /**
1561cb0ef41Sopenharmony_ci     * Replace characters in 'text' from 'start' to 'limit' with the
1571cb0ef41Sopenharmony_ci     * output text of this object.  Update the 'cursor' parameter to
1581cb0ef41Sopenharmony_ci     * give the cursor position and return the length of the
1591cb0ef41Sopenharmony_ci     * replacement text.
1601cb0ef41Sopenharmony_ci     *
1611cb0ef41Sopenharmony_ci     * @param text the text to be matched
1621cb0ef41Sopenharmony_ci     * @param start inclusive start index of text to be replaced
1631cb0ef41Sopenharmony_ci     * @param limit exclusive end index of text to be replaced;
1641cb0ef41Sopenharmony_ci     * must be greater than or equal to start
1651cb0ef41Sopenharmony_ci     * @param cursor output parameter for the cursor position.
1661cb0ef41Sopenharmony_ci     * Not all replacer objects will update this, but in a complete
1671cb0ef41Sopenharmony_ci     * tree of replacer objects, representing the entire output side
1681cb0ef41Sopenharmony_ci     * of a transliteration rule, at least one must update it.
1691cb0ef41Sopenharmony_ci     * @return the number of 16-bit code units in the text replacing
1701cb0ef41Sopenharmony_ci     * the characters at offsets start..(limit-1) in text
1711cb0ef41Sopenharmony_ci     */
1721cb0ef41Sopenharmony_ci    virtual int32_t replace(Replaceable& text,
1731cb0ef41Sopenharmony_ci                            int32_t start,
1741cb0ef41Sopenharmony_ci                            int32_t limit,
1751cb0ef41Sopenharmony_ci                            int32_t& cursor) override;
1761cb0ef41Sopenharmony_ci
1771cb0ef41Sopenharmony_ci    /**
1781cb0ef41Sopenharmony_ci     * Returns a string representation of this replacer.  If the
1791cb0ef41Sopenharmony_ci     * result of calling this function is passed to the appropriate
1801cb0ef41Sopenharmony_ci     * parser, typically TransliteratorParser, it will produce another
1811cb0ef41Sopenharmony_ci     * replacer that is equal to this one.
1821cb0ef41Sopenharmony_ci     * @param result the string to receive the pattern.  Previous
1831cb0ef41Sopenharmony_ci     * contents will be deleted.
1841cb0ef41Sopenharmony_ci     * @param escapeUnprintable if true then convert unprintable
1851cb0ef41Sopenharmony_ci     * character to their hex escape representations, \\uxxxx or
1861cb0ef41Sopenharmony_ci     * \\Uxxxxxxxx.  Unprintable characters are defined by
1871cb0ef41Sopenharmony_ci     * Utility.isUnprintable().
1881cb0ef41Sopenharmony_ci     * @return a reference to 'result'.
1891cb0ef41Sopenharmony_ci     */
1901cb0ef41Sopenharmony_ci    virtual UnicodeString& toReplacerPattern(UnicodeString& result,
1911cb0ef41Sopenharmony_ci                                             UBool escapeUnprintable) const override;
1921cb0ef41Sopenharmony_ci
1931cb0ef41Sopenharmony_ci    /**
1941cb0ef41Sopenharmony_ci     * Remove any match data.  This must be called before performing a
1951cb0ef41Sopenharmony_ci     * set of matches with this segment.
1961cb0ef41Sopenharmony_ci     */
1971cb0ef41Sopenharmony_ci    void resetMatch();
1981cb0ef41Sopenharmony_ci
1991cb0ef41Sopenharmony_ci    /**
2001cb0ef41Sopenharmony_ci     * ICU "poor man's RTTI", returns a UClassID for the actual class.
2011cb0ef41Sopenharmony_ci     */
2021cb0ef41Sopenharmony_ci    virtual UClassID getDynamicClassID() const override;
2031cb0ef41Sopenharmony_ci
2041cb0ef41Sopenharmony_ci    /**
2051cb0ef41Sopenharmony_ci     * ICU "poor man's RTTI", returns a UClassID for this class.
2061cb0ef41Sopenharmony_ci     */
2071cb0ef41Sopenharmony_ci    static UClassID U_EXPORT2 getStaticClassID();
2081cb0ef41Sopenharmony_ci
2091cb0ef41Sopenharmony_ci    /**
2101cb0ef41Sopenharmony_ci     * Union the set of all characters that may output by this object
2111cb0ef41Sopenharmony_ci     * into the given set.
2121cb0ef41Sopenharmony_ci     * @param toUnionTo the set into which to union the output characters
2131cb0ef41Sopenharmony_ci     */
2141cb0ef41Sopenharmony_ci    virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const override;
2151cb0ef41Sopenharmony_ci
2161cb0ef41Sopenharmony_ci private:
2171cb0ef41Sopenharmony_ci
2181cb0ef41Sopenharmony_ci    /**
2191cb0ef41Sopenharmony_ci     * The text to be matched.
2201cb0ef41Sopenharmony_ci     */
2211cb0ef41Sopenharmony_ci    UnicodeString pattern;
2221cb0ef41Sopenharmony_ci
2231cb0ef41Sopenharmony_ci    /**
2241cb0ef41Sopenharmony_ci     * Context object that maps stand-ins to matcher and replacer
2251cb0ef41Sopenharmony_ci     * objects.
2261cb0ef41Sopenharmony_ci     */
2271cb0ef41Sopenharmony_ci    const TransliterationRuleData* data;
2281cb0ef41Sopenharmony_ci
2291cb0ef41Sopenharmony_ci    /**
2301cb0ef41Sopenharmony_ci     * The segment number, 1-based, or 0 if not a segment.
2311cb0ef41Sopenharmony_ci     */
2321cb0ef41Sopenharmony_ci    int32_t segmentNumber;
2331cb0ef41Sopenharmony_ci
2341cb0ef41Sopenharmony_ci    /**
2351cb0ef41Sopenharmony_ci     * Start offset, in the match text, of the <em>rightmost</em>
2361cb0ef41Sopenharmony_ci     * match.
2371cb0ef41Sopenharmony_ci     */
2381cb0ef41Sopenharmony_ci    int32_t matchStart;
2391cb0ef41Sopenharmony_ci
2401cb0ef41Sopenharmony_ci    /**
2411cb0ef41Sopenharmony_ci     * Limit offset, in the match text, of the <em>rightmost</em>
2421cb0ef41Sopenharmony_ci     * match.
2431cb0ef41Sopenharmony_ci     */
2441cb0ef41Sopenharmony_ci    int32_t matchLimit;
2451cb0ef41Sopenharmony_ci
2461cb0ef41Sopenharmony_ci};
2471cb0ef41Sopenharmony_ci
2481cb0ef41Sopenharmony_ciU_NAMESPACE_END
2491cb0ef41Sopenharmony_ci
2501cb0ef41Sopenharmony_ci#endif /* #if !UCONFIG_NO_TRANSLITERATION */
2511cb0ef41Sopenharmony_ci
2521cb0ef41Sopenharmony_ci#endif
253