11cb0ef41Sopenharmony_ci// © 2016 and later: Unicode, Inc. and others. 21cb0ef41Sopenharmony_ci// License & terms of use: http://www.unicode.org/copyright.html 31cb0ef41Sopenharmony_ci/* 41cb0ef41Sopenharmony_ci * Copyright (C) 2001-2011, International Business Machines Corporation 51cb0ef41Sopenharmony_ci * and others. All Rights Reserved. 61cb0ef41Sopenharmony_ci ********************************************************************** 71cb0ef41Sopenharmony_ci * Date Name Description 81cb0ef41Sopenharmony_ci * 07/23/01 aliu Creation. 91cb0ef41Sopenharmony_ci ********************************************************************** 101cb0ef41Sopenharmony_ci */ 111cb0ef41Sopenharmony_ci#ifndef STRMATCH_H 121cb0ef41Sopenharmony_ci#define STRMATCH_H 131cb0ef41Sopenharmony_ci 141cb0ef41Sopenharmony_ci#include "unicode/utypes.h" 151cb0ef41Sopenharmony_ci 161cb0ef41Sopenharmony_ci#if !UCONFIG_NO_TRANSLITERATION 171cb0ef41Sopenharmony_ci 181cb0ef41Sopenharmony_ci#include "unicode/unistr.h" 191cb0ef41Sopenharmony_ci#include "unicode/unifunct.h" 201cb0ef41Sopenharmony_ci#include "unicode/unimatch.h" 211cb0ef41Sopenharmony_ci#include "unicode/unirepl.h" 221cb0ef41Sopenharmony_ci 231cb0ef41Sopenharmony_ciU_NAMESPACE_BEGIN 241cb0ef41Sopenharmony_ci 251cb0ef41Sopenharmony_ciclass TransliterationRuleData; 261cb0ef41Sopenharmony_ci 271cb0ef41Sopenharmony_ci/** 281cb0ef41Sopenharmony_ci * An object that matches a fixed input string, implementing the 291cb0ef41Sopenharmony_ci * UnicodeMatcher API. This object also implements the 301cb0ef41Sopenharmony_ci * UnicodeReplacer API, allowing it to emit the matched text as 311cb0ef41Sopenharmony_ci * output. Since the match text may contain flexible match elements, 321cb0ef41Sopenharmony_ci * such as UnicodeSets, the emitted text is not the match pattern, but 331cb0ef41Sopenharmony_ci * instead a substring of the actual matched text. Following 341cb0ef41Sopenharmony_ci * convention, the output text is the leftmost match seen up to this 351cb0ef41Sopenharmony_ci * point. 361cb0ef41Sopenharmony_ci * 371cb0ef41Sopenharmony_ci * A StringMatcher may represent a segment, in which case it has a 381cb0ef41Sopenharmony_ci * positive segment number. This affects how the matcher converts 391cb0ef41Sopenharmony_ci * itself to a pattern but does not otherwise affect its function. 401cb0ef41Sopenharmony_ci * 411cb0ef41Sopenharmony_ci * A StringMatcher that is not a segment should not be used as a 421cb0ef41Sopenharmony_ci * UnicodeReplacer. 431cb0ef41Sopenharmony_ci */ 441cb0ef41Sopenharmony_ciclass StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer { 451cb0ef41Sopenharmony_ci 461cb0ef41Sopenharmony_ci public: 471cb0ef41Sopenharmony_ci 481cb0ef41Sopenharmony_ci /** 491cb0ef41Sopenharmony_ci * Construct a matcher that matches the given pattern string. 501cb0ef41Sopenharmony_ci * @param string the pattern to be matched, possibly containing 511cb0ef41Sopenharmony_ci * stand-ins that represent nested UnicodeMatcher objects. 521cb0ef41Sopenharmony_ci * @param start inclusive start index of text to be replaced 531cb0ef41Sopenharmony_ci * @param limit exclusive end index of text to be replaced; 541cb0ef41Sopenharmony_ci * must be greater than or equal to start 551cb0ef41Sopenharmony_ci * @param segmentNum the segment number from 1..n, or 0 if this is 561cb0ef41Sopenharmony_ci * not a segment. 571cb0ef41Sopenharmony_ci * @param data context object mapping stand-ins to 581cb0ef41Sopenharmony_ci * UnicodeMatcher objects. 591cb0ef41Sopenharmony_ci */ 601cb0ef41Sopenharmony_ci StringMatcher(const UnicodeString& string, 611cb0ef41Sopenharmony_ci int32_t start, 621cb0ef41Sopenharmony_ci int32_t limit, 631cb0ef41Sopenharmony_ci int32_t segmentNum, 641cb0ef41Sopenharmony_ci const TransliterationRuleData& data); 651cb0ef41Sopenharmony_ci 661cb0ef41Sopenharmony_ci /** 671cb0ef41Sopenharmony_ci * Copy constructor 681cb0ef41Sopenharmony_ci * @param o the object to be copied. 691cb0ef41Sopenharmony_ci */ 701cb0ef41Sopenharmony_ci StringMatcher(const StringMatcher& o); 711cb0ef41Sopenharmony_ci 721cb0ef41Sopenharmony_ci /** 731cb0ef41Sopenharmony_ci * Destructor 741cb0ef41Sopenharmony_ci */ 751cb0ef41Sopenharmony_ci virtual ~StringMatcher(); 761cb0ef41Sopenharmony_ci 771cb0ef41Sopenharmony_ci /** 781cb0ef41Sopenharmony_ci * Implement UnicodeFunctor 791cb0ef41Sopenharmony_ci * @return a copy of the object. 801cb0ef41Sopenharmony_ci */ 811cb0ef41Sopenharmony_ci virtual StringMatcher* clone() const override; 821cb0ef41Sopenharmony_ci 831cb0ef41Sopenharmony_ci /** 841cb0ef41Sopenharmony_ci * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer 851cb0ef41Sopenharmony_ci * and return the pointer. 861cb0ef41Sopenharmony_ci * @return the UnicodeMatcher point. 871cb0ef41Sopenharmony_ci */ 881cb0ef41Sopenharmony_ci virtual UnicodeMatcher* toMatcher() const override; 891cb0ef41Sopenharmony_ci 901cb0ef41Sopenharmony_ci /** 911cb0ef41Sopenharmony_ci * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer 921cb0ef41Sopenharmony_ci * and return the pointer. 931cb0ef41Sopenharmony_ci * @return the UnicodeReplacer pointer. 941cb0ef41Sopenharmony_ci */ 951cb0ef41Sopenharmony_ci virtual UnicodeReplacer* toReplacer() const override; 961cb0ef41Sopenharmony_ci 971cb0ef41Sopenharmony_ci /** 981cb0ef41Sopenharmony_ci * Implement UnicodeMatcher 991cb0ef41Sopenharmony_ci * @param text the text to be matched 1001cb0ef41Sopenharmony_ci * @param offset on input, the index into text at which to begin 1011cb0ef41Sopenharmony_ci * matching. On output, the limit of the matched text. The 1021cb0ef41Sopenharmony_ci * number of matched characters is the output value of offset 1031cb0ef41Sopenharmony_ci * minus the input value. Offset should always point to the 1041cb0ef41Sopenharmony_ci * HIGH SURROGATE (leading code unit) of a pair of surrogates, 1051cb0ef41Sopenharmony_ci * both on entry and upon return. 1061cb0ef41Sopenharmony_ci * @param limit the limit index of text to be matched. Greater 1071cb0ef41Sopenharmony_ci * than offset for a forward direction match, less than offset for 1081cb0ef41Sopenharmony_ci * a backward direction match. The last character to be 1091cb0ef41Sopenharmony_ci * considered for matching will be text.charAt(limit-1) in the 1101cb0ef41Sopenharmony_ci * forward direction or text.charAt(limit+1) in the backward 1111cb0ef41Sopenharmony_ci * direction. 1121cb0ef41Sopenharmony_ci * @param incremental if true, then assume further characters may 1131cb0ef41Sopenharmony_ci * be inserted at limit and check for partial matching. Otherwise 1141cb0ef41Sopenharmony_ci * assume the text as given is complete. 1151cb0ef41Sopenharmony_ci * @return a match degree value indicating a full match, a partial 1161cb0ef41Sopenharmony_ci * match, or a mismatch. If incremental is false then 1171cb0ef41Sopenharmony_ci * U_PARTIAL_MATCH should never be returned. 1181cb0ef41Sopenharmony_ci */ 1191cb0ef41Sopenharmony_ci virtual UMatchDegree matches(const Replaceable& text, 1201cb0ef41Sopenharmony_ci int32_t& offset, 1211cb0ef41Sopenharmony_ci int32_t limit, 1221cb0ef41Sopenharmony_ci UBool incremental) override; 1231cb0ef41Sopenharmony_ci 1241cb0ef41Sopenharmony_ci /** 1251cb0ef41Sopenharmony_ci * Implement UnicodeMatcher 1261cb0ef41Sopenharmony_ci * @param result Output param to receive the pattern. 1271cb0ef41Sopenharmony_ci * @param escapeUnprintable if True then escape the unprintable characters. 1281cb0ef41Sopenharmony_ci * @return A reference to 'result'. 1291cb0ef41Sopenharmony_ci */ 1301cb0ef41Sopenharmony_ci virtual UnicodeString& toPattern(UnicodeString& result, 1311cb0ef41Sopenharmony_ci UBool escapeUnprintable = false) const override; 1321cb0ef41Sopenharmony_ci 1331cb0ef41Sopenharmony_ci /** 1341cb0ef41Sopenharmony_ci * Implement UnicodeMatcher 1351cb0ef41Sopenharmony_ci * Returns true if this matcher will match a character c, where c 1361cb0ef41Sopenharmony_ci * & 0xFF == v, at offset, in the forward direction (with limit > 1371cb0ef41Sopenharmony_ci * offset). This is used by <tt>RuleBasedTransliterator</tt> for 1381cb0ef41Sopenharmony_ci * indexing. 1391cb0ef41Sopenharmony_ci * @param v the given value 1401cb0ef41Sopenharmony_ci * @return true if this matcher will match a character c, 1411cb0ef41Sopenharmony_ci * where c & 0xFF == v 1421cb0ef41Sopenharmony_ci */ 1431cb0ef41Sopenharmony_ci virtual UBool matchesIndexValue(uint8_t v) const override; 1441cb0ef41Sopenharmony_ci 1451cb0ef41Sopenharmony_ci /** 1461cb0ef41Sopenharmony_ci * Implement UnicodeMatcher 1471cb0ef41Sopenharmony_ci */ 1481cb0ef41Sopenharmony_ci virtual void addMatchSetTo(UnicodeSet& toUnionTo) const override; 1491cb0ef41Sopenharmony_ci 1501cb0ef41Sopenharmony_ci /** 1511cb0ef41Sopenharmony_ci * Implement UnicodeFunctor 1521cb0ef41Sopenharmony_ci */ 1531cb0ef41Sopenharmony_ci virtual void setData(const TransliterationRuleData*) override; 1541cb0ef41Sopenharmony_ci 1551cb0ef41Sopenharmony_ci /** 1561cb0ef41Sopenharmony_ci * Replace characters in 'text' from 'start' to 'limit' with the 1571cb0ef41Sopenharmony_ci * output text of this object. Update the 'cursor' parameter to 1581cb0ef41Sopenharmony_ci * give the cursor position and return the length of the 1591cb0ef41Sopenharmony_ci * replacement text. 1601cb0ef41Sopenharmony_ci * 1611cb0ef41Sopenharmony_ci * @param text the text to be matched 1621cb0ef41Sopenharmony_ci * @param start inclusive start index of text to be replaced 1631cb0ef41Sopenharmony_ci * @param limit exclusive end index of text to be replaced; 1641cb0ef41Sopenharmony_ci * must be greater than or equal to start 1651cb0ef41Sopenharmony_ci * @param cursor output parameter for the cursor position. 1661cb0ef41Sopenharmony_ci * Not all replacer objects will update this, but in a complete 1671cb0ef41Sopenharmony_ci * tree of replacer objects, representing the entire output side 1681cb0ef41Sopenharmony_ci * of a transliteration rule, at least one must update it. 1691cb0ef41Sopenharmony_ci * @return the number of 16-bit code units in the text replacing 1701cb0ef41Sopenharmony_ci * the characters at offsets start..(limit-1) in text 1711cb0ef41Sopenharmony_ci */ 1721cb0ef41Sopenharmony_ci virtual int32_t replace(Replaceable& text, 1731cb0ef41Sopenharmony_ci int32_t start, 1741cb0ef41Sopenharmony_ci int32_t limit, 1751cb0ef41Sopenharmony_ci int32_t& cursor) override; 1761cb0ef41Sopenharmony_ci 1771cb0ef41Sopenharmony_ci /** 1781cb0ef41Sopenharmony_ci * Returns a string representation of this replacer. If the 1791cb0ef41Sopenharmony_ci * result of calling this function is passed to the appropriate 1801cb0ef41Sopenharmony_ci * parser, typically TransliteratorParser, it will produce another 1811cb0ef41Sopenharmony_ci * replacer that is equal to this one. 1821cb0ef41Sopenharmony_ci * @param result the string to receive the pattern. Previous 1831cb0ef41Sopenharmony_ci * contents will be deleted. 1841cb0ef41Sopenharmony_ci * @param escapeUnprintable if true then convert unprintable 1851cb0ef41Sopenharmony_ci * character to their hex escape representations, \\uxxxx or 1861cb0ef41Sopenharmony_ci * \\Uxxxxxxxx. Unprintable characters are defined by 1871cb0ef41Sopenharmony_ci * Utility.isUnprintable(). 1881cb0ef41Sopenharmony_ci * @return a reference to 'result'. 1891cb0ef41Sopenharmony_ci */ 1901cb0ef41Sopenharmony_ci virtual UnicodeString& toReplacerPattern(UnicodeString& result, 1911cb0ef41Sopenharmony_ci UBool escapeUnprintable) const override; 1921cb0ef41Sopenharmony_ci 1931cb0ef41Sopenharmony_ci /** 1941cb0ef41Sopenharmony_ci * Remove any match data. This must be called before performing a 1951cb0ef41Sopenharmony_ci * set of matches with this segment. 1961cb0ef41Sopenharmony_ci */ 1971cb0ef41Sopenharmony_ci void resetMatch(); 1981cb0ef41Sopenharmony_ci 1991cb0ef41Sopenharmony_ci /** 2001cb0ef41Sopenharmony_ci * ICU "poor man's RTTI", returns a UClassID for the actual class. 2011cb0ef41Sopenharmony_ci */ 2021cb0ef41Sopenharmony_ci virtual UClassID getDynamicClassID() const override; 2031cb0ef41Sopenharmony_ci 2041cb0ef41Sopenharmony_ci /** 2051cb0ef41Sopenharmony_ci * ICU "poor man's RTTI", returns a UClassID for this class. 2061cb0ef41Sopenharmony_ci */ 2071cb0ef41Sopenharmony_ci static UClassID U_EXPORT2 getStaticClassID(); 2081cb0ef41Sopenharmony_ci 2091cb0ef41Sopenharmony_ci /** 2101cb0ef41Sopenharmony_ci * Union the set of all characters that may output by this object 2111cb0ef41Sopenharmony_ci * into the given set. 2121cb0ef41Sopenharmony_ci * @param toUnionTo the set into which to union the output characters 2131cb0ef41Sopenharmony_ci */ 2141cb0ef41Sopenharmony_ci virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const override; 2151cb0ef41Sopenharmony_ci 2161cb0ef41Sopenharmony_ci private: 2171cb0ef41Sopenharmony_ci 2181cb0ef41Sopenharmony_ci /** 2191cb0ef41Sopenharmony_ci * The text to be matched. 2201cb0ef41Sopenharmony_ci */ 2211cb0ef41Sopenharmony_ci UnicodeString pattern; 2221cb0ef41Sopenharmony_ci 2231cb0ef41Sopenharmony_ci /** 2241cb0ef41Sopenharmony_ci * Context object that maps stand-ins to matcher and replacer 2251cb0ef41Sopenharmony_ci * objects. 2261cb0ef41Sopenharmony_ci */ 2271cb0ef41Sopenharmony_ci const TransliterationRuleData* data; 2281cb0ef41Sopenharmony_ci 2291cb0ef41Sopenharmony_ci /** 2301cb0ef41Sopenharmony_ci * The segment number, 1-based, or 0 if not a segment. 2311cb0ef41Sopenharmony_ci */ 2321cb0ef41Sopenharmony_ci int32_t segmentNumber; 2331cb0ef41Sopenharmony_ci 2341cb0ef41Sopenharmony_ci /** 2351cb0ef41Sopenharmony_ci * Start offset, in the match text, of the <em>rightmost</em> 2361cb0ef41Sopenharmony_ci * match. 2371cb0ef41Sopenharmony_ci */ 2381cb0ef41Sopenharmony_ci int32_t matchStart; 2391cb0ef41Sopenharmony_ci 2401cb0ef41Sopenharmony_ci /** 2411cb0ef41Sopenharmony_ci * Limit offset, in the match text, of the <em>rightmost</em> 2421cb0ef41Sopenharmony_ci * match. 2431cb0ef41Sopenharmony_ci */ 2441cb0ef41Sopenharmony_ci int32_t matchLimit; 2451cb0ef41Sopenharmony_ci 2461cb0ef41Sopenharmony_ci}; 2471cb0ef41Sopenharmony_ci 2481cb0ef41Sopenharmony_ciU_NAMESPACE_END 2491cb0ef41Sopenharmony_ci 2501cb0ef41Sopenharmony_ci#endif /* #if !UCONFIG_NO_TRANSLITERATION */ 2511cb0ef41Sopenharmony_ci 2521cb0ef41Sopenharmony_ci#endif 253