// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2001-2011 IBM and others. All rights reserved.
**********************************************************************
*   Date        Name        Description
*  03/22/2000   helena      Creation.
**********************************************************************
*/

12#ifndef SEARCH_H
13#define SEARCH_H
14
15#include "unicode/utypes.h"
16
17#if U_SHOW_CPLUSPLUS_API
18
19/**
20 * \file
21 * \brief C++ API: SearchIterator object.
22 */
23
24#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
25
26#include "unicode/uobject.h"
27#include "unicode/unistr.h"
28#include "unicode/chariter.h"
29#include "unicode/brkiter.h"
30#include "unicode/usearch.h"
31
32/**
33* @stable ICU 2.0
34*/
35struct USearch;
36/**
37* @stable ICU 2.0
38*/
39typedef struct USearch USearch;
40
41U_NAMESPACE_BEGIN
42
43/**
44 *
45 * <tt>SearchIterator</tt> is an abstract base class that provides
46 * methods to search for a pattern within a text string. Instances of
47 * <tt>SearchIterator</tt> maintain a current position and scans over the
48 * target text, returning the indices the pattern is matched and the length
49 * of each match.
50 * <p>
51 * <tt>SearchIterator</tt> defines a protocol for text searching.
52 * Subclasses provide concrete implementations of various search algorithms.
53 * For example, <tt>StringSearch</tt> implements language-sensitive pattern
54 * matching based on the comparison rules defined in a
55 * <tt>RuleBasedCollator</tt> object.
56 * <p>
57 * Other options for searching includes using a BreakIterator to restrict
58 * the points at which matches are detected.
59 * <p>
60 * <tt>SearchIterator</tt> provides an API that is similar to that of
61 * other text iteration classes such as <tt>BreakIterator</tt>. Using
62 * this class, it is easy to scan through text looking for all occurrences of
63 * a given pattern. The following example uses a <tt>StringSearch</tt>
64 * object to find all instances of "fox" in the target string. Any other
65 * subclass of <tt>SearchIterator</tt> can be used in an identical
66 * manner.
67 * <pre><code>
68 * UnicodeString target("The quick brown fox jumped over the lazy fox");
69 * UnicodeString pattern("fox");
70 *
71 * SearchIterator *iter  = new StringSearch(pattern, target);
72 * UErrorCode      error = U_ZERO_ERROR;
73 * for (int pos = iter->first(error); pos != USEARCH_DONE;
74 *                               pos = iter->next(error)) {
75 *     printf("Found match at %d pos, length is %d\n", pos, iter.getMatchedLength());
76 * }
77 * </code></pre>
78 *
79 * @see StringSearch
80 * @see RuleBasedCollator
81 */
82class U_I18N_API SearchIterator : public UObject {
83
84public:
85
86    // public constructors and destructors -------------------------------
87
88    /**
89    * Copy constructor that creates a SearchIterator instance with the same
90    * behavior, and iterating over the same text.
91    * @param other the SearchIterator instance to be copied.
92    * @stable ICU 2.0
93    */
94    SearchIterator(const SearchIterator &other);
95
96    /**
97     * Destructor. Cleans up the search iterator data struct.
98     * @stable ICU 2.0
99     */
100    virtual ~SearchIterator();
101
102    // public get and set methods ----------------------------------------
103
104    /**
105     * Sets the index to point to the given position, and clears any state
106     * that's affected.
107     * <p>
108     * This method takes the argument index and sets the position in the text
109     * string accordingly without checking if the index is pointing to a
110     * valid starting point to begin searching.
111     * @param position within the text to be set. If position is less
112     *             than or greater than the text range for searching,
113     *          an U_INDEX_OUTOFBOUNDS_ERROR will be returned
114     * @param status for errors if it occurs
115     * @stable ICU 2.0
116     */
117    virtual void setOffset(int32_t position, UErrorCode &status) = 0;
118
119    /**
120     * Return the current index in the text being searched.
121     * If the iteration has gone past the end of the text
122     * (or past the beginning for a backwards search), USEARCH_DONE
123     * is returned.
124     * @return current index in the text being searched.
125     * @stable ICU 2.0
126     */
127    virtual int32_t getOffset(void) const = 0;
128
129    /**
130    * Sets the text searching attributes located in the enum
131    * USearchAttribute with values from the enum USearchAttributeValue.
132    * USEARCH_DEFAULT can be used for all attributes for resetting.
133    * @param attribute text attribute (enum USearchAttribute) to be set
134    * @param value text attribute value
135    * @param status for errors if it occurs
136    * @stable ICU 2.0
137    */
138    void setAttribute(USearchAttribute       attribute,
139                      USearchAttributeValue  value,
140                      UErrorCode            &status);
141
142    /**
143    * Gets the text searching attributes
144    * @param attribute text attribute (enum USearchAttribute) to be retrieve
145    * @return text attribute value
146    * @stable ICU 2.0
147    */
148    USearchAttributeValue getAttribute(USearchAttribute  attribute) const;
149
150    /**
151    * Returns the index to the match in the text string that was searched.
152    * This call returns a valid result only after a successful call to
153    * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
154    * Just after construction, or after a searching method returns
155    * <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>.
156    * <p>
157    * Use getMatchedLength to get the matched string length.
158    * @return index of a substring within the text string that is being
159    *         searched.
160    * @see #first
161    * @see #next
162    * @see #previous
163    * @see #last
164    * @stable ICU 2.0
165    */
166    int32_t getMatchedStart(void) const;
167
168    /**
169     * Returns the length of text in the string which matches the search
170     * pattern. This call returns a valid result only after a successful call
171     * to <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
172     * Just after construction, or after a searching method returns
173     * <tt>USEARCH_DONE</tt>, this method will return 0.
174     * @return The length of the match in the target text, or 0 if there
175     *         is no match currently.
176     * @see #first
177     * @see #next
178     * @see #previous
179     * @see #last
180     * @stable ICU 2.0
181     */
182    int32_t getMatchedLength(void) const;
183
184    /**
185     * Returns the text that was matched by the most recent call to
186     * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
187     * If the iterator is not pointing at a valid match (e.g. just after
188     * construction or after <tt>USEARCH_DONE</tt> has been returned,
189     * returns an empty string.
190     * @param result stores the matched string or an empty string if a match
191     *        is not found.
192     * @see #first
193     * @see #next
194     * @see #previous
195     * @see #last
196     * @stable ICU 2.0
197     */
198    void getMatchedText(UnicodeString &result) const;
199
200    /**
201     * Set the BreakIterator that will be used to restrict the points
202     * at which matches are detected. The user is responsible for deleting
203     * the breakiterator.
204     * @param breakiter A BreakIterator that will be used to restrict the
205     *                points at which matches are detected. If a match is
206     *                found, but the match's start or end index is not a
207     *                boundary as determined by the <tt>BreakIterator</tt>,
208     *                the match will be rejected and another will be searched
209     *                for. If this parameter is <tt>nullptr</tt>, no break
210     *                detection is attempted.
211     * @param status for errors if it occurs
212     * @see BreakIterator
213     * @stable ICU 2.0
214     */
215    void setBreakIterator(BreakIterator *breakiter, UErrorCode &status);
216
217    /**
218     * Returns the BreakIterator that is used to restrict the points at
219     * which matches are detected.  This will be the same object that was
220     * passed to the constructor or to <tt>setBreakIterator</tt>.
221     * Note that <tt>nullptr</tt> is a legal value; it means that break
222     * detection should not be attempted.
223     * @return BreakIterator used to restrict matchings.
224     * @see #setBreakIterator
225     * @stable ICU 2.0
226     */
227    const BreakIterator * getBreakIterator(void) const;
228
229    /**
230     * Set the string text to be searched. Text iteration will hence begin at
231     * the start of the text string. This method is useful if you want to
232     * re-use an iterator to search for the same pattern within a different
233     * body of text. The user is responsible for deleting the text.
234     * @param text string to be searched.
235     * @param status for errors. If the text length is 0,
236     *        an U_ILLEGAL_ARGUMENT_ERROR is returned.
237     * @stable ICU 2.0
238     */
239    virtual void setText(const UnicodeString &text, UErrorCode &status);
240
241    /**
242     * Set the string text to be searched. Text iteration will hence begin at
243     * the start of the text string. This method is useful if you want to
244     * re-use an iterator to search for the same pattern within a different
245     * body of text.
246     * <p>
247     * Note: No parsing of the text within the <tt>CharacterIterator</tt>
248     * will be done during searching for this version. The block of text
249     * in <tt>CharacterIterator</tt> will be used as it is.
250     * The user is responsible for deleting the text.
251     * @param text string iterator to be searched.
252     * @param status for errors if any. If the text length is 0 then an
253     *        U_ILLEGAL_ARGUMENT_ERROR is returned.
254     * @stable ICU 2.0
255     */
256    virtual void setText(CharacterIterator &text, UErrorCode &status);
257
258    /**
259     * Return the string text to be searched.
260     * @return text string to be searched.
261     * @stable ICU 2.0
262     */
263    const UnicodeString & getText(void) const;
264
265    // operator overloading ----------------------------------------------
266
267    /**
268     * Equality operator.
269     * @param that SearchIterator instance to be compared.
270     * @return true if both BreakIterators are of the same class, have the
271     *         same behavior, terates over the same text and have the same
272     *         attributes. false otherwise.
273     * @stable ICU 2.0
274     */
275    virtual bool operator==(const SearchIterator &that) const;
276
277    /**
278     * Not-equal operator.
279     * @param that SearchIterator instance to be compared.
280     * @return false if operator== returns true, and vice versa.
281     * @stable ICU 2.0
282     */
283    bool operator!=(const SearchIterator &that) const;
284
285    // public methods ----------------------------------------------------
286
287    /**
288     * Returns a copy of SearchIterator with the same behavior, and
289     * iterating over the same text, as this one. Note that all data will be
290     * replicated, except for the text string to be searched.
291     * @return cloned object
292     * @stable ICU 2.0
293     */
294    virtual SearchIterator* safeClone(void) const = 0;
295
296    /**
297     * Returns the first index at which the string text matches the search
298     * pattern. The iterator is adjusted so that its current index (as
299     * returned by <tt>getOffset</tt>) is the match position if one
300     * was found.
301     * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
302     * the iterator will be adjusted to the index USEARCH_DONE
303     * @param  status for errors if it occurs
304     * @return The character index of the first match, or
305     *         <tt>USEARCH_DONE</tt> if there are no matches.
306     * @see #getOffset
307     * @stable ICU 2.0
308     */
309    int32_t first(UErrorCode &status);
310
311    /**
312     * Returns the first index equal or greater than <tt>position</tt> at which the
313     * string text matches the search pattern. The iterator is adjusted so
314     * that its current index (as returned by <tt>getOffset</tt>) is the
315     * match position if one was found.
316     * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and the
317     * iterator will be adjusted to the index <tt>USEARCH_DONE</tt>.
318     * @param  position where search if to start from. If position is less
319     *             than or greater than the text range for searching,
320     *          an U_INDEX_OUTOFBOUNDS_ERROR will be returned
321     * @param  status for errors if it occurs
322     * @return The character index of the first match following
323     *         <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are no
324     *         matches.
325     * @see #getOffset
326     * @stable ICU 2.0
327     */
328    int32_t following(int32_t position, UErrorCode &status);
329
330    /**
331     * Returns the last index in the target text at which it matches the
332     * search pattern. The iterator is adjusted so that its current index
333     * (as returned by <tt>getOffset</tt>) is the match position if one was
334     * found.
335     * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
336     * the iterator will be adjusted to the index USEARCH_DONE.
337     * @param  status for errors if it occurs
338     * @return The index of the first match, or <tt>USEARCH_DONE</tt> if
339     *         there are no matches.
340     * @see #getOffset
341     * @stable ICU 2.0
342     */
343    int32_t last(UErrorCode &status);
344
345    /**
346     * Returns the first index less than <tt>position</tt> at which the string
347     * text matches the search pattern. The iterator is adjusted so that its
348     * current index (as returned by <tt>getOffset</tt>) is the match
349     * position if one was found. If a match is not found,
350     * <tt>USEARCH_DONE</tt> will be returned and the iterator will be
351     * adjusted to the index USEARCH_DONE
352     * <p>
353     * When <tt>USEARCH_OVERLAP</tt> option is off, the last index of the
354     * result match is always less than <tt>position</tt>.
355     * When <tt>USERARCH_OVERLAP</tt> is on, the result match may span across
356     * <tt>position</tt>.
357     *
358     * @param  position where search is to start from. If position is less
359     *             than or greater than the text range for searching,
360     *          an U_INDEX_OUTOFBOUNDS_ERROR will be returned
361     * @param  status for errors if it occurs
362     * @return The character index of the first match preceding
363     *         <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are
364     *         no matches.
365     * @see #getOffset
366     * @stable ICU 2.0
367     */
368    int32_t preceding(int32_t position, UErrorCode &status);
369
370    /**
371     * Returns the index of the next point at which the text matches the
372     * search pattern, starting from the current position
373     * The iterator is adjusted so that its current index (as returned by
374     * <tt>getOffset</tt>) is the match position if one was found.
375     * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
376     * the iterator will be adjusted to a position after the end of the text
377     * string.
378     * @param  status for errors if it occurs
379     * @return The index of the next match after the current position,
380     *          or <tt>USEARCH_DONE</tt> if there are no more matches.
381     * @see #getOffset
382     * @stable ICU 2.0
383     */
384     int32_t next(UErrorCode &status);
385
386    /**
387     * Returns the index of the previous point at which the string text
388     * matches the search pattern, starting at the current position.
389     * The iterator is adjusted so that its current index (as returned by
390     * <tt>getOffset</tt>) is the match position if one was found.
391     * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
392     * the iterator will be adjusted to the index USEARCH_DONE
393     * @param  status for errors if it occurs
394     * @return The index of the previous match before the current position,
395     *          or <tt>USEARCH_DONE</tt> if there are no more matches.
396     * @see #getOffset
397     * @stable ICU 2.0
398     */
399    int32_t previous(UErrorCode &status);
400
401    /**
402    * Resets the iteration.
403    * Search will begin at the start of the text string if a forward
404    * iteration is initiated before a backwards iteration. Otherwise if a
405    * backwards iteration is initiated before a forwards iteration, the
406    * search will begin at the end of the text string.
407    * @stable ICU 2.0
408    */
409    virtual void reset();
410
411protected:
412    // protected data members ---------------------------------------------
413
414    /**
415    * C search data struct
416    * @stable ICU 2.0
417    */
418    USearch *m_search_;
419
420    /**
421    * Break iterator.
422    * Currently the C++ breakiterator does not have getRules etc to reproduce
423    * another in C. Hence we keep the original around and do the verification
424    * at the end of the match. The user is responsible for deleting this
425    * break iterator.
426    * @stable ICU 2.0
427    */
428    BreakIterator *m_breakiterator_;
429
430    /**
431    * Unicode string version of the search text
432    * @stable ICU 2.0
433    */
434    UnicodeString  m_text_;
435
436    // protected constructors and destructors -----------------------------
437
438    /**
439    * Default constructor.
440    * Initializes data to the default values.
441    * @stable ICU 2.0
442    */
443    SearchIterator();
444
445    /**
446     * Constructor for use by subclasses.
447     * @param text The target text to be searched.
448     * @param breakiter A {@link BreakIterator} that is used to restrict the
449     *                points at which matches are detected. If
450     *                <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
451     *                match, but the match's start or end index is not a
452     *                boundary as determined by the <tt>BreakIterator</tt>,
453     *                the match is rejected and <tt>handleNext</tt> or
454     *                <tt>handlePrev</tt> is called again. If this parameter
455     *                is <tt>nullptr</tt>, no break detection is attempted.
456     * @see #handleNext
457     * @see #handlePrev
458     * @stable ICU 2.0
459     */
460    SearchIterator(const UnicodeString &text,
461                         BreakIterator *breakiter = nullptr);
462
463    /**
464     * Constructor for use by subclasses.
465     * <p>
466     * Note: No parsing of the text within the <tt>CharacterIterator</tt>
467     * will be done during searching for this version. The block of text
468     * in <tt>CharacterIterator</tt> will be used as it is.
469     * @param text The target text to be searched.
470     * @param breakiter A {@link BreakIterator} that is used to restrict the
471     *                points at which matches are detected. If
472     *                <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
473     *                match, but the match's start or end index is not a
474     *                boundary as determined by the <tt>BreakIterator</tt>,
475     *                the match is rejected and <tt>handleNext</tt> or
476     *                <tt>handlePrev</tt> is called again. If this parameter
477     *                is <tt>nullptr</tt>, no break detection is attempted.
478     * @see #handleNext
479     * @see #handlePrev
480     * @stable ICU 2.0
481     */
482    SearchIterator(CharacterIterator &text, BreakIterator *breakiter = nullptr);
483
484    // protected methods --------------------------------------------------
485
486    /**
487     * Assignment operator. Sets this iterator to have the same behavior,
488     * and iterate over the same text, as the one passed in.
489     * @param that instance to be copied.
490     * @stable ICU 2.0
491     */
492    SearchIterator & operator=(const SearchIterator &that);
493
494    /**
495     * Abstract method which subclasses override to provide the mechanism
496     * for finding the next match in the target text. This allows different
497     * subclasses to provide different search algorithms.
498     * <p>
499     * If a match is found, the implementation should return the index at
500     * which the match starts and should call
501     * <tt>setMatchLength</tt> with the number of characters
502     * in the target text that make up the match. If no match is found, the
503     * method should return USEARCH_DONE.
504     * <p>
505     * @param position The index in the target text at which the search
506     *                 should start.
507     * @param status for error codes if it occurs.
508     * @return index at which the match starts, else if match is not found
509     *         USEARCH_DONE is returned
510     * @see #setMatchLength
511     * @stable ICU 2.0
512     */
513    virtual int32_t handleNext(int32_t position, UErrorCode &status)
514                                                                         = 0;
515
516    /**
517     * Abstract method which subclasses override to provide the mechanism for
518     * finding the previous match in the target text. This allows different
519     * subclasses to provide different search algorithms.
520     * <p>
521     * If a match is found, the implementation should return the index at
522     * which the match starts and should call
523     * <tt>setMatchLength</tt> with the number of characters
524     * in the target text that make up the match. If no match is found, the
525     * method should return USEARCH_DONE.
526     * <p>
527     * @param position The index in the target text at which the search
528     *                 should start.
529     * @param status for error codes if it occurs.
530     * @return index at which the match starts, else if match is not found
531     *         USEARCH_DONE is returned
532     * @see #setMatchLength
533     * @stable ICU 2.0
534     */
535     virtual int32_t handlePrev(int32_t position, UErrorCode &status)
536                                                                         = 0;
537
538    /**
539     * Sets the length of the currently matched string in the text string to
540     * be searched.
541     * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
542     * methods should call this when they find a match in the target text.
543     * @param length length of the matched text.
544     * @see #handleNext
545     * @see #handlePrev
546     * @stable ICU 2.0
547     */
548    virtual void setMatchLength(int32_t length);
549
550    /**
551     * Sets the offset of the currently matched string in the text string to
552     * be searched.
553     * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
554     * methods should call this when they find a match in the target text.
555     * @param position start offset of the matched text.
556     * @see #handleNext
557     * @see #handlePrev
558     * @stable ICU 2.0
559     */
560    virtual void setMatchStart(int32_t position);
561
562    /**
563    * sets match not found
564    * @stable ICU 2.0
565    */
566    void setMatchNotFound();
567};
568
569inline bool SearchIterator::operator!=(const SearchIterator &that) const
570{
571   return !operator==(that);
572}
573U_NAMESPACE_END
574
575#endif /* #if !UCONFIG_NO_COLLATION */
576
577#endif /* U_SHOW_CPLUSPLUS_API */
578
579#endif
580
581