1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4********************************************************************
5*
6*   Copyright (C) 1997-2011, International Business Machines
7*   Corporation and others.  All Rights Reserved.
8*
9********************************************************************
10*/
11
12#ifndef CHARITER_H
13#define CHARITER_H
14
15#include "unicode/utypes.h"
16
17#if U_SHOW_CPLUSPLUS_API
18
19#include "unicode/uobject.h"
20#include "unicode/unistr.h"
21/**
22 * \file
23 * \brief C++ API: Character Iterator
24 */
25
26U_NAMESPACE_BEGIN
27/**
28 * Abstract class that defines an API for forward-only iteration
29 * on text objects.
30 * This is a minimal interface for iteration without random access
31 * or backwards iteration. It is especially useful for wrapping
32 * streams with converters into an object for collation or
33 * normalization.
34 *
35 * <p>Characters can be accessed in two ways: as code units or as
36 * code points.
37 * Unicode code points are 21-bit integers and are the scalar values
38 * of Unicode characters. ICU uses the type UChar32 for them.
39 * Unicode code units are the storage units of a given
40 * Unicode/UCS Transformation Format (a character encoding scheme).
41 * With UTF-16, all code points can be represented with either one
42 * or two code units ("surrogates").
43 * String storage is typically based on code units, while properties
44 * of characters are typically determined using code point values.
45 * Some processes may be designed to work with sequences of code units,
46 * or it may be known that all characters that are important to an
47 * algorithm can be represented with single code units.
48 * Other processes will need to use the code point access functions.</p>
49 *
50 * <p>ForwardCharacterIterator provides nextPostInc() to access
51 * a code unit and advance an internal position into the text object,
52 * similar to a <code>return text[position++]</code>.<br>
53 * It provides next32PostInc() to access a code point and advance an internal
54 * position.</p>
55 *
56 * <p>next32PostInc() assumes that the current position is that of
57 * the beginning of a code point, i.e., of its first code unit.
58 * After next32PostInc(), this will be true again.
59 * In general, access to code units and code points in the same
60 * iteration loop should not be mixed. In UTF-16, if the current position
61 * is on a second code unit (Low Surrogate), then only that code unit
62 * is returned even by next32PostInc().</p>
63 *
64 * <p>For iteration with either function, there are two ways to
65 * check for the end of the iteration. When there are no more
66 * characters in the text object:
67 * <ul>
68 * <li>The hasNext() function returns false.</li>
69 * <li>nextPostInc() and next32PostInc() return DONE
70 *     when one attempts to read beyond the end of the text object.</li>
71 * </ul>
72 *
73 * Example:
74 * \code
75 * void function1(ForwardCharacterIterator &it) {
76 *     UChar32 c;
77 *     while(it.hasNext()) {
78 *         c=it.next32PostInc();
79 *         // use c
80 *     }
81 * }
82 *
 * void function2(ForwardCharacterIterator &it) {
84 *     char16_t c;
85 *     while((c=it.nextPostInc())!=ForwardCharacterIterator::DONE) {
86 *         // use c
87 *      }
88 *  }
89 * \endcode
90 * </p>
91 *
92 * @stable ICU 2.0
93 */
94class U_COMMON_API ForwardCharacterIterator : public UObject {
95public:
96    /**
97     * Value returned by most of ForwardCharacterIterator's functions
98     * when the iterator has reached the limits of its iteration.
99     * @stable ICU 2.0
100     */
101    enum { DONE = 0xffff };
102
103    /**
104     * Destructor.
105     * @stable ICU 2.0
106     */
107    virtual ~ForwardCharacterIterator();
108
109    /**
110     * Returns true when both iterators refer to the same
111     * character in the same character-storage object.
112     * @param that The ForwardCharacterIterator to be compared for equality
113     * @return true when both iterators refer to the same
114     * character in the same character-storage object
115     * @stable ICU 2.0
116     */
117    virtual bool operator==(const ForwardCharacterIterator& that) const = 0;
118
119    /**
120     * Returns true when the iterators refer to different
121     * text-storage objects, or to different characters in the
122     * same text-storage object.
123     * @param that The ForwardCharacterIterator to be compared for inequality
124     * @return true when the iterators refer to different
125     * text-storage objects, or to different characters in the
126     * same text-storage object
127     * @stable ICU 2.0
128     */
129    inline bool operator!=(const ForwardCharacterIterator& that) const;
130
131    /**
132     * Generates a hash code for this iterator.
133     * @return the hash code.
134     * @stable ICU 2.0
135     */
136    virtual int32_t hashCode(void) const = 0;
137
138    /**
139     * Returns a UClassID for this ForwardCharacterIterator ("poor man's
140     * RTTI").<P> Despite the fact that this function is public,
141     * DO NOT CONSIDER IT PART OF CHARACTERITERATOR'S API!
142     * @return a UClassID for this ForwardCharacterIterator
143     * @stable ICU 2.0
144     */
145    virtual UClassID getDynamicClassID(void) const override = 0;
146
147    /**
148     * Gets the current code unit for returning and advances to the next code unit
149     * in the iteration range
150     * (toward endIndex()).  If there are
151     * no more code units to return, returns DONE.
152     * @return the current code unit.
153     * @stable ICU 2.0
154     */
155    virtual char16_t         nextPostInc(void) = 0;
156
157    /**
158     * Gets the current code point for returning and advances to the next code point
159     * in the iteration range
160     * (toward endIndex()).  If there are
161     * no more code points to return, returns DONE.
162     * @return the current code point.
163     * @stable ICU 2.0
164     */
165    virtual UChar32       next32PostInc(void) = 0;
166
167    /**
168     * Returns false if there are no more code units or code points
169     * at or after the current position in the iteration range.
170     * This is used with nextPostInc() or next32PostInc() in forward
171     * iteration.
172     * @returns false if there are no more code units or code points
173     * at or after the current position in the iteration range.
174     * @stable ICU 2.0
175     */
176    virtual UBool        hasNext() = 0;
177
178protected:
179    /** Default constructor to be overridden in the implementing class. @stable ICU 2.0*/
180    ForwardCharacterIterator();
181
182    /** Copy constructor to be overridden in the implementing class. @stable ICU 2.0*/
183    ForwardCharacterIterator(const ForwardCharacterIterator &other);
184
185    /**
186     * Assignment operator to be overridden in the implementing class.
187     * @stable ICU 2.0
188     */
189    ForwardCharacterIterator &operator=(const ForwardCharacterIterator&) { return *this; }
190};
191
192/**
193 * Abstract class that defines an API for iteration
194 * on text objects.
195 * This is an interface for forward and backward iteration
196 * and random access into a text object.
197 *
198 * <p>The API provides backward compatibility to the Java and older ICU
199 * CharacterIterator classes but extends them significantly:
200 * <ol>
201 * <li>CharacterIterator is now a subclass of ForwardCharacterIterator.</li>
202 * <li>While the old API functions provided forward iteration with
203 *     "pre-increment" semantics, the new one also provides functions
204 *     with "post-increment" semantics. They are more efficient and should
205 *     be the preferred iterator functions for new implementations.
206 *     The backward iteration always had "pre-decrement" semantics, which
207 *     are efficient.</li>
208 * <li>Just like ForwardCharacterIterator, it provides access to
209 *     both code units and code points. Code point access versions are available
210 *     for the old and the new iteration semantics.</li>
211 * <li>There are new functions for setting and moving the current position
212 *     without returning a character, for efficiency.</li>
213 * </ol>
214 *
215 * See ForwardCharacterIterator for examples for using the new forward iteration
216 * functions. For backward iteration, there is also a hasPrevious() function
217 * that can be used analogously to hasNext().
218 * The old functions work as before and are shown below.</p>
219 *
220 * <p>Examples for some of the new functions:</p>
221 *
222 * Forward iteration with hasNext():
223 * \code
224 * void forward1(CharacterIterator &it) {
225 *     UChar32 c;
226 *     for(it.setToStart(); it.hasNext();) {
227 *         c=it.next32PostInc();
228 *         // use c
229 *     }
230 *  }
231 * \endcode
232 * Forward iteration more similar to loops with the old forward iteration,
233 * showing a way to convert simple for() loops:
234 * \code
235 * void forward2(CharacterIterator &it) {
236 *     char16_t c;
237 *     for(c=it.firstPostInc(); c!=CharacterIterator::DONE; c=it.nextPostInc()) {
238 *          // use c
239 *      }
240 * }
241 * \endcode
242 * Backward iteration with setToEnd() and hasPrevious():
243 * \code
244 *  void backward1(CharacterIterator &it) {
245 *      UChar32 c;
246 *      for(it.setToEnd(); it.hasPrevious();) {
247 *         c=it.previous32();
248 *          // use c
249 *      }
250 *  }
251 * \endcode
252 * Backward iteration with a more traditional for() loop:
253 * \code
254 * void backward2(CharacterIterator &it) {
255 *     char16_t c;
256 *     for(c=it.last(); c!=CharacterIterator::DONE; c=it.previous()) {
257 *         // use c
258 *      }
259 *  }
260 * \endcode
261 *
262 * Example for random access:
263 * \code
264 *  void random(CharacterIterator &it) {
265 *      // set to the third code point from the beginning
266 *      it.move32(3, CharacterIterator::kStart);
267 *      // get a code point from here without moving the position
268 *      UChar32 c=it.current32();
269 *      // get the position
270 *      int32_t pos=it.getIndex();
271 *      // get the previous code unit
272 *      char16_t u=it.previous();
273 *      // move back one more code unit
274 *      it.move(-1, CharacterIterator::kCurrent);
275 *      // set the position back to where it was
276 *      // and read the same code point c and move beyond it
277 *      it.setIndex(pos);
278 *      if(c!=it.next32PostInc()) {
279 *          exit(1); // CharacterIterator inconsistent
280 *      }
281 *  }
282 * \endcode
283 *
284 * <p>Examples, especially for the old API:</p>
285 *
286 * Function processing characters, in this example simple output
287 * <pre>
288 * \code
289 *  void processChar( char16_t c )
290 *  {
291 *      cout << " " << c;
292 *  }
293 * \endcode
294 * </pre>
295 * Traverse the text from start to finish
296 * <pre>
297 * \code
298 *  void traverseForward(CharacterIterator& iter)
299 *  {
 *      for(char16_t c = iter.first(); c != CharacterIterator::DONE; c = iter.next()) {
301 *          processChar(c);
302 *      }
303 *  }
304 * \endcode
305 * </pre>
306 * Traverse the text backwards, from end to start
307 * <pre>
308 * \code
309 *  void traverseBackward(CharacterIterator& iter)
310 *  {
 *      for(char16_t c = iter.last(); c != CharacterIterator::DONE; c = iter.previous()) {
312 *          processChar(c);
313 *      }
314 *  }
315 * \endcode
316 * </pre>
317 * Traverse both forward and backward from a given position in the text.
 * The calls to Unicode::isLetter() and Unicode::isDigit() in this example represent some additional stopping criteria.
319 * <pre>
320 * \code
321 * void traverseOut(CharacterIterator& iter, int32_t pos)
322 * {
323 *      char16_t c;
 *      for (c = iter.setIndex(pos);
 *          c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
 *          c = iter.next()) {}
 *      int32_t end = iter.getIndex();
 *      for (c = iter.setIndex(pos);
 *          c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
 *          c = iter.previous()) {}
331 *      int32_t start = iter.getIndex() + 1;
332 *
333 *      cout << "start: " << start << " end: " << end << endl;
334 *      for (c = iter.setIndex(start); iter.getIndex() < end; c = iter.next() ) {
335 *          processChar(c);
336 *     }
337 *  }
338 * \endcode
339 * </pre>
340 * Creating a StringCharacterIterator and calling the test functions
341 * <pre>
342 * \code
343 *  void CharacterIterator_Example( void )
344 *   {
345 *       cout << endl << "===== CharacterIterator_Example: =====" << endl;
346 *       UnicodeString text("Ein kleiner Satz.");
347 *       StringCharacterIterator iterator(text);
348 *       cout << "----- traverseForward: -----------" << endl;
349 *       traverseForward( iterator );
350 *       cout << endl << endl << "----- traverseBackward: ----------" << endl;
351 *       traverseBackward( iterator );
352 *       cout << endl << endl << "----- traverseOut: ---------------" << endl;
353 *       traverseOut( iterator, 7 );
354 *       cout << endl << endl << "-----" << endl;
355 *   }
356 * \endcode
357 * </pre>
358 *
359 * @stable ICU 2.0
360 */
361class U_COMMON_API CharacterIterator : public ForwardCharacterIterator {
362public:
363    /**
364     * Origin enumeration for the move() and move32() functions.
365     * @stable ICU 2.0
366     */
367    enum EOrigin { kStart, kCurrent, kEnd };
368
369    /**
370     * Destructor.
371     * @stable ICU 2.0
372     */
373    virtual ~CharacterIterator();
374
375    /**
376     * Returns a pointer to a new CharacterIterator of the same
377     * concrete class as this one, and referring to the same
378     * character in the same text-storage object as this one.  The
379     * caller is responsible for deleting the new clone.
380     * @return a pointer to a new CharacterIterator
381     * @stable ICU 2.0
382     */
383    virtual CharacterIterator* clone() const = 0;
384
385    /**
386     * Sets the iterator to refer to the first code unit in its
387     * iteration range, and returns that code unit.
388     * This can be used to begin an iteration with next().
389     * @return the first code unit in its iteration range.
390     * @stable ICU 2.0
391     */
392    virtual char16_t         first(void) = 0;
393
394    /**
395     * Sets the iterator to refer to the first code unit in its
396     * iteration range, returns that code unit, and moves the position
397     * to the second code unit. This is an alternative to setToStart()
398     * for forward iteration with nextPostInc().
399     * @return the first code unit in its iteration range.
400     * @stable ICU 2.0
401     */
402    virtual char16_t         firstPostInc(void);
403
404    /**
405     * Sets the iterator to refer to the first code point in its
406     * iteration range, and returns that code unit,
407     * This can be used to begin an iteration with next32().
408     * Note that an iteration with next32PostInc(), beginning with,
409     * e.g., setToStart() or firstPostInc(), is more efficient.
410     * @return the first code point in its iteration range.
411     * @stable ICU 2.0
412     */
413    virtual UChar32       first32(void) = 0;
414
415    /**
416     * Sets the iterator to refer to the first code point in its
417     * iteration range, returns that code point, and moves the position
418     * to the second code point. This is an alternative to setToStart()
419     * for forward iteration with next32PostInc().
420     * @return the first code point in its iteration range.
421     * @stable ICU 2.0
422     */
423    virtual UChar32       first32PostInc(void);
424
425    /**
426     * Sets the iterator to refer to the first code unit or code point in its
427     * iteration range. This can be used to begin a forward
428     * iteration with nextPostInc() or next32PostInc().
429     * @return the start position of the iteration range
430     * @stable ICU 2.0
431     */
432    inline int32_t    setToStart();
433
434    /**
435     * Sets the iterator to refer to the last code unit in its
436     * iteration range, and returns that code unit.
437     * This can be used to begin an iteration with previous().
438     * @return the last code unit.
439     * @stable ICU 2.0
440     */
441    virtual char16_t         last(void) = 0;
442
443    /**
444     * Sets the iterator to refer to the last code point in its
445     * iteration range, and returns that code unit.
446     * This can be used to begin an iteration with previous32().
447     * @return the last code point.
448     * @stable ICU 2.0
449     */
450    virtual UChar32       last32(void) = 0;
451
452    /**
453     * Sets the iterator to the end of its iteration range, just behind
454     * the last code unit or code point. This can be used to begin a backward
455     * iteration with previous() or previous32().
456     * @return the end position of the iteration range
457     * @stable ICU 2.0
458     */
459    inline int32_t    setToEnd();
460
461    /**
462     * Sets the iterator to refer to the "position"-th code unit
463     * in the text-storage object the iterator refers to, and
464     * returns that code unit.
465     * @param position the "position"-th code unit in the text-storage object
466     * @return the "position"-th code unit.
467     * @stable ICU 2.0
468     */
469    virtual char16_t         setIndex(int32_t position) = 0;
470
471    /**
472     * Sets the iterator to refer to the beginning of the code point
473     * that contains the "position"-th code unit
474     * in the text-storage object the iterator refers to, and
475     * returns that code point.
476     * The current position is adjusted to the beginning of the code point
477     * (its first code unit).
478     * @param position the "position"-th code unit in the text-storage object
479     * @return the "position"-th code point.
480     * @stable ICU 2.0
481     */
482    virtual UChar32       setIndex32(int32_t position) = 0;
483
484    /**
485     * Returns the code unit the iterator currently refers to.
486     * @return the current code unit.
487     * @stable ICU 2.0
488     */
489    virtual char16_t         current(void) const = 0;
490
491    /**
492     * Returns the code point the iterator currently refers to.
493     * @return the current code point.
494     * @stable ICU 2.0
495     */
496    virtual UChar32       current32(void) const = 0;
497
498    /**
499     * Advances to the next code unit in the iteration range
500     * (toward endIndex()), and returns that code unit.  If there are
501     * no more code units to return, returns DONE.
502     * @return the next code unit.
503     * @stable ICU 2.0
504     */
505    virtual char16_t         next(void) = 0;
506
507    /**
508     * Advances to the next code point in the iteration range
509     * (toward endIndex()), and returns that code point.  If there are
510     * no more code points to return, returns DONE.
511     * Note that iteration with "pre-increment" semantics is less
512     * efficient than iteration with "post-increment" semantics
513     * that is provided by next32PostInc().
514     * @return the next code point.
515     * @stable ICU 2.0
516     */
517    virtual UChar32       next32(void) = 0;
518
519    /**
520     * Advances to the previous code unit in the iteration range
521     * (toward startIndex()), and returns that code unit.  If there are
522     * no more code units to return, returns DONE.
523     * @return the previous code unit.
524     * @stable ICU 2.0
525     */
526    virtual char16_t         previous(void) = 0;
527
528    /**
529     * Advances to the previous code point in the iteration range
530     * (toward startIndex()), and returns that code point.  If there are
531     * no more code points to return, returns DONE.
532     * @return the previous code point.
533     * @stable ICU 2.0
534     */
535    virtual UChar32       previous32(void) = 0;
536
537    /**
538     * Returns false if there are no more code units or code points
539     * before the current position in the iteration range.
540     * This is used with previous() or previous32() in backward
541     * iteration.
542     * @return false if there are no more code units or code points
543     * before the current position in the iteration range, return true otherwise.
544     * @stable ICU 2.0
545     */
546    virtual UBool        hasPrevious() = 0;
547
548    /**
549     * Returns the numeric index in the underlying text-storage
550     * object of the character returned by first().  Since it's
551     * possible to create an iterator that iterates across only
552     * part of a text-storage object, this number isn't
553     * necessarily 0.
554     * @returns the numeric index in the underlying text-storage
555     * object of the character returned by first().
556     * @stable ICU 2.0
557     */
558    inline int32_t       startIndex(void) const;
559
560    /**
561     * Returns the numeric index in the underlying text-storage
562     * object of the position immediately BEYOND the character
563     * returned by last().
564     * @return the numeric index in the underlying text-storage
565     * object of the position immediately BEYOND the character
566     * returned by last().
567     * @stable ICU 2.0
568     */
569    inline int32_t       endIndex(void) const;
570
571    /**
572     * Returns the numeric index in the underlying text-storage
573     * object of the character the iterator currently refers to
574     * (i.e., the character returned by current()).
575     * @return the numeric index in the text-storage object of
576     * the character the iterator currently refers to
577     * @stable ICU 2.0
578     */
579    inline int32_t       getIndex(void) const;
580
581    /**
582     * Returns the length of the entire text in the underlying
583     * text-storage object.
584     * @return the length of the entire text in the text-storage object
585     * @stable ICU 2.0
586     */
587    inline int32_t           getLength() const;
588
589    /**
590     * Moves the current position relative to the start or end of the
591     * iteration range, or relative to the current position itself.
592     * The movement is expressed in numbers of code units forward
593     * or backward by specifying a positive or negative delta.
594     * @param delta the position relative to origin. A positive delta means forward;
595     * a negative delta means backward.
596     * @param origin Origin enumeration {kStart, kCurrent, kEnd}
597     * @return the new position
598     * @stable ICU 2.0
599     */
600    virtual int32_t      move(int32_t delta, EOrigin origin) = 0;
601
602    /**
603     * Moves the current position relative to the start or end of the
604     * iteration range, or relative to the current position itself.
605     * The movement is expressed in numbers of code points forward
606     * or backward by specifying a positive or negative delta.
607     * @param delta the position relative to origin. A positive delta means forward;
608     * a negative delta means backward.
609     * @param origin Origin enumeration {kStart, kCurrent, kEnd}
610     * @return the new position
611     * @stable ICU 2.0
612     */
613#ifdef move32
614     // One of the system headers right now is sometimes defining a conflicting macro we don't use
615#undef move32
616#endif
617    virtual int32_t      move32(int32_t delta, EOrigin origin) = 0;
618
619    /**
620     * Copies the text under iteration into the UnicodeString
621     * referred to by "result".
622     * @param result Receives a copy of the text under iteration.
623     * @stable ICU 2.0
624     */
625    virtual void            getText(UnicodeString&  result) = 0;
626
627protected:
628    /**
629     * Empty constructor.
630     * @stable ICU 2.0
631     */
632    CharacterIterator();
633
634    /**
635     * Constructor, just setting the length field in this base class.
636     * @stable ICU 2.0
637     */
638    CharacterIterator(int32_t length);
639
640    /**
641     * Constructor, just setting the length and position fields in this base class.
642     * @stable ICU 2.0
643     */
644    CharacterIterator(int32_t length, int32_t position);
645
646    /**
647     * Constructor, just setting the length, start, end, and position fields in this base class.
648     * @stable ICU 2.0
649     */
650    CharacterIterator(int32_t length, int32_t textBegin, int32_t textEnd, int32_t position);
651
652    /**
653     * Copy constructor.
654     *
655     * @param that The CharacterIterator to be copied
656     * @stable ICU 2.0
657     */
658    CharacterIterator(const CharacterIterator &that);
659
660    /**
661     * Assignment operator.  Sets this CharacterIterator to have the same behavior,
662     * as the one passed in.
663     * @param that The CharacterIterator passed in.
664     * @return the newly set CharacterIterator.
665     * @stable ICU 2.0
666     */
667    CharacterIterator &operator=(const CharacterIterator &that);
668
669    /**
670     * Base class text length field.
671     * Necessary this for correct getText() and hashCode().
672     * @stable ICU 2.0
673     */
674    int32_t textLength;
675
676    /**
677     * Base class field for the current position.
678     * @stable ICU 2.0
679     */
680    int32_t  pos;
681
682    /**
683     * Base class field for the start of the iteration range.
684     * @stable ICU 2.0
685     */
686    int32_t  begin;
687
688    /**
689     * Base class field for the end of the iteration range.
690     * @stable ICU 2.0
691     */
692    int32_t  end;
693};
694
695inline bool
696ForwardCharacterIterator::operator!=(const ForwardCharacterIterator& that) const {
697    return !operator==(that);
698}
699
700inline int32_t
701CharacterIterator::setToStart() {
702    return move(0, kStart);
703}
704
705inline int32_t
706CharacterIterator::setToEnd() {
707    return move(0, kEnd);
708}
709
710inline int32_t
711CharacterIterator::startIndex(void) const {
712    return begin;
713}
714
715inline int32_t
716CharacterIterator::endIndex(void) const {
717    return end;
718}
719
720inline int32_t
721CharacterIterator::getIndex(void) const {
722    return pos;
723}
724
725inline int32_t
726CharacterIterator::getLength(void) const {
727    return textLength;
728}
729
730U_NAMESPACE_END
731
732#endif /* U_SHOW_CPLUSPLUS_API */
733
734#endif
735