1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4******************************************************************** 5* 6* Copyright (C) 1997-2011, International Business Machines 7* Corporation and others. All Rights Reserved. 8* 9******************************************************************** 10*/ 11 12#ifndef CHARITER_H 13#define CHARITER_H 14 15#include "unicode/utypes.h" 16 17#if U_SHOW_CPLUSPLUS_API 18 19#include "unicode/uobject.h" 20#include "unicode/unistr.h" 21/** 22 * \file 23 * \brief C++ API: Character Iterator 24 */ 25 26U_NAMESPACE_BEGIN 27/** 28 * Abstract class that defines an API for forward-only iteration 29 * on text objects. 30 * This is a minimal interface for iteration without random access 31 * or backwards iteration. It is especially useful for wrapping 32 * streams with converters into an object for collation or 33 * normalization. 34 * 35 * <p>Characters can be accessed in two ways: as code units or as 36 * code points. 37 * Unicode code points are 21-bit integers and are the scalar values 38 * of Unicode characters. ICU uses the type UChar32 for them. 39 * Unicode code units are the storage units of a given 40 * Unicode/UCS Transformation Format (a character encoding scheme). 41 * With UTF-16, all code points can be represented with either one 42 * or two code units ("surrogates"). 43 * String storage is typically based on code units, while properties 44 * of characters are typically determined using code point values. 45 * Some processes may be designed to work with sequences of code units, 46 * or it may be known that all characters that are important to an 47 * algorithm can be represented with single code units. 48 * Other processes will need to use the code point access functions.</p> 49 * 50 * <p>ForwardCharacterIterator provides nextPostInc() to access 51 * a code unit and advance an internal position into the text object, 52 * similar to a <code>return text[position++]</code>.<br> 53 * It provides next32PostInc() to access a code point and advance an internal 54 * position.</p> 55 * 56 * <p>next32PostInc() assumes that the current position is that of 57 * the beginning of a code point, i.e., of its first code unit. 58 * After next32PostInc(), this will be true again. 59 * In general, access to code units and code points in the same 60 * iteration loop should not be mixed. In UTF-16, if the current position 61 * is on a second code unit (Low Surrogate), then only that code unit 62 * is returned even by next32PostInc().</p> 63 * 64 * <p>For iteration with either function, there are two ways to 65 * check for the end of the iteration. When there are no more 66 * characters in the text object: 67 * <ul> 68 * <li>The hasNext() function returns false.</li> 69 * <li>nextPostInc() and next32PostInc() return DONE 70 * when one attempts to read beyond the end of the text object.</li> 71 * </ul> 72 * 73 * Example: 74 * \code 75 * void function1(ForwardCharacterIterator &it) { 76 * UChar32 c; 77 * while(it.hasNext()) { 78 * c=it.next32PostInc(); 79 * // use c 80 * } 81 * } 82 * 83 * void function1(ForwardCharacterIterator &it) { 84 * char16_t c; 85 * while((c=it.nextPostInc())!=ForwardCharacterIterator::DONE) { 86 * // use c 87 * } 88 * } 89 * \endcode 90 * </p> 91 * 92 * @stable ICU 2.0 93 */ 94class U_COMMON_API ForwardCharacterIterator : public UObject { 95public: 96 /** 97 * Value returned by most of ForwardCharacterIterator's functions 98 * when the iterator has reached the limits of its iteration. 99 * @stable ICU 2.0 100 */ 101 enum { DONE = 0xffff }; 102 103 /** 104 * Destructor. 105 * @stable ICU 2.0 106 */ 107 virtual ~ForwardCharacterIterator(); 108 109 /** 110 * Returns true when both iterators refer to the same 111 * character in the same character-storage object. 112 * @param that The ForwardCharacterIterator to be compared for equality 113 * @return true when both iterators refer to the same 114 * character in the same character-storage object 115 * @stable ICU 2.0 116 */ 117 virtual bool operator==(const ForwardCharacterIterator& that) const = 0; 118 119 /** 120 * Returns true when the iterators refer to different 121 * text-storage objects, or to different characters in the 122 * same text-storage object. 123 * @param that The ForwardCharacterIterator to be compared for inequality 124 * @return true when the iterators refer to different 125 * text-storage objects, or to different characters in the 126 * same text-storage object 127 * @stable ICU 2.0 128 */ 129 inline bool operator!=(const ForwardCharacterIterator& that) const; 130 131 /** 132 * Generates a hash code for this iterator. 133 * @return the hash code. 134 * @stable ICU 2.0 135 */ 136 virtual int32_t hashCode(void) const = 0; 137 138 /** 139 * Returns a UClassID for this ForwardCharacterIterator ("poor man's 140 * RTTI").<P> Despite the fact that this function is public, 141 * DO NOT CONSIDER IT PART OF CHARACTERITERATOR'S API! 142 * @return a UClassID for this ForwardCharacterIterator 143 * @stable ICU 2.0 144 */ 145 virtual UClassID getDynamicClassID(void) const override = 0; 146 147 /** 148 * Gets the current code unit for returning and advances to the next code unit 149 * in the iteration range 150 * (toward endIndex()). If there are 151 * no more code units to return, returns DONE. 152 * @return the current code unit. 153 * @stable ICU 2.0 154 */ 155 virtual char16_t nextPostInc(void) = 0; 156 157 /** 158 * Gets the current code point for returning and advances to the next code point 159 * in the iteration range 160 * (toward endIndex()). If there are 161 * no more code points to return, returns DONE. 162 * @return the current code point. 163 * @stable ICU 2.0 164 */ 165 virtual UChar32 next32PostInc(void) = 0; 166 167 /** 168 * Returns false if there are no more code units or code points 169 * at or after the current position in the iteration range. 170 * This is used with nextPostInc() or next32PostInc() in forward 171 * iteration. 172 * @returns false if there are no more code units or code points 173 * at or after the current position in the iteration range. 174 * @stable ICU 2.0 175 */ 176 virtual UBool hasNext() = 0; 177 178protected: 179 /** Default constructor to be overridden in the implementing class. @stable ICU 2.0*/ 180 ForwardCharacterIterator(); 181 182 /** Copy constructor to be overridden in the implementing class. @stable ICU 2.0*/ 183 ForwardCharacterIterator(const ForwardCharacterIterator &other); 184 185 /** 186 * Assignment operator to be overridden in the implementing class. 187 * @stable ICU 2.0 188 */ 189 ForwardCharacterIterator &operator=(const ForwardCharacterIterator&) { return *this; } 190}; 191 192/** 193 * Abstract class that defines an API for iteration 194 * on text objects. 195 * This is an interface for forward and backward iteration 196 * and random access into a text object. 197 * 198 * <p>The API provides backward compatibility to the Java and older ICU 199 * CharacterIterator classes but extends them significantly: 200 * <ol> 201 * <li>CharacterIterator is now a subclass of ForwardCharacterIterator.</li> 202 * <li>While the old API functions provided forward iteration with 203 * "pre-increment" semantics, the new one also provides functions 204 * with "post-increment" semantics. They are more efficient and should 205 * be the preferred iterator functions for new implementations. 206 * The backward iteration always had "pre-decrement" semantics, which 207 * are efficient.</li> 208 * <li>Just like ForwardCharacterIterator, it provides access to 209 * both code units and code points. Code point access versions are available 210 * for the old and the new iteration semantics.</li> 211 * <li>There are new functions for setting and moving the current position 212 * without returning a character, for efficiency.</li> 213 * </ol> 214 * 215 * See ForwardCharacterIterator for examples for using the new forward iteration 216 * functions. For backward iteration, there is also a hasPrevious() function 217 * that can be used analogously to hasNext(). 218 * The old functions work as before and are shown below.</p> 219 * 220 * <p>Examples for some of the new functions:</p> 221 * 222 * Forward iteration with hasNext(): 223 * \code 224 * void forward1(CharacterIterator &it) { 225 * UChar32 c; 226 * for(it.setToStart(); it.hasNext();) { 227 * c=it.next32PostInc(); 228 * // use c 229 * } 230 * } 231 * \endcode 232 * Forward iteration more similar to loops with the old forward iteration, 233 * showing a way to convert simple for() loops: 234 * \code 235 * void forward2(CharacterIterator &it) { 236 * char16_t c; 237 * for(c=it.firstPostInc(); c!=CharacterIterator::DONE; c=it.nextPostInc()) { 238 * // use c 239 * } 240 * } 241 * \endcode 242 * Backward iteration with setToEnd() and hasPrevious(): 243 * \code 244 * void backward1(CharacterIterator &it) { 245 * UChar32 c; 246 * for(it.setToEnd(); it.hasPrevious();) { 247 * c=it.previous32(); 248 * // use c 249 * } 250 * } 251 * \endcode 252 * Backward iteration with a more traditional for() loop: 253 * \code 254 * void backward2(CharacterIterator &it) { 255 * char16_t c; 256 * for(c=it.last(); c!=CharacterIterator::DONE; c=it.previous()) { 257 * // use c 258 * } 259 * } 260 * \endcode 261 * 262 * Example for random access: 263 * \code 264 * void random(CharacterIterator &it) { 265 * // set to the third code point from the beginning 266 * it.move32(3, CharacterIterator::kStart); 267 * // get a code point from here without moving the position 268 * UChar32 c=it.current32(); 269 * // get the position 270 * int32_t pos=it.getIndex(); 271 * // get the previous code unit 272 * char16_t u=it.previous(); 273 * // move back one more code unit 274 * it.move(-1, CharacterIterator::kCurrent); 275 * // set the position back to where it was 276 * // and read the same code point c and move beyond it 277 * it.setIndex(pos); 278 * if(c!=it.next32PostInc()) { 279 * exit(1); // CharacterIterator inconsistent 280 * } 281 * } 282 * \endcode 283 * 284 * <p>Examples, especially for the old API:</p> 285 * 286 * Function processing characters, in this example simple output 287 * <pre> 288 * \code 289 * void processChar( char16_t c ) 290 * { 291 * cout << " " << c; 292 * } 293 * \endcode 294 * </pre> 295 * Traverse the text from start to finish 296 * <pre> 297 * \code 298 * void traverseForward(CharacterIterator& iter) 299 * { 300 * for(char16_t c = iter.first(); c != CharacterIterator.DONE; c = iter.next()) { 301 * processChar(c); 302 * } 303 * } 304 * \endcode 305 * </pre> 306 * Traverse the text backwards, from end to start 307 * <pre> 308 * \code 309 * void traverseBackward(CharacterIterator& iter) 310 * { 311 * for(char16_t c = iter.last(); c != CharacterIterator.DONE; c = iter.previous()) { 312 * processChar(c); 313 * } 314 * } 315 * \endcode 316 * </pre> 317 * Traverse both forward and backward from a given position in the text. 318 * Calls to notBoundary() in this example represents some additional stopping criteria. 319 * <pre> 320 * \code 321 * void traverseOut(CharacterIterator& iter, int32_t pos) 322 * { 323 * char16_t c; 324 * for (c = iter.setIndex(pos); 325 * c != CharacterIterator.DONE && (Unicode::isLetter(c) || Unicode::isDigit(c)); 326 * c = iter.next()) {} 327 * int32_t end = iter.getIndex(); 328 * for (c = iter.setIndex(pos); 329 * c != CharacterIterator.DONE && (Unicode::isLetter(c) || Unicode::isDigit(c)); 330 * c = iter.previous()) {} 331 * int32_t start = iter.getIndex() + 1; 332 * 333 * cout << "start: " << start << " end: " << end << endl; 334 * for (c = iter.setIndex(start); iter.getIndex() < end; c = iter.next() ) { 335 * processChar(c); 336 * } 337 * } 338 * \endcode 339 * </pre> 340 * Creating a StringCharacterIterator and calling the test functions 341 * <pre> 342 * \code 343 * void CharacterIterator_Example( void ) 344 * { 345 * cout << endl << "===== CharacterIterator_Example: =====" << endl; 346 * UnicodeString text("Ein kleiner Satz."); 347 * StringCharacterIterator iterator(text); 348 * cout << "----- traverseForward: -----------" << endl; 349 * traverseForward( iterator ); 350 * cout << endl << endl << "----- traverseBackward: ----------" << endl; 351 * traverseBackward( iterator ); 352 * cout << endl << endl << "----- traverseOut: ---------------" << endl; 353 * traverseOut( iterator, 7 ); 354 * cout << endl << endl << "-----" << endl; 355 * } 356 * \endcode 357 * </pre> 358 * 359 * @stable ICU 2.0 360 */ 361class U_COMMON_API CharacterIterator : public ForwardCharacterIterator { 362public: 363 /** 364 * Origin enumeration for the move() and move32() functions. 365 * @stable ICU 2.0 366 */ 367 enum EOrigin { kStart, kCurrent, kEnd }; 368 369 /** 370 * Destructor. 371 * @stable ICU 2.0 372 */ 373 virtual ~CharacterIterator(); 374 375 /** 376 * Returns a pointer to a new CharacterIterator of the same 377 * concrete class as this one, and referring to the same 378 * character in the same text-storage object as this one. The 379 * caller is responsible for deleting the new clone. 380 * @return a pointer to a new CharacterIterator 381 * @stable ICU 2.0 382 */ 383 virtual CharacterIterator* clone() const = 0; 384 385 /** 386 * Sets the iterator to refer to the first code unit in its 387 * iteration range, and returns that code unit. 388 * This can be used to begin an iteration with next(). 389 * @return the first code unit in its iteration range. 390 * @stable ICU 2.0 391 */ 392 virtual char16_t first(void) = 0; 393 394 /** 395 * Sets the iterator to refer to the first code unit in its 396 * iteration range, returns that code unit, and moves the position 397 * to the second code unit. This is an alternative to setToStart() 398 * for forward iteration with nextPostInc(). 399 * @return the first code unit in its iteration range. 400 * @stable ICU 2.0 401 */ 402 virtual char16_t firstPostInc(void); 403 404 /** 405 * Sets the iterator to refer to the first code point in its 406 * iteration range, and returns that code unit, 407 * This can be used to begin an iteration with next32(). 408 * Note that an iteration with next32PostInc(), beginning with, 409 * e.g., setToStart() or firstPostInc(), is more efficient. 410 * @return the first code point in its iteration range. 411 * @stable ICU 2.0 412 */ 413 virtual UChar32 first32(void) = 0; 414 415 /** 416 * Sets the iterator to refer to the first code point in its 417 * iteration range, returns that code point, and moves the position 418 * to the second code point. This is an alternative to setToStart() 419 * for forward iteration with next32PostInc(). 420 * @return the first code point in its iteration range. 421 * @stable ICU 2.0 422 */ 423 virtual UChar32 first32PostInc(void); 424 425 /** 426 * Sets the iterator to refer to the first code unit or code point in its 427 * iteration range. This can be used to begin a forward 428 * iteration with nextPostInc() or next32PostInc(). 429 * @return the start position of the iteration range 430 * @stable ICU 2.0 431 */ 432 inline int32_t setToStart(); 433 434 /** 435 * Sets the iterator to refer to the last code unit in its 436 * iteration range, and returns that code unit. 437 * This can be used to begin an iteration with previous(). 438 * @return the last code unit. 439 * @stable ICU 2.0 440 */ 441 virtual char16_t last(void) = 0; 442 443 /** 444 * Sets the iterator to refer to the last code point in its 445 * iteration range, and returns that code unit. 446 * This can be used to begin an iteration with previous32(). 447 * @return the last code point. 448 * @stable ICU 2.0 449 */ 450 virtual UChar32 last32(void) = 0; 451 452 /** 453 * Sets the iterator to the end of its iteration range, just behind 454 * the last code unit or code point. This can be used to begin a backward 455 * iteration with previous() or previous32(). 456 * @return the end position of the iteration range 457 * @stable ICU 2.0 458 */ 459 inline int32_t setToEnd(); 460 461 /** 462 * Sets the iterator to refer to the "position"-th code unit 463 * in the text-storage object the iterator refers to, and 464 * returns that code unit. 465 * @param position the "position"-th code unit in the text-storage object 466 * @return the "position"-th code unit. 467 * @stable ICU 2.0 468 */ 469 virtual char16_t setIndex(int32_t position) = 0; 470 471 /** 472 * Sets the iterator to refer to the beginning of the code point 473 * that contains the "position"-th code unit 474 * in the text-storage object the iterator refers to, and 475 * returns that code point. 476 * The current position is adjusted to the beginning of the code point 477 * (its first code unit). 478 * @param position the "position"-th code unit in the text-storage object 479 * @return the "position"-th code point. 480 * @stable ICU 2.0 481 */ 482 virtual UChar32 setIndex32(int32_t position) = 0; 483 484 /** 485 * Returns the code unit the iterator currently refers to. 486 * @return the current code unit. 487 * @stable ICU 2.0 488 */ 489 virtual char16_t current(void) const = 0; 490 491 /** 492 * Returns the code point the iterator currently refers to. 493 * @return the current code point. 494 * @stable ICU 2.0 495 */ 496 virtual UChar32 current32(void) const = 0; 497 498 /** 499 * Advances to the next code unit in the iteration range 500 * (toward endIndex()), and returns that code unit. If there are 501 * no more code units to return, returns DONE. 502 * @return the next code unit. 503 * @stable ICU 2.0 504 */ 505 virtual char16_t next(void) = 0; 506 507 /** 508 * Advances to the next code point in the iteration range 509 * (toward endIndex()), and returns that code point. If there are 510 * no more code points to return, returns DONE. 511 * Note that iteration with "pre-increment" semantics is less 512 * efficient than iteration with "post-increment" semantics 513 * that is provided by next32PostInc(). 514 * @return the next code point. 515 * @stable ICU 2.0 516 */ 517 virtual UChar32 next32(void) = 0; 518 519 /** 520 * Advances to the previous code unit in the iteration range 521 * (toward startIndex()), and returns that code unit. If there are 522 * no more code units to return, returns DONE. 523 * @return the previous code unit. 524 * @stable ICU 2.0 525 */ 526 virtual char16_t previous(void) = 0; 527 528 /** 529 * Advances to the previous code point in the iteration range 530 * (toward startIndex()), and returns that code point. If there are 531 * no more code points to return, returns DONE. 532 * @return the previous code point. 533 * @stable ICU 2.0 534 */ 535 virtual UChar32 previous32(void) = 0; 536 537 /** 538 * Returns false if there are no more code units or code points 539 * before the current position in the iteration range. 540 * This is used with previous() or previous32() in backward 541 * iteration. 542 * @return false if there are no more code units or code points 543 * before the current position in the iteration range, return true otherwise. 544 * @stable ICU 2.0 545 */ 546 virtual UBool hasPrevious() = 0; 547 548 /** 549 * Returns the numeric index in the underlying text-storage 550 * object of the character returned by first(). Since it's 551 * possible to create an iterator that iterates across only 552 * part of a text-storage object, this number isn't 553 * necessarily 0. 554 * @returns the numeric index in the underlying text-storage 555 * object of the character returned by first(). 556 * @stable ICU 2.0 557 */ 558 inline int32_t startIndex(void) const; 559 560 /** 561 * Returns the numeric index in the underlying text-storage 562 * object of the position immediately BEYOND the character 563 * returned by last(). 564 * @return the numeric index in the underlying text-storage 565 * object of the position immediately BEYOND the character 566 * returned by last(). 567 * @stable ICU 2.0 568 */ 569 inline int32_t endIndex(void) const; 570 571 /** 572 * Returns the numeric index in the underlying text-storage 573 * object of the character the iterator currently refers to 574 * (i.e., the character returned by current()). 575 * @return the numeric index in the text-storage object of 576 * the character the iterator currently refers to 577 * @stable ICU 2.0 578 */ 579 inline int32_t getIndex(void) const; 580 581 /** 582 * Returns the length of the entire text in the underlying 583 * text-storage object. 584 * @return the length of the entire text in the text-storage object 585 * @stable ICU 2.0 586 */ 587 inline int32_t getLength() const; 588 589 /** 590 * Moves the current position relative to the start or end of the 591 * iteration range, or relative to the current position itself. 592 * The movement is expressed in numbers of code units forward 593 * or backward by specifying a positive or negative delta. 594 * @param delta the position relative to origin. A positive delta means forward; 595 * a negative delta means backward. 596 * @param origin Origin enumeration {kStart, kCurrent, kEnd} 597 * @return the new position 598 * @stable ICU 2.0 599 */ 600 virtual int32_t move(int32_t delta, EOrigin origin) = 0; 601 602 /** 603 * Moves the current position relative to the start or end of the 604 * iteration range, or relative to the current position itself. 605 * The movement is expressed in numbers of code points forward 606 * or backward by specifying a positive or negative delta. 607 * @param delta the position relative to origin. A positive delta means forward; 608 * a negative delta means backward. 609 * @param origin Origin enumeration {kStart, kCurrent, kEnd} 610 * @return the new position 611 * @stable ICU 2.0 612 */ 613#ifdef move32 614 // One of the system headers right now is sometimes defining a conflicting macro we don't use 615#undef move32 616#endif 617 virtual int32_t move32(int32_t delta, EOrigin origin) = 0; 618 619 /** 620 * Copies the text under iteration into the UnicodeString 621 * referred to by "result". 622 * @param result Receives a copy of the text under iteration. 623 * @stable ICU 2.0 624 */ 625 virtual void getText(UnicodeString& result) = 0; 626 627protected: 628 /** 629 * Empty constructor. 630 * @stable ICU 2.0 631 */ 632 CharacterIterator(); 633 634 /** 635 * Constructor, just setting the length field in this base class. 636 * @stable ICU 2.0 637 */ 638 CharacterIterator(int32_t length); 639 640 /** 641 * Constructor, just setting the length and position fields in this base class. 642 * @stable ICU 2.0 643 */ 644 CharacterIterator(int32_t length, int32_t position); 645 646 /** 647 * Constructor, just setting the length, start, end, and position fields in this base class. 648 * @stable ICU 2.0 649 */ 650 CharacterIterator(int32_t length, int32_t textBegin, int32_t textEnd, int32_t position); 651 652 /** 653 * Copy constructor. 654 * 655 * @param that The CharacterIterator to be copied 656 * @stable ICU 2.0 657 */ 658 CharacterIterator(const CharacterIterator &that); 659 660 /** 661 * Assignment operator. Sets this CharacterIterator to have the same behavior, 662 * as the one passed in. 663 * @param that The CharacterIterator passed in. 664 * @return the newly set CharacterIterator. 665 * @stable ICU 2.0 666 */ 667 CharacterIterator &operator=(const CharacterIterator &that); 668 669 /** 670 * Base class text length field. 671 * Necessary this for correct getText() and hashCode(). 672 * @stable ICU 2.0 673 */ 674 int32_t textLength; 675 676 /** 677 * Base class field for the current position. 678 * @stable ICU 2.0 679 */ 680 int32_t pos; 681 682 /** 683 * Base class field for the start of the iteration range. 684 * @stable ICU 2.0 685 */ 686 int32_t begin; 687 688 /** 689 * Base class field for the end of the iteration range. 690 * @stable ICU 2.0 691 */ 692 int32_t end; 693}; 694 695inline bool 696ForwardCharacterIterator::operator!=(const ForwardCharacterIterator& that) const { 697 return !operator==(that); 698} 699 700inline int32_t 701CharacterIterator::setToStart() { 702 return move(0, kStart); 703} 704 705inline int32_t 706CharacterIterator::setToEnd() { 707 return move(0, kEnd); 708} 709 710inline int32_t 711CharacterIterator::startIndex(void) const { 712 return begin; 713} 714 715inline int32_t 716CharacterIterator::endIndex(void) const { 717 return end; 718} 719 720inline int32_t 721CharacterIterator::getIndex(void) const { 722 return pos; 723} 724 725inline int32_t 726CharacterIterator::getLength(void) const { 727 return textLength; 728} 729 730U_NAMESPACE_END 731 732#endif /* U_SHOW_CPLUSPLUS_API */ 733 734#endif 735