// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
********************************************************************
*
* Copyright (C) 1997-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*
********************************************************************
*/

12#ifndef CHARITER_H
13#define CHARITER_H
14
15#include "unicode/utypes.h"
16#include "unicode/uobject.h"
17#include "unicode/unistr.h"
18/**
19 * \file
20 * \brief C++ API: Character Iterator
21 */
22
23U_NAMESPACE_BEGIN
24/**
25 * Abstract class that defines an API for forward-only iteration
26 * on text objects.
27 * This is a minimal interface for iteration without random access
28 * or backwards iteration. It is especially useful for wrapping
29 * streams with converters into an object for collation or
30 * normalization.
31 *
32 * <p>Characters can be accessed in two ways: as code units or as
33 * code points.
34 * Unicode code points are 21-bit integers and are the scalar values
35 * of Unicode characters. ICU uses the type UChar32 for them.
36 * Unicode code units are the storage units of a given
37 * Unicode/UCS Transformation Format (a character encoding scheme).
38 * With UTF-16, all code points can be represented with either one
39 * or two code units ("surrogates").
40 * String storage is typically based on code units, while properties
41 * of characters are typically determined using code point values.
42 * Some processes may be designed to work with sequences of code units,
43 * or it may be known that all characters that are important to an
44 * algorithm can be represented with single code units.
45 * Other processes will need to use the code point access functions.</p>
46 *
47 * <p>ForwardCharacterIterator provides nextPostInc() to access
48 * a code unit and advance an internal position into the text object,
49 * similar to a <code>return text[position++]</code>.<br>
50 * It provides next32PostInc() to access a code point and advance an internal
51 * position.</p>
52 *
53 * <p>next32PostInc() assumes that the current position is that of
54 * the beginning of a code point, i.e., of its first code unit.
55 * After next32PostInc(), this will be true again.
56 * In general, access to code units and code points in the same
57 * iteration loop should not be mixed. In UTF-16, if the current position
58 * is on a second code unit (Low Surrogate), then only that code unit
59 * is returned even by next32PostInc().</p>
60 *
61 * <p>For iteration with either function, there are two ways to
62 * check for the end of the iteration. When there are no more
63 * characters in the text object:
64 * <ul>
65 * <li>The hasNext() function returns FALSE.</li>
66 * <li>nextPostInc() and next32PostInc() return DONE
67 * when one attempts to read beyond the end of the text object.</li>
68 * </ul>
69 *
70 * Example:
71 * \code
72 * void function1(ForwardCharacterIterator &it) {
73 * UChar32 c;
74 * while(it.hasNext()) {
75 * c=it.next32PostInc();
76 * // use c
77 * }
78 * }
79 *
80 * void function1(ForwardCharacterIterator &it) {
81 * char16_t c;
82 * while((c=it.nextPostInc())!=ForwardCharacterIterator::DONE) {
83 * // use c
84 * }
85 * }
86 * \endcode
87 * </p>
88 *
89 * @stable ICU 2.0
90 */
91class U_COMMON_API ForwardCharacterIterator : public UObject {
92public:
93 /**
94 * Value returned by most of ForwardCharacterIterator's functions
95 * when the iterator has reached the limits of its iteration.
96 * @stable ICU 2.0
97 */
98 enum { DONE = 0xffff };
99
100 /**
101 * Destructor.
102 * @stable ICU 2.0
103 */
104 virtual ~ForwardCharacterIterator();
105
106 /**
107 * Returns true when both iterators refer to the same
108 * character in the same character-storage object.
109 * @param that The ForwardCharacterIterator to be compared for equality
110 * @return true when both iterators refer to the same
111 * character in the same character-storage object
112 * @stable ICU 2.0
113 */
114 virtual UBool operator==(const ForwardCharacterIterator& that) const = 0;
115
116 /**
117 * Returns true when the iterators refer to different
118 * text-storage objects, or to different characters in the
119 * same text-storage object.
120 * @param that The ForwardCharacterIterator to be compared for inequality
121 * @return true when the iterators refer to different
122 * text-storage objects, or to different characters in the
123 * same text-storage object
124 * @stable ICU 2.0
125 */
126 inline UBool operator!=(const ForwardCharacterIterator& that) const;
127
128 /**
129 * Generates a hash code for this iterator.
130 * @return the hash code.
131 * @stable ICU 2.0
132 */
133 virtual int32_t hashCode(void) const = 0;
134
135 /**
136 * Returns a UClassID for this ForwardCharacterIterator ("poor man's
137 * RTTI").<P> Despite the fact that this function is public,
138 * DO NOT CONSIDER IT PART OF CHARACTERITERATOR'S API!
139 * @return a UClassID for this ForwardCharacterIterator
140 * @stable ICU 2.0
141 */
142 virtual UClassID getDynamicClassID(void) const = 0;
143
144 /**
145 * Gets the current code unit for returning and advances to the next code unit
146 * in the iteration range
147 * (toward endIndex()). If there are
148 * no more code units to return, returns DONE.
149 * @return the current code unit.
150 * @stable ICU 2.0
151 */
152 virtual char16_t nextPostInc(void) = 0;
153
154 /**
155 * Gets the current code point for returning and advances to the next code point
156 * in the iteration range
157 * (toward endIndex()). If there are
158 * no more code points to return, returns DONE.
159 * @return the current code point.
160 * @stable ICU 2.0
161 */
162 virtual UChar32 next32PostInc(void) = 0;
163
164 /**
165 * Returns FALSE if there are no more code units or code points
166 * at or after the current position in the iteration range.
167 * This is used with nextPostInc() or next32PostInc() in forward
168 * iteration.
169 * @returns FALSE if there are no more code units or code points
170 * at or after the current position in the iteration range.
171 * @stable ICU 2.0
172 */
173 virtual UBool hasNext() = 0;
174
175protected:
176 /** Default constructor to be overridden in the implementing class. @stable ICU 2.0*/
177 ForwardCharacterIterator();
178
179 /** Copy constructor to be overridden in the implementing class. @stable ICU 2.0*/
180 ForwardCharacterIterator(const ForwardCharacterIterator &other);
181
182 /**
183 * Assignment operator to be overridden in the implementing class.
184 * @stable ICU 2.0
185 */
186 ForwardCharacterIterator &operator=(const ForwardCharacterIterator&) { return *this; }
187};
188
189/**
190 * Abstract class that defines an API for iteration
191 * on text objects.
192 * This is an interface for forward and backward iteration
193 * and random access into a text object.
194 *
195 * <p>The API provides backward compatibility to the Java and older ICU
196 * CharacterIterator classes but extends them significantly:
197 * <ol>
198 * <li>CharacterIterator is now a subclass of ForwardCharacterIterator.</li>
199 * <li>While the old API functions provided forward iteration with
200 * "pre-increment" semantics, the new one also provides functions
201 * with "post-increment" semantics. They are more efficient and should
202 * be the preferred iterator functions for new implementations.
203 * The backward iteration always had "pre-decrement" semantics, which
204 * are efficient.</li>
205 * <li>Just like ForwardCharacterIterator, it provides access to
206 * both code units and code points. Code point access versions are available
207 * for the old and the new iteration semantics.</li>
208 * <li>There are new functions for setting and moving the current position
209 * without returning a character, for efficiency.</li>
210 * </ol>
211 *
212 * See ForwardCharacterIterator for examples for using the new forward iteration
213 * functions. For backward iteration, there is also a hasPrevious() function
214 * that can be used analogously to hasNext().
215 * The old functions work as before and are shown below.</p>
216 *
217 * <p>Examples for some of the new functions:</p>
218 *
219 * Forward iteration with hasNext():
220 * \code
221 * void forward1(CharacterIterator &it) {
222 * UChar32 c;
223 * for(it.setToStart(); it.hasNext();) {
224 * c=it.next32PostInc();
225 * // use c
226 * }
227 * }
228 * \endcode
229 * Forward iteration more similar to loops with the old forward iteration,
230 * showing a way to convert simple for() loops:
231 * \code
232 * void forward2(CharacterIterator &it) {
233 * char16_t c;
234 * for(c=it.firstPostInc(); c!=CharacterIterator::DONE; c=it.nextPostInc()) {
235 * // use c
236 * }
237 * }
238 * \endcode
239 * Backward iteration with setToEnd() and hasPrevious():
240 * \code
241 * void backward1(CharacterIterator &it) {
242 * UChar32 c;
243 * for(it.setToEnd(); it.hasPrevious();) {
244 * c=it.previous32();
245 * // use c
246 * }
247 * }
248 * \endcode
249 * Backward iteration with a more traditional for() loop:
250 * \code
251 * void backward2(CharacterIterator &it) {
252 * char16_t c;
253 * for(c=it.last(); c!=CharacterIterator::DONE; c=it.previous()) {
254 * // use c
255 * }
256 * }
257 * \endcode
258 *
259 * Example for random access:
260 * \code
261 * void random(CharacterIterator &it) {
262 * // set to the third code point from the beginning
263 * it.move32(3, CharacterIterator::kStart);
264 * // get a code point from here without moving the position
265 * UChar32 c=it.current32();
266 * // get the position
267 * int32_t pos=it.getIndex();
268 * // get the previous code unit
269 * char16_t u=it.previous();
270 * // move back one more code unit
271 * it.move(-1, CharacterIterator::kCurrent);
272 * // set the position back to where it was
273 * // and read the same code point c and move beyond it
274 * it.setIndex(pos);
275 * if(c!=it.next32PostInc()) {
276 * exit(1); // CharacterIterator inconsistent
277 * }
278 * }
279 * \endcode
280 *
281 * <p>Examples, especially for the old API:</p>
282 *
283 * Function processing characters, in this example simple output
284 * <pre>
285 * \code
286 * void processChar( char16_t c )
287 * {
288 * cout << " " << c;
289 * }
290 * \endcode
291 * </pre>
292 * Traverse the text from start to finish
293 * <pre>
294 * \code
295 * void traverseForward(CharacterIterator& iter)
296 * {
297 * for(char16_t c = iter.first(); c != CharacterIterator.DONE; c = iter.next()) {
298 * processChar(c);
299 * }
300 * }
301 * \endcode
302 * </pre>
303 * Traverse the text backwards, from end to start
304 * <pre>
305 * \code
306 * void traverseBackward(CharacterIterator& iter)
307 * {
308 * for(char16_t c = iter.last(); c != CharacterIterator.DONE; c = iter.previous()) {
309 * processChar(c);
310 * }
311 * }
312 * \endcode
313 * </pre>
314 * Traverse both forward and backward from a given position in the text.
315 * Calls to notBoundary() in this example represents some additional stopping criteria.
316 * <pre>
317 * \code
318 * void traverseOut(CharacterIterator& iter, int32_t pos)
319 * {
320 * char16_t c;
321 * for (c = iter.setIndex(pos);
322 * c != CharacterIterator.DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
323 * c = iter.next()) {}
324 * int32_t end = iter.getIndex();
325 * for (c = iter.setIndex(pos);
326 * c != CharacterIterator.DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
327 * c = iter.previous()) {}
328 * int32_t start = iter.getIndex() + 1;
329 *
330 * cout << "start: " << start << " end: " << end << endl;
331 * for (c = iter.setIndex(start); iter.getIndex() < end; c = iter.next() ) {
332 * processChar(c);
333 * }
334 * }
335 * \endcode
336 * </pre>
337 * Creating a StringCharacterIterator and calling the test functions
338 * <pre>
339 * \code
340 * void CharacterIterator_Example( void )
341 * {
342 * cout << endl << "===== CharacterIterator_Example: =====" << endl;
343 * UnicodeString text("Ein kleiner Satz.");
344 * StringCharacterIterator iterator(text);
345 * cout << "----- traverseForward: -----------" << endl;
346 * traverseForward( iterator );
347 * cout << endl << endl << "----- traverseBackward: ----------" << endl;
348 * traverseBackward( iterator );
349 * cout << endl << endl << "----- traverseOut: ---------------" << endl;
350 * traverseOut( iterator, 7 );
351 * cout << endl << endl << "-----" << endl;
352 * }
353 * \endcode
354 * </pre>
355 *
356 * @stable ICU 2.0
357 */
358class U_COMMON_API CharacterIterator : public ForwardCharacterIterator {
359public:
360 /**
361 * Origin enumeration for the move() and move32() functions.
362 * @stable ICU 2.0
363 */
364 enum EOrigin { kStart, kCurrent, kEnd };
365
366 /**
367 * Destructor.
368 * @stable ICU 2.0
369 */
370 virtual ~CharacterIterator();
371
372 /**
373 * Returns a pointer to a new CharacterIterator of the same
374 * concrete class as this one, and referring to the same
375 * character in the same text-storage object as this one. The
376 * caller is responsible for deleting the new clone.
377 * @return a pointer to a new CharacterIterator
378 * @stable ICU 2.0
379 */
380 virtual CharacterIterator* clone(void) const = 0;
381
382 /**
383 * Sets the iterator to refer to the first code unit in its
384 * iteration range, and returns that code unit.
385 * This can be used to begin an iteration with next().
386 * @return the first code unit in its iteration range.
387 * @stable ICU 2.0
388 */
389 virtual char16_t first(void) = 0;
390
391 /**
392 * Sets the iterator to refer to the first code unit in its
393 * iteration range, returns that code unit, and moves the position
394 * to the second code unit. This is an alternative to setToStart()
395 * for forward iteration with nextPostInc().
396 * @return the first code unit in its iteration range.
397 * @stable ICU 2.0
398 */
399 virtual char16_t firstPostInc(void);
400
401 /**
402 * Sets the iterator to refer to the first code point in its
403 * iteration range, and returns that code unit,
404 * This can be used to begin an iteration with next32().
405 * Note that an iteration with next32PostInc(), beginning with,
406 * e.g., setToStart() or firstPostInc(), is more efficient.
407 * @return the first code point in its iteration range.
408 * @stable ICU 2.0
409 */
410 virtual UChar32 first32(void) = 0;
411
412 /**
413 * Sets the iterator to refer to the first code point in its
414 * iteration range, returns that code point, and moves the position
415 * to the second code point. This is an alternative to setToStart()
416 * for forward iteration with next32PostInc().
417 * @return the first code point in its iteration range.
418 * @stable ICU 2.0
419 */
420 virtual UChar32 first32PostInc(void);
421
422 /**
423 * Sets the iterator to refer to the first code unit or code point in its
424 * iteration range. This can be used to begin a forward
425 * iteration with nextPostInc() or next32PostInc().
426 * @return the start position of the iteration range
427 * @stable ICU 2.0
428 */
429 inline int32_t setToStart();
430
431 /**
432 * Sets the iterator to refer to the last code unit in its
433 * iteration range, and returns that code unit.
434 * This can be used to begin an iteration with previous().
435 * @return the last code unit.
436 * @stable ICU 2.0
437 */
438 virtual char16_t last(void) = 0;
439
440 /**
441 * Sets the iterator to refer to the last code point in its
442 * iteration range, and returns that code unit.
443 * This can be used to begin an iteration with previous32().
444 * @return the last code point.
445 * @stable ICU 2.0
446 */
447 virtual UChar32 last32(void) = 0;
448
449 /**
450 * Sets the iterator to the end of its iteration range, just behind
451 * the last code unit or code point. This can be used to begin a backward
452 * iteration with previous() or previous32().
453 * @return the end position of the iteration range
454 * @stable ICU 2.0
455 */
456 inline int32_t setToEnd();
457
458 /**
459 * Sets the iterator to refer to the "position"-th code unit
460 * in the text-storage object the iterator refers to, and
461 * returns that code unit.
462 * @param position the "position"-th code unit in the text-storage object
463 * @return the "position"-th code unit.
464 * @stable ICU 2.0
465 */
466 virtual char16_t setIndex(int32_t position) = 0;
467
468 /**
469 * Sets the iterator to refer to the beginning of the code point
470 * that contains the "position"-th code unit
471 * in the text-storage object the iterator refers to, and
472 * returns that code point.
473 * The current position is adjusted to the beginning of the code point
474 * (its first code unit).
475 * @param position the "position"-th code unit in the text-storage object
476 * @return the "position"-th code point.
477 * @stable ICU 2.0
478 */
479 virtual UChar32 setIndex32(int32_t position) = 0;
480
481 /**
482 * Returns the code unit the iterator currently refers to.
483 * @return the current code unit.
484 * @stable ICU 2.0
485 */
486 virtual char16_t current(void) const = 0;
487
488 /**
489 * Returns the code point the iterator currently refers to.
490 * @return the current code point.
491 * @stable ICU 2.0
492 */
493 virtual UChar32 current32(void) const = 0;
494
495 /**
496 * Advances to the next code unit in the iteration range
497 * (toward endIndex()), and returns that code unit. If there are
498 * no more code units to return, returns DONE.
499 * @return the next code unit.
500 * @stable ICU 2.0
501 */
502 virtual char16_t next(void) = 0;
503
504 /**
505 * Advances to the next code point in the iteration range
506 * (toward endIndex()), and returns that code point. If there are
507 * no more code points to return, returns DONE.
508 * Note that iteration with "pre-increment" semantics is less
509 * efficient than iteration with "post-increment" semantics
510 * that is provided by next32PostInc().
511 * @return the next code point.
512 * @stable ICU 2.0
513 */
514 virtual UChar32 next32(void) = 0;
515
516 /**
517 * Advances to the previous code unit in the iteration range
518 * (toward startIndex()), and returns that code unit. If there are
519 * no more code units to return, returns DONE.
520 * @return the previous code unit.
521 * @stable ICU 2.0
522 */
523 virtual char16_t previous(void) = 0;
524
525 /**
526 * Advances to the previous code point in the iteration range
527 * (toward startIndex()), and returns that code point. If there are
528 * no more code points to return, returns DONE.
529 * @return the previous code point.
530 * @stable ICU 2.0
531 */
532 virtual UChar32 previous32(void) = 0;
533
534 /**
535 * Returns FALSE if there are no more code units or code points
536 * before the current position in the iteration range.
537 * This is used with previous() or previous32() in backward
538 * iteration.
539 * @return FALSE if there are no more code units or code points
540 * before the current position in the iteration range, return TRUE otherwise.
541 * @stable ICU 2.0
542 */
543 virtual UBool hasPrevious() = 0;
544
545 /**
546 * Returns the numeric index in the underlying text-storage
547 * object of the character returned by first(). Since it's
548 * possible to create an iterator that iterates across only
549 * part of a text-storage object, this number isn't
550 * necessarily 0.
551 * @returns the numeric index in the underlying text-storage
552 * object of the character returned by first().
553 * @stable ICU 2.0
554 */
555 inline int32_t startIndex(void) const;
556
557 /**
558 * Returns the numeric index in the underlying text-storage
559 * object of the position immediately BEYOND the character
560 * returned by last().
561 * @return the numeric index in the underlying text-storage
562 * object of the position immediately BEYOND the character
563 * returned by last().
564 * @stable ICU 2.0
565 */
566 inline int32_t endIndex(void) const;
567
568 /**
569 * Returns the numeric index in the underlying text-storage
570 * object of the character the iterator currently refers to
571 * (i.e., the character returned by current()).
572 * @return the numeric index in the text-storage object of
573 * the character the iterator currently refers to
574 * @stable ICU 2.0
575 */
576 inline int32_t getIndex(void) const;
577
578 /**
579 * Returns the length of the entire text in the underlying
580 * text-storage object.
581 * @return the length of the entire text in the text-storage object
582 * @stable ICU 2.0
583 */
584 inline int32_t getLength() const;
585
586 /**
587 * Moves the current position relative to the start or end of the
588 * iteration range, or relative to the current position itself.
589 * The movement is expressed in numbers of code units forward
590 * or backward by specifying a positive or negative delta.
591 * @param delta the position relative to origin. A positive delta means forward;
592 * a negative delta means backward.
593 * @param origin Origin enumeration {kStart, kCurrent, kEnd}
594 * @return the new position
595 * @stable ICU 2.0
596 */
597 virtual int32_t move(int32_t delta, EOrigin origin) = 0;
598
599 /**
600 * Moves the current position relative to the start or end of the
601 * iteration range, or relative to the current position itself.
602 * The movement is expressed in numbers of code points forward
603 * or backward by specifying a positive or negative delta.
604 * @param delta the position relative to origin. A positive delta means forward;
605 * a negative delta means backward.
606 * @param origin Origin enumeration {kStart, kCurrent, kEnd}
607 * @return the new position
608 * @stable ICU 2.0
609 */
610#ifdef move32
611 // One of the system headers right now is sometimes defining a conflicting macro we don't use
612#undef move32
613#endif
614 virtual int32_t move32(int32_t delta, EOrigin origin) = 0;
615
616 /**
617 * Copies the text under iteration into the UnicodeString
618 * referred to by "result".
619 * @param result Receives a copy of the text under iteration.
620 * @stable ICU 2.0
621 */
622 virtual void getText(UnicodeString& result) = 0;
623
624protected:
625 /**
626 * Empty constructor.
627 * @stable ICU 2.0
628 */
629 CharacterIterator();
630
631 /**
632 * Constructor, just setting the length field in this base class.
633 * @stable ICU 2.0
634 */
635 CharacterIterator(int32_t length);
636
637 /**
638 * Constructor, just setting the length and position fields in this base class.
639 * @stable ICU 2.0
640 */
641 CharacterIterator(int32_t length, int32_t position);
642
643 /**
644 * Constructor, just setting the length, start, end, and position fields in this base class.
645 * @stable ICU 2.0
646 */
647 CharacterIterator(int32_t length, int32_t textBegin, int32_t textEnd, int32_t position);
648
649 /**
650 * Copy constructor.
651 *
652 * @param that The CharacterIterator to be copied
653 * @stable ICU 2.0
654 */
655 CharacterIterator(const CharacterIterator &that);
656
657 /**
658 * Assignment operator. Sets this CharacterIterator to have the same behavior,
659 * as the one passed in.
660 * @param that The CharacterIterator passed in.
661 * @return the newly set CharacterIterator.
662 * @stable ICU 2.0
663 */
664 CharacterIterator &operator=(const CharacterIterator &that);
665
666 /**
667 * Base class text length field.
668 * Necessary this for correct getText() and hashCode().
669 * @stable ICU 2.0
670 */
671 int32_t textLength;
672
673 /**
674 * Base class field for the current position.
675 * @stable ICU 2.0
676 */
677 int32_t pos;
678
679 /**
680 * Base class field for the start of the iteration range.
681 * @stable ICU 2.0
682 */
683 int32_t begin;
684
685 /**
686 * Base class field for the end of the iteration range.
687 * @stable ICU 2.0
688 */
689 int32_t end;
690};
691
692inline UBool
693ForwardCharacterIterator::operator!=(const ForwardCharacterIterator& that) const {
694 return !operator==(that);
695}
696
697inline int32_t
698CharacterIterator::setToStart() {
699 return move(0, kStart);
700}
701
702inline int32_t
703CharacterIterator::setToEnd() {
704 return move(0, kEnd);
705}
706
707inline int32_t
708CharacterIterator::startIndex(void) const {
709 return begin;
710}
711
712inline int32_t
713CharacterIterator::endIndex(void) const {
714 return end;
715}
716
717inline int32_t
718CharacterIterator::getIndex(void) const {
719 return pos;
720}
721
722inline int32_t
723CharacterIterator::getLength(void) const {
724 return textLength;
725}
726
727U_NAMESPACE_END
728#endif
729