Skip to content

Commit

Permalink
WIP: make safe iter work with single-pass UnitIter
Browse files Browse the repository at this point in the history
  • Loading branch information
markusicu committed Jan 31, 2025
1 parent 1a47b1f commit c5a250d
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 22 deletions.
108 changes: 86 additions & 22 deletions icu4c/source/common/unicode/utfiter.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ class CodeUnits {

// Reports the stored `ok` flag for this code unit sequence.
bool wellFormed() const {
    return ok;
}

// Returns the unit iterator `p` stored in this object.
// TODO: disable for single-pass input iterator
UnitIter data() const {
    return p;
}

// Returns the stored `len` value, widened to int32_t.
int32_t length() const {
    return len;
}
Expand Down Expand Up @@ -126,10 +127,12 @@ class UnsafeCodeUnits {

// Returns the stored code point `c` as a UChar32.
UChar32 codePoint() const {
    return c;
}

// Returns the stored pointer `p` into the code units.
// TODO: disable for single-pass input iterator
const Unit *data() const {
    return p;
}

// Returns the stored `len` value, widened to int32_t.
int32_t length() const {
    return len;
}

// TODO: disable unless pointer
std::basic_string_view<Unit> stringView() const {
return std::basic_string_view<Unit>(p, len);
}
Expand Down Expand Up @@ -174,11 +177,24 @@ class U16IteratorBase {
// Inequality is the negation of operator==().
bool operator!=(const U16IteratorBase &other) const {
    const bool same = this->operator==(other);
    return !same;
}

// @internal
void dec() {
void inc() {
// TODO: assert current != limit -- more precisely: start <= current < limit
// Very similar to U16_FWD_1().
CP32 c = *current;
++current;
if (U16_IS_LEAD(c) && current != limit && U16_IS_TRAIL(*current) {
++current;
}
}

// TODO: still needed?
// @internal
void dec() {
// TODO: assert p != start -- more precisely: start < p <= limit
// Very similar to U16_BACK_1().
if (U16_IS_TRAIL(*(--current)) && current != start && U16_IS_LEAD(*(current - 1))) {
--current;
UnitIter p1;
if (U16_IS_TRAIL(*--current) && current != start && (p1 = current, U16_IS_LEAD(*--p1))) {
current = p1;
}
}

Expand All @@ -187,7 +203,8 @@ class U16IteratorBase {
// TODO: assert p != limit -- more precisely: start <= p < limit
// Very similar to U16_NEXT_OR_FFFD().
UnitIter p0 = p;
CP32 c = *p++;
CP32 c = *p;
++p;
if (!U16_IS_SURROGATE(c)) {
return {c, 1, true, p0};
} else {
Expand All @@ -204,15 +221,15 @@ class U16IteratorBase {

// @internal
CodeUnits<UnitIter, CP32> decAndRead(UnitIter &p) const {
// TODO: assert p != limit -- more precisely: start <= p < limit
// TODO: assert p != start -- more precisely: start < p <= limit
// Very similar to U16_PREV_OR_FFFD().
CP32 c = *--p;
if (!U16_IS_SURROGATE(c)) {
return {c, 1, true, p};
} else {
UnitIter p1;
uint16_t c2;
if (U16_IS_SURROGATE_TRAIL(c) && p != start && (p1 = p--, U16_IS_LEAD(c2 = *p1))) {
if (U16_IS_SURROGATE_TRAIL(c) && p != start && (p1 = p, U16_IS_LEAD(c2 = *--p1))) {
p = p1;
c = U16_GET_SUPPLEMENTARY(c2, c);
return {c, 2, true, p};
Expand Down Expand Up @@ -268,36 +285,67 @@ class U16Iterator : private U16IteratorBase<UnitIter, CP32, behavior> {
// Inequality is the negation of the base class operator==().
bool operator!=(const U16Iterator &other) const {
    const bool equal = Super::operator==(other);
    return !equal;
}

CodeUnits<UnitIter, CP32> operator*() const {
// Call the same function in both operator*() and operator++() so that an
// optimizing compiler can easily eliminate redundant work when alternating between the two.
UnitIter p = Super::current;
return Super::readAndInc(p);
if (state == 0) {
units = Super::readAndInc(Super::current);
state = units.len;
}
return units;
}

U16Iterator &operator++() { // pre-increment
// Call the same function in both operator*() and operator++() so that an
// optimizing compiler can easily eliminate redundant work when alternating between the two.
Super::readAndInc(Super::current);
if (state > 0) {
// operator*() called readAndInc() so current is already ahead.
state = 0;
} else if (state == 0) {
Super::inc();
} else /* state < 0 */ {
// operator--() called decAndRead() so we know how far to skip.
do {
++Super::current;
} while (++state < 0);
}
return *this;
}

// TODO: disable for single-pass input iterator
U16Iterator operator++(int) { // post-increment
// Call the same function in both operator*() and operator++() so that an
// optimizing compiler can easily eliminate redundant work when alternating between the two.
U16Iterator result(*this);
Super::readAndInc(Super::current);
operator++();
return result;
}

U16Iterator &operator--() { // pre-decrement
return Super::dec();
if (state > 0) {
// operator*() called readAndInc() so current is ahead.
// Return to the logical position before reading backward from there.
do {
--Super::current;
} while (--state > 0);
}
units = Super::decAndRead(Super::current);
state = -units.len;
return *this;
}

U16Iterator operator--(int) { // post-decrement
U16Iterator result(*this);
Super::dec();
// TODO: just dec()?
operator--();
return result;
}

private:
// Keep state so that we call readAndInc() only once for both operator*() and ++
// so that we can use a single-pass input iterator for UnitIter.
CodeUnits<UnitIter, CP32> units;
// >0: units = readAndInc(), current = units limit, state = units.len
// which means that current is ahead of its logical position
// 0: initial state
// <0: units = decAndRead(), current = units start, state = -units.len
// TODO: could also set state = -1 & use units.len when needed, but less consistent
// TODO: could insert state into hidden CodeUnits field to avoid padding,
// but mostly irrelevant when inlined?
int8_t state = 0;
};

/**
Expand Down Expand Up @@ -429,7 +477,7 @@ class U16UnsafeIteratorBase {
// @internal
// Moves `current` back over one code point without bounds or pairing checks:
// one unit, plus one more if the unit stepped over is a trail surrogate.
void dec() {
    // Very similar to U16_BACK_1_UNSAFE().
    if (U16_IS_TRAIL(*--current)) {
        --current;
    }
}
Expand All @@ -438,11 +486,13 @@ class U16UnsafeIteratorBase {
// Reads one code point starting at p and moves p past it.
// Returns {code point, number of units (1 or 2), pointer to the first unit}.
// A lead surrogate is combined with the following unit without checking that unit.
// Each unit is dereferenced once, with the increment as a separate step.
UnsafeCodeUnits<Unit16, CP32> readAndInc(const Unit16 *&p) const {
    // Very similar to U16_NEXT_UNSAFE().
    const Unit16 *p0 = p;
    CP32 c = *p;
    ++p;
    if (!U16_IS_LEAD(c)) {
        return {c, 1, p0};
    } else {
        c = U16_GET_SUPPLEMENTARY(c, *p);
        ++p;
        return {c, 2, p0};
    }
}
Expand All @@ -463,6 +513,7 @@ class U16UnsafeIteratorBase {
const Unit16 *current;
};

// TODO: make this one work single-pass as well
/**
* Non-validating bidirectional iterator over the code points in a UTF-16 string.
* The string must be well-formed.
Expand Down Expand Up @@ -500,6 +551,7 @@ class U16UnsafeIterator : private U16UnsafeIteratorBase<Unit16, CP32> {
return *this;
}

// TODO: disable for single-pass input iterator
U16UnsafeIterator operator++(int) { // post-increment
// Call the same function in both operator*() and operator++() so that an
// optimizing compiler can easily eliminate redundant work when alternating between the two.
Expand All @@ -509,7 +561,8 @@ class U16UnsafeIterator : private U16UnsafeIteratorBase<Unit16, CP32> {
}

// Moves back by one code point and returns this iterator.
// Super::dec() returns void, so it is called as a statement before returning *this.
U16UnsafeIterator &operator--() { // pre-decrement
    Super::dec();
    return *this;
}

U16UnsafeIterator operator--(int) { // post-decrement
Expand Down Expand Up @@ -637,6 +690,17 @@ int32_t loopIterPlusPlus(std::u16string_view s) {
return sum;
}

int32_t backwardLoop(std::u16string_view s) {
header::U16StringCodePoints<char16_t, UChar32, U_BEHAVIOR_NEGATIVE> range(s);
int32_t sum = 0;
auto start = range.begin();
auto iter = range.end();
while (start != iter) {
sum += (*--iter).codePoint();
}
return sum;
}

int32_t reverseLoop(std::u16string_view s) {
header::U16StringCodePoints<char16_t, UChar32, U_BEHAVIOR_NEGATIVE> range(s);
int32_t sum = 0;
Expand Down
4 changes: 4 additions & 0 deletions icu4c/source/test/intltest/utfitertest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -212,3 +212,7 @@ void U16IteratorTest::testFwdIter() {
assertTrue("iter[4] * data()[1]", *data == u"🚴"[1]);
assertTrue("iter == endIter", iter == rangeLimit);
}

// TODO: test with single-pass input iterator

// TODO: test back & forth with bidirectional iterator (not random access, not contiguous)

0 comments on commit c5a250d

Please sign in to comment.