Skip to content

Commit f4068db

Browse files
maskri17 authored and copybara-github committed
Adding find and substring functions into byte_string
PiperOrigin-RevId: 809118592
1 parent 2abc4a6 commit f4068db

File tree

3 files changed

+275
-1
lines changed

3 files changed

+275
-1
lines changed

common/internal/byte_string.cc

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,106 @@ bool ByteString::EndsWith(const absl::Cord& rhs) const {
286286
[&rhs](const absl::Cord& lhs) -> bool { return lhs.EndsWith(rhs); }));
287287
}
288288

289+
// Returns the byte offset of the first occurrence of `needle` located at or
// after byte offset `pos`, or `absl::nullopt` when there is no such occurrence.
// `pos` must not exceed `size()`; this is verified with a debug-only check.
absl::optional<size_t> ByteString::Find(absl::string_view needle,
                                        size_t pos) const {
  ABSL_DCHECK_LE(pos, size());

  return Visit(absl::Overload(
      [&needle, pos](absl::string_view haystack) -> absl::optional<size_t> {
        const absl::string_view::size_type index = haystack.find(needle, pos);
        if (index != absl::string_view::npos) {
          return index;
        }
        return absl::nullopt;
      },
      [&needle, pos](const absl::Cord& haystack) -> absl::optional<size_t> {
        // Search only the part of the cord that starts at `pos`, then
        // translate the match back into an offset within the whole cord.
        absl::Cord tail = haystack.Subcord(pos, haystack.size() - pos);
        absl::Cord::CharIterator match = tail.Find(needle);
        if (match == tail.char_end()) {
          return absl::nullopt;
        }
        const size_t offset_in_tail =
            static_cast<size_t>(absl::Cord::Distance(tail.char_begin(), match));
        return pos + offset_in_tail;
      }));
}
311+
312+
// Returns the byte offset of the first occurrence of the cord `needle` located
// at or after byte offset `pos`, or `absl::nullopt` when there is none.
// `pos` must not exceed `size()`; this is verified with a debug-only check.
absl::optional<size_t> ByteString::Find(const absl::Cord& needle,
                                        size_t pos) const {
  ABSL_DCHECK_LE(pos, size());

  return Visit(absl::Overload(
      [&needle, pos](absl::string_view haystack) -> absl::optional<size_t> {
        // A needle that can be viewed as one contiguous buffer is handed
        // straight to `string_view::find`.
        if (auto flat = needle.TryFlat(); flat) {
          const absl::string_view::size_type index = haystack.find(*flat, pos);
          if (index != absl::string_view::npos) {
            return index;
          }
          return absl::nullopt;
        }

        // The needle is fragmented, so fall back to a manual scan.
        const size_t needle_len = needle.size();
        if (pos + needle_len > haystack.size()) {
          return absl::nullopt;
        }
        if (ABSL_PREDICT_FALSE(needle_len == 0)) {
          return pos;
        }

        // Locate candidates using the needle's first chunk, then compare the
        // remainder of the needle at each candidate. Should the first chunk
        // be empty, `find` reports the position it was asked to start from,
        // so every position is examined in turn.
        absl::string_view head = *needle.Chunks().begin();
        absl::Cord remainder =
            needle.Subcord(head.size(), needle_len - head.size());
        // Largest start offset at which the whole needle still fits.
        const size_t last_start = haystack.size() - needle_len;
        for (size_t from = pos;;) {
          const size_t candidate = haystack.find(head, from);
          if (candidate == absl::string_view::npos || candidate > last_start) {
            return absl::nullopt;
          }
          if (haystack.substr(candidate + head.size(), remainder.size()) ==
              remainder) {
            return candidate;
          }
          from = candidate + 1;
        }
      },
      [&needle, pos](const absl::Cord& haystack) -> absl::optional<size_t> {
        // Search only the part of the cord that starts at `pos`, then
        // translate the match back into an offset within the whole cord.
        absl::Cord tail = haystack.Subcord(pos, haystack.size() - pos);
        absl::Cord::CharIterator match = tail.Find(needle);
        if (match == tail.char_end()) {
          return absl::nullopt;
        }
        const size_t offset_in_tail =
            static_cast<size_t>(absl::Cord::Distance(tail.char_begin(), match));
        return pos + offset_in_tail;
      }));
}
363+
364+
// Returns the bytes in the half-open range [`pos`, `npos`) as a new
// `ByteString`. Note that `npos` is the end offset of the range, not a length.
// Requires `pos <= npos <= size()`; both bounds are verified with debug-only
// checks.
ByteString ByteString::Substring(size_t pos, size_t npos) const {
  ABSL_DCHECK_LE(npos, size());
  ABSL_DCHECK_LE(pos, npos);

  switch (GetKind()) {
    case ByteStringKind::kLarge:
      // Cord-backed contents: take the matching sub-cord.
      return ByteString(GetLarge().Subcord(pos, npos - pos));
    case ByteStringKind::kMedium: {
      // Start from a copy of this object and narrow its view of the data.
      ByteString narrowed(*this);
      narrowed.rep_.medium.size = npos - pos;
      narrowed.rep_.medium.data += pos;
      return narrowed;
    }
    case ByteStringKind::kSmall: {
      // Inline contents: copy the requested bytes into a fresh small
      // representation that carries over this object's arena.
      ByteString copy;
      copy.rep_.header.kind = ByteStringKind::kSmall;
      copy.rep_.small.arena = GetSmallArena();
      copy.rep_.small.size = npos - pos;
      std::memcpy(copy.rep_.small.data, rep_.small.data + pos,
                  copy.rep_.small.size);
      return copy;
    }
  }
}
388+
289389
void ByteString::RemovePrefix(size_t n) {
290390
ABSL_DCHECK_LE(n, size());
291391
if (n == 0) {

common/internal/byte_string.h

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ union CEL_COMMON_INTERNAL_BYTE_STRING_TRIVIAL_ABI ByteStringRep final {
159159
absl::string_view LegacyByteString(const ByteString& string, bool stable,
160160
google::protobuf::Arena* absl_nonnull arena);
161161

162-
// `ByteString` is an vocabulary type capable of representing copy-on-write
162+
// `ByteString` is a vocabulary type capable of representing copy-on-write
163163
// strings efficiently for arenas and reference counting. The contents of the
164164
// byte string are owned by an arena or managed by a reference count. All byte
165165
// strings have an associated allocator specified at construction, once the byte
@@ -275,6 +275,24 @@ ByteString final {
275275
bool EndsWith(const absl::Cord& rhs) const;
276276
bool EndsWith(const ByteString& rhs) const;
277277

278+
// Finds the first occurrence of `needle` in this object, starting at byte
279+
// position `pos`. Returns `absl::nullopt` if `needle` is not found.
280+
// Note: Positions are byte-based, not code point based as in
281+
// `cel::StringValue`.
282+
absl::optional<size_t> Find(absl::string_view needle, size_t pos = 0) const;
283+
absl::optional<size_t> Find(const absl::Cord& needle, size_t pos = 0) const;
284+
absl::optional<size_t> Find(const ByteString& needle, size_t pos = 0) const;
285+
286+
// Returns a new `ByteString` that is a substring of this object, starting at
287+
// byte position `pos` and ending just before byte position `npos`.
288+
// Note: Positions are byte-based, not code point based as in
289+
// `cel::StringValue`.
290+
ByteString Substring(size_t pos, size_t npos) const;
291+
// Returns the suffix of this object that begins at byte position `pos`.
// `pos` must not exceed `size()`; this is verified with a debug-only check.
ByteString Substring(size_t pos) const {
  ABSL_DCHECK_LE(pos, size());
  const size_t end = size();
  return Substring(pos, end);
}
295+
278296
void RemovePrefix(size_t n);
279297

280298
void RemoveSuffix(size_t n);
@@ -501,6 +519,17 @@ inline bool ByteString::EndsWith(const ByteString& rhs) const {
501519
[this](const absl::Cord& rhs) -> bool { return EndsWith(rhs); }));
502520
}
503521

522+
// Visits `needle` and forwards to the `Find` overload that matches the type
// the visitor receives (`absl::string_view` or `absl::Cord`).
inline absl::optional<size_t> ByteString::Find(const ByteString& needle,
                                               size_t pos) const {
  auto find_string_view =
      [this, pos](absl::string_view flat) -> absl::optional<size_t> {
    return Find(flat, pos);
  };
  auto find_cord = [this,
                    pos](const absl::Cord& cord) -> absl::optional<size_t> {
    return Find(cord, pos);
  };
  return needle.Visit(absl::Overload(find_string_view, find_cord));
}
532+
504533
inline bool operator==(const ByteString& lhs, const ByteString& rhs) {
505534
return lhs.Equals(rhs);
506535
}

common/internal/byte_string_test.cc

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -747,6 +747,151 @@ TEST_P(ByteStringTest, EndsWith) {
747747
GetMediumOrLargeCord().size() - kSmallByteStringCapacity)));
748748
}
749749

750+
// Exercises `Find` with `absl::string_view` and `absl::Cord` needles against a
// haystack built from `GetMediumStringView()`.
TEST_P(ByteStringTest, Find) {
  ByteString haystack = ByteString(GetAllocator(), GetMediumStringView());
  // Offset at which the trailing phrase occurs in the reference string view.
  const auto tail_phrase_pos =
      GetMediumStringView().find("small string optimization!");

  // string_view needles.
  EXPECT_THAT(haystack.Find("A string"), Optional(0));
  EXPECT_THAT(haystack.Find("small string optimization!"),
              Optional(tail_phrase_pos));
  EXPECT_THAT(haystack.Find("not found"), Eq(absl::nullopt));
  EXPECT_THAT(haystack.Find(""), Optional(0));
  EXPECT_THAT(haystack.Find("", 3), Optional(3));
  EXPECT_THAT(haystack.Find("A string", 1), Eq(absl::nullopt));

  // Cord needles.
  EXPECT_THAT(haystack.Find(absl::Cord("A string")), Optional(0));
  EXPECT_THAT(haystack.Find(absl::Cord("small string optimization!")),
              Optional(tail_phrase_pos));
  EXPECT_THAT(
      haystack.Find(absl::MakeFragmentedCord(
          {"A string", " that is too large for the small string optimization!",
           " extra"})),
      Eq(absl::nullopt));
  EXPECT_THAT(haystack.Find(GetMediumOrLargeFragmentedCord()), Optional(0));
  EXPECT_THAT(haystack.Find(absl::Cord("not found")), Eq(absl::nullopt));
  EXPECT_THAT(haystack.Find(absl::Cord("")), Optional(0));
  EXPECT_THAT(haystack.Find(absl::Cord(""), 3), Optional(3));
}
778+
779+
// Covers boundary conditions of `Find`: empty haystacks and needles, matches
// at the very end, `pos` equal to the size, and fragmented cord needles.
TEST_P(ByteStringTest, FindEdgeCases) {
  ByteString empty_haystack(GetAllocator(), "");
  EXPECT_THAT(empty_haystack.Find("a"), Eq(absl::nullopt));
  EXPECT_THAT(empty_haystack.Find(""), Optional(0));

  ByteString cord_haystack = ByteString(GetAllocator(), GetMediumOrLargeCord());
  EXPECT_THAT(cord_haystack.Find("not found"), Eq(absl::nullopt));

  ByteString haystack = ByteString(GetAllocator(), GetMediumStringView());

  // A needle longer than the haystack can never match.
  EXPECT_THAT(haystack.Find(std::string(haystack.size() + 1, 'a')),
              Eq(absl::nullopt));

  // A needle that sits at the very end of the haystack.
  absl::string_view suffix = "optimization!";
  EXPECT_THAT(haystack.Find(suffix), Optional(haystack.size() - suffix.size()));

  // `pos` equal to the haystack size.
  EXPECT_THAT(haystack.Find("a", haystack.size()), Eq(absl::nullopt));
  EXPECT_THAT(haystack.Find("", haystack.size()), Optional(haystack.size()));

  // Search in a cord-backed ByteString with pos > 0.
  EXPECT_THAT(cord_haystack.Find("string", 1),
              Optional(GetMediumStringView().find("string", 1)));

  // Needle at the end of a cord-backed ByteString.
  EXPECT_THAT(cord_haystack.Find(suffix),
              Optional(cord_haystack.size() - suffix.size()));
  EXPECT_THAT(cord_haystack.Find(absl::Cord(suffix)),
              Optional(cord_haystack.size() - suffix.size()));

  // Fragmented needle with empty first chunk.
  absl::Cord needle_with_empty_append;
  needle_with_empty_append.Append("");
  needle_with_empty_append.Append("A string");
  EXPECT_THAT(haystack.Find(needle_with_empty_append), Optional(0));

  // Fragmented cord needle against a string_view-backed haystack where the
  // first chunk matches several times before the full needle does.
  ByteString repeating_haystack(GetAllocator(), "abababac");
  absl::Cord repeating_needle = absl::MakeFragmentedCord({"aba", "c"});
  EXPECT_THAT(repeating_haystack.Find(repeating_needle), Optional(4));

  // Fragmented cord needle whose first chunk is found, but too close to the
  // end for the rest of the needle to fit.
  ByteString short_haystack(GetAllocator(), "abcdefg");
  absl::Cord overlong_needle = absl::MakeFragmentedCord({"ef", "gh"});
  EXPECT_THAT(short_haystack.Find(overlong_needle), Eq(absl::nullopt));

  // Search with a fragmented empty cord.
  absl::Cord empty_fragmented_needle = absl::MakeFragmentedCord({"", ""});
  EXPECT_THAT(haystack.Find(empty_fragmented_needle), Optional(0));
  EXPECT_THAT(haystack.Find(empty_fragmented_needle, 3), Optional(3));

  // Search for suffix in a fragmented cord.
  ByteString fragmented_haystack(GetAllocator(),
                                 GetMediumOrLargeFragmentedCord());
  EXPECT_THAT(fragmented_haystack.Find(suffix),
              Optional(fragmented_haystack.size() - suffix.size()));
  EXPECT_THAT(fragmented_haystack.Find(absl::Cord(suffix)),
              Optional(fragmented_haystack.size() - suffix.size()));
}
844+
845+
#ifndef NDEBUG
846+
// A start position past the end of the haystack is expected to terminate the
// process (this test is only compiled when NDEBUG is not defined).
TEST_P(ByteStringTest, FindOutOfBounds) {
  ByteString haystack = ByteString(GetAllocator(), "test");
  EXPECT_DEATH(haystack.Find("t", 5), _);
}
850+
#endif
851+
852+
// Checks `Substring` against the equivalent `substr` / `Subcord` results. The
// second argument of `Substring` is an end position, so `Substring(1, 5)`
// corresponds to `substr(1, 4)`.
TEST_P(ByteStringTest, Substring) {
  // Built from the small string view.
  ByteString small_source = ByteString(GetAllocator(), GetSmallStringView());
  EXPECT_EQ(small_source.Substring(1, 5), GetSmallStringView().substr(1, 4));
  EXPECT_EQ(small_source.Substring(0, small_source.size()),
            GetSmallStringView());
  EXPECT_EQ(small_source.Substring(1, 1), "");

  // Built from the medium string view.
  ByteString medium_source = ByteString(GetAllocator(), GetMediumStringView());
  EXPECT_EQ(medium_source.Substring(2, 12),
            GetMediumStringView().substr(2, 10));
  EXPECT_EQ(medium_source.Substring(0, medium_source.size()),
            GetMediumStringView());

  // Built from the medium-or-large cord.
  ByteString large_source = ByteString(GetAllocator(), GetMediumOrLargeCord());
  EXPECT_EQ(large_source.Substring(3, 15),
            GetMediumOrLargeCord().Subcord(3, 12));
  EXPECT_EQ(large_source.Substring(0, large_source.size()),
            GetMediumOrLargeCord());

  // Single-argument overload: everything from `pos` to the end.
  ByteString tacocat = ByteString(GetAllocator(), "tacocat");
  EXPECT_EQ(tacocat.Substring(4), "cat");
}
879+
880+
// Empty ranges at either end of the string yield an empty result.
TEST_P(ByteStringTest, SubstringEdgeCases) {
  ByteString source = ByteString(GetAllocator(), GetSmallStringView());
  EXPECT_EQ(source.Substring(source.size(), source.size()), "");
  EXPECT_EQ(source.Substring(0, 0), "");
}
885+
886+
#ifndef NDEBUG
887+
// Ranges that run past the end, or whose start lies after their end, are
// expected to terminate the process (this test is only compiled when NDEBUG
// is not defined).
TEST_P(ByteStringTest, SubstringOutOfBounds) {
  ByteString source = ByteString(GetAllocator(), "test");
  EXPECT_DEATH(static_cast<void>(source.Substring(5, 5)), _);
  EXPECT_DEATH(static_cast<void>(source.Substring(0, 5)), _);
  EXPECT_DEATH(static_cast<void>(source.Substring(3, 2)), _);
}
893+
#endif
894+
750895
TEST_P(ByteStringTest, RemovePrefixSmall) {
751896
ByteString byte_string = ByteString(GetAllocator(), GetSmallStringView());
752897
byte_string.RemovePrefix(1);

0 commit comments

Comments
 (0)