Skip to content

Commit aa1cd1a

Browse files
jcking authored and copybara-github committed
Refactor UTF8 encoding/decoding functions
PiperOrigin-RevId: 805528572
1 parent f0e495e commit aa1cd1a

File tree

4 files changed

+114
-47
lines changed

4 files changed

+114
-47
lines changed

internal/BUILD

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -344,6 +344,7 @@ cc_library(
344344
deps = [
345345
":unicode",
346346
"@com_google_absl//absl/base:core_headers",
347+
"@com_google_absl//absl/base:nullability",
347348
"@com_google_absl//absl/log:absl_check",
348349
"@com_google_absl//absl/strings",
349350
"@com_google_absl//absl/strings:cord",

internal/utf8.cc

Lines changed: 87 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -20,8 +20,8 @@
2020
#include <string>
2121
#include <utility>
2222

23-
#include "absl/base/attributes.h"
2423
#include "absl/base/macros.h"
24+
#include "absl/base/nullability.h"
2525
#include "absl/base/optimization.h"
2626
#include "absl/log/absl_check.h"
2727
#include "absl/strings/cord.h"
@@ -355,77 +355,109 @@ std::pair<size_t, bool> Utf8Validate(const absl::Cord& str) {
355355

356356
namespace {
357357

358-
std::pair<char32_t, size_t> Utf8DecodeImpl(uint8_t b, uint8_t leading,
359-
size_t size, absl::string_view str) {
358+
size_t Utf8DecodeImpl(uint8_t b, uint8_t leading, size_t size,
359+
absl::string_view str,
360+
char32_t* absl_nullable code_point) {
360361
const auto& accept = kAccept[leading >> 4];
361362
const auto b1 = static_cast<uint8_t>(str.front());
362363
if (ABSL_PREDICT_FALSE(b1 < accept.first || b1 > accept.second)) {
363-
return {kUnicodeReplacementCharacter, 1};
364+
if (code_point != nullptr) {
365+
*code_point = kUnicodeReplacementCharacter;
366+
}
367+
return 1;
364368
}
365369
if (size <= 1) {
366-
return {(static_cast<char32_t>(b & kMask2) << 6) |
367-
static_cast<char32_t>(b1 & kMaskX),
368-
2};
370+
if (code_point != nullptr) {
371+
*code_point = (static_cast<char32_t>(b & kMask2) << 6) |
372+
static_cast<char32_t>(b1 & kMaskX);
373+
}
374+
return 2;
369375
}
370376
str.remove_prefix(1);
371377
const auto b2 = static_cast<uint8_t>(str.front());
372378
if (ABSL_PREDICT_FALSE(b2 < kLow || b2 > kHigh)) {
373-
return {kUnicodeReplacementCharacter, 1};
379+
if (code_point != nullptr) {
380+
*code_point = kUnicodeReplacementCharacter;
381+
}
382+
return 1;
374383
}
375384
if (size <= 2) {
376-
return {(static_cast<char32_t>(b & kMask3) << 12) |
377-
(static_cast<char32_t>(b1 & kMaskX) << 6) |
378-
static_cast<char32_t>(b2 & kMaskX),
379-
3};
385+
if (code_point != nullptr) {
386+
*code_point = (static_cast<char32_t>(b & kMask3) << 12) |
387+
(static_cast<char32_t>(b1 & kMaskX) << 6) |
388+
static_cast<char32_t>(b2 & kMaskX);
389+
}
390+
return 3;
380391
}
381392
str.remove_prefix(1);
382393
const auto b3 = static_cast<uint8_t>(str.front());
383394
if (ABSL_PREDICT_FALSE(b3 < kLow || b3 > kHigh)) {
384-
return {kUnicodeReplacementCharacter, 1};
395+
if (code_point != nullptr) {
396+
*code_point = kUnicodeReplacementCharacter;
397+
}
398+
return 1;
385399
}
386-
return {(static_cast<char32_t>(b & kMask4) << 18) |
387-
(static_cast<char32_t>(b1 & kMaskX) << 12) |
388-
(static_cast<char32_t>(b2 & kMaskX) << 6) |
389-
static_cast<char32_t>(b3 & kMaskX),
390-
4};
400+
if (code_point != nullptr) {
401+
*code_point = (static_cast<char32_t>(b & kMask4) << 18) |
402+
(static_cast<char32_t>(b1 & kMaskX) << 12) |
403+
(static_cast<char32_t>(b2 & kMaskX) << 6) |
404+
static_cast<char32_t>(b3 & kMaskX);
405+
}
406+
return 4;
391407
}
392408

393409
} // namespace
394410

395-
std::pair<char32_t, size_t> Utf8Decode(absl::string_view str) {
411+
size_t Utf8Decode(absl::string_view str, char32_t* absl_nullable code_point) {
396412
ABSL_DCHECK(!str.empty());
397413
const auto b = static_cast<uint8_t>(str.front());
398414
if (b < kUtf8RuneSelf) {
399-
return {static_cast<char32_t>(b), 1};
415+
if (code_point != nullptr) {
416+
*code_point = static_cast<char32_t>(b);
417+
}
418+
return 1;
400419
}
401420
const auto leading = kLeading[b];
402421
if (ABSL_PREDICT_FALSE(leading == kXX)) {
403-
return {kUnicodeReplacementCharacter, 1};
422+
if (code_point != nullptr) {
423+
*code_point = kUnicodeReplacementCharacter;
424+
}
425+
return 1;
404426
}
405427
auto size = static_cast<size_t>(leading & 7) - 1;
406428
str.remove_prefix(1);
407429
if (ABSL_PREDICT_FALSE(size > str.size())) {
408-
return {kUnicodeReplacementCharacter, 1};
430+
if (code_point != nullptr) {
431+
*code_point = kUnicodeReplacementCharacter;
432+
}
433+
return 1;
409434
}
410-
return Utf8DecodeImpl(b, leading, size, str);
435+
return Utf8DecodeImpl(b, leading, size, str, code_point);
411436
}
412437

413-
std::pair<char32_t, size_t> Utf8Decode(const absl::Cord::CharIterator& it) {
438+
size_t Utf8Decode(const absl::Cord::CharIterator& it,
439+
char32_t* absl_nullable code_point) {
414440
absl::string_view str = absl::Cord::ChunkRemaining(it);
415441
ABSL_DCHECK(!str.empty());
416442
const auto b = static_cast<uint8_t>(str.front());
417443
if (b < kUtf8RuneSelf) {
418-
return {static_cast<char32_t>(b), 1};
444+
if (code_point != nullptr) {
445+
*code_point = static_cast<char32_t>(b);
446+
}
447+
return 1;
419448
}
420449
const auto leading = kLeading[b];
421450
if (ABSL_PREDICT_FALSE(leading == kXX)) {
422-
return {kUnicodeReplacementCharacter, 1};
451+
if (code_point != nullptr) {
452+
*code_point = kUnicodeReplacementCharacter;
453+
}
454+
return 1;
423455
}
424456
auto size = static_cast<size_t>(leading & 7) - 1;
425457
str.remove_prefix(1);
426458
if (ABSL_PREDICT_TRUE(size <= str.size())) {
427459
// Fast path.
428-
return Utf8DecodeImpl(b, leading, size, str);
460+
return Utf8DecodeImpl(b, leading, size, str, code_point);
429461
}
430462
absl::Cord::CharIterator current = it;
431463
absl::Cord::Advance(&current, 1);
@@ -434,49 +466,60 @@ std::pair<char32_t, size_t> Utf8Decode(const absl::Cord::CharIterator& it) {
434466
while (buffer_len < size) {
435467
str = absl::Cord::ChunkRemaining(current);
436468
if (ABSL_PREDICT_FALSE(str.empty())) {
437-
return {kUnicodeReplacementCharacter, 1};
469+
if (code_point != nullptr) {
470+
*code_point = kUnicodeReplacementCharacter;
471+
}
472+
return 1;
438473
}
439474
size_t to_copy = std::min(size_t{3} - buffer_len, str.size());
440475
std::memcpy(buffer + buffer_len, str.data(), to_copy);
441476
buffer_len += to_copy;
442477
absl::Cord::Advance(&current, to_copy);
443478
}
444-
return Utf8DecodeImpl(b, leading, size,
445-
absl::string_view(buffer, buffer_len));
479+
return Utf8DecodeImpl(b, leading, size, absl::string_view(buffer, buffer_len),
480+
code_point);
446481
}
447482

448-
size_t Utf8Encode(std::string& buffer, char32_t code_point) {
483+
size_t Utf8Encode(char32_t code_point, std::string* absl_nonnull buffer) {
484+
ABSL_DCHECK(buffer != nullptr);
485+
486+
char storage[4];
487+
size_t storage_len = Utf8Encode(code_point, storage);
488+
buffer->append(storage, storage_len);
489+
return storage_len;
490+
}
491+
492+
size_t Utf8Encode(char32_t code_point, char* absl_nonnull buffer) {
493+
ABSL_DCHECK(buffer != nullptr);
494+
449495
if (ABSL_PREDICT_FALSE(!UnicodeIsValid(code_point))) {
450496
code_point = kUnicodeReplacementCharacter;
451497
}
452-
char storage[4];
453498
size_t storage_len = 0;
454499
if (code_point <= 0x7f) {
455-
storage[storage_len++] =
456-
static_cast<char>(static_cast<uint8_t>(code_point));
500+
buffer[storage_len++] = static_cast<char>(static_cast<uint8_t>(code_point));
457501
} else if (code_point <= 0x7ff) {
458-
storage[storage_len++] =
502+
buffer[storage_len++] =
459503
static_cast<char>(kT2 | static_cast<uint8_t>(code_point >> 6));
460-
storage[storage_len++] =
504+
buffer[storage_len++] =
461505
static_cast<char>(kTX | (static_cast<uint8_t>(code_point) & kMaskX));
462506
} else if (code_point <= 0xffff) {
463-
storage[storage_len++] =
507+
buffer[storage_len++] =
464508
static_cast<char>(kT3 | static_cast<uint8_t>(code_point >> 12));
465-
storage[storage_len++] = static_cast<char>(
509+
buffer[storage_len++] = static_cast<char>(
466510
kTX | (static_cast<uint8_t>(code_point >> 6) & kMaskX));
467-
storage[storage_len++] =
511+
buffer[storage_len++] =
468512
static_cast<char>(kTX | (static_cast<uint8_t>(code_point) & kMaskX));
469513
} else {
470-
storage[storage_len++] =
514+
buffer[storage_len++] =
471515
static_cast<char>(kT4 | static_cast<uint8_t>(code_point >> 18));
472-
storage[storage_len++] = static_cast<char>(
516+
buffer[storage_len++] = static_cast<char>(
473517
kTX | (static_cast<uint8_t>(code_point >> 12) & kMaskX));
474-
storage[storage_len++] = static_cast<char>(
518+
buffer[storage_len++] = static_cast<char>(
475519
kTX | (static_cast<uint8_t>(code_point >> 6) & kMaskX));
476-
storage[storage_len++] =
520+
buffer[storage_len++] =
477521
static_cast<char>(kTX | (static_cast<uint8_t>(code_point) & kMaskX));
478522
}
479-
buffer.append(storage, storage_len);
480523
return storage_len;
481524
}
482525

internal/utf8.h

Lines changed: 22 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,8 @@
1919
#include <string>
2020
#include <utility>
2121

22+
#include "absl/base/attributes.h"
23+
#include "absl/base/nullability.h"
2224
#include "absl/strings/cord.h"
2325
#include "absl/strings/string_view.h"
2426

@@ -50,13 +52,30 @@ std::pair<size_t, bool> Utf8Validate(const absl::Cord& str);
5052
// sequence is returned the replacement character, U+FFFD, is returned with a
5153
// code unit count of 1. As U+FFFD requires 3 code units when encoded, this can
5254
// be used to differentiate valid input from malformed input.
53-
std::pair<char32_t, size_t> Utf8Decode(absl::string_view str);
54-
std::pair<char32_t, size_t> Utf8Decode(const absl::Cord::CharIterator& it);
55+
size_t Utf8Decode(absl::string_view str, char32_t* absl_nullable code_point);
56+
size_t Utf8Decode(const absl::Cord::CharIterator& it,
57+
char32_t* absl_nullable code_point);
58+
inline std::pair<char32_t, size_t> Utf8Decode(absl::string_view str) {
59+
char32_t code_point;
60+
size_t code_units = Utf8Decode(str, &code_point);
61+
return std::pair{code_point, code_units};
62+
}
63+
inline std::pair<char32_t, size_t> Utf8Decode(
64+
const absl::Cord::CharIterator& it) {
65+
char32_t code_point;
66+
size_t code_units = Utf8Decode(it, &code_point);
67+
return std::pair{code_point, code_units};
68+
}
5569

5670
// Encodes the given code point and appends it to the buffer. If the code point
5771
// is an unpaired surrogate or outside of the valid Unicode range it is replaced
5872
// with the replacement character, U+FFFD.
59-
size_t Utf8Encode(std::string& buffer, char32_t code_point);
73+
size_t Utf8Encode(char32_t code_point, std::string* absl_nonnull buffer);
74+
size_t Utf8Encode(char32_t code_point, char* absl_nonnull buffer);
75+
ABSL_DEPRECATED("Use other overload")
76+
inline size_t Utf8Encode(std::string& buffer, char32_t code_point) {
77+
return Utf8Encode(code_point, &buffer);
78+
}
6079

6180
} // namespace cel::internal
6281

internal/utf8_test.cc

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -226,6 +226,8 @@ TEST_P(Utf8DecodeTest, StringView) {
226226
<< absl::CHexEscape(test_case.code_units);
227227
EXPECT_EQ(code_point, test_case.code_point)
228228
<< absl::CHexEscape(test_case.code_units);
229+
EXPECT_EQ(Utf8Decode(test_case.code_units, nullptr),
230+
test_case.code_units.size());
229231
}
230232

231233
TEST_P(Utf8DecodeTest, Cord) {
@@ -239,6 +241,8 @@ TEST_P(Utf8DecodeTest, Cord) {
239241
<< absl::CHexEscape(test_case.code_units);
240242
EXPECT_EQ(code_point, test_case.code_point)
241243
<< absl::CHexEscape(test_case.code_units);
244+
it = cord.char_begin();
245+
EXPECT_EQ(Utf8Decode(it, nullptr), test_case.code_units.size());
242246
}
243247

244248
std::vector<std::string> FragmentString(absl::string_view text) {

0 commit comments

Comments
 (0)