20
20
#include < string>
21
21
#include < utility>
22
22
23
- #include " absl/base/attributes.h"
24
23
#include " absl/base/macros.h"
24
+ #include " absl/base/nullability.h"
25
25
#include " absl/base/optimization.h"
26
26
#include " absl/log/absl_check.h"
27
27
#include " absl/strings/cord.h"
@@ -355,77 +355,109 @@ std::pair<size_t, bool> Utf8Validate(const absl::Cord& str) {
355
355
356
356
namespace {
357
357
358
- std::pair<char32_t , size_t > Utf8DecodeImpl (uint8_t b, uint8_t leading,
359
- size_t size, absl::string_view str) {
358
+ size_t Utf8DecodeImpl (uint8_t b, uint8_t leading, size_t size,
359
+ absl::string_view str,
360
+ char32_t * absl_nullable code_point) {
360
361
const auto & accept = kAccept [leading >> 4 ];
361
362
const auto b1 = static_cast <uint8_t >(str.front ());
362
363
if (ABSL_PREDICT_FALSE (b1 < accept.first || b1 > accept.second )) {
363
- return {kUnicodeReplacementCharacter , 1 };
364
+ if (code_point != nullptr ) {
365
+ *code_point = kUnicodeReplacementCharacter ;
366
+ }
367
+ return 1 ;
364
368
}
365
369
if (size <= 1 ) {
366
- return {(static_cast <char32_t >(b & kMask2 ) << 6 ) |
367
- static_cast <char32_t >(b1 & kMaskX ),
368
- 2 };
370
+ if (code_point != nullptr ) {
371
+ *code_point = (static_cast <char32_t >(b & kMask2 ) << 6 ) |
372
+ static_cast <char32_t >(b1 & kMaskX );
373
+ }
374
+ return 2 ;
369
375
}
370
376
str.remove_prefix (1 );
371
377
const auto b2 = static_cast <uint8_t >(str.front ());
372
378
if (ABSL_PREDICT_FALSE (b2 < kLow || b2 > kHigh )) {
373
- return {kUnicodeReplacementCharacter , 1 };
379
+ if (code_point != nullptr ) {
380
+ *code_point = kUnicodeReplacementCharacter ;
381
+ }
382
+ return 1 ;
374
383
}
375
384
if (size <= 2 ) {
376
- return {(static_cast <char32_t >(b & kMask3 ) << 12 ) |
377
- (static_cast <char32_t >(b1 & kMaskX ) << 6 ) |
378
- static_cast <char32_t >(b2 & kMaskX ),
379
- 3 };
385
+ if (code_point != nullptr ) {
386
+ *code_point = (static_cast <char32_t >(b & kMask3 ) << 12 ) |
387
+ (static_cast <char32_t >(b1 & kMaskX ) << 6 ) |
388
+ static_cast <char32_t >(b2 & kMaskX );
389
+ }
390
+ return 3 ;
380
391
}
381
392
str.remove_prefix (1 );
382
393
const auto b3 = static_cast <uint8_t >(str.front ());
383
394
if (ABSL_PREDICT_FALSE (b3 < kLow || b3 > kHigh )) {
384
- return {kUnicodeReplacementCharacter , 1 };
395
+ if (code_point != nullptr ) {
396
+ *code_point = kUnicodeReplacementCharacter ;
397
+ }
398
+ return 1 ;
385
399
}
386
- return {(static_cast <char32_t >(b & kMask4 ) << 18 ) |
387
- (static_cast <char32_t >(b1 & kMaskX ) << 12 ) |
388
- (static_cast <char32_t >(b2 & kMaskX ) << 6 ) |
389
- static_cast <char32_t >(b3 & kMaskX ),
390
- 4 };
400
+ if (code_point != nullptr ) {
401
+ *code_point = (static_cast <char32_t >(b & kMask4 ) << 18 ) |
402
+ (static_cast <char32_t >(b1 & kMaskX ) << 12 ) |
403
+ (static_cast <char32_t >(b2 & kMaskX ) << 6 ) |
404
+ static_cast <char32_t >(b3 & kMaskX );
405
+ }
406
+ return 4 ;
391
407
}
392
408
393
409
} // namespace
394
410
395
- std::pair< char32_t , size_t > Utf8Decode (absl::string_view str) {
411
+ size_t Utf8Decode (absl::string_view str, char32_t * absl_nullable code_point ) {
396
412
ABSL_DCHECK (!str.empty ());
397
413
const auto b = static_cast <uint8_t >(str.front ());
398
414
if (b < kUtf8RuneSelf ) {
399
- return {static_cast <char32_t >(b), 1 };
415
+ if (code_point != nullptr ) {
416
+ *code_point = static_cast <char32_t >(b);
417
+ }
418
+ return 1 ;
400
419
}
401
420
const auto leading = kLeading [b];
402
421
if (ABSL_PREDICT_FALSE (leading == kXX )) {
403
- return {kUnicodeReplacementCharacter , 1 };
422
+ if (code_point != nullptr ) {
423
+ *code_point = kUnicodeReplacementCharacter ;
424
+ }
425
+ return 1 ;
404
426
}
405
427
auto size = static_cast <size_t >(leading & 7 ) - 1 ;
406
428
str.remove_prefix (1 );
407
429
if (ABSL_PREDICT_FALSE (size > str.size ())) {
408
- return {kUnicodeReplacementCharacter , 1 };
430
+ if (code_point != nullptr ) {
431
+ *code_point = kUnicodeReplacementCharacter ;
432
+ }
433
+ return 1 ;
409
434
}
410
- return Utf8DecodeImpl (b, leading, size, str);
435
+ return Utf8DecodeImpl (b, leading, size, str, code_point );
411
436
}
412
437
413
- std::pair<char32_t , size_t > Utf8Decode (const absl::Cord::CharIterator& it) {
438
+ size_t Utf8Decode (const absl::Cord::CharIterator& it,
439
+ char32_t * absl_nullable code_point) {
414
440
absl::string_view str = absl::Cord::ChunkRemaining (it);
415
441
ABSL_DCHECK (!str.empty ());
416
442
const auto b = static_cast <uint8_t >(str.front ());
417
443
if (b < kUtf8RuneSelf ) {
418
- return {static_cast <char32_t >(b), 1 };
444
+ if (code_point != nullptr ) {
445
+ *code_point = static_cast <char32_t >(b);
446
+ }
447
+ return 1 ;
419
448
}
420
449
const auto leading = kLeading [b];
421
450
if (ABSL_PREDICT_FALSE (leading == kXX )) {
422
- return {kUnicodeReplacementCharacter , 1 };
451
+ if (code_point != nullptr ) {
452
+ *code_point = kUnicodeReplacementCharacter ;
453
+ }
454
+ return 1 ;
423
455
}
424
456
auto size = static_cast <size_t >(leading & 7 ) - 1 ;
425
457
str.remove_prefix (1 );
426
458
if (ABSL_PREDICT_TRUE (size <= str.size ())) {
427
459
// Fast path.
428
- return Utf8DecodeImpl (b, leading, size, str);
460
+ return Utf8DecodeImpl (b, leading, size, str, code_point );
429
461
}
430
462
absl::Cord::CharIterator current = it;
431
463
absl::Cord::Advance (&current, 1 );
@@ -434,49 +466,60 @@ std::pair<char32_t, size_t> Utf8Decode(const absl::Cord::CharIterator& it) {
434
466
while (buffer_len < size) {
435
467
str = absl::Cord::ChunkRemaining (current);
436
468
if (ABSL_PREDICT_FALSE (str.empty ())) {
437
- return {kUnicodeReplacementCharacter , 1 };
469
+ if (code_point != nullptr ) {
470
+ *code_point = kUnicodeReplacementCharacter ;
471
+ }
472
+ return 1 ;
438
473
}
439
474
size_t to_copy = std::min (size_t {3 } - buffer_len, str.size ());
440
475
std::memcpy (buffer + buffer_len, str.data (), to_copy);
441
476
buffer_len += to_copy;
442
477
absl::Cord::Advance (&current, to_copy);
443
478
}
444
- return Utf8DecodeImpl (b, leading, size,
445
- absl::string_view (buffer, buffer_len) );
479
+ return Utf8DecodeImpl (b, leading, size, absl::string_view (buffer, buffer_len),
480
+ code_point );
446
481
}
447
482
448
- size_t Utf8Encode (std::string& buffer, char32_t code_point) {
483
+ size_t Utf8Encode (char32_t code_point, std::string* absl_nonnull buffer) {
484
+ ABSL_DCHECK (buffer != nullptr );
485
+
486
+ char storage[4 ];
487
+ size_t storage_len = Utf8Encode (code_point, storage);
488
+ buffer->append (storage, storage_len);
489
+ return storage_len;
490
+ }
491
+
492
+ size_t Utf8Encode (char32_t code_point, char * absl_nonnull buffer) {
493
+ ABSL_DCHECK (buffer != nullptr );
494
+
449
495
if (ABSL_PREDICT_FALSE (!UnicodeIsValid (code_point))) {
450
496
code_point = kUnicodeReplacementCharacter ;
451
497
}
452
- char storage[4 ];
453
498
size_t storage_len = 0 ;
454
499
if (code_point <= 0x7f ) {
455
- storage[storage_len++] =
456
- static_cast <char >(static_cast <uint8_t >(code_point));
500
+ buffer[storage_len++] = static_cast <char >(static_cast <uint8_t >(code_point));
457
501
} else if (code_point <= 0x7ff ) {
458
- storage [storage_len++] =
502
+ buffer [storage_len++] =
459
503
static_cast <char >(kT2 | static_cast <uint8_t >(code_point >> 6 ));
460
- storage [storage_len++] =
504
+ buffer [storage_len++] =
461
505
static_cast <char >(kTX | (static_cast <uint8_t >(code_point) & kMaskX ));
462
506
} else if (code_point <= 0xffff ) {
463
- storage [storage_len++] =
507
+ buffer [storage_len++] =
464
508
static_cast <char >(kT3 | static_cast <uint8_t >(code_point >> 12 ));
465
- storage [storage_len++] = static_cast <char >(
509
+ buffer [storage_len++] = static_cast <char >(
466
510
kTX | (static_cast <uint8_t >(code_point >> 6 ) & kMaskX ));
467
- storage [storage_len++] =
511
+ buffer [storage_len++] =
468
512
static_cast <char >(kTX | (static_cast <uint8_t >(code_point) & kMaskX ));
469
513
} else {
470
- storage [storage_len++] =
514
+ buffer [storage_len++] =
471
515
static_cast <char >(kT4 | static_cast <uint8_t >(code_point >> 18 ));
472
- storage [storage_len++] = static_cast <char >(
516
+ buffer [storage_len++] = static_cast <char >(
473
517
kTX | (static_cast <uint8_t >(code_point >> 12 ) & kMaskX ));
474
- storage [storage_len++] = static_cast <char >(
518
+ buffer [storage_len++] = static_cast <char >(
475
519
kTX | (static_cast <uint8_t >(code_point >> 6 ) & kMaskX ));
476
- storage [storage_len++] =
520
+ buffer [storage_len++] =
477
521
static_cast <char >(kTX | (static_cast <uint8_t >(code_point) & kMaskX ));
478
522
}
479
- buffer.append (storage, storage_len);
480
523
return storage_len;
481
524
}
482
525
0 commit comments