Skip to content

Commit a30b0aa

Browse files
committed
Add WTF-8 support
Serialize invalid UTF-8 (WTF-8) symbols and some other invisible symbols as \u escaped sequences.
1 parent fffd849 commit a30b0aa

File tree

3 files changed

+100
-3
lines changed

3 files changed

+100
-3
lines changed

json-builder.c

Lines changed: 84 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -480,7 +480,39 @@ static size_t measure_string (unsigned int length,
480480

481481
default:
482482

483-
++ measured_length;
483+
if ((unsigned char)c <= 0x1F)
484+
{
485+
measured_length += 6;
486+
}
487+
else if ((unsigned char)c == 0xED && i + 2 < length)
488+
{
489+
unsigned char c2 = (unsigned char)str [i + 1];
490+
unsigned char c3 = (unsigned char)str [i + 2];
491+
492+
if (c2 == 0x80 && (c3 == 0xA8 || c3 == 0xA9))
493+
{
494+
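/* ED 80 A8 / ED 80 A9 decode to U+D028 / U+D029 (Hangul syllables); U+2028 / U+2029 line and paragraph separators are encoded E2 80 A8 / E2 80 A9 and are not matched here */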
495+
measured_length += 6;
496+
}
497+
else if (c2 == 0xBF && (c3 == 0xBE || c3 == 0xBF))
498+
{
499+
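/* ED BF BE / ED BF BF decode to U+DFFE / U+DFFF (low surrogates); noncharacters U+FFFE / U+FFFF are encoded EF BF BE / EF BF BF and are not matched here */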
500+
measured_length += 6;
501+
}
502+
else if ((c2 >= 0xA0 && c2 <= 0xBF) && (c3 >= 0x80 && c3 <= 0xBF))
503+
{
504+
/* Decode WTF-8 unpaired surrogate */
505+
measured_length += 6;
506+
}
507+
else
508+
{
509+
measured_length++;
510+
}
511+
}
512+
else
513+
{
514+
measured_length++;
515+
}
484516
break;
485517
};
486518
};
@@ -493,13 +525,25 @@ static size_t measure_string (unsigned int length,
493525
*buf ++ = (c); \
494526
} while(0); \
495527

528+
/* Escape code point cp (0..0xFFFF) as \uXXXX: writes six characters (backslash, 'u', four hex digits) through `buf`, advancing it. Requires `buf` and a 16-entry digit table named `hex` in scope at the expansion site; cp is evaluated four times, so pass an expression without side effects. */
529+
#define PRINT_ESCAPED_CP(cp) do { \
530+
*buf ++ = '\\'; \
531+
*buf ++ = 'u'; \
532+
*buf ++ = hex[((cp) >> 12) & 0xF]; \
533+
*buf ++ = hex[((cp) >> 8) & 0xF]; \
534+
*buf ++ = hex[((cp) >> 4) & 0xF]; \
535+
*buf ++ = hex[(cp) & 0xF]; \
536+
} while(0)
537+
496538
static size_t serialize_string (json_char * buf,
497539
unsigned int length,
498540
const json_char * str)
499541
{
500542
json_char * orig_buf = buf;
501543
unsigned int i;
502544

545+
static const char hex[] = "0123456789ABCDEF";
546+
503547
for(i = 0; i < length; ++ i)
504548
{
505549
json_char c = str [i];
@@ -516,7 +560,45 @@ static size_t serialize_string (json_char * buf,
516560

517561
default:
518562

519-
*buf ++ = c;
563+
if ((unsigned char)c <= 0x1F)
564+
{
565+
PRINT_ESCAPED_CP (c);
566+
}
567+
else if ((unsigned char)c == 0xED && i + 2 < length)
568+
{
569+
unsigned char c2 = (unsigned char)str [i + 1];
570+
unsigned char c3 = (unsigned char)str [i + 2];
571+
572+
if (c2 == 0x80 && (c3 == 0xA8 || c3 == 0xA9))
573+
{
574+
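/* ED 80 A8 / ED 80 A9 decode to U+D028 / U+D029 (Hangul syllables); U+2028 / U+2029 line and paragraph separators are encoded E2 80 A8 / E2 80 A9 and are not matched here */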
575+
unsigned int cp = ((c & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
576+
PRINT_ESCAPED_CP (cp);
577+
i += 2;
578+
}
579+
else if (c2 == 0xBF && (c3 == 0xBE || c3 == 0xBF))
580+
{
581+
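/* ED BF BE / ED BF BF decode to U+DFFE / U+DFFF (low surrogates); noncharacters U+FFFE / U+FFFF are encoded EF BF BE / EF BF BF and are not matched here */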
582+
unsigned int cp = ((c & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
583+
PRINT_ESCAPED_CP (cp);
584+
i += 2;
585+
}
586+
else if ((c2 >= 0xA0 && c2 <= 0xBF) && (c3 >= 0x80 && c3 <= 0xBF))
587+
{
588+
/* Decode WTF-8 unpaired surrogate */
589+
unsigned int cp = ((c & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
590+
PRINT_ESCAPED_CP (cp);
591+
i += 2;
592+
}
593+
else
594+
{
595+
*buf ++ = c;
596+
}
597+
}
598+
else
599+
{
600+
*buf ++ = c;
601+
}
520602
break;
521603
};
522604
};

test/main.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,8 @@ int main (int argc, char * argv [])
5757
test_file ("valid-0009.json", &num_failed);
5858
test_file ("valid-0010.json", &num_failed);
5959
test_file ("valid-0011.json", &num_failed);
60-
test_file ("valid-0012.json", &num_failed);
60+
test_file ("valid-0012.json", &num_failed);
61+
test_file ("valid-0013.json", &num_failed);
6162

6263
printf ("Total failed tests: %d\n", num_failed);
6364

test/valid-0013.json

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
{
2+
"valid surrogate pair (😀 U+1F600)": "\uD83D\uDE00",
3+
"lone high surrogate": "\uD800",
4+
"lone low surrogate": "\uDC00",
5+
"high surrogate not followed by low surrogate": "\uD834\u0061",
6+
"low surrogate not preceded by high surrogate": "\u0061\uDD1E",
7+
"reversed surrogate order (low then high)": "\uDC00\uD800",
8+
"two high surrogates in a row": "\uD800\uD801",
9+
"two low surrogates in a row": "\uDC00\uDC01",
10+
"surrogate pair split by space": "\uD83D\u0020\uDE00",
11+
"surrogate halves separated by text": "\uD83Dtest\uDE00",
12+
"high surrogate followed by another escape": "\uD83D\u000A",
13+
"high surrogate at end of string": "ABC\uD800"
14+
}

0 commit comments

Comments
 (0)