Add WTF-8 support

drizt · drizt · commit a30b0aa2a6ea · 2026-01-05T22:37:19.000+05:00
Serialize control characters and surrogate code points encoded as WTF-8
(which are not valid UTF-8) as \uXXXX escape sequences.
diff --git a/json-builder.c b/json-builder.c
@@ -480,7 +480,39 @@ static size_t measure_string (unsigned int length,
 
       default:
 
-         ++ measured_length;
+         if ((unsigned char)c <= 0x1F)
+         {
+            measured_length += 6;
+         }
+         else if ((unsigned char)c == 0xED && i + 2 < length)
+         {
+            unsigned char c2 = (unsigned char)str [i + 1];
+            unsigned char c3 = (unsigned char)str [i + 2];
+
+            if (c2 == 0x80 && (c3 == 0xA8 || c3 == 0xA9))
+            {
+               /* FIXME: ED 80 A8/A9 is U+D028/U+D029 (Hangul), not U+2028/U+2029 (those are E2 80 A8/A9) */
+               measured_length += 6;
+            }
+            else if (c2 == 0xBF && (c3 == 0xBE || c3 == 0xBF))
+            {
+               /* FIXME: ED BF BE/BF is U+DFFE/U+DFFF (low surrogates), not U+FFFE/U+FFFF (those are EF BF BE/BF) */
+               measured_length += 6;
+            }
+            else if ((c2 >= 0xA0 && c2 <= 0xBF) && (c3 >= 0x80 && c3 <= 0xBF))
+            {
+               /* WTF-8 encoded surrogate (U+D800..U+DFFF): counted as a 6-character \uXXXX escape */
+               measured_length += 6;
+            }
+            else
+            {
+               measured_length++;
+            }
+         }
+         else
+         {
+            measured_length++;
+         }
          break;
       };
    };
@@ -493,13 +525,25 @@ static size_t measure_string (unsigned int length,
    *buf ++ = (c);              \
 } while(0);                    \
 
+/* Append cp (0..0xFFFF) to buf as the 6 characters \uXXXX, advancing buf; expects buf and a hex[] digit table in scope, and evaluates cp four times. */
+#define PRINT_ESCAPED_CP(cp) do {      \
+   *buf ++ = '\\';                     \
+   *buf ++ = 'u';                      \
+   *buf ++ = hex[((cp) >> 12) & 0xF];  \
+   *buf ++ = hex[((cp) >> 8)  & 0xF];  \
+   *buf ++ = hex[((cp) >> 4)  & 0xF];  \
+   *buf ++ = hex[(cp)         & 0xF];  \
+} while(0)
+
 static size_t serialize_string (json_char * buf,
                                 unsigned int length,
                                 const json_char * str)
 {
    json_char * orig_buf = buf;
    unsigned int i;
 
+   static const char hex[] = "0123456789ABCDEF";
+
    for(i = 0; i < length; ++ i)
    {
       json_char c = str [i];
@@ -516,7 +560,45 @@ static size_t serialize_string (json_char * buf,
 
       default:
 
-         *buf ++ = c;
+         if ((unsigned char)c <= 0x1F)
+         {
+            PRINT_ESCAPED_CP (c);
+         }
+         else if ((unsigned char)c == 0xED && i + 2 < length)
+         {
+            unsigned char c2 = (unsigned char)str [i + 1];
+            unsigned char c3 = (unsigned char)str [i + 2];
+
+            if (c2 == 0x80 && (c3 == 0xA8 || c3 == 0xA9))
+            {
+               /* FIXME: ED 80 A8/A9 is U+D028/U+D029 (Hangul), not U+2028/U+2029 (those are E2 80 A8/A9) */
+               unsigned int cp = ((c & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
+               PRINT_ESCAPED_CP (cp);
+               i += 2;
+            }
+            else if (c2 == 0xBF && (c3 == 0xBE || c3 == 0xBF))
+            {
+               /* FIXME: ED BF BE/BF is U+DFFE/U+DFFF (low surrogates), not U+FFFE/U+FFFF (those are EF BF BE/BF) */
+               unsigned int cp = ((c & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
+               PRINT_ESCAPED_CP (cp);
+               i += 2;
+            }
+            else if ((c2 >= 0xA0 && c2 <= 0xBF) && (c3 >= 0x80 && c3 <= 0xBF))
+            {
+               /* Decode WTF-8 unpaired surrogate */
+               unsigned int cp = ((c & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
+               PRINT_ESCAPED_CP (cp);
+               i += 2;
+            }
+            else
+            {
+               *buf ++ = c;
+            }
+         }
+         else
+         {
+            *buf ++ = c;
+         }
          break;
       };
    };
diff --git a/test/main.cc b/test/main.cc
@@ -57,7 +57,8 @@ int main (int argc, char * argv [])
    test_file ("valid-0009.json", &num_failed);
    test_file ("valid-0010.json", &num_failed);
    test_file ("valid-0011.json", &num_failed);
-   test_file ("valid-0012.json", &num_failed); 
+   test_file ("valid-0012.json", &num_failed);
+   test_file ("valid-0013.json", &num_failed);
 
    printf ("Total failed tests: %d\n", num_failed);
 
diff --git a/test/valid-0013.json b/test/valid-0013.json
@@ -0,0 +1,14 @@
+{
+  "valid surrogate pair (😀 U+1F600)": "\uD83D\uDE00",
+  "lone high surrogate": "\uD800",
+  "lone low surrogate": "\uDC00",
+  "high surrogate not followed by low surrogate": "\uD834\u0061",
+  "low surrogate not preceded by high surrogate": "\u0061\uDD1E",
+  "reversed surrogate order (low then high)": "\uDC00\uD800",
+  "two high surrogates in a row": "\uD800\uD801",
+  "two low surrogates in a row": "\uDC00\uDC01",
+  "surrogate pair split by space": "\uD83D\u0020\uDE00",
+  "surrogate halves separated by text": "\uD83Dtest\uDE00",
+  "high surrogate followed by another escape": "\uD83D\u000A",
+  "high surrogate at end of string": "ABC\uD800"
+}