refactor(core): Use the utf8 decoding in /deps for JSON

GoetzGoerisch · Dec 23, 2024 · c29c090 · c29c090
1 parent ba18106
commit c29c090
Show file tree

Hide file tree

Showing 2 changed files with 66 additions and 111 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -742,6 +742,7 @@ set(lib_headers ${PROJECT_SOURCE_DIR}/deps/open62541_queue.h
                 ${PROJECT_SOURCE_DIR}/deps/base64.h
                 ${PROJECT_SOURCE_DIR}/deps/dtoa.h
                 ${PROJECT_SOURCE_DIR}/deps/mp_printf.h
+                ${PROJECT_SOURCE_DIR}/deps/utf8.h
                 ${PROJECT_SOURCE_DIR}/deps/itoa.h
                 ${PROJECT_SOURCE_DIR}/deps/ziptree.h
                 ${PROJECT_SOURCE_DIR}/src/ua_types_encoding_binary.h
@@ -812,6 +813,7 @@ set(lib_sources ${PROJECT_SOURCE_DIR}/src/ua_types.c
                 ${PROJECT_SOURCE_DIR}/deps/base64.c
                 ${PROJECT_SOURCE_DIR}/deps/dtoa.c
                 ${PROJECT_SOURCE_DIR}/deps/mp_printf.c
+                ${PROJECT_SOURCE_DIR}/deps/utf8.c
                 ${PROJECT_SOURCE_DIR}/deps/itoa.c
                 ${PROJECT_SOURCE_DIR}/deps/ziptree.c)
 

diff --git a/src/ua_types_encoding_json.c b/src/ua_types_encoding_json.c
@@ -16,6 +16,7 @@
 #include <float.h>
 #include <math.h>
 
+#include "../deps/utf8.h"
 #include "../deps/itoa.h"
 #include "../deps/dtoa.h"
 #include "../deps/parse_num.h"
@@ -446,54 +447,9 @@ encodeJsonArray(CtxJson *ctx, const void *ptr, size_t length,
     return ret | writeJsonArrEnd(ctx, type);
 }
 
-static const uint32_t min_codepoints[5] = {0x00, 0x00, 0x80, 0x800, 0x10000};
 static const u8 hexmap[16] =
     {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};
 
-/* Extract the next utf8 codepoint from the buffer. Return the next position in
- * the buffer or NULL upon an error. */
-static const unsigned char *
-extract_codepoint(const unsigned char *pos, size_t len, uint32_t *codepoint) {
-    UA_assert(len > 0);
-
-    *codepoint = pos[0];
-    if(UA_LIKELY(*codepoint < 0x80))
-        return pos + 1; /* Normal ASCII */
-
-    if(UA_UNLIKELY(*codepoint <= 0xC1))
-        return NULL; /* Continuation byte not allowed here */
-
-    unsigned char count;
-    if(*codepoint <= 0xDF) {
-        count = 2; /* 2-byte sequence */
-        *codepoint &= 0x1F;
-    } else if(*codepoint <= 0xEF) {
-        count = 3; /* 3-byte sequence */
-        *codepoint &= 0xF;
-    } else if(*codepoint <= 0xF4) {
-        count = 4; /* 4-byte sequence */
-        *codepoint &= 0x7;
-    } else {
-        return NULL; /* invalid utf8 */
-    }
-
-    if(UA_UNLIKELY(count > len))
-        return NULL; /* Not enough bytes left */
-
-    for(unsigned char i = 1; i < count; i++) {
-        unsigned char byte = pos[i];
-        if(UA_UNLIKELY(byte < 0x80 || byte > 0xBF))
-            return NULL; /* Not a continuation byte */
-        *codepoint = (*codepoint << 6) + (byte & 0x3F);
-    }
-
-    /* Not in Unicode range or too small for the encoding length */
-    if(UA_UNLIKELY(*codepoint > 0x10FFFF || *codepoint < min_codepoints[count]))
-        return NULL;
-
-    return pos + count; /* Return the new position in the pos */
-}
-
 ENCODE_JSON(String) {
     if(!src->data)
         return writeChars(ctx, "null", 4);
@@ -503,71 +459,65 @@ ENCODE_JSON(String) {
 
     UA_StatusCode ret = writeJsonQuote(ctx);
 
-    const unsigned char *str = src->data;
-    const unsigned char *pos = str;
-    const unsigned char *end = str;
-    const unsigned char *lim = str + src->length;
-    uint32_t codepoint = 0;
-    while(1) {
-        /* Iterate over codepoints in the utf8 encoding. Until the first
-         * character that needs to be escaped. */
-        while(end < lim) {
-            end = extract_codepoint(pos, (size_t)(lim - pos), &codepoint);
-            if(!end)  {
-                /* A malformed utf8 character. Print anyway and let the
-                 * receiving side choose how to handle it. */
-                pos++;
-                end = pos;
-                continue;
-            }
-
-            /* Escape unprintable ASCII and escape characters */
-            if(codepoint < ' '   || codepoint == 127  ||
-               codepoint == '\\' || codepoint == '\"')
+    const unsigned char *pos = src->data;         /* Input position */
+    const unsigned char *end = pos + src->length; /* End of input */
+    while(pos < end) {
+        /* Find the first character that needs to be escaped */
+        const unsigned char *start = pos;
+        for(; pos < end; pos++) {
+            if(*pos >= 127 || *pos < ' ' || *pos == '\\' || *pos == '\"')
                 break;
-
-            pos = end;
         }
 
-        /* Write out the characters that don't need escaping */
-        if(pos != str) {
-            if(ctx->pos + (pos - str) > ctx->end)
+        /* Write out the unescaped ascii sequence */
+        if(pos > start) {
+            if(ctx->pos + (pos - start) > ctx->end)
                 return UA_STATUSCODE_BADENCODINGLIMITSEXCEEDED;
             if(!ctx->calcOnly)
-                memcpy(ctx->pos, str, (size_t)(pos - str));
-            ctx->pos += pos - str;
+                memcpy(ctx->pos, start, (size_t)(pos - start));
+            ctx->pos += pos - start;
         }
 
-        /* Reached the end of the utf8 encoding */
-        if(end == pos)
+        /* The unescaped ascii sequence reached the end */
+        if(pos == end)
             break;
 
-        /* Handle an escaped character */
-        size_t length = 2;
-        u8 seq[13];
-        const char *text;
+        /* Decode the utf8 character that needs to be escaped */
+        unsigned codepoint = 0;
+        unsigned len = utf8_to_codepoint(pos, (size_t)(end - pos), &codepoint);
+        if(len == 0)  {
+            /* A malformed utf8 character. Escape the raw byte on its own
+             * and let the receiving side choose how to handle it. */
+            codepoint = *pos;
+            len = 1;
+        }
+        pos += len;
 
+        /* Build the escape sequence for the character */
+        u8 escape_buf[13];
+        const char *escape_text;
+        size_t escape_length = 2;
         switch(codepoint) {
-        case '\\': text = "\\\\"; break;
-        case '\"': text = "\\\""; break;
-        case '\b': text = "\\b"; break;
-        case '\f': text = "\\f"; break;
-        case '\n': text = "\\n"; break;
-        case '\r': text = "\\r"; break;
-        case '\t': text = "\\t"; break;
+        case '\\': escape_text = "\\\\"; break;
+        case '\"': escape_text = "\\\""; break;
+        case '\b': escape_text = "\\b";  break;
+        case '\f': escape_text = "\\f";  break;
+        case '\n': escape_text = "\\n";  break;
+        case '\r': escape_text = "\\r";  break;
+        case '\t': escape_text = "\\t";  break;
         default:
-            text = (char*)seq;
+            escape_text = (char*)escape_buf;
             if(codepoint < 0x10000) {
                 /* codepoint is in BMP */
-                seq[0] = '\\';
-                seq[1] = 'u';
+                escape_buf[0] = '\\';
+                escape_buf[1] = 'u';
                 UA_Byte b1 = (UA_Byte)(codepoint >> 8u);
                 UA_Byte b2 = (UA_Byte)(codepoint >> 0u);
-                seq[2] = hexmap[(b1 & 0xF0u) >> 4u];
-                seq[3] = hexmap[b1 & 0x0Fu];
-                seq[4] = hexmap[(b2 & 0xF0u) >> 4u];
-                seq[5] = hexmap[b2 & 0x0Fu];
-                length = 6;
+                escape_buf[2] = hexmap[(b1 & 0xF0u) >> 4u];
+                escape_buf[3] = hexmap[b1 & 0x0Fu];
+                escape_buf[4] = hexmap[(b2 & 0xF0u) >> 4u];
+                escape_buf[5] = hexmap[b2 & 0x0Fu];
+                escape_length = 6;
             } else {
                 /* not in BMP -> construct a UTF-16 surrogate pair */
                 codepoint -= 0x10000;
@@ -577,28 +527,31 @@ ENCODE_JSON(String) {
                 UA_Byte fb2 = (UA_Byte)(first >> 0u);
                 UA_Byte lb1 = (UA_Byte)(last >> 8u);
                 UA_Byte lb2 = (UA_Byte)(last >> 0u);
-                seq[0] = '\\';
-                seq[1] = 'u';
-                seq[2] = hexmap[(fb1 & 0xF0u) >> 4u];
-                seq[3] = hexmap[fb1 & 0x0Fu];
-                seq[4] = hexmap[(fb2 & 0xF0u) >> 4u];
-                seq[5] = hexmap[fb2 & 0x0Fu];
-                seq[6] = '\\';
-                seq[7] = 'u';
-                seq[8] = hexmap[(lb1 & 0xF0u) >> 4u];
-                seq[9] = hexmap[lb1 & 0x0Fu];
-                seq[10] = hexmap[(lb2 & 0xF0u) >> 4u];
-                seq[11] = hexmap[lb2 & 0x0Fu];
-                length = 12;
+                escape_buf[0] = '\\';
+                escape_buf[1] = 'u';
+                escape_buf[2] = hexmap[(fb1 & 0xF0u) >> 4u];
+                escape_buf[3] = hexmap[fb1 & 0x0Fu];
+                escape_buf[4] = hexmap[(fb2 & 0xF0u) >> 4u];
+                escape_buf[5] = hexmap[fb2 & 0x0Fu];
+                escape_buf[6] = '\\';
+                escape_buf[7] = 'u';
+                escape_buf[8] = hexmap[(lb1 & 0xF0u) >> 4u];
+                escape_buf[9] = hexmap[lb1 & 0x0Fu];
+                escape_buf[10] = hexmap[(lb2 & 0xF0u) >> 4u];
+                escape_buf[11] = hexmap[lb2 & 0x0Fu];
+                escape_length = 12;
             }
             break;
         }
-        if(ctx->pos + length > ctx->end)
+
+        /* Enough space? */
+        if(ctx->pos + escape_length > ctx->end)
             return UA_STATUSCODE_BADENCODINGLIMITSEXCEEDED;
+
+        /* Write the escaped character */
         if(!ctx->calcOnly)
-            memcpy(ctx->pos, text, length);
-        ctx->pos += length;
-        str = pos = end;
+            memcpy(ctx->pos, escape_text, escape_length);
+        ctx->pos += escape_length;
     }
 
     return ret | writeJsonQuote(ctx);