Skip to content

Commit

Permalink
refactor(core): Use the utf8 decoding in /deps for JSON
Browse files Browse the repository at this point in the history
  • Loading branch information
jpfr committed Dec 23, 2024
1 parent ba18106 commit c29c090
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 111 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -742,6 +742,7 @@ set(lib_headers ${PROJECT_SOURCE_DIR}/deps/open62541_queue.h
${PROJECT_SOURCE_DIR}/deps/base64.h
${PROJECT_SOURCE_DIR}/deps/dtoa.h
${PROJECT_SOURCE_DIR}/deps/mp_printf.h
${PROJECT_SOURCE_DIR}/deps/utf8.h
${PROJECT_SOURCE_DIR}/deps/itoa.h
${PROJECT_SOURCE_DIR}/deps/ziptree.h
${PROJECT_SOURCE_DIR}/src/ua_types_encoding_binary.h
Expand Down Expand Up @@ -812,6 +813,7 @@ set(lib_sources ${PROJECT_SOURCE_DIR}/src/ua_types.c
${PROJECT_SOURCE_DIR}/deps/base64.c
${PROJECT_SOURCE_DIR}/deps/dtoa.c
${PROJECT_SOURCE_DIR}/deps/mp_printf.c
${PROJECT_SOURCE_DIR}/deps/utf8.c
${PROJECT_SOURCE_DIR}/deps/itoa.c
${PROJECT_SOURCE_DIR}/deps/ziptree.c)

Expand Down
175 changes: 64 additions & 111 deletions src/ua_types_encoding_json.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <float.h>
#include <math.h>

#include "../deps/utf8.h"
#include "../deps/itoa.h"
#include "../deps/dtoa.h"
#include "../deps/parse_num.h"
Expand Down Expand Up @@ -446,54 +447,9 @@ encodeJsonArray(CtxJson *ctx, const void *ptr, size_t length,
return ret | writeJsonArrEnd(ctx, type);
}

/* Smallest codepoint that may be encoded with a utf8 sequence of the given
 * byte length (the array index). Used to detect overlong encodings, i.e.
 * values that were encoded with more bytes than needed. Indices 0 and 1 are
 * placeholders so the table can be indexed directly by the sequence length. */
static const uint32_t min_codepoints[5] = {0x00, 0x00, 0x80, 0x800, 0x10000};
/* Lowercase hexadecimal digit for each 4-bit value (the array index). Used to
 * build the hex digits of \uXXXX escape sequences. */
static const u8 hexmap[16] =
{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};

/* Extract the next utf8 codepoint from the buffer.
 *
 * pos:       first byte of the sequence to decode
 * len:       number of bytes available at pos (must be > 0)
 * codepoint: receives the decoded value on success
 *
 * Returns the position right after the decoded sequence, or NULL if the bytes
 * at pos are not well-formed utf8: a stray continuation byte, an invalid lead
 * byte, a truncated sequence, an overlong encoding, an encoded UTF-16
 * surrogate (U+D800..U+DFFF) or a value beyond U+10FFFF. When NULL is
 * returned, *codepoint holds an intermediate value and must not be used. */
static const unsigned char *
extract_codepoint(const unsigned char *pos, size_t len, uint32_t *codepoint) {
    UA_assert(len > 0);

    *codepoint = pos[0];
    if(UA_LIKELY(*codepoint < 0x80))
        return pos + 1; /* Normal ASCII */

    /* 0x80..0xBF are continuation bytes and cannot start a sequence.
     * 0xC0 and 0xC1 could only start an overlong 2-byte encoding. */
    if(UA_UNLIKELY(*codepoint <= 0xC1))
        return NULL;

    /* The lead byte determines the sequence length and carries the most
     * significant payload bits */
    unsigned char count;
    if(*codepoint <= 0xDF) {
        count = 2; /* 2-byte sequence */
        *codepoint &= 0x1F;
    } else if(*codepoint <= 0xEF) {
        count = 3; /* 3-byte sequence */
        *codepoint &= 0xF;
    } else if(*codepoint <= 0xF4) {
        count = 4; /* 4-byte sequence */
        *codepoint &= 0x7;
    } else {
        return NULL; /* invalid utf8 */
    }

    if(UA_UNLIKELY(count > len))
        return NULL; /* Not enough bytes left */

    /* Every continuation byte contributes its lower six bits */
    for(unsigned char i = 1; i < count; i++) {
        unsigned char byte = pos[i];
        if(UA_UNLIKELY(byte < 0x80 || byte > 0xBF))
            return NULL; /* Not a continuation byte */
        *codepoint = (*codepoint << 6) + (byte & 0x3F);
    }

    /* Not in Unicode range or too small for the encoding length */
    if(UA_UNLIKELY(*codepoint > 0x10FFFF || *codepoint < min_codepoints[count]))
        return NULL;

    /* UTF-16 surrogates are not Unicode scalar values and must not appear in
     * utf8 (RFC 3629, Section 3) */
    if(UA_UNLIKELY(*codepoint >= 0xD800 && *codepoint <= 0xDFFF))
        return NULL;

    return pos + count; /* Return the new position in the pos */
}

ENCODE_JSON(String) {
if(!src->data)
return writeChars(ctx, "null", 4);
Expand All @@ -503,71 +459,65 @@ ENCODE_JSON(String) {

UA_StatusCode ret = writeJsonQuote(ctx);

const unsigned char *str = src->data;
const unsigned char *pos = str;
const unsigned char *end = str;
const unsigned char *lim = str + src->length;
uint32_t codepoint = 0;
while(1) {
/* Iterate over codepoints in the utf8 encoding. Until the first
* character that needs to be escaped. */
while(end < lim) {
end = extract_codepoint(pos, (size_t)(lim - pos), &codepoint);
if(!end) {
/* A malformed utf8 character. Print anyway and let the
* receiving side choose how to handle it. */
pos++;
end = pos;
continue;
}

/* Escape unprintable ASCII and escape characters */
if(codepoint < ' ' || codepoint == 127 ||
codepoint == '\\' || codepoint == '\"')
const unsigned char *pos = src->data; /* Input position */
const unsigned char *end = pos + src->length; /* End of input */
while(pos < end) {
/* Find the first escaped character */
const unsigned char *start = pos;
for(; pos < end; pos++) {
if(*pos >= 127 || *pos < ' ' || *pos == '\\' || *pos == '\"')
break;

pos = end;
}

/* Write out the characters that don't need escaping */
if(pos != str) {
if(ctx->pos + (pos - str) > ctx->end)
/* Write out the unescaped ascii sequence */
if(pos > start) {
if(ctx->pos + (pos - start) > ctx->end)
return UA_STATUSCODE_BADENCODINGLIMITSEXCEEDED;
if(!ctx->calcOnly)
memcpy(ctx->pos, str, (size_t)(pos - str));
ctx->pos += pos - str;
memcpy(ctx->pos, start, (size_t)(pos - start));
ctx->pos += pos - start;
}

/* Reached the end of the utf8 encoding */
if(end == pos)
/* The unescaped ascii sequence reached the end */
if(pos == end)
break;

/* Handle an escaped character */
size_t length = 2;
u8 seq[13];
const char *text;
/* Parse an escaped character */
unsigned codepoint = 0;
unsigned len = utf8_to_codepoint(pos, (size_t)(end - pos), &codepoint);
if(len == 0) {
/* A malformed utf8 character. Print anyway and let the
* receiving side choose how to handle it. */
codepoint = *pos;
len = 1;
}
pos += len;

/* Write an escaped character */
u8 escape_buf[13];
const char *escape_text;
size_t escape_length = 2;
switch(codepoint) {
case '\\': text = "\\\\"; break;
case '\"': text = "\\\""; break;
case '\b': text = "\\b"; break;
case '\f': text = "\\f"; break;
case '\n': text = "\\n"; break;
case '\r': text = "\\r"; break;
case '\t': text = "\\t"; break;
case '\\': escape_text = "\\\\"; break;
case '\"': escape_text = "\\\""; break;
case '\b': escape_text = "\\b"; break;
case '\f': escape_text = "\\f"; break;
case '\n': escape_text = "\\n"; break;
case '\r': escape_text = "\\r"; break;
case '\t': escape_text = "\\t"; break;
default:
text = (char*)seq;
escape_text = (char*)escape_buf;
if(codepoint < 0x10000) {
/* codepoint is in BMP */
seq[0] = '\\';
seq[1] = 'u';
escape_buf[0] = '\\';
escape_buf[1] = 'u';
UA_Byte b1 = (UA_Byte)(codepoint >> 8u);
UA_Byte b2 = (UA_Byte)(codepoint >> 0u);
seq[2] = hexmap[(b1 & 0xF0u) >> 4u];
seq[3] = hexmap[b1 & 0x0Fu];
seq[4] = hexmap[(b2 & 0xF0u) >> 4u];
seq[5] = hexmap[b2 & 0x0Fu];
length = 6;
escape_buf[2] = hexmap[(b1 & 0xF0u) >> 4u];
escape_buf[3] = hexmap[b1 & 0x0Fu];
escape_buf[4] = hexmap[(b2 & 0xF0u) >> 4u];
escape_buf[5] = hexmap[b2 & 0x0Fu];
escape_length = 6;
} else {
/* not in BMP -> construct a UTF-16 surrogate pair */
codepoint -= 0x10000;
Expand All @@ -577,28 +527,31 @@ ENCODE_JSON(String) {
UA_Byte fb2 = (UA_Byte)(first >> 0u);
UA_Byte lb1 = (UA_Byte)(last >> 8u);
UA_Byte lb2 = (UA_Byte)(last >> 0u);
seq[0] = '\\';
seq[1] = 'u';
seq[2] = hexmap[(fb1 & 0xF0u) >> 4u];
seq[3] = hexmap[fb1 & 0x0Fu];
seq[4] = hexmap[(fb2 & 0xF0u) >> 4u];
seq[5] = hexmap[fb2 & 0x0Fu];
seq[6] = '\\';
seq[7] = 'u';
seq[8] = hexmap[(lb1 & 0xF0u) >> 4u];
seq[9] = hexmap[lb1 & 0x0Fu];
seq[10] = hexmap[(lb2 & 0xF0u) >> 4u];
seq[11] = hexmap[lb2 & 0x0Fu];
length = 12;
escape_buf[0] = '\\';
escape_buf[1] = 'u';
escape_buf[2] = hexmap[(fb1 & 0xF0u) >> 4u];
escape_buf[3] = hexmap[fb1 & 0x0Fu];
escape_buf[4] = hexmap[(fb2 & 0xF0u) >> 4u];
escape_buf[5] = hexmap[fb2 & 0x0Fu];
escape_buf[6] = '\\';
escape_buf[7] = 'u';
escape_buf[8] = hexmap[(lb1 & 0xF0u) >> 4u];
escape_buf[9] = hexmap[lb1 & 0x0Fu];
escape_buf[10] = hexmap[(lb2 & 0xF0u) >> 4u];
escape_buf[11] = hexmap[lb2 & 0x0Fu];
escape_length = 12;
}
break;
}
if(ctx->pos + length > ctx->end)

/* Enough space? */
if(ctx->pos + escape_length > ctx->end)
return UA_STATUSCODE_BADENCODINGLIMITSEXCEEDED;

/* Write the escaped character */
if(!ctx->calcOnly)
memcpy(ctx->pos, text, length);
ctx->pos += length;
str = pos = end;
memcpy(ctx->pos, escape_text, escape_length);
ctx->pos += escape_length;
}

return ret | writeJsonQuote(ctx);
Expand Down

0 comments on commit c29c090

Please sign in to comment.