From 9f5af3994fb149d006c12797618c95e9f2e56bed Mon Sep 17 00:00:00 2001 From: Michael Squires Date: Tue, 29 Apr 2025 07:58:55 -0400 Subject: [PATCH 1/2] fix unicode counting bug --- tests/test_document.py | 55 ++++++++++++++++++++++++++++++++++++++++++ yyjson/document.c | 18 ++++++-------- 2 files changed, 63 insertions(+), 10 deletions(-) diff --git a/tests/test_document.py b/tests/test_document.py index 62afd04..3fd9b40 100644 --- a/tests/test_document.py +++ b/tests/test_document.py @@ -25,6 +25,61 @@ def test_document_from_str(): assert doc.as_obj == {"hello": "world"} +def test_document_unicode(): + value = '["bar�"]' + doc = Document(value) + assert doc.dumps() == value + assert doc.as_obj == ['bar�'] + + value = '["bar\uFFFD"]' + doc = Document(value) + assert doc.dumps() == value + assert doc.as_obj == ['bar�'] + + assert doc.dumps(flags=WriterFlags.ESCAPE_UNICODE) == '["bar\\uFFFD"]' + +def test_document_unicode_stdlib(): + + # Adapted tests from cpython lib/tests/test_json/test_unicode.py + + # test_encoding3 + value = '"\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}"' + doc = Document(value) + assert doc.dumps() == value + assert doc.dumps(flags=WriterFlags.ESCAPE_UNICODE) == '"\\u03B1\\u03A9"' + assert doc.as_obj == '\u03b1\u03a9' + + # test_encoding4 + value = '\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}' + doc = Document([value]) + assert doc.dumps() == f'["{value}"]' + assert doc.dumps(flags=WriterFlags.ESCAPE_UNICODE) == '["\\u03B1\\u03A9"]' + assert doc.as_obj == ['\u03b1\u03a9'] + + # test_big_unicode_encode + value = '"\U0001d120"' + doc = Document(value) + assert doc.dumps() == value + assert doc.dumps(flags=WriterFlags.ESCAPE_UNICODE) == '"\\uD834\\uDD20"' + assert doc.as_obj == '𝄠' + + # test_big_unicode_decode + value = '"z\U0001d120x"' + doc = Document(value) + assert doc.dumps() == value + assert doc.dumps(flags=WriterFlags.ESCAPE_UNICODE) == '"z\\uD834\\uDD20x"' + assert doc.as_obj == 'z𝄠x' + + def loads(s: str, reader_flags=0): + '''Load a string as json.''' + return Document(s, flags=reader_flags).as_obj + + # test_unicode_decode + for i in range(0, 0xd7ff): + u = chr(i) + value = '"\\u{0:04x}"'.format(i) + assert loads(value) == u + def test_document_types(): """Ensure each primitive type can be upcast (which does not have its own dedicated test.)""" diff --git a/yyjson/document.c b/yyjson/document.c index 69c6c5f..2ca1c3c 100644 --- a/yyjson/document.c +++ b/yyjson/document.c @@ -16,17 +16,13 @@ static PyObject *element_to_primitive(yyjson_val *val); static PyObject *pathlib = NULL; static PyObject *path = NULL; -/** - * Count the number of UTF-8 characters in the given string. - */ -static inline size_t num_utf8_chars(const char *src, size_t len) { - size_t count = 0; +static inline bool is_ascii(const char * src, size_t len) { for (size_t i = 0; i < len; i++) { - if (yyjson_likely(src[i] >> 6 != 2)) { - count++; + if (yyjson_unlikely((src[i] & 0x80) != 0)) { + return false; } } - return count; + return true; } /** @@ -41,9 +37,11 @@ static inline PyObject *unicode_from_str(const char *src, size_t len) { // // The details of these structures are here: // https://github.com/python/cpython/blob/main/Include/cpython/unicodeobject.h#L53 - size_t num_chars = num_utf8_chars(src, len); - if (yyjson_likely(num_chars == len)) { + // Checking the string for non-ascii characters is faster than counting + // characters of the whole string because we can return at the first + // non-ascii character. + if (yyjson_likely(is_ascii(src, len))) { PyObject *uni = PyUnicode_New(len, 127); if (!uni) return NULL; PyASCIIObject *uni_ascii = (PyASCIIObject *)uni; From 083ba321404bee3083dd0ad110f7b08be0c7ec9a Mon Sep 17 00:00:00 2001 From: Michael Squires Date: Wed, 9 Jul 2025 11:16:20 -0400 Subject: [PATCH 2/2] restore original counting code --- yyjson/document.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/yyjson/document.c b/yyjson/document.c index 2ca1c3c..dda4456 100644 --- a/yyjson/document.c +++ b/yyjson/document.c @@ -16,13 +16,17 @@ static PyObject *element_to_primitive(yyjson_val *val); static PyObject *pathlib = NULL; static PyObject *path = NULL; -static inline bool is_ascii(const char * src, size_t len) { +/** + * Count the number of UTF-8 characters in the given string. + */ +static inline size_t num_utf8_chars(const char *src, size_t len) { + size_t count = 0; for (size_t i = 0; i < len; i++) { - if (yyjson_unlikely((src[i] & 0x80) != 0)) { - return false; + if (yyjson_likely((src[i] & 0xff) >> 6 != 2)) { + count++; } } - return true; + return count; } /** @@ -37,11 +41,9 @@ static inline PyObject *unicode_from_str(const char *src, size_t len) { // // The details of these structures are here: // https://github.com/python/cpython/blob/main/Include/cpython/unicodeobject.h#L53 + size_t num_chars = num_utf8_chars(src, len); - // Checking the string for non-ascii characters is faster than counting - // characters of the whole string because we can return at the first - // non-ascii character. - if (yyjson_likely(is_ascii(src, len))) { + if (yyjson_likely(num_chars == len)) { PyObject *uni = PyUnicode_New(len, 127); if (!uni) return NULL; PyASCIIObject *uni_ascii = (PyASCIIObject *)uni;