From 9f5af3994fb149d006c12797618c95e9f2e56bed Mon Sep 17 00:00:00 2001
From: Michael Squires <blackout@vertex.link>
Date: Tue, 29 Apr 2025 07:58:55 -0400
Subject: [PATCH 1/2] fix unicode counting bug

---
 tests/test_document.py | 55 ++++++++++++++++++++++++++++++++++++++++++
 yyjson/document.c      | 18 ++++++--------
 2 files changed, 63 insertions(+), 10 deletions(-)

diff --git a/tests/test_document.py b/tests/test_document.py
index 62afd04..3fd9b40 100644
--- a/tests/test_document.py
+++ b/tests/test_document.py
@@ -25,6 +25,61 @@ def test_document_from_str():
     assert doc.as_obj == {"hello": "world"}
 
 
+def test_document_unicode():
+    value = '["bar�"]'
+    doc = Document(value)
+    assert doc.dumps() == value
+    assert doc.as_obj == ['bar�']
+
+    value = '["bar\uFFFD"]'
+    doc = Document(value)
+    assert doc.dumps() == value
+    assert doc.as_obj == ['bar�']
+
+    assert doc.dumps(flags=WriterFlags.ESCAPE_UNICODE) == '["bar\\uFFFD"]'
+
+def test_document_unicode_stdlib():
+
+    # Adapted tests from CPython's Lib/test/test_json/test_unicode.py
+
+    # test_encoding3
+    value = '"\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}"'
+    doc = Document(value)
+    assert doc.dumps() == value
+    assert doc.dumps(flags=WriterFlags.ESCAPE_UNICODE) == '"\\u03B1\\u03A9"'
+    assert doc.as_obj == '\u03b1\u03a9'
+
+    # test_encoding4
+    value = '\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
+    doc = Document([value])
+    assert doc.dumps() == f'["{value}"]'
+    assert doc.dumps(flags=WriterFlags.ESCAPE_UNICODE) == '["\\u03B1\\u03A9"]'
+    assert doc.as_obj == ['\u03b1\u03a9']
+
+    # test_big_unicode_encode
+    value = '"\U0001d120"'
+    doc = Document(value)
+    assert doc.dumps() == value
+    assert doc.dumps(flags=WriterFlags.ESCAPE_UNICODE) == '"\\uD834\\uDD20"'
+    assert doc.as_obj == '𝄠'
+
+    # test_big_unicode_decode
+    value = '"z\U0001d120x"'
+    doc = Document(value)
+    assert doc.dumps() == value
+    assert doc.dumps(flags=WriterFlags.ESCAPE_UNICODE) == '"z\\uD834\\uDD20x"'
+    assert doc.as_obj == 'z𝄠x'
+
+    def loads(s: str, reader_flags=0):
+        '''Load a string as json.'''
+        return Document(s, flags=reader_flags).as_obj
+
+    # test_unicode_decode
+    for i in range(0, 0xd7ff):
+        u = chr(i)
+        value = '"\\u{0:04x}"'.format(i)
+        assert loads(value) == u
+
 def test_document_types():
     """Ensure each primitive type can be upcast (which does not have its own
     dedicated test.)"""
diff --git a/yyjson/document.c b/yyjson/document.c
index 69c6c5f..2ca1c3c 100644
--- a/yyjson/document.c
+++ b/yyjson/document.c
@@ -16,17 +16,13 @@ static PyObject *element_to_primitive(yyjson_val *val);
 static PyObject *pathlib = NULL;
 static PyObject *path = NULL;
 
-/**
- * Count the number of UTF-8 characters in the given string.
- */
-static inline size_t num_utf8_chars(const char *src, size_t len) {
-  size_t count = 0;
+static inline bool is_ascii(const char * src, size_t len) {
   for (size_t i = 0; i < len; i++) {
-    if (yyjson_likely(src[i] >> 6 != 2)) {
-      count++;
+    if (yyjson_unlikely((src[i] & 0x80) != 0)) {
+      return false;
     }
   }
-  return count;
+  return true;
 }
 
 /**
@@ -41,9 +37,11 @@ static inline PyObject *unicode_from_str(const char *src, size_t len) {
   //
   // The details of these structures are here:
   //    https://github.com/python/cpython/blob/main/Include/cpython/unicodeobject.h#L53
-  size_t num_chars = num_utf8_chars(src, len);
 
-  if (yyjson_likely(num_chars == len)) {
+  // Checking the string for non-ascii characters is faster than counting
+  // characters of the whole string because we can return at the first
+  // non-ascii character.
+  if (yyjson_likely(is_ascii(src, len))) {
     PyObject *uni = PyUnicode_New(len, 127);
     if (!uni) return NULL;
     PyASCIIObject *uni_ascii = (PyASCIIObject *)uni;

From 083ba321404bee3083dd0ad110f7b08be0c7ec9a Mon Sep 17 00:00:00 2001
From: Michael Squires <blackout@vertex.link>
Date: Wed, 9 Jul 2025 11:16:20 -0400
Subject: [PATCH 2/2] restore original counting code, masking each byte with 0xff

---
 yyjson/document.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/yyjson/document.c b/yyjson/document.c
index 2ca1c3c..dda4456 100644
--- a/yyjson/document.c
+++ b/yyjson/document.c
@@ -16,13 +16,17 @@ static PyObject *element_to_primitive(yyjson_val *val);
 static PyObject *pathlib = NULL;
 static PyObject *path = NULL;
 
-static inline bool is_ascii(const char * src, size_t len) {
+/**
+ * Count the number of UTF-8 characters in the given string.
+ */
+static inline size_t num_utf8_chars(const char *src, size_t len) {
+  size_t count = 0;
   for (size_t i = 0; i < len; i++) {
-    if (yyjson_unlikely((src[i] & 0x80) != 0)) {
-      return false;
+    if (yyjson_likely((src[i] & 0xff) >> 6 != 2)) {
+      count++;
     }
   }
-  return true;
+  return count;
 }
 
 /**
@@ -37,11 +41,9 @@ static inline PyObject *unicode_from_str(const char *src, size_t len) {
   //
   // The details of these structures are here:
   //    https://github.com/python/cpython/blob/main/Include/cpython/unicodeobject.h#L53
+  size_t num_chars = num_utf8_chars(src, len);
 
-  // Checking the string for non-ascii characters is faster than counting
-  // characters of the whole string because we can return at the first
-  // non-ascii character.
-  if (yyjson_likely(is_ascii(src, len))) {
+  if (yyjson_likely(num_chars == len)) {
     PyObject *uni = PyUnicode_New(len, 127);
     if (!uni) return NULL;
     PyASCIIObject *uni_ascii = (PyASCIIObject *)uni;