From 27a843bdebafbce025061a9923c480fbe766af5b Mon Sep 17 00:00:00 2001 From: Viicos <65306057+Viicos@users.noreply.github.com> Date: Fri, 11 Apr 2025 12:44:50 +0200 Subject: [PATCH] Add `ensure_ascii` option --- python/pydantic_core/_pydantic_core.pyi | 6 ++ src/serializers/mod.rs | 20 ++++-- src/serializers/shared.rs | 89 ++++++++++++++++++++++-- src/serializers/type_serializers/json.rs | 8 +-- tests/serializers/test_string.py | 32 +++++++++ tests/test_json.py | 3 + 6 files changed, 146 insertions(+), 12 deletions(-) diff --git a/python/pydantic_core/_pydantic_core.pyi b/python/pydantic_core/_pydantic_core.pyi index 17098cca5..a95ba3445 100644 --- a/python/pydantic_core/_pydantic_core.pyi +++ b/python/pydantic_core/_pydantic_core.pyi @@ -345,6 +345,7 @@ class SchemaSerializer: value: Any, *, indent: int | None = None, + ensure_ascii: bool = False, include: _IncEx | None = None, exclude: _IncEx | None = None, by_alias: bool | None = None, @@ -363,6 +364,8 @@ class SchemaSerializer: Arguments: value: The Python object to serialize. indent: If `None`, the JSON will be compact, otherwise it will be pretty-printed with the indent provided. + ensure_ascii: If `True`, the output is guaranteed to have all incoming non-ASCII characters escaped. + If `False` (the default), these characters will be outputted as-is. include: A set of fields to include, if `None` all fields are included. exclude: A set of fields to exclude, if `None` no fields are excluded. by_alias: Whether to use the alias names of fields. @@ -390,6 +393,7 @@ def to_json( value: Any, *, indent: int | None = None, + ensure_ascii: bool = False, include: _IncEx | None = None, exclude: _IncEx | None = None, # Note: In Pydantic 2.11, the default value of `by_alias` on `SchemaSerializer` was changed from `True` to `None`, @@ -414,6 +418,8 @@ def to_json( Arguments: value: The Python object to serialize. indent: If `None`, the JSON will be compact, otherwise it will be pretty-printed with the indent provided. + ensure_ascii: If `True`, the output is guaranteed to have all incoming non-ASCII characters escaped. + If `False` (the default), these characters will be outputted as-is. include: A set of fields to include, if `None` all fields are included. exclude: A set of fields to exclude, if `None` no fields are excluded. by_alias: Whether to use the alias names of fields. diff --git a/src/serializers/mod.rs b/src/serializers/mod.rs index 434b43c53..b4424e788 100644 --- a/src/serializers/mod.rs +++ b/src/serializers/mod.rs @@ -155,7 +155,7 @@ impl SchemaSerializer { } #[allow(clippy::too_many_arguments)] - #[pyo3(signature = (value, *, indent = None, include = None, exclude = None, by_alias = None, + #[pyo3(signature = (value, *, indent = None, ensure_ascii = false, include = None, exclude = None, by_alias = None, exclude_unset = false, exclude_defaults = false, exclude_none = false, round_trip = false, warnings = WarningsArg::Bool(true), fallback = None, serialize_as_any = false, context = None))] pub fn to_json( @@ -163,6 +163,7 @@ impl SchemaSerializer { py: Python, value: &Bound<'_, PyAny>, indent: Option, + ensure_ascii: Option, include: Option<&Bound<'_, PyAny>>, exclude: Option<&Bound<'_, PyAny>>, by_alias: Option, @@ -204,6 +205,7 @@ impl SchemaSerializer { exclude, &extra, indent, + ensure_ascii.unwrap_or(false), self.expected_json_size.load(Ordering::Relaxed), )?; @@ -239,14 +241,15 @@ impl SchemaSerializer { #[allow(clippy::too_many_arguments)] #[pyfunction] -#[pyo3(signature = (value, *, indent = None, include = None, exclude = None, by_alias = true, - exclude_none = false, round_trip = false, timedelta_mode = "iso8601", bytes_mode = "utf8", +#[pyo3(signature = (value, *, indent = None, ensure_ascii = false, include = None, exclude = None, + by_alias = true, exclude_none = false, round_trip = false, timedelta_mode = "iso8601", bytes_mode = "utf8", inf_nan_mode = "constants", serialize_unknown = false, fallback = None, serialize_as_any = false, context = None))] pub fn to_json( py: Python, value: &Bound<'_, PyAny>, indent: Option, + ensure_ascii: Option, include: Option<&Bound<'_, PyAny>>, exclude: Option<&Bound<'_, PyAny>>, by_alias: bool, @@ -274,7 +277,16 @@ pub fn to_json( context, ); let serializer = type_serializers::any::AnySerializer.into(); - let bytes = to_json_bytes(value, &serializer, include, exclude, &extra, indent, 1024)?; + let bytes = to_json_bytes( + value, + &serializer, + include, + exclude, + &extra, + indent, + ensure_ascii.unwrap_or(false), + 1024, + )?; state.final_check(py)?; let py_bytes = PyBytes::new(py, &bytes); Ok(py_bytes.into()) diff --git a/src/serializers/shared.rs b/src/serializers/shared.rs index f7a018749..142bca66f 100644 --- a/src/serializers/shared.rs +++ b/src/serializers/shared.rs @@ -1,5 +1,6 @@ use std::borrow::Cow; use std::fmt::Debug; +use std::io::{self, Write}; use pyo3::exceptions::PyTypeError; use pyo3::prelude::*; @@ -9,7 +10,7 @@ use pyo3::{intern, PyTraverseError, PyVisit}; use enum_dispatch::enum_dispatch; use serde::Serialize; -use serde_json::ser::PrettyFormatter; +use serde_json::ser::{Formatter, PrettyFormatter}; use crate::build_tools::py_schema_err; use crate::build_tools::py_schema_error_type; @@ -349,6 +350,71 @@ impl Serialize for PydanticSerializer<'_> { } } +struct EscapeNonAsciiFormatter; + +impl Formatter for EscapeNonAsciiFormatter { + fn write_string_fragment(&mut self, writer: &mut W, fragment: &str) -> io::Result<()> { + for ch in fragment.chars() { + if ch.is_ascii() { + writer.write_all(ch.encode_utf8(&mut [0; 4]).as_bytes())?; + } else { + for escape in ch.encode_utf16(&mut [0; 2]) { + write!(writer, "\\u{escape:04x}")?; + } + } + } + Ok(()) + } +} + +struct EscapeNonAsciiPrettyFormatter<'a> { + pretty: PrettyFormatter<'a>, + escape_non_ascii: EscapeNonAsciiFormatter, +} + +impl<'a> EscapeNonAsciiPrettyFormatter<'a> { + pub fn with_indent(indent: &'a [u8]) -> Self { + Self { + pretty: PrettyFormatter::with_indent(indent), + escape_non_ascii: EscapeNonAsciiFormatter, + } + } +} + +macro_rules! defer { + ($formatter:ident, $fun:ident) => { + fn $fun(&mut self, writer: &mut W) -> io::Result<()> + where + W: ?Sized + io::Write, + { + self.$formatter.$fun(writer) + } + }; + ($formatter:ident, $fun:ident, $val:ty) => { + fn $fun(&mut self, writer: &mut W, val: $val) -> io::Result<()> + where + W: ?Sized + io::Write, + { + self.$formatter.$fun(writer, val) + } + }; +} + +#[allow(clippy::needless_lifetimes)] +impl<'a> Formatter for EscapeNonAsciiPrettyFormatter<'a> { + defer!(escape_non_ascii, write_string_fragment, &str); + defer!(pretty, begin_array); + defer!(pretty, end_array); + defer!(pretty, begin_array_value, bool); + defer!(pretty, end_array_value); + defer!(pretty, begin_object); + defer!(pretty, end_object); + defer!(pretty, begin_object_key, bool); + defer!(pretty, end_object_key); + defer!(pretty, begin_object_value); + defer!(pretty, end_object_value); +} + #[allow(clippy::too_many_arguments)] pub(crate) fn to_json_bytes( value: &Bound<'_, PyAny>, @@ -357,25 +423,40 @@ pub(crate) fn to_json_bytes( exclude: Option<&Bound<'_, PyAny>>, extra: &Extra, indent: Option, + ensure_ascii: bool, expected_json_size: usize, ) -> PyResult> { let serializer = PydanticSerializer::new(value, serializer, include, exclude, extra); let writer: Vec = Vec::with_capacity(expected_json_size); - let bytes = match indent { - Some(indent) => { + + let bytes = match (indent, ensure_ascii) { + (Some(indent), true) => { + let indent = vec![b' '; indent]; + let formatter = EscapeNonAsciiPrettyFormatter::with_indent(&indent); + let mut ser = PythonSerializer::with_formatter(writer, formatter); + serializer.serialize(&mut ser).map_err(se_err_py_err)?; + ser.into_inner() + } + (Some(indent), false) => { let indent = vec![b' '; indent]; let formatter = PrettyFormatter::with_indent(&indent); let mut ser = PythonSerializer::with_formatter(writer, formatter); serializer.serialize(&mut ser).map_err(se_err_py_err)?; ser.into_inner() } - None => { + (None, true) => { + let mut ser = PythonSerializer::with_formatter(writer, EscapeNonAsciiFormatter); + serializer.serialize(&mut ser).map_err(se_err_py_err)?; + ser.into_inner() + } + (None, false) => { let mut ser = PythonSerializer::new(writer); serializer.serialize(&mut ser).map_err(se_err_py_err)?; ser.into_inner() } }; + Ok(bytes) } diff --git a/src/serializers/type_serializers/json.rs b/src/serializers/type_serializers/json.rs index 6b8b4294f..fb2a53f6b 100644 --- a/src/serializers/type_serializers/json.rs +++ b/src/serializers/type_serializers/json.rs @@ -54,7 +54,7 @@ impl TypeSerializer for JsonSerializer { extra: &Extra, ) -> PyResult { if extra.round_trip { - let bytes = to_json_bytes(value, &self.serializer, include, exclude, extra, None, 0)?; + let bytes = to_json_bytes(value, &self.serializer, include, exclude, extra, None, false, 0)?; let py = value.py(); let s = from_utf8(&bytes).map_err(|e| utf8_py_error(py, e, &bytes))?; Ok(PyString::new(py, s).into()) @@ -65,7 +65,7 @@ impl TypeSerializer for JsonSerializer { fn json_key<'a>(&self, key: &'a Bound<'_, PyAny>, extra: &Extra) -> PyResult> { if extra.round_trip { - let bytes = to_json_bytes(key, &self.serializer, None, None, extra, None, 0)?; + let bytes = to_json_bytes(key, &self.serializer, None, None, extra, None, false, 0)?; let py = key.py(); let s = from_utf8(&bytes).map_err(|e| utf8_py_error(py, e, &bytes))?; Ok(Cow::Owned(s.to_string())) @@ -83,8 +83,8 @@ impl TypeSerializer for JsonSerializer { extra: &Extra, ) -> Result { if extra.round_trip { - let bytes = - to_json_bytes(value, &self.serializer, include, exclude, extra, None, 0).map_err(py_err_se_err)?; + let bytes = to_json_bytes(value, &self.serializer, include, exclude, extra, None, false, 0) + .map_err(py_err_se_err)?; match from_utf8(&bytes) { Ok(s) => serializer.serialize_str(s), Err(e) => Err(Error::custom(e.to_string())), diff --git a/tests/serializers/test_string.py b/tests/serializers/test_string.py index f547d49b2..555edf1d2 100644 --- a/tests/serializers/test_string.py +++ b/tests/serializers/test_string.py @@ -23,6 +23,38 @@ def test_str(): assert json.loads(json_emoji) == 'emoji 💩' +# Tests borrowed from: +# - https://github.com/python/cpython/blob/d87e7f35/Lib/test/test_json/test_encode_basestring_ascii.py +# - https://github.com/python/cpython/blob/d87e7f35/Lib/test/test_json/test_unicode.py +@pytest.mark.parametrize( + ['input', 'expected'], + [ + ( + '/\\"\ucafe\ubabe\uab98\ufcde\ubcda\uef4a\x08\x0c\n\r\t`1~!@#$%^&*()_+-=[]{}|;:\',./<>?', + '"/\\\\\\"\\ucafe\\ubabe\\uab98\\ufcde\\ubcda\\uef4a\\b\\f\\n\\r\\t`1~!@#$%^&*()_+-=[]{}|;:\',./<>?"', + ), + ('\u0123\u4567\u89ab\ucdef\uabcd\uef4a', '"\\u0123\\u4567\\u89ab\\ucdef\\uabcd\\uef4a"'), + ('controls', '"controls"'), + ('\x08\x0c\n\r\t', '"\\b\\f\\n\\r\\t"'), + ( + '{"object with 1 member":["array with 1 element"]}', + '"{\\"object with 1 member\\":[\\"array with 1 element\\"]}"', + ), + (' s p a c e d ', '" s p a c e d "'), + ('\U0001d120', '"\\ud834\\udd20"'), + ('\u03b1\u03a9', '"\\u03b1\\u03a9"'), + ("`1~!@#$%^&*()_+-={':[,]}|;.?", '"`1~!@#$%^&*()_+-={\':[,]}|;.?"'), + ('\x08\x0c\n\r\t', '"\\b\\f\\n\\r\\t"'), + ('\u0123\u4567\u89ab\ucdef\uabcd\uef4a', '"\\u0123\\u4567\\u89ab\\ucdef\\uabcd\\uef4a"'), + ('\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}', '"\\u03b1\\u03a9"'), + ('\U0001d120', '"\\ud834\\udd20"'), + ], +) +def test_str_ensure_ascii(input: str, expected: str) -> None: + v = SchemaSerializer(core_schema.str_schema()) + assert v.to_json(input, ensure_ascii=True).decode('utf-8') == expected + + def test_huge_str(): v = SchemaSerializer(core_schema.int_schema()) msg = r"Expected `int` - serialized value may not be as expected \[input_value='123456789012345678901234...89012345678901234567890', input_type=str\]" diff --git a/tests/test_json.py b/tests/test_json.py index 4d40ceb16..dab2207f9 100644 --- a/tests/test_json.py +++ b/tests/test_json.py @@ -218,6 +218,9 @@ def test_to_json(): assert to_json([1, 2]) == b'[1,2]' assert to_json([1, 2], indent=2) == b'[\n 1,\n 2\n]' assert to_json([1, b'x']) == b'[1,"x"]' + assert to_json(['à', 'é']).decode('utf-8') == '["à","é"]' + assert to_json(['à', 'é'], indent=2).decode('utf-8') == '[\n "à",\n "é"\n]' + assert to_json(['à', 'é'], indent=2, ensure_ascii=True).decode('utf-8') == '[\n "\\u00e0",\n "\\u00e9"\n]' # kwargs required with pytest.raises(TypeError, match=r'to_json\(\) takes 1 positional arguments but 2 were given'):