Skip to content

Add ensure_ascii option #1689

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions python/pydantic_core/_pydantic_core.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,7 @@ class SchemaSerializer:
value: Any,
*,
indent: int | None = None,
ensure_ascii: bool = False,
include: _IncEx | None = None,
exclude: _IncEx | None = None,
by_alias: bool | None = None,
Expand All @@ -363,6 +364,8 @@ class SchemaSerializer:
Arguments:
value: The Python object to serialize.
indent: If `None`, the JSON will be compact, otherwise it will be pretty-printed with the indent provided.
ensure_ascii: If `True`, the output is guaranteed to have all incoming non-ASCII characters escaped.
If `False` (the default), these characters will be output as-is.
include: A set of fields to include, if `None` all fields are included.
exclude: A set of fields to exclude, if `None` no fields are excluded.
by_alias: Whether to use the alias names of fields.
Expand Down Expand Up @@ -390,6 +393,7 @@ def to_json(
value: Any,
*,
indent: int | None = None,
ensure_ascii: bool = False,
include: _IncEx | None = None,
exclude: _IncEx | None = None,
# Note: In Pydantic 2.11, the default value of `by_alias` on `SchemaSerializer` was changed from `True` to `None`,
Expand All @@ -414,6 +418,8 @@ def to_json(
Arguments:
value: The Python object to serialize.
indent: If `None`, the JSON will be compact, otherwise it will be pretty-printed with the indent provided.
ensure_ascii: If `True`, the output is guaranteed to have all incoming non-ASCII characters escaped.
If `False` (the default), these characters will be output as-is.
include: A set of fields to include, if `None` all fields are included.
exclude: A set of fields to exclude, if `None` no fields are excluded.
by_alias: Whether to use the alias names of fields.
Expand Down
20 changes: 16 additions & 4 deletions src/serializers/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -155,14 +155,15 @@ impl SchemaSerializer {
}

#[allow(clippy::too_many_arguments)]
#[pyo3(signature = (value, *, indent = None, include = None, exclude = None, by_alias = None,
#[pyo3(signature = (value, *, indent = None, ensure_ascii = false, include = None, exclude = None, by_alias = None,
exclude_unset = false, exclude_defaults = false, exclude_none = false, round_trip = false, warnings = WarningsArg::Bool(true),
fallback = None, serialize_as_any = false, context = None))]
pub fn to_json(
&self,
py: Python,
value: &Bound<'_, PyAny>,
indent: Option<usize>,
ensure_ascii: Option<bool>,
include: Option<&Bound<'_, PyAny>>,
exclude: Option<&Bound<'_, PyAny>>,
by_alias: Option<bool>,
Expand Down Expand Up @@ -204,6 +205,7 @@ impl SchemaSerializer {
exclude,
&extra,
indent,
ensure_ascii.unwrap_or(false),
self.expected_json_size.load(Ordering::Relaxed),
)?;

Expand Down Expand Up @@ -239,14 +241,15 @@ impl SchemaSerializer {

#[allow(clippy::too_many_arguments)]
#[pyfunction]
#[pyo3(signature = (value, *, indent = None, include = None, exclude = None, by_alias = true,
exclude_none = false, round_trip = false, timedelta_mode = "iso8601", bytes_mode = "utf8",
#[pyo3(signature = (value, *, indent = None, ensure_ascii = false, include = None, exclude = None,
by_alias = true, exclude_none = false, round_trip = false, timedelta_mode = "iso8601", bytes_mode = "utf8",
inf_nan_mode = "constants", serialize_unknown = false, fallback = None, serialize_as_any = false,
context = None))]
pub fn to_json(
py: Python,
value: &Bound<'_, PyAny>,
indent: Option<usize>,
ensure_ascii: Option<bool>,
include: Option<&Bound<'_, PyAny>>,
exclude: Option<&Bound<'_, PyAny>>,
by_alias: bool,
Expand Down Expand Up @@ -274,7 +277,16 @@ pub fn to_json(
context,
);
let serializer = type_serializers::any::AnySerializer.into();
let bytes = to_json_bytes(value, &serializer, include, exclude, &extra, indent, 1024)?;
let bytes = to_json_bytes(
value,
&serializer,
include,
exclude,
&extra,
indent,
ensure_ascii.unwrap_or(false),
1024,
)?;
state.final_check(py)?;
let py_bytes = PyBytes::new(py, &bytes);
Ok(py_bytes.into())
Expand Down
89 changes: 85 additions & 4 deletions src/serializers/shared.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::borrow::Cow;
use std::fmt::Debug;
use std::io::{self, Write};

use pyo3::exceptions::PyTypeError;
use pyo3::prelude::*;
Expand All @@ -9,7 +10,7 @@ use pyo3::{intern, PyTraverseError, PyVisit};

use enum_dispatch::enum_dispatch;
use serde::Serialize;
use serde_json::ser::PrettyFormatter;
use serde_json::ser::{Formatter, PrettyFormatter};

use crate::build_tools::py_schema_err;
use crate::build_tools::py_schema_error_type;
Expand Down Expand Up @@ -349,6 +350,71 @@ impl Serialize for PydanticSerializer<'_> {
}
}

/// serde_json `Formatter` that passes ASCII through verbatim and replaces
/// every non-ASCII character with `\uXXXX` escapes — one escape per UTF-16
/// code unit, so astral-plane characters become a surrogate pair. This
/// mirrors the output shape of CPython's `json.dumps(..., ensure_ascii=True)`
/// (see the borrowed CPython test vectors in `tests/serializers/test_string.py`).
struct EscapeNonAsciiFormatter;

impl Formatter for EscapeNonAsciiFormatter {
    fn write_string_fragment<W: ?Sized + Write>(&mut self, writer: &mut W, fragment: &str) -> io::Result<()> {
        let mut utf16_buf = [0u16; 2];
        for ch in fragment.chars() {
            if ch.is_ascii() {
                // An ASCII char is exactly one byte in UTF-8; write it unchanged.
                writer.write_all(&[ch as u8])?;
            } else {
                // Emit `\uXXXX` for each UTF-16 code unit of the character.
                for unit in ch.encode_utf16(&mut utf16_buf) {
                    write!(writer, "\\u{unit:04x}")?;
                }
            }
        }
        Ok(())
    }
}

/// Pretty-printing formatter with non-ASCII escaping: indentation and
/// structural punctuation are delegated to serde_json's [`PrettyFormatter`],
/// while string fragments are routed through [`EscapeNonAsciiFormatter`].
struct EscapeNonAsciiPrettyFormatter<'a> {
    pretty: PrettyFormatter<'a>,
    escape_non_ascii: EscapeNonAsciiFormatter,
}

impl<'a> EscapeNonAsciiPrettyFormatter<'a> {
    /// Creates a formatter that indents nested values with `indent`
    /// (typically a run of spaces built by the caller).
    pub fn with_indent(indent: &'a [u8]) -> Self {
        let pretty = PrettyFormatter::with_indent(indent);
        Self {
            escape_non_ascii: EscapeNonAsciiFormatter,
            pretty,
        }
    }
}

/// Forwards a `Formatter` trait method to one of this struct's two inner
/// formatters. The one-argument form covers methods that take only a writer;
/// the two-argument form covers methods that take a writer plus one value.
// NOTE(review): per the PR discussion, this per-method delegation will not
// scale well if many more methods need forwarding; a different API may be
// preferable in the future.
macro_rules! defer {
    ($delegate:ident, $method:ident) => {
        fn $method<W>(&mut self, writer: &mut W) -> io::Result<()>
        where
            W: ?Sized + io::Write,
        {
            self.$delegate.$method(writer)
        }
    };
    ($delegate:ident, $method:ident, $arg:ty) => {
        fn $method<W>(&mut self, writer: &mut W, value: $arg) -> io::Result<()>
        where
            W: ?Sized + io::Write,
        {
            self.$delegate.$method(writer, value)
        }
    };
}

#[allow(clippy::needless_lifetimes)]
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like a false positive from Clippy? There are already a bunch.

impl<'a> Formatter for EscapeNonAsciiPrettyFormatter<'a> {
defer!(escape_non_ascii, write_string_fragment, &str);
defer!(pretty, begin_array);
defer!(pretty, end_array);
defer!(pretty, begin_array_value, bool);
defer!(pretty, end_array_value);
defer!(pretty, begin_object);
defer!(pretty, end_object);
defer!(pretty, begin_object_key, bool);
defer!(pretty, end_object_key);
defer!(pretty, begin_object_value);
defer!(pretty, end_object_value);
}

// Serializes `value` to JSON bytes, choosing a serde_json formatter from the
// `(indent, ensure_ascii)` combination.
// NOTE(review): this span is a PR diff view — an "Expand All" marker below
// hides some parameter lines (the `serializer`/`include` arguments visible at
// the call sites), and the old `match indent` hunk lines are interleaved with
// the new `match (indent, ensure_ascii)` arms. Confirm against the full file.
#[allow(clippy::too_many_arguments)]
pub(crate) fn to_json_bytes(
value: &Bound<'_, PyAny>,
Expand All @@ -357,25 +423,40 @@ pub(crate) fn to_json_bytes(
exclude: Option<&Bound<'_, PyAny>>,
extra: &Extra,
indent: Option<usize>,
ensure_ascii: bool,
expected_json_size: usize,
) -> PyResult<Vec<u8>> {
let serializer = PydanticSerializer::new(value, serializer, include, exclude, extra);

// Pre-size the output buffer with the caller's size estimate.
let writer: Vec<u8> = Vec::with_capacity(expected_json_size);
let bytes = match indent {
Some(indent) => {

let bytes = match (indent, ensure_ascii) {
// Pretty-printed AND non-ASCII escaped.
(Some(indent), true) => {
let indent = vec![b' '; indent];
let formatter = EscapeNonAsciiPrettyFormatter::with_indent(&indent);
let mut ser = PythonSerializer::with_formatter(writer, formatter);
serializer.serialize(&mut ser).map_err(se_err_py_err)?;
ser.into_inner()
}
// Pretty-printed, UTF-8 output as-is.
(Some(indent), false) => {
let indent = vec![b' '; indent];
let formatter = PrettyFormatter::with_indent(&indent);
let mut ser = PythonSerializer::with_formatter(writer, formatter);
serializer.serialize(&mut ser).map_err(se_err_py_err)?;
ser.into_inner()
}
None => {
// Compact with non-ASCII escaping.
(None, true) => {
let mut ser = PythonSerializer::with_formatter(writer, EscapeNonAsciiFormatter);
serializer.serialize(&mut ser).map_err(se_err_py_err)?;
ser.into_inner()
}
// Compact default (serde_json's CompactFormatter).
(None, false) => {
let mut ser = PythonSerializer::new(writer);
serializer.serialize(&mut ser).map_err(se_err_py_err)?;
ser.into_inner()
}
};

Ok(bytes)
}

Expand Down
8 changes: 4 additions & 4 deletions src/serializers/type_serializers/json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ impl TypeSerializer for JsonSerializer {
extra: &Extra,
) -> PyResult<PyObject> {
if extra.round_trip {
let bytes = to_json_bytes(value, &self.serializer, include, exclude, extra, None, 0)?;
let bytes = to_json_bytes(value, &self.serializer, include, exclude, extra, None, false, 0)?;
let py = value.py();
let s = from_utf8(&bytes).map_err(|e| utf8_py_error(py, e, &bytes))?;
Ok(PyString::new(py, s).into())
Expand All @@ -65,7 +65,7 @@ impl TypeSerializer for JsonSerializer {

fn json_key<'a>(&self, key: &'a Bound<'_, PyAny>, extra: &Extra) -> PyResult<Cow<'a, str>> {
if extra.round_trip {
let bytes = to_json_bytes(key, &self.serializer, None, None, extra, None, 0)?;
let bytes = to_json_bytes(key, &self.serializer, None, None, extra, None, false, 0)?;
let py = key.py();
let s = from_utf8(&bytes).map_err(|e| utf8_py_error(py, e, &bytes))?;
Ok(Cow::Owned(s.to_string()))
Expand All @@ -83,8 +83,8 @@ impl TypeSerializer for JsonSerializer {
extra: &Extra,
) -> Result<S::Ok, S::Error> {
if extra.round_trip {
let bytes =
to_json_bytes(value, &self.serializer, include, exclude, extra, None, 0).map_err(py_err_se_err)?;
let bytes = to_json_bytes(value, &self.serializer, include, exclude, extra, None, false, 0)
.map_err(py_err_se_err)?;
match from_utf8(&bytes) {
Ok(s) => serializer.serialize_str(s),
Err(e) => Err(Error::custom(e.to_string())),
Expand Down
32 changes: 32 additions & 0 deletions tests/serializers/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,38 @@ def test_str():
assert json.loads(json_emoji) == 'emoji 💩'


# Test vectors borrowed from CPython's json test suite:
# - https://github.com/python/cpython/blob/d87e7f35/Lib/test/test_json/test_encode_basestring_ascii.py
# - https://github.com/python/cpython/blob/d87e7f35/Lib/test/test_json/test_unicode.py
@pytest.mark.parametrize(
    ['input', 'expected'],
    [
        (
            '/\\"\ucafe\ubabe\uab98\ufcde\ubcda\uef4a\x08\x0c\n\r\t`1~!@#$%^&*()_+-=[]{}|;:\',./<>?',
            '"/\\\\\\"\\ucafe\\ubabe\\uab98\\ufcde\\ubcda\\uef4a\\b\\f\\n\\r\\t`1~!@#$%^&*()_+-=[]{}|;:\',./<>?"',
        ),
        ('\u0123\u4567\u89ab\ucdef\uabcd\uef4a', '"\\u0123\\u4567\\u89ab\\ucdef\\uabcd\\uef4a"'),
        ('controls', '"controls"'),
        ('\x08\x0c\n\r\t', '"\\b\\f\\n\\r\\t"'),
        (
            '{"object with 1 member":["array with 1 element"]}',
            '"{\\"object with 1 member\\":[\\"array with 1 element\\"]}"',
        ),
        (' s p a c e d ', '" s p a c e d "'),
        ('\U0001d120', '"\\ud834\\udd20"'),
        ('\u03b1\u03a9', '"\\u03b1\\u03a9"'),
        ("`1~!@#$%^&*()_+-={':[,]}|;.</>?", '"`1~!@#$%^&*()_+-={\':[,]}|;.</>?"'),
        ('\x08\x0c\n\r\t', '"\\b\\f\\n\\r\\t"'),
        ('\u0123\u4567\u89ab\ucdef\uabcd\uef4a', '"\\u0123\\u4567\\u89ab\\ucdef\\uabcd\\uef4a"'),
        ('\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}', '"\\u03b1\\u03a9"'),
        ('\U0001d120', '"\\ud834\\udd20"'),
    ],
)
def test_str_ensure_ascii(input: str, expected: str) -> None:
    """`to_json(..., ensure_ascii=True)` escapes every non-ASCII character."""
    serializer = SchemaSerializer(core_schema.str_schema())
    encoded = serializer.to_json(input, ensure_ascii=True)
    assert encoded.decode('utf-8') == expected


def test_huge_str():
v = SchemaSerializer(core_schema.int_schema())
msg = r"Expected `int` - serialized value may not be as expected \[input_value='123456789012345678901234...89012345678901234567890', input_type=str\]"
Expand Down
3 changes: 3 additions & 0 deletions tests/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,9 @@ def test_to_json():
assert to_json([1, 2]) == b'[1,2]'
assert to_json([1, 2], indent=2) == b'[\n 1,\n 2\n]'
assert to_json([1, b'x']) == b'[1,"x"]'
assert to_json(['à', 'é']).decode('utf-8') == '["à","é"]'
assert to_json(['à', 'é'], indent=2).decode('utf-8') == '[\n "à",\n "é"\n]'
assert to_json(['à', 'é'], indent=2, ensure_ascii=True).decode('utf-8') == '[\n "\\u00e0",\n "\\u00e9"\n]'

# kwargs required
with pytest.raises(TypeError, match=r'to_json\(\) takes 1 positional arguments but 2 were given'):
Expand Down
Loading