Skip to content

Serde support for serializing and deserializing binary blobs in XML files #788

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ serde = { version = ">=1.0.139", optional = true }
tokio = { version = "1.10", optional = true, default-features = false, features = ["io-util"] }
memchr = "2.1"
arbitrary = { version = "1", features = ["derive"], optional = true }
ref-cast = "1"

[dev-dependencies]
criterion = "0.4"
Expand Down
5 changes: 5 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,15 @@

### New Features

- [#623]: Added `Reader::stream()` that can be used to read arbitrary data
from the inner reader while tracking the position for the XML reader.

### Bug Fixes

### Misc Changes

[#623]: https://github.com/tafia/quick-xml/issues/623


## 0.36.0 -- 2024-07-08

Expand Down
7 changes: 5 additions & 2 deletions src/de/map.rs
Original file line number Diff line number Diff line change
Expand Up @@ -247,15 +247,15 @@ where
// We shouldn't have both `$value` and `$text` fields in the same
// struct, so if we have a `$value` field, then we should deserialize
// text content to `$value`
DeEvent::Text(_) if self.has_value_field => {
DeEvent::Text(_) | DeEvent::Binary(_) if self.has_value_field => {
self.source = ValueSource::Content;
// Deserialize `key` from special attribute name which means
// that value should be taken from the text content of the
// XML node
let de = BorrowedStrDeserializer::<DeError>::new(VALUE_KEY);
seed.deserialize(de).map(Some)
}
DeEvent::Text(_) => {
DeEvent::Text(_) | DeEvent::Binary(_) => {
self.source = ValueSource::Text;
// Deserialize `key` from special attribute name which means
// that value should be taken from the text content of the
Expand Down Expand Up @@ -943,6 +943,9 @@ where
// SAFETY: we just checked that the next event is Text
_ => unreachable!(),
},
DeEvent::Binary(_) => Err(Self::Error::Unsupported(
"undecodable binary data among a sequence of xml elements".into(),
)),
DeEvent::Start(_) => match self.map.de.next()? {
DeEvent::Start(start) => seed
.deserialize(ElementDeserializer {
Expand Down
144 changes: 126 additions & 18 deletions src/de/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2005,7 +2005,7 @@ use crate::{
errors::Error,
events::{BytesCData, BytesEnd, BytesStart, BytesText, Event},
name::QName,
reader::Reader,
reader::{Config, Reader},
};
use serde::de::{self, Deserialize, DeserializeOwned, DeserializeSeed, SeqAccess, Visitor};
use std::borrow::Cow;
Expand Down Expand Up @@ -2056,6 +2056,31 @@ impl<'a> From<&'a str> for Text<'a> {
}
}

/// Undecoded content of an XML text node, kept as raw bytes.
///
/// The deserializer emits it (wrapped in [`DeEvent::Binary`]) when unescaping
/// a text event fails, so the bytes can still be handed to a visitor through
/// `visit_borrowed_bytes` / `visit_byte_buf`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Binary<'a> {
/// Raw bytes of the node, either borrowed from the input or owned.
pub text: Cow<'a, [u8]>,
}

impl<'a> Deref for Binary<'a> {
    type Target = [u8];

    /// Exposes the payload as a plain byte slice, regardless of whether the
    /// underlying `Cow` is borrowed or owned.
    #[inline]
    fn deref(&self) -> &[u8] {
        // `&Cow<[u8]>` coerces to `&[u8]` at the return site.
        &self.text
    }
}

impl<'a> From<&'a [u8]> for Binary<'a> {
#[inline]
fn from(text: &'a [u8]) -> Self {
Self {
text: Cow::Borrowed(text),
}
}
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Simplified event which contains only these variants that used by deserializer
Expand All @@ -2074,6 +2099,8 @@ pub enum DeEvent<'a> {
/// [`Comment`]: Event::Comment
/// [`PI`]: Event::PI
Text(Text<'a>),
/// Raw bytes of a text node that could not be unescaped and are passed on undecoded.
Binary(Binary<'a>),
/// End of XML document.
Eof,
}
Expand Down Expand Up @@ -2179,19 +2206,22 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
/// occurs. Content of all events would be appended to `result` and returned
/// as [`DeEvent::Text`].
///
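/// If the resulting text is empty, this function returns `None` to avoid creating an empty event.
/// NOTE(review): the body shown here always returns `Some`; confirm that an empty-text check is not missing.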
///
/// [`Text`]: PayloadEvent::Text
/// [`CData`]: PayloadEvent::CData
fn drain_text(&mut self, mut result: Cow<'i, str>) -> Result<DeEvent<'i>, DeError> {
fn drain_text(&mut self, mut result: Cow<'i, str>) -> Result<Option<DeEvent<'i>>, DeError> {
loop {
if self.current_event_is_last_text() {
break;
}

match self.next_impl()? {
PayloadEvent::Text(mut e) => {
if self.current_event_is_last_text() {
// FIXME: Actually, we should trim after decoding text, but now we trim before
e.inplace_trim_end();
if self.reader.config().trim_text_end {
e.inplace_trim_end();
}
}
result
.to_mut()
Expand All @@ -2200,10 +2230,12 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
PayloadEvent::CData(e) => result.to_mut().push_str(&e.decode()?),

// SAFETY: current_event_is_last_text checks that event is Text or CData
_ => unreachable!("Only `Text` and `CData` events can come here"),
e => {
unreachable!("Only `Text` and `CData` events can come here: {:?}", &e);
}
}
}
Ok(DeEvent::Text(Text { text: result }))
Ok(Some(DeEvent::Text(Text { text: result })))
}

/// Return an input-borrowing event.
Expand All @@ -2213,13 +2245,29 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
PayloadEvent::Start(e) => Ok(DeEvent::Start(e)),
PayloadEvent::End(e) => Ok(DeEvent::End(e)),
PayloadEvent::Text(mut e) => {
if self.current_event_is_last_text() && e.inplace_trim_end() {
// FIXME: Actually, we should trim after decoding text, but now we trim before
continue;
if self.current_event_is_last_text() {
if self.reader.config().trim_text_end && e.inplace_trim_end() {
continue;
}
}

match e
.unescape_with(|entity| self.entity_resolver.resolve(entity))
.map(|res| self.drain_text(res))
{
Ok(Ok(None)) => continue,
Ok(Ok(Some(x))) => Ok(x),
Ok(Err(x)) => Err(x),
// Unescaping failed: treat the raw bytes as a binary blob.
Err(_) => Ok(DeEvent::Binary(Binary {
text: e.into_inner(),
})),
}
self.drain_text(e.unescape_with(|entity| self.entity_resolver.resolve(entity))?)
}
PayloadEvent::CData(e) => self.drain_text(e.decode()?),
PayloadEvent::CData(e) => match self.drain_text(e.decode()?).transpose() {
None => continue,
Some(x) => x,
},
PayloadEvent::DocType(e) => {
self.entity_resolver
.capture(e)
Expand Down Expand Up @@ -2296,6 +2344,16 @@ where
T::deserialize(&mut de)
}

/// Deserializes a value of type `T` from an already constructed [`Reader`].
///
/// The reader is passed unchanged to [`Deserializer::from_custom_reader`],
/// and the resulting deserializer then drives `T`'s `Deserialize`
/// implementation.
///
/// # Errors
///
/// Returns the [`DeError`] reported by `T::deserialize`.
pub fn from_custom_reader<R, T>(reader: Reader<R>) -> Result<T, DeError>
where
    R: BufRead,
    T: DeserializeOwned,
{
    T::deserialize(&mut Deserializer::from_custom_reader(reader))
}

// TODO: According to the https://www.w3.org/TR/xmlschema11-2/#boolean,
// valid boolean representations are only "true", "false", "1", and "0"
fn str2bool<'de, V>(value: &str, visitor: V) -> Result<V::Value, DeError>
Expand Down Expand Up @@ -2687,6 +2745,8 @@ where
fn read_string_impl(&mut self, allow_start: bool) -> Result<Cow<'de, str>, DeError> {
match self.next()? {
DeEvent::Text(e) => Ok(e.text),
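// Intended invariant: a Binary event is never emitted for text that decodes successfully.
// NOTE(review): `Binary` is produced when unescaping a text event fails, so this arm
// looks reachable on malformed input — confirm, and prefer returning an error over panicking.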
DeEvent::Binary(e) => unreachable!("{:?}", e),
// allow one nested level
DeEvent::Start(e) if allow_start => self.read_text(e.name()),
DeEvent::Start(e) => Err(DeError::UnexpectedStart(e.name().as_ref().to_owned())),
Expand All @@ -2708,10 +2768,12 @@ where
// The matching tag name is guaranteed by the reader
DeEvent::End(_) => Ok(e.text),
// SAFETY: Cannot be two consequent Text events, they would be merged into one
DeEvent::Text(_) => unreachable!(),
DeEvent::Text(_) | DeEvent::Binary(_) => unreachable!(),
DeEvent::Start(e) => Err(DeError::UnexpectedStart(e.name().as_ref().to_owned())),
DeEvent::Eof => Err(Error::missed_end(name, self.reader.decoder()).into()),
},
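// Intended invariant: a Binary event is never emitted for text that decodes successfully.
// NOTE(review): `Binary` is produced when unescaping a text event fails, so this arm
// looks reachable on malformed input — confirm, and prefer returning an error over panicking.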
DeEvent::Binary(e) => unreachable!("{:?}", e),
// We can get End event in case of `<tag></tag>` or `<tag/>` input
// Return empty text in that case
// The matching tag name is guaranteed by the reader
Expand Down Expand Up @@ -2827,6 +2889,30 @@ where
}
}

impl<'de, R> Deserializer<'de, IoReader<R>>
where
    R: BufRead,
{
    /// Builds a deserializer on top of an already constructed [`Reader`].
    ///
    /// The reader is wrapped in an [`IoReader`] with a default start trimmer
    /// and an empty internal buffer; data read from it is copied into that
    /// buffer.
    ///
    /// If you already have a string use [`Self::from_str`] instead, because it
    /// will borrow instead of copy. If you have `&[u8]` which is known to represent
    /// UTF-8, you can decode it first before using [`from_str`].
    ///
    /// Deserializer created with this method will not resolve custom entities.
    pub fn from_custom_reader(reader: Reader<R>) -> Self {
        let source = IoReader {
            reader,
            start_trimmer: StartTrimmer::default(),
            buf: Vec::new(),
        };
        Self::new(source, PredefinedEntityResolver)
    }
}

impl<'de, R, E> Deserializer<'de, IoReader<R>, E>
where
R: BufRead,
Expand Down Expand Up @@ -2884,6 +2970,10 @@ where
Cow::Borrowed(s) => visitor.visit_borrowed_str(s),
Cow::Owned(s) => visitor.visit_string(s),
},
DeEvent::Binary(e) => match e.text {
Cow::Borrowed(s) => visitor.visit_borrowed_bytes(s),
Cow::Owned(s) => visitor.visit_byte_buf(s),
},
DeEvent::Eof => Err(DeError::UnexpectedEof),
}
}
Expand Down Expand Up @@ -2914,7 +3004,7 @@ where
self.read_to_end(s.name())?;
visitor.visit_unit()
}
DeEvent::Text(_) => visitor.visit_unit(),
DeEvent::Text(_) | DeEvent::Binary(_) => visitor.visit_unit(),
// SAFETY: The reader is guaranteed that we don't have unmatched tags
// If we here, then out deserializer has a bug
DeEvent::End(e) => unreachable!("{:?}", e),
Expand Down Expand Up @@ -3022,7 +3112,7 @@ impl StartTrimmer {
/// Converts raw reader's event into a payload event.
/// Returns `None`, if event should be skipped.
#[inline(always)]
fn trim<'a>(&mut self, event: Event<'a>) -> Option<PayloadEvent<'a>> {
fn trim<'a>(&mut self, event: Event<'a>, trim_text_start: bool) -> Option<PayloadEvent<'a>> {
let (event, trim_next_event) = match event {
Event::DocType(e) => (PayloadEvent::DocType(e), true),
Event::Start(e) => (PayloadEvent::Start(e), true),
Expand All @@ -3033,7 +3123,10 @@ impl StartTrimmer {
Event::CData(e) => (PayloadEvent::CData(e), false),
Event::Text(mut e) => {
// If event is empty after trimming, skip it
if self.trim_start && e.inplace_trim_start() {
// Or if event is all white space, skip it regardless of trimming settings
if (trim_text_start && self.trim_start && e.inplace_trim_start())
|| e.is_all_whitespace()
{
return None;
}
(PayloadEvent::Text(e), false)
Expand Down Expand Up @@ -3071,6 +3164,9 @@ pub trait XmlRead<'i> {

/// A copy of the reader's decoder used to decode strings.
fn decoder(&self) -> Decoder;

/// Returns a reference to the reader config.
fn config(&self) -> &Config;
}

/// XML input source that reads from a std::io input stream.
Expand Down Expand Up @@ -3123,8 +3219,9 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader<R> {
loop {
self.buf.clear();

let trim_text_start = self.reader.config().trim_text_start;
let event = self.reader.read_event_into(&mut self.buf)?;
if let Some(event) = self.start_trimmer.trim(event) {
if let Some(event) = self.start_trimmer.trim(event, trim_text_start) {
return Ok(event.into_owned());
}
}
Expand All @@ -3140,6 +3237,10 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader<R> {
/// Returns the decoder reported by the wrapped reader (plain delegation).
fn decoder(&self) -> Decoder {
self.reader.decoder()
}

/// Returns a reference to the wrapped reader's configuration (plain delegation).
fn config(&self) -> &Config {
self.reader.config()
}
}

/// XML input source that reads from a slice of bytes and can borrow from it.
Expand Down Expand Up @@ -3189,7 +3290,10 @@ impl<'de> XmlRead<'de> for SliceReader<'de> {
fn next(&mut self) -> Result<PayloadEvent<'de>, DeError> {
loop {
let event = self.reader.read_event()?;
if let Some(event) = self.start_trimmer.trim(event) {
if let Some(event) = self
.start_trimmer
.trim(event, self.config().trim_text_start)
{
return Ok(event);
}
}
Expand All @@ -3205,6 +3309,10 @@ impl<'de> XmlRead<'de> for SliceReader<'de> {
/// Returns the decoder reported by the wrapped reader (plain delegation).
fn decoder(&self) -> Decoder {
self.reader.decoder()
}

/// Returns a reference to the wrapped reader's configuration (plain delegation).
fn config(&self) -> &Config {
self.reader.config()
}
}

#[cfg(test)]
Expand Down Expand Up @@ -4363,7 +4471,7 @@ mod tests {
fn start() {
let mut de = make_de(" text <tag1><tag2>");
// Text is trimmed from both sides
assert_eq!(de.next().unwrap(), DeEvent::Text("text".into()));
assert_eq!(de.next().unwrap(), DeEvent::Text(" text ".into()));
assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag1")));
assert_eq!(de.next().unwrap(), DeEvent::Start(BytesStart::new("tag2")));
assert_eq!(de.next().unwrap(), DeEvent::Eof);
Expand Down
2 changes: 1 addition & 1 deletion src/de/var.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ where
seed.deserialize(QNameDeserializer::from_elem(e.raw_name(), decoder)?)?,
false,
),
DeEvent::Text(_) => (
DeEvent::Text(_) | DeEvent::Binary(_) => (
seed.deserialize(BorrowedStrDeserializer::<DeError>::new(TEXT_KEY))?,
true,
),
Expand Down
6 changes: 6 additions & 0 deletions src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -464,4 +464,10 @@ pub mod serialize {
Self::Custom(e.to_string())
}
}
impl From<std::io::Error> for DeError {
#[inline]
fn from(e: std::io::Error) -> Self {
Self::Custom(e.to_string())
}
}
}
7 changes: 6 additions & 1 deletion src/events/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ use crate::escape::{
use crate::name::{LocalName, QName};
#[cfg(feature = "serialize")]
use crate::utils::CowRef;
use crate::utils::{name_len, trim_xml_end, trim_xml_start, write_cow_string};
use crate::utils::{is_whitespace, name_len, trim_xml_end, trim_xml_start, write_cow_string};
use attributes::{Attribute, Attributes};

/// Opening tag data (`Event::Start`), with optional attributes: `<name attr="value">`.
Expand Down Expand Up @@ -622,6 +622,11 @@ impl<'a> BytesText<'a> {
self.content = trim_cow(replace(&mut self.content, Cow::Borrowed(b"")), trim_xml_end);
self.content.is_empty()
}

/// Returns `true` if every byte of the content is classified as whitespace
/// by `is_whitespace`.
///
/// Empty content also yields `true`, because there is no byte that fails
/// the check.
///
/// The method only reads the content, so it borrows `self` immutably;
/// callers that hold a mutable binding can still call it unchanged.
pub fn is_all_whitespace(&self) -> bool {
    self.content.iter().all(|&byte| is_whitespace(byte))
}
}

impl<'a> Debug for BytesText<'a> {
Expand Down
Loading
Loading