Skip to content

Commit 56246ea

Browse files
committed
add serializer for binary xml text + tweak binary deserializer
Added reading the config to determine trimming options.
1 parent 47e2a69 commit 56246ea

File tree

11 files changed

+4118
-1764
lines changed

11 files changed

+4118
-1764
lines changed

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ serde = { version = ">=1.0.139", optional = true }
2525
tokio = { version = "1.10", optional = true, default-features = false, features = ["io-util"] }
2626
memchr = "2.1"
2727
arbitrary = { version = "1", features = ["derive"], optional = true }
28+
ref-cast = "1"
2829

2930
[dev-dependencies]
3031
criterion = "0.4"

src/de/mod.rs

Lines changed: 61 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2005,7 +2005,7 @@ use crate::{
20052005
errors::Error,
20062006
events::{BytesCData, BytesEnd, BytesStart, BytesText, Event},
20072007
name::QName,
2008-
reader::Reader,
2008+
reader::{Config, Reader},
20092009
};
20102010
use serde::de::{self, Deserialize, DeserializeOwned, DeserializeSeed, SeqAccess, Visitor};
20112011
use std::borrow::Cow;
@@ -2169,6 +2169,31 @@ struct XmlReader<'i, R: XmlRead<'i>, E: EntityResolver = PredefinedEntityResolve
21692169
entity_resolver: E,
21702170
}
21712171

2172+
/// Applies `trim` to `value`, preserving whether the string is borrowed or owned.
///
/// - A borrowed string is re-sliced, so no allocation happens.
/// - An owned string is returned untouched when `trim` removed nothing
///   (the lengths match); otherwise the trimmed part is copied into a new
///   `String`.
fn trim_cow<'a, F>(value: Cow<'a, str>, trim: F) -> Cow<'a, str>
where
    F: FnOnce(&str) -> &str,
{
    // Borrowed input: the trimmed slice lives as long as the input does.
    let owned = match value {
        Cow::Borrowed(text) => return Cow::Borrowed(trim(text)),
        Cow::Owned(owned) => owned,
    };

    let kept = trim(&owned);
    if kept.len() == owned.len() {
        // Nothing was removed, keep the existing allocation.
        Cow::Owned(owned)
    } else {
        Cow::Owned(kept.to_string())
    }
}
2187+
2188+
/// Removes trailing XML whitespace bytes from text content.
2189+
///
2190+
/// Returns `true` if content is empty after that
2191+
fn inplace_trim_end(mut s: &mut Cow<str>) -> bool {
2192+
let c: Cow<str> = replace(&mut s, Cow::Borrowed(""));
2193+
*s = trim_cow(c, str::trim_end);
2194+
s.is_empty()
2195+
}
2196+
21722197
impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
21732198
fn new(mut reader: R, entity_resolver: E) -> Self {
21742199
// Lookahead by one event immediately, so we do not need to check in the
@@ -2206,20 +2231,23 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
22062231
/// Read all consecutive [`Text`] and [`CData`] events until a non-text event
22072232
/// occurs. Content of all events would be appended to `result` and returned
22082233
/// as [`DeEvent::Text`].
2234+
///
2235+
/// If the resulting text is empty, this function returns `None` to avoid creating an empty event.
22092236
///
22102237
/// [`Text`]: PayloadEvent::Text
22112238
/// [`CData`]: PayloadEvent::CData
2212-
fn drain_text(&mut self, mut result: Cow<'i, str>) -> Result<DeEvent<'i>, DeError> {
2239+
fn drain_text(&mut self, mut result: Cow<'i, str>) -> Result<Option<DeEvent<'i>>, DeError> {
22132240
loop {
22142241
if self.current_event_is_last_text() {
22152242
break;
22162243
}
2217-
22182244
match self.next_impl()? {
22192245
PayloadEvent::Text(mut e) => {
22202246
if self.current_event_is_last_text() {
22212247
// FIXME: Actually, we should trim after decoding text, but now we trim before
2222-
e.inplace_trim_end();
2248+
if self.reader.config().trim_text_end {
2249+
e.inplace_trim_end();
2250+
}
22232251
}
22242252
result
22252253
.to_mut()
@@ -2228,10 +2256,12 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
22282256
PayloadEvent::CData(e) => result.to_mut().push_str(&e.decode()?),
22292257

22302258
// SAFETY: current_event_is_last_text checks that event is Text or CData
2231-
_ => unreachable!("Only `Text` and `CData` events can come here"),
2259+
e => {
2260+
unreachable!("Only `Text` and `CData` events can come here: {:?}", &e);
2261+
}
22322262
}
22332263
}
2234-
Ok(DeEvent::Text(Text { text: result }))
2264+
Ok(Some(DeEvent::Text(Text { text: result })))
22352265
}
22362266

22372267
/// Return an input-borrowing event.
@@ -2241,17 +2271,24 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
22412271
PayloadEvent::Start(e) => Ok(DeEvent::Start(e)),
22422272
PayloadEvent::End(e) => Ok(DeEvent::End(e)),
22432273
PayloadEvent::Text(mut e) => {
2244-
if self.current_event_is_last_text() && e.inplace_trim_end() {
2245-
// FIXME: Actually, we should trim after decoding text, but now we trim before
2246-
continue;
2274+
if self.current_event_is_last_text() {
2275+
if self.reader.config().trim_text_end && e.inplace_trim_end() {
2276+
continue;
2277+
}
22472278
}
2279+
22482280
match e.unescape_with(|entity| self.entity_resolver.resolve(entity)).map(|res| self.drain_text(res)) {
2249-
Ok(x) => x,
2281+
Ok(Ok(None)) => continue,
2282+
Ok(Ok(Some(x))) => Ok(x),
2283+
Ok(Err(x)) => Err(x),
22502284
// Failed to unescape: treat the content as a binary blob.
22512285
Err(_) => Ok(DeEvent::Binary(Binary { text: e.into_inner() })),
22522286
}
22532287
}
2254-
PayloadEvent::CData(e) => self.drain_text(e.decode()?),
2288+
PayloadEvent::CData(e) => match self.drain_text(e.decode()?).transpose() {
2289+
None => continue,
2290+
Some(x) => x,
2291+
},
22552292
PayloadEvent::DocType(e) => {
22562293
self.entity_resolver
22572294
.capture(e)
@@ -2834,6 +2871,8 @@ where
28342871
pub fn from_str_with_resolver(source: &'de str, entity_resolver: E) -> Self {
28352872
let mut reader = Reader::from_str(source);
28362873
let config = reader.config_mut();
2874+
config.trim_text_start = true;
2875+
config.trim_text_end = true;
28372876
config.expand_empty_elements = true;
28382877

28392878
Self::new(
@@ -3135,6 +3174,9 @@ pub trait XmlRead<'i> {
31353174

31363175
/// A copy of the reader's decoder used to decode strings.
31373176
fn decoder(&self) -> Decoder;
3177+
3178+
/// Returns a reference to the reader config.
3179+
fn config(&self) -> &Config;
31383180
}
31393181

31403182
/// XML input source that reads from a std::io input stream.
@@ -3204,6 +3246,10 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader<R> {
32043246
fn decoder(&self) -> Decoder {
32053247
self.reader.decoder()
32063248
}
3249+
3250+
/// Returns a reference to the config of the wrapped reader.
fn config(&self) -> &Config {
    self.reader.config()
}
32073253
}
32083254

32093255
/// XML input source that reads from a slice of bytes and can borrow from it.
@@ -3269,6 +3315,10 @@ impl<'de> XmlRead<'de> for SliceReader<'de> {
32693315
fn decoder(&self) -> Decoder {
32703316
self.reader.decoder()
32713317
}
3318+
3319+
/// Returns a reference to the config of the wrapped reader.
fn config(&self) -> &Config {
    self.reader.config()
}
32723322
}
32733323

32743324
#[cfg(test)]

src/se/content.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ macro_rules! write_primitive {
5858
/// with indent, sequence of strings become one big string with additional content
5959
/// and it would be impossible to distinguish between content of the original
6060
/// strings and inserted indent characters.
61-
pub struct ContentSerializer<'w, 'i, W: Write> {
61+
pub struct ContentSerializer<'w, 'i, W> {
6262
pub writer: &'w mut W,
6363
/// Defines which XML characters need to be escaped in text content
6464
pub level: QuoteLevel,

src/se/element.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ macro_rules! write_primitive {
5656
/// - other variants are not supported ([`DeError::Unsupported`] is returned);
5757
///
5858
/// Usage of empty tags depends on the [`ContentSerializer::expand_empty_elements`] setting.
59-
pub struct ElementSerializer<'w, 'k, W: Write> {
59+
pub struct ElementSerializer<'w, 'k, W> {
6060
/// The inner serializer that contains the settings and mostly do the actual work
6161
pub ser: ContentSerializer<'w, 'k, W>,
6262
/// Tag name used to wrap serialized types except enum variants which uses the variant name

0 commit comments

Comments
 (0)