Skip to content

Commit 0a6ecd6

Browse files
Mingundralley
authored and committed
Add reusable parser for XML element and use it internally
1 parent 02de8a5 commit 0a6ecd6

File tree

6 files changed

+132
-49
lines changed

6 files changed

+132
-49
lines changed

Changelog.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ resolve predefined entities.
4343
- `quick_xml::escape::resolve_xml_entity`
4444
- `quick_xml::escape::resolve_html5_entity`
4545
- [#753]: Added parser for processing instructions: `quick_xml::reader::PiParser`.
46+
- [#754]: Added parser for elements: `quick_xml::reader::ElementParser`.
4647

4748
### Bug Fixes
4849

@@ -101,6 +102,7 @@ resolve predefined entities.
101102
[#743]: https://github.com/tafia/quick-xml/pull/743
102103
[#748]: https://github.com/tafia/quick-xml/pull/748
103104
[#753]: https://github.com/tafia/quick-xml/pull/753
105+
[#754]: https://github.com/tafia/quick-xml/pull/754
104106
[`DeEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.DeEvent.html
105107
[`PayloadEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.PayloadEvent.html
106108
[`Text`]: https://docs.rs/quick-xml/latest/quick_xml/de/struct.Text.html

src/reader/async_tokio.rs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,7 @@ use crate::errors::{Error, Result, SyntaxError};
88
use crate::events::Event;
99
use crate::name::{QName, ResolveResult};
1010
use crate::reader::buffered_reader::impl_buffered_source;
11-
use crate::reader::{
12-
is_whitespace, BangType, NsReader, ParseState, ReadElementState, Reader, Span,
13-
};
11+
use crate::reader::{is_whitespace, BangType, ElementParser, NsReader, ParseState, Reader, Span};
1412

1513
/// A struct for reading XML asynchronously from an [`AsyncBufRead`].
1614
///

src/reader/buffered_reader.rs

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ use std::path::Path;
88
use crate::errors::{Error, Result, SyntaxError};
99
use crate::events::Event;
1010
use crate::name::QName;
11-
use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource};
11+
use crate::reader::{is_whitespace, BangType, ElementParser, Reader, Span, XmlSource};
1212

1313
macro_rules! impl_buffered_source {
1414
($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
@@ -190,19 +190,20 @@ macro_rules! impl_buffered_source {
190190
buf: &'b mut Vec<u8>,
191191
position: &mut usize,
192192
) -> Result<&'b [u8]> {
193-
let mut state = ReadElementState::Elem;
193+
let mut parser = ElementParser::default();
194194
let mut read = 0;
195195

196196
let start = buf.len();
197197
loop {
198198
match self $(.$reader)? .fill_buf() $(.$await)? {
199199
Ok(n) if n.is_empty() => break,
200200
Ok(available) => {
201-
if let Some((consumed, used)) = state.change(available) {
202-
buf.extend_from_slice(consumed);
201+
if let Some(used) = parser.feed(available) {
202+
buf.extend_from_slice(&available[..used]);
203203

204-
self $(.$reader)? .consume(used);
205-
read += used;
204+
// +1 for `>` which we do not include
205+
self $(.$reader)? .consume(used + 1);
206+
read += used + 1;
206207

207208
// Position now just after the `>` symbol
208209
*position += read;

src/reader/element.rs

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
//! Contains a parser for an XML element.
2+
3+
/// A parser that searches for a `>` symbol in the slice outside of quoted regions.
4+
///
5+
/// The parser considers two quoted regions: a double-quoted (`"..."`) and
6+
/// a single-quoted (`'...'`) region. Matches found inside those regions are not
7+
/// considered as results. Each region starts and ends by its quote symbol,
8+
/// which cannot be escaped (but can be encoded as XML character entity or named
9+
/// entity. Anyway, that encoding does not contain literal quotes).
10+
///
11+
/// To use the parser, create an instance of it and [`feed`] data into it.
12+
/// After a successful search the parser returns [`Some`] with the position of
13+
/// the found symbol. If the search is unsuccessful, [`None`] is returned. You
14+
/// would typically expect the search to succeed eventually, so you should feed
15+
/// new data until it does.
16+
///
17+
/// NOTE: after a successful match the parser does not return to the initial
18+
/// state and should not be used anymore. Create a new parser if you want to perform
19+
/// a new search.
20+
///
21+
/// # Example
22+
///
23+
/// ```
24+
/// # use quick_xml::reader::ElementParser;
25+
/// # use pretty_assertions::assert_eq;
26+
/// let mut parser = ElementParser::default();
27+
///
28+
/// // Parse `<my-element with = 'some > inside'>and the text follow...`
29+
/// // split into three chunks
30+
/// assert_eq!(parser.feed(b"<my-element"), None);
31+
/// // ...get new chunk of data
32+
/// assert_eq!(parser.feed(b" with = 'some >"), None);
33+
/// // ...get another chunk of data
34+
/// assert_eq!(parser.feed(b" inside'>and the text follow..."), Some(8));
35+
/// // ^ ^
36+
/// // 0 8
37+
/// ```
38+
///
39+
/// [`feed`]: Self::feed()
40+
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
41+
pub enum ElementParser {
42+
/// The initial state (inside element, but outside of attribute value).
43+
Outside,
44+
/// Inside a single-quoted region (`'...'`).
45+
SingleQ,
46+
/// Inside a double-quoted region (`"..."`).
47+
DoubleQ,
48+
}
49+
50+
impl ElementParser {
51+
/// Returns the position of the closing `>` in `bytes` (the `>` itself is not counted), or `None` if `>` was not found in `bytes`.
52+
#[inline]
53+
pub fn feed(&mut self, bytes: &[u8]) -> Option<usize> {
54+
for i in memchr::memchr3_iter(b'>', b'\'', b'"', bytes) {
55+
*self = match (*self, bytes[i]) {
56+
// only allowed to match `>` while we are in state `Outside`
57+
(Self::Outside, b'>') => return Some(i),
58+
(Self::Outside, b'\'') => Self::SingleQ,
59+
(Self::Outside, b'\"') => Self::DoubleQ,
60+
61+
// the only end_byte that gets us out is the same character
62+
(Self::SingleQ, b'\'') | (Self::DoubleQ, b'"') => Self::Outside,
63+
64+
// all other bytes: no state change
65+
_ => continue,
66+
};
67+
}
68+
None
69+
}
70+
}
71+
72+
impl Default for ElementParser {
73+
#[inline]
74+
fn default() -> Self {
75+
Self::Outside
76+
}
77+
}
78+
79+
#[test]
80+
fn parse() {
81+
use pretty_assertions::assert_eq;
82+
use ElementParser::*;
83+
84+
/// Returns `Ok(pos)` with the position in the buffer where the element ends.
85+
///
86+
/// Returns `Err(internal_state)` if parsing is not done yet.
87+
fn parse_element(bytes: &[u8], mut parser: ElementParser) -> Result<usize, ElementParser> {
88+
match parser.feed(bytes) {
89+
Some(i) => Ok(i),
90+
None => Err(parser),
91+
}
92+
}
93+
94+
assert_eq!(parse_element(b"", Outside), Err(Outside));
95+
assert_eq!(parse_element(b"", SingleQ), Err(SingleQ));
96+
assert_eq!(parse_element(b"", DoubleQ), Err(DoubleQ));
97+
98+
assert_eq!(parse_element(b"'", Outside), Err(SingleQ));
99+
assert_eq!(parse_element(b"'", SingleQ), Err(Outside));
100+
assert_eq!(parse_element(b"'", DoubleQ), Err(DoubleQ));
101+
102+
assert_eq!(parse_element(b"\"", Outside), Err(DoubleQ));
103+
assert_eq!(parse_element(b"\"", SingleQ), Err(SingleQ));
104+
assert_eq!(parse_element(b"\"", DoubleQ), Err(Outside));
105+
106+
assert_eq!(parse_element(b">", Outside), Ok(0));
107+
assert_eq!(parse_element(b">", SingleQ), Err(SingleQ));
108+
assert_eq!(parse_element(b">", DoubleQ), Err(DoubleQ));
109+
110+
assert_eq!(parse_element(b"''>", Outside), Ok(2));
111+
assert_eq!(parse_element(b"''>", SingleQ), Err(SingleQ));
112+
assert_eq!(parse_element(b"''>", DoubleQ), Err(DoubleQ));
113+
}

src/reader/mod.rs

Lines changed: 2 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -426,11 +426,13 @@ macro_rules! read_to_end {
426426
#[cfg(feature = "async-tokio")]
427427
mod async_tokio;
428428
mod buffered_reader;
429+
mod element;
429430
mod ns_reader;
430431
mod pi;
431432
mod slice_reader;
432433
mod state;
433434

435+
pub use element::ElementParser;
434436
pub use ns_reader::NsReader;
435437
pub use pi::PiParser;
436438

@@ -986,40 +988,6 @@ impl BangType {
986988
}
987989
}
988990

989-
/// State machine for the [`XmlSource::read_element`]
990-
#[derive(Clone, Copy)]
991-
enum ReadElementState {
992-
/// The initial state (inside element, but outside of attribute value)
993-
Elem,
994-
/// Inside a single-quoted attribute value
995-
SingleQ,
996-
/// Inside a double-quoted attribute value
997-
DoubleQ,
998-
}
999-
impl ReadElementState {
1000-
/// Changes state by analyzing part of input.
1001-
/// Returns a tuple with part of chunk up to element closing symbol `>`
1002-
/// and a position after that symbol or `None` if such symbol was not found
1003-
#[inline(always)]
1004-
fn change<'b>(&mut self, chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
1005-
for i in memchr::memchr3_iter(b'>', b'\'', b'"', chunk) {
1006-
*self = match (*self, chunk[i]) {
1007-
// only allowed to match `>` while we are in state `Elem`
1008-
(Self::Elem, b'>') => return Some((&chunk[..i], i + 1)),
1009-
(Self::Elem, b'\'') => Self::SingleQ,
1010-
(Self::Elem, b'\"') => Self::DoubleQ,
1011-
1012-
// the only end_byte that gets us out if the same character
1013-
(Self::SingleQ, b'\'') | (Self::DoubleQ, b'"') => Self::Elem,
1014-
1015-
// all other bytes: no state change
1016-
_ => *self,
1017-
};
1018-
}
1019-
None
1020-
}
1021-
}
1022-
1023991
/// A function to check whether the byte is a whitespace (blank, new line, carriage return or tab)
1024992
#[inline]
1025993
pub(crate) const fn is_whitespace(b: u8) -> bool {

src/reader/slice_reader.rs

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ use encoding_rs::{Encoding, UTF_8};
1212
use crate::errors::{Error, Result, SyntaxError};
1313
use crate::events::Event;
1414
use crate::name::QName;
15-
use crate::reader::{is_whitespace, BangType, PiParser, ReadElementState, Reader, Span, XmlSource};
15+
use crate::reader::{is_whitespace, BangType, ElementParser, PiParser, Reader, Span, XmlSource};
1616

1717
/// This is an implementation for reading from a `&[u8]` as underlying byte stream.
1818
/// This implementation supports not using an intermediate buffer as the byte slice
@@ -312,12 +312,13 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] {
312312
}
313313

314314
fn read_element(&mut self, _buf: (), position: &mut usize) -> Result<&'a [u8]> {
315-
let mut state = ReadElementState::Elem;
315+
let mut parser = ElementParser::default();
316316

317-
if let Some((bytes, i)) = state.change(self) {
318-
// Position now just after the `>` symbol
319-
*position += i;
320-
*self = &self[i..];
317+
if let Some(i) = parser.feed(self) {
318+
// +1 for `>` which we do not include
319+
*position += i + 1;
320+
let bytes = &self[..i];
321+
*self = &self[i + 1..];
321322
return Ok(bytes);
322323
}
323324

0 commit comments

Comments
 (0)