Skip to content

Commit 704ce89

Browse files
Mingundralley
authored and committed
Generalize reading methods of PI and element
They are identical except for the type of parser used.
1 parent 6f1a644 commit 704ce89

File tree

6 files changed

+84
-120
lines changed

6 files changed

+84
-120
lines changed

src/reader/async_tokio.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@ use crate::errors::{Error, Result, SyntaxError};
88
use crate::events::Event;
99
use crate::name::{QName, ResolveResult};
1010
use crate::reader::buffered_reader::impl_buffered_source;
11-
use crate::reader::{is_whitespace, BangType, ElementParser, NsReader, ParseState, Reader, Span};
11+
use crate::reader::{
12+
is_whitespace, BangType, ElementParser, NsReader, ParseState, Parser, PiParser, Reader, Span,
13+
};
1214

1315
/// A struct for reading XML asynchronously from an [`AsyncBufRead`].
1416
///

src/reader/buffered_reader.rs

Lines changed: 5 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@ use std::fs::File;
55
use std::io::{self, BufRead, BufReader};
66
use std::path::Path;
77

8-
use crate::errors::{Error, Result, SyntaxError};
8+
use crate::errors::{Error, Result};
99
use crate::events::Event;
1010
use crate::name::QName;
11-
use crate::reader::{is_whitespace, BangType, ElementParser, Reader, Span, XmlSource};
11+
use crate::reader::{is_whitespace, BangType, Parser, Reader, Span, XmlSource};
1212

1313
macro_rules! impl_buffered_source {
1414
($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
@@ -91,13 +91,12 @@ macro_rules! impl_buffered_source {
9191
Ok((&buf[start..], done))
9292
}
9393

94-
$($async)? fn read_pi $(<$lf>)? (
94+
$($async)? fn read_with<$($lf,)? P: Parser>(
9595
&mut self,
96+
mut parser: P,
9697
buf: &'b mut Vec<u8>,
9798
position: &mut usize,
9899
) -> Result<&'b [u8]> {
99-
let mut parser = super::PiParser::default();
100-
101100
let mut read = 0;
102101
let start = buf.len();
103102
loop {
@@ -131,7 +130,7 @@ macro_rules! impl_buffered_source {
131130
}
132131

133132
*position += read;
134-
Err(Error::Syntax(SyntaxError::UnclosedPIOrXmlDecl))
133+
Err(Error::Syntax(P::eof_error()))
135134
}
136135

137136
$($async)? fn read_bang_element $(<$lf>)? (
@@ -184,50 +183,6 @@ macro_rules! impl_buffered_source {
184183
Err(bang_type.to_err())
185184
}
186185

187-
#[inline]
188-
$($async)? fn read_element $(<$lf>)? (
189-
&mut self,
190-
buf: &'b mut Vec<u8>,
191-
position: &mut usize,
192-
) -> Result<&'b [u8]> {
193-
let mut parser = ElementParser::default();
194-
let mut read = 0;
195-
196-
let start = buf.len();
197-
loop {
198-
let available = match self $(.$reader)? .fill_buf() $(.$await)? {
199-
Ok(n) if n.is_empty() => break,
200-
Ok(n) => n,
201-
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
202-
Err(e) => {
203-
*position += read;
204-
return Err(Error::Io(e.into()));
205-
}
206-
};
207-
208-
if let Some(used) = parser.feed(available) {
209-
buf.extend_from_slice(&available[..used]);
210-
211-
// +1 for `>` which we do not include
212-
self $(.$reader)? .consume(used + 1);
213-
read += used + 1;
214-
215-
// Position now just after the `>` symbol
216-
*position += read;
217-
return Ok(&buf[start..]);
218-
}
219-
220-
// The `>` symbol not yet found, continue reading
221-
buf.extend_from_slice(available);
222-
let used = available.len();
223-
self $(.$reader)? .consume(used);
224-
read += used;
225-
}
226-
227-
*position += read;
228-
Err(Error::Syntax(SyntaxError::UnclosedTag))
229-
}
230-
231186
$($async)? fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> {
232187
loop {
233188
break match self $(.$reader)? .fill_buf() $(.$await)? {

src/reader/element.rs

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
//! Contains a parser for an XML element.
22
3+
use crate::errors::SyntaxError;
4+
use crate::reader::Parser;
5+
36
/// A parser that searches for a `>` symbol in the slice outside of quoted regions.
47
///
58
/// The parser considers two quoted regions: a double-quoted (`"..."`) and
@@ -21,8 +24,9 @@
2124
/// # Example
2225
///
2326
/// ```
24-
/// # use quick_xml::reader::ElementParser;
2527
/// # use pretty_assertions::assert_eq;
28+
/// use quick_xml::reader::{ElementParser, Parser};
29+
///
2630
/// let mut parser = ElementParser::default();
2731
///
2832
/// // Parse `<my-element with = 'some > inside'>and the text follow...`
@@ -47,10 +51,10 @@ pub enum ElementParser {
4751
DoubleQ,
4852
}
4953

50-
impl ElementParser {
54+
impl Parser for ElementParser {
5155
/// Returns number of consumed bytes or `None` if `>` was not found in `bytes`.
5256
#[inline]
53-
pub fn feed(&mut self, bytes: &[u8]) -> Option<usize> {
57+
fn feed(&mut self, bytes: &[u8]) -> Option<usize> {
5458
for i in memchr::memchr3_iter(b'>', b'\'', b'"', bytes) {
5559
*self = match (*self, bytes[i]) {
5660
// only allowed to match `>` while we are in state `Outside`
@@ -67,6 +71,11 @@ impl ElementParser {
6771
}
6872
None
6973
}
74+
75+
#[inline]
76+
fn eof_error() -> SyntaxError {
77+
SyntaxError::UnclosedTag
78+
}
7079
}
7180

7281
impl Default for ElementParser {

src/reader/mod.rs

Lines changed: 44 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -361,7 +361,7 @@ macro_rules! read_until_close {
361361
},
362362
// `<?` - processing instruction
363363
Ok(Some(b'?')) => match $reader
364-
.read_pi($buf, &mut $self.state.offset)
364+
.read_with(PiParser::default(), $buf, &mut $self.state.offset)
365365
$(.$await)?
366366
{
367367
Ok(bytes) => $self.state.emit_question_mark(bytes),
@@ -374,7 +374,7 @@ macro_rules! read_until_close {
374374
},
375375
// `<...` - opening or self-closed tag
376376
Ok(Some(_)) => match $reader
377-
.read_element($buf, &mut $self.state.offset)
377+
.read_with(ElementParser::default(), $buf, &mut $self.state.offset)
378378
$(.$await)?
379379
{
380380
Ok(bytes) => $self.state.emit_start(bytes),
@@ -763,6 +763,26 @@ impl<R> Reader<R> {
763763

764764
////////////////////////////////////////////////////////////////////////////////////////////////////
765765

766+
/// Used to decouple reading of data from data source and parsing XML structure from it.
767+
/// This is a state preserved between getting chunks of bytes from the reader.
768+
///
769+
/// This trait is implemented for every parser that processes a piece of the XML grammar.
770+
pub trait Parser {
771+
/// Process new data and try to determine end of the parsed thing.
772+
///
773+
/// Returns position of the end of thing in `bytes` in case of successful search
774+
/// and `None` otherwise.
775+
///
776+
/// # Parameters
777+
/// - `bytes`: a slice to find the end of a thing.
778+
/// Should contain text in ASCII-compatible encoding
779+
fn feed(&mut self, bytes: &[u8]) -> Option<usize>;
780+
781+
/// Returns parse error produced by this parser in case of reaching end of
782+
/// input without finding the end of a parsed thing.
783+
fn eof_error() -> SyntaxError;
784+
}
785+
766786
/// Represents an input for a reader that can return borrowed data.
767787
///
768788
/// There are two implementors of this trait: generic one that read data from
@@ -821,20 +841,25 @@ trait XmlSource<'r, B> {
821841

822842
/// Read input until the thing recognized by the given parser is finished.
823843
///
824-
/// This method expect that `<?` already was read.
844+
/// This method expects that the start sequence of the thing being parsed has already been read.
825845
///
826-
/// Returns a slice of data read up to end of processing instruction (`>`),
827-
/// which does not include into result (`?` at the end included).
846+
/// Returns a slice of data read up to the end of the thing being parsed.
847+
/// The end of thing and the returned content is determined by the used parser.
828848
///
829-
/// If input (`Self`) is exhausted and nothing was read, returns `None`.
849+
/// If input (`Self`) is exhausted and no bytes were read, or if the specified
850+
/// parser could not find the ending sequence of the thing, returns `SyntaxError`.
830851
///
831852
/// # Parameters
832853
/// - `buf`: Buffer that could be filled from an input (`Self`) and
833854
/// from which [events] could borrow their data
834855
/// - `position`: Will be increased by amount of bytes consumed
835856
///
857+
/// A `P` type parameter is used to preserve state between calls to the underlying
858+
/// reader which provides bytes fed into the parser.
836859
/// [events]: crate::events::Event
837-
fn read_pi(&mut self, buf: B, position: &mut usize) -> Result<&'r [u8]>;
860+
fn read_with<P>(&mut self, parser: P, buf: B, position: &mut usize) -> Result<&'r [u8]>
861+
where
862+
P: Parser;
838863

839864
/// Read input until comment or CDATA is finished.
840865
///
@@ -853,30 +878,6 @@ trait XmlSource<'r, B> {
853878
/// [events]: crate::events::Event
854879
fn read_bang_element(&mut self, buf: B, position: &mut usize) -> Result<(BangType, &'r [u8])>;
855880

856-
/// Read input until XML element is closed by approaching a `>` symbol.
857-
/// Returns a buffer that contains a data between `<` and `>` or
858-
/// [`SyntaxError::UnclosedTag`] if end-of-input was reached before reading `>`.
859-
///
860-
/// Derived from `read_until`, but modified to handle XML attributes
861-
/// using a minimal state machine.
862-
///
863-
/// Attribute values are [defined] as follows:
864-
/// ```plain
865-
/// AttValue := '"' (([^<&"]) | Reference)* '"'
866-
/// | "'" (([^<&']) | Reference)* "'"
867-
/// ```
868-
/// (`Reference` is something like `&quot;`, but we don't care about
869-
/// escaped characters at this level)
870-
///
871-
/// # Parameters
872-
/// - `buf`: Buffer that could be filled from an input (`Self`) and
873-
/// from which [events] could borrow their data
874-
/// - `position`: Will be increased by amount of bytes consumed
875-
///
876-
/// [defined]: https://www.w3.org/TR/xml11/#NT-AttValue
877-
/// [events]: crate::events::Event
878-
fn read_element(&mut self, buf: B, position: &mut usize) -> Result<&'r [u8]>;
879-
880881
/// Consume and discard all the whitespace until the next non-whitespace
881882
/// character or EOF.
882883
///
@@ -1510,6 +1511,7 @@ mod test {
15101511
mod read_element {
15111512
use super::*;
15121513
use crate::errors::{Error, SyntaxError};
1514+
use crate::reader::ElementParser;
15131515
use crate::utils::Bytes;
15141516
use pretty_assertions::assert_eq;
15151517

@@ -1521,7 +1523,7 @@ mod test {
15211523
let mut input = b"".as_ref();
15221524
// ^= 1
15231525

1524-
match $source(&mut input).read_element(buf, &mut position) $(.$await)? {
1526+
match $source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? {
15251527
Err(Error::Syntax(SyntaxError::UnclosedTag)) => {}
15261528
x => panic!(
15271529
"Expected `Err(Syntax(UnclosedTag))`, but got `{:?}`",
@@ -1543,7 +1545,7 @@ mod test {
15431545
// ^= 2
15441546

15451547
assert_eq!(
1546-
Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()),
1548+
Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
15471549
Bytes(b"")
15481550
);
15491551
assert_eq!(position, 2);
@@ -1557,7 +1559,7 @@ mod test {
15571559
// ^= 5
15581560

15591561
assert_eq!(
1560-
Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()),
1562+
Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
15611563
Bytes(b"tag")
15621564
);
15631565
assert_eq!(position, 5);
@@ -1571,7 +1573,7 @@ mod test {
15711573
// ^= 3
15721574

15731575
assert_eq!(
1574-
Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()),
1576+
Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
15751577
Bytes(b":")
15761578
);
15771579
assert_eq!(position, 3);
@@ -1585,7 +1587,7 @@ mod test {
15851587
// ^= 6
15861588

15871589
assert_eq!(
1588-
Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()),
1590+
Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
15891591
Bytes(b":tag")
15901592
);
15911593
assert_eq!(position, 6);
@@ -1599,7 +1601,7 @@ mod test {
15991601
// ^= 39
16001602

16011603
assert_eq!(
1602-
Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()),
1604+
Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
16031605
Bytes(br#"tag attr-1=">" attr2 = '>' 3attr"#)
16041606
);
16051607
assert_eq!(position, 39);
@@ -1618,7 +1620,7 @@ mod test {
16181620
// ^= 3
16191621

16201622
assert_eq!(
1621-
Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()),
1623+
Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
16221624
Bytes(b"/")
16231625
);
16241626
assert_eq!(position, 3);
@@ -1632,7 +1634,7 @@ mod test {
16321634
// ^= 6
16331635

16341636
assert_eq!(
1635-
Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()),
1637+
Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
16361638
Bytes(b"tag/")
16371639
);
16381640
assert_eq!(position, 6);
@@ -1646,7 +1648,7 @@ mod test {
16461648
// ^= 4
16471649

16481650
assert_eq!(
1649-
Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()),
1651+
Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
16501652
Bytes(b":/")
16511653
);
16521654
assert_eq!(position, 4);
@@ -1660,7 +1662,7 @@ mod test {
16601662
// ^= 7
16611663

16621664
assert_eq!(
1663-
Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()),
1665+
Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
16641666
Bytes(b":tag/")
16651667
);
16661668
assert_eq!(position, 7);
@@ -1674,7 +1676,7 @@ mod test {
16741676
// ^= 42
16751677

16761678
assert_eq!(
1677-
Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()),
1679+
Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
16781680
Bytes(br#"tag attr-1="/>" attr2 = '/>' 3attr/"#)
16791681
);
16801682
assert_eq!(position, 42);

0 commit comments

Comments
 (0)