Skip to content

Commit 08c0eea

Browse files
committed
Properly normalize attribute values
closes tafia#371
1 parent 0febc2b commit 08c0eea

File tree

6 files changed

+286
-99
lines changed

6 files changed

+286
-99
lines changed

benches/macrobenches.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,13 @@ static SAMPLE_NS: &[u8] = include_bytes!("../tests/documents/sample_ns.xml");
1717
static PLAYERS: &[u8] = include_bytes!("../tests/documents/players.xml");
1818

1919
// TODO: read the namespaces too
20-
// TODO: use fully normalized attribute values
2120
fn parse_document(doc: &[u8]) -> XmlResult<()> {
2221
let mut r = Reader::from_reader(doc);
2322
loop {
2423
match r.read_event_unbuffered()? {
2524
Event::Start(e) | Event::Empty(e) => {
2625
for attr in e.attributes() {
27-
criterion::black_box(attr?.unescaped_value()?);
26+
criterion::black_box(attr?.normalized_value()?);
2827
}
2928
}
3029
Event::Text(e) => {

benches/microbenches.rs

Lines changed: 125 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -125,86 +125,6 @@ fn read_namespaced_event(c: &mut Criterion) {
125125
group.finish();
126126
}
127127

128-
/// Benchmarks the `BytesText::unescaped()` method (includes time of `read_event`
129-
/// benchmark)
130-
fn bytes_text_unescaped(c: &mut Criterion) {
131-
let mut group = c.benchmark_group("BytesText::unescaped");
132-
group.bench_function("trim_text = false", |b| {
133-
b.iter(|| {
134-
let mut buf = Vec::new();
135-
let mut r = Reader::from_reader(SAMPLE);
136-
r.check_end_names(false).check_comments(false);
137-
let mut count = criterion::black_box(0);
138-
let mut nbtxt = criterion::black_box(0);
139-
loop {
140-
match r.read_event(&mut buf) {
141-
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
142-
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
143-
Ok(Event::Eof) => break,
144-
_ => (),
145-
}
146-
buf.clear();
147-
}
148-
assert_eq!(
149-
count, 1550,
150-
"Overall tag count in ./tests/documents/sample_rss.xml"
151-
);
152-
153-
// Windows has \r\n instead of \n
154-
#[cfg(windows)]
155-
assert_eq!(
156-
nbtxt, 67661,
157-
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
158-
);
159-
160-
#[cfg(not(windows))]
161-
assert_eq!(
162-
nbtxt, 66277,
163-
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
164-
);
165-
});
166-
});
167-
168-
group.bench_function("trim_text = true", |b| {
169-
b.iter(|| {
170-
let mut buf = Vec::new();
171-
let mut r = Reader::from_reader(SAMPLE);
172-
r.check_end_names(false)
173-
.check_comments(false)
174-
.trim_text(true);
175-
let mut count = criterion::black_box(0);
176-
let mut nbtxt = criterion::black_box(0);
177-
loop {
178-
match r.read_event(&mut buf) {
179-
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
180-
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
181-
Ok(Event::Eof) => break,
182-
_ => (),
183-
}
184-
buf.clear();
185-
}
186-
assert_eq!(
187-
count, 1550,
188-
"Overall tag count in ./tests/documents/sample_rss.xml"
189-
);
190-
191-
// Windows has \r\n instead of \n
192-
#[cfg(windows)]
193-
assert_eq!(
194-
nbtxt, 50334,
195-
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
196-
);
197-
198-
#[cfg(not(windows))]
199-
assert_eq!(
200-
nbtxt, 50261,
201-
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
202-
);
203-
});
204-
});
205-
group.finish();
206-
}
207-
208128
/// Benchmarks, how fast individual event parsed
209129
fn one_event(c: &mut Criterion) {
210130
let mut group = c.benchmark_group("One event");
@@ -364,6 +284,130 @@ fn attributes(c: &mut Criterion) {
364284
assert_eq!(count, 150);
365285
})
366286
});
287+
288+
group.finish();
289+
}
290+
291+
/// Benchmarks normalizing attribute values
292+
fn attribute_value_normalization(c: &mut Criterion) {
293+
let mut group = c.benchmark_group("attribute_value_normalization");
294+
295+
group.bench_function("noop_short", |b| {
296+
b.iter(|| {
297+
criterion::black_box(unescape(b"foobar")).unwrap();
298+
})
299+
});
300+
301+
group.bench_function("noop_long", |b| {
302+
b.iter(|| {
303+
criterion::black_box(unescape(b"just a bit of text without any entities")).unwrap();
304+
})
305+
});
306+
307+
group.bench_function("replacement_chars", |b| {
308+
b.iter(|| {
309+
criterion::black_box(unescape(b"just a bit\n of text without\tany entities")).unwrap();
310+
})
311+
});
312+
313+
group.bench_function("char_reference", |b| {
314+
b.iter(|| {
315+
let text = b"prefix &#34;some stuff&#34;,&#x22;more stuff&#x22;";
316+
criterion::black_box(unescape(text)).unwrap();
317+
let text = b"&#38;&#60;";
318+
criterion::black_box(unescape(text)).unwrap();
319+
})
320+
});
321+
322+
group.bench_function("entity_reference", |b| {
323+
b.iter(|| {
324+
let text = b"age &gt; 72 &amp;&amp; age &lt; 21";
325+
criterion::black_box(unescape(text)).unwrap();
326+
let text = b"&quot;what&apos;s that?&quot;";
327+
criterion::black_box(unescape(text)).unwrap();
328+
})
329+
});
330+
331+
group.finish();
332+
}
333+
334+
/// Benchmarks the `BytesText::unescaped()` method (includes time of `read_event`
335+
/// benchmark)
336+
fn bytes_text_unescaped(c: &mut Criterion) {
337+
let mut group = c.benchmark_group("BytesText::unescaped");
338+
group.bench_function("trim_text = false", |b| {
339+
b.iter(|| {
340+
let mut buf = Vec::new();
341+
let mut r = Reader::from_reader(SAMPLE);
342+
r.check_end_names(false).check_comments(false);
343+
let mut count = criterion::black_box(0);
344+
let mut nbtxt = criterion::black_box(0);
345+
loop {
346+
match r.read_event(&mut buf) {
347+
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
348+
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
349+
Ok(Event::Eof) => break,
350+
_ => (),
351+
}
352+
buf.clear();
353+
}
354+
assert_eq!(
355+
count, 1550,
356+
"Overall tag count in ./tests/documents/sample_rss.xml"
357+
);
358+
359+
// Windows has \r\n instead of \n
360+
#[cfg(windows)]
361+
assert_eq!(
362+
nbtxt, 67661,
363+
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
364+
);
365+
366+
#[cfg(not(windows))]
367+
assert_eq!(
368+
nbtxt, 66277,
369+
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
370+
);
371+
});
372+
});
373+
374+
group.bench_function("trim_text = true", |b| {
375+
b.iter(|| {
376+
let mut buf = Vec::new();
377+
let mut r = Reader::from_reader(SAMPLE);
378+
r.check_end_names(false)
379+
.check_comments(false)
380+
.trim_text(true);
381+
let mut count = criterion::black_box(0);
382+
let mut nbtxt = criterion::black_box(0);
383+
loop {
384+
match r.read_event(&mut buf) {
385+
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
386+
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
387+
Ok(Event::Eof) => break,
388+
_ => (),
389+
}
390+
buf.clear();
391+
}
392+
assert_eq!(
393+
count, 1550,
394+
"Overall tag count in ./tests/documents/sample_rss.xml"
395+
);
396+
397+
// Windows has \r\n instead of \n
398+
#[cfg(windows)]
399+
assert_eq!(
400+
nbtxt, 50334,
401+
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
402+
);
403+
404+
#[cfg(not(windows))]
405+
assert_eq!(
406+
nbtxt, 50261,
407+
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
408+
);
409+
});
410+
});
367411
group.finish();
368412
}
369413

@@ -477,6 +521,7 @@ criterion_group!(
477521
read_namespaced_event,
478522
one_event,
479523
attributes,
524+
attribute_value_normalization,
480525
escaping,
481526
unescaping,
482527
);

src/errors.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ impl From<EscapeError> for Error {
7272
}
7373

7474
impl From<AttrError> for Error {
75+
/// Creates a new `Error::InvalidAttr` from the given error
7576
#[inline]
7677
fn from(error: AttrError) -> Self {
7778
Error::InvalidAttr(error)

src/escapei.rs

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ use std::ops::Range;
99
use pretty_assertions::assert_eq;
1010

1111
/// Error for XML escape/unescqpe.
12-
#[derive(Debug)]
12+
#[derive(Debug, PartialEq)]
1313
pub enum EscapeError {
1414
/// Entity with Null character
1515
EntityWithNull(::std::ops::Range<usize>),
@@ -134,7 +134,7 @@ pub fn unescape(raw: &[u8]) -> Result<Cow<[u8]>, EscapeError> {
134134
}
135135

136136
/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
137-
/// value, using a dictionnary of custom entities.
137+
/// value, using a dictionary of custom entities.
138138
///
139139
/// # Pre-condition
140140
///
@@ -174,7 +174,9 @@ pub fn do_unescape<'a>(
174174
if let Some(s) = named_entity(pat) {
175175
unescaped.extend_from_slice(s.as_bytes());
176176
} else if pat.starts_with(b"#") {
177-
push_utf8(unescaped, parse_number(&pat[1..], start..end)?);
177+
let entity = &pat[1..]; // starts after the #
178+
let codepoint = parse_number(entity, start..end)?;
179+
push_utf8(unescaped, codepoint);
178180
} else if let Some(value) = custom_entities.and_then(|hm| hm.get(pat)) {
179181
unescaped.extend_from_slice(&value);
180182
} else {
@@ -201,7 +203,7 @@ pub fn do_unescape<'a>(
201203
}
202204

203205
#[cfg(not(feature = "escape-html"))]
204-
const fn named_entity(name: &[u8]) -> Option<&str> {
206+
pub(crate) const fn named_entity(name: &[u8]) -> Option<&str> {
205207
let s = match name {
206208
b"lt" => "<",
207209
b"gt" => ">",
@@ -213,7 +215,7 @@ const fn named_entity(name: &[u8]) -> Option<&str> {
213215
Some(s)
214216
}
215217
#[cfg(feature = "escape-html")]
216-
const fn named_entity(name: &[u8]) -> Option<&str> {
218+
pub(crate) const fn named_entity(name: &[u8]) -> Option<&str> {
217219
// imported from https://dev.w3.org/html5/html-author/charref
218220
let s = match name {
219221
b"Tab" => "\u{09}",
@@ -1675,12 +1677,12 @@ const fn named_entity(name: &[u8]) -> Option<&str> {
16751677
Some(s)
16761678
}
16771679

1678-
fn push_utf8(out: &mut Vec<u8>, code: char) {
1680+
pub(crate) fn push_utf8(out: &mut Vec<u8>, code: char) {
16791681
let mut buf = [0u8; 4];
16801682
out.extend_from_slice(code.encode_utf8(&mut buf).as_bytes());
16811683
}
16821684

1683-
fn parse_number(bytes: &[u8], range: Range<usize>) -> Result<char, EscapeError> {
1685+
pub(crate) fn parse_number(bytes: &[u8], range: Range<usize>) -> Result<char, EscapeError> {
16841686
let code = if bytes.starts_with(b"x") {
16851687
parse_hexadecimal(&bytes[1..])
16861688
} else {
@@ -1695,7 +1697,7 @@ fn parse_number(bytes: &[u8], range: Range<usize>) -> Result<char, EscapeError>
16951697
}
16961698
}
16971699

1698-
fn parse_hexadecimal(bytes: &[u8]) -> Result<u32, EscapeError> {
1700+
pub(crate) fn parse_hexadecimal(bytes: &[u8]) -> Result<u32, EscapeError> {
16991701
// maximum code is 0x10FFFF => 6 characters
17001702
if bytes.len() > 6 {
17011703
return Err(EscapeError::TooLongHexadecimal);
@@ -1713,7 +1715,7 @@ fn parse_hexadecimal(bytes: &[u8]) -> Result<u32, EscapeError> {
17131715
Ok(code)
17141716
}
17151717

1716-
fn parse_decimal(bytes: &[u8]) -> Result<u32, EscapeError> {
1718+
pub(crate) fn parse_decimal(bytes: &[u8]) -> Result<u32, EscapeError> {
17171719
// maximum code is 0x10FFFF = 1114111 => 7 characters
17181720
if bytes.len() > 7 {
17191721
return Err(EscapeError::TooLongDecimal);

0 commit comments

Comments
 (0)