Properly normalize attribute values

dralley · dralley · commit 08c0eea76c36 · 2022-07-04T13:08:29.000-04:00
closes tafia#371
diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs
@@ -17,14 +17,13 @@ static SAMPLE_NS: &[u8] = include_bytes!("../tests/documents/sample_ns.xml");
 static PLAYERS: &[u8] = include_bytes!("../tests/documents/players.xml");
 
 // TODO: read the namespaces too
-// TODO: use fully normalized attribute values
 fn parse_document(doc: &[u8]) -> XmlResult<()> {
     let mut r = Reader::from_reader(doc);
     loop {
         match r.read_event_unbuffered()? {
             Event::Start(e) | Event::Empty(e) => {
                 for attr in e.attributes() {
-                    criterion::black_box(attr?.unescaped_value()?);
+                    criterion::black_box(attr?.normalized_value()?);
                 }
             }
             Event::Text(e) => {
diff --git a/benches/microbenches.rs b/benches/microbenches.rs
@@ -125,86 +125,6 @@ fn read_namespaced_event(c: &mut Criterion) {
     group.finish();
 }
 
-/// Benchmarks the `BytesText::unescaped()` method (includes time of `read_event`
-/// benchmark)
-fn bytes_text_unescaped(c: &mut Criterion) {
-    let mut group = c.benchmark_group("BytesText::unescaped");
-    group.bench_function("trim_text = false", |b| {
-        b.iter(|| {
-            let mut buf = Vec::new();
-            let mut r = Reader::from_reader(SAMPLE);
-            r.check_end_names(false).check_comments(false);
-            let mut count = criterion::black_box(0);
-            let mut nbtxt = criterion::black_box(0);
-            loop {
-                match r.read_event(&mut buf) {
-                    Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
-                    Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
-                    Ok(Event::Eof) => break,
-                    _ => (),
-                }
-                buf.clear();
-            }
-            assert_eq!(
-                count, 1550,
-                "Overall tag count in ./tests/documents/sample_rss.xml"
-            );
-
-            // Windows has \r\n instead of \n
-            #[cfg(windows)]
-            assert_eq!(
-                nbtxt, 67661,
-                "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
-            );
-
-            #[cfg(not(windows))]
-            assert_eq!(
-                nbtxt, 66277,
-                "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
-            );
-        });
-    });
-
-    group.bench_function("trim_text = true", |b| {
-        b.iter(|| {
-            let mut buf = Vec::new();
-            let mut r = Reader::from_reader(SAMPLE);
-            r.check_end_names(false)
-                .check_comments(false)
-                .trim_text(true);
-            let mut count = criterion::black_box(0);
-            let mut nbtxt = criterion::black_box(0);
-            loop {
-                match r.read_event(&mut buf) {
-                    Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
-                    Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
-                    Ok(Event::Eof) => break,
-                    _ => (),
-                }
-                buf.clear();
-            }
-            assert_eq!(
-                count, 1550,
-                "Overall tag count in ./tests/documents/sample_rss.xml"
-            );
-
-            // Windows has \r\n instead of \n
-            #[cfg(windows)]
-            assert_eq!(
-                nbtxt, 50334,
-                "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
-            );
-
-            #[cfg(not(windows))]
-            assert_eq!(
-                nbtxt, 50261,
-                "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
-            );
-        });
-    });
-    group.finish();
-}
-
 /// Benchmarks, how fast individual event parsed
 fn one_event(c: &mut Criterion) {
     let mut group = c.benchmark_group("One event");
@@ -364,6 +284,130 @@ fn attributes(c: &mut Criterion) {
             assert_eq!(count, 150);
         })
     });
+
+    group.finish();
+}
+
+/// Benchmarks normalizing attribute values
+fn attribute_value_normalization(c: &mut Criterion) {
+    let mut group = c.benchmark_group("attribute_value_normalization");
+
+    group.bench_function("noop_short", |b| {
+        b.iter(|| {
+            criterion::black_box(unescape(b"foobar")).unwrap();
+        })
+    });
+
+    group.bench_function("noop_long", |b| {
+        b.iter(|| {
+            criterion::black_box(unescape(b"just a bit of text without any entities")).unwrap();
+        })
+    });
+
+    group.bench_function("replacement_chars", |b| {
+        b.iter(|| {
+            criterion::black_box(unescape(b"just a bit\n of text without\tany entities")).unwrap();
+        })
+    });
+
+    group.bench_function("char_reference", |b| {
+        b.iter(|| {
+            let text = b"prefix &#34;some stuff&#34;,&#x22;more stuff&#x22;";
+            criterion::black_box(unescape(text)).unwrap();
+            let text = b"&#38;&#60;";
+            criterion::black_box(unescape(text)).unwrap();
+        })
+    });
+
+    group.bench_function("entity_reference", |b| {
+        b.iter(|| {
+            let text = b"age &gt; 72 &amp;&amp; age &lt; 21";
+            criterion::black_box(unescape(text)).unwrap();
+            let text = b"&quot;what&apos;s that?&quot;";
+            criterion::black_box(unescape(text)).unwrap();
+        })
+    });
+
+    group.finish();
+}
+
+/// Benchmarks the `BytesText::unescaped()` method (includes time of `read_event`
+/// benchmark)
+fn bytes_text_unescaped(c: &mut Criterion) {
+    let mut group = c.benchmark_group("BytesText::unescaped");
+    group.bench_function("trim_text = false", |b| {
+        b.iter(|| {
+            let mut buf = Vec::new();
+            let mut r = Reader::from_reader(SAMPLE);
+            r.check_end_names(false).check_comments(false);
+            let mut count = criterion::black_box(0);
+            let mut nbtxt = criterion::black_box(0);
+            loop {
+                match r.read_event(&mut buf) {
+                    Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
+                    Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
+                    Ok(Event::Eof) => break,
+                    _ => (),
+                }
+                buf.clear();
+            }
+            assert_eq!(
+                count, 1550,
+                "Overall tag count in ./tests/documents/sample_rss.xml"
+            );
+
+            // Windows has \r\n instead of \n
+            #[cfg(windows)]
+            assert_eq!(
+                nbtxt, 67661,
+                "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
+            );
+
+            #[cfg(not(windows))]
+            assert_eq!(
+                nbtxt, 66277,
+                "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
+            );
+        });
+    });
+
+    group.bench_function("trim_text = true", |b| {
+        b.iter(|| {
+            let mut buf = Vec::new();
+            let mut r = Reader::from_reader(SAMPLE);
+            r.check_end_names(false)
+                .check_comments(false)
+                .trim_text(true);
+            let mut count = criterion::black_box(0);
+            let mut nbtxt = criterion::black_box(0);
+            loop {
+                match r.read_event(&mut buf) {
+                    Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
+                    Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
+                    Ok(Event::Eof) => break,
+                    _ => (),
+                }
+                buf.clear();
+            }
+            assert_eq!(
+                count, 1550,
+                "Overall tag count in ./tests/documents/sample_rss.xml"
+            );
+
+            // Windows has \r\n instead of \n
+            #[cfg(windows)]
+            assert_eq!(
+                nbtxt, 50334,
+                "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
+            );
+
+            #[cfg(not(windows))]
+            assert_eq!(
+                nbtxt, 50261,
+                "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
+            );
+        });
+    });
     group.finish();
 }
 
@@ -477,6 +521,7 @@ criterion_group!(
     read_namespaced_event,
     one_event,
     attributes,
+    attribute_value_normalization,
     escaping,
     unescaping,
 );
diff --git a/src/errors.rs b/src/errors.rs
@@ -72,6 +72,7 @@ impl From<EscapeError> for Error {
 }
 
 impl From<AttrError> for Error {
+    /// Creates a new `Error::InvalidAttr` from the given error
     #[inline]
     fn from(error: AttrError) -> Self {
         Error::InvalidAttr(error)
diff --git a/src/escapei.rs b/src/escapei.rs
@@ -9,7 +9,7 @@ use std::ops::Range;
 use pretty_assertions::assert_eq;
 
 /// Error for XML escape/unescqpe.
-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum EscapeError {
     /// Entity with Null character
     EntityWithNull(::std::ops::Range<usize>),
@@ -134,7 +134,7 @@ pub fn unescape(raw: &[u8]) -> Result<Cow<[u8]>, EscapeError> {
 }
 
 /// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
-/// value, using a dictionnary of custom entities.
+/// value, using a dictionary of custom entities.
 ///
 /// # Pre-condition
 ///
@@ -174,7 +174,9 @@ pub fn do_unescape<'a>(
                 if let Some(s) = named_entity(pat) {
                     unescaped.extend_from_slice(s.as_bytes());
                 } else if pat.starts_with(b"#") {
-                    push_utf8(unescaped, parse_number(&pat[1..], start..end)?);
+                    let entity = &pat[1..]; // starts after the #
+                    let codepoint = parse_number(entity, start..end)?;
+                    push_utf8(unescaped, codepoint);
                 } else if let Some(value) = custom_entities.and_then(|hm| hm.get(pat)) {
                     unescaped.extend_from_slice(&value);
                 } else {
@@ -201,7 +203,7 @@ pub fn do_unescape<'a>(
 }
 
 #[cfg(not(feature = "escape-html"))]
-const fn named_entity(name: &[u8]) -> Option<&str> {
+pub(crate) const fn named_entity(name: &[u8]) -> Option<&str> {
     let s = match name {
         b"lt" => "<",
         b"gt" => ">",
@@ -213,7 +215,7 @@ const fn named_entity(name: &[u8]) -> Option<&str> {
     Some(s)
 }
 #[cfg(feature = "escape-html")]
-const fn named_entity(name: &[u8]) -> Option<&str> {
+pub(crate) const fn named_entity(name: &[u8]) -> Option<&str> {
     // imported from https://dev.w3.org/html5/html-author/charref
     let s = match name {
         b"Tab" => "\u{09}",
@@ -1675,12 +1677,12 @@ const fn named_entity(name: &[u8]) -> Option<&str> {
     Some(s)
 }
 
-fn push_utf8(out: &mut Vec<u8>, code: char) {
+pub(crate) fn push_utf8(out: &mut Vec<u8>, code: char) {
     let mut buf = [0u8; 4];
     out.extend_from_slice(code.encode_utf8(&mut buf).as_bytes());
 }
 
-fn parse_number(bytes: &[u8], range: Range<usize>) -> Result<char, EscapeError> {
+pub(crate) fn parse_number(bytes: &[u8], range: Range<usize>) -> Result<char, EscapeError> {
     let code = if bytes.starts_with(b"x") {
         parse_hexadecimal(&bytes[1..])
     } else {
@@ -1695,7 +1697,7 @@ fn parse_number(bytes: &[u8], range: Range<usize>) -> Result<char, EscapeError>
     }
 }
 
-fn parse_hexadecimal(bytes: &[u8]) -> Result<u32, EscapeError> {
+pub(crate) fn parse_hexadecimal(bytes: &[u8]) -> Result<u32, EscapeError> {
     // maximum code is 0x10FFFF => 6 characters
     if bytes.len() > 6 {
         return Err(EscapeError::TooLongHexadecimal);
@@ -1713,7 +1715,7 @@ fn parse_hexadecimal(bytes: &[u8]) -> Result<u32, EscapeError> {
     Ok(code)
 }
 
-fn parse_decimal(bytes: &[u8]) -> Result<u32, EscapeError> {
+pub(crate) fn parse_decimal(bytes: &[u8]) -> Result<u32, EscapeError> {
     // maximum code is 0x10FFFF = 1114111 => 7 characters
     if bytes.len() > 7 {
         return Err(EscapeError::TooLongDecimal);
diff --git a/src/events/attributes.rs b/src/events/attributes.rs
diff --git a/src/reader.rs b/src/reader.rs

Original file line number	Diff line number	Diff line change
`@@ -72,6 +72,7 @@ impl From<EscapeError> for Error {`
`72`	`72`	`}`
`73`	`73`
`74`	`74`	`impl From<AttrError> for Error {`
	`75`	+ /// Creates a new `Error::InvalidAttr` from the given error
`75`	`76`	`#[inline]`
`76`	`77`	`fn from(error: AttrError) -> Self {`
`77`	`78`	`Error::InvalidAttr(error)`