Skip to content

Commit b8ba501

Browse files
committed
Properly normalize attribute values
closes tafia#371
1 parent e701c4d commit b8ba501

File tree

3 files changed

+231
-1
lines changed

3 files changed

+231
-1
lines changed

src/errors.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ impl From<EscapeError> for Error {
6262
}
6363

6464
impl From<AttrError> for Error {
65+
/// Creates a new `Error::InvalidAttr` from the given error
6566
#[inline]
6667
fn from(error: AttrError) -> Self {
6768
Error::InvalidAttr(error)

src/escapei.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ pub fn unescape(raw: &[u8]) -> Result<Cow<[u8]>, EscapeError> {
134134
}
135135

136136
/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
137-
/// value, using a dictionnary of custom entities.
137+
/// value, using a dictionary of custom entities.
138138
///
139139
/// # Pre-condition
140140
///

src/events/attributes.rs

Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,12 @@ pub struct Attribute<'a> {
3232
}
3333

3434
impl<'a> Attribute<'a> {
35+
pub fn normalized_value(&'a self) -> Result<Cow<'a, [u8]>> {
36+
let normalized = normalize_attribute_value(&*self.value);
37+
let escaped = do_unescape(&*normalized, None).map_err(Error::EscapeError)?;
38+
Ok(Cow::Owned(escaped.into_owned()))
39+
}
40+
3541
/// Returns the unescaped value.
3642
///
3743
/// This is normally the value you are interested in. Escape sequences such as `&gt;` are
@@ -289,6 +295,202 @@ impl<'a> From<Attr<&'a [u8]>> for Attribute<'a> {
289295
}
290296
}
291297

298+
/// Normalizes an attribute value following section 3.3.3 of the XML specification.
///
/// <https://www.w3.org/TR/xml/#AVNormalize>
///
/// * Whitespace-like bytes (`\r`, `\n`, `\t`, `' '`) are trimmed from both ends of the value
/// * Every run of whitespace-like bytes inside the value is replaced with a single space
///
/// Character and entity references are **not** substituted by this function yet.
///
/// The result borrows from `attr` whenever trimming alone is enough; a new buffer is allocated
/// only when an interior byte has to be replaced or dropped.
fn normalize_attribute_value(attr: &[u8]) -> Cow<[u8]> {
    // TODO: character references, entity references, error handling associated with those

    fn is_whitespace_like(byte: u8) -> bool {
        matches!(byte, b'\n' | b'\r' | b'\t' | b' ')
    }

    // Trim whitespace-like bytes from both ends - this can't fail.
    let begin = match attr.iter().position(|&b| !is_whitespace_like(b)) {
        Some(begin) => begin,
        // The entire value was whitespace-like bytes (or empty).
        None => return Cow::Borrowed(&attr[..0]),
    };
    // A first non-whitespace byte exists, so a last one exists as well; falling back to
    // `begin` keeps the slice below in bounds no matter what.
    let end = attr
        .iter()
        .rposition(|&b| !is_whitespace_like(b))
        .unwrap_or(begin);
    let trimmed = &attr[begin..=end];

    // Created lazily, the first time the output diverges from `trimmed`.
    let mut normalized: Option<Vec<u8>> = None;
    // `trimmed` starts with a non-whitespace byte, so we never start inside a run.
    let mut previous_was_space = false;

    for (idx, &byte) in trimmed.iter().enumerate() {
        if is_whitespace_like(byte) {
            if previous_was_space {
                // Second or later byte of a run: it is dropped. If no buffer exists yet, every
                // byte before this one (including the run's leading ' ') is unchanged output.
                if normalized.is_none() {
                    normalized = Some(trimmed[..idx].to_vec());
                }
            } else {
                previous_was_space = true;
                match normalized.as_mut() {
                    Some(buf) => buf.push(b' '),
                    // `\r`, `\n` and `\t` must be rewritten to ' ', which requires a buffer.
                    None if byte != b' ' => {
                        let mut buf = Vec::with_capacity(trimmed.len());
                        buf.extend_from_slice(&trimmed[..idx]);
                        buf.push(b' ');
                        normalized = Some(buf);
                    }
                    // A plain space that starts a run maps to itself: nothing to copy yet.
                    None => {}
                }
            }
        } else {
            previous_was_space = false;
            if let Some(buf) = normalized.as_mut() {
                buf.push(byte);
            }
        }
    }

    match normalized {
        Some(buf) => Cow::Owned(buf),
        None => Cow::Borrowed(trimmed),
    }
}
383+
384+
impl<'a> Iterator for Attributes<'a> {
385+
type Item = Result<Attribute<'a>>;
386+
fn next(&mut self) -> Option<Self::Item> {
387+
let len = self.bytes.len();
388+
389+
macro_rules! err {
390+
($err:expr) => {{
391+
self.position = len;
392+
return Some(Err($err.into()));
393+
}};
394+
}
395+
396+
macro_rules! attr {
397+
($key:expr) => {{
398+
self.position = len;
399+
if self.html {
400+
attr!($key, 0..0)
401+
} else {
402+
None
403+
}
404+
}};
405+
($key:expr, $val:expr) => {
406+
Some(Ok(Attribute {
407+
key: &self.bytes[$key],
408+
value: Cow::Borrowed(&self.bytes[$val]),
409+
}))
410+
};
411+
}
412+
413+
if len <= self.position {
414+
return None;
415+
}
416+
417+
let mut bytes = self.bytes.iter().enumerate().skip(self.position);
418+
419+
// key starts after the whitespace
420+
let start_key = match bytes
421+
.by_ref()
422+
.skip_while(|&(_, &b)| !is_whitespace(b))
423+
.find(|&(_, &b)| !is_whitespace(b))
424+
{
425+
Some((i, _)) => i,
426+
None => return attr!(self.position..len),
427+
};
428+
429+
// key ends with either whitespace or =
430+
let end_key = match bytes
431+
.by_ref()
432+
.find(|&(_, &b)| b == b'=' || is_whitespace(b))
433+
{
434+
Some((i, &b'=')) => i,
435+
Some((i, &b'\'')) | Some((i, &b'"')) if self.with_checks => {
436+
err!(Error::NameWithQuote(i));
437+
}
438+
Some((i, _)) => {
439+
// consume until `=` or return if html
440+
match bytes.by_ref().find(|&(_, &b)| !is_whitespace(b)) {
441+
Some((_, &b'=')) => i,
442+
Some((j, _)) if self.html => {
443+
self.position = j - 1;
444+
return attr!(start_key..i, 0..0);
445+
}
446+
Some((j, _)) => err!(Error::NoEqAfterName(j)),
447+
None if self.html => {
448+
self.position = len;
449+
return attr!(start_key..len, 0..0);
450+
}
451+
None => err!(Error::NoEqAfterName(len)),
452+
}
453+
}
454+
None => return attr!(start_key..len),
455+
};
456+
457+
if self.with_checks {
458+
if let Some(start) = self
459+
.consumed
460+
.iter()
461+
.filter(|r| r.len() == end_key - start_key)
462+
.find(|r| self.bytes[(*r).clone()] == self.bytes[start_key..end_key])
463+
.map(|ref r| r.start)
464+
{
465+
err!(Error::DuplicatedAttribute(start_key, start));
466+
}
467+
self.consumed.push(start_key..end_key);
468+
}
469+
470+
// value has quote if not html
471+
match bytes.by_ref().find(|&(_, &b)| !is_whitespace(b)) {
472+
Some((i, quote @ &b'\'')) | Some((i, quote @ &b'"')) => {
473+
match bytes.by_ref().find(|&(_, &b)| b == *quote) {
474+
Some((j, _)) => {
475+
self.position = j + 1;
476+
return attr!(start_key..end_key, i + 1..j);
477+
}
478+
None => err!(Error::UnquotedValue(i)),
479+
}
480+
}
481+
Some((i, _)) if self.html => {
482+
let j = bytes
483+
.by_ref()
484+
.find(|&(_, &b)| is_whitespace(b))
485+
.map_or(len, |(j, _)| j);
486+
self.position = j;
487+
return attr!(start_key..end_key, i..j);
488+
Some((i, _)) => err!(Error::UnquotedValue(i)),
489+
None => return attr!(start_key..end_key),
490+
}
491+
}
492+
}
493+
292494
////////////////////////////////////////////////////////////////////////////////////////////////////
293495

294496
/// Iterator over XML attributes.
@@ -2353,4 +2555,31 @@ mod html {
23532555
assert_eq!(iter.next(), None);
23542556
assert_eq!(iter.next(), None);
23552557
}
2558+
2559+
/// Checks trimming, replacement and collapsing of whitespace-like bytes performed by
/// `normalize_attribute_value`.
#[test]
fn attribute_value_normalization() {
    // empty value
    assert_eq!(normalize_attribute_value(b"").as_ref(), b"");
    // a value made only of whitespace-like characters normalizes to the empty value
    assert_eq!(normalize_attribute_value(b" \t\r\n ").as_ref(), b"");
    // return, tab, and newline characters (0xD, 0x9, 0xA) must be replaced with a space character
    assert_eq!(
        normalize_attribute_value(b"\rfoo\rbar\tbaz\ndelta\n").as_ref(),
        b"foo bar baz delta"
    );
    // leading and trailing spaces must be stripped
    assert_eq!(normalize_attribute_value(b" foo ").as_ref(), b"foo");
    // leading space
    assert_eq!(normalize_attribute_value(b" bar").as_ref(), b"bar");
    // trailing space
    assert_eq!(normalize_attribute_value(b"baz ").as_ref(), b"baz");
    // single interior spaces are kept as they are
    assert_eq!(
        normalize_attribute_value(b" foo bar baz ").as_ref(),
        b"foo bar baz"
    );
    // sequences of spaces must be replaced with a single space
    assert_eq!(
        normalize_attribute_value(b"  foo   bar  baz  ").as_ref(),
        b"foo bar baz"
    );
    // sequence replacement mixed with characters treated as whitespace (\t \r \n)
    assert_eq!(
        normalize_attribute_value(b" \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r").as_ref(),
        b"foo bar baz delta echo foxtrot"
    );
}
23562585
}

0 commit comments

Comments
 (0)