Skip to content

Commit 1cdea62

Browse files
committed
Properly normalize attribute values
closes tafia#371
1 parent 8a74258 commit 1cdea62

File tree

1 file changed

+127
-1
lines changed

1 file changed

+127
-1
lines changed

src/events/attributes.rs

Lines changed: 127 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,95 @@ impl<'a> From<(&'a str, &'a str)> for Attribute<'a> {
331331
}
332332
}
333333

334+
// 1) All line breaks MUST have been normalized on input to #xA as described in 2.11 End-of-Line Handling, so the rest of this algorithm operates on text normalized in this way.
335+
// 2) Begin with a normalized value consisting of the empty string.
336+
// 3) For each character, entity reference, or character reference in the unnormalized attribute value, beginning with the first and continuing to the last, do the following:
337+
// * For a character reference, append the referenced character to the normalized value.
338+
// * For an entity reference, recursively apply step 3 of this algorithm to the replacement text of the entity.
339+
// * For a white space character (#x20, #xD, #xA, #x9), append a space character (#x20) to the normalized value.
340+
// * For another character, append the character to the normalized value.
341+
//
342+
// If the attribute type is not CDATA, then the XML processor MUST further process the normalized attribute value by discarding any leading and trailing space (#x20) characters,
343+
// and by replacing sequences of space (#x20) characters by a single space (#x20) character.
344+
//
345+
// Note that if the unnormalized attribute value contains a character reference to a white space character other than space (#x20), the normalized value contains the referenced
346+
// character itself (#xD, #xA or #x9). This contrasts with the case where the unnormalized value contains a white space character (not a reference), which is replaced with a
347+
// space character (#x20) in the normalized value and also contrasts with the case where the unnormalized value contains an entity reference whose replacement text contains a
348+
// white space character; being recursively processed, the white space character is replaced with a space character (#x20) in the normalized value.
349+
/// Normalizes the white space of a raw attribute value.
///
/// Every run of XML white space bytes (`#x20`, `#x9`, `#xA`, `#xD`) is replaced by a
/// single space (`#x20`), and leading and trailing white space is removed entirely.
///
/// The input is handed back untouched (no allocation, and a borrowed input stays
/// borrowed) when it is already in normalized form. A new buffer is only built when
/// at least one byte has to be dropped or replaced.
///
/// NOTE(review): the XML specification requires trimming and collapsing only for
/// attributes whose declared type is not CDATA. This function has no notion of the
/// attribute type and applies that step to every value.
///
/// # Parameters
///
/// * `attr` - the raw bytes of the attribute value, without the surrounding quotes.
///
/// # Returns
///
/// The normalized value: `attr` itself if nothing had to change, otherwise an owned
/// copy with the white space rewritten.
fn normalize_attribute_value(attr: Cow<[u8]>) -> Cow<[u8]> {
    // TODO: character references, entity references, error handling associated with those

    /// The four bytes that XML treats as white space.
    fn is_xml_space(byte: u8) -> bool {
        match byte {
            b' ' | b'\t' | b'\n' | b'\r' => true,
            _ => false,
        }
    }

    /// Returns `true` when normalizing `bytes` would not change a single byte.
    fn is_normalized(bytes: &[u8]) -> bool {
        // Starts out `true` so that a leading space is rejected by the same rule
        // that rejects the second space of a run.
        let mut after_space = true;
        for &byte in bytes {
            match byte {
                b' ' if after_space => return false,
                b' ' => after_space = true,
                // Tabs and line breaks always have to be rewritten to a space.
                b'\t' | b'\n' | b'\r' => return false,
                _ => after_space = false,
            }
        }
        // A trailing space leaves the flag set; the empty value is trivially normalized.
        bytes.is_empty() || !after_space
    }

    if is_normalized(&attr) {
        return attr;
    }

    // The normalized value is never longer than the raw one.
    let mut normalized: Vec<u8> = Vec::with_capacity(attr.len());
    // Splitting on every white space byte yields empty pieces for leading, trailing
    // and repeated white space; dropping those and joining the remaining pieces with
    // one space trims and collapses in a single pass.
    for word in attr
        .split(|byte| is_xml_space(*byte))
        .filter(|word| !word.is_empty())
    {
        if !normalized.is_empty() {
            normalized.push(b' ');
        }
        normalized.extend_from_slice(word);
    }

    Cow::Owned(normalized)
}
422+
334423
impl<'a> Iterator for Attributes<'a> {
335424
type Item = Result<Attribute<'a>>;
336425
fn next(&mut self) -> Option<Self::Item> {
@@ -355,7 +444,7 @@ impl<'a> Iterator for Attributes<'a> {
355444
($key:expr, $val:expr) => {
356445
Some(Ok(Attribute {
357446
key: &self.bytes[$key],
358-
value: Cow::Borrowed(&self.bytes[$val]),
447+
value: normalize_attribute_value(Cow::Borrowed(&self.bytes[$val])),
359448
}))
360449
};
361450
}
@@ -513,4 +602,41 @@ mod tests {
513602
assert_eq!(&*a.value, b"ee");
514603
assert!(attributes.next().is_none());
515604
}
605+
606+
#[test]
fn attribute_value_normalization() {
    // return, tab, and newline characters (0xD, 0x9, 0xA) must be replaced with a space character
    assert_eq!(
        normalize_attribute_value(Cow::Borrowed(b"\rfoo\rbar\tbaz\ndelta\n")).as_ref(),
        b"foo bar baz delta"
    );
    // leading and trailing spaces must be stripped
    assert_eq!(
        normalize_attribute_value(Cow::Borrowed(b" foo ")).as_ref(),
        b"foo"
    );
    // leading space
    assert_eq!(
        normalize_attribute_value(Cow::Borrowed(b" bar")).as_ref(),
        b"bar"
    );
    // trailing space
    assert_eq!(
        normalize_attribute_value(Cow::Borrowed(b"baz ")).as_ref(),
        b"baz"
    );
    // sequences of spaces must be replaced with a single space; the input has to
    // contain runs of two or more spaces, otherwise this only repeats the trim case
    assert_eq!(
        normalize_attribute_value(Cow::Borrowed(b"   foo   bar  baz  ")).as_ref(),
        b"foo bar baz"
    );
    // sequence replacement mixed with characters treated as whitespace (\t \r \n)
    assert_eq!(
        normalize_attribute_value(Cow::Borrowed(
            b" \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r"
        ))
        .as_ref(),
        b"foo bar baz delta echo foxtrot"
    );
    // an empty value stays empty
    assert_eq!(
        normalize_attribute_value(Cow::Borrowed(b"")).as_ref(),
        b""
    );
    // a value made only of whitespace normalizes to the empty value
    assert_eq!(
        normalize_attribute_value(Cow::Borrowed(b" \t\r\n ")).as_ref(),
        b""
    );
}
516642
}

0 commit comments

Comments
 (0)