diff --git a/Cargo.toml b/Cargo.toml index 1f871430..1fe12779 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,8 @@ serde = { version = ">=1.0.100,<1.0.181", optional = true } tokio = { version = "1.10", optional = true, default-features = false, features = ["io-util"] } memchr = "2.1" arbitrary = { version = "1", features = ["derive"], optional = true } +jetscii = "0.5.2" +once_cell = "1.12.0" [dev-dependencies] criterion = "0.4" diff --git a/src/escapei.rs b/src/escapei.rs index 46b75f50..cb2a996a 100644 --- a/src/escapei.rs +++ b/src/escapei.rs @@ -4,9 +4,18 @@ use memchr::memchr2_iter; use std::borrow::Cow; use std::ops::Range; +use jetscii::bytes; +use memchr; +use once_cell::sync::Lazy; + #[cfg(test)] use pretty_assertions::assert_eq; + +static XML_ESCAPE_BYTES: Lazy = + Lazy::new(|| bytes!(b'<', b'>', b'&', b'\'', b'"')); +static XML_PARTIAL_ESCAPE_BYTES: Lazy = Lazy::new(|| bytes!(b'<', b'>', b'&')); + /// Error for XML escape / unescape. #[derive(Clone, Debug)] pub enum EscapeError { @@ -72,7 +81,8 @@ impl std::error::Error for EscapeError {} /// | `'` | `'` /// | `"` | `"` pub fn escape(raw: &str) -> Cow { - _escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&' | b'\'' | b'\"')) + // _escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&' | b'\'' | b'\"')) + simd_escape(raw, &XML_ESCAPE_BYTES) } /// Escapes an `&str` and replaces xml special characters (`<`, `>`, `&`) @@ -89,9 +99,11 @@ pub fn escape(raw: &str) -> Cow { /// | `>` | `>` /// | `&` | `&` pub fn partial_escape(raw: &str) -> Cow { - _escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&')) + // _escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&')) + simd_escape(raw, &XML_PARTIAL_ESCAPE_BYTES) } + /// Escapes an `&str` and replaces a subset of xml special characters (`<`, `>`, /// `&`, `'`, `"`) with their corresponding xml escaped value. pub(crate) fn _escape bool>(raw: &str, escape_chars: F) -> Cow { @@ -121,7 +133,47 @@ pub(crate) fn _escape bool>(raw: &str, escape_chars: F) -> Cow b'\r' => escaped.extend_from_slice(b" "), b' ' => escaped.extend_from_slice(b" "), _ => unreachable!( - "Only '<', '>','\', '&', '\"', '\\t', '\\r', '\\n', and ' ' are escaped" + "Only '<', '>','\', '&', '\"', '\\t', '\\r', '\\n', and ' ' are escaped"), + } + pos = new_pos + 1; + } + + if let Some(mut escaped) = escaped { + if let Some(raw) = bytes.get(pos..) { + escaped.extend_from_slice(raw); + } + // SAFETY: we operate on UTF-8 input and search for an one byte chars only, + // so all slices that was put to the `escaped` is a valid UTF-8 encoded strings + // TODO: Can be replaced with `unsafe { String::from_utf8_unchecked() }` + // if unsafe code will be allowed + Cow::Owned(String::from_utf8(escaped).unwrap()) + } else { + Cow::Borrowed(raw) + } +} + +/// Escapes a `&[u8]` and replaces all xml special characters (<, >, &, ', ") with their +/// corresponding xml escaped value. +pub fn simd_escape<'a>(raw: &'a str, escape_matcher: &jetscii::BytesConst) -> Cow<'a, str> { + let bytes = raw.as_bytes(); + let mut escaped = None; + let mut pos = 0; + while let Some(i) = escape_matcher.find(&bytes[pos..]) { + if escaped.is_none() { + escaped = Some(Vec::with_capacity(raw.len())); + } + let escaped = escaped.as_mut().expect("initialized"); + let new_pos = pos + i; + escaped.extend_from_slice(&bytes[pos..new_pos]); + match bytes[new_pos] { + b'<' => escaped.extend_from_slice(b"<"), + b'>' => escaped.extend_from_slice(b">"), + b'\'' => escaped.extend_from_slice(b"'"), + b'&' => escaped.extend_from_slice(b"&"), + b'"' => escaped.extend_from_slice(b"""), + c @ _ => unreachable!( + "Found {} but only '<', '>', ', '&' and '\"' are escaped", + c as char ), } pos = new_pos + 1;