Skip to content

Commit 56841c9

Browse files
committed
Escape into a `String` directly, rather than into bytes that are then converted
I believe this is quite a bit faster, because Rust only has to verify that each string slicing operation starts and ends at a character boundary; none of the inner bytes need to be checked for UTF-8 validity, since they already come from a `&str`. Also, when initially creating the escaped string, preallocate a little extra room, since we know the string will grow.
1 parent ef83740 commit 56841c9

File tree

1 file changed

+22
-24
lines changed

1 file changed

+22
-24
lines changed

src/escapei.rs

+22-24
Original file line numberDiff line numberDiff line change
@@ -110,23 +110,30 @@ where
110110
let mut escaped = None;
111111
let mut last_pos = 0;
112112
for i in escapes {
113-
let escaped = escaped.get_or_insert_with(|| Vec::with_capacity(raw.len()));
113+
// If we have an escape, the escaped string will be somewhat larger than the raw string, so
114+
// reserve a little more space, so we might not resize at all if only a few escapes are found.
115+
let escaped = escaped.get_or_insert_with(|| String::with_capacity(raw.len() + 64));
114116
let byte = bytes[i];
115-
escaped.extend_from_slice(&bytes[last_pos..i]);
117+
// SAFETY: the escapes iterator should only return indexes of bytes we know how to escape.
118+
// If one of those bytes is found, it _must_ be a complete character, so `i` must be a
119+
// character boundary.
120+
// last_pos will only be either 0 or i+1, and all supported chars are one byte long,
121+
// last_pos will also always be at a char boundary
122+
escaped.push_str(&raw[last_pos..i]);
116123
match byte {
117-
b'<' => escaped.extend_from_slice(b"&lt;"),
118-
b'>' => escaped.extend_from_slice(b"&gt;"),
119-
b'\'' => escaped.extend_from_slice(b"&apos;"),
120-
b'&' => escaped.extend_from_slice(b"&amp;"),
121-
b'"' => escaped.extend_from_slice(b"&quot;"),
124+
b'<' => escaped.push_str("&lt;"),
125+
b'>' => escaped.push_str("&gt;"),
126+
b'\'' => escaped.push_str("&apos;"),
127+
b'&' => escaped.push_str("&amp;"),
128+
b'"' => escaped.push_str("&quot;"),
122129

123130
// This set of escapes handles characters that should be escaped
124131
// in elements of xs:lists, because those characters works as
125132
// delimiters of list elements
126-
b'\t' => escaped.extend_from_slice(b"&#9;"),
127-
b'\n' => escaped.extend_from_slice(b"&#10;"),
128-
b'\r' => escaped.extend_from_slice(b"&#13;"),
129-
b' ' => escaped.extend_from_slice(b"&#32;"),
133+
b'\t' => escaped.push_str("&#9;"),
134+
b'\n' => escaped.push_str("&#10;"),
135+
b'\r' => escaped.push_str("&#13;"),
136+
b' ' => escaped.push_str("&#32;"),
130137
_ => unreachable!(
131138
"Only '<', '>','\', '&', '\"', '\\t', '\\r', '\\n', and ' ' are escaped"
132139
),
@@ -135,14 +142,8 @@ where
135142
}
136143

137144
if let Some(mut escaped) = escaped {
138-
if let Some(raw) = bytes.get(last_pos..) {
139-
escaped.extend_from_slice(raw);
140-
}
141-
// SAFETY: we operate on UTF-8 input and search for an one byte chars only,
142-
// so all slices that was put to the `escaped` is a valid UTF-8 encoded strings
143-
// TODO: Can be replaced with `unsafe { String::from_utf8_unchecked() }`
144-
// if unsafe code will be allowed
145-
Cow::Owned(String::from_utf8(escaped).unwrap())
145+
escaped.push_str(&raw[last_pos..]);
146+
Cow::Owned(escaped)
146147
} else {
147148
Cow::Borrowed(raw)
148149
}
@@ -182,17 +183,14 @@ where
182183
match iter.next() {
183184
Some(end) if bytes[end] == b';' => {
184185
// append valid data
185-
if unescaped.is_none() {
186-
unescaped = Some(String::with_capacity(raw.len()));
187-
}
188-
let unescaped = unescaped.as_mut().expect("initialized");
186+
let unescaped = unescaped.get_or_insert_with(|| String::with_capacity(raw.len()));
189187
unescaped.push_str(&raw[last_end..start]);
190188

191189
// search for character correctness
192190
let pat = &raw[start + 1..end];
193191
if let Some(entity) = pat.strip_prefix('#') {
194192
let codepoint = parse_number(entity, start..end)?;
195-
unescaped.push_str(codepoint.encode_utf8(&mut [0u8; 4]));
193+
unescaped.push(codepoint);
196194
} else if let Some(value) = named_entity(pat) {
197195
unescaped.push_str(value);
198196
} else if let Some(value) = resolve_entity(pat) {

0 commit comments

Comments
 (0)