From 360b17eb2a73ceb36da230f45bca5a93a5988ab1 Mon Sep 17 00:00:00 2001 From: Josh McKinney Date: Wed, 18 Sep 2024 21:17:17 -0700 Subject: [PATCH 1/3] Add AsciiSet::EMPTY and impl ops::Add for AsciiSet In RFCs, the sets of characters to percent-encode are often defined as the union of multiple sets. This change adds an `EMPTY` constant to `AsciiSet` and implements the `Add` trait for `AsciiSet` so that sets can be combined with the `+` operator. AsciiSet now derives `Debug`, `PartialEq`, and `Eq` so that it can be used in tests. --- percent_encoding/src/lib.rs | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/percent_encoding/src/lib.rs b/percent_encoding/src/lib.rs index 10e0fc69..72d144bb 100644 --- a/percent_encoding/src/lib.rs +++ b/percent_encoding/src/lib.rs @@ -51,7 +51,7 @@ use alloc::{ string::String, vec::Vec, }; -use core::{fmt, mem, slice, str}; +use core::{fmt, mem, ops, slice, str}; /// Represents a set of characters or bytes in the ASCII range. /// @@ -66,6 +66,7 @@ use core::{fmt, mem, slice, str}; /// /// https://url.spec.whatwg.org/#fragment-percent-encode-set /// const FRAGMENT: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'<').add(b'>').add(b'`'); /// ``` +#[derive(Debug, PartialEq, Eq)] pub struct AsciiSet { mask: [Chunk; ASCII_RANGE_LEN / BITS_PER_CHUNK], } @@ -77,6 +78,11 @@ const ASCII_RANGE_LEN: usize = 0x80; const BITS_PER_CHUNK: usize = 8 * mem::size_of::<Chunk>(); impl AsciiSet { + /// An empty set. + pub const EMPTY: AsciiSet = AsciiSet { + mask: [0; ASCII_RANGE_LEN / BITS_PER_CHUNK], + }; + /// Called with UTF-8 bytes rather than code points. /// Not used for non-ASCII bytes. 
const fn contains(&self, byte: u8) -> bool { @@ -102,6 +108,18 @@ impl AsciiSet { } } +impl ops::Add for AsciiSet { + type Output = Self; + + fn add(self, other: Self) -> Self { + let mut mask = self.mask.clone(); + for i in 0..mask.len() { + mask[i] |= other.mask[i]; + } + AsciiSet { mask } + } +} + /// The set of 0x00 to 0x1F (C0 controls), and 0x7F (DEL). /// /// Note that this includes the newline and tab characters, but not the space 0x20. @@ -478,3 +496,16 @@ fn decode_utf8_lossy(input: Cow<'_, [u8]>) -> Cow<'_, str> { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn add() { + let left = AsciiSet::EMPTY.add(b'A'); + let right = AsciiSet::EMPTY.add(b'B'); + let expected = AsciiSet::EMPTY.add(b'A').add(b'B'); + assert_eq!(left + right, expected); + } +} From b9f44f6a38d36be80af6c24207e8710756df6355 Mon Sep 17 00:00:00 2001 From: Josh McKinney Date: Wed, 18 Sep 2024 21:26:57 -0700 Subject: [PATCH 2/3] implement ops::Not for AsciiSet --- percent_encoding/src/lib.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/percent_encoding/src/lib.rs b/percent_encoding/src/lib.rs index 72d144bb..34ad8f53 100644 --- a/percent_encoding/src/lib.rs +++ b/percent_encoding/src/lib.rs @@ -120,6 +120,15 @@ impl ops::Add for AsciiSet { } } +impl ops::Not for AsciiSet { + type Output = Self; + + fn not(self) -> Self { + let mask = self.mask.map(|chunk| !chunk); + AsciiSet { mask } + } +} + /// The set of 0x00 to 0x1F (C0 controls), and 0x7F (DEL). /// /// Note that this includes the newline and tab characters, but not the space 0x20. 
@@ -508,4 +517,12 @@ mod tests { let expected = AsciiSet::EMPTY.add(b'A').add(b'B'); assert_eq!(left + right, expected); } + + #[test] + fn not() { + let set = AsciiSet::EMPTY.add(b'A').add(b'B'); + let not_set = !set; + assert!(!not_set.contains(b'A')); + assert!(not_set.contains(b'C')); + } } From 3407c406ef5f56b41d45a42d1ea9874f6acc93a4 Mon Sep 17 00:00:00 2001 From: Josh McKinney Date: Wed, 18 Sep 2024 21:49:18 -0700 Subject: [PATCH 3/3] Add const functions for negation / union of AsciiSet --- percent_encoding/src/lib.rs | 52 ++++++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/percent_encoding/src/lib.rs b/percent_encoding/src/lib.rs index 34ad8f53..2213943b 100644 --- a/percent_encoding/src/lib.rs +++ b/percent_encoding/src/lib.rs @@ -106,17 +106,30 @@ impl AsciiSet { mask[byte as usize / BITS_PER_CHUNK] &= !(1 << (byte as usize % BITS_PER_CHUNK)); AsciiSet { mask } } + + /// Return the union of two sets. + pub const fn union(&self, other: Self) -> Self { + let mask = [ + self.mask[0] | other.mask[0], + self.mask[1] | other.mask[1], + self.mask[2] | other.mask[2], + self.mask[3] | other.mask[3], + ]; + AsciiSet { mask } + } + + /// Return the negation of the set. 
+ pub const fn complement(&self) -> Self { + let mask = [!self.mask[0], !self.mask[1], !self.mask[2], !self.mask[3]]; + AsciiSet { mask } + } } impl ops::Add for AsciiSet { type Output = Self; fn add(self, other: Self) -> Self { - let mut mask = self.mask.clone(); - for i in 0..mask.len() { - mask[i] |= other.mask[i]; - } - AsciiSet { mask } + self.union(other) } } @@ -124,8 +137,7 @@ impl ops::Not for AsciiSet { type Output = Self; fn not(self) -> Self { - let mask = self.mask.map(|chunk| !chunk); - AsciiSet { mask } + self.complement() } } @@ -511,7 +523,7 @@ mod tests { use super::*; #[test] - fn add() { + fn add_op() { let left = AsciiSet::EMPTY.add(b'A'); let right = AsciiSet::EMPTY.add(b'B'); let expected = AsciiSet::EMPTY.add(b'A').add(b'B'); @@ -519,10 +531,32 @@ mod tests { } #[test] - fn not() { + fn not_op() { let set = AsciiSet::EMPTY.add(b'A').add(b'B'); let not_set = !set; assert!(!not_set.contains(b'A')); assert!(not_set.contains(b'C')); } + + /// This test ensures that we can get the union of two sets as a constant value, which is + /// useful for defining sets in a modular way. + #[test] + fn union() { + const A: AsciiSet = AsciiSet::EMPTY.add(b'A'); + const B: AsciiSet = AsciiSet::EMPTY.add(b'B'); + const UNION: AsciiSet = A.union(B); + const EXPECTED: AsciiSet = AsciiSet::EMPTY.add(b'A').add(b'B'); + assert_eq!(UNION, EXPECTED); + } + + /// This test ensures that we can get the complement of a set as a constant value, which is + /// useful for defining sets in a modular way. + #[test] + fn complement() { + const BOTH: AsciiSet = AsciiSet::EMPTY.add(b'A').add(b'B'); + const COMPLEMENT: AsciiSet = BOTH.complement(); + assert!(!COMPLEMENT.contains(b'A')); + assert!(!COMPLEMENT.contains(b'B')); + assert!(COMPLEMENT.contains(b'C')); + } }