From 495751c215acc7f72b913ff2fe720679a4bf104e Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Tue, 1 Jul 2025 21:28:34 +0200 Subject: [PATCH 01/44] Add generic trait for method parameters --- .gitignore | 5 +- src/fast_automaton/operation/alternation.rs | 23 +++++-- src/fast_automaton/operation/concatenate.rs | 21 ++++++- src/fast_automaton/operation/intersection.rs | 34 +++++++++-- src/lib.rs | 34 ++++++----- src/regex/mod.rs | 8 +-- src/regex/operation/union.rs | 64 +++++++++++++++----- src/traits.rs | 54 +++++++++++++++++ 8 files changed, 193 insertions(+), 50 deletions(-) create mode 100644 src/traits.rs diff --git a/.gitignore b/.gitignore index d01bd1a..bf7ff1c 100644 --- a/.gitignore +++ b/.gitignore @@ -18,4 +18,7 @@ Cargo.lock # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ \ No newline at end of file +#.idea/ + +# cargo mutants output +mutants.out*/ \ No newline at end of file diff --git a/src/fast_automaton/operation/alternation.rs b/src/fast_automaton/operation/alternation.rs index 06c386e..90437d1 100644 --- a/src/fast_automaton/operation/alternation.rs +++ b/src/fast_automaton/operation/alternation.rs @@ -2,18 +2,29 @@ use std::hash::BuildHasherDefault; use condition::converter::ConditionConverter; -use crate::error::EngineError; +use crate::{error::EngineError, traits::MethodParameters}; use super::*; impl FastAutomaton { - pub fn union(&self, that: &FastAutomaton) -> Result { - let mut union = self.clone(); - union.alternate(that)?; - Ok(union) + pub fn union<'o, S>(&self, others: S) -> Result + where + S: MethodParameters<'o, FastAutomaton>, + { + let mut result = self.clone(); + + for other in others.parameters() { + result.alternate(other)?; + + if result.is_total() { + break; + } + } + + Ok(result) } - pub fn alternation(automatons: Vec) -> Result { + pub fn alternation(automatons: &Vec) -> Result { if automatons.len() == 1 { return Ok(automatons[0].clone()); } diff --git a/src/fast_automaton/operation/concatenate.rs b/src/fast_automaton/operation/concatenate.rs index 3741e01..e39642f 100644 --- a/src/fast_automaton/operation/concatenate.rs +++ b/src/fast_automaton/operation/concatenate.rs @@ -2,12 +2,29 @@ use std::hash::BuildHasherDefault; use condition::converter::ConditionConverter; -use crate::error::EngineError; +use crate::{error::EngineError, traits::MethodParameters}; use super::*; impl FastAutomaton { - pub fn concatenate(automatons: Vec) -> Result { + pub fn concatenation<'o, S>(&self, others: S) -> Result + where + S: MethodParameters<'o, FastAutomaton>, + { + let mut result = self.clone(); + + for other in others.parameters() { + result.concat(other)?; + + if result.is_total() { + break; + } + } + + Ok(result) + } + + pub fn concatenate(automatons: &Vec) -> Result { if automatons.len() == 1 { return Ok(automatons[0].clone()); } diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 96007e6..4483199 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -1,17 +1,39 @@ +use std::borrow::Cow; + use condition::converter::ConditionConverter; -use crate::{error::EngineError, execution_profile::ThreadLocalParams}; +use crate::{error::EngineError, 
execution_profile::ThreadLocalParams, traits::MethodParameters}; use super::*; impl FastAutomaton { - pub fn intersection(&self, other: &FastAutomaton) -> Result { + pub fn intersection<'o, S>(&self, others: S) -> Result + where + S: MethodParameters<'o, FastAutomaton>, + { + let mut result = Cow::Borrowed(self); + + for other in others.parameters() { + result = result.intersection_(other)?; + + if result.is_empty() { + break; + } + } + + Ok(result.into_owned()) + } + + fn intersection_<'a>( + &self, + other: &'a FastAutomaton, + ) -> Result, EngineError> { if self.is_empty() || other.is_empty() { - return Ok(Self::new_empty()); + return Ok(Cow::Owned(Self::new_empty())); } else if self.is_total() { - return Ok(other.clone()); + return Ok(Cow::Borrowed(other)); } else if other.is_total() { - return Ok(self.clone()); + return Ok(Cow::Owned(self.clone())); } let execution_profile = ThreadLocalParams::get_execution_profile(); @@ -70,7 +92,7 @@ impl FastAutomaton { } new_automaton.spanning_set = new_spanning_set; new_automaton.remove_dead_transitions(); - Ok(new_automaton) + Ok(Cow::Owned(new_automaton)) } pub fn has_intersection(&self, other: &FastAutomaton) -> Result { diff --git a/src/lib.rs b/src/lib.rs index 91493c7..f1681ee 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,6 +20,7 @@ pub mod execution_profile; pub mod fast_automaton; pub mod regex; pub mod tokenizer; +pub(crate) mod traits; type IntMap = HashMap>>; type IntSet = HashSet>>; @@ -73,33 +74,36 @@ impl Term { pub fn union(&self, terms: &[Term]) -> Result { Self::check_number_of_terms(terms)?; - let mut return_regex = RegularExpression::new_empty(); - let mut return_automaton = FastAutomaton::new_empty(); - match self { - Term::RegularExpression(regular_expression) => { - return_regex = regular_expression.clone(); - } - Term::Automaton(fast_automaton) => { - return_automaton = fast_automaton.clone(); - } - } + let mut regex_list = Vec::with_capacity(terms.len()); + let mut automaton_list = Vec::with_capacity(terms.len()); for operand in terms { match operand { Term::RegularExpression(regex) => { - return_regex = return_regex.union(regex); - if return_regex.is_total() { + if regex.is_total() { return Ok(Term::RegularExpression(RegularExpression::new_total())); } + regex_list.push(regex); } Term::Automaton(automaton) => { - return_automaton = return_automaton.union(automaton)?; - if return_automaton.is_total() { + if automaton.is_total() { return Ok(Term::RegularExpression(RegularExpression::new_total())); } + automaton_list.push(automaton); } } } + let mut return_regex = RegularExpression::new_empty(); + let mut return_automaton = FastAutomaton::new_empty(); + match self { + Term::RegularExpression(regular_expression) => { + return_regex = regular_expression.union(®ex_list); + } + Term::Automaton(fast_automaton) => { + return_automaton = fast_automaton.union(&automaton_list)?; + } + } + if return_automaton.is_empty() { Ok(Term::RegularExpression(return_regex)) } else { @@ -138,7 +142,7 @@ impl Term { let mut return_automaton = self.get_automaton()?; for term in terms { let automaton = term.get_automaton()?; - return_automaton = Cow::Owned(return_automaton.intersection(&automaton)?); + return_automaton = Cow::Owned(return_automaton.intersection(automaton.as_ref())?); if return_automaton.is_empty() { return Ok(Term::RegularExpression(RegularExpression::new_empty())); } diff --git a/src/regex/mod.rs b/src/regex/mod.rs index 176612f..59be90b 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -138,14 +138,14 @@ impl 
RegularExpression { for c in concat.iter() { concats.push(c.to_automaton()?); } - FastAutomaton::concatenate(concats) + FastAutomaton::concatenate(&concats) } RegularExpression::Alternation(alternation) => { - let mut concats = Vec::with_capacity(alternation.len()); + let mut alternates = Vec::with_capacity(alternation.len()); for c in alternation.iter() { - concats.push(c.to_automaton()?); + alternates.push(c.to_automaton()?); } - FastAutomaton::alternation(concats) + FastAutomaton::alternation(&alternates) } } } diff --git a/src/regex/operation/union.rs b/src/regex/operation/union.rs index 8f5c1ae..8508e4e 100644 --- a/src/regex/operation/union.rs +++ b/src/regex/operation/union.rs @@ -1,21 +1,40 @@ use std::collections::BTreeSet; +use crate::traits::MethodParameters; + use super::*; impl RegularExpression { - pub fn union(&self, other: &RegularExpression) -> RegularExpression { + pub fn union<'o, S>(&self, others: S) -> RegularExpression + where + S: MethodParameters<'o, RegularExpression>, + { + let mut result = Cow::Borrowed(self); + + for other in others.parameters() { + result = result.union_(other); + + if result.is_total() { + break; + } + } + + result.into_owned() + } + + fn union_<'a>(&self, other: &'a RegularExpression) -> Cow<'a, RegularExpression> { if self.is_total() || other.is_total() { - return RegularExpression::new_total(); + return Cow::Owned(RegularExpression::new_total()); } else if self.is_empty() { - return other.clone(); + return Cow::Borrowed(other); } else if other.is_empty() || self == other { - return self.clone(); + return Cow::Owned(self.clone()); } else if other.is_empty_string() { - return self.clone().repeat(0, Some(1)); + return Cow::Owned(self.repeat(0, Some(1))); } else if self.is_empty_string() { - return other.clone().repeat(0, Some(1)); + return Cow::Owned(other.repeat(0, Some(1))); } - match (self, other) { + Cow::Owned(match (self, other) { ( RegularExpression::Character(self_range), RegularExpression::Character(other_range), @@ -63,14 +82,14 @@ impl RegularExpression { Self::opunion_concat_and_alternation(other, self) } (RegularExpression::Alternation(self_elements), RegularExpression::Alternation(_)) => { - let mut new_alternation = other.clone(); + let mut new_alternation = Cow::Borrowed(other); for self_element in self_elements { - new_alternation = new_alternation.union(self_element); + new_alternation = new_alternation.union_(self_element); } - new_alternation + new_alternation.into_owned() } - } + }) } fn opunion_character_and_repetition( @@ -116,17 +135,25 @@ impl RegularExpression { if prefix.is_none() && suffix.is_none() { let mut alternate_elements = vec![self_regex, other_regex]; alternate_elements.sort_unstable(); - RegularExpression::Alternation(alternate_elements) + Cow::Owned(RegularExpression::Alternation(alternate_elements)) } else { - self_regex.union(&other_regex) + self_regex.union_(&other_regex) } } else { - RegularExpression::Repetition(Box::new(self_regex), 0, Some(1)) + Cow::Owned(RegularExpression::Repetition( + Box::new(self_regex), + 0, + Some(1), + )) } } else if !other_regex.is_empty_string() { - RegularExpression::Repetition(Box::new(other_regex), 0, Some(1)) + Cow::Owned(RegularExpression::Repetition( + Box::new(other_regex), + 0, + Some(1), + )) } else { - RegularExpression::new_empty_string() + Cow::Owned(RegularExpression::new_empty_string()) }; regex = regex.concat(®ex_from_alternate, true); @@ -354,6 +381,11 @@ mod tests { #[test] fn test_union() -> Result<(), String> { assert_union("(a+|a+b)", "a+b?"); + 
assert_union("(a+|a*)", "a*"); + assert_union("(a?|a{0,2})", "a{0,2}"); + assert_union("(a{2,4}|a{1,3})", "a{1,4}"); + assert_union("(a{1,2}|a{3,4})", "a{1,4}"); + assert_union("(a{3,4}|a{1,2})", "a{1,4}"); Ok(()) } diff --git a/src/traits.rs b/src/traits.rs new file mode 100644 index 0000000..6254022 --- /dev/null +++ b/src/traits.rs @@ -0,0 +1,54 @@ +pub trait MethodParameters<'a, T: 'a> { + /// the iterator that yields `&'a T` + type Iter: Iterator; + fn parameters(self) -> Self::Iter; +} + +impl<'a, T> MethodParameters<'a, T> for &'a T { + type Iter = std::iter::Once<&'a T>; + fn parameters(self) -> Self::Iter { + std::iter::once(self) + } +} + +impl<'a, T> MethodParameters<'a, T> for &'a [&'a T] { + type Iter = std::iter::Copied>; + fn parameters(self) -> Self::Iter { + self.iter().copied() + } +} + +impl<'a, T> MethodParameters<'a, T> for &'a [T] { + type Iter = std::slice::Iter<'a, T>; + fn parameters(self) -> Self::Iter { + self.iter() + } +} + +impl<'a, T> MethodParameters<'a, T> for &'a Vec { + type Iter = std::slice::Iter<'a, T>; + fn parameters(self) -> Self::Iter { + self.iter() + } +} + +impl<'a, T> MethodParameters<'a, T> for &'a Vec<&'a T> { + type Iter = std::iter::Copied>; + fn parameters(self) -> Self::Iter { + self.iter().copied() + } +} + +impl<'a, T, const N: usize> MethodParameters<'a, T> for &'a [T; N] { + type Iter = std::slice::Iter<'a, T>; + fn parameters(self) -> Self::Iter { + self.iter() + } +} + +impl<'a, T, const N: usize> MethodParameters<'a, T> for &'a [&'a T; N] { + type Iter = std::iter::Copied>; + fn parameters(self) -> Self::Iter { + self.iter().copied() + } +} From 8abe57347434ad8f3343754a021bc1e312a7355d Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sun, 6 Jul 2025 17:58:30 +0200 Subject: [PATCH 02/44] WIP --- src/fast_automaton/operation/alternation.rs | 33 +++++------- src/fast_automaton/operation/concatenate.rs | 43 ++++++---------- src/fast_automaton/operation/intersection.rs | 12 +++-- src/lib.rs | 36 ++++++++----- src/regex/mod.rs | 5 +- src/regex/operation/mod.rs | 10 ++++ src/regex/operation/union.rs | 12 +++-- src/traits.rs | 54 -------------------- 8 files changed, 81 insertions(+), 124 deletions(-) diff --git a/src/fast_automaton/operation/alternation.rs b/src/fast_automaton/operation/alternation.rs index 90437d1..3daa9ca 100644 --- a/src/fast_automaton/operation/alternation.rs +++ b/src/fast_automaton/operation/alternation.rs @@ -2,36 +2,27 @@ use std::hash::BuildHasherDefault; use condition::converter::ConditionConverter; -use crate::{error::EngineError, traits::MethodParameters}; +use crate::error::EngineError; use super::*; impl FastAutomaton { - pub fn union<'o, S>(&self, others: S) -> Result + pub fn union(&self, other: &FastAutomaton) -> Result { + Self::build_union([self, other]) + } + + pub fn union_all<'a, I>(&'a self, others: I) -> Result where - S: MethodParameters<'o, FastAutomaton>, + I: IntoIterator, { - let mut result = self.clone(); - - for other in others.parameters() { - result.alternate(other)?; - - if result.is_total() { - break; - } - } - - Ok(result) + Self::build_union(std::iter::once(self).chain(others.into_iter())) } - pub fn alternation(automatons: &Vec) -> Result { - if automatons.len() == 1 { - return Ok(automatons[0].clone()); - } + pub(crate) fn build_union<'a, I>(automatons: I) -> Result + where + I: IntoIterator, + { let mut new_automaton = FastAutomaton::new_empty(); - if automatons.is_empty() { - return Ok(new_automaton); - } for automaton in 
automatons { new_automaton.alternate(&automaton)?; } diff --git a/src/fast_automaton/operation/concatenate.rs b/src/fast_automaton/operation/concatenate.rs index e39642f..6de4339 100644 --- a/src/fast_automaton/operation/concatenate.rs +++ b/src/fast_automaton/operation/concatenate.rs @@ -2,38 +2,29 @@ use std::hash::BuildHasherDefault; use condition::converter::ConditionConverter; -use crate::{error::EngineError, traits::MethodParameters}; +use crate::error::EngineError; use super::*; impl FastAutomaton { - pub fn concatenation<'o, S>(&self, others: S) -> Result + pub fn concat(&self, other: &FastAutomaton) -> Result { + Self::build_concat([self, other]) + } + + pub fn concat_all<'a, I>(&'a self, others: I) -> Result where - S: MethodParameters<'o, FastAutomaton>, + I: IntoIterator, { - let mut result = self.clone(); - - for other in others.parameters() { - result.concat(other)?; - - if result.is_total() { - break; - } - } - - Ok(result) + Self::build_concat(std::iter::once(self).chain(others.into_iter())) } - pub fn concatenate(automatons: &Vec) -> Result { - if automatons.len() == 1 { - return Ok(automatons[0].clone()); - } + pub(crate) fn build_concat<'a, I>(automatons: I) -> Result + where + I: IntoIterator, + { let mut new_automaton = FastAutomaton::new_empty_string(); - if automatons.is_empty() { - return Ok(new_automaton); - } for automaton in automatons { - new_automaton.concat(&automaton)?; + new_automaton.concat_(&automaton)?; } Ok(new_automaton) @@ -80,7 +71,7 @@ impl FastAutomaton { let iter = if min == 0 { 0..0 } else { 0..min - 1 }; for _ in iter { - self.concat(&automaton_to_repeat)?; + self.concat_(&automaton_to_repeat)?; } if max_opt.is_none() { @@ -116,7 +107,7 @@ impl FastAutomaton { if min == 0 { self.apply_model(&automaton_to_repeat); } else { - self.concat(&automaton_to_repeat)?; + self.concat_(&automaton_to_repeat)?; } return Ok(()); @@ -124,7 +115,7 @@ impl FastAutomaton { let mut end_states = self.accept_states.iter().cloned().collect::>(); for _ in cmp::max(min, 1)..max_opt.unwrap() { - self.concat(&automaton_to_repeat)?; + self.concat_(&automaton_to_repeat)?; end_states.extend(self.accept_states.iter()); } self.accept_states.extend(end_states); @@ -134,7 +125,7 @@ impl FastAutomaton { Ok(()) } - fn concat(&mut self, other: &FastAutomaton) -> Result<(), EngineError> { + fn concat_(&mut self, other: &FastAutomaton) -> Result<(), EngineError> { if other.is_empty() { return Ok(()); } diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 4483199..3bf07fd 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -2,18 +2,22 @@ use std::borrow::Cow; use condition::converter::ConditionConverter; -use crate::{error::EngineError, execution_profile::ThreadLocalParams, traits::MethodParameters}; +use crate::{error::EngineError, execution_profile::ThreadLocalParams}; use super::*; impl FastAutomaton { - pub fn intersection<'o, S>(&self, others: S) -> Result + pub fn intersection(&self, other: &FastAutomaton) -> Result { + self.intersection_all([other]) + } + + pub fn intersection_all<'a, I>(&'a self, others: I) -> Result where - S: MethodParameters<'o, FastAutomaton>, + I: IntoIterator, { let mut result = Cow::Borrowed(self); - for other in others.parameters() { + for other in others { result = result.intersection_(other)?; if result.is_empty() { diff --git a/src/lib.rs b/src/lib.rs index f1681ee..359fb65 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,7 +20,6 @@ 
pub mod execution_profile; pub mod fast_automaton; pub mod regex; pub mod tokenizer; -pub(crate) mod traits; type IntMap = HashMap>>; type IntSet = HashSet>>; @@ -80,13 +79,13 @@ impl Term { match operand { Term::RegularExpression(regex) => { if regex.is_total() { - return Ok(Term::RegularExpression(RegularExpression::new_total())); + return Ok(Term::new_total()); } regex_list.push(regex); } Term::Automaton(automaton) => { if automaton.is_total() { - return Ok(Term::RegularExpression(RegularExpression::new_total())); + return Ok(Term::new_total()); } automaton_list.push(automaton); } @@ -97,10 +96,10 @@ impl Term { let mut return_automaton = FastAutomaton::new_empty(); match self { Term::RegularExpression(regular_expression) => { - return_regex = regular_expression.union(®ex_list); + return_regex = regular_expression.union_all(regex_list); } Term::Automaton(fast_automaton) => { - return_automaton = fast_automaton.union(&automaton_list)?; + return_automaton = fast_automaton.union_all(automaton_list)?; } } @@ -139,19 +138,24 @@ impl Term { /// ``` pub fn intersection(&self, terms: &[Term]) -> Result { Self::check_number_of_terms(terms)?; - let mut return_automaton = self.get_automaton()?; - for term in terms { - let automaton = term.get_automaton()?; - return_automaton = Cow::Owned(return_automaton.intersection(automaton.as_ref())?); - if return_automaton.is_empty() { - return Ok(Term::RegularExpression(RegularExpression::new_empty())); + + let mut automaton_list = Vec::with_capacity(terms.len()); + for operand in terms { + let automaton = operand.get_automaton()?; + if automaton.is_empty() { + return Ok(Term::new_empty()); } + automaton_list.push(automaton); } + let return_automaton = self + .get_automaton()? + .intersection_all(automaton_list.iter().map(Cow::as_ref))?; + if let Some(regex) = return_automaton.to_regex() { Ok(Term::RegularExpression(regex)) } else { - Ok(Term::Automaton(return_automaton.into_owned())) + Ok(Term::Automaton(return_automaton)) } } @@ -322,6 +326,14 @@ impl Term { Term::Automaton(automaton) => Cow::Borrowed(automaton), }) } + + fn new_empty() -> Self { + Term::RegularExpression(RegularExpression::new_empty()) + } + + fn new_total() -> Self { + Term::RegularExpression(RegularExpression::new_total()) + } } /// Represents details about a [Term]. 
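A note on the `Cow`-based accumulation introduced in `intersection_all` above: the running result starts out as `Cow::Borrowed(self)` and is only promoted to an owned value once an operand actually changes it, so chains that short-circuit early (empty result) never pay for a clone. The sketch below shows the same borrow-until-mutated pattern on a plain `Vec<i32>`; the `intersect_all` helper and its element type are hypothetical stand-ins, and only the `Cow` strategy mirrors the patch.

```rust
use std::borrow::Cow;

// Hypothetical stand-in for FastAutomaton::intersection_all: a Vec<i32>
// plays the role of a language, and filtering plays the role of intersection.
fn intersect_all<'a>(first: &'a Vec<i32>, others: &[&Vec<i32>]) -> Vec<i32> {
    // Start by borrowing; nothing has been cloned yet.
    let mut result: Cow<'a, Vec<i32>> = Cow::Borrowed(first);

    for other in others {
        // Keep only elements also present in `other` (the "intersection" step).
        let next: Vec<i32> = result.iter().copied().filter(|x| other.contains(x)).collect();
        result = Cow::Owned(next);

        // Short-circuit exactly like the patch once the result is empty.
        if result.is_empty() {
            break;
        }
    }

    // Clones only if the loop never produced an owned value.
    result.into_owned()
}

fn main() {
    let a = vec![1, 2, 3, 4];
    let b = vec![2, 3, 4];
    let c = vec![3, 4, 5];
    assert_eq!(intersect_all(&a, &[&b, &c]), vec![3, 4]);
}
```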
diff --git a/src/regex/mod.rs b/src/regex/mod.rs index 59be90b..c131d2b 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -138,14 +138,15 @@ impl RegularExpression { for c in concat.iter() { concats.push(c.to_automaton()?); } - FastAutomaton::concatenate(&concats) + println!("{:?}", concats); + FastAutomaton::build_concat(&concats) } RegularExpression::Alternation(alternation) => { let mut alternates = Vec::with_capacity(alternation.len()); for c in alternation.iter() { alternates.push(c.to_automaton()?); } - FastAutomaton::alternation(&alternates) + FastAutomaton::build_union(&alternates) } } } diff --git a/src/regex/operation/mod.rs b/src/regex/operation/mod.rs index 2baa587..b01ac78 100644 --- a/src/regex/operation/mod.rs +++ b/src/regex/operation/mod.rs @@ -103,6 +103,8 @@ mod tests { assert_parse_and_simplify("(cd|ab)*(ab|cd)*", "(ab|cd)*"); assert_parse_and_simplify(".*q(ab|ab|abc|ca)x", ".*q(abc?|ca)x"); assert_parse_and_simplify("((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}", "(q|(a|ads|a{2}d)*abc.*def.*uif(x|ads|a{2}d)*abc.*oxs.*def(ads|ax|a{2}d)*abc.*def.*ksd){1,100}"); + + assert_parse_and_simplify("(a{2,4}){2,4}", "a{4,16}"); Ok(()) } @@ -201,6 +203,14 @@ mod tests { None, ); + assert_repeat_simplify( + &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + 2, + Some(4), + 2, + Some(4), + ); + Ok(()) } diff --git a/src/regex/operation/union.rs b/src/regex/operation/union.rs index 8508e4e..9589b4a 100644 --- a/src/regex/operation/union.rs +++ b/src/regex/operation/union.rs @@ -1,17 +1,19 @@ use std::collections::BTreeSet; -use crate::traits::MethodParameters; - use super::*; impl RegularExpression { - pub fn union<'o, S>(&self, others: S) -> RegularExpression + pub fn union(&self, other: &RegularExpression) -> RegularExpression { + self.union_all([other]) + } + + pub fn union_all<'a, I>(&'a self, others: I) -> RegularExpression where - S: MethodParameters<'o, RegularExpression>, + I: IntoIterator, { let mut result = Cow::Borrowed(self); - for other in others.parameters() { + for other in others { result = result.union_(other); if result.is_total() { diff --git a/src/traits.rs b/src/traits.rs index 6254022..e69de29 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -1,54 +0,0 @@ -pub trait MethodParameters<'a, T: 'a> { - /// the iterator that yields `&'a T` - type Iter: Iterator; - fn parameters(self) -> Self::Iter; -} - -impl<'a, T> MethodParameters<'a, T> for &'a T { - type Iter = std::iter::Once<&'a T>; - fn parameters(self) -> Self::Iter { - std::iter::once(self) - } -} - -impl<'a, T> MethodParameters<'a, T> for &'a [&'a T] { - type Iter = std::iter::Copied>; - fn parameters(self) -> Self::Iter { - self.iter().copied() - } -} - -impl<'a, T> MethodParameters<'a, T> for &'a [T] { - type Iter = std::slice::Iter<'a, T>; - fn parameters(self) -> Self::Iter { - self.iter() - } -} - -impl<'a, T> MethodParameters<'a, T> for &'a Vec { - type Iter = std::slice::Iter<'a, T>; - fn parameters(self) -> Self::Iter { - self.iter() - } -} - -impl<'a, T> MethodParameters<'a, T> for &'a Vec<&'a T> { - type Iter = std::iter::Copied>; - fn parameters(self) -> Self::Iter { - self.iter().copied() - } -} - -impl<'a, T, const N: usize> MethodParameters<'a, T> for &'a [T; N] { - type Iter = std::slice::Iter<'a, T>; - fn parameters(self) -> Self::Iter { - self.iter() - } -} - -impl<'a, T, const N: usize> MethodParameters<'a, T> for &'a [&'a T; N] { - type Iter = std::iter::Copied>; - fn parameters(self) -> Self::Iter { - self.iter().copied() - } -} 
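Dropping `src/traits.rs` in favour of plain `IntoIterator<Item = &T>` bounds (as in `union_all`, `concat_all`, and `intersection_all` above) keeps call sites just as flexible: slices, `Vec`s, arrays, and ad-hoc iterators of references all satisfy the bound with no custom impls. A minimal sketch of the signature shape, where `Lang` and `merge_all` are hypothetical stand-ins for `FastAutomaton` and `union_all`:

```rust
// Sketch only: `Lang` and `merge_all` are hypothetical; the bound is the point.
#[derive(Clone, Debug, PartialEq)]
struct Lang(String);

impl Lang {
    fn merge_all<'a, I>(&'a self, others: I) -> Lang
    where
        I: IntoIterator<Item = &'a Lang>,
    {
        // Chain `self` in front, exactly like the patch does with std::iter::once.
        let mut out = String::new();
        for lang in std::iter::once(self).chain(others) {
            out.push_str(&lang.0);
        }
        Lang(out)
    }
}

fn main() {
    let a = Lang("a".into());
    let b = Lang("b".into());
    let c = Lang("c".into());

    // All of these work without a dedicated parameter trait:
    assert_eq!(a.merge_all(&[b.clone(), c.clone()]), Lang("abc".into())); // &[T]
    assert_eq!(a.merge_all(vec![&b, &c]), Lang("abc".into()));            // Vec<&T>
    assert_eq!(a.merge_all([&b, &c]), Lang("abc".into()));                // [&T; N]
    assert_eq!(a.merge_all(std::iter::once(&b)), Lang("ab".into()));      // any iterator
}
```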
From 1e7ec951e8ea0d7bc17ac37f1bd8ea26cf7ad74e Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Tue, 8 Jul 2025 21:59:09 +0200 Subject: [PATCH 03/44] WIP --- Cargo.toml | 4 +- README.md | 114 ++++---- src/fast_automaton/builder.rs | 6 +- .../condition/fast_bit_vec/mod.rs | 2 +- src/fast_automaton/condition/mod.rs | 11 +- .../convert/to_regex/builder/mod.rs | 5 +- src/fast_automaton/convert/to_regex/mod.rs | 45 +-- src/fast_automaton/mod.rs | 17 +- src/fast_automaton/operation/alternation.rs | 6 +- src/fast_automaton/operation/concatenate.rs | 101 +------ src/fast_automaton/operation/intersection.rs | 4 +- src/fast_automaton/operation/mod.rs | 1 + src/fast_automaton/operation/repeat.rs | 107 +++++++ src/lib.rs | 264 ++++++++++++++---- src/regex/mod.rs | 19 +- src/regex/operation/mod.rs | 67 +---- src/regex/operation/repeat.rs | 67 +++++ src/regex/operation/union.rs | 3 +- src/regex/serializer.rs | 5 +- src/traits.rs | 0 20 files changed, 507 insertions(+), 341 deletions(-) create mode 100644 src/fast_automaton/operation/repeat.rs create mode 100644 src/regex/operation/repeat.rs delete mode 100644 src/traits.rs diff --git a/Cargo.toml b/Cargo.toml index cd03087..7147509 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "regexsolver" -version = "0.3.1" -edition = "2021" +version = "1.0.0" +edition = "2024" authors = ["Alexandre van Beurden"] repository = "https://github.com/RegexSolver/regexsolver" license = "MIT" diff --git a/README.md b/README.md index dcb0b47..2d2bff0 100644 --- a/README.md +++ b/README.md @@ -1,73 +1,67 @@ -# RegexSolver +# RegexSolver [![Crates.io Version](https://img.shields.io/crates/v/regexsolver)](https://crates.io/crates/regexsolver) - -This repository contains the code of [RegexSolver](https://regexsolver.com/) engine. - -For more information, you can check the library's [documentation](https://docs.rs/regexsolver/latest/regexsolver/). - -If you want to use this library with other programming languages, we provide a wide range of wrappers: - -- [regexsolver-java](https://github.com/RegexSolver/regexsolver-java) -- [regexsolver-js](https://github.com/RegexSolver/regexsolver-js) -- [regexsolver-python](https://github.com/RegexSolver/regexsolver-python) - -For more information about how to use the wrappers, you can refer to our [getting started guide](https://docs.regexsolver.com/getting-started.html). + A high-performance Rust library for building, combining, and analyzing regular expressions and finite automata. + +Ideal for constraint solvers, code generators, test-case generators, and any use case requiring rich regex/automaton operations at scale. + +## Key Features +- **Dual Representation**: Work interchangeably with regex syntax or compiled automata via the `Term` enum. +- **Set Operations**: Concatenate, union, intersect, subtract, and repeat regex/automaton terms. +- **Analysis & Properties**: + - Compute language **cardinality**, **length bounds**, **emptiness**, and **totality**. + - Check **equivalence** and **subset** relations between terms. +- **String Generation**: Generate example strings matching a term, for testing or sampling. +- **Performance & Tuning**: Pluggable `ExecutionProfile` to bound cost and resource usage. 
## Installation - Add the following line in your `Cargo.toml`: - ```toml [dependencies] -regexsolver = "0.3" +regexsolver = "1" ``` - ## Examples -### Union - ```rust -use regexsolver::Term; - -let term1 = Term::from_regex("abc").unwrap(); -let term2 = Term::from_regex("de").unwrap(); -let term3 = Term::from_regex("fghi").unwrap(); - -let union = term1.union(&[term2, term3]).unwrap(); - -if let Term::RegularExpression(regex) = union { - println!("{}", regex.to_string()); // (abc|de|fghi) -} +// Create terms from regex +let t1 = Term::from_regex("abc.*")?; +let t2 = Term::from_regex(".*xyz")?; + +// Concatenate +let concat = t1.concat(&[t2])?; +assert_eq!(concat.to_string(), "abc.*xyz"); + +// Union +let union = t1.union(&[Term::from_regex("fgh")?])?; // (abc.*|fgh) +assert_eq!(union.to_string(), "(abc.*|fgh)"); + +// Intersection +let inter = Term::from_regex("(ab|xy){2}")?.intersection(&[Term::from_regex(".*xy")?])?; // (ab|xy)xy +assert_eq!(inter.to_string(), "(ab|xy)xy"); + +// Subtraction +let diff = Term::from_regex("a*")?.subtraction(&Term::from_regex("")?)?; +assert_eq!(diff.to_string(), "a+"); + +// Repetition +let rep = Term::from_regex("abc")?.repeat(2, Some(4))?; // (abc){2,4} +assert_eq!(rep.to_string(), "(abc){2,4}"); + +// Analyze +let details = rep.get_details()?; +assert_eq!(details.get_length(), &(Some(6), Some(12))); +assert!(!details.is_empty()); + +// Generate examples +let samples = Term::from_regex("(x|y){1,3}")?.generate_strings(5)?; +println!("Some matches: {:?}", samples); + +// Equivalence & subset +let a = Term::from_regex("a+")?; +let b = Term::from_regex("a*")?; +assert!(!a.are_equivalent(&b)?); +assert!(a.is_subset_of(&b)?); ``` -### Intersection - -```rust -use regexsolver::Term; - -let term1 = Term::from_regex("(abc|de){2}").unwrap(); -let term2 = Term::from_regex("de.*").unwrap(); -let term3 = Term::from_regex(".*abc").unwrap(); - -let intersection = term1.intersection(&[term2, term3]).unwrap(); - -if let Term::RegularExpression(regex) = intersection { - println!("{}", regex.to_string()); // deabc -} -``` - -### Difference/Subtraction - -```rust -use regexsolver::Term; - -let term1 = Term::from_regex("(abc|de)").unwrap(); -let term2 = Term::from_regex("de").unwrap(); - -let subtraction = term1.subtraction(&term2).unwrap(); - -if let Term::RegularExpression(regex) = subtraction { - println!("{}", regex.to_string()); // abc -} -``` +## Execution Profiles +By default, all operations run without limits. For heavy or untrusted patterns, use an `ExecutionProfile` to cap time, memory or term count: diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index b6cf50b..c597747 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -192,8 +192,7 @@ impl FastAutomaton { self.assert_state_exists(state); if self.start_state == state { panic!( - "Can not remove the state {}, it is still used as start state.", - state + "Can not remove the state {state}, it is still used as start state." ); } self.accept_states.remove(&state); @@ -228,8 +227,7 @@ impl FastAutomaton { for &state in states { if self.start_state == state { panic!( - "Can not remove the state {}, it is still used as start state.", - state + "Can not remove the state {state}, it is still used as start state." 
); } if self.transitions.len() - 1 == state { diff --git a/src/fast_automaton/condition/fast_bit_vec/mod.rs b/src/fast_automaton/condition/fast_bit_vec/mod.rs index bbf4376..82b0ead 100644 --- a/src/fast_automaton/condition/fast_bit_vec/mod.rs +++ b/src/fast_automaton/condition/fast_bit_vec/mod.rs @@ -8,7 +8,7 @@ impl std::fmt::Display for FastBitVec { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { for i in 0..self.n { let bit = if self.get(i).unwrap() { 1 } else { 0 }; - write!(f, "{}", bit)?; + write!(f, "{bit}")?; } Ok(()) } diff --git a/src/fast_automaton/condition/mod.rs b/src/fast_automaton/condition/mod.rs index da9c2b8..40415e3 100644 --- a/src/fast_automaton/condition/mod.rs +++ b/src/fast_automaton/condition/mod.rs @@ -151,7 +151,8 @@ impl Condition { Ok(self.to_range(spanning_set)?.get_cardinality()) } - pub fn get_bits(&self) -> Vec { + #[inline] + pub fn get_binary_representation(&self) -> Vec { self.0.get_bits() } } @@ -193,11 +194,11 @@ mod tests { let empty = Condition::empty(&spanning_set); //println!("{empty}"); assert!(empty.is_empty()); - assert_eq!(vec![false, false, false, false], empty.get_bits()); + assert_eq!(vec![false, false, false, false], empty.get_binary_representation()); let total = Condition::total(&spanning_set); //println!("{total}"); assert!(total.is_total()); - assert_eq!(vec![true, true, true, true], total.get_bits()); + assert_eq!(vec![true, true, true, true], total.get_binary_representation()); assert_eq!(Range::empty(), empty.to_range(&spanning_set).unwrap()); assert_eq!(Range::total(), total.to_range(&spanning_set).unwrap()); @@ -225,13 +226,13 @@ mod tests { empty, Condition::from_range(&Range::empty(), &spanning_set).unwrap() ); - assert_eq!(vec![false], empty.get_bits()); + assert_eq!(vec![false], empty.get_binary_representation()); assert_eq!( total, Condition::from_range(&Range::total(), &spanning_set).unwrap() ); - assert_eq!(vec![true], total.get_bits()); + assert_eq!(vec![true], total.get_binary_representation()); assert_eq!(empty, total.complement()); assert_eq!(total, empty.complement()); diff --git a/src/fast_automaton/convert/to_regex/builder/mod.rs b/src/fast_automaton/convert/to_regex/builder/mod.rs index b6c8dd5..648f733 100644 --- a/src/fast_automaton/convert/to_regex/builder/mod.rs +++ b/src/fast_automaton/convert/to_regex/builder/mod.rs @@ -85,7 +85,7 @@ impl StateEliminationAutomaton { #[inline] fn assert_state_exists(&self, state: State) { if !self.has_state(state) { - panic!("The state {} does not exist", state); + panic!("The state {state} does not exist"); } } @@ -124,8 +124,7 @@ impl StateEliminationAutomaton { self.assert_state_exists(state); if self.start_state == state || self.accept_state == state { panic!( - "Can not remove the state {}, it is still used as start state or accept state.", - state + "Can not remove the state {state}, it is still used as start state or accept state." 
); } self.transitions_in.remove(&state); diff --git a/src/fast_automaton/convert/to_regex/mod.rs b/src/fast_automaton/convert/to_regex/mod.rs index d9a1dd0..2d84ff8 100644 --- a/src/fast_automaton/convert/to_regex/mod.rs +++ b/src/fast_automaton/convert/to_regex/mod.rs @@ -1,5 +1,5 @@ use std::{ - collections::{hash_map::Entry, VecDeque}, + collections::{VecDeque, hash_map::Entry}, fmt::Display, }; @@ -56,7 +56,7 @@ impl StateEliminationAutomaton { #[allow(dead_code)] #[inline] pub fn to_dot(&self) { - println!("{}", self); + println!("{self}"); } #[inline] @@ -68,8 +68,8 @@ impl StateEliminationAutomaton { let is_subgraph; let indent; let prefix = if let Some(prefix) = prefix { - writeln!(sb, "\tsubgraph cluster_{} {{", prefix)?; - writeln!(sb, "\t\tlabel = \"{} - cyclic={}\";", prefix, self.cyclic)?; + writeln!(sb, "\tsubgraph cluster_{prefix} {{")?; + writeln!(sb, "\t\tlabel = \"{prefix} - cyclic={}\";", self.cyclic)?; indent = "\t"; is_subgraph = true; prefix @@ -89,16 +89,16 @@ impl StateEliminationAutomaton { format!("S{from_state}") }; - write!(sb, "{indent}\t{}", from_state_with_prefix)?; + write!(sb, "{indent}\t{from_state_with_prefix}")?; if !is_subgraph && self.accept_state == from_state { - writeln!(sb, "\t[shape=doublecircle,label=\"{}\"];", from_state)?; + writeln!(sb, "\t[shape=doublecircle,label=\"{from_state}\"];")?; } else { - writeln!(sb, "{indent}\t[shape=circle,label=\"{}\"];", from_state)?; + writeln!(sb, "{indent}\t[shape=circle,label=\"{from_state}\"];")?; } if !is_subgraph && self.start_state == from_state { writeln!(sb, "\tinitial [shape=plaintext,label=\"\"];")?; - writeln!(sb, "\tinitial -> {}", from_state_with_prefix)?; + writeln!(sb, "\tinitial -> {from_state_with_prefix}")?; } for (to_state, weight) in self.transitions_from_state_enumerate_iter(&from_state) { let to_state_with_prefix = if is_subgraph { @@ -117,23 +117,21 @@ impl StateEliminationAutomaton { state_elimination_automaton.to_graph_dot(sb, Some(&subgraph_prefix))?; writeln!(sb)?; let subgraph_start_state = format!( - "S{}_{}", - subgraph_prefix, state_elimination_automaton.start_state + "S{subgraph_prefix}_{}", + state_elimination_automaton.start_state ); writeln!( sb, - "{indent}\t{} -> {} [label=\"ε\"]", - from_state_with_prefix, subgraph_start_state + "{indent}\t{from_state_with_prefix} -> {subgraph_start_state} [label=\"ε\"]" )?; let subgraph_accept_state = format!( - "S{}_{}", - subgraph_prefix, state_elimination_automaton.accept_state + "S{subgraph_prefix}_{}", + state_elimination_automaton.accept_state ); writeln!( sb, - "{indent}\t{} -> {} [label=\"ε\"]", - subgraph_accept_state, to_state_with_prefix + "{indent}\t{subgraph_accept_state} -> {to_state_with_prefix} [label=\"ε\"]" ) } GraphTransition::Weight(range) => { @@ -150,8 +148,7 @@ impl StateEliminationAutomaton { } GraphTransition::Epsilon => writeln!( sb, - "{indent}\t{} -> {} [label=\"ε\"]", - from_state_with_prefix, to_state_with_prefix + "{indent}\t{from_state_with_prefix} -> {to_state_with_prefix} [label=\"ε\"]" ), }?; } @@ -259,20 +256,26 @@ impl FastAutomaton { Ok(automaton) => match self.is_equivalent_of(&automaton) { Ok(result) => { if !result { - warn!("The automaton is not equivalent to the generated regex; automaton={}, regex={}", self, regex); + warn!( + "The automaton is not equivalent to the generated regex; automaton={self}, regex={regex}" + ); None } else { Some(regex) } } Err(err) => { - warn!("Engine error while checking for equivalence ({}); automaton={}, regex={}", err, self, regex); + warn!( + "Engine error while 
checking for equivalence ({err}); automaton={self}, regex={regex}" + ); None } }, Err(err) => { if let crate::error::EngineError::RegexSyntaxError(err) = err { - warn!("The generated regex cannot be converted to automaton to be checked for equivalence ({}); automaton={}, regex={}", err, self, regex); + warn!( + "The generated regex cannot be converted to automaton to be checked for equivalence ({err}); automaton={self}, regex={regex}" + ); } None } diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index 6d6fcbc..224b150 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -6,6 +6,7 @@ use spanning_set::SpanningSet; use std::collections::hash_map::Entry; use std::collections::VecDeque; use std::fmt::Display; +use crate::error::EngineError; use crate::{IntMap, IntSet}; @@ -40,23 +41,21 @@ impl Display for FastAutomaton { writeln!(sb, "digraph Automaton {{")?; writeln!(sb, "\trankdir = LR;")?; for from_state in self.transitions_iter() { - write!(sb, "\t{}", from_state)?; + write!(sb, "\t{from_state}")?; if self.accept_states.contains(&from_state) { - writeln!(sb, "\t[shape=doublecircle,label=\"{}\"];", from_state)?; + writeln!(sb, "\t[shape=doublecircle,label=\"{from_state}\"];")?; } else { - writeln!(sb, "\t[shape=circle,label=\"{}\"];", from_state)?; + writeln!(sb, "\t[shape=circle,label=\"{from_state}\"];")?; } if self.start_state == from_state { writeln!(sb, "\tinitial [shape=plaintext,label=\"\"];")?; - writeln!(sb, "\tinitial -> {}", from_state)?; + writeln!(sb, "\tinitial -> {from_state}")?; } for (to_state, cond) in self.transitions_from_state_enumerate_iter(&from_state) { writeln!( sb, - "\t{} -> {} [label=\"{}\"]", - from_state, - to_state, + "\t{from_state} -> {to_state} [label=\"{}\"]", cond.to_range(&self.spanning_set) .expect("Cannot convert condition to range.") .to_regex() @@ -73,7 +72,7 @@ impl FastAutomaton { #[inline] fn assert_state_exists(&self, state: State) { if !self.has_state(state) { - panic!("The state {} does not exist", state); + panic!("The state {state} does not exist"); } } @@ -292,7 +291,7 @@ impl FastAutomaton { #[inline] pub fn to_dot(&self) { - println!("{}", self); + println!("{self}"); } } diff --git a/src/fast_automaton/operation/alternation.rs b/src/fast_automaton/operation/alternation.rs index 3daa9ca..84c8749 100644 --- a/src/fast_automaton/operation/alternation.rs +++ b/src/fast_automaton/operation/alternation.rs @@ -15,7 +15,7 @@ impl FastAutomaton { where I: IntoIterator, { - Self::build_union(std::iter::once(self).chain(others.into_iter())) + Self::build_union(std::iter::once(self).chain(others)) } pub(crate) fn build_union<'a, I>(automatons: I) -> Result @@ -24,7 +24,7 @@ impl FastAutomaton { { let mut new_automaton = FastAutomaton::new_empty(); for automaton in automatons { - new_automaton.alternate(&automaton)?; + new_automaton.union_mut(automaton)?; } Ok(new_automaton) } @@ -136,7 +136,7 @@ impl FastAutomaton { * - the start states can't be merged if they have incoming edges * - the accept states can't be merged if they have outgoing edges */ - fn alternate(&mut self, other: &FastAutomaton) -> Result<(), EngineError> { + pub(crate) fn union_mut(&mut self, other: &FastAutomaton) -> Result<(), EngineError> { if other.is_empty() || self.is_total() { return Ok(()); } else if other.is_total() { diff --git a/src/fast_automaton/operation/concatenate.rs b/src/fast_automaton/operation/concatenate.rs index 6de4339..1dbd644 100644 --- a/src/fast_automaton/operation/concatenate.rs +++ 
b/src/fast_automaton/operation/concatenate.rs @@ -15,7 +15,7 @@ impl FastAutomaton { where I: IntoIterator, { - Self::build_concat(std::iter::once(self).chain(others.into_iter())) + Self::build_concat(std::iter::once(self).chain(others)) } pub(crate) fn build_concat<'a, I>(automatons: I) -> Result @@ -24,108 +24,13 @@ impl FastAutomaton { { let mut new_automaton = FastAutomaton::new_empty_string(); for automaton in automatons { - new_automaton.concat_(&automaton)?; + new_automaton.concat_mut(automaton)?; } Ok(new_automaton) } - pub fn repeat(&mut self, min: u32, max_opt: Option) -> Result<(), EngineError> { - if let Some(max) = max_opt { - if min > max { - self.make_empty(); - return Ok(()); - } - } - - let automaton_to_repeat = self.clone(); - - if min == 0 && self.in_degree(self.start_state) != 0 { - let new_state = self.new_state(); - if self.is_accepted(&self.start_state) { - self.accept(new_state); - } - - for to_state in self.transitions_from_state(&self.start_state) { - self.add_epsilon(new_state, to_state); - } - self.start_state = new_state; - - if max_opt.is_none() { - for accept_state in self.accept_states.clone() { - self.add_epsilon(accept_state, self.start_state); - } - self.accept(self.start_state); - return Ok(()); - } - } - - if let Some(max) = max_opt { - if min <= 1 && max == 1 { - if min == 0 { - self.accept_states.insert(self.start_state); - } - return Ok(()); - } - } - - let iter = if min == 0 { 0..0 } else { 0..min - 1 }; - for _ in iter { - self.concat_(&automaton_to_repeat)?; - } - - if max_opt.is_none() { - let mut automaton_to_repeat = automaton_to_repeat.clone(); - - let accept_state = *automaton_to_repeat.accept_states.iter().next().unwrap(); - if automaton_to_repeat.accept_states.len() == 1 - && automaton_to_repeat.out_degree(accept_state) == 0 - && automaton_to_repeat.in_degree(automaton_to_repeat.start_state) == 0 - { - automaton_to_repeat.add_epsilon(accept_state, automaton_to_repeat.start_state); - let old_start_state = automaton_to_repeat.start_state; - automaton_to_repeat.start_state = accept_state; - automaton_to_repeat.remove_state(old_start_state); - } else { - let t = Self::transitions_from_state_set( - &automaton_to_repeat.transitions, - automaton_to_repeat.start_state, - ); - let transitions = - Self::transitions_from_state_enumerate(&t, &automaton_to_repeat.removed_states); - - for state in automaton_to_repeat.accept_states.clone() { - for &(to_state, condition) in &transitions { - automaton_to_repeat.add_transition_to(state, *to_state, condition); - } - } - - automaton_to_repeat.accept(automaton_to_repeat.get_start_state()); - } - automaton_to_repeat.cyclic = true; - - if min == 0 { - self.apply_model(&automaton_to_repeat); - } else { - self.concat_(&automaton_to_repeat)?; - } - - return Ok(()); - } - - let mut end_states = self.accept_states.iter().cloned().collect::>(); - for _ in cmp::max(min, 1)..max_opt.unwrap() { - self.concat_(&automaton_to_repeat)?; - end_states.extend(self.accept_states.iter()); - } - self.accept_states.extend(end_states); - if min == 0 { - self.accept(self.start_state); - } - Ok(()) - } - - fn concat_(&mut self, other: &FastAutomaton) -> Result<(), EngineError> { + pub(crate) fn concat_mut(&mut self, other: &FastAutomaton) -> Result<(), EngineError> { if other.is_empty() { return Ok(()); } diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 3bf07fd..2664aec 100644 --- a/src/fast_automaton/operation/intersection.rs +++ 
b/src/fast_automaton/operation/intersection.rs @@ -18,7 +18,7 @@ impl FastAutomaton { let mut result = Cow::Borrowed(self); for other in others { - result = result.intersection_(other)?; + result = result.intersection_internal(other)?; if result.is_empty() { break; @@ -28,7 +28,7 @@ impl FastAutomaton { Ok(result.into_owned()) } - fn intersection_<'a>( + fn intersection_internal<'a>( &self, other: &'a FastAutomaton, ) -> Result, EngineError> { diff --git a/src/fast_automaton/operation/mod.rs b/src/fast_automaton/operation/mod.rs index 7c7c0f1..bf0523e 100644 --- a/src/fast_automaton/operation/mod.rs +++ b/src/fast_automaton/operation/mod.rs @@ -9,6 +9,7 @@ mod concatenate; mod determinize; mod intersection; mod subtraction; +mod repeat; impl FastAutomaton { pub fn remove_dead_transitions(&mut self) { diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs new file mode 100644 index 0000000..f451678 --- /dev/null +++ b/src/fast_automaton/operation/repeat.rs @@ -0,0 +1,107 @@ +use super::*; + +impl FastAutomaton { + pub fn repeat(&self, min: u32, max_opt: Option) -> Result { + let mut automaton = self.clone(); + if let Err(error) = automaton.repeat_mut(min, max_opt) { + Err(error) + } else { + Ok(automaton) + } + } + + pub(crate) fn repeat_mut(&mut self, min: u32, max_opt: Option) -> Result<(), EngineError> { + if let Some(max) = max_opt { + if min > max { + self.make_empty(); + return Ok(()); + } + } + + let automaton_to_repeat = self.clone(); + + if min == 0 && self.in_degree(self.start_state) != 0 { + let new_state = self.new_state(); + if self.is_accepted(&self.start_state) { + self.accept(new_state); + } + + for to_state in self.transitions_from_state(&self.start_state) { + self.add_epsilon(new_state, to_state); + } + self.start_state = new_state; + + if max_opt.is_none() { + for accept_state in self.accept_states.clone() { + self.add_epsilon(accept_state, self.start_state); + } + self.accept(self.start_state); + return Ok(()); + } + } + + if let Some(max) = max_opt { + if min <= 1 && max == 1 { + if min == 0 { + self.accept_states.insert(self.start_state); + } + return Ok(()); + } + } + + let iter = if min == 0 { 0..0 } else { 0..min - 1 }; + for _ in iter { + self.concat_mut(&automaton_to_repeat)?; + } + + if max_opt.is_none() { + let mut automaton_to_repeat = automaton_to_repeat.clone(); + + let accept_state = *automaton_to_repeat.accept_states.iter().next().unwrap(); + if automaton_to_repeat.accept_states.len() == 1 + && automaton_to_repeat.out_degree(accept_state) == 0 + && automaton_to_repeat.in_degree(automaton_to_repeat.start_state) == 0 + { + automaton_to_repeat.add_epsilon(accept_state, automaton_to_repeat.start_state); + let old_start_state = automaton_to_repeat.start_state; + automaton_to_repeat.start_state = accept_state; + automaton_to_repeat.remove_state(old_start_state); + } else { + let t = Self::transitions_from_state_set( + &automaton_to_repeat.transitions, + automaton_to_repeat.start_state, + ); + let transitions = + Self::transitions_from_state_enumerate(&t, &automaton_to_repeat.removed_states); + + for state in automaton_to_repeat.accept_states.clone() { + for &(to_state, condition) in &transitions { + automaton_to_repeat.add_transition_to(state, *to_state, condition); + } + } + + automaton_to_repeat.accept(automaton_to_repeat.get_start_state()); + } + automaton_to_repeat.cyclic = true; + + if min == 0 { + self.apply_model(&automaton_to_repeat); + } else { + self.concat_mut(&automaton_to_repeat)?; + } + + return Ok(()); + } + + 
let mut end_states = self.accept_states.iter().cloned().collect::>(); + for _ in cmp::max(min, 1)..max_opt.unwrap() { + self.concat_mut(&automaton_to_repeat)?; + end_states.extend(self.accept_states.iter()); + } + self.accept_states.extend(end_states); + if min == 0 { + self.accept(self.start_state); + } + Ok(()) + } +} diff --git a/src/lib.rs b/src/lib.rs index 359fb65..17f26c7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,5 @@ use std::{ - borrow::Cow, - collections::{HashMap, HashSet}, - hash::BuildHasherDefault, + borrow::Cow, collections::{HashMap, HashSet}, fmt::Display, hash::BuildHasherDefault }; use cardinality::Cardinality; @@ -38,6 +36,15 @@ pub enum Term { Automaton(FastAutomaton), } +impl Display for Term { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Term::RegularExpression(regular_expression) => write!(f, "{regular_expression}"), + Term::Automaton(fast_automaton) => write!(f, "{fast_automaton}"), + } + } +} + impl Term { /// Create a term based on the given pattern. /// @@ -52,7 +59,67 @@ impl Term { Ok(Term::RegularExpression(RegularExpression::new(regex)?)) } - /// Compute the union of the given collection of terms. + /// Compute the concatenation of the current term with the given list of terms. + /// Returns the resulting term. + /// + /// # Example: + /// + /// ``` + /// use regexsolver::Term; + /// + /// let term1 = Term::from_regex("abc").unwrap(); + /// let term2 = Term::from_regex("d.").unwrap(); + /// let term3 = Term::from_regex(".*").unwrap(); + /// + /// let concat = term1.concat(&[term2, term3]).unwrap(); + /// + /// if let Term::RegularExpression(regex) = concat { + /// assert_eq!("abcd.+", regex.to_string()); + /// } + /// ``` + pub fn concat(&self, terms: &[Term]) -> Result { + Self::check_number_of_terms(terms)?; + + let mut return_regex = RegularExpression::new_empty(); + let mut return_automaton = FastAutomaton::new_empty(); + let mut has_automaton = false; + match self { + Term::RegularExpression(regular_expression) => { + return_regex = regular_expression.clone() + } + Term::Automaton(fast_automaton) => { + has_automaton = true; + return_automaton = fast_automaton.clone(); + } + } + for term in terms { + if has_automaton { + return_automaton = return_automaton.concat(term.get_automaton()?.as_ref())?; + } else { + match term { + Term::RegularExpression(regular_expression) => { + return_regex = return_regex.concat(regular_expression, true); + } + Term::Automaton(fast_automaton) => { + has_automaton = true; + return_automaton = return_regex.to_automaton()?.concat(fast_automaton)?; + } + } + } + } + + if !has_automaton { + Ok(Term::RegularExpression(return_regex)) + } else { + if let Some(return_regex) = return_automaton.to_regex() { + Ok(Term::RegularExpression(return_regex)) + } else { + Ok(Term::Automaton(return_automaton)) + } + } + } + + /// Compute the union of the current term with the given collection of terms. /// Returns the resulting term. 
/// /// # Example: @@ -73,52 +140,53 @@ impl Term { pub fn union(&self, terms: &[Term]) -> Result { Self::check_number_of_terms(terms)?; - let mut regex_list = Vec::with_capacity(terms.len()); - let mut automaton_list = Vec::with_capacity(terms.len()); - for operand in terms { - match operand { - Term::RegularExpression(regex) => { - if regex.is_total() { - return Ok(Term::new_total()); - } - regex_list.push(regex); - } - Term::Automaton(automaton) => { - if automaton.is_total() { - return Ok(Term::new_total()); - } - automaton_list.push(automaton); - } - } + if self.is_total() { + return Ok(Term::new_total()); } let mut return_regex = RegularExpression::new_empty(); let mut return_automaton = FastAutomaton::new_empty(); + let mut has_automaton = false; match self { Term::RegularExpression(regular_expression) => { - return_regex = regular_expression.union_all(regex_list); + return_regex = regular_expression.clone() } Term::Automaton(fast_automaton) => { - return_automaton = fast_automaton.union_all(automaton_list)?; + has_automaton = true; + return_automaton = fast_automaton.clone(); + } + } + for term in terms { + if term.is_total() { + return Ok(Term::new_total()); + } + if has_automaton { + return_automaton = return_automaton.union(term.get_automaton()?.as_ref())?; + } else { + match term { + Term::RegularExpression(regular_expression) => { + return_regex = return_regex.union(regular_expression); + } + Term::Automaton(fast_automaton) => { + has_automaton = true; + return_automaton = return_regex.to_automaton()?.union(fast_automaton)?; + } + } } } - if return_automaton.is_empty() { + if !has_automaton { Ok(Term::RegularExpression(return_regex)) } else { - if !return_regex.is_empty() { - return_automaton = return_automaton.union(&return_regex.to_automaton()?)?; - } - - if let Some(regex) = return_automaton.to_regex() { - Ok(Term::RegularExpression(regex)) + if let Some(return_regex) = return_automaton.to_regex() { + Ok(Term::RegularExpression(return_regex)) } else { Ok(Term::Automaton(return_automaton)) } } } - /// Compute the intersection of the given collection of terms. + /// Compute the intersection of the current term with the given collection of terms. /// Returns the resulting term. /// /// # Example: @@ -139,27 +207,30 @@ impl Term { pub fn intersection(&self, terms: &[Term]) -> Result { Self::check_number_of_terms(terms)?; + if self.is_empty() { + return Ok(Term::new_empty()); + } + let mut automaton_list = Vec::with_capacity(terms.len()); - for operand in terms { - let automaton = operand.get_automaton()?; - if automaton.is_empty() { + for term in terms { + if term.is_empty() { return Ok(Term::new_empty()); } - automaton_list.push(automaton); + automaton_list.push(term.get_automaton()?); } let return_automaton = self .get_automaton()? .intersection_all(automaton_list.iter().map(Cow::as_ref))?; - if let Some(regex) = return_automaton.to_regex() { - Ok(Term::RegularExpression(regex)) + if let Some(return_regex) = return_automaton.to_regex() { + Ok(Term::RegularExpression(return_regex)) } else { Ok(Term::Automaton(return_automaton)) } } - /// Compute the subtraction/difference of the two given terms. + /// Compute the subtraction of the current term and the given `subtrahend`. /// Returns the resulting term. 
/// /// # Example: @@ -183,8 +254,8 @@ impl Term { Self::determinize_subtrahend(&minuend_automaton, &subtrahend_automaton)?; let return_automaton = minuend_automaton.subtraction(&subtrahend_automaton)?; - if let Some(regex) = return_automaton.to_regex() { - Ok(Term::RegularExpression(regex)) + if let Some(return_regex) = return_automaton.to_regex() { + Ok(Term::RegularExpression(return_regex)) } else { Ok(Term::Automaton(return_automaton)) } @@ -196,7 +267,45 @@ impl Term { self.subtraction(subtrahend) } - /// Returns the Details of the given term. + /// Returns the repetition of the current term, + /// between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. + /// + /// # Example: + /// + /// ``` + /// use regexsolver::Term; + /// + /// let term = Term::from_regex("abc").unwrap(); + /// + /// let repeat = term.repeat(1, None).unwrap(); + /// + /// if let Term::RegularExpression(regex) = repeat { + /// assert_eq!("(abc)+", regex.to_string()); + /// } + /// + /// let repeat = term.repeat(3, Some(5)).unwrap(); + /// + /// if let Term::RegularExpression(regex) = repeat { + /// assert_eq!("(abc){3,5}", regex.to_string()); + /// } + /// ``` + pub fn repeat(&self, min: u32, max_opt: Option) -> Result { + match self { + Term::RegularExpression(regular_expression) => Ok(Term::RegularExpression( + regular_expression.repeat(min, max_opt), + )), + Term::Automaton(fast_automaton) => { + let repeat_automaton = fast_automaton.repeat(min, max_opt)?; + Ok(if let Some(repeat_regex) = repeat_automaton.to_regex() { + Term::RegularExpression(repeat_regex) + } else { + Term::Automaton(repeat_automaton) + }) + } + } + } + + /// Returns the details of the current term, including cardinality, length, and emptiness. /// /// # Example: /// @@ -250,7 +359,8 @@ impl Term { .collect()) } - /// Compute if the two given terms are equivalent. + /// Compute whether the current term and the given term are equivalent. + /// Returns `true` if both terms accept the same language. /// /// # Example: /// @@ -272,7 +382,8 @@ impl Term { automaton_1.is_equivalent_of(&automaton_2) } - /// Compute if the first term is a subset of the second one. + /// Compute whether the current term is a subset of the given term. + /// Returns `true` if all strings matched by the current term are also matched by the given term. /// /// # Example: /// @@ -327,13 +438,31 @@ impl Term { }) } - fn new_empty() -> Self { + /// Create a term that matches the empty language. + pub fn new_empty() -> Self { Term::RegularExpression(RegularExpression::new_empty()) } - fn new_total() -> Self { + /// Create a term that matches all possible strings. + pub fn new_total() -> Self { Term::RegularExpression(RegularExpression::new_total()) } + + /// Check if the current term matches the empty language. + pub fn is_empty(&self) -> bool { + match self { + Term::RegularExpression(regular_expression) => regular_expression.is_empty(), + Term::Automaton(fast_automaton) => fast_automaton.is_empty(), + } + } + + /// Check if the current term matches all possible strings. + pub fn is_total(&self) -> bool { + match self { + Term::RegularExpression(regular_expression) => regular_expression.is_total(), + Term::Automaton(fast_automaton) => fast_automaton.is_total(), + } + } } /// Represents details about a [Term]. 
@@ -448,12 +577,45 @@ mod tests { } #[test] - fn test__() -> Result<(), String> { - let term = Term::from_regex("(abc|de){2}").unwrap(); - - let strings = term.generate_strings(3).unwrap(); - - println!("strings={:?}", strings); + fn test__() -> Result<(), EngineError> { + // Create terms from regex + let t1 = Term::from_regex("abc.*")?; + let t2 = Term::from_regex(".*xyz")?; + + // Concatenate + let concat = t1.concat(&[t2])?; + assert_eq!(concat.to_string(), "abc.*xyz"); + + // Union + let union = t1.union(&[Term::from_regex("fgh")?])?; // (abc.*|fgh) + assert_eq!(union.to_string(), "(abc.*|fgh)"); + + // Intersection + let inter = Term::from_regex("(ab|xy){2}")?.intersection(&[Term::from_regex(".*xy")?])?; // (ab|xy)xy + assert_eq!(inter.to_string(), "(ab|xy)xy"); + + // Subtraction + let diff = Term::from_regex("a*")?.subtraction(&Term::from_regex("")?)?; + assert_eq!(diff.to_string(), "a+"); + + // Repetition + let rep = Term::from_regex("abc")?.repeat(2, Some(4))?; // (abc){2,4} + assert_eq!(rep.to_string(), "(abc){2,4}"); + + // Analyze + let details = rep.get_details()?; + assert_eq!(details.get_length(), &(Some(6), Some(12))); + assert!(!details.is_empty()); + + // Generate examples + let samples = Term::from_regex("(x|y){1,3}")?.generate_strings(5)?; + println!("Some matches: {:?}", samples); + + // Equivalence & subset + let a = Term::from_regex("a+")?; + let b = Term::from_regex("a*")?; + assert!(!a.are_equivalent(&b)?); + assert!(a.is_subset_of(&b)?); Ok(()) } diff --git a/src/regex/mod.rs b/src/regex/mod.rs index c131d2b..59ceb29 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -44,21 +44,21 @@ impl Display for RegularExpression { multiplicator_part = String::from("?"); } else if let Some(max) = max_opt { if max == min { - multiplicator_part = format!("{{{}}}", max); + multiplicator_part = format!("{{{max}}}"); } else { - multiplicator_part = format!("{{{},{}}}", min, max); + multiplicator_part = format!("{{{min},{max}}}"); } } else { - multiplicator_part = format!("{{{},}}", min); + multiplicator_part = format!("{{{min},}}"); } match **regular_expression { RegularExpression::Repetition(_, _, _) => { - format!("({}){}", regex_part, multiplicator_part) + format!("({regex_part}){multiplicator_part}") } RegularExpression::Concat(_) => { - format!("({}){}", regex_part, multiplicator_part) + format!("({regex_part}){multiplicator_part}") } - _ => format!("{}{}", regex_part, multiplicator_part), + _ => format!("{regex_part}{multiplicator_part}"), } } RegularExpression::Concat(concat) => { @@ -82,11 +82,11 @@ impl Display for RegularExpression { if alternation.len() == 1 { sb } else { - format!("({})", sb) + format!("({sb})") } } }; - write!(f, "{}", str) + write!(f, "{str}") } } @@ -130,7 +130,7 @@ impl RegularExpression { RegularExpression::Character(range) => FastAutomaton::make_from_range(range), RegularExpression::Repetition(regular_expression, min, max_opt) => { let mut automaton = regular_expression.to_automaton()?; - automaton.repeat(*min, *max_opt)?; + automaton.repeat_mut(*min, *max_opt)?; Ok(automaton) } RegularExpression::Concat(concat) => { @@ -138,7 +138,6 @@ impl RegularExpression { for c in concat.iter() { concats.push(c.to_automaton()?); } - println!("{:?}", concats); FastAutomaton::build_concat(&concats) } RegularExpression::Alternation(alternation) => { diff --git a/src/regex/operation/mod.rs b/src/regex/operation/mod.rs index b01ac78..382c885 100644 --- a/src/regex/operation/mod.rs +++ b/src/regex/operation/mod.rs @@ -3,72 +3,7 @@ use super::*; mod concat; 
mod simplify; mod union; - -impl RegularExpression { - pub fn repeat(&self, min: u32, max_opt: Option) -> RegularExpression { - if self.is_total() { - return RegularExpression::new_total(); - } else if self.is_empty() { - return RegularExpression::new_empty(); - } else if self.is_empty_string() { - return Self::new_empty_string(); - } else if let Some(max) = max_opt { - if max < min || max == 0 { - return RegularExpression::new_empty_string(); - } else if min == 1 && max == 1 { - return self.clone(); - } - } - - match self { - RegularExpression::Repetition(regular_expression, o_min, o_max_opt) => { - let new_max = if let (Some(max), Some(o_max)) = (max_opt, o_max_opt) { - Some(max * o_max) - } else { - None - }; - - let o_min = *o_min; - if let Some(o_max) = o_max_opt { - let o_max = *o_max; - if o_min <= 1 || max_opt.is_some() && max_opt.unwrap() == min { - RegularExpression::Repetition( - regular_expression.clone(), - min * o_min, - new_max, - ) - } else if o_min == o_max && o_min > 1 { - RegularExpression::Repetition(Box::new(self.clone()), min, max_opt) - } else { - let r = ((o_max as f64) - 1f64) / ((o_max as f64) - (o_min as f64)); - if r > cmp::max(2, min) as f64 { - return RegularExpression::Repetition( - Box::new(self.clone()), - min, - max_opt, - ); - } - - RegularExpression::Repetition( - regular_expression.clone(), - min * o_min, - new_max, - ) - } - } else if o_max_opt.is_none() - || max_opt.is_some() && (max_opt.unwrap() == min || max_opt.unwrap() == 1) - || o_max_opt.is_some() && o_max_opt.unwrap() == 1 - || max_opt.is_none() && o_min == 0 - { - RegularExpression::Repetition(regular_expression.clone(), min * o_min, new_max) - } else { - RegularExpression::Repetition(Box::new(self.clone()), min, max_opt) - } - } - _ => RegularExpression::Repetition(Box::new(self.clone()), min, max_opt), - } - } -} +mod repeat; #[cfg(test)] mod tests { diff --git a/src/regex/operation/repeat.rs b/src/regex/operation/repeat.rs new file mode 100644 index 0000000..00b9685 --- /dev/null +++ b/src/regex/operation/repeat.rs @@ -0,0 +1,67 @@ +use super::*; + +impl RegularExpression { + pub fn repeat(&self, min: u32, max_opt: Option) -> RegularExpression { + if self.is_total() { + return RegularExpression::new_total(); + } else if self.is_empty() { + return RegularExpression::new_empty(); + } else if self.is_empty_string() { + return Self::new_empty_string(); + } else if let Some(max) = max_opt { + if max < min || max == 0 { + return RegularExpression::new_empty_string(); + } else if min == 1 && max == 1 { + return self.clone(); + } + } + + match self { + RegularExpression::Repetition(regular_expression, o_min, o_max_opt) => { + let new_max = if let (Some(max), Some(o_max)) = (max_opt, o_max_opt) { + Some(max * o_max) + } else { + None + }; + + let o_min = *o_min; + if let Some(o_max) = o_max_opt { + let o_max = *o_max; + if o_min <= 1 || max_opt.is_some() && max_opt.unwrap() == min { + RegularExpression::Repetition( + regular_expression.clone(), + min * o_min, + new_max, + ) + } else if o_min == o_max && o_min > 1 { + RegularExpression::Repetition(Box::new(self.clone()), min, max_opt) + } else { + let r = ((o_max as f64) - 1f64) / ((o_max as f64) - (o_min as f64)); + if r > cmp::max(2, min) as f64 { + return RegularExpression::Repetition( + Box::new(self.clone()), + min, + max_opt, + ); + } + + RegularExpression::Repetition( + regular_expression.clone(), + min * o_min, + new_max, + ) + } + } else if o_max_opt.is_none() + || max_opt.is_some() && (max_opt.unwrap() == min || max_opt.unwrap() == 1) + || 
o_max_opt.is_some() && o_max_opt.unwrap() == 1 + || max_opt.is_none() && o_min == 0 + { + RegularExpression::Repetition(regular_expression.clone(), min * o_min, new_max) + } else { + RegularExpression::Repetition(Box::new(self.clone()), min, max_opt) + } + } + _ => RegularExpression::Repetition(Box::new(self.clone()), min, max_opt), + } + } +} \ No newline at end of file diff --git a/src/regex/operation/union.rs b/src/regex/operation/union.rs index 9589b4a..62789e6 100644 --- a/src/regex/operation/union.rs +++ b/src/regex/operation/union.rs @@ -116,8 +116,7 @@ impl RegularExpression { } } else { panic!( - "Not character and repetition {:?} {:?}", - this_character, that_repetition + "Not character and repetition {this_character:?} {that_repetition:?}" ) } } diff --git a/src/regex/serializer.rs b/src/regex/serializer.rs index 83fd99f..0832756 100644 --- a/src/regex/serializer.rs +++ b/src/regex/serializer.rs @@ -16,10 +16,7 @@ impl<'de> serde::Deserialize<'de> for RegularExpression { where D: Deserializer<'de>, { - let regex_string = match String::deserialize(deserializer) { - Ok(str) => str, - Err(err) => return Err(err), - }; + let regex_string = String::deserialize(deserializer)?; match RegularExpression::new(®ex_string) { Ok(regex) => Ok(regex), Err(err) => Err(de::Error::custom(err.to_string())), diff --git a/src/traits.rs b/src/traits.rs deleted file mode 100644 index e69de29..0000000 From 9cf30a632129bfeb0c05d42526bfe0c083cf9e57 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Thu, 10 Jul 2025 22:17:34 +0200 Subject: [PATCH 04/44] WIP --- README.md | 74 +++- src/error/mod.rs | 23 +- src/execution_profile.rs | 382 +++++++++++-------- src/fast_automaton/convert/to_regex/mod.rs | 4 +- src/fast_automaton/generate.rs | 6 +- src/fast_automaton/operation/determinize.rs | 14 +- src/fast_automaton/operation/intersection.rs | 6 +- src/lib.rs | 119 +++--- src/regex/mod.rs | 8 +- 9 files changed, 378 insertions(+), 258 deletions(-) diff --git a/README.md b/README.md index 2d2bff0..040684f 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ Ideal for constraint solvers, code generators, test-case generators, and any use - Compute language **cardinality**, **length bounds**, **emptiness**, and **totality**. - Check **equivalence** and **subset** relations between terms. - **String Generation**: Generate example strings matching a term, for testing or sampling. -- **Performance & Tuning**: Pluggable `ExecutionProfile` to bound cost and resource usage. +- **Performance & Tuning**: Pluggable `ExecutionProfile` to bound time and resource usage. 
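As a small illustration of the dual representation mentioned in the list above, a term can be inspected to see which form it currently holds. This is only a sketch; the variant names come from the library's `Term` enum and may evolve.

```rust
use regexsolver::Term;

// A Term holds either a parsed regular expression or a compiled automaton.
let term = Term::from_regex("a|b").unwrap();
match &term {
    Term::RegularExpression(re) => println!("regex form: {re}"),
    Term::Automaton(_) => println!("compiled automaton form"),
}
```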
## Installation Add the following line in your `Cargo.toml`: @@ -23,45 +23,89 @@ regexsolver = "1" ## Examples ```rust +use regexsolver::Term; + // Create terms from regex -let t1 = Term::from_regex("abc.*")?; -let t2 = Term::from_regex(".*xyz")?; +let t1 = Term::from_regex("abc.*").unwrap(); +let t2 = Term::from_regex(".*xyz").unwrap(); // Concatenate -let concat = t1.concat(&[t2])?; +let concat = t1.concat(&[t2]).unwrap(); assert_eq!(concat.to_string(), "abc.*xyz"); // Union -let union = t1.union(&[Term::from_regex("fgh")?])?; // (abc.*|fgh) +let union = t1.union(&[Term::from_regex("fgh").unwrap()]).unwrap(); // (abc.*|fgh) assert_eq!(union.to_string(), "(abc.*|fgh)"); // Intersection -let inter = Term::from_regex("(ab|xy){2}")?.intersection(&[Term::from_regex(".*xy")?])?; // (ab|xy)xy +let inter = Term::from_regex("(ab|xy){2}") + .unwrap() + .intersection(&[Term::from_regex(".*xy").unwrap()]) + .unwrap(); // (ab|xy)xy assert_eq!(inter.to_string(), "(ab|xy)xy"); // Subtraction -let diff = Term::from_regex("a*")?.subtraction(&Term::from_regex("")?)?; +let diff = Term::from_regex("a*") + .unwrap() + .subtraction(&Term::from_regex("").unwrap()) + .unwrap(); assert_eq!(diff.to_string(), "a+"); // Repetition -let rep = Term::from_regex("abc")?.repeat(2, Some(4))?; // (abc){2,4} +let rep = Term::from_regex("abc").unwrap().repeat(2, Some(4)).unwrap(); // (abc){2,4} assert_eq!(rep.to_string(), "(abc){2,4}"); // Analyze -let details = rep.get_details()?; +let details = rep.get_details().unwrap(); assert_eq!(details.get_length(), &(Some(6), Some(12))); assert!(!details.is_empty()); // Generate examples -let samples = Term::from_regex("(x|y){1,3}")?.generate_strings(5)?; +let samples = Term::from_regex("(x|y){1,3}") + .unwrap() + .generate_strings(5) + .unwrap(); println!("Some matches: {:?}", samples); // Equivalence & subset -let a = Term::from_regex("a+")?; -let b = Term::from_regex("a*")?; -assert!(!a.are_equivalent(&b)?); -assert!(a.is_subset_of(&b)?); +let a = Term::from_regex("a+").unwrap(); +let b = Term::from_regex("a*").unwrap(); +assert!(!a.are_equivalent(&b).unwrap()); +assert!(a.is_subset_of(&b).unwrap()); ``` ## Execution Profiles -By default, all operations run without limits. For heavy or untrusted patterns, use an `ExecutionProfile` to cap time, memory or term count: +By default, all operations run without limits. For heavy or untrusted patterns, use an `ExecutionProfile` to cap execution time and maximum number of states in used automata. 
+ +### Example: Limit the execution time +```rust +use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; + +let term = Term::from_regex(".*abc.*cdef.*sqdsqf.*").unwrap(); + +let execution_profile = ExecutionProfileBuilder::new() + .execution_timeout(5) // We set the limit (5ms) + .build(); + +// We run the operation with the defined limitation +execution_profile.run(|| { + assert_eq!(EngineError::OperationTimeOutError, term.generate_strings(1000).unwrap_err()); +}); +``` + +### Example: Limit the number of states +```rust +use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; + +let term1 = Term::from_regex(".*abcdef.*").unwrap(); +let term2 = Term::from_regex(".*defabc.*").unwrap(); + +let execution_profile = ExecutionProfileBuilder::new() + .max_number_of_states(5) // We set the limit + .build(); + +// We run the operation with the defined limitation +execution_profile.run(|| { + assert_eq!(EngineError::AutomatonHasTooManyStates, term1.intersection(&[term2]).unwrap_err()); +}); +``` \ No newline at end of file diff --git a/src/error/mod.rs b/src/error/mod.rs index 6447ebe..91085b3 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -15,8 +15,6 @@ pub enum EngineError { AutomatonHasTooManyStates, /// The regular expression can not be parsed. RegexSyntaxError(String), - /// Too many terms are used in the operation. - TooMuchTerms(usize, usize), /// The provided range can not be built from the spanning set. ConditionInvalidRange, /// The provided index is out of bound of the condition. @@ -30,13 +28,21 @@ impl fmt::Display for EngineError { match self { EngineError::InvalidCharacterInRegex => write!(f, "Invalid character used in regex."), EngineError::OperationTimeOutError => write!(f, "The operation took too much time."), - EngineError::AutomatonShouldBeDeterministic => write!(f, "The given automaton should be deterministic."), - EngineError::AutomatonHasTooManyStates => write!(f, "The automaton has too many states."), + EngineError::AutomatonShouldBeDeterministic => { + write!(f, "The given automaton should be deterministic.") + } + EngineError::AutomatonHasTooManyStates => { + write!(f, "The automaton has too many states.") + } EngineError::RegexSyntaxError(err) => write!(f, "{err}."), - EngineError::TooMuchTerms(max, got) => write!(f, "Too many terms are used in this operation, the maximum allowed for your plan is {max} and you used {got}."), - EngineError::TokenError(err) => write!(f, "{err}."), - EngineError::ConditionInvalidRange => write!(f, "The provided range can not be built from the spanning set."), - EngineError::ConditionIndexOutOfBound => write!(f, "The provided index is out of bound of the condition."), + EngineError::TokenError(err) => write!(f, "{err}."), + EngineError::ConditionInvalidRange => write!( + f, + "The provided range can not be built from the spanning set." 
+ ), + EngineError::ConditionIndexOutOfBound => { + write!(f, "The provided index is out of bound of the condition.") + } } } } @@ -53,7 +59,6 @@ impl EngineError { EngineError::AutomatonShouldBeDeterministic => true, EngineError::AutomatonHasTooManyStates => false, EngineError::RegexSyntaxError(_) => false, - EngineError::TooMuchTerms(_, _) => false, EngineError::TokenError(_) => false, EngineError::ConditionInvalidRange => true, EngineError::ConditionIndexOutOfBound => true, diff --git a/src/execution_profile.rs b/src/execution_profile.rs index 2ae8e2b..008cffb 100644 --- a/src/execution_profile.rs +++ b/src/execution_profile.rs @@ -4,102 +4,76 @@ use crate::error::EngineError; /// Hold settings about limitations and constraints of operations execution within the engine. /// -/// To apply the settings on the current thread you need to call the following function: -/// ``` -/// use regexsolver::execution_profile::{ExecutionProfile, ThreadLocalParams}; -/// -/// let execution_profile = ExecutionProfile { -/// max_number_of_states: 1, -/// start_execution_time: None, -/// execution_timeout: 1000, -/// max_number_of_terms: 10, -/// }; -/// -/// // Store the settings on the current thread. -/// ThreadLocalParams::init_profile(&execution_profile); -/// ``` -/// /// # Examples: /// /// ## Limiting the number of states /// ``` -/// use regexsolver::{Term, execution_profile::{ExecutionProfile, ThreadLocalParams}, error::EngineError}; -/// -/// let term1 = Term::from_regex(".*abc.*").unwrap(); -/// let term2 = Term::from_regex(".*def.*").unwrap(); -/// -/// let execution_profile = ExecutionProfile { -/// max_number_of_states: 1, -/// start_execution_time: None, -/// execution_timeout: 1000, -/// max_number_of_terms: 10, -/// }; -/// ThreadLocalParams::init_profile(&execution_profile); -/// -/// assert_eq!(EngineError::AutomatonHasTooManyStates, term1.intersection(&[term2]).unwrap_err()); -/// ``` +/// use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; /// -/// ## Limiting the number of terms -/// ``` -/// use regexsolver::{Term, execution_profile::{ExecutionProfile, ThreadLocalParams}, error::EngineError}; -/// -/// let term1 = Term::from_regex(".*abc.*").unwrap(); -/// let term2 = Term::from_regex(".*def.*").unwrap(); -/// let term3 = Term::from_regex(".*hij.*").unwrap(); +/// let term1 = Term::from_regex(".*abcdef.*").unwrap(); +/// let term2 = Term::from_regex(".*defabc.*").unwrap(); /// -/// let execution_profile = ExecutionProfile { -/// max_number_of_states: 8192, -/// start_execution_time: None, -/// execution_timeout: 1000, -/// max_number_of_terms: 2, -/// }; -/// ThreadLocalParams::init_profile(&execution_profile); +/// let execution_profile = ExecutionProfileBuilder::new() +/// .max_number_of_states(5) +/// .build(); /// -/// assert_eq!(EngineError::TooMuchTerms(2,3), term1.intersection(&[term2, term3]).unwrap_err()); +/// execution_profile.run(|| { +/// assert_eq!(EngineError::AutomatonHasTooManyStates, term1.intersection(&[term2]).unwrap_err()); +/// }); /// ``` /// /// ## Limiting the execution time /// ``` -/// use regexsolver::{Term, execution_profile::{ExecutionProfile, ThreadLocalParams}, error::EngineError}; +/// use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; /// use std::time::SystemTime; /// /// let term = Term::from_regex(".*abc.*cdef.*sqdsqf.*").unwrap(); /// -/// let execution_profile = ExecutionProfile { -/// max_number_of_states: 8192, -/// 
start_execution_time: Some(SystemTime::now()), -/// execution_timeout: 1, -/// max_number_of_terms: 50, -/// }; -/// ThreadLocalParams::init_profile(&execution_profile); +/// let execution_profile = ExecutionProfileBuilder::new() +/// .execution_timeout(5) // 5ms +/// .build(); /// -/// assert_eq!(EngineError::OperationTimeOutError, term.generate_strings(100).unwrap_err()); +/// execution_profile.run(|| { +/// assert_eq!(EngineError::OperationTimeOutError, term.generate_strings(1000).unwrap_err()); +/// }); /// ``` +#[derive(Clone, Debug)] pub struct ExecutionProfile { /// The maximum number of states that a non-determinitic finite automaton can hold, this is checked during the convertion of regular expression to automaton. - pub max_number_of_states: usize, + max_number_of_states: Option, /// Timestamp of when the execution has started, if this value is not set the operations will never timeout. - pub start_execution_time: Option, + start_execution_time: Option, /// The longest time in milliseconds that an operation execution can last, there are no guaranties that the exact time will be respected. - pub execution_timeout: u128, - /// The maximum number of terms that an operation can have. - pub max_number_of_terms: usize, + execution_timeout: Option, +} + +impl PartialEq for ExecutionProfile { + fn eq(&self, other: &ExecutionProfile) -> bool { + self.max_number_of_states == other.max_number_of_states + && self.execution_timeout == other.execution_timeout + } } impl ExecutionProfile { + pub fn get() -> ExecutionProfile { + ThreadLocalParams::get_execution_profile() + } + /// Assert that `execution_timeout` is not exceeded. /// - /// Return empty if `execution_timeout` is not exceeded or if `start_execution_time` is not set. + /// Return empty if `execution_timeout` is not exceeded. /// /// Return [`EngineError::OperationTimeOutError`] otherwise. - pub fn assert_not_timed_out(&self) -> Result<(), EngineError> { - if let Some(start) = self.start_execution_time { + pub(crate) fn assert_not_timed_out(&self) -> Result<(), EngineError> { + if let (Some(start), Some(execution_timeout)) = + (self.start_execution_time, self.execution_timeout) + { let run_duration = SystemTime::now() .duration_since(start) .expect("Time went backwards") .as_millis(); - if run_duration > self.execution_timeout { + if run_duration > execution_timeout { Err(EngineError::OperationTimeOutError) } else { Ok(()) @@ -108,32 +82,103 @@ impl ExecutionProfile { Ok(()) } } + + /// Assert that `max_number_of_states` is not exceeded. + /// + /// Return empty if `max_number_of_states` is not exceeded. + /// + /// Return [`EngineError::AutomatonHasTooManyStates`] otherwise. 
+ pub(crate) fn assert_max_number_of_states( + &self, + number_of_states: usize, + ) -> Result<(), EngineError> { + if let Some(max_number_of_states) = self.max_number_of_states { + if number_of_states >= max_number_of_states { + return Err(EngineError::AutomatonHasTooManyStates); + } + } + Ok(()) + } + + pub fn with_execution_timeout(mut self, execution_timeout_in_ms: u128) -> Self { + self.execution_timeout = Some(execution_timeout_in_ms); + self + } + + pub fn with_max_number_of_states(mut self, max_number_of_states: usize) -> Self { + self.max_number_of_states = Some(max_number_of_states); + self + } + + pub fn set(&self) -> &Self { + self + } + + pub fn run(&self, f: F) -> R + where + F: FnOnce() -> R, + { + let initial_execution_profile = ThreadLocalParams::get_execution_profile(); + + let mut execution_profile = self.clone(); + execution_profile.start_execution_time = Some(SystemTime::now()); + + ThreadLocalParams::set_execution_profile(&execution_profile); + let result = f(); + ThreadLocalParams::set_execution_profile(&initial_execution_profile); + result + } } -/// Hold [`ExecutionProfile`] on the current thread. -/// -/// The default [`ExecutionProfile`] is the following: -/// ``` -/// use regexsolver::execution_profile::ExecutionProfile; -/// -/// ExecutionProfile { -/// max_number_of_states: 8192, -/// start_execution_time: None, -/// execution_timeout: 1500, -/// max_number_of_terms: 50, -/// }; -/// ``` -pub struct ThreadLocalParams; +pub struct ExecutionProfileBuilder { + /// The maximum number of states that a non-determinitic finite automaton can hold, this is checked during the convertion of regular expression to automaton. + max_number_of_states: Option, + /// The longest time in milliseconds that an operation execution can last, there are no guaranties that the exact time will be respected. + execution_timeout: Option, +} +impl Default for ExecutionProfileBuilder { + fn default() -> Self { + Self::new() + } +} + +impl ExecutionProfileBuilder { + pub fn new() -> Self { + Self { + max_number_of_states: None, + execution_timeout: None, + } + } + + pub fn execution_timeout(mut self, execution_timeout_in_ms: u128) -> Self { + self.execution_timeout = Some(execution_timeout_in_ms); + self + } + + pub fn max_number_of_states(mut self, max_number_of_states: usize) -> Self { + self.max_number_of_states = Some(max_number_of_states); + self + } + + pub fn build(self) -> ExecutionProfile { + ExecutionProfile { + max_number_of_states: self.max_number_of_states, + execution_timeout: self.execution_timeout, + start_execution_time: None, + } + } +} + +struct ThreadLocalParams; impl ThreadLocalParams { thread_local! { - static MAX_NUMBER_OF_STATES: RefCell = const { RefCell::new(8192) }; + static MAX_NUMBER_OF_STATES: RefCell> = const { RefCell::new(None) }; static START_EXECUTION_TIME: RefCell> = const { RefCell::new(None) }; - static EXECUTION_TIMEOUT: RefCell = const { RefCell::new(1500) }; - static MAX_NUMBER_OF_TERMS: RefCell = const { RefCell::new(50) }; + static EXECUTION_TIMEOUT: RefCell> = const { RefCell::new(None) }; } /// Store on the current thread [`ExecutionProfile`]. 
- pub fn init_profile(profile: &ExecutionProfile) { + fn set_execution_profile(profile: &ExecutionProfile) { ThreadLocalParams::MAX_NUMBER_OF_STATES.with(|cell| { *cell.borrow_mut() = profile.max_number_of_states; }); @@ -145,62 +190,64 @@ impl ThreadLocalParams { ThreadLocalParams::EXECUTION_TIMEOUT.with(|cell| { *cell.borrow_mut() = profile.execution_timeout; }); - - ThreadLocalParams::MAX_NUMBER_OF_TERMS.with(|cell| { - *cell.borrow_mut() = profile.max_number_of_terms; - }); } - pub fn get_max_number_of_states() -> usize { + fn get_max_number_of_states() -> Option { ThreadLocalParams::MAX_NUMBER_OF_STATES.with(|cell| *cell.borrow()) } - pub fn get_start_execution_time() -> Option { + fn get_start_execution_time() -> Option { ThreadLocalParams::START_EXECUTION_TIME.with(|cell| *cell.borrow()) } - pub fn get_execution_timeout() -> u128 { + fn get_execution_timeout() -> Option { ThreadLocalParams::EXECUTION_TIMEOUT.with(|cell| *cell.borrow()) } - pub fn get_max_number_of_terms() -> usize { - ThreadLocalParams::MAX_NUMBER_OF_TERMS.with(|cell| *cell.borrow()) - } - /// Return the [`ExecutionProfile`] stored on the current thread. - pub fn get_execution_profile() -> ExecutionProfile { + fn get_execution_profile() -> ExecutionProfile { ExecutionProfile { max_number_of_states: Self::get_max_number_of_states(), start_execution_time: Self::get_start_execution_time(), execution_timeout: Self::get_execution_timeout(), - max_number_of_terms: Self::get_max_number_of_terms(), } } } #[cfg(test)] mod tests { - use crate::{regex::RegularExpression, Term}; + use crate::{Term, regex::RegularExpression}; use super::*; #[test] - fn test_execution() -> Result<(), String> { - let execution_profile = ExecutionProfile { - max_number_of_states: 1, - start_execution_time: None, - execution_timeout: 1000, - max_number_of_terms: 10, - }; - ThreadLocalParams::init_profile(&execution_profile); + fn test_execution_get() -> Result<(), String> { + let execution_profile = ExecutionProfileBuilder::new() + .execution_timeout(1000) + .max_number_of_states(8192) + .build(); + + execution_profile.run(|| { + assert_eq!(execution_profile, ExecutionProfile::get()); + }); - let regex = RegularExpression::new("test").unwrap(); + Ok(()) + } - assert!(regex.to_automaton().is_err()); - assert_eq!( - EngineError::AutomatonHasTooManyStates, - regex.to_automaton().unwrap_err() - ); + #[test] + fn test_execution() -> Result<(), String> { + ExecutionProfileBuilder::new() + .max_number_of_states(1) + .build() + .run(|| { + let regex = RegularExpression::new("test").unwrap(); + + assert!(regex.to_automaton().is_err()); + assert_eq!( + EngineError::AutomatonHasTooManyStates, + regex.to_automaton().unwrap_err() + ); + }); Ok(()) } @@ -209,27 +256,26 @@ mod tests { fn test_execution_timeout_generate_strings() -> Result<(), String> { let term = Term::from_regex(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); + let execution_timeout_in_ms = 10; let start_time = SystemTime::now(); - let execution_profile = ExecutionProfile { - max_number_of_states: 8192, - start_execution_time: Some(start_time), - execution_timeout: 100, - max_number_of_terms: 50, - }; - ThreadLocalParams::init_profile(&execution_profile); - - assert_eq!( - EngineError::OperationTimeOutError, - term.generate_strings(100).unwrap_err() - ); - - let run_duration = SystemTime::now() - .duration_since(start_time) - .expect("Time went backwards") - .as_millis(); - - println!("{run_duration}"); - assert!(run_duration <= execution_profile.execution_timeout + 50); + 
ExecutionProfileBuilder::new() + .execution_timeout(execution_timeout_in_ms) + .build() + .run(|| { + assert_eq!( + EngineError::OperationTimeOutError, + term.generate_strings(100).unwrap_err() + ); + + let run_duration = SystemTime::now() + .duration_since(start_time) + .expect("Time went backwards") + .as_millis(); + + println!("{run_duration}"); + assert!(run_duration <= execution_timeout_in_ms + 50); + }); + Ok(()) } @@ -238,27 +284,26 @@ mod tests { let term1 = Term::from_regex(".*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); let term2 = Term::from_regex(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); + let execution_timeout_in_ms = 50; let start_time = SystemTime::now(); - let execution_profile = ExecutionProfile { - max_number_of_states: 8192, - start_execution_time: Some(start_time), - execution_timeout: 100, - max_number_of_terms: 50, - }; - ThreadLocalParams::init_profile(&execution_profile); - - assert_eq!( - EngineError::OperationTimeOutError, - term1.difference(&term2).unwrap_err() - ); - - let run_duration = SystemTime::now() - .duration_since(start_time) - .expect("Time went backwards") - .as_millis(); - - println!("{run_duration}"); - assert!(run_duration <= execution_profile.execution_timeout + 50); + ExecutionProfileBuilder::new() + .execution_timeout(execution_timeout_in_ms) + .build() + .run(|| { + assert_eq!( + EngineError::OperationTimeOutError, + term1.difference(&term2).unwrap_err() + ); + + let run_duration = SystemTime::now() + .duration_since(start_time) + .expect("Time went backwards") + .as_millis(); + + println!("{run_duration}"); + assert!(run_duration <= execution_timeout_in_ms + 25); + }); + Ok(()) } @@ -267,27 +312,26 @@ mod tests { let term1 = Term::from_regex(".*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); let term2 = Term::from_regex(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); + let execution_timeout_in_ms = 100; let start_time = SystemTime::now(); - let execution_profile = ExecutionProfile { - max_number_of_states: 8192, - start_execution_time: Some(start_time), - execution_timeout: 100, - max_number_of_terms: 50, - }; - ThreadLocalParams::init_profile(&execution_profile); - - assert_eq!( - EngineError::OperationTimeOutError, - term1.intersection(&[term2]).unwrap_err() - ); - - let run_duration = SystemTime::now() - .duration_since(start_time) - .expect("Time went backwards") - .as_millis(); - - println!("{run_duration}"); - assert!(run_duration <= execution_profile.execution_timeout + 50); + ExecutionProfileBuilder::new() + .execution_timeout(execution_timeout_in_ms) + .build() + .run(|| { + assert_eq!( + EngineError::OperationTimeOutError, + term1.intersection(&[term2]).unwrap_err() + ); + + let run_duration = SystemTime::now() + .duration_since(start_time) + .expect("Time went backwards") + .as_millis(); + + println!("{run_duration}"); + assert!(run_duration <= execution_timeout_in_ms + 50); + }); + Ok(()) } } diff --git a/src/fast_automaton/convert/to_regex/mod.rs 
b/src/fast_automaton/convert/to_regex/mod.rs index 2d84ff8..17d539f 100644 --- a/src/fast_automaton/convert/to_regex/mod.rs +++ b/src/fast_automaton/convert/to_regex/mod.rs @@ -7,7 +7,7 @@ use ahash::{HashMapExt, HashSetExt}; use log::warn; use nohash_hasher::IntMap; -use crate::{error::EngineError, execution_profile::ThreadLocalParams, regex::RegularExpression}; +use crate::{error::EngineError, execution_profile::ExecutionProfile, regex::RegularExpression}; use super::{FastAutomaton, IntSet, Range, State}; @@ -248,7 +248,7 @@ impl FastAutomaton { if self.is_empty() { return Some(RegularExpression::new_empty()); } - let execution_profile = ThreadLocalParams::get_execution_profile(); + let execution_profile = ExecutionProfile::get(); if let Ok(graph) = StateEliminationAutomaton::new(self) { if let Ok(regex) = graph?.convert_to_regex(&execution_profile) { let regex = regex?; diff --git a/src/fast_automaton/generate.rs b/src/fast_automaton/generate.rs index 638ba11..0efa0e3 100644 --- a/src/fast_automaton/generate.rs +++ b/src/fast_automaton/generate.rs @@ -1,6 +1,6 @@ use std::cmp; -use crate::{execution_profile::ThreadLocalParams, EngineError}; +use crate::{EngineError, execution_profile::ExecutionProfile}; use ahash::AHashSet; use super::*; @@ -13,7 +13,7 @@ impl FastAutomaton { let mut strings = AHashSet::with_capacity(cmp::min(number, 1000)); - let execution_profile = ThreadLocalParams::get_execution_profile(); + let execution_profile = ExecutionProfile::get(); let mut ranges_cache: AHashMap<&Condition, Range> = AHashMap::with_capacity(self.get_number_of_states()); @@ -98,7 +98,7 @@ mod tests { assert_generate_strings("(?:A+(?:\\.[AB]+)*|\"(?:C|\\\\D)*\")@", 500); assert_generate_strings( "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@", - 500 + 500, ); assert_generate_strings("[0-9]+[A-Z]*", 500); assert_generate_strings("a+(ba+)*", 200); diff --git a/src/fast_automaton/operation/determinize.rs b/src/fast_automaton/operation/determinize.rs index 3d4057b..1cf7a88 100644 --- a/src/fast_automaton/operation/determinize.rs +++ b/src/fast_automaton/operation/determinize.rs @@ -1,6 +1,6 @@ use ahash::HashMapExt; -use crate::{execution_profile::ThreadLocalParams, EngineError}; +use crate::{EngineError, execution_profile::ExecutionProfile}; use super::*; @@ -9,7 +9,7 @@ impl FastAutomaton { if self.deterministic { return Ok(self.clone()); } - let execution_profile = ThreadLocalParams::get_execution_profile(); + let execution_profile = ExecutionProfile::get(); let ranges = self.get_ranges()?; @@ -125,9 +125,11 @@ mod tests { deterministic_automaton.get_number_of_states() ); assert!(deterministic_automaton.is_determinitic()); - assert!(automaton - .subtraction(&deterministic_automaton) - .unwrap() - .is_empty()); + assert!( + automaton + .subtraction(&deterministic_automaton) + .unwrap() + .is_empty() + ); } } diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 2664aec..a2b381e 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -2,7 +2,7 @@ use std::borrow::Cow; use condition::converter::ConditionConverter; -use crate::{error::EngineError, execution_profile::ThreadLocalParams}; +use crate::{error::EngineError, execution_profile::ExecutionProfile}; use super::*; @@ -39,7 +39,7 @@ impl FastAutomaton { } else if other.is_total() { return 
Ok(Cow::Owned(self.clone())); } - let execution_profile = ThreadLocalParams::get_execution_profile(); + let execution_profile = ExecutionProfile::get(); let new_spanning_set = self.spanning_set.merge(&other.spanning_set); @@ -105,7 +105,7 @@ impl FastAutomaton { } else if self.is_total() || other.is_total() { return Ok(true); } - let execution_profile = ThreadLocalParams::get_execution_profile(); + let execution_profile = ExecutionProfile::get(); let new_spanning_set = self.spanning_set.merge(&other.spanning_set); diff --git a/src/lib.rs b/src/lib.rs index 17f26c7..9c55997 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,10 +1,12 @@ use std::{ - borrow::Cow, collections::{HashMap, HashSet}, fmt::Display, hash::BuildHasherDefault + borrow::Cow, + collections::{HashMap, HashSet}, + fmt::Display, + hash::BuildHasherDefault, }; use cardinality::Cardinality; use error::EngineError; -use execution_profile::ThreadLocalParams; use fast_automaton::FastAutomaton; use nohash_hasher::NoHashHasher; use regex::RegularExpression; @@ -78,8 +80,6 @@ impl Term { /// } /// ``` pub fn concat(&self, terms: &[Term]) -> Result { - Self::check_number_of_terms(terms)?; - let mut return_regex = RegularExpression::new_empty(); let mut return_automaton = FastAutomaton::new_empty(); let mut has_automaton = false; @@ -110,12 +110,10 @@ impl Term { if !has_automaton { Ok(Term::RegularExpression(return_regex)) + } else if let Some(return_regex) = return_automaton.to_regex() { + Ok(Term::RegularExpression(return_regex)) } else { - if let Some(return_regex) = return_automaton.to_regex() { - Ok(Term::RegularExpression(return_regex)) - } else { - Ok(Term::Automaton(return_automaton)) - } + Ok(Term::Automaton(return_automaton)) } } @@ -138,8 +136,6 @@ impl Term { /// } /// ``` pub fn union(&self, terms: &[Term]) -> Result { - Self::check_number_of_terms(terms)?; - if self.is_total() { return Ok(Term::new_total()); } @@ -177,12 +173,10 @@ impl Term { if !has_automaton { Ok(Term::RegularExpression(return_regex)) + } else if let Some(return_regex) = return_automaton.to_regex() { + Ok(Term::RegularExpression(return_regex)) } else { - if let Some(return_regex) = return_automaton.to_regex() { - Ok(Term::RegularExpression(return_regex)) - } else { - Ok(Term::Automaton(return_automaton)) - } + Ok(Term::Automaton(return_automaton)) } } @@ -205,8 +199,6 @@ impl Term { /// } /// ``` pub fn intersection(&self, terms: &[Term]) -> Result { - Self::check_number_of_terms(terms)?; - if self.is_empty() { return Ok(Term::new_empty()); } @@ -405,19 +397,6 @@ impl Term { automaton_1.is_subset_of(&automaton_2) } - fn check_number_of_terms(terms: &[Term]) -> Result<(), EngineError> { - let number_of_terms = terms.len() + 1; - let max_number_of_terms = ThreadLocalParams::get_max_number_of_terms(); - if number_of_terms > max_number_of_terms { - Err(EngineError::TooMuchTerms( - max_number_of_terms, - number_of_terms, - )) - } else { - Ok(()) - } - } - fn determinize_subtrahend<'a>( minuend: &FastAutomaton, subtrahend: &'a FastAutomaton, @@ -500,7 +479,7 @@ impl Details { #[cfg(test)] mod tests { - use crate::regex::RegularExpression; + use crate::{execution_profile::ExecutionProfileBuilder, regex::RegularExpression}; use super::*; @@ -577,45 +556,93 @@ mod tests { } #[test] - fn test__() -> Result<(), EngineError> { + fn test_readme_code_1() -> Result<(), String> { // Create terms from regex - let t1 = Term::from_regex("abc.*")?; - let t2 = Term::from_regex(".*xyz")?; + let t1 = Term::from_regex("abc.*").unwrap(); + let t2 = 
Term::from_regex(".*xyz").unwrap(); // Concatenate - let concat = t1.concat(&[t2])?; + let concat = t1.concat(&[t2]).unwrap(); assert_eq!(concat.to_string(), "abc.*xyz"); // Union - let union = t1.union(&[Term::from_regex("fgh")?])?; // (abc.*|fgh) + let union = t1.union(&[Term::from_regex("fgh").unwrap()]).unwrap(); // (abc.*|fgh) assert_eq!(union.to_string(), "(abc.*|fgh)"); // Intersection - let inter = Term::from_regex("(ab|xy){2}")?.intersection(&[Term::from_regex(".*xy")?])?; // (ab|xy)xy + let inter = Term::from_regex("(ab|xy){2}") + .unwrap() + .intersection(&[Term::from_regex(".*xy").unwrap()]) + .unwrap(); // (ab|xy)xy assert_eq!(inter.to_string(), "(ab|xy)xy"); // Subtraction - let diff = Term::from_regex("a*")?.subtraction(&Term::from_regex("")?)?; + let diff = Term::from_regex("a*") + .unwrap() + .subtraction(&Term::from_regex("").unwrap()) + .unwrap(); assert_eq!(diff.to_string(), "a+"); // Repetition - let rep = Term::from_regex("abc")?.repeat(2, Some(4))?; // (abc){2,4} + let rep = Term::from_regex("abc").unwrap().repeat(2, Some(4)).unwrap(); // (abc){2,4} assert_eq!(rep.to_string(), "(abc){2,4}"); // Analyze - let details = rep.get_details()?; + let details = rep.get_details().unwrap(); assert_eq!(details.get_length(), &(Some(6), Some(12))); assert!(!details.is_empty()); // Generate examples - let samples = Term::from_regex("(x|y){1,3}")?.generate_strings(5)?; + let samples = Term::from_regex("(x|y){1,3}") + .unwrap() + .generate_strings(5) + .unwrap(); println!("Some matches: {:?}", samples); // Equivalence & subset - let a = Term::from_regex("a+")?; - let b = Term::from_regex("a*")?; - assert!(!a.are_equivalent(&b)?); - assert!(a.is_subset_of(&b)?); + let a = Term::from_regex("a+").unwrap(); + let b = Term::from_regex("a*").unwrap(); + assert!(!a.are_equivalent(&b).unwrap()); + assert!(a.is_subset_of(&b).unwrap()); + + Ok(()) + } + + #[test] + fn test_readme_code_2() -> Result<(), String> { + let term = Term::from_regex(".*abc.*cdef.*sqdsqf.*").unwrap(); + + let execution_profile = ExecutionProfileBuilder::new() + .execution_timeout(5) // We set the limit (5ms) + .build(); + + // We run the operation with the defined limitation + execution_profile.run(|| { + assert_eq!( + EngineError::OperationTimeOutError, + term.generate_strings(1000).unwrap_err() + ); + }); + + Ok(()) + } + + #[test] + fn test_readme_code_3() -> Result<(), String> { + let term1 = Term::from_regex(".*abcdef.*").unwrap(); + let term2 = Term::from_regex(".*defabc.*").unwrap(); + + let execution_profile = ExecutionProfileBuilder::new() + .max_number_of_states(5) // We set the limit + .build(); + + // We run the operation with the defined limitation + execution_profile.run(|| { + assert_eq!( + EngineError::AutomatonHasTooManyStates, + term1.intersection(&[term2]).unwrap_err() + ); + }); Ok(()) } diff --git a/src/regex/mod.rs b/src/regex/mod.rs index 59ceb29..4965cad 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -1,7 +1,6 @@ use std::{cmp, collections::VecDeque, fmt::Display}; -use crate::Range; -use execution_profile::ThreadLocalParams; +use crate::{Range, execution_profile::ExecutionProfile}; use regex_charclass::CharacterClass; use regex_syntax::hir::{Class, ClassBytes, ClassUnicode, Hir, HirKind}; @@ -123,9 +122,8 @@ impl RegularExpression { } pub fn to_automaton(&self) -> Result { - if self.get_number_of_states_in_nfa() >= ThreadLocalParams::get_max_number_of_states() { - return Err(EngineError::AutomatonHasTooManyStates); - } + 
ExecutionProfile::get().assert_max_number_of_states(self.get_number_of_states_in_nfa())?; + match self { RegularExpression::Character(range) => FastAutomaton::make_from_range(range), RegularExpression::Repetition(regular_expression, min, max_opt) => { From 671f3a38f5e8301c78e0d9a5fd319c830f56901b Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Thu, 10 Jul 2025 22:19:21 +0200 Subject: [PATCH 05/44] WIP --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 040684f..6ad02c5 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ # RegexSolver [![Crates.io Version](https://img.shields.io/crates/v/regexsolver)](https://crates.io/crates/regexsolver) - A high-performance Rust library for building, combining, and analyzing regular expressions and finite automata. + +A high-performance Rust library for building, combining, and analyzing regular expressions and finite automata. Ideal for constraint solvers, code generators, test-case generators, and any use case requiring rich regex/automaton operations at scale. From 75c06b3ccc87d89b36d40f3cfb5d653e91087e61 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sun, 27 Jul 2025 20:19:07 +0200 Subject: [PATCH 06/44] add parallel intersection --- Cargo.toml | 3 +- README.md | 27 ++-- src/error/mod.rs | 7 ++ src/execution_profile.rs | 26 ++++ src/fast_automaton/mod.rs | 11 ++ src/fast_automaton/operation/intersection.rs | 64 ++++++++-- src/lib.rs | 124 ++++++++----------- 7 files changed, 170 insertions(+), 92 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7147509..c691486 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,7 @@ lazy_static = "1.4.0" regex = "1.10.3" regex-syntax = "0.8.5" regex-charclass = { version = "1.0.3" } +rayon = "1.10.0" [dev-dependencies] criterion = { version = "0.5", features = ["html_reports"] } @@ -47,4 +48,4 @@ serde = [ [[bench]] name = "my_benchmark" -harness = false \ No newline at end of file +harness = false diff --git a/README.md b/README.md index 6ad02c5..3c89a0f 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,7 @@ [![Crates.io Version](https://img.shields.io/crates/v/regexsolver)](https://crates.io/crates/regexsolver) A high-performance Rust library for building, combining, and analyzing regular expressions and finite automata. - -Ideal for constraint solvers, code generators, test-case generators, and any use case requiring rich regex/automaton operations at scale. +Ideal for constraint solvers, code generators, test-case generators, and any use case requiring rich regex/automaton operations. ## Key Features - **Dual Representation**: Work interchangeably with regex syntax or compiled automata via the `Term` enum. @@ -15,6 +14,8 @@ Ideal for constraint solvers, code generators, test-case generators, and any use - **String Generation**: Generate example strings matching a term, for testing or sampling. - **Performance & Tuning**: Pluggable `ExecutionProfile` to bound time and resource usage. +This library also exposes the `regex` and `fast_automaton` modules for advanced use, providing low-level APIs for direct pattern and automaton operations. 
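For the low-level route, the sketch below shows roughly what driving the `regex` and `fast_automaton` modules directly could look like. It is an assumption-based example: the paths and method names (`RegularExpression::new`, `to_automaton`, `intersection`, `to_regex`) mirror the high-level `Term` API and may differ between versions.

```rust
use regexsolver::regex::RegularExpression;

// Parse two patterns, compile them to automata, intersect, then lift back to a regex.
let a = RegularExpression::new("(ab|cd)*").unwrap();
let b = RegularExpression::new(".*cd").unwrap();

let fa_a = a.to_automaton().unwrap();
let fa_b = b.to_automaton().unwrap();

let product = fa_a.intersection(&fa_b).unwrap();
assert!(!product.is_empty());

// Not every automaton converts back to a concise pattern, so `to_regex` returns an Option.
if let Some(regex) = product.to_regex() {
    println!("intersection as regex: {regex}");
}
```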
+ ## Installation Add the following line in your `Cargo.toml`: ```toml @@ -35,7 +36,7 @@ let concat = t1.concat(&[t2]).unwrap(); assert_eq!(concat.to_string(), "abc.*xyz"); // Union -let union = t1.union(&[Term::from_regex("fgh").unwrap()]).unwrap(); // (abc.*|fgh) +let union = t1.union(&[Term::from_regex("fgh").unwrap()]).unwrap(); assert_eq!(union.to_string(), "(abc.*|fgh)"); // Intersection @@ -53,13 +54,12 @@ let diff = Term::from_regex("a*") assert_eq!(diff.to_string(), "a+"); // Repetition -let rep = Term::from_regex("abc").unwrap().repeat(2, Some(4)).unwrap(); // (abc){2,4} +let rep = Term::from_regex("abc").unwrap().repeat(2, Some(4)).unwrap(); assert_eq!(rep.to_string(), "(abc){2,4}"); // Analyze -let details = rep.get_details().unwrap(); -assert_eq!(details.get_length(), &(Some(6), Some(12))); -assert!(!details.is_empty()); +assert_eq!(rep.get_length(), (Some(6), Some(12))); +assert!(!rep.is_empty()); // Generate examples let samples = Term::from_regex("(x|y){1,3}") @@ -76,7 +76,7 @@ assert!(a.is_subset_of(&b).unwrap()); ``` ## Execution Profiles -By default, all operations run without limits. For heavy or untrusted patterns, use an `ExecutionProfile` to cap execution time and maximum number of states in used automata. +By default, all operations run without limits. For heavy or untrusted patterns, use a thread local `ExecutionProfile` to cap execution time and maximum number of states in used automata. ### Example: Limit the execution time ```rust @@ -109,4 +109,13 @@ let execution_profile = ExecutionProfileBuilder::new() execution_profile.run(|| { assert_eq!(EngineError::AutomatonHasTooManyStates, term1.intersection(&[term2]).unwrap_err()); }); -``` \ No newline at end of file +``` + +## Usage with other programming languages + +If you want to use this library with other programming languages, we provide a wide range of wrappers: +- [regexsolver-java](https://github.com/RegexSolver/regexsolver-java) +- [regexsolver-js](https://github.com/RegexSolver/regexsolver-js) +- [regexsolver-python](https://github.com/RegexSolver/regexsolver-python) + +For more information about how to use the wrappers, you can refer to our [getting started guide](https://docs.regexsolver.com/getting-started.html). \ No newline at end of file diff --git a/src/error/mod.rs b/src/error/mod.rs index 91085b3..e88d1e1 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -21,6 +21,8 @@ pub enum EngineError { ConditionIndexOutOfBound, /// There is an error with one of the token. TokenError(TokenError), + /// Computing the cardinality of the provided automaton failed. + CannotComputeAutomatonCardinality, } impl fmt::Display for EngineError { @@ -43,6 +45,10 @@ impl fmt::Display for EngineError { EngineError::ConditionIndexOutOfBound => { write!(f, "The provided index is out of bound of the condition.") } + EngineError::CannotComputeAutomatonCardinality => write!( + f, + "Computing the cardinality of the provided automaton failed." + ), } } } @@ -62,6 +68,7 @@ impl EngineError { EngineError::TokenError(_) => false, EngineError::ConditionInvalidRange => true, EngineError::ConditionIndexOutOfBound => true, + EngineError::CannotComputeAutomatonCardinality => false, } } } diff --git a/src/execution_profile.rs b/src/execution_profile.rs index 008cffb..de3c485 100644 --- a/src/execution_profile.rs +++ b/src/execution_profile.rs @@ -55,6 +55,7 @@ impl PartialEq for ExecutionProfile { } impl ExecutionProfile { + /// Retrieve the current thread-local execution profile. 
pub fn get() -> ExecutionProfile { ThreadLocalParams::get_execution_profile() } @@ -114,6 +115,7 @@ impl ExecutionProfile { self } + /// Run the given closure with this profile at thread level, setting its start time to now. pub fn run(&self, f: F) -> R where F: FnOnce() -> R, @@ -128,6 +130,19 @@ impl ExecutionProfile { ThreadLocalParams::set_execution_profile(&initial_execution_profile); result } + + /// Like [`run`], but does *not* reset its start time. Useful if you want to pass a profile state to a new thread. + pub fn apply(&self, f: F) -> R + where + F: FnOnce() -> R, + { + let initial_execution_profile = ThreadLocalParams::get_execution_profile(); + + ThreadLocalParams::set_execution_profile(self); + let result = f(); + ThreadLocalParams::set_execution_profile(&initial_execution_profile); + result + } } pub struct ExecutionProfileBuilder { @@ -220,6 +235,17 @@ mod tests { use super::*; + fn assert_send() {} + fn assert_sync() {} + + #[test] + fn test_traits() -> Result<(), String> { + assert_send::(); + assert_sync::(); + + Ok(()) + } + #[test] fn test_execution_get() -> Result<(), String> { let execution_profile = ExecutionProfileBuilder::new() diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index 224b150..a4da641 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -314,4 +314,15 @@ mod tests { assert!(automaton.is_total()); Ok(()) } + + fn assert_send() {} + fn assert_sync() {} + + #[test] + fn test_traits() -> Result<(), String> { + assert_send::(); + assert_sync::(); + + Ok(()) + } } diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index a2b381e..1498d7f 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -1,24 +1,29 @@ use std::borrow::Cow; +use rayon::prelude::*; + use condition::converter::ConditionConverter; -use crate::{error::EngineError, execution_profile::ExecutionProfile}; +use crate::{ + error::EngineError, + execution_profile::{ExecutionProfile}, +}; use super::*; impl FastAutomaton { pub fn intersection(&self, other: &FastAutomaton) -> Result { - self.intersection_all([other]) + FastAutomaton::intersection_all([self, other]) } - pub fn intersection_all<'a, I>(&'a self, others: I) -> Result + pub fn intersection_all<'a, I>(automatons: I) -> Result where I: IntoIterator, { - let mut result = Cow::Borrowed(self); + let mut result: Cow<'a, FastAutomaton> = Cow::Owned(FastAutomaton::new_total()); - for other in others { - result = result.intersection_internal(other)?; + for automaton in automatons { + result = result.intersection_internal(automaton)?; if result.is_empty() { break; @@ -28,6 +33,22 @@ impl FastAutomaton { Ok(result.into_owned()) } + pub fn intersection_all_par<'a, I>(others: I) -> Result + where + I: IntoParallelIterator, + { + let execution_profile = ExecutionProfile::get(); + + let total = FastAutomaton::new_total(); + + others.into_par_iter().cloned().map(Result::Ok).try_reduce( + || total.clone(), + |acc, next| { + execution_profile.apply(|| Ok(acc.intersection_internal(&next)?.into_owned())) + }, + ) + } + fn intersection_internal<'a>( &self, other: &'a FastAutomaton, @@ -182,7 +203,7 @@ impl FastAutomaton { #[cfg(test)] mod tests { - use crate::regex::RegularExpression; + use crate::{fast_automaton::FastAutomaton, regex::RegularExpression}; #[test] fn test_simple_intersection_regex_1() -> Result<(), String> { @@ -282,4 +303,33 @@ mod tests { assert!(intersection.match_string("avb@gmail.com")); 
Ok(()) } + + #[test] + fn test_intersection_par() -> Result<(), String> { + let c = 12; + let mut automaton_list = Vec::with_capacity(c); + + for i in 0..c { + automaton_list.push( + RegularExpression::new(&format!(".*{i}.*")) + .unwrap() + .to_automaton() + .unwrap(), + ) + } + + // FastAutomaton::intersection_all(automaton_list.iter().collect::>()); + + // 3.76 + // 4.47 + // 3.84 + + let _ = FastAutomaton::intersection_all_par(automaton_list.iter().collect::>()); + + // 0.59 + // 0.55 + // 0.53 + + Ok(()) + } } diff --git a/src/lib.rs b/src/lib.rs index 9c55997..232c602 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,11 +9,14 @@ use cardinality::Cardinality; use error::EngineError; use fast_automaton::FastAutomaton; use nohash_hasher::NoHashHasher; +use rayon::prelude::*; use regex::RegularExpression; use regex_charclass::{char::Char, irange::RangeSet}; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +use crate::execution_profile::ExecutionProfile; + pub mod cardinality; pub mod error; pub mod execution_profile; @@ -199,21 +202,33 @@ impl Term { /// } /// ``` pub fn intersection(&self, terms: &[Term]) -> Result { - if self.is_empty() { + if self.is_empty() || terms.iter().any(|t| t.is_empty()) { return Ok(Term::new_empty()); } - let mut automaton_list = Vec::with_capacity(terms.len()); - for term in terms { - if term.is_empty() { - return Ok(Term::new_empty()); - } - automaton_list.push(term.get_automaton()?); - } + let parallel = terms.len() > 3; - let return_automaton = self - .get_automaton()? - .intersection_all(automaton_list.iter().map(Cow::as_ref))?; + let mut automaton_list = if parallel { + let execution_profile = ExecutionProfile::get(); + terms + .par_iter() + .map(|a| execution_profile.apply(|| a.get_automaton())) + .collect::, _>>()? + } else { + terms + .iter() + .map(Term::get_automaton) + .collect::, _>>()? + }; + automaton_list.push(self.get_automaton()?); + + let automaton_list = automaton_list.iter().map(AsRef::as_ref).collect::>(); + + let return_automaton = if parallel { + FastAutomaton::intersection_all_par(automaton_list) + } else { + FastAutomaton::intersection_all(automaton_list) + }?; if let Some(return_regex) = return_automaton.to_regex() { Ok(Term::RegularExpression(return_regex)) @@ -297,39 +312,6 @@ impl Term { } } - /// Returns the details of the current term, including cardinality, length, and emptiness. - /// - /// # Example: - /// - /// ``` - /// use regexsolver::{Term, cardinality::Cardinality}; - /// - /// let term = Term::from_regex("(abc|de)").unwrap(); - /// - /// let details = term.get_details().unwrap(); - /// - /// assert_eq!(Some(Cardinality::Integer(2)), *details.get_cardinality()); - /// assert_eq!((Some(2), Some(3)), *details.get_length()); - /// assert!(!details.is_empty()); - /// assert!(!details.is_total()); - /// ``` - pub fn get_details(&self) -> Result { - match self { - Term::RegularExpression(regex) => Ok(Details { - cardinality: Some(regex.get_cardinality()), - length: regex.get_length(), - empty: regex.is_empty(), - total: regex.is_total(), - }), - Term::Automaton(automaton) => Ok(Details { - cardinality: automaton.get_cardinality(), - length: automaton.get_length(), - empty: automaton.is_empty(), - total: automaton.is_total(), - }), - } - } - /// Generate strings matched by the given term. /// /// # Example: @@ -442,38 +424,31 @@ impl Term { Term::Automaton(fast_automaton) => fast_automaton.is_total(), } } -} - -/// Represents details about a [Term]. 
-#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -#[derive(Clone, PartialEq, Eq, Debug)] -#[cfg_attr(feature = "serde", serde(tag = "type", rename = "details"))] -pub struct Details { - cardinality: Option>, - length: (Option, Option), - empty: bool, - total: bool, -} - -impl Details { - /// Return the number of unique strings matched. - pub fn get_cardinality(&self) -> &Option> { - &self.cardinality - } - /// Return the minimum and the maximum length of matched strings. - pub fn get_length(&self) -> &(Option, Option) { - &self.length + pub fn get_length(&self) -> (Option, Option) { + match self { + Term::RegularExpression(regex) => regex.get_length(), + Term::Automaton(automaton) => automaton.get_length(), + } } - /// Return `true` if it does not match any string. - pub fn is_empty(&self) -> bool { - self.empty - } + pub fn get_cardinality(&self) -> Result, EngineError> { + match self { + Term::RegularExpression(regex) => Ok(regex.get_cardinality()), + Term::Automaton(automaton) => { + let cardinality = if !automaton.is_determinitic() { + automaton.determinize()?.get_cardinality() + } else { + automaton.get_cardinality() + }; - /// Return `true` if it match all possible strings. - pub fn is_total(&self) -> bool { - self.total + if let Some(cardinality) = cardinality { + Ok(cardinality) + } else { + Err(EngineError::CannotComputeAutomatonCardinality) + } + } + } } } @@ -588,9 +563,8 @@ mod tests { assert_eq!(rep.to_string(), "(abc){2,4}"); // Analyze - let details = rep.get_details().unwrap(); - assert_eq!(details.get_length(), &(Some(6), Some(12))); - assert!(!details.is_empty()); + assert_eq!(rep.get_length(), (Some(6), Some(12))); + assert!(!rep.is_empty()); // Generate examples let samples = Term::from_regex("(x|y){1,3}") From b37ef6599e3cbc9769735874c4defe0d5949130c Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Mon, 28 Jul 2025 21:23:54 +0200 Subject: [PATCH 07/44] WIP --- src/execution_profile.rs | 2 +- src/fast_automaton/operation/alternation.rs | 42 ++++-- src/fast_automaton/operation/concatenate.rs | 11 +- src/fast_automaton/operation/intersection.rs | 13 +- src/lib.rs | 134 ++++++++++++------- src/regex/mod.rs | 4 +- src/regex/operation/union.rs | 8 +- 7 files changed, 140 insertions(+), 74 deletions(-) diff --git a/src/execution_profile.rs b/src/execution_profile.rs index de3c485..86045d5 100644 --- a/src/execution_profile.rs +++ b/src/execution_profile.rs @@ -355,7 +355,7 @@ mod tests { .as_millis(); println!("{run_duration}"); - assert!(run_duration <= execution_timeout_in_ms + 50); + assert!(run_duration <= execution_timeout_in_ms + 100); }); Ok(()) diff --git a/src/fast_automaton/operation/alternation.rs b/src/fast_automaton/operation/alternation.rs index 84c8749..fe1ab80 100644 --- a/src/fast_automaton/operation/alternation.rs +++ b/src/fast_automaton/operation/alternation.rs @@ -1,24 +1,18 @@ use std::hash::BuildHasherDefault; use condition::converter::ConditionConverter; +use rayon::prelude::*; -use crate::error::EngineError; +use crate::{error::EngineError, execution_profile::ExecutionProfile}; use super::*; impl FastAutomaton { pub fn union(&self, other: &FastAutomaton) -> Result { - Self::build_union([self, other]) + Self::union_all([self, other]) } - pub fn union_all<'a, I>(&'a self, others: I) -> Result - where - I: IntoIterator, - { - Self::build_union(std::iter::once(self).chain(others)) - } - - pub(crate) fn build_union<'a, I>(automatons: I) -> Result + pub fn union_all<'a, I>(automatons: I) 
-> Result where I: IntoIterator, { @@ -29,6 +23,34 @@ impl FastAutomaton { Ok(new_automaton) } + pub fn union_all_par<'a, I>(automatons: I) -> Result + where + I: IntoParallelIterator, + { + let execution_profile = ExecutionProfile::get(); + + let empty = FastAutomaton::new_empty(); + + automatons.into_par_iter() + .try_fold( + || empty.clone(), + |mut acc, next| { + execution_profile.apply(|| { + acc.union_mut(next)?; + Ok(acc) + }) + }, + ).try_reduce( + || empty.clone(), + |mut acc, next| { + execution_profile.apply(|| { + acc.union_mut(&next)?; + Ok(acc) + }) + }, + ) + } + fn prepare_start_states( &mut self, other: &FastAutomaton, diff --git a/src/fast_automaton/operation/concatenate.rs b/src/fast_automaton/operation/concatenate.rs index 1dbd644..b22ad2d 100644 --- a/src/fast_automaton/operation/concatenate.rs +++ b/src/fast_automaton/operation/concatenate.rs @@ -8,17 +8,10 @@ use super::*; impl FastAutomaton { pub fn concat(&self, other: &FastAutomaton) -> Result { - Self::build_concat([self, other]) + Self::concat_all([self, other]) } - pub fn concat_all<'a, I>(&'a self, others: I) -> Result - where - I: IntoIterator, - { - Self::build_concat(std::iter::once(self).chain(others)) - } - - pub(crate) fn build_concat<'a, I>(automatons: I) -> Result + pub fn concat_all<'a, I>(automatons: I) -> Result where I: IntoIterator, { diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 1498d7f..8f3e4a3 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -33,7 +33,7 @@ impl FastAutomaton { Ok(result.into_owned()) } - pub fn intersection_all_par<'a, I>(others: I) -> Result + pub fn intersection_all_par<'a, I>(automatons: I) -> Result where I: IntoParallelIterator, { @@ -41,7 +41,14 @@ impl FastAutomaton { let total = FastAutomaton::new_total(); - others.into_par_iter().cloned().map(Result::Ok).try_reduce( + automatons.into_par_iter() + .try_fold( + || total.clone(), + |acc, next| { + execution_profile.apply(|| Ok(acc.intersection_internal(next)?.into_owned())) + }, + ) + .try_reduce( || total.clone(), |acc, next| { execution_profile.apply(|| Ok(acc.intersection_internal(&next)?.into_owned())) @@ -306,7 +313,7 @@ mod tests { #[test] fn test_intersection_par() -> Result<(), String> { - let c = 12; + let c = 14; let mut automaton_list = Vec::with_capacity(c); for i in 0..c { diff --git a/src/lib.rs b/src/lib.rs index 232c602..3cf1c82 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -143,43 +143,45 @@ impl Term { return Ok(Term::new_total()); } - let mut return_regex = RegularExpression::new_empty(); - let mut return_automaton = FastAutomaton::new_empty(); - let mut has_automaton = false; - match self { - Term::RegularExpression(regular_expression) => { - return_regex = regular_expression.clone() - } - Term::Automaton(fast_automaton) => { - has_automaton = true; - return_automaton = fast_automaton.clone(); - } - } - for term in terms { - if term.is_total() { - return Ok(Term::new_total()); - } - if has_automaton { - return_automaton = return_automaton.union(term.get_automaton()?.as_ref())?; - } else { - match term { - Term::RegularExpression(regular_expression) => { - return_regex = return_regex.union(regular_expression); - } - Term::Automaton(fast_automaton) => { - has_automaton = true; - return_automaton = return_regex.to_automaton()?.union(fast_automaton)?; - } + let mut has_automaton = matches!(self, Term::Automaton(_)); + if !has_automaton { + for term in terms { + if term.is_total() { 
+ return Ok(Term::new_total()); + } + if matches!(term, Term::Automaton(_)) { + has_automaton = true; + break; } } } - if !has_automaton { - Ok(Term::RegularExpression(return_regex)) - } else if let Some(return_regex) = return_automaton.to_regex() { - Ok(Term::RegularExpression(return_regex)) + if has_automaton { + let parallel = terms.len() > 3; + + let automaton_list = self.get_automata(terms, parallel)?; + + let automaton_list = automaton_list.iter().map(AsRef::as_ref).collect::>(); + + let return_automaton = if parallel { + FastAutomaton::union_all_par(automaton_list) + } else { + FastAutomaton::union_all(automaton_list) + }?; + + if let Some(return_regex) = return_automaton.to_regex() { + Ok(Term::RegularExpression(return_regex)) + } else { + Ok(Term::Automaton(return_automaton)) + } } else { - Ok(Term::Automaton(return_automaton)) + let regexes_list = self.get_regexes(terms)?; + + let regexes_list = regexes_list.iter().map(AsRef::as_ref).collect::>(); + + Ok(Term::RegularExpression(RegularExpression::union_all( + regexes_list, + ))) } } @@ -208,19 +210,7 @@ impl Term { let parallel = terms.len() > 3; - let mut automaton_list = if parallel { - let execution_profile = ExecutionProfile::get(); - terms - .par_iter() - .map(|a| execution_profile.apply(|| a.get_automaton())) - .collect::, _>>()? - } else { - terms - .iter() - .map(Term::get_automaton) - .collect::, _>>()? - }; - automaton_list.push(self.get_automaton()?); + let automaton_list = self.get_automata(terms, parallel)?; let automaton_list = automaton_list.iter().map(AsRef::as_ref).collect::>(); @@ -392,6 +382,47 @@ impl Term { } } + fn get_automata<'a>( + &'a self, + terms: &'a [Term], + parallel: bool, + ) -> Result>, EngineError> { + let mut automaton_list = Vec::with_capacity(terms.len() + 1); + automaton_list.push(self.get_automaton()?); + + let mut terms_automata = if parallel { + let execution_profile = ExecutionProfile::get(); + terms + .par_iter() + .map(|a| execution_profile.apply(|| a.get_automaton())) + .collect::, _>>() + } else { + terms + .iter() + .map(Term::get_automaton) + .collect::, _>>() + }?; + automaton_list.append(&mut terms_automata); + + Ok(automaton_list) + } + + fn get_regexes<'a>( + &'a self, + terms: &'a [Term], + ) -> Result>, EngineError> { + let mut regex_list = Vec::with_capacity(terms.len() + 1); + regex_list.push(self.get_regex()?); + + let mut terms_regexes = terms + .iter() + .map(Term::get_regex) + .collect::, _>>()?; + regex_list.append(&mut terms_regexes); + + Ok(regex_list) + } + fn get_automaton(&self) -> Result, EngineError> { Ok(match self { Term::RegularExpression(regex) => Cow::Owned(regex.to_automaton()?), @@ -399,6 +430,19 @@ impl Term { }) } + fn get_regex(&self) -> Result, EngineError> { + Ok(match self { + Term::RegularExpression(regex) => Cow::Borrowed(regex), + Term::Automaton(automaton) => { + if let Some(regex) = automaton.to_regex() { + Cow::Owned(regex) + } else { + todo!() + } + } + }) + } + /// Create a term that matches the empty language. 
pub fn new_empty() -> Self { Term::RegularExpression(RegularExpression::new_empty()) diff --git a/src/regex/mod.rs b/src/regex/mod.rs index 4965cad..848842f 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -136,14 +136,14 @@ impl RegularExpression { for c in concat.iter() { concats.push(c.to_automaton()?); } - FastAutomaton::build_concat(&concats) + FastAutomaton::concat_all(&concats) } RegularExpression::Alternation(alternation) => { let mut alternates = Vec::with_capacity(alternation.len()); for c in alternation.iter() { alternates.push(c.to_automaton()?); } - FastAutomaton::build_union(&alternates) + FastAutomaton::union_all(&alternates) } } } diff --git a/src/regex/operation/union.rs b/src/regex/operation/union.rs index 62789e6..65b34f4 100644 --- a/src/regex/operation/union.rs +++ b/src/regex/operation/union.rs @@ -4,16 +4,16 @@ use super::*; impl RegularExpression { pub fn union(&self, other: &RegularExpression) -> RegularExpression { - self.union_all([other]) + Self::union_all([self, other]) } - pub fn union_all<'a, I>(&'a self, others: I) -> RegularExpression + pub fn union_all<'a, I>(regexes: I) -> RegularExpression where I: IntoIterator, { - let mut result = Cow::Borrowed(self); + let mut result: Cow<'a, RegularExpression> = Cow::Owned(RegularExpression::new_empty()); - for other in others { + for other in regexes { result = result.union_(other); if result.is_total() { From 3ea0dec5f7d25545a223138f9c50eaff5db43a46 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Tue, 29 Jul 2025 21:54:57 +0200 Subject: [PATCH 08/44] update readme --- README.md | 95 +++++++++++---- src/execution_profile.rs | 10 +- src/fast_automaton/analyze/mod.rs | 5 + src/lib.rs | 190 +++++++++++++++++------------- 4 files changed, 193 insertions(+), 107 deletions(-) diff --git a/README.md b/README.md index 3c89a0f..c6b28c5 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,18 @@ - # RegexSolver [![Crates.io Version](https://img.shields.io/crates/v/regexsolver)](https://crates.io/crates/regexsolver) -A high-performance Rust library for building, combining, and analyzing regular expressions and finite automata. -Ideal for constraint solvers, code generators, test-case generators, and any use case requiring rich regex/automaton operations. +**RegexSolver** is a high-performance Rust library for building, combining, and analyzing regular expressions and finite automata. Ideal for constraint solvers, code generators, test-case generators, and any use case requiring rich regex/automaton operations. -## Key Features -- **Dual Representation**: Work interchangeably with regex syntax or compiled automata via the `Term` enum. -- **Set Operations**: Concatenate, union, intersect, subtract, and repeat regex/automaton terms. -- **Analysis & Properties**: - - Compute language **cardinality**, **length bounds**, **emptiness**, and **totality**. - - Check **equivalence** and **subset** relations between terms. -- **String Generation**: Generate example strings matching a term, for testing or sampling. -- **Performance & Tuning**: Pluggable `ExecutionProfile` to bound time and resource usage. +## Installation -This library also exposes the `regex` and `fast_automaton` modules for advanced use, providing low-level APIs for direct pattern and automaton operations. 
+Add to your `Cargo.toml`: -## Installation -Add the following line in your `Cargo.toml`: ```toml [dependencies] regexsolver = "1" ``` -## Examples + +## Example ```rust use regexsolver::Term; @@ -75,10 +65,54 @@ assert!(!a.are_equivalent(&b).unwrap()); assert!(a.is_subset_of(&b).unwrap()); ``` -## Execution Profiles +## API + +### Term + +`Term` is an enum designed to represent either a regular expression pattern or a compiled automaton. This unified representation enables seamless and efficient execution of set operations across multiple instances. It's particularly valuable when working with both regular expressions and automata, allowing operations to be performed transparently regardless of the underlying representation. + +| Method | Return | Description | +| -------- | ------- | ------- | +| `Term::new_empty()` | `Term` | Create a term that matches the empty language. | +| `Term::new_total()` | `Term` | Create a term that matches all possible strings. | +| `Term::new_empty_string()` | `Term` | Create a term that only match the empty string `""`. | +| `Term::from_pattern(pattern: &str)` | `Result` | Parse the provided pattern and return a new `Term` holding the resulting `RegularExpression`. | +| `Term::from_regex(regex: RegularExpression)` | `Term` | Create a new `Term` holding the provided `RegularExpression`. | +| `Term::from_automaton(automaton: FastAutomaton)` | `Term` | Create a new `Term` holding the provided `FastAutomaton`. | +| `self.concat(terms: &[Term])` | `Result` | Compute the concatenation of the given collection of terms. Returns the resulting term. | +| `self.union(terms: &[Term])` | `Result` | Compute the union of the given collection of terms. Returns the resulting term. | +| `self.intersection(terms: &[Term])` | `Result` | Compute the intersection of the given collection of terms. Returns the resulting term. | +| `self.subtraction(subtrahend: &Term)` | `Result` | Compute the subtraction/difference of the two given terms. Returns the resulting term. | +| `self.difference(subtrahend: &Term)` | `Result` | See `self.subtraction(subtrahend: &Term)`. | +| `self.repeat(min: u32, max_opt: Option)` | `Result` | Returns the repetition of the current term, between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. | +| `self.generate_strings(count: usize)` | `Result, EngineError>` | Generate the given count of strings matched by the given term. | +| `self.are_equivalent(term: &Term)` | `Result` | Compute whether the current term and the given term are equivalent. Returns `true` if both terms accept the same language. | +| `self.is_subset_of(term: &Term)` | `Result` | Compute whether the current term is a subset of the given term. Returns `true` if all strings matched by the current term are also matched by the given term. | +| `self.is_empty()` | `bool` | Check if the current term matches the empty language. | +| `self.is_total()` | `bool` | Check if the current term matches all possible strings. | +| `self.is_empty_string()` | `bool` | Check if the current term only match the empty string `""`. | +| `self.get_length()` | `(Option, Option)` | Returns the minimum and maximum length of the possible matched strings. | +| `self.get_cardinality()` | `Result, EngineError>` | Returns the cardinality of the provided term (i.e. the number of the possible matched strings). | + + +### FastAutomaton + +`FastAutomaton` is used to directly build, manipulate and analyze automata. 
To convert an automaton to a `RegularExpression` the method `to_regex()` can be used, not all automaton can be converted to a regular expression. + + + +### RegularExpression + +`RegularExpression` is used to directly build, manipulate and analyze regular expression patterns. Not all the set operations are available, for more advanced operation such as intersection, subtraction/difference and complement it is necessary to convert in to a `FastAutomaton` with the method `to_automaton()`. + +## Error Handling + +## Bound Execution + By default, all operations run without limits. For heavy or untrusted patterns, use a thread local `ExecutionProfile` to cap execution time and maximum number of states in used automata. -### Example: Limit the execution time +### Time-Bounded Execution + ```rust use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; @@ -94,7 +128,8 @@ execution_profile.run(|| { }); ``` -### Example: Limit the number of states +### State-Limited Execution + ```rust use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; @@ -111,11 +146,31 @@ execution_profile.run(|| { }); ``` -## Usage with other programming languages + + +## Key Concepts & Limitations + +RegexSolver supports a subset of regular expressions that adhere to the principles of regular languages. Here are the key characteristics and limitations of the regular expressions supported by RegexSolver: + +- **Anchored Expressions:** All regular expressions in RegexSolver are anchored. This means that the expressions are treated as if they start and end at the boundaries of the input text. For example, the expression `abc` will match the string "abc" but not "xabc" or "abcx". +- **Lookahead/Lookbehind:** RegexSolver does not support lookahead (`(?=...)`) or lookbehind (`(?<=...)`) assertions. Using them would return an error. +- **Greedy/Ungreedy Quantifiers:** The concept of ungreedy (`*?`, `+?`, `??`) quantifiers is not supported. All quantifiers are treated as greedy. For example, `a*` or `a*?` will match the longest possible sequence of "a"s. +- **Line Feed and Dot:** RegexSolver handle every characters the same way. The dot character . matches every possible unicode characters including the line feed (`\n`). +- **Pure Regular Expressions:** RegexSolver focuses on pure regular expressions as defined in regular language theory. This means features that extend beyond regular languages, such as backreferences (`\1`, `\2`, etc.), are not supported. Any use of backreference would return an error. +- **Empty Regular Expressions:** An empty regular expression is denoted by `[]`, which represents a pattern that matches no input, not even an empty string. + +RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/regex_syntax/) library for parsing expressions. As a result, unsupported features supported by the parser will be parsed but ignored. This allows for some flexibility in writing regular expressions, but it is important to be aware of the unsupported features to avoid unexpected behavior. 
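+To make the anchoring and empty-language behavior concrete, here is a small illustrative sketch using the `Term` API described above (the assertions are examples written for this README, not an exhaustive specification of the behavior):
+
+```rust
+use regexsolver::Term;
+
+// Anchored matching: the pattern `abc` describes exactly the string "abc",
+// so every accepted string has length 3 and the language sits inside `.*abc.*`.
+let abc = Term::from_pattern("abc").unwrap();
+assert_eq!(abc.get_length(), (Some(3), Some(3)));
+assert!(abc.is_subset_of(&Term::from_pattern(".*abc.*").unwrap()).unwrap());
+
+// The empty language (denoted `[]` in this library) matches no input at all, not even "".
+let empty = Term::new_empty();
+assert!(empty.is_empty());
+assert!(!empty.is_empty_string());
+```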
+ +## Cross-Language Support + If you want to use this library with other programming languages, we provide a wide range of wrappers: - [regexsolver-java](https://github.com/RegexSolver/regexsolver-java) - [regexsolver-js](https://github.com/RegexSolver/regexsolver-js) - [regexsolver-python](https://github.com/RegexSolver/regexsolver-python) -For more information about how to use the wrappers, you can refer to our [getting started guide](https://docs.regexsolver.com/getting-started.html). \ No newline at end of file +For more information about how to use the wrappers, you can refer to our [getting started guide](https://docs.regexsolver.com/getting-started.html). + +## License + +This project is licensed under the MIT License. diff --git a/src/execution_profile.rs b/src/execution_profile.rs index 86045d5..708fbac 100644 --- a/src/execution_profile.rs +++ b/src/execution_profile.rs @@ -280,7 +280,7 @@ mod tests { #[test] fn test_execution_timeout_generate_strings() -> Result<(), String> { - let term = Term::from_regex(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); + let term = Term::from_pattern(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); let execution_timeout_in_ms = 10; let start_time = SystemTime::now(); @@ -307,8 +307,8 @@ mod tests { #[test] fn test_execution_timeout_difference() -> Result<(), String> { - let term1 = Term::from_regex(".*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); - let term2 = Term::from_regex(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); + let term1 = Term::from_pattern(".*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); + let term2 = Term::from_pattern(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); let execution_timeout_in_ms = 50; let start_time = SystemTime::now(); @@ -335,8 +335,8 @@ mod tests { #[test] fn test_execution_timeout_intersection() -> Result<(), String> { - let term1 = Term::from_regex(".*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); - let term2 = Term::from_regex(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); + let term1 = Term::from_pattern(".*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); + let term2 = Term::from_pattern(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); let execution_timeout_in_ms = 100; let start_time = SystemTime::now(); diff --git a/src/fast_automaton/analyze/mod.rs b/src/fast_automaton/analyze/mod.rs index 56f0884..9340d7d 100644 --- a/src/fast_automaton/analyze/mod.rs +++ b/src/fast_automaton/analyze/mod.rs @@ -25,6 
+25,11 @@ impl FastAutomaton { false } + #[inline] + pub fn is_empty_string(&self) -> bool { + self.accept_states.len() == 1 && self.accept_states.contains(&self.start_state) && self.in_degree(self.start_state) == 0 + } + pub fn get_reacheable_states(&self) -> IntSet { let mut states_map: IntMap> = IntMap::with_capacity_and_hasher(self.transitions.len(), BuildHasherDefault::default()); diff --git a/src/lib.rs b/src/lib.rs index 3cf1c82..5bd5ea0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -51,17 +51,42 @@ impl Display for Term { } impl Term { - /// Create a term based on the given pattern. + /// Create a term that matches the empty language. + pub fn new_empty() -> Self { + Term::RegularExpression(RegularExpression::new_empty()) + } + + /// Create a term that matches all possible strings. + pub fn new_total() -> Self { + Term::RegularExpression(RegularExpression::new_total()) + } + + /// Create a term that only match the empty string `""`. + pub fn new_empty_string() -> Self { + Term::RegularExpression(RegularExpression::new_empty_string()) + } + + /// Parse the provided pattern and return a new `Term` holding the resulting `RegularExpression`. /// /// # Example: /// /// ``` /// use regexsolver::Term; /// - /// let term = Term::from_regex(".*abc.*").unwrap(); + /// let term = Term::from_pattern(".*abc.*").unwrap(); /// ``` - pub fn from_regex(regex: &str) -> Result { - Ok(Term::RegularExpression(RegularExpression::new(regex)?)) + pub fn from_pattern(pattern: &str) -> Result { + Ok(Term::RegularExpression(RegularExpression::new(pattern)?)) + } + + /// Create a new `Term` holding the provided `RegularExpression`. + pub fn from_regex(regex: RegularExpression) -> Self { + Term::RegularExpression(regex) + } + + /// Create a new `Term` holding the provided `FastAutomaton`. + pub fn from_automaton(automaton: FastAutomaton) -> Self { + Term::Automaton(automaton) } /// Compute the concatenation of the current term with the given list of terms. @@ -302,7 +327,7 @@ impl Term { } } - /// Generate strings matched by the given term. + /// Generate the given count of strings matched by the given term. /// /// # Example: /// @@ -369,6 +394,59 @@ impl Term { automaton_1.is_subset_of(&automaton_2) } + + /// Check if the current term matches the empty language. + pub fn is_empty(&self) -> bool { + match self { + Term::RegularExpression(regular_expression) => regular_expression.is_empty(), + Term::Automaton(fast_automaton) => fast_automaton.is_empty(), + } + } + + /// Check if the current term matches all possible strings. + pub fn is_total(&self) -> bool { + match self { + Term::RegularExpression(regular_expression) => regular_expression.is_total(), + Term::Automaton(fast_automaton) => fast_automaton.is_total(), + } + } + + /// Check if the current term only match the empty string `""`. + pub fn is_empty_string(&self) -> bool { + match self { + Term::RegularExpression(regular_expression) => regular_expression.is_empty_string(), + Term::Automaton(fast_automaton) => fast_automaton.is_empty_string(), + } + } + + /// Returns the minimum and maximum length of the possible matched strings. + pub fn get_length(&self) -> (Option, Option) { + match self { + Term::RegularExpression(regex) => regex.get_length(), + Term::Automaton(automaton) => automaton.get_length(), + } + } + + /// Returns the cardinality of the provided term (i.e. the number of the possible matched strings). 
+ pub fn get_cardinality(&self) -> Result, EngineError> { + match self { + Term::RegularExpression(regex) => Ok(regex.get_cardinality()), + Term::Automaton(automaton) => { + let cardinality = if !automaton.is_determinitic() { + automaton.determinize()?.get_cardinality() + } else { + automaton.get_cardinality() + }; + + if let Some(cardinality) = cardinality { + Ok(cardinality) + } else { + Err(EngineError::CannotComputeAutomatonCardinality) + } + } + } + } + fn determinize_subtrahend<'a>( minuend: &FastAutomaton, subtrahend: &'a FastAutomaton, @@ -442,58 +520,6 @@ impl Term { } }) } - - /// Create a term that matches the empty language. - pub fn new_empty() -> Self { - Term::RegularExpression(RegularExpression::new_empty()) - } - - /// Create a term that matches all possible strings. - pub fn new_total() -> Self { - Term::RegularExpression(RegularExpression::new_total()) - } - - /// Check if the current term matches the empty language. - pub fn is_empty(&self) -> bool { - match self { - Term::RegularExpression(regular_expression) => regular_expression.is_empty(), - Term::Automaton(fast_automaton) => fast_automaton.is_empty(), - } - } - - /// Check if the current term matches all possible strings. - pub fn is_total(&self) -> bool { - match self { - Term::RegularExpression(regular_expression) => regular_expression.is_total(), - Term::Automaton(fast_automaton) => fast_automaton.is_total(), - } - } - - pub fn get_length(&self) -> (Option, Option) { - match self { - Term::RegularExpression(regex) => regex.get_length(), - Term::Automaton(automaton) => automaton.get_length(), - } - } - - pub fn get_cardinality(&self) -> Result, EngineError> { - match self { - Term::RegularExpression(regex) => Ok(regex.get_cardinality()), - Term::Automaton(automaton) => { - let cardinality = if !automaton.is_determinitic() { - automaton.determinize()?.get_cardinality() - } else { - automaton.get_cardinality() - }; - - if let Some(cardinality) = cardinality { - Ok(cardinality) - } else { - Err(EngineError::CannotComputeAutomatonCardinality) - } - } - } - } } #[cfg(test)] @@ -504,8 +530,8 @@ mod tests { #[test] fn test_details() -> Result<(), String> { - let regex1 = Term::from_regex("a").unwrap(); - let regex2 = Term::from_regex("b").unwrap(); + let regex1 = Term::from_pattern("a").unwrap(); + let regex2 = Term::from_pattern("b").unwrap(); let details = regex1.intersection(&vec![regex2]); assert!(details.is_ok()); @@ -515,8 +541,8 @@ mod tests { #[test] fn test_subtraction_1() -> Result<(), String> { - let regex1 = Term::from_regex("a*").unwrap(); - let regex2 = Term::from_regex("").unwrap(); + let regex1 = Term::from_pattern("a*").unwrap(); + let regex2 = Term::from_pattern("").unwrap(); let result = regex1.subtraction(®ex2); assert!(result.is_ok()); @@ -531,8 +557,8 @@ mod tests { #[test] fn test_subtraction_2() -> Result<(), String> { - let regex1 = Term::from_regex("x*").unwrap(); - let regex2 = Term::from_regex("(xxx)*").unwrap(); + let regex1 = Term::from_pattern("x*").unwrap(); + let regex2 = Term::from_pattern("(xxx)*").unwrap(); let result = regex1.subtraction(®ex2); assert!(result.is_ok()); @@ -547,21 +573,21 @@ mod tests { #[test] fn test_intersection_1() -> Result<(), String> { - let regex1 = Term::from_regex("a*").unwrap(); - let regex2 = Term::from_regex("b*").unwrap(); + let regex1 = Term::from_pattern("a*").unwrap(); + let regex2 = Term::from_pattern("b*").unwrap(); let result = regex1.intersection(&vec![regex2]); assert!(result.is_ok()); let result = result.unwrap(); - 
assert_eq!(Term::from_regex("").unwrap(), result); + assert_eq!(Term::from_pattern("").unwrap(), result); Ok(()) } #[test] fn test_intersection_2() -> Result<(), String> { - let regex1 = Term::from_regex("x*").unwrap(); - let regex2 = Term::from_regex("(xxx)*").unwrap(); + let regex1 = Term::from_pattern("x*").unwrap(); + let regex2 = Term::from_pattern("(xxx)*").unwrap(); let result = regex1.intersection(&vec![regex2]); assert!(result.is_ok()); @@ -577,33 +603,33 @@ mod tests { #[test] fn test_readme_code_1() -> Result<(), String> { // Create terms from regex - let t1 = Term::from_regex("abc.*").unwrap(); - let t2 = Term::from_regex(".*xyz").unwrap(); + let t1 = Term::from_pattern("abc.*").unwrap(); + let t2 = Term::from_pattern(".*xyz").unwrap(); // Concatenate let concat = t1.concat(&[t2]).unwrap(); assert_eq!(concat.to_string(), "abc.*xyz"); // Union - let union = t1.union(&[Term::from_regex("fgh").unwrap()]).unwrap(); // (abc.*|fgh) + let union = t1.union(&[Term::from_pattern("fgh").unwrap()]).unwrap(); // (abc.*|fgh) assert_eq!(union.to_string(), "(abc.*|fgh)"); // Intersection - let inter = Term::from_regex("(ab|xy){2}") + let inter = Term::from_pattern("(ab|xy){2}") .unwrap() - .intersection(&[Term::from_regex(".*xy").unwrap()]) + .intersection(&[Term::from_pattern(".*xy").unwrap()]) .unwrap(); // (ab|xy)xy assert_eq!(inter.to_string(), "(ab|xy)xy"); // Subtraction - let diff = Term::from_regex("a*") + let diff = Term::from_pattern("a*") .unwrap() - .subtraction(&Term::from_regex("").unwrap()) + .subtraction(&Term::from_pattern("").unwrap()) .unwrap(); assert_eq!(diff.to_string(), "a+"); // Repetition - let rep = Term::from_regex("abc").unwrap().repeat(2, Some(4)).unwrap(); // (abc){2,4} + let rep = Term::from_pattern("abc").unwrap().repeat(2, Some(4)).unwrap(); // (abc){2,4} assert_eq!(rep.to_string(), "(abc){2,4}"); // Analyze @@ -611,15 +637,15 @@ mod tests { assert!(!rep.is_empty()); // Generate examples - let samples = Term::from_regex("(x|y){1,3}") + let samples = Term::from_pattern("(x|y){1,3}") .unwrap() .generate_strings(5) .unwrap(); println!("Some matches: {:?}", samples); // Equivalence & subset - let a = Term::from_regex("a+").unwrap(); - let b = Term::from_regex("a*").unwrap(); + let a = Term::from_pattern("a+").unwrap(); + let b = Term::from_pattern("a*").unwrap(); assert!(!a.are_equivalent(&b).unwrap()); assert!(a.is_subset_of(&b).unwrap()); @@ -628,7 +654,7 @@ mod tests { #[test] fn test_readme_code_2() -> Result<(), String> { - let term = Term::from_regex(".*abc.*cdef.*sqdsqf.*").unwrap(); + let term = Term::from_pattern(".*abc.*cdef.*sqdsqf.*").unwrap(); let execution_profile = ExecutionProfileBuilder::new() .execution_timeout(5) // We set the limit (5ms) @@ -647,8 +673,8 @@ mod tests { #[test] fn test_readme_code_3() -> Result<(), String> { - let term1 = Term::from_regex(".*abcdef.*").unwrap(); - let term2 = Term::from_regex(".*defabc.*").unwrap(); + let term1 = Term::from_pattern(".*abcdef.*").unwrap(); + let term2 = Term::from_pattern(".*defabc.*").unwrap(); let execution_profile = ExecutionProfileBuilder::new() .max_number_of_states(5) // We set the limit From a47c77912c06b6080ef338242485f22658d30ab9 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sat, 2 Aug 2025 17:43:49 +0200 Subject: [PATCH 09/44] rename methods --- README.md | 206 +++++++++++++----- src/execution_profile.rs | 6 +- src/fast_automaton/analyze/cardinality.rs | 6 +- src/fast_automaton/analyze/length.rs | 4 +- 
src/fast_automaton/analyze/mod.rs | 8 +- src/fast_automaton/builder.rs | 123 ++++++----- src/fast_automaton/condition/converter.rs | 27 ++- src/fast_automaton/condition/mod.rs | 73 +++---- .../convert/to_regex/builder/mod.rs | 14 +- .../convert/to_regex/builder/scc.rs | 2 +- src/fast_automaton/convert/to_regex/mod.rs | 11 +- .../convert/to_regex/transform.rs | 2 +- src/fast_automaton/generate.rs | 6 +- src/fast_automaton/mod.rs | 191 ++++++++-------- .../operation/{concatenate.rs => concat.rs} | 16 +- src/fast_automaton/operation/determinize.rs | 4 +- src/fast_automaton/operation/intersection.rs | 28 +-- src/fast_automaton/operation/mod.rs | 8 +- src/fast_automaton/operation/repeat.rs | 16 +- src/fast_automaton/operation/subtraction.rs | 10 +- .../operation/{alternation.rs => union.rs} | 38 ++-- src/fast_automaton/spanning_set/mod.rs | 26 ++- src/lib.rs | 145 ++++++------ src/regex/builder.rs | 14 +- src/regex/mod.rs | 6 +- src/regex/operation/mod.rs | 40 ++-- src/tokenizer/embed_automaton.rs | 61 +++--- src/tokenizer/embed_regex.rs | 4 +- src/tokenizer/mod.rs | 8 +- src/tokenizer/range_tokenizer.rs | 14 +- 30 files changed, 607 insertions(+), 510 deletions(-) rename src/fast_automaton/operation/{concatenate.rs => concat.rs} (96%) rename src/fast_automaton/operation/{alternation.rs => union.rs} (87%) diff --git a/README.md b/README.md index c6b28c5..1385f96 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,20 @@ **RegexSolver** is a high-performance Rust library for building, combining, and analyzing regular expressions and finite automata. Ideal for constraint solvers, code generators, test-case generators, and any use case requiring rich regex/automaton operations. +## Table of Contents + + - [Installation](#installation) + - [Example](#example) + - [Key Concepts & Limitations](#key-concepts-limitations) + - [API](#api) + - [Term](#term) + - [FastAutomaton](#fastautomaton) + - [RegularExpression](#regularexpression) + - [Error Handling](#error-handling) + - [Bound Execution](#bound-execution) + - [Cross-Language Support](#cross-language-support) + - [License](#license) + ## Installation Add to your `Cargo.toml`: @@ -18,33 +32,33 @@ regexsolver = "1" use regexsolver::Term; // Create terms from regex -let t1 = Term::from_regex("abc.*").unwrap(); -let t2 = Term::from_regex(".*xyz").unwrap(); +let t1 = Term::from_pattern("abc.*").unwrap(); +let t2 = Term::from_pattern(".*xyz").unwrap(); // Concatenate let concat = t1.concat(&[t2]).unwrap(); assert_eq!(concat.to_string(), "abc.*xyz"); // Union -let union = t1.union(&[Term::from_regex("fgh").unwrap()]).unwrap(); +let union = t1.union(&[Term::from_pattern("fgh").unwrap()]).unwrap(); assert_eq!(union.to_string(), "(abc.*|fgh)"); // Intersection -let inter = Term::from_regex("(ab|xy){2}") +let inter = Term::from_pattern("(ab|xy){2}") .unwrap() - .intersection(&[Term::from_regex(".*xy").unwrap()]) + .intersection(&[Term::from_pattern(".*xy").unwrap()]) .unwrap(); // (ab|xy)xy assert_eq!(inter.to_string(), "(ab|xy)xy"); // Subtraction -let diff = Term::from_regex("a*") +let diff = Term::from_pattern("a*") .unwrap() - .subtraction(&Term::from_regex("").unwrap()) + .subtraction(&Term::from_pattern("").unwrap()) .unwrap(); assert_eq!(diff.to_string(), "a+"); // Repetition -let rep = Term::from_regex("abc").unwrap().repeat(2, Some(4)).unwrap(); +let rep = Term::from_pattern("abc").unwrap().repeat(2, Some(4)).unwrap(); assert_eq!(rep.to_string(), "(abc){2,4}"); // Analyze @@ -52,58 +66,161 @@ assert_eq!(rep.get_length(), (Some(6), Some(12))); 
assert!(!rep.is_empty());

// Generate examples
-let samples = Term::from_regex("(x|y){1,3}")
+let samples = Term::from_pattern("(x|y){1,3}")
    .unwrap()
    .generate_strings(5)
    .unwrap();
println!("Some matches: {:?}", samples);

// Equivalence & subset
-let a = Term::from_regex("a+").unwrap();
-let b = Term::from_regex("a*").unwrap();
+let a = Term::from_pattern("a+").unwrap();
+let b = Term::from_pattern("a*").unwrap();
assert!(!a.are_equivalent(&b).unwrap());
assert!(a.is_subset_of(&b).unwrap());
```

+## Key Concepts & Limitations
+
+RegexSolver supports a subset of regular expressions that adhere to the principles of regular languages. Here are the key characteristics and limitations of the regular expressions supported by RegexSolver:
+- **Anchored Expressions:** All regular expressions in RegexSolver are anchored. This means that the expressions are treated as if they start and end at the boundaries of the input text. For example, the expression `abc` will match the string "abc" but not "xabc" or "abcx".
+- **Lookahead/Lookbehind:** RegexSolver does not support lookahead (`(?=...)`) or lookbehind (`(?<=...)`) assertions. Using them would return an error.
+- **Greedy/Ungreedy Quantifiers:** The concept of ungreedy (`*?`, `+?`, `??`) quantifiers is not supported. All quantifiers are treated as greedy. For example, `a*` or `a*?` will match the longest possible sequence of "a"s.
+- **Line Feed and Dot:** RegexSolver handles every character the same way. The dot character `.` matches every possible Unicode character, including the line feed (`\n`).
+- **Pure Regular Expressions:** RegexSolver focuses on pure regular expressions as defined in regular language theory. This means features that extend beyond regular languages, such as backreferences (`\1`, `\2`, etc.), are not supported. Any use of backreferences would return an error.
+- **Empty Regular Expressions:** An empty regular expression is denoted by `[]`, which represents a pattern that matches no input, not even an empty string.
+
+RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/regex_syntax/) library for parsing patterns. As a result, features that the parser accepts but RegexSolver does not support are parsed but ignored. This allows for some flexibility in writing regular expressions, but it is important to be aware of the unsupported features to avoid unexpected behavior.
+
## API

### Term

-`Term` is an enum designed to represent either a regular expression pattern or a compiled automaton. This unified representation enables seamless and efficient execution of set operations across multiple instances. It's particularly valuable when working with both regular expressions and automata, allowing operations to be performed transparently regardless of the underlying representation.
+`Term` is an enum designed to represent either a regular expression or a compiled automaton. This unified representation enables seamless and efficient execution of set operations across multiple instances. It's particularly valuable when working with both regular expressions and automata, allowing operations to be performed transparently regardless of the underlying representation.
+
+#### Build
+| Method | Return | Description |
+| -------- | ------- | ------- |
+| `new_empty()` | `Term` | Creates a term that matches the empty language. |
+| `new_total()` | `Term` | Creates a term that matches all possible strings. |
+| `new_empty_string()` | `Term` | Creates a term that only matches the empty string `""`.
|
+| `from_pattern(pattern: &str)` | `Result` | Parses the provided pattern and returns a new `Term` holding the resulting `RegularExpression`. |
+| `from_regex(regex: RegularExpression)` | `Term` | Creates a new `Term` holding the provided `RegularExpression`. |
+| `from_automaton(automaton: FastAutomaton)` | `Term` | Creates a new `Term` holding the provided `FastAutomaton`. |
+
+#### Manipulate
+| Method | Return | Description |
+| -------- | ------- | ------- |
+| `concat(&self, terms: &[Term])` | `Result` | Computes the concatenation of the given collection of terms. Returns the resulting term. |
+| `union(&self, terms: &[Term])` | `Result` | Computes the union of the given collection of terms. Returns the resulting term. |
+| `intersection(&self, terms: &[Term])` | `Result` | Computes the intersection of the given collection of terms. Returns the resulting term. |
+| `subtraction(&self, subtrahend: &Term)` | `Result` | Computes the subtraction/difference of the two given terms. Returns the resulting term. |
+| `difference(&self, subtrahend: &Term)` | `Result` | See `self.subtraction(subtrahend: &Term)`. |
+| `repeat(&self, min: u32, max_opt: Option)` | `Result` | Returns the repetition of the current term, between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. |
+
+#### Analyze
| Method | Return | Description |
| -------- | ------- | ------- |
-| `Term::new_empty()` | `Term` | Create a term that matches the empty language. |
-| `Term::new_total()` | `Term` | Create a term that matches all possible strings. |
-| `Term::new_empty_string()` | `Term` | Create a term that only match the empty string `""`. |
-| `Term::from_pattern(pattern: &str)` | `Result` | Parse the provided pattern and return a new `Term` holding the resulting `RegularExpression`. |
-| `Term::from_regex(regex: RegularExpression)` | `Term` | Create a new `Term` holding the provided `RegularExpression`. |
-| `Term::from_automaton(automaton: FastAutomaton)` | `Term` | Create a new `Term` holding the provided `FastAutomaton`. |
-| `self.concat(terms: &[Term])` | `Result` | Compute the concatenation of the given collection of terms. Returns the resulting term. |
-| `self.union(terms: &[Term])` | `Result` | Compute the union of the given collection of terms. Returns the resulting term. |
-| `self.intersection(terms: &[Term])` | `Result` | Compute the intersection of the given collection of terms. Returns the resulting term. |
-| `self.subtraction(subtrahend: &Term)` | `Result` | Compute the subtraction/difference of the two given terms. Returns the resulting term. |
-| `self.difference(subtrahend: &Term)` | `Result` | See `self.subtraction(subtrahend: &Term)`. |
-| `self.repeat(min: u32, max_opt: Option)` | `Result` | Returns the repetition of the current term, between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. |
-| `self.generate_strings(count: usize)` | `Result, EngineError>` | Generate the given count of strings matched by the given term. |
-| `self.are_equivalent(term: &Term)` | `Result` | Compute whether the current term and the given term are equivalent. Returns `true` if both terms accept the same language. |
-| `self.is_subset_of(term: &Term)` | `Result` | Compute whether the current term is a subset of the given term. Returns `true` if all strings matched by the current term are also matched by the given term. |
-| `self.is_empty()` | `bool` | Check if the current term matches the empty language.
|
-| `self.is_total()` | `bool` | Check if the current term matches all possible strings. |
-| `self.is_empty_string()` | `bool` | Check if the current term only match the empty string `""`. |
-| `self.get_length()` | `(Option, Option)` | Returns the minimum and maximum length of the possible matched strings. |
-| `self.get_cardinality()` | `Result, EngineError>` | Returns the cardinality of the provided term (i.e. the number of the possible matched strings). |
+| `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates the given count of strings matched by the given term. |
+| `are_equivalent(&self, term: &Term)` | `Result` | Computes whether the current term and the given term are equivalent. Returns `true` if both terms accept the same language. |
+| `is_subset_of(&self, term: &Term)` | `Result` | Computes whether the current term is a subset of the given term. Returns `true` if all strings matched by the current term are also matched by the given term. |
+| `is_empty(&self)` | `bool` | Checks if the current term matches the empty language. |
+| `is_total(&self)` | `bool` | Checks if the current term matches all possible strings. |
+| `is_empty_string(&self)` | `bool` | Checks if the current term only matches the empty string `""`. |
+| `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of the possible matched strings. |
+| `get_cardinality(&self)` | `Result, EngineError>` | Returns the cardinality of the provided term (i.e. the number of possible matched strings). |
+| `to_automaton(&self)` | `Result, EngineError>` | Converts the current `Term` to a `FastAutomaton`. |
+| `to_regex(&self)` | `Option>` | Converts the current `Term` to a `RegularExpression`. Returns `None` if the automaton cannot be converted. |

### FastAutomaton

-`FastAutomaton` is used to directly build, manipulate and analyze automata. To convert an automaton to a `RegularExpression` the method `to_regex()` can be used, not all automaton can be converted to a regular expression.
+`FastAutomaton` is used to directly build, manipulate and analyze automata. To convert an automaton to a `RegularExpression` the method `to_regex()` can be used. Not all automata can be converted to a regular expression.
+
+When building or modifying an automaton you will likely use the method `add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition)`. This method accepts a `Condition` rather than a raw character set. To build a `Condition`, call:
+```rust
+Condition::from_range(&range, &spanning_set);
+```
+where `spanning_set` is the automaton's current `SpanningSet`. The `CharRange` you pass must be fully covered by that spanning set. If it isn't, you have two options:
+1. Merge an existing spanning set with another:
+```rust
+let new_set = SpanningSet::merge(&old_set, &other_set);
+```
+
+2. Recompute from a list of ranges:
+```rust
+let new_set = SpanningSet::compute_spanning_set(&[range_set1, range_set2, …]);
+```
+
+After constructing `new_set`, apply it to the automaton:
+```rust
+fast_automaton.apply_new_spanning_set(&new_set);
+```
+
+This design allows us to perform unions, intersections, and complements of transition conditions in O(1) time, but it does add some complexity to automaton construction. For more details, you can check [this article](https://alexvbrdn.me/post/optimizing-transition-conditions-automaton-representation).
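+Putting these pieces together, a minimal sketch of building a one-character automaton by hand could look as follows. The method calls are the ones documented in the tables below; the `use` paths are assumptions about the crate layout and may need adjusting:
+
+```rust
+// Assumed import paths; adjust to the crate's actual module layout.
+use regexsolver::{Term, CharRange};
+use regexsolver::fast_automaton::FastAutomaton;
+use regexsolver::fast_automaton::{condition::Condition, spanning_set::SpanningSet};
+use regex_charclass::char::Char;
+
+// Target language: exactly one lowercase ASCII letter.
+let range = CharRange::new_from_range(Char::new('a')..=Char::new('z'));
+
+let mut automaton = FastAutomaton::new_empty();
+let accept = automaton.new_state();
+let start = automaton.get_start_state();
+
+// Conditions must be expressed over the automaton's spanning set,
+// so compute a spanning set covering the range and apply it first.
+let spanning_set = SpanningSet::compute_spanning_set(&[range.clone()]);
+automaton.apply_new_spanning_set(&spanning_set).unwrap();
+let condition = Condition::from_range(&range, &spanning_set).unwrap();
+
+automaton.add_transition(start, accept, &condition);
+automaton.accept(accept);
+
+// Wrap the automaton in a `Term` to reuse the higher-level analysis helpers.
+let term = Term::from_automaton(automaton);
+assert!(!term.is_empty());
+assert_eq!(term.get_length(), (Some(1), Some(1)));
+// For this particular language, `FastAutomaton::new_from_range(&range)` is the one-step equivalent.
+```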
+
+#### Build
+| Method | Return | Description |
+| -------- | ------- | ------- |
+| `new_empty()` | `FastAutomaton` | Create an automaton that matches the empty language. |
+| `new_total()` | `FastAutomaton` | Create an automaton that matches all possible strings. |
+| `new_empty_string()` | `FastAutomaton` | Create an automaton that only matches the empty string `""`. |
+| `new_from_range(range: &CharRange)` | `Result` | Create an automaton that matches one of the characters in the provided `CharRange`. |
+| `new_state(&mut self)` | `State` | Create a new state in the automaton and return its identifier. |
+| `accept(&mut self, state: State)` | | Make the automaton accept the provided state as a valid final state. |
+| `add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition)` | | Create a new transition between the two provided states with the given condition. The provided condition must be expressed over the same spanning set as the rest of the automaton. |
+| `add_epsilon_transition(&mut self, from_state: State, to_state: State)` | | Create a new epsilon transition between the two provided states. |
+| `remove_state(&mut self, state: State)` | | Remove the provided state from the automaton. Remove all the transitions it is connected to. Panics if the state is used as the start state. |
+| `remove_states(&mut self, states: &IntSet)` | | Remove the provided states from the automaton. Remove all the transitions they are connected to. Panics if one of the states is used as the start state. |
+| `apply_new_spanning_set(&mut self, new_spanning_set: &SpanningSet)` | `Result<(), EngineError>` | Apply the provided spanning set to the automaton and project all of its conditions on it. |
+
+#### Manipulate
+| Method | Return | Description |
+| -------- | ------- | ------- |
+| `union(&self, other: &FastAutomaton)` | `Result` | |
+| `union_all<'a, I: IntoIterator>(automatons: I)` | `Result` | |
+| `union_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | |
+| `concat(&self, other: &FastAutomaton)` | `Result` | |
+| `concat_all<'a, I: IntoIterator>(automatons: I)` | `Result` | |
+| `determinize(&self)` | `Result` | |
+| `intersection(&self, other: &FastAutomaton)` | `Result` | |
+| `intersection_all<'a, I: IntoIterator>(automatons: I)` | `Result` | |
+| `intersection_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | |
+| `complement(&mut self)` | `Result<(), EngineError>` | |
+| `subtraction(&self, other: &FastAutomaton)` | `Result` | |
+| `repeat(&self, min: u32, max_opt: Option)` | `Result` | |
+
+#### Analyze
+| Method | Return | Description |
+| -------- | ------- | ------- |
+| `state_in_degree(&self, state: State)` | `usize` | Returns the number of transitions to the provided state. |
+| `state_out_degree(&self, state: State)` | `usize` | Returns the number of transitions from the provided state. |
+| `all_states_iter(&self)` | `impl Iterator` | Returns an iterator over the states of the automaton. |
+| `all_states_vec(&self)` | `Vec` | Returns a vector containing the states of the automaton. |
+| `direct_states_iter(&self, state: &State)` | `impl Iterator` | Returns an iterator over all states directly reachable from the given state in one transition. |
+| `direct_states_vec(&self, state: &State)` | `Vec` | Returns a vector containing all states directly reachable from the given state in one transition. |
+| `transitions_to_vec(&self, state: State)` | `Vec` | Returns a vector containing the transitions to the provided state.
|
+| `transitions_from_vec(&self, state: State)` | `Vec` | Returns a vector containing the transitions from the provided state. |
+| `transitions_from_iter(&self, state: State)` | `impl Iterator` | Returns an iterator over the transitions from the provided state. |
+| `transitions_from_iter_mut(&mut self, state: State)` | `impl Iterator` | Returns a mutable iterator over the transitions from the provided state. |
+| `transitions_from_into_iter(&self, state: State)` | `impl Iterator` | Returns an owning iterator over the transitions from the provided state. |
+| `does_transition_exists(&self, from_state: State, to_state: State)` | `bool` | Returns `true` if there is a directed transition between the two provided states. |
+| `get_condition(&self, from_state: State, to_state: State)` | `Option<&Condition>` | Get a reference to the directed transition's condition between the two provided states. |
+| `get_condition_mut(&mut self, from_state: State, to_state: State)` | `Option<&mut Condition>` | Get a mutable reference to the directed transition's condition between the two provided states. |
+| `get_start_state(&self)` | `State` | Returns the start state of the automaton. |
+| `get_accept_states(&self)` | `&IntSet` | Get a reference to the set of accept (final) states of the automaton. |
+| `get_spanning_set(&self)` | `&SpanningSet` | Returns a reference to the automaton's spanning set. |
+| `is_accepted(&self, state: &State)` | `bool` | Returns `true` if the given `state` is one of the automaton's accept states. |
+| `is_determinitic(&self)` | `bool` | Returns `true` if the automaton is deterministic. |
+| `is_cyclic(&self)` | `bool` | Returns `true` if the automaton contains at least one cycle. |
+| `has_state(&self, state: State)` | `bool` | Returns `true` if the provided state exists in the automaton. |
+| `to_regex(&self)` | `Option` | Tries to convert the automaton to a `RegularExpression`. Returns `None` if no equivalent pattern can be found. |
+| `has_intersection(&self, other: &FastAutomaton)` | `Result` | |

### RegularExpression

-`RegularExpression` is used to directly build, manipulate and analyze regular expression patterns. Not all the set operations are available, for more advanced operation such as intersection, subtraction/difference and complement it is necessary to convert in to a `FastAutomaton` with the method `to_automaton()`.
+`RegularExpression` is used to directly build, manipulate and analyze regular expression patterns. Not all set operations are available; for more advanced operations such as intersection, subtraction/difference and complement, it is necessary to convert into a `FastAutomaton` with the method `to_automaton()`.

## Error Handling

@@ -116,7 +233,7 @@ By default, all operations run without limits.
For heavy or untrusted patterns, ```rust use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; -let term = Term::from_regex(".*abc.*cdef.*sqdsqf.*").unwrap(); +let term = Term::from_pattern(".*abc.*cdef.*sqdsqf.*").unwrap(); let execution_profile = ExecutionProfileBuilder::new() .execution_timeout(5) // We set the limit (5ms) @@ -133,8 +250,8 @@ execution_profile.run(|| { ```rust use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; -let term1 = Term::from_regex(".*abcdef.*").unwrap(); -let term2 = Term::from_regex(".*defabc.*").unwrap(); +let term1 = Term::from_pattern(".*abcdef.*").unwrap(); +let term2 = Term::from_pattern(".*defabc.*").unwrap(); let execution_profile = ExecutionProfileBuilder::new() .max_number_of_states(5) // We set the limit @@ -146,21 +263,6 @@ execution_profile.run(|| { }); ``` - - -## Key Concepts & Limitations - -RegexSolver supports a subset of regular expressions that adhere to the principles of regular languages. Here are the key characteristics and limitations of the regular expressions supported by RegexSolver: - -- **Anchored Expressions:** All regular expressions in RegexSolver are anchored. This means that the expressions are treated as if they start and end at the boundaries of the input text. For example, the expression `abc` will match the string "abc" but not "xabc" or "abcx". -- **Lookahead/Lookbehind:** RegexSolver does not support lookahead (`(?=...)`) or lookbehind (`(?<=...)`) assertions. Using them would return an error. -- **Greedy/Ungreedy Quantifiers:** The concept of ungreedy (`*?`, `+?`, `??`) quantifiers is not supported. All quantifiers are treated as greedy. For example, `a*` or `a*?` will match the longest possible sequence of "a"s. -- **Line Feed and Dot:** RegexSolver handle every characters the same way. The dot character . matches every possible unicode characters including the line feed (`\n`). -- **Pure Regular Expressions:** RegexSolver focuses on pure regular expressions as defined in regular language theory. This means features that extend beyond regular languages, such as backreferences (`\1`, `\2`, etc.), are not supported. Any use of backreference would return an error. -- **Empty Regular Expressions:** An empty regular expression is denoted by `[]`, which represents a pattern that matches no input, not even an empty string. - -RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/regex_syntax/) library for parsing expressions. As a result, unsupported features supported by the parser will be parsed but ignored. This allows for some flexibility in writing regular expressions, but it is important to be aware of the unsupported features to avoid unexpected behavior. 
- ## Cross-Language Support diff --git a/src/execution_profile.rs b/src/execution_profile.rs index 708fbac..3ba3a33 100644 --- a/src/execution_profile.rs +++ b/src/execution_profile.rs @@ -10,8 +10,8 @@ use crate::error::EngineError; /// ``` /// use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; /// -/// let term1 = Term::from_regex(".*abcdef.*").unwrap(); -/// let term2 = Term::from_regex(".*defabc.*").unwrap(); +/// let term1 = Term::from_pattern(".*abcdef.*").unwrap(); +/// let term2 = Term::from_pattern(".*defabc.*").unwrap(); /// /// let execution_profile = ExecutionProfileBuilder::new() /// .max_number_of_states(5) @@ -27,7 +27,7 @@ use crate::error::EngineError; /// use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; /// use std::time::SystemTime; /// -/// let term = Term::from_regex(".*abc.*cdef.*sqdsqf.*").unwrap(); +/// let term = Term::from_pattern(".*abc.*cdef.*sqdsqf.*").unwrap(); /// /// let execution_profile = ExecutionProfileBuilder::new() /// .execution_timeout(5) // 5ms diff --git a/src/fast_automaton/analyze/cardinality.rs b/src/fast_automaton/analyze/cardinality.rs index 04ea226..a2d6d91 100644 --- a/src/fast_automaton/analyze/cardinality.rs +++ b/src/fast_automaton/analyze/cardinality.rs @@ -65,9 +65,9 @@ impl FastAutomaton { let mut queue = VecDeque::with_capacity(len); let mut order = Vec::with_capacity(len); - for from_state in &self.transitions_vec() { + for from_state in &self.all_states_vec() { in_degree.entry(*from_state).or_insert(0); - for to_state in self.transitions_from_state_iter(from_state) { + for to_state in self.direct_states_iter(from_state) { *in_degree.entry(to_state).or_insert(0) += 1; } } @@ -80,7 +80,7 @@ impl FastAutomaton { while let Some(from_state) = queue.pop_front() { order.push(from_state); - for to_state in self.transitions_from_state_iter(&from_state) { + for to_state in self.direct_states_iter(&from_state) { *in_degree.entry(to_state).or_default() -= 1; if in_degree[&to_state] == 0 { diff --git a/src/fast_automaton/analyze/length.rs b/src/fast_automaton/analyze/length.rs index 70eccbd..c03ee80 100644 --- a/src/fast_automaton/analyze/length.rs +++ b/src/fast_automaton/analyze/length.rs @@ -26,7 +26,7 @@ impl FastAutomaton { } seen.insert(state); - for to_state in self.transitions_from_state_iter(&state) { + for to_state in self.direct_states_iter(&state) { if to_state == state || seen.contains(&to_state) { is_infinite = true; continue; @@ -53,7 +53,7 @@ impl FastAutomaton { } seen.insert(state); - for to_state in self.transitions_from_state_iter(&state) { + for to_state in self.direct_states_iter(&state) { if to_state == state || seen.contains(&to_state) { max = None; break; diff --git a/src/fast_automaton/analyze/mod.rs b/src/fast_automaton/analyze/mod.rs index 9340d7d..de49b73 100644 --- a/src/fast_automaton/analyze/mod.rs +++ b/src/fast_automaton/analyze/mod.rs @@ -27,15 +27,15 @@ impl FastAutomaton { #[inline] pub fn is_empty_string(&self) -> bool { - self.accept_states.len() == 1 && self.accept_states.contains(&self.start_state) && self.in_degree(self.start_state) == 0 + self.accept_states.len() == 1 && self.accept_states.contains(&self.start_state) && self.state_in_degree(self.start_state) == 0 } pub fn get_reacheable_states(&self) -> IntSet { let mut states_map: IntMap> = IntMap::with_capacity_and_hasher(self.transitions.len(), BuildHasherDefault::default()); - for from_state in self.transitions_iter() { - for 
(to_state, transition) in self.transitions_from_state_enumerate_iter(&from_state) { - if transition.is_empty() { + for from_state in self.all_states_iter() { + for (condition, to_state) in self.transitions_from_iter(from_state) { + if condition.is_empty() { continue; } match states_map.entry(*to_state) { diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index c597747..d6b69f4 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -5,6 +5,7 @@ use crate::error::EngineError; use super::*; impl FastAutomaton { + /// Create an automaton that matches the empty language. #[inline] pub fn new_empty() -> Self { Self { @@ -19,6 +20,7 @@ impl FastAutomaton { } } + /// Create an automaton that only match the empty string `""`. #[inline] pub fn new_empty_string() -> Self { let mut automaton = Self::new_empty(); @@ -26,26 +28,18 @@ impl FastAutomaton { automaton } + /// Create an automaton that matches all possible strings. #[inline] pub fn new_total() -> Self { let mut automaton: FastAutomaton = Self::new_empty(); automaton.spanning_set = SpanningSet::new_total(); automaton.accept(automaton.start_state); - automaton.add_transition_to(0, 0, &Condition::total(&automaton.spanning_set)); + automaton.add_transition(0, 0, &Condition::total(&automaton.spanning_set)); automaton } - #[inline] - pub fn make_empty(&mut self) { - self.apply_model(&Self::new_empty()) - } - - #[inline] - pub fn make_total(&mut self) { - self.apply_model(&Self::new_total()) - } - - pub fn make_from_range(range: &Range) -> Result { + /// Create an automaton that matches one of the characters in the provided `CharRange`. + pub fn new_from_range(range: &CharRange) -> Result { let mut automaton = Self::new_empty(); if range.is_empty() { return Ok(automaton); @@ -55,44 +49,12 @@ impl FastAutomaton { let spanning_set = SpanningSet::compute_spanning_set(&[range.clone()]); let condition = Condition::from_range(range, &spanning_set)?; automaton.spanning_set = spanning_set; - automaton.add_transition_to(0, new_state, &condition); + automaton.add_transition(0, new_state, &condition); automaton.accept(new_state); Ok(automaton) } - pub fn apply_new_spanning_set( - &mut self, - new_spanning_set: &SpanningSet, - ) -> Result<(), EngineError> { - if new_spanning_set == &self.spanning_set { - return Ok(()); - } - let condition_converter = ConditionConverter::new(&self.spanning_set, new_spanning_set)?; - for from_state in &self.transitions_vec() { - for to_state in self.transitions_from_state(from_state) { - match self.transitions[*from_state].entry(to_state) { - Entry::Occupied(mut o) => { - o.insert(condition_converter.convert(o.get())?); - } - Entry::Vacant(_) => {} - }; - } - } - self.spanning_set = new_spanning_set.clone(); - Ok(()) - } - - #[inline] - pub fn apply_model(&mut self, model: &FastAutomaton) { - self.transitions = model.transitions.clone(); - self.start_state = model.start_state; - self.accept_states = model.accept_states.clone(); - self.removed_states = model.removed_states.clone(); - self.spanning_set = model.spanning_set.clone(); - self.deterministic = model.deterministic; - self.cyclic = model.cyclic; - } - + /// Create a new state in the automaton and returns its identifier. #[inline] pub fn new_state(&mut self) -> State { if let Some(new_state) = self.removed_states.clone().iter().next() { @@ -104,13 +66,15 @@ impl FastAutomaton { } } + /// Make the automaton accept the provided state as a valid final state. 
#[inline] pub fn accept(&mut self, state: State) { self.assert_state_exists(state); self.accept_states.insert(state); } - pub fn add_transition_to(&mut self, from_state: State, to_state: State, new_cond: &Condition) { + /// Create a new transition between the two provided states with the given condition. The provided condition must follow the same spanning set as the rest of the automaton. + pub fn add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition) { self.assert_state_exists(from_state); if from_state != to_state { self.assert_state_exists(to_state); } @@ -121,7 +85,7 @@ impl FastAutomaton { if self.deterministic { let mut deterministic = true; - for (state, condition) in self.transitions_from_state_enumerate_iter(&from_state) { + for (condition, state) in self.transitions_from_iter(from_state) { if state == &to_state { continue; } @@ -147,7 +111,8 @@ impl FastAutomaton { }; } - pub fn add_epsilon(&mut self, from_state: State, to_state: State) { + /// Create a new epsilon transition between the two provided states. + pub fn add_epsilon_transition(&mut self, from_state: State, to_state: State) { if from_state == to_state { return; } @@ -157,12 +122,12 @@ impl FastAutomaton { self.accept_states.insert(from_state); } - let transitions_to: Vec<_> = self.transitions_from_state_into_iter(&to_state).collect(); + let transitions_to: Vec<_> = self.transitions_from_into_iter(&to_state).collect(); - for (state, cond) in transitions_to { + for (cond, state) in transitions_to { if self.deterministic { let mut deterministic = true; - for (s, c) in self.transitions_from_state_enumerate_iter(&from_state) { + for (c, s) in self.transitions_from_iter(from_state) { if state == *s { continue; } @@ -188,12 +153,11 @@ impl FastAutomaton { } } + /// Remove the provided state from the automaton. Remove all the transitions it is connected to. Panics if the state is used as the start state. pub fn remove_state(&mut self, state: State) { self.assert_state_exists(state); if self.start_state == state { - panic!( - "Can not remove the state {state}, it is still used as start state." - ); + panic!("Can not remove the state {state}, it is still used as start state."); } self.accept_states.remove(&state); self.transitions_in.remove(&state); @@ -219,6 +183,7 @@ impl FastAutomaton { } } + /// Remove the provided states from the automaton. Remove all the transitions they are connected to. Panics if one of the states is used as the start state. pub fn remove_states(&mut self, states: &IntSet) { self.accept_states.retain(|e| !states.contains(e)); @@ -226,9 +191,7 @@ impl FastAutomaton { for &state in states { if self.start_state == state { - panic!( - "Can not remove the state {state}, it is still used as start state." - ); + panic!("Can not remove the state {state}, it is still used as start state."); } if self.transitions.len() - 1 == state { self.transitions.remove(state); @@ -259,6 +222,50 @@ impl FastAutomaton { } } } + + /// Apply the provided spanning set to the automaton and project all of its conditions onto it.
+ pub fn apply_new_spanning_set( + &mut self, + new_spanning_set: &SpanningSet, + ) -> Result<(), EngineError> { + if new_spanning_set == &self.spanning_set { + return Ok(()); + } + let condition_converter = ConditionConverter::new(&self.spanning_set, new_spanning_set)?; + for from_state in &self.all_states_vec() { + for to_state in self.direct_states_vec(from_state) { + match self.transitions[*from_state].entry(to_state) { + Entry::Occupied(mut o) => { + o.insert(condition_converter.convert(o.get())?); + } + Entry::Vacant(_) => {} + }; + } + } + self.spanning_set = new_spanning_set.clone(); + Ok(()) + } + + #[inline] + pub(crate) fn make_empty(&mut self) { + self.apply_model(&Self::new_empty()) + } + + #[inline] + pub(crate) fn make_total(&mut self) { + self.apply_model(&Self::new_total()) + } + + #[inline] + pub(crate) fn apply_model(&mut self, model: &FastAutomaton) { + self.transitions = model.transitions.clone(); + self.start_state = model.start_state; + self.accept_states = model.accept_states.clone(); + self.removed_states = model.removed_states.clone(); + self.spanning_set = model.spanning_set.clone(); + self.deterministic = model.deterministic; + self.cyclic = model.cyclic; + } } #[cfg(test)] diff --git a/src/fast_automaton/condition/converter.rs b/src/fast_automaton/condition/converter.rs index 89bb123..503d6ce 100644 --- a/src/fast_automaton/condition/converter.rs +++ b/src/fast_automaton/condition/converter.rs @@ -86,17 +86,16 @@ impl<'a, 'b> ConditionConverter<'a, 'b> { #[cfg(test)] mod tests { - use regex_charclass::{char::Char, irange::range::AnyRange}; - - use crate::Range; + use regex_charclass::{char::Char, irange::{range::AnyRange}}; + use crate::CharRange; use super::*; fn get_from_spanning_set() -> SpanningSet { let ranges = vec![ - Range::new_from_range(Char::new('\0')..=Char::new('\u{2}')), - Range::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), - Range::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), + CharRange::new_from_range(Char::new('\0')..=Char::new('\u{2}')), + CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), + CharRange::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), ]; SpanningSet::compute_spanning_set(&ranges) @@ -104,11 +103,11 @@ mod tests { fn get_to_spanning_set() -> SpanningSet { let ranges = vec![ - Range::new_from_range(Char::new('\0')..=Char::new('\u{1}')), - Range::new_from_range(Char::new('\u{2}')..=Char::new('\u{2}')), - Range::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), - Range::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), - Range::new_from_range(Char::new('\u{20}')..=Char::new('\u{22}')), + CharRange::new_from_range(Char::new('\0')..=Char::new('\u{1}')), + CharRange::new_from_range(Char::new('\u{2}')..=Char::new('\u{2}')), + CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), + CharRange::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), + CharRange::new_from_range(Char::new('\u{20}')..=Char::new('\u{22}')), ]; SpanningSet::compute_spanning_set(&ranges) @@ -127,7 +126,7 @@ mod tests { let total = Condition::total(&from_spanning_set); assert!(converter.convert(&total).unwrap().is_total()); - let range = Range::new_from_range(Char::new('\0')..=Char::new('\u{2}')); + let range = CharRange::new_from_range(Char::new('\0')..=Char::new('\u{2}')); let condition = Condition::from_range(&range, &from_spanning_set).unwrap(); assert_eq!( range, @@ -138,7 +137,7 @@ mod tests { .unwrap() ); - let range = 
Range::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')); + let range = CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')); let condition = Condition::from_range(&range, &from_spanning_set).unwrap(); assert_eq!( range, @@ -149,7 +148,7 @@ mod tests { .unwrap() ); - let range = Range::new_from_ranges(&[ + let range = CharRange::new_from_ranges(&[ AnyRange::from(Char::new('\u{4}')..=Char::new('\u{6}')), AnyRange::from(Char::new('\u{9}')..=Char::new('\u{9}')), ]); diff --git a/src/fast_automaton/condition/mod.rs b/src/fast_automaton/condition/mod.rs index 40415e3..08a439b 100644 --- a/src/fast_automaton/condition/mod.rs +++ b/src/fast_automaton/condition/mod.rs @@ -1,10 +1,9 @@ use std::hash::Hash; -use crate::Range; use fast_bit_vec::FastBitVec; use regex_charclass::{char::Char, CharacterClass}; -use crate::error::EngineError; +use crate::{error::EngineError, CharRange}; use super::spanning_set::SpanningSet; pub mod converter; @@ -43,7 +42,7 @@ impl Condition { )) } - pub fn from_range(range: &Range, spanning_set: &SpanningSet) -> Result { + pub fn from_range(range: &CharRange, spanning_set: &SpanningSet) -> Result { if range.is_empty() { return Ok(Self::empty(spanning_set)); } else if range.is_total() { @@ -69,8 +68,8 @@ impl Condition { Ok(cond) } - pub fn to_range(&self, spanning_set: &SpanningSet) -> Result { - let mut range = Range::empty(); + pub fn to_range(&self, spanning_set: &SpanningSet) -> Result { + let mut range = CharRange::empty(); for (i, base) in spanning_set .get_spanning_ranges_with_rest() @@ -166,25 +165,25 @@ mod tests { fn get_spanning_set() -> SpanningSet { let ranges = vec![ - Range::new_from_range(Char::new('\u{0}')..=Char::new('\u{2}')), - Range::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), - Range::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), + CharRange::new_from_range(Char::new('\u{0}')..=Char::new('\u{2}')), + CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), + CharRange::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), ]; SpanningSet::compute_spanning_set(&ranges) } - fn get_test_cases_range() -> Vec { + fn get_test_cases_range() -> Vec { vec![ - Range::empty(), - Range::total(), - Range::new_from_range(Char::new('\u{0}')..=Char::new('\u{2}')), - Range::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), - Range::new_from_ranges(&[ + CharRange::empty(), + CharRange::total(), + CharRange::new_from_range(Char::new('\u{0}')..=Char::new('\u{2}')), + CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), + CharRange::new_from_ranges(&[ AnyRange::from(Char::new('\u{0}')..=Char::new('\u{2}')), AnyRange::from(Char::new('\u{4}')..=Char::new('\u{6}')), ]), - Range::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), + CharRange::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), ] } @@ -200,16 +199,16 @@ mod tests { assert!(total.is_total()); assert_eq!(vec![true, true, true, true], total.get_binary_representation()); - assert_eq!(Range::empty(), empty.to_range(&spanning_set).unwrap()); - assert_eq!(Range::total(), total.to_range(&spanning_set).unwrap()); + assert_eq!(CharRange::empty(), empty.to_range(&spanning_set).unwrap()); + assert_eq!(CharRange::total(), total.to_range(&spanning_set).unwrap()); assert_eq!( empty, - Condition::from_range(&Range::empty(), &spanning_set).unwrap() + Condition::from_range(&CharRange::empty(), &spanning_set).unwrap() ); assert_eq!( total, - Condition::from_range(&Range::total(), &spanning_set).unwrap() + 
Condition::from_range(&CharRange::total(), &spanning_set).unwrap() ); assert_eq!(empty, total.complement()); @@ -219,18 +218,18 @@ mod tests { let empty = Condition::empty(&spanning_set); let total = Condition::total(&spanning_set); - assert_eq!(Range::empty(), empty.to_range(&spanning_set).unwrap()); - assert_eq!(Range::total(), total.to_range(&spanning_set).unwrap()); + assert_eq!(CharRange::empty(), empty.to_range(&spanning_set).unwrap()); + assert_eq!(CharRange::total(), total.to_range(&spanning_set).unwrap()); assert_eq!( empty, - Condition::from_range(&Range::empty(), &spanning_set).unwrap() + Condition::from_range(&CharRange::empty(), &spanning_set).unwrap() ); assert_eq!(vec![false], empty.get_binary_representation()); assert_eq!( total, - Condition::from_range(&Range::total(), &spanning_set).unwrap() + Condition::from_range(&CharRange::total(), &spanning_set).unwrap() ); assert_eq!(vec![true], total.get_binary_representation()); @@ -252,7 +251,7 @@ mod tests { Ok(()) } - fn assert_range_convertion_to_range(range: &Range, spanning_set: &SpanningSet) { + fn assert_range_convertion_to_range(range: &CharRange, spanning_set: &SpanningSet) { let condition = Condition::from_range(range, spanning_set).unwrap(); let range_from_condition = condition.to_range(spanning_set).unwrap(); assert_eq!(range, &range_from_condition); @@ -267,11 +266,11 @@ mod tests { let current_spanning_set = get_spanning_set(); let ranges = vec![ - Range::new_from_range(Char::new('\u{0}')..=Char::new('\u{1}')), - Range::new_from_range(Char::new('\u{2}')..=Char::new('\u{2}')), - Range::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), - Range::new_from_range(Char::new('\u{5}')..=Char::new('\u{6}')), - Range::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), + CharRange::new_from_range(Char::new('\u{0}')..=Char::new('\u{1}')), + CharRange::new_from_range(Char::new('\u{2}')..=Char::new('\u{2}')), + CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), + CharRange::new_from_range(Char::new('\u{5}')..=Char::new('\u{6}')), + CharRange::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), ]; let new_spanning_set = SpanningSet::compute_spanning_set(&ranges); let condition_converter = @@ -296,7 +295,7 @@ mod tests { } fn assert_project_to( - range: &Range, + range: &CharRange, currently_used_spanning_set: &SpanningSet, newly_used_spanning_set: &SpanningSet, condition_converter: &ConditionConverter, @@ -348,8 +347,8 @@ mod tests { } fn assert_union_intersection_complement( - range_1: &Range, - range_2: &Range, + range_1: &CharRange, + range_2: &CharRange, used_characters: &SpanningSet, ) { let condition_1 = Condition::from_range(range_1, used_characters).unwrap(); @@ -378,14 +377,14 @@ mod tests { #[test] fn test_1() -> Result<(), String> { let ranges = vec![ - Range::new_from_range(Char::new('\u{0}')..=Char::new('\u{9}')), - Range::new_from_range(Char::new('\u{B}')..=Char::new('\u{63}')), - Range::new_from_range(Char::new('\u{65}')..=Char::new('\u{10FFFF}')), + CharRange::new_from_range(Char::new('\u{0}')..=Char::new('\u{9}')), + CharRange::new_from_range(Char::new('\u{B}')..=Char::new('\u{63}')), + CharRange::new_from_range(Char::new('\u{65}')..=Char::new('\u{10FFFF}')), ]; let spanning_set = SpanningSet::compute_spanning_set(&ranges); println!("{:?}", spanning_set); - let range1 = Range::new_from_ranges(&[ + let range1 = CharRange::new_from_ranges(&[ AnyRange::from(Char::new('\u{0}')..=Char::new('\u{9}')), AnyRange::from(Char::new('\u{B}')..=Char::new('\u{63}')), 
AnyRange::from(Char::new('\u{65}')..=Char::new('\u{10FFFF}')), @@ -393,7 +392,7 @@ mod tests { let condition1 = Condition::from_range(&range1, &spanning_set).unwrap(); assert_eq!(range1, condition1.to_range(&spanning_set).unwrap()); - let range2 = Range::new_from_range(Char::new('\u{B}')..=Char::new('\u{63}')); + let range2 = CharRange::new_from_range(Char::new('\u{B}')..=Char::new('\u{63}')); let condition2 = Condition::from_range(&range2, &spanning_set).unwrap(); assert_eq!(range2, condition2.to_range(&spanning_set).unwrap()); diff --git a/src/fast_automaton/convert/to_regex/builder/mod.rs b/src/fast_automaton/convert/to_regex/builder/mod.rs index 648f733..0790851 100644 --- a/src/fast_automaton/convert/to_regex/builder/mod.rs +++ b/src/fast_automaton/convert/to_regex/builder/mod.rs @@ -2,7 +2,7 @@ use super::*; mod scc; -impl StateEliminationAutomaton { +impl StateEliminationAutomaton { pub fn new(automaton: &FastAutomaton) -> Result, EngineError> { if automaton.is_empty() { return Ok(None); @@ -19,15 +19,15 @@ impl StateEliminationAutomaton { let mut states_map = IntMap::with_capacity(automaton.get_number_of_states()); - for from_state in automaton.transitions_iter() { + for from_state in automaton.all_states_iter() { let new_from_state = *states_map .entry(from_state) .or_insert_with(|| state_elimination_automaton.new_state()); - for (to_state, condition) in - automaton.transitions_from_state_enumerate_into_iter(&from_state) + for (condition, to_state) in + automaton.transitions_from_iter(from_state) { let new_to_state = *states_map - .entry(to_state) + .entry(*to_state) .or_insert_with(|| state_elimination_automaton.new_state()); state_elimination_automaton.add_transition_to( @@ -93,7 +93,7 @@ impl StateEliminationAutomaton { &mut self, from_state: State, to_state: State, - transition: GraphTransition, + transition: GraphTransition, ) { self.assert_state_exists(from_state); if from_state != to_state { @@ -163,7 +163,7 @@ impl StateEliminationAutomaton { self.transitions[from_state].remove(&to_state); } - pub fn get_transition(&self, from_state: State, to_state: State) -> Option<&GraphTransition> { + pub fn get_transition(&self, from_state: State, to_state: State) -> Option<&GraphTransition> { self.transitions.get(from_state)?.get(&to_state) } } diff --git a/src/fast_automaton/convert/to_regex/builder/scc.rs b/src/fast_automaton/convert/to_regex/builder/scc.rs index 815188a..c99cbc5 100644 --- a/src/fast_automaton/convert/to_regex/builder/scc.rs +++ b/src/fast_automaton/convert/to_regex/builder/scc.rs @@ -1,6 +1,6 @@ use super::*; -impl StateEliminationAutomaton { +impl StateEliminationAutomaton { pub fn identify_and_apply_components(&mut self) -> Result<(), EngineError> { let mut index = 0; let mut stack = Vec::new(); diff --git a/src/fast_automaton/convert/to_regex/mod.rs b/src/fast_automaton/convert/to_regex/mod.rs index 17d539f..2469d03 100644 --- a/src/fast_automaton/convert/to_regex/mod.rs +++ b/src/fast_automaton/convert/to_regex/mod.rs @@ -5,11 +5,10 @@ use std::{ use ahash::{HashMapExt, HashSetExt}; use log::warn; -use nohash_hasher::IntMap; use crate::{error::EngineError, execution_profile::ExecutionProfile, regex::RegularExpression}; -use super::{FastAutomaton, IntSet, Range, State}; +use super::*; mod builder; mod transform; @@ -45,13 +44,13 @@ struct StateEliminationAutomaton { cyclic: bool, } -impl Display for StateEliminationAutomaton { +impl Display for StateEliminationAutomaton { fn fmt(&self, sb: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 
self.to_graph_dot(sb, None) } } -impl StateEliminationAutomaton { +impl StateEliminationAutomaton { //#[cfg(test)] #[allow(dead_code)] #[inline] @@ -165,7 +164,7 @@ impl StateEliminationAutomaton { pub fn transitions_from_state_enumerate_iter( &self, from_state: &State, - ) -> impl Iterator)> { + ) -> impl Iterator)> { self.transitions[*from_state] .iter() .filter(|s| !self.removed_states.contains(s.0)) @@ -180,7 +179,7 @@ impl StateEliminationAutomaton { .collect() } - pub fn in_transitions_vec(&self, to_state: State) -> Vec<(State, GraphTransition)> { + pub fn in_transitions_vec(&self, to_state: State) -> Vec<(State, GraphTransition)> { let mut in_transitions = vec![]; for from_state in self.transitions_in.get(&to_state).unwrap_or(&IntSet::new()) { for (state, transition) in self.transitions_from_state_enumerate_iter(from_state) { diff --git a/src/fast_automaton/convert/to_regex/transform.rs b/src/fast_automaton/convert/to_regex/transform.rs index aaeca76..4498578 100644 --- a/src/fast_automaton/convert/to_regex/transform.rs +++ b/src/fast_automaton/convert/to_regex/transform.rs @@ -4,7 +4,7 @@ use crate::execution_profile::ExecutionProfile; use super::*; -impl StateEliminationAutomaton { +impl StateEliminationAutomaton { pub fn convert_to_regex( &self, execution_profile: &ExecutionProfile, diff --git a/src/fast_automaton/generate.rs b/src/fast_automaton/generate.rs index 0efa0e3..7bbaf58 100644 --- a/src/fast_automaton/generate.rs +++ b/src/fast_automaton/generate.rs @@ -15,10 +15,10 @@ impl FastAutomaton { let execution_profile = ExecutionProfile::get(); - let mut ranges_cache: AHashMap<&Condition, Range> = + let mut ranges_cache: AHashMap<&Condition, CharRange> = AHashMap::with_capacity(self.get_number_of_states()); - let mut worklist: VecDeque<(Vec, usize)> = + let mut worklist: VecDeque<(Vec, usize)> = VecDeque::with_capacity(cmp::min(number, 1000)); let mut visited = AHashSet::with_capacity(cmp::min(number, 1000)); @@ -57,7 +57,7 @@ impl FastAutomaton { break; } } - for (to_state, cond) in self.transitions_from_state_enumerate_iter(&state) { + for (cond, to_state) in self.transitions_from_iter(state) { execution_profile.assert_not_timed_out()?; let range = match ranges_cache.entry(cond) { Entry::Occupied(o) => o.get().clone(), diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index a4da641..4b2475b 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -1,18 +1,25 @@ -use crate::Range; +use crate::error::EngineError; use ahash::{AHashMap, HashSetExt}; use condition::Condition; use regex_charclass::CharacterClass; use spanning_set::SpanningSet; -use std::collections::hash_map::Entry; use std::collections::VecDeque; +use std::collections::hash_map::Entry; use std::fmt::Display; -use crate::error::EngineError; -use crate::{IntMap, IntSet}; +use super::*; -pub(crate) type State = usize; pub(crate) type Transitions = IntMap; +/// The identifier of a state in a [`FastAutomaton`]. +pub type State = usize; + +/// A tuple containing the condition of a transition to a state. +pub type TransitionTo = (Condition, State); + +/// A tuple containing the condition of a transition from a state.
+pub type TransitionFrom = (State, Condition); + mod analyze; mod builder; pub mod condition; @@ -40,7 +47,7 @@ impl Display for FastAutomaton { fn fmt(&self, sb: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { writeln!(sb, "digraph Automaton {{")?; writeln!(sb, "\trankdir = LR;")?; - for from_state in self.transitions_iter() { + for from_state in self.all_states_iter() { write!(sb, "\t{from_state}")?; if self.accept_states.contains(&from_state) { writeln!(sb, "\t[shape=doublecircle,label=\"{from_state}\"];")?; @@ -52,7 +59,7 @@ impl Display for FastAutomaton { writeln!(sb, "\tinitial [shape=plaintext,label=\"\"];")?; writeln!(sb, "\tinitial -> {from_state}")?; } - for (to_state, cond) in self.transitions_from_state_enumerate_iter(&from_state) { + for (cond, to_state) in self.transitions_from_iter(from_state) { writeln!( sb, "\t{from_state} -> {to_state} [label=\"{}\"]", @@ -76,80 +83,110 @@ impl FastAutomaton { } } + /// Returns the number of transitions to the provided state. #[inline] - pub fn in_degree(&self, state: State) -> usize { + pub fn state_in_degree(&self, state: State) -> usize { self.transitions_in .get(&state) .unwrap_or(&IntSet::new()) .len() } + /// Returns the number of transitions from the provided state. #[inline] - pub fn out_degree(&self, state: State) -> usize { + pub fn state_out_degree(&self, state: State) -> usize { self.transitions[state].len() } - pub fn in_transitions(&self, state: State) -> Vec<(usize, Condition)> { + /// Returns an iterator over the states of the automaton. + #[inline] + pub fn all_states_iter(&self) -> impl Iterator + '_ { + (0..self.transitions.len()).filter(|s| !self.removed_states.contains(s)) + } + + /// Returns a vector containing the states of the automaton. + #[inline] + pub fn all_states_vec(&self) -> Vec { + self.all_states_iter().collect() + } + + /// Returns an iterator over all states directly reachable from the given state in one transition. + #[inline] + pub fn direct_states_iter(&self, state: &State) -> impl Iterator + '_ { + self.transitions[*state] + .keys() + .cloned() + .filter(|s| !self.removed_states.contains(s)) + } + + /// Returns a vector containing all states directly reachable from the given state in one transition. + #[inline] + pub fn direct_states_vec(&self, state: &State) -> Vec { + self.direct_states_iter(state).collect() + } + + /// Returns a vector containing the transitions to the provided state. + pub fn transitions_to_vec(&self, state: State) -> Vec { let mut in_transitions = vec![]; for from_state in self.transitions_in.get(&state).unwrap_or(&IntSet::new()) { - for (to_state, condition) in self.transitions_from_state_enumerate_vec(from_state) { + for (condition, to_state) in self.transitions_from_vec(*from_state) { if to_state == state { in_transitions.push((*from_state, condition)); + break; } } } in_transitions } - pub fn in_states(&self, state: State) -> IntSet { - self.transitions_in - .get(&state) - .unwrap_or(&IntSet::new()) - .clone() - } - + /// Returns a vector containing the transitions from the provided state.
#[inline] - pub fn transitions_iter(&self) -> impl Iterator + '_ { - (0..self.transitions.len()).filter(|s| !self.removed_states.contains(s)) - } - - #[inline] - pub fn transitions_vec(&self) -> Vec { - self.transitions_iter().collect() + pub fn transitions_from_vec(&self, state: State) -> Vec { + self.transitions[state] + .iter() + .map(|(s, c)| (c.clone(), *s)) + .filter(|s| !self.removed_states.contains(&s.1)) + .collect() } + /// Returns an iterator containing the transitions from the provided state. #[inline] - pub fn transitions_from_state_enumerate_iter( + pub fn transitions_from_iter( &self, - from_state: &State, - ) -> impl Iterator { - self.transitions[*from_state] + state: State, + ) -> impl Iterator { + self.transitions[state] .iter() - .filter(|s| !self.removed_states.contains(s.0)) + .map(|(s, c)| (c, s)) + .filter(|s| !self.removed_states.contains(s.1)) } + /// Returns a mutable iterator containing the transitions from the provided state. #[inline] - pub fn transitions_from_state_enumerate_iter_mut( + pub fn transitions_from_iter_mut( &mut self, - from_state: &State, - ) -> impl Iterator { - self.transitions[*from_state] + state: &State, + ) -> impl Iterator { + self.transitions[*state] .iter_mut() - .filter(|s| !self.removed_states.contains(s.0)) + .map(|(s, c)| (c, s)) + .filter(|s| !self.removed_states.contains(s.1)) } + /// Returns an owned iterator containing the transitions from the provided state. #[inline] - pub fn transitions_from_state_enumerate_vec( + pub fn transitions_from_into_iter( &self, - from_state: &State, - ) -> Vec<(State, Condition)> { - self.transitions[*from_state] - .iter() - .map(|(s, c)| (*s, c.clone())) - .filter(|s| !self.removed_states.contains(&s.0)) - .collect() + state: &State, + ) -> impl Iterator + '_ { + self.transitions[*state] + .clone() + .into_iter() + .map(|(s, c)| (c, s)) + .filter(|(_, state)| !self.removed_states.contains(state)) } + /// Returns `true` if there is a directed transition between the two provided states. #[inline] pub fn does_transition_exists(&self, from_state: State, to_state: State) -> bool { if !self.has_state(from_state) || !self.has_state(to_state) { @@ -172,91 +209,65 @@ impl FastAutomaton { .collect() } - #[inline] - pub fn transitions_from_state_enumerate_into_iter( - &self, - from_state: &State, - ) -> impl Iterator + '_ { - self.transitions - .get(*from_state) // Assume transitions is a map; adjust accordingly. - .into_iter() // Creates an iterator over Option<&V> - .flat_map(|transitions| transitions.iter()) // Flattens into Iterator - .filter(move |(state, _)| !self.removed_states.contains(state)) // Filters out removed states - .map(|(state, condition)| (*state, condition.clone())) // Creates owned data; adjust if cloning is expensive - } - - #[inline] - pub fn transitions_from_state_iter( - &self, - from_state: &State, - ) -> impl Iterator + '_ { - self.transitions[*from_state] - .keys() - .cloned() - .filter(|s| !self.removed_states.contains(s)) - } - - #[inline] - pub fn transitions_from_state(&self, from_state: &State) -> Vec { - self.transitions_from_state_iter(from_state).collect() - } - - #[inline] - pub fn transitions_from_state_into_iter<'a>( - &'a self, - from_state: &State, - ) -> impl Iterator + 'a { - self.transitions[*from_state] - .clone() - .into_iter() - .filter(|s| !self.removed_states.contains(&s.0)) - } - + // Returns the number of states in the automaton. 
#[inline] pub fn get_number_of_states(&self) -> usize { self.transitions.len() - self.removed_states.len() } + /// Get a reference to the directed transition's condition between the two provided states. #[inline] - pub fn get_condition(&self, from_state: &State, to_state: &State) -> Option<&Condition> { - self.transitions[*from_state].get(to_state) + pub fn get_condition(&self, from_state: State, to_state: State) -> Option<&Condition> { + self.transitions[from_state].get(&to_state) } + /// Get a mutable reference to the directed transition's condition between the two provided states. #[inline] - pub fn get_start_state(&self) -> State { - self.start_state + pub fn get_condition_mut( + &mut self, + from_state: State, + to_state: State, + ) -> Option<&mut Condition> { + self.transitions[from_state].get_mut(&to_state) } + /// Returns the start state of the automaton. #[inline] - pub fn get_removed_states(&self) -> &IntSet { - &self.removed_states + pub fn get_start_state(&self) -> State { + self.start_state } + /// Get a reference to the set of accept (final) states of the automaton. #[inline] pub fn get_accept_states(&self) -> &IntSet { &self.accept_states } + /// Returns a reference to the automaton's spanning set. #[inline] pub fn get_spanning_set(&self) -> &SpanningSet { &self.spanning_set } + /// Returns `true` if the given `state` is one of the automaton's accept states. #[inline] pub fn is_accepted(&self, state: &State) -> bool { self.accept_states.contains(state) } + /// Returns `true` if the automaton is deterministic. #[inline] pub fn is_determinitic(&self) -> bool { self.deterministic } + /// Returns `true` if the automaton contains at least one cycle. #[inline] pub fn is_cyclic(&self) -> bool { self.cyclic } + /// Returns `true` if the automaton has the provided state.
#[inline] pub fn has_state(&self, state: State) -> bool { !(state >= self.transitions.len() || self.removed_states.contains(&state)) @@ -274,7 +285,7 @@ impl FastAutomaton { continue; } let curr_char = input.chars().nth(position).unwrap() as u32; - for (to_state, cond) in self.transitions_from_state_enumerate_iter(current_state) { + for (cond, to_state) in self.transitions_from_iter(*current_state) { if cond.has_character(&curr_char, &self.spanning_set).unwrap() { if position + 1 == input.len() { if self.accept_states.contains(to_state) { diff --git a/src/fast_automaton/operation/concatenate.rs b/src/fast_automaton/operation/concat.rs similarity index 96% rename from src/fast_automaton/operation/concatenate.rs rename to src/fast_automaton/operation/concat.rs index b22ad2d..71d97b0 100644 --- a/src/fast_automaton/operation/concatenate.rs +++ b/src/fast_automaton/operation/concat.rs @@ -11,9 +11,7 @@ impl FastAutomaton { Self::concat_all([self, other]) } - pub fn concat_all<'a, I>(automatons: I) -> Result - where - I: IntoIterator, + pub fn concat_all<'a, I: IntoIterator>(automatons: I) -> Result { let mut new_automaton = FastAutomaton::new_empty_string(); for automaton in automatons { @@ -41,12 +39,12 @@ impl FastAutomaton { BuildHasherDefault::default(), ); - let start_state_and_accept_states_not_mergeable = other.in_degree(other.start_state) > 0 + let start_state_and_accept_states_not_mergeable = other.state_in_degree(other.start_state) > 0 && self .accept_states .iter() .cloned() - .any(|s| self.out_degree(s) > 0); + .any(|s| self.state_out_degree(s) > 0); let accept_states = self.accept_states.iter().cloned().collect::>(); @@ -67,7 +65,7 @@ impl FastAutomaton { } } - for from_state in other.transitions_iter() { + for from_state in other.all_states_iter() { let new_from_states = match new_states.entry(from_state) { Entry::Occupied(o) => { vec![*o.get()] @@ -86,7 +84,7 @@ impl FastAutomaton { } }; - for (to_state, condition) in other.transitions_from_state_enumerate_iter(&from_state) { + for (condition, to_state) in other.transitions_from_iter(from_state) { let new_to_states = match new_states.entry(*to_state) { Entry::Occupied(o) => { vec![*o.get()] @@ -107,7 +105,7 @@ impl FastAutomaton { let projected_condition = condition_converter.convert(condition)?; for new_from_state in new_from_states.iter() { for new_to_state in new_to_states.iter() { - self.add_transition_to( + self.add_transition( *new_from_state, *new_to_state, &projected_condition, @@ -120,7 +118,7 @@ impl FastAutomaton { if start_state_and_accept_states_not_mergeable { if let Some(&other_start_state) = new_states.get(&other.start_state) { for accept_state in &accept_states { - self.add_epsilon(*accept_state, other_start_state); + self.add_epsilon_transition(*accept_state, other_start_state); } } } diff --git a/src/fast_automaton/operation/determinize.rs b/src/fast_automaton/operation/determinize.rs index 1cf7a88..b0efb67 100644 --- a/src/fast_automaton/operation/determinize.rs +++ b/src/fast_automaton/operation/determinize.rs @@ -39,7 +39,7 @@ impl FastAutomaton { for base in &ranges { for from_state in &states { - for (to_state, cond) in self.transitions_from_state_enumerate_iter(from_state) { + for (cond, to_state) in self.transitions_from_iter(*from_state) { if cond.has_intersection(base) { match new_states_to_add.binary_search(to_state) { Ok(_) => {} // element already in vector @ `pos` @@ -60,7 +60,7 @@ impl FastAutomaton { } }; - new_automaton.add_transition_to(r, q, base); + new_automaton.add_transition(r, q, base); 
} new_states_to_add.clear(); } diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 8f3e4a3..5dac078 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -16,9 +16,7 @@ impl FastAutomaton { FastAutomaton::intersection_all([self, other]) } - pub fn intersection_all<'a, I>(automatons: I) -> Result - where - I: IntoIterator, + pub fn intersection_all<'a, I: IntoIterator>(automatons: I) -> Result { let mut result: Cow<'a, FastAutomaton> = Cow::Owned(FastAutomaton::new_total()); @@ -33,9 +31,7 @@ impl FastAutomaton { Ok(result.into_owned()) } - pub fn intersection_all_par<'a, I>(automatons: I) -> Result - where - I: IntoParallelIterator, + pub fn intersection_all_par<'a, I: IntoParallelIterator>(automatons: I) -> Result { let execution_profile = ExecutionProfile::get(); @@ -102,8 +98,8 @@ impl FastAutomaton { let transitions_2 = other.get_projected_transitions(p.2, &condition_converter_other_to_new)?; - for (n1, condition_1) in transitions_1 { - for (n2, condition_2) in &transitions_2 { + for (condition_1, n1) in transitions_1 { + for (condition_2, n2) in &transitions_2 { let intersection = condition_1.intersection(condition_2); if intersection.is_empty() { continue; @@ -118,7 +114,7 @@ impl FastAutomaton { new_r } }; - new_automaton.add_transition_to(p.0, r.0, &intersection); + new_automaton.add_transition(p.0, r.0, &intersection); } } } @@ -168,8 +164,8 @@ impl FastAutomaton { let transitions_2 = other.get_projected_transitions(p.2, &condition_converter_other_to_new)?; - for (n1, condition_1) in transitions_1 { - for (n2, condition_2) in &transitions_2 { + for (condition_1, n1) in transitions_1 { + for (condition_2, n2) in &transitions_2 { let intersection = condition_1.intersection(condition_2); if intersection.is_empty() { continue; @@ -184,7 +180,7 @@ impl FastAutomaton { new_r } }; - new_automaton.add_transition_to(p.0, r.0, &intersection); + new_automaton.add_transition(p.0, r.0, &intersection); } } } @@ -195,11 +191,11 @@ impl FastAutomaton { &self, state: State, condition_converter: &ConditionConverter, - ) -> Result, EngineError> { + ) -> Result, EngineError> { let transitions_1: Result, EngineError> = self - .transitions_from_state_enumerate_iter(&state) - .map(|(&s, c)| match condition_converter.convert(c) { - Ok(condition) => Ok((s, condition)), + .transitions_from_iter(state) + .map(|(c, &s)| match condition_converter.convert(c) { + Ok(condition) => Ok((condition, s)), Err(err) => Err(err), }) .collect(); diff --git a/src/fast_automaton/operation/mod.rs b/src/fast_automaton/operation/mod.rs index bf0523e..a574a0e 100644 --- a/src/fast_automaton/operation/mod.rs +++ b/src/fast_automaton/operation/mod.rs @@ -4,20 +4,20 @@ use ahash::AHasher; use super::*; -mod alternation; -mod concatenate; +mod union; +mod concat; mod determinize; mod intersection; mod subtraction; mod repeat; impl FastAutomaton { - pub fn remove_dead_transitions(&mut self) { + pub(crate) fn remove_dead_transitions(&mut self) { if !self.is_empty() { let reacheable_states = self.get_reacheable_states(); let mut dead_states = IntSet::default(); - for from_state in self.transitions_iter() { + for from_state in self.all_states_iter() { if !reacheable_states.contains(&from_state) { dead_states.insert(from_state); } diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs index f451678..2fd14db 100644 --- a/src/fast_automaton/operation/repeat.rs +++ 
b/src/fast_automaton/operation/repeat.rs @@ -20,20 +20,20 @@ impl FastAutomaton { let automaton_to_repeat = self.clone(); - if min == 0 && self.in_degree(self.start_state) != 0 { + if min == 0 && self.state_in_degree(self.start_state) != 0 { let new_state = self.new_state(); if self.is_accepted(&self.start_state) { self.accept(new_state); } - for to_state in self.transitions_from_state(&self.start_state) { - self.add_epsilon(new_state, to_state); + for to_state in self.direct_states_vec(&self.start_state) { + self.add_epsilon_transition(new_state, to_state); } self.start_state = new_state; if max_opt.is_none() { for accept_state in self.accept_states.clone() { - self.add_epsilon(accept_state, self.start_state); + self.add_epsilon_transition(accept_state, self.start_state); } self.accept(self.start_state); return Ok(()); @@ -59,10 +59,10 @@ impl FastAutomaton { let accept_state = *automaton_to_repeat.accept_states.iter().next().unwrap(); if automaton_to_repeat.accept_states.len() == 1 - && automaton_to_repeat.out_degree(accept_state) == 0 - && automaton_to_repeat.in_degree(automaton_to_repeat.start_state) == 0 + && automaton_to_repeat.state_out_degree(accept_state) == 0 + && automaton_to_repeat.state_in_degree(automaton_to_repeat.start_state) == 0 { - automaton_to_repeat.add_epsilon(accept_state, automaton_to_repeat.start_state); + automaton_to_repeat.add_epsilon_transition(accept_state, automaton_to_repeat.start_state); let old_start_state = automaton_to_repeat.start_state; automaton_to_repeat.start_state = accept_state; automaton_to_repeat.remove_state(old_start_state); @@ -76,7 +76,7 @@ impl FastAutomaton { for state in automaton_to_repeat.accept_states.clone() { for &(to_state, condition) in &transitions { - automaton_to_repeat.add_transition_to(state, *to_state, condition); + automaton_to_repeat.add_transition(state, *to_state, condition); } } diff --git a/src/fast_automaton/operation/subtraction.rs b/src/fast_automaton/operation/subtraction.rs index d513fbb..8d45ae7 100644 --- a/src/fast_automaton/operation/subtraction.rs +++ b/src/fast_automaton/operation/subtraction.rs @@ -17,9 +17,9 @@ impl FastAutomaton { ); let mut ranges = Vec::with_capacity(self.get_number_of_states()); - for from_state in self.transitions_iter() { + for from_state in self.all_states_iter() { let mut new_condition = Condition::empty(&self.spanning_set); - for (_, condition) in self.transitions_from_state_enumerate_iter(&from_state) { + for (condition, _) in self.transitions_from_iter(from_state) { new_condition = new_condition.union(condition); ranges.push(condition.to_range(self.get_spanning_set())?); } @@ -30,14 +30,14 @@ impl FastAutomaton { } for (from_state, condition) in &transitions_to_crash_state { - self.add_transition_to(*from_state, crash_state, condition); + self.add_transition(*from_state, crash_state, condition); ranges.push(condition.to_range(self.get_spanning_set())?); } let new_spanning_set = SpanningSet::compute_spanning_set(&ranges); self.apply_new_spanning_set(&new_spanning_set)?; - if self.in_degree(crash_state) == 1 { + if self.state_in_degree(crash_state) == 1 { self.remove_state(crash_state); } Ok(()) @@ -47,7 +47,7 @@ impl FastAutomaton { self.totalize()?; let mut new_accept_states = IntSet::default(); - for state in self.transitions_iter() { + for state in self.all_states_iter() { if self.accept_states.contains(&state) { continue; } diff --git a/src/fast_automaton/operation/alternation.rs b/src/fast_automaton/operation/union.rs similarity index 87% rename from 
src/fast_automaton/operation/alternation.rs rename to src/fast_automaton/operation/union.rs index fe1ab80..ea9ad44 100644 --- a/src/fast_automaton/operation/alternation.rs +++ b/src/fast_automaton/operation/union.rs @@ -12,9 +12,7 @@ impl FastAutomaton { Self::union_all([self, other]) } - pub fn union_all<'a, I>(automatons: I) -> Result - where - I: IntoIterator, + pub fn union_all<'a, I: IntoIterator>(automatons: I) -> Result { let mut new_automaton = FastAutomaton::new_empty(); for automaton in automatons { @@ -23,9 +21,7 @@ impl FastAutomaton { Ok(new_automaton) } - pub fn union_all_par<'a, I>(automatons: I) -> Result - where - I: IntoParallelIterator, + pub fn union_all_par<'a, I: IntoParallelIterator>(automatons: I) -> Result { let execution_profile = ExecutionProfile::get(); @@ -57,9 +53,9 @@ impl FastAutomaton { new_states: &mut IntMap, condition_converter: &ConditionConverter, ) -> Result, EngineError> { - let mut imcomplete_states = IntSet::with_capacity(other.out_degree(other.start_state) + 1); - let self_start_state_in_degree = self.in_degree(self.start_state); - let other_start_state_in_degree = other.in_degree(other.start_state); + let mut imcomplete_states = IntSet::with_capacity(other.state_out_degree(other.start_state) + 1); + let self_start_state_in_degree = self.state_in_degree(self.start_state); + let other_start_state_in_degree = other.state_in_degree(other.start_state); if self_start_state_in_degree == 0 && other_start_state_in_degree == 0 { // The start states can be the same state without any consequence new_states.insert(other.start_state, self.start_state); @@ -71,9 +67,9 @@ impl FastAutomaton { self.accept(new_state); } - for (to_state, cond) in self.transitions_from_state_enumerate_vec(&self.start_state) + for (cond, to_state) in self.transitions_from_vec(self.start_state) { - self.add_transition_to(new_state, to_state, &cond); + self.add_transition(new_state, to_state, &cond); } self.start_state = new_state; } @@ -87,8 +83,8 @@ impl FastAutomaton { new_states.insert(other.start_state, new_state); imcomplete_states.insert(new_state); - for (other_to_state, cond) in - other.transitions_from_state_enumerate_vec(&other.start_state) + for (cond, other_to_state) in + other.transitions_from_vec(other.start_state) { let cond = condition_converter.convert(&cond)?; let to_state = match new_states.entry(other_to_state) { @@ -100,7 +96,7 @@ impl FastAutomaton { new_state } }; - self.add_transition_to(self.start_state, to_state, &cond); + self.add_transition(self.start_state, to_state, &cond); } } } @@ -115,7 +111,7 @@ impl FastAutomaton { ) { let mut self_accept_states_without_outgoing_edges = vec![]; for &state in &self.accept_states { - if self.out_degree(state) == 0 && !imcomplete_states.contains(&state) { + if self.state_out_degree(state) == 0 && !imcomplete_states.contains(&state) { self_accept_states_without_outgoing_edges.push(state); } } @@ -127,8 +123,8 @@ impl FastAutomaton { self.accept(new_state); for &accept_state in &self_accept_states_without_outgoing_edges { - for (from_state, condition) in self.in_transitions(accept_state) { - self.add_transition_to(from_state, new_state, &condition); + for (from_state, condition) in self.transitions_to_vec(accept_state) { + self.add_transition(from_state, new_state, &condition); } self.remove_state(accept_state); } @@ -142,7 +138,7 @@ impl FastAutomaton { }; for &state in &other.accept_states { - if other.out_degree(state) == 0 { + if other.state_out_degree(state) == 0 { new_states .entry(state) 
.or_insert(accept_state_without_outgoing_edges); @@ -182,7 +178,7 @@ impl FastAutomaton { self.prepare_start_states(other, &mut new_states, &condition_converter)?; self.prepare_accept_states(other, &mut new_states, &imcomplete_states); - for from_state in other.transitions_iter() { + for from_state in other.all_states_iter() { let new_from_state = match new_states.entry(from_state) { Entry::Occupied(o) => *o.get(), Entry::Vacant(v) => { @@ -191,7 +187,7 @@ impl FastAutomaton { new_state } }; - for (to_state, condition) in other.transitions_from_state_enumerate_iter(&from_state) { + for (condition, to_state) in other.transitions_from_iter(from_state) { let new_condition = condition_converter.convert(condition)?; let new_to_state = match new_states.entry(*to_state) { Entry::Occupied(o) => *o.get(), @@ -201,7 +197,7 @@ impl FastAutomaton { new_state } }; - self.add_transition_to(new_from_state, new_to_state, &new_condition); + self.add_transition(new_from_state, new_to_state, &new_condition); } } self.cyclic = self.cyclic || other.cyclic; diff --git a/src/fast_automaton/spanning_set/mod.rs b/src/fast_automaton/spanning_set/mod.rs index 2aa2780..2e998b8 100644 --- a/src/fast_automaton/spanning_set/mod.rs +++ b/src/fast_automaton/spanning_set/mod.rs @@ -1,22 +1,24 @@ use std::slice::Iter; use ahash::AHashSet; -use regex_charclass::{char::Char, irange::RangeSet}; + #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; -/// Contains a set of [`RangeSet`] that span all the transition of a [`crate::FastAutomaton`]. +use crate::CharRange; + +/// Contains a set of [`CharRange`] that span all the transition of a [`crate::FastAutomaton`]. #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Clone, Debug, PartialEq, Eq)] -pub struct SpanningSet(Vec>, RangeSet); +pub struct SpanningSet(Vec, CharRange); impl SpanningSet { pub fn new_empty() -> Self { - SpanningSet(vec![], RangeSet::total()) + SpanningSet(vec![], CharRange::total()) } pub fn new_total() -> Self { - SpanningSet(vec![RangeSet::total()], RangeSet::empty()) + SpanningSet(vec![CharRange::total()], CharRange::empty()) } pub fn is_empty(&self) -> bool { @@ -35,7 +37,7 @@ impl SpanningSet { } } - pub(crate) fn get_spanning_ranges_with_rest(&self) -> Vec> { + pub(crate) fn get_spanning_ranges_with_rest(&self) -> Vec { if self.1.is_empty() { self.0.clone() } else { @@ -45,7 +47,7 @@ impl SpanningSet { } } - pub fn get_spanning_ranges(&self) -> Iter> { + pub fn get_spanning_ranges(&self) -> Iter { self.0.iter() } @@ -53,11 +55,11 @@ impl SpanningSet { self.0.len() } - pub fn get_spanning_range(&self, i: usize) -> Option<&RangeSet> { + pub fn get_spanning_range(&self, i: usize) -> Option<&CharRange> { self.0.get(i) } - pub fn get_rest(&self) -> &RangeSet { + pub fn get_rest(&self) -> &CharRange { &self.1 } @@ -69,8 +71,8 @@ impl SpanningSet { Self::compute_spanning_set(&ranges) } - pub fn compute_spanning_set(ranges: &[RangeSet]) -> Self { - let mut spanning_ranges: Vec> = ranges.to_vec(); + pub fn compute_spanning_set(ranges: &[CharRange]) -> Self { + let mut spanning_ranges: Vec = ranges.to_vec(); spanning_ranges.sort_unstable(); spanning_ranges.dedup(); @@ -105,7 +107,7 @@ impl SpanningSet { spanning_ranges.sort_unstable(); - let mut total = RangeSet::empty(); + let mut total = CharRange::empty(); for base in &spanning_ranges { total = total.union(base); } diff --git a/src/lib.rs b/src/lib.rs index 5bd5ea0..3de5f60 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,9 +24,9 @@ pub mod fast_automaton; pub mod regex; pub mod 
tokenizer; -type IntMap = HashMap>>; -type IntSet = HashSet>>; -type Range = RangeSet; +pub type IntMap = HashMap>>; +pub type IntSet = HashSet>>; +pub type CharRange = RangeSet; /// Represents a term that can be either a regular expression or a finite automaton. This term can be manipulated with a wide range of operations. /// @@ -97,9 +97,9 @@ impl Term { /// ``` /// use regexsolver::Term; /// - /// let term1 = Term::from_regex("abc").unwrap(); - /// let term2 = Term::from_regex("d.").unwrap(); - /// let term3 = Term::from_regex(".*").unwrap(); + /// let term1 = Term::from_pattern("abc").unwrap(); + /// let term2 = Term::from_pattern("d.").unwrap(); + /// let term3 = Term::from_pattern(".*").unwrap(); /// /// let concat = term1.concat(&[term2, term3]).unwrap(); /// @@ -122,7 +122,7 @@ impl Term { } for term in terms { if has_automaton { - return_automaton = return_automaton.concat(term.get_automaton()?.as_ref())?; + return_automaton = return_automaton.concat(term.to_automaton()?.as_ref())?; } else { match term { Term::RegularExpression(regular_expression) => { @@ -138,8 +138,6 @@ impl Term { if !has_automaton { Ok(Term::RegularExpression(return_regex)) - } else if let Some(return_regex) = return_automaton.to_regex() { - Ok(Term::RegularExpression(return_regex)) } else { Ok(Term::Automaton(return_automaton)) } @@ -153,9 +151,9 @@ impl Term { /// ``` /// use regexsolver::Term; /// - /// let term1 = Term::from_regex("abc").unwrap(); - /// let term2 = Term::from_regex("de").unwrap(); - /// let term3 = Term::from_regex("fghi").unwrap(); + /// let term1 = Term::from_pattern("abc").unwrap(); + /// let term2 = Term::from_pattern("de").unwrap(); + /// let term3 = Term::from_pattern("fghi").unwrap(); /// /// let union = term1.union(&[term2, term3]).unwrap(); /// @@ -194,13 +192,11 @@ impl Term { FastAutomaton::union_all(automaton_list) }?; - if let Some(return_regex) = return_automaton.to_regex() { - Ok(Term::RegularExpression(return_regex)) - } else { - Ok(Term::Automaton(return_automaton)) - } + Ok(Term::Automaton(return_automaton)) } else { - let regexes_list = self.get_regexes(terms)?; + let regexes_list = self + .get_regexes(terms) + .expect("No automaton should be here so this operation is not supposed to fail."); let regexes_list = regexes_list.iter().map(AsRef::as_ref).collect::>(); @@ -218,9 +214,9 @@ impl Term { /// ``` /// use regexsolver::Term; /// - /// let term1 = Term::from_regex("(abc|de){2}").unwrap(); - /// let term2 = Term::from_regex("de.*").unwrap(); - /// let term3 = Term::from_regex(".*abc").unwrap(); + /// let term1 = Term::from_pattern("(abc|de){2}").unwrap(); + /// let term2 = Term::from_pattern("de.*").unwrap(); + /// let term3 = Term::from_pattern(".*abc").unwrap(); /// /// let intersection = term1.intersection(&[term2, term3]).unwrap(); /// @@ -245,11 +241,7 @@ impl Term { FastAutomaton::intersection_all(automaton_list) }?; - if let Some(return_regex) = return_automaton.to_regex() { - Ok(Term::RegularExpression(return_regex)) - } else { - Ok(Term::Automaton(return_automaton)) - } + Ok(Term::Automaton(return_automaton)) } /// Compute the subtraction of the current term and the given `subtrahend`. 
@@ -260,8 +252,8 @@ impl Term { /// ``` /// use regexsolver::Term; /// - /// let term1 = Term::from_regex("(abc|de)").unwrap(); - /// let term2 = Term::from_regex("de").unwrap(); + /// let term1 = Term::from_pattern("(abc|de)").unwrap(); + /// let term2 = Term::from_pattern("de").unwrap(); /// /// let subtraction = term1.subtraction(&term2).unwrap(); /// @@ -270,17 +262,13 @@ impl Term { /// } /// ``` pub fn subtraction(&self, subtrahend: &Term) -> Result { - let minuend_automaton = self.get_automaton()?; - let subtrahend_automaton = subtrahend.get_automaton()?; + let minuend_automaton = self.to_automaton()?; + let subtrahend_automaton = subtrahend.to_automaton()?; let subtrahend_automaton = Self::determinize_subtrahend(&minuend_automaton, &subtrahend_automaton)?; let return_automaton = minuend_automaton.subtraction(&subtrahend_automaton)?; - if let Some(return_regex) = return_automaton.to_regex() { - Ok(Term::RegularExpression(return_regex)) - } else { - Ok(Term::Automaton(return_automaton)) - } + Ok(Term::Automaton(return_automaton)) } /// See [`Self::subtraction`]. @@ -297,7 +285,7 @@ impl Term { /// ``` /// use regexsolver::Term; /// - /// let term = Term::from_regex("abc").unwrap(); + /// let term = Term::from_pattern("abc").unwrap(); /// /// let repeat = term.repeat(1, None).unwrap(); /// @@ -318,11 +306,7 @@ impl Term { )), Term::Automaton(fast_automaton) => { let repeat_automaton = fast_automaton.repeat(min, max_opt)?; - Ok(if let Some(repeat_regex) = repeat_automaton.to_regex() { - Term::RegularExpression(repeat_regex) - } else { - Term::Automaton(repeat_automaton) - }) + Ok(Term::Automaton(repeat_automaton)) } } } @@ -334,7 +318,7 @@ impl Term { /// ``` /// use regexsolver::Term; /// - /// let term = Term::from_regex("(abc|de){2}").unwrap(); + /// let term = Term::from_pattern("(abc|de){2}").unwrap(); /// /// let strings = term.generate_strings(3).unwrap(); /// @@ -342,7 +326,7 @@ impl Term { /// ``` pub fn generate_strings(&self, count: usize) -> Result, EngineError> { Ok(self - .get_automaton()? + .to_automaton()? .generate_strings(count)? .into_iter() .collect()) @@ -356,8 +340,8 @@ impl Term { /// ``` /// use regexsolver::Term; /// - /// let term1 = Term::from_regex("(abc|de)").unwrap(); - /// let term2 = Term::from_regex("(abc|de)*").unwrap(); + /// let term1 = Term::from_pattern("(abc|de)").unwrap(); + /// let term2 = Term::from_pattern("(abc|de)*").unwrap(); /// /// assert!(!term1.are_equivalent(&term2).unwrap()); /// ``` @@ -366,8 +350,8 @@ impl Term { return Ok(true); } - let automaton_1 = self.get_automaton()?; - let automaton_2 = that.get_automaton()?; + let automaton_1 = self.to_automaton()?; + let automaton_2 = that.to_automaton()?; automaton_1.is_equivalent_of(&automaton_2) } @@ -379,8 +363,8 @@ impl Term { /// ``` /// use regexsolver::Term; /// - /// let term1 = Term::from_regex("de").unwrap(); - /// let term2 = Term::from_regex("(abc|de)").unwrap(); + /// let term1 = Term::from_pattern("de").unwrap(); + /// let term2 = Term::from_pattern("(abc|de)").unwrap(); /// /// assert!(term1.is_subset_of(&term2).unwrap()); /// ``` @@ -389,12 +373,11 @@ impl Term { return Ok(true); } - let automaton_1 = self.get_automaton()?; - let automaton_2 = that.get_automaton()?; + let automaton_1 = self.to_automaton()?; + let automaton_2 = that.to_automaton()?; automaton_1.is_subset_of(&automaton_2) } - /// Check if the current term matches the empty language. 
pub fn is_empty(&self) -> bool { match self { @@ -447,6 +430,20 @@ impl Term { } } + pub fn to_automaton(&self) -> Result, EngineError> { + Ok(match self { + Term::RegularExpression(regex) => Cow::Owned(regex.to_automaton()?), + Term::Automaton(automaton) => Cow::Borrowed(automaton), + }) + } + + pub fn to_regex(&self) -> Option> { + Some(match self { + Term::RegularExpression(regex) => Cow::Borrowed(regex), + Term::Automaton(automaton) => Cow::Owned(automaton.to_regex()?), + }) + } + fn determinize_subtrahend<'a>( minuend: &FastAutomaton, subtrahend: &'a FastAutomaton, @@ -466,18 +463,18 @@ impl Term { parallel: bool, ) -> Result>, EngineError> { let mut automaton_list = Vec::with_capacity(terms.len() + 1); - automaton_list.push(self.get_automaton()?); + automaton_list.push(self.to_automaton()?); let mut terms_automata = if parallel { let execution_profile = ExecutionProfile::get(); terms .par_iter() - .map(|a| execution_profile.apply(|| a.get_automaton())) + .map(|a| execution_profile.apply(|| a.to_automaton())) .collect::, _>>() } else { terms .iter() - .map(Term::get_automaton) + .map(Term::to_automaton) .collect::, _>>() }?; automaton_list.append(&mut terms_automata); @@ -485,40 +482,17 @@ impl Term { Ok(automaton_list) } - fn get_regexes<'a>( - &'a self, - terms: &'a [Term], - ) -> Result>, EngineError> { + fn get_regexes<'a>(&'a self, terms: &'a [Term]) -> Option>> { let mut regex_list = Vec::with_capacity(terms.len() + 1); - regex_list.push(self.get_regex()?); + regex_list.push(self.to_regex()?); let mut terms_regexes = terms .iter() - .map(Term::get_regex) - .collect::, _>>()?; + .map(Term::to_regex) + .collect::>>()?; regex_list.append(&mut terms_regexes); - Ok(regex_list) - } - - fn get_automaton(&self) -> Result, EngineError> { - Ok(match self { - Term::RegularExpression(regex) => Cow::Owned(regex.to_automaton()?), - Term::Automaton(automaton) => Cow::Borrowed(automaton), - }) - } - - fn get_regex(&self) -> Result, EngineError> { - Ok(match self { - Term::RegularExpression(regex) => Cow::Borrowed(regex), - Term::Automaton(automaton) => { - if let Some(regex) = automaton.to_regex() { - Cow::Owned(regex) - } else { - todo!() - } - } - }) + Some(regex_list) } } @@ -629,7 +603,10 @@ mod tests { assert_eq!(diff.to_string(), "a+"); // Repetition - let rep = Term::from_pattern("abc").unwrap().repeat(2, Some(4)).unwrap(); // (abc){2,4} + let rep = Term::from_pattern("abc") + .unwrap() + .repeat(2, Some(4)) + .unwrap(); // (abc){2,4} assert_eq!(rep.to_string(), "(abc){2,4}"); // Analyze diff --git a/src/regex/builder.rs b/src/regex/builder.rs index e8a354f..b1958a1 100644 --- a/src/regex/builder.rs +++ b/src/regex/builder.rs @@ -34,14 +34,14 @@ impl RegularExpression { pub fn new_total() -> Self { RegularExpression::Repetition( - Box::new(RegularExpression::Character(Range::total())), + Box::new(RegularExpression::Character(CharRange::total())), 0, None, ) } pub fn new_empty() -> Self { - RegularExpression::Character(Range::empty()) + RegularExpression::Character(CharRange::empty()) } pub fn new_empty_string() -> Self { @@ -56,7 +56,7 @@ impl RegularExpression { if let Ok(string) = String::from_utf8(literal.0.clone().into_vec()) { for char in string.chars() { regex_concat = regex_concat.concat( - &RegularExpression::Character(Range::new_from_range( + &RegularExpression::Character(CharRange::new_from_range( Char::new(char)..=Char::new(char), )), true, @@ -104,24 +104,24 @@ impl RegularExpression { } } - fn to_range_unicode(class_unicode: &ClassUnicode) -> Range { + fn 
to_range_unicode(class_unicode: &ClassUnicode) -> CharRange { let mut new_range = Vec::with_capacity(class_unicode.ranges().len()); for range in class_unicode.ranges() { new_range.push(AnyRange::from( Char::new(range.start())..=Char::new(range.end()), )); } - Range::new_from_ranges(&new_range) + CharRange::new_from_ranges(&new_range) } - fn to_range_bytes(class_bytes: &ClassBytes) -> Range { + fn to_range_bytes(class_bytes: &ClassBytes) -> CharRange { let mut new_range = Vec::with_capacity(class_bytes.ranges().len()); for range in class_bytes.ranges() { new_range.push(AnyRange::from( Char::new(range.start() as char)..=Char::new(range.end() as char), )); } - Range::new_from_ranges(&new_range) + CharRange::new_from_ranges(&new_range) } } diff --git a/src/regex/mod.rs b/src/regex/mod.rs index 848842f..26e4c7f 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -1,6 +1,6 @@ use std::{cmp, collections::VecDeque, fmt::Display}; -use crate::{Range, execution_profile::ExecutionProfile}; +use crate::execution_profile::ExecutionProfile; use regex_charclass::CharacterClass; use regex_syntax::hir::{Class, ClassBytes, ClassUnicode, Hir, HirKind}; @@ -17,7 +17,7 @@ mod serializer; /// Represent a regular expression. #[derive(Clone, PartialEq, Eq, Hash, Debug, PartialOrd, Ord)] pub enum RegularExpression { - Character(Range), + Character(CharRange), Repetition(Box, u32, Option), Concat(VecDeque), Alternation(Vec), @@ -125,7 +125,7 @@ impl RegularExpression { ExecutionProfile::get().assert_max_number_of_states(self.get_number_of_states_in_nfa())?; match self { - RegularExpression::Character(range) => FastAutomaton::make_from_range(range), + RegularExpression::Character(range) => FastAutomaton::new_from_range(range), RegularExpression::Repetition(regular_expression, min, max_opt) => { let mut automaton = regular_expression.to_automaton()?; automaton.repeat_mut(*min, *max_opt)?; diff --git a/src/regex/operation/mod.rs b/src/regex/operation/mod.rs index 382c885..ae7da22 100644 --- a/src/regex/operation/mod.rs +++ b/src/regex/operation/mod.rs @@ -1,15 +1,16 @@ use super::*; mod concat; +mod repeat; mod simplify; mod union; -mod repeat; #[cfg(test)] mod tests { - use regex_charclass::{char::Char, irange::RangeSet}; - use crate::regex::RegularExpression; + use regex_charclass::char::Char; + + use crate::{regex::RegularExpression, CharRange}; #[test] fn test_parse_and_simplify() -> Result<(), String> { @@ -37,8 +38,11 @@ mod tests { assert_parse_and_simplify("((ab))?(ab)(((ab)))((((ab)){3}))", "(ab){5,6}"); assert_parse_and_simplify("(cd|ab)*(ab|cd)*", "(ab|cd)*"); assert_parse_and_simplify(".*q(ab|ab|abc|ca)x", ".*q(abc?|ca)x"); - assert_parse_and_simplify("((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}", "(q|(a|ads|a{2}d)*abc.*def.*uif(x|ads|a{2}d)*abc.*oxs.*def(ads|ax|a{2}d)*abc.*def.*ksd){1,100}"); - + assert_parse_and_simplify( + "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}", + "(q|(a|ads|a{2}d)*abc.*def.*uif(x|ads|a{2}d)*abc.*oxs.*def(ads|ax|a{2}d)*abc.*def.*ksd){1,100}", + ); + assert_parse_and_simplify("(a{2,4}){2,4}", "a{4,16}"); Ok(()) } @@ -51,7 +55,7 @@ mod tests { #[test] fn test_repeat_simplify() -> Result<(), String> { assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), 2, Some(2), 3, @@ -59,7 +63,7 @@ mod tests { ); assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + 
&CharRange::new_from_range(Char::new('a')..=Char::new('a')), 2, Some(2), 2, @@ -67,7 +71,7 @@ mod tests { ); assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), 3, Some(3), 0, @@ -75,7 +79,7 @@ mod tests { ); assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), 0, Some(3), 1, @@ -83,7 +87,7 @@ mod tests { ); assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), 1, Some(2), 1, @@ -91,7 +95,7 @@ mod tests { ); assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), 2, Some(3), 1, @@ -99,7 +103,7 @@ mod tests { ); assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), 3, Some(4), 1, @@ -107,7 +111,7 @@ mod tests { ); assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), 7, Some(8), 1, @@ -115,7 +119,7 @@ mod tests { ); assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), 0, None, 3, @@ -123,7 +127,7 @@ mod tests { ); assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), 1, None, 0, @@ -131,7 +135,7 @@ mod tests { ); assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), 0, Some(1), 1, @@ -139,7 +143,7 @@ mod tests { ); assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), 2, Some(4), 2, @@ -150,7 +154,7 @@ mod tests { } fn assert_repeat_simplify( - range: &RangeSet, + range: &CharRange, min1: u32, max1: Option, min2: u32, diff --git a/src/tokenizer/embed_automaton.rs b/src/tokenizer/embed_automaton.rs index 79697dd..602cb93 100644 --- a/src/tokenizer/embed_automaton.rs +++ b/src/tokenizer/embed_automaton.rs @@ -1,6 +1,6 @@ use token::TokenError; -use crate::{error::EngineError, fast_automaton::condition::Condition}; +use crate::{error::EngineError, fast_automaton::condition::Condition, CharRange}; use self::token::range_token::RangeToken; @@ -32,10 +32,7 @@ impl Tokenizer<'_> { vec.push(AutomatonToken::AcceptState) } - for (to_state, condition) in self - .automaton - .transitions_from_state_enumerate_iter(¤t_state) - { + for (condition, to_state) in self.automaton.transitions_from_iter(current_state) { if condition.is_empty() { continue; } @@ -73,7 +70,7 @@ impl Tokenizer<'_> { let mut from_state = None; let mut to_state = None; - let mut range = Range::empty(); + let mut range = CharRange::empty(); for token in vec { match token { AutomatonToken::Range(r) => { @@ -86,7 +83,7 @@ impl Tokenizer<'_> { if let Some(fs) = from_state { if let Some(ts) = to_state { Self::apply_transition(&mut automaton, fs, ts, &range)?; - range = Range::empty(); + range = CharRange::empty(); } to_state = Some((*s).into()); } else { @@ -107,7 +104,7 @@ impl Tokenizer<'_> { } from_state = None; to_state = None; - range = Range::empty(); + range = CharRange::empty(); } _ => return 
Err(EngineError::TokenError(TokenError::UnknownToken)), }; @@ -122,10 +119,10 @@ impl Tokenizer<'_> { automaton: &mut FastAutomaton, from_state: State, to_state: State, - range: &Range, + range: &CharRange, ) -> Result<(), EngineError> { let condition = Condition::from_range(range, automaton.get_spanning_set())?; - automaton.add_transition_to(from_state, to_state, &condition); + automaton.add_transition(from_state, to_state, &condition); Ok(()) } } @@ -150,7 +147,9 @@ mod tests { assert_embedding_convertion_for_fair( "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q)", ); - assert_embedding_convertion_for_fair("(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])"); + assert_embedding_convertion_for_fair( + "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])", + ); Ok(()) } @@ -181,14 +180,18 @@ mod tests { let unembedded_automaton = tokenizer.from_embedding(&embedding).unwrap(); - assert!(automaton - .subtraction(&unembedded_automaton) - .unwrap() - .is_empty()); - assert!(unembedded_automaton - .subtraction(&automaton) - .unwrap() - .is_empty()); + assert!( + automaton + .subtraction(&unembedded_automaton) + .unwrap() + .is_empty() + ); + assert!( + unembedded_automaton + .subtraction(&automaton) + .unwrap() + .is_empty() + ); if !ignore_ai { // AI @@ -200,14 +203,18 @@ mod tests { let unembedded_automaton = tokenizer.from_embedding(&embedding).unwrap(); - assert!(automaton - .subtraction(&unembedded_automaton) - .unwrap() - .is_empty()); - assert!(unembedded_automaton - .subtraction(&automaton) - .unwrap() - .is_empty()); + assert!( + automaton + .subtraction(&unembedded_automaton) + .unwrap() + .is_empty() + ); + assert!( + unembedded_automaton + .subtraction(&automaton) + .unwrap() + .is_empty() + ); } } } diff --git a/src/tokenizer/embed_regex.rs b/src/tokenizer/embed_regex.rs index cb581e6..3e05757 100644 --- a/src/tokenizer/embed_regex.rs +++ b/src/tokenizer/embed_regex.rs @@ -1,6 +1,6 @@ use token::TokenError; -use crate::regex::RegularExpression; +use crate::{regex::RegularExpression, CharRange}; use self::token::regex_token::RegexToken; @@ -94,7 +94,7 @@ impl Tokenizer<'_> { vec: &[RegexToken], ) -> Result { let mut regex_groups = vec![(RegularExpression::new_empty_string(), false)]; - let mut current_range: Option = None; + let mut current_range: Option = None; let mut current_min = None; for i in 0..vec.len() { let token = vec[i]; diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 2e3e4ed..3273b0e 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -2,8 +2,6 @@ use std::{cmp::Ordering, collections::VecDeque, vec}; use ahash::HashMapExt; use 
crate::fast_automaton::spanning_set::SpanningSet; -use crate::Range; - use crate::{ fast_automaton::{FastAutomaton, State}, IntMap, IntSet, @@ -43,9 +41,9 @@ impl Tokenizer<'_> { state_counter += 1; automaton - .transitions_from_state_enumerate_iter(¤t_state) - .filter(|(_, c)| !c.is_empty()) - .for_each(|(to_state, _)| { + .transitions_from_iter(current_state) + .filter(|(c, _)| !c.is_empty()) + .for_each(|(_, to_state)| { if !seen.contains(to_state) { worklist.push_front(*to_state); } diff --git a/src/tokenizer/range_tokenizer.rs b/src/tokenizer/range_tokenizer.rs index 3950033..e3b3c9c 100644 --- a/src/tokenizer/range_tokenizer.rs +++ b/src/tokenizer/range_tokenizer.rs @@ -1,3 +1,5 @@ +use crate::CharRange; + use self::token::range_token::RangeToken; use super::*; @@ -5,7 +7,7 @@ use super::*; #[derive(Debug)] pub struct RangeTokenizer<'a> { spanning_set: &'a SpanningSet, - total: Range, + total: CharRange, } impl RangeTokenizer<'_> { @@ -21,7 +23,7 @@ impl RangeTokenizer<'_> { } } - pub fn range_to_embedding(&self, range: &Range) -> Option> { + pub fn range_to_embedding(&self, range: &CharRange) -> Option> { if range == &self.total { return Some(vec![RangeToken::Total]); } else if !range.difference(&self.total).is_empty() { @@ -39,12 +41,12 @@ impl RangeTokenizer<'_> { Some(vec) } - pub fn embedding_to_range(&self, vec: &[RangeToken]) -> Option { + pub fn embedding_to_range(&self, vec: &[RangeToken]) -> Option { if vec.is_empty() { - return Some(Range::empty()); + return Some(CharRange::empty()); } - let mut range = Range::empty(); + let mut range = CharRange::empty(); if vec[0] == RangeToken::Total { return Some(self.total.clone()); } @@ -60,7 +62,7 @@ impl RangeTokenizer<'_> { Some(range) } - pub fn token_to_range(&self, token: &RangeToken) -> Option<&Range> { + pub fn token_to_range(&self, token: &RangeToken) -> Option<&CharRange> { match token { RangeToken::Total => Some(&self.total), RangeToken::Base(b) => self.spanning_set.get_spanning_range(*b), From bcc9d7d699dd66518a4e0ff32ba073c33a5542c4 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sat, 2 Aug 2025 18:11:27 +0200 Subject: [PATCH 10/44] Update description --- README.md | 26 ++++++++++---------- src/fast_automaton/operation/concat.rs | 2 ++ src/fast_automaton/operation/determinize.rs | 1 + src/fast_automaton/operation/intersection.rs | 3 +++ src/fast_automaton/operation/repeat.rs | 1 + src/fast_automaton/operation/subtraction.rs | 2 ++ src/fast_automaton/operation/union.rs | 3 +++ 7 files changed, 25 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 1385f96..f009adc 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ - [Installation](#installation) - [Example](#example) - - [Key Concepts & Limitations](#key-concepts-limitations) + - [Key Concepts & Limitations](#key-concepts--limitations) - [API](#api) - [Term](#term) - [FastAutomaton](#fastautomaton) @@ -177,18 +177,18 @@ This design allows us to perform unions, intersections, and complements of trans #### Manipulate | Method | Return | Description | | -------- | ------- | ------- | -| `union(&self, other: &FastAutomaton)` | `Result` | | -| `union_all<'a, I: IntoIterator>(automatons: I)` | `Result` | | -| `union_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | | -| `concat(&self, other: &FastAutomaton)` | `Result` | | -| `concat_all<'a, I: IntoIterator>(automatons: I)` | `Result` | | -| `determinize(&self)` | `Result` | | -| `intersection(&self, other: &FastAutomaton)` | 
`Result` | | -| `intersection_all<'a, I: IntoIterator>(automatons: I)` | `Result` | | -| `intersection_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | | -| `complement(&mut self)` | `Result<(), EngineError>` | | -| `subtraction(&self, other: &FastAutomaton)` | `Result` | | -| `repeat(&self, min: u32, max_opt: Option)` | `Result` | | +| `union(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the union of `self` and `other`. | +| `union_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the union of all automatons in the given iterator. | +| `union_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the union of all automatons in the given parallel iterator. | +| `concat(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the concatenation of `self` and `other`. | +| `concat_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the concatenation of all automatons in the given iterator. | +| `determinize(&self)` | `Result` | Determinize the automaton and returns it as a new `FastAutomaton`. | +| `intersection(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the intersection of `self` and `other`. | +| `intersection_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the intersection of all automatons in the given iterator. | +| `intersection_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the union of all automatons in the given parallel iterator. | +| `complement(&mut self)` | `Result<(), EngineError>` | Complement the automaton, the automaton needs to be deterministic. | +| `subtraction(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the subtraction of `self` and `other`. | +| `repeat(&self, min: u32, max_opt: Option)` | `Result` | Returns the repetition of the automaton, between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. | #### Analyze | Method | Return | Description | diff --git a/src/fast_automaton/operation/concat.rs b/src/fast_automaton/operation/concat.rs index 71d97b0..6fa9d1b 100644 --- a/src/fast_automaton/operation/concat.rs +++ b/src/fast_automaton/operation/concat.rs @@ -7,10 +7,12 @@ use crate::error::EngineError; use super::*; impl FastAutomaton { + /// Returns a new `FastAutomaton` representing the concatenation of `self` and `other`. pub fn concat(&self, other: &FastAutomaton) -> Result { Self::concat_all([self, other]) } + /// Returns a new `FastAutomaton` that is the concatenation of all automatons in the given iterator. pub fn concat_all<'a, I: IntoIterator>(automatons: I) -> Result { let mut new_automaton = FastAutomaton::new_empty_string(); diff --git a/src/fast_automaton/operation/determinize.rs b/src/fast_automaton/operation/determinize.rs index b0efb67..55d8c46 100644 --- a/src/fast_automaton/operation/determinize.rs +++ b/src/fast_automaton/operation/determinize.rs @@ -5,6 +5,7 @@ use crate::{EngineError, execution_profile::ExecutionProfile}; use super::*; impl FastAutomaton { + /// Determinize the automaton and returns it as a new `FastAutomaton`.
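+ ///
+ /// Illustrative sketch (imports elided, `unwrap` used for brevity; `new_total` is shown elsewhere in this patch series):
+ /// ```ignore
+ /// let automaton = FastAutomaton::new_total();
+ /// // `determinize` leaves `automaton` untouched and returns a deterministic copy.
+ /// let deterministic = automaton.determinize().unwrap();
+ /// ```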
pub fn determinize(&self) -> Result { if self.deterministic { return Ok(self.clone()); diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 5dac078..29e5d49 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -12,10 +12,12 @@ use crate::{ use super::*; impl FastAutomaton { + /// Returns a new `FastAutomaton` representing the intersection of `self` and `other`. pub fn intersection(&self, other: &FastAutomaton) -> Result { FastAutomaton::intersection_all([self, other]) } + /// Returns a new `FastAutomaton` that is the intersection of all automatons in the given iterator. pub fn intersection_all<'a, I: IntoIterator>(automatons: I) -> Result { let mut result: Cow<'a, FastAutomaton> = Cow::Owned(FastAutomaton::new_total()); @@ -31,6 +33,7 @@ impl FastAutomaton { Ok(result.into_owned()) } + /// Returns a new `FastAutomaton` that is the union of all automatons in the given parallel iterator. pub fn intersection_all_par<'a, I: IntoParallelIterator>(automatons: I) -> Result { let execution_profile = ExecutionProfile::get(); diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs index 2fd14db..cc4fb76 100644 --- a/src/fast_automaton/operation/repeat.rs +++ b/src/fast_automaton/operation/repeat.rs @@ -1,6 +1,7 @@ use super::*; impl FastAutomaton { + // Returns the repetition of the automaton, between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. pub fn repeat(&self, min: u32, max_opt: Option) -> Result { let mut automaton = self.clone(); if let Err(error) = automaton.repeat_mut(min, max_opt) { diff --git a/src/fast_automaton/operation/subtraction.rs b/src/fast_automaton/operation/subtraction.rs index 8d45ae7..e4406e8 100644 --- a/src/fast_automaton/operation/subtraction.rs +++ b/src/fast_automaton/operation/subtraction.rs @@ -43,6 +43,7 @@ impl FastAutomaton { Ok(()) } + /// Complement the automaton, the automaton needs to be deterministic. pub fn complement(&mut self) -> Result<(), EngineError> { self.totalize()?; @@ -58,6 +59,7 @@ impl FastAutomaton { Ok(()) } + /// Returns a new `FastAutomaton` representing the substraction of `self` and `other`. pub fn subtraction(&self, other: &FastAutomaton) -> Result { let mut complement = other.clone(); match complement.complement() { diff --git a/src/fast_automaton/operation/union.rs b/src/fast_automaton/operation/union.rs index ea9ad44..d83e7de 100644 --- a/src/fast_automaton/operation/union.rs +++ b/src/fast_automaton/operation/union.rs @@ -8,10 +8,12 @@ use crate::{error::EngineError, execution_profile::ExecutionProfile}; use super::*; impl FastAutomaton { + /// Returns a new `FastAutomaton` representing the union of `self` and `other`. pub fn union(&self, other: &FastAutomaton) -> Result { Self::union_all([self, other]) } + /// Returns a new `FastAutomaton` that is the union of all automatons in the given iterator. pub fn union_all<'a, I: IntoIterator>(automatons: I) -> Result { let mut new_automaton = FastAutomaton::new_empty(); @@ -21,6 +23,7 @@ impl FastAutomaton { Ok(new_automaton) } + /// Returns a new `FastAutomaton` that is the union of all automatons in the given parallel iterator. 
pub fn union_all_par<'a, I: IntoParallelIterator>(automatons: I) -> Result { let execution_profile = ExecutionProfile::get(); From 8d5b66e77e9430a1afc4d8b4c68abb401d755354 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sat, 2 Aug 2025 21:19:46 +0200 Subject: [PATCH 11/44] Update docs --- README.md | 39 ++++++++++++++++++-- src/fast_automaton/analyze/cardinality.rs | 1 + src/fast_automaton/analyze/equivalence.rs | 1 + src/fast_automaton/analyze/length.rs | 1 + src/fast_automaton/analyze/mod.rs | 3 +- src/fast_automaton/analyze/subset.rs | 1 + src/fast_automaton/operation/intersection.rs | 3 +- src/lib.rs | 8 ++-- src/regex/analyze/affixes.rs | 4 +- src/regex/analyze/mod.rs | 2 + src/regex/analyze/number_of_states.rs | 14 +++---- src/regex/builder.rs | 12 ++++-- src/regex/mod.rs | 4 ++ src/regex/operation/concat.rs | 1 + src/regex/operation/repeat.rs | 1 + src/regex/operation/simplify.rs | 1 + src/regex/operation/union.rs | 12 +++--- 17 files changed, 79 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index f009adc..a808caf 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ println!("Some matches: {:?}", samples); // Equivalence & subset let a = Term::from_pattern("a+").unwrap(); let b = Term::from_pattern("a*").unwrap(); -assert!(!a.are_equivalent(&b).unwrap()); +assert!(!a.is_equivalent_of(&b).unwrap()); assert!(a.is_subset_of(&b).unwrap()); ``` @@ -121,7 +121,7 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re | Method | Return | Description | | -------- | ------- | ------- | | `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates the given count of strings matched by the given term. | -| `are_equivalent(&self, term: &Term)` | `Result` | Computes whether the current term and the given term are equivalent. Returns `true` if both terms accept the same language. | +| `is_equivalent_of(&self, term: &Term)` | `Result` | Computes whether the current term and the given term are equivalent. Returns `true` if both terms accept the same language. | | `is_subset_of(&self, term: &Term)` | `Result` | Computes whether the current term is a subset of the given term. Returns `true` if all strings matched by the current term are also matched by the given term. | | `is_empty(&self)` | `bool` | Checks if the current term matches the empty language. | | `is_total(&self)` | `bool` | Checks if the current term matches all possible strings. | @@ -185,7 +185,7 @@ This design allows us to perform unions, intersections, and complements of trans | `determinize(&self)` | `Result` | Determinize the automaton and returns it as a new `FastAutomaton`. | | `intersection(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the intersection of `self` and `other`. | | `intersection_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the intersection of all automatons in the given iterator. | -| `intersection_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the union of all automatons in the given parallel iterator. | +| `intersection_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the intersection of all automatons in the given parallel iterator. | | `complement(&mut self)` | `Result<(), EngineError>` | Complement the automaton, the automaton needs to be deterministic. 
| | `subtraction(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the subtraction of `self` and `other`. | | `repeat(&self, min: u32, max_opt: Option)` | `Result` | Returns the repetition of the automaton, between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. | #### Analyze | Method | Return | Description | | -------- | ------- | ------- | +| `is_empty(&self)` | `bool` | Checks if the current `FastAutomaton` matches the empty language. | +| `is_total(&self)` | `bool` | Checks if the current `FastAutomaton` matches all possible strings. | +| `is_empty_string(&self)` | `bool` | Checks if the current `FastAutomaton` only matches the empty string `""`. | +| `get_reacheable_states(&self)` | `IntSet` | Get a set of all reachable states from the start state. | | `state_in_degree(&self, state: State)` | `usize` | Returns the number of transitions to the provided state. | | `state_out_degree(&self, state: State)` | `usize` | Returns the number of transitions from the provided state. | | `all_states_iter(&self)` | `impl Iterator` | Returns an iterator of the states of the automaton. | @@ -215,13 +219,40 @@ This design allows us to perform unions, intersections, and complements of trans | `is_cyclic(&self)` | `bool` | Returns `true` if the automaton contains at least one cycle. | | `has_state(&self, state: State)` | `bool` | Returns `true` if the automaton contains the provided state. | | `to_regex(&self)` | `Option` | Try to convert the automaton to a `RegularExpression`. If it cannot find an equivalent pattern returns `None`. | -| `has_intersection(&self, other: &FastAutomaton)` | `Result` | | +| `has_intersection(&self, other: &FastAutomaton)` | `Result` | Returns `true` if the two automatons have a non-empty intersection. | +| `is_equivalent_of(&self, other: &FastAutomaton)` | `Result` | Computes whether the current `FastAutomaton` and the given `FastAutomaton` are equivalent. Returns `true` if both automata accept the same language. | +| `is_subset_of(&self, other: &FastAutomaton)` | `Result` | Computes whether the current `FastAutomaton` is a subset of the given `FastAutomaton`. Returns `true` if all strings matched by the current `FastAutomaton` are also matched by the given `FastAutomaton`. | +| `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of the possible matched strings. | +| `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the provided term (i.e. the number of the possible matched strings). | ### RegularExpression `RegularExpression` is used to directly build, manipulate and analyze regular expression patterns. Not all the set operations are available; for more advanced operations such as intersection, subtraction/difference and complement, it is necessary to convert it to a `FastAutomaton` with the method `to_automaton()`. +#### Build +| Method | Return | Description | +| -------- | ------- | ------- | +| `new(pattern: &str)` | `Result` | Parses the provided pattern and returns the resulting `RegularExpression`. | +| `new_empty()` | `RegularExpression` | Create a `RegularExpression` that matches the empty language. | +| `new_total()` | `RegularExpression` | Create a `RegularExpression` that matches all possible strings. | +| `new_empty_string()` | `RegularExpression` | Create a `RegularExpression` that only matches the empty string `""`.
| +| `concat(&self, other: &RegularExpression, append_back: bool)` | `RegularExpression` | Returns a new `RegularExpression` representing the concatenation of `self` and `other`, using `append_back` to determine their order. | +| `repeat(&self, min: u32, max_opt: Option)` | `RegularExpression` | Returns the repetition of the `RegularExpression`, between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. | +| `union(&self, other: &RegularExpression)` | `RegularExpression` | Returns a new `RegularExpression` representing the union of `self` and `other`. | +| `union_all<'a, I: IntoIterator>(patterns: I)` | `RegularExpression` | Returns a `RegularExpression` formed by taking the union of all expressions in `patterns`. | +| `simplify(&self)` | `RegularExpression` | Returns a simplified version of this regular expression by eliminating redundant constructs and applying canonical reductions. | + +#### Analyze +| Method | Return | Description | +| -------- | ------- | ------- | +| `is_empty(&self)` | `bool` | Checks if the current `RegularExpression` matches the empty language. | +| `is_total(&self)` | `bool` | Checks if the current `RegularExpression` matches all possible strings. | +| `is_empty_string(&self)` | `bool` | Checks if the current `RegularExpression` only matches the empty string `""`. | +| `to_automaton(&self)` | `Result` | Convert the current `RegularExpression` to an equivalent `FastAutomaton`. | +| `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of the possible matched strings. | +| `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the provided term (i.e. the number of the possible matched strings). | ## Error Handling ## Bound Execution diff --git a/src/fast_automaton/analyze/cardinality.rs b/src/fast_automaton/analyze/cardinality.rs index a2d6d91..57a346a 100644 --- a/src/fast_automaton/analyze/cardinality.rs +++ b/src/fast_automaton/analyze/cardinality.rs @@ -3,6 +3,7 @@ use std::hash::BuildHasherDefault; use super::*; impl FastAutomaton { + /// Returns the cardinality of the provided term (i.e. the number of the possible matched strings). pub fn get_cardinality(&self) -> Option> { if self.is_empty() { return Some(Cardinality::Integer(0)); diff --git a/src/fast_automaton/analyze/equivalence.rs b/src/fast_automaton/analyze/equivalence.rs index d81294c..18a6f14 100644 --- a/src/fast_automaton/analyze/equivalence.rs +++ b/src/fast_automaton/analyze/equivalence.rs @@ -3,6 +3,7 @@ use crate::error::EngineError; use super::*; impl FastAutomaton { + /// Computes whether the current `FastAutomaton` and the given `FastAutomaton` are equivalent. Returns `true` if both automata accept the same language. pub fn is_equivalent_of(&self, other: &FastAutomaton) -> Result { if self.is_empty() != other.is_empty() && self.is_total() != other.is_total() { return Ok(false); diff --git a/src/fast_automaton/analyze/length.rs b/src/fast_automaton/analyze/length.rs index c03ee80..638a93c 100644 --- a/src/fast_automaton/analyze/length.rs +++ b/src/fast_automaton/analyze/length.rs @@ -1,6 +1,7 @@ use super::*; impl FastAutomaton { + /// Returns the minimum and maximum length of the possible matched strings.
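+ ///
+ /// Illustrative sketch (imports elided; the expected bounds mirror the `(abc){2,4}` example from the README):
+ /// ```ignore
+ /// let automaton = RegularExpression::new("(abc){2,4}").unwrap().to_automaton().unwrap();
+ /// // Minimum 2 repetitions of "abc" (6 chars), maximum 4 (12 chars).
+ /// assert_eq!(automaton.get_length(), (Some(6), Some(12)));
+ /// ```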
pub fn get_length(&self) -> (Option, Option) { if self.is_empty() { return (None, None); diff --git a/src/fast_automaton/analyze/mod.rs b/src/fast_automaton/analyze/mod.rs index de49b73..7d5d7fa 100644 --- a/src/fast_automaton/analyze/mod.rs +++ b/src/fast_automaton/analyze/mod.rs @@ -30,6 +30,7 @@ impl FastAutomaton { self.accept_states.len() == 1 && self.accept_states.contains(&self.start_state) && self.state_in_degree(self.start_state) == 0 } + /// Get a set of all reacheable states from the start state. pub fn get_reacheable_states(&self) -> IntSet { let mut states_map: IntMap> = IntMap::with_capacity_and_hasher(self.transitions.len(), BuildHasherDefault::default()); @@ -66,7 +67,7 @@ impl FastAutomaton { live } - pub fn get_ranges(&self) -> Result, EngineError> { + pub(crate) fn get_ranges(&self) -> Result, EngineError> { self.spanning_set.get_spanning_ranges().map(|range| { Condition::from_range(range, &self.spanning_set) }).collect() diff --git a/src/fast_automaton/analyze/subset.rs b/src/fast_automaton/analyze/subset.rs index 5705fc2..6eb8888 100644 --- a/src/fast_automaton/analyze/subset.rs +++ b/src/fast_automaton/analyze/subset.rs @@ -3,6 +3,7 @@ use crate::error::EngineError; use super::*; impl FastAutomaton { + /// Computes whether the current `FastAutomaton` is a subset of the given `FastAutomaton`. Returns `true` if all strings matched by the current `FastAutomaton` are also matched by the given `FastAutomaton`. pub fn is_subset_of(&self, other: &FastAutomaton) -> Result { if self.is_empty() || other.is_total() || self == other { return Ok(true); diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 29e5d49..6987c31 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -33,7 +33,7 @@ impl FastAutomaton { Ok(result.into_owned()) } - /// Returns a new `FastAutomaton` that is the union of all automatons in the given parallel iterator. + /// Returns a new `FastAutomaton` that is the intersection of all automatons in the given parallel iterator. pub fn intersection_all_par<'a, I: IntoParallelIterator>(automatons: I) -> Result { let execution_profile = ExecutionProfile::get(); @@ -126,6 +126,7 @@ impl FastAutomaton { Ok(Cow::Owned(new_automaton)) } + // Returns `true` if the two automatons have a non-empty intersection. pub fn has_intersection(&self, other: &FastAutomaton) -> Result { if self.is_empty() || other.is_empty() { return Ok(false); diff --git a/src/lib.rs b/src/lib.rs index 3de5f60..0dc2ec9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -343,9 +343,9 @@ impl Term { /// let term1 = Term::from_pattern("(abc|de)").unwrap(); /// let term2 = Term::from_pattern("(abc|de)*").unwrap(); /// - /// assert!(!term1.are_equivalent(&term2).unwrap()); + /// assert!(!term1.is_equivalent_of(&term2).unwrap()); /// ``` - pub fn are_equivalent(&self, that: &Term) -> Result { + pub fn is_equivalent_of(&self, that: &Term) -> Result { if self == that { return Ok(true); } @@ -430,6 +430,7 @@ impl Term { } } + /// Converts the current `Term` to a `FastAutomaton`. pub fn to_automaton(&self) -> Result, EngineError> { Ok(match self { Term::RegularExpression(regex) => Cow::Owned(regex.to_automaton()?), @@ -437,6 +438,7 @@ impl Term { }) } + /// Converts the current `Term` to a `RegularExpression`. Returns `None` if the automaton cannot be converted. 
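+ ///
+ /// Illustrative sketch (imports elided; assumes the parsed pattern serializes back to the same string):
+ /// ```ignore
+ /// let term = Term::from_pattern("(abc|de)").unwrap();
+ /// // A term built from a pattern is already a `RegularExpression`, so this never returns `None`.
+ /// let regex = term.to_regex().unwrap();
+ /// assert_eq!(regex.to_string(), "(abc|de)");
+ /// ```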
pub fn to_regex(&self) -> Option> { Some(match self { Term::RegularExpression(regex) => Cow::Borrowed(regex), @@ -623,7 +625,7 @@ mod tests { // Equivalence & subset let a = Term::from_pattern("a+").unwrap(); let b = Term::from_pattern("a*").unwrap(); - assert!(!a.are_equivalent(&b).unwrap()); + assert!(!a.is_equivalent_of(&b).unwrap()); assert!(a.is_subset_of(&b).unwrap()); Ok(()) diff --git a/src/regex/analyze/affixes.rs b/src/regex/analyze/affixes.rs index 4213e3f..540580b 100644 --- a/src/regex/analyze/affixes.rs +++ b/src/regex/analyze/affixes.rs @@ -3,7 +3,7 @@ use std::collections::BTreeSet; use super::*; impl RegularExpression { - pub fn get_common_affixes( + pub(crate) fn get_common_affixes( &self, other: &RegularExpression, ) -> ( @@ -21,7 +21,7 @@ impl RegularExpression { (common_prefix, (self_regex, other_regex), common_suffix) } - pub fn get_common_affix( + pub(crate) fn get_common_affix( &self, other: &RegularExpression, is_prefix: bool, diff --git a/src/regex/analyze/mod.rs b/src/regex/analyze/mod.rs index ae08148..2b946cb 100644 --- a/src/regex/analyze/mod.rs +++ b/src/regex/analyze/mod.rs @@ -6,6 +6,7 @@ mod affixes; mod number_of_states; impl RegularExpression { + /// Returns the minimum and maximum length of the possible matched strings. pub fn get_length(&self) -> (Option, Option) { match self { RegularExpression::Character(range) => { @@ -84,6 +85,7 @@ impl RegularExpression { } } + /// Returns the cardinality of the provided term (i.e. the number of the possible matched strings). pub fn get_cardinality(&self) -> Cardinality { if self.is_empty() { return Cardinality::Integer(0); diff --git a/src/regex/analyze/number_of_states.rs b/src/regex/analyze/number_of_states.rs index 90c1897..e7460f8 100644 --- a/src/regex/analyze/number_of_states.rs +++ b/src/regex/analyze/number_of_states.rs @@ -9,7 +9,7 @@ struct AbstractStateMetadata { } impl AbstractStateMetadata { - pub fn new(has_incoming_edges: bool, has_outgoing_edges: bool) -> Self { + pub(crate) fn new(has_incoming_edges: bool, has_outgoing_edges: bool) -> Self { AbstractStateMetadata { has_incoming_edges, has_outgoing_edges, @@ -25,7 +25,7 @@ struct AbstractNFAMetadata { } impl AbstractNFAMetadata { - pub fn new() -> Self { + pub(crate) fn new() -> Self { AbstractNFAMetadata { start: AbstractStateMetadata::new(false, true), accepted: vec![AbstractStateMetadata::new(true, false)], @@ -33,7 +33,7 @@ impl AbstractNFAMetadata { } } - pub fn new_empty_string() -> Self { + pub(crate) fn new_empty_string() -> Self { AbstractNFAMetadata { start: AbstractStateMetadata::new(false, false), accepted: vec![AbstractStateMetadata::new(false, false)], @@ -41,7 +41,7 @@ impl AbstractNFAMetadata { } } - pub fn new_empty() -> Self { + pub(crate) fn new_empty() -> Self { AbstractNFAMetadata { start: AbstractStateMetadata::new(false, false), accepted: vec![], @@ -49,7 +49,7 @@ impl AbstractNFAMetadata { } } - pub fn concat(&self, nfa: &AbstractNFAMetadata) -> Self { + pub(crate) fn concat(&self, nfa: &AbstractNFAMetadata) -> Self { let start_state_and_accept_states_not_mergeable = nfa.start.has_incoming_edges && self.accepted.iter().any(|s| s.has_outgoing_edges); @@ -68,7 +68,7 @@ impl AbstractNFAMetadata { } } - pub fn repeat(&self, min: u32, max_opt: &Option) -> Self { + pub(crate) fn repeat(&self, min: u32, max_opt: &Option) -> Self { let start_state_not_mergeable = self.start.has_incoming_edges; let accepted_not_mergeable = self.accepted.iter().any(|s| s.has_outgoing_edges); let start_state_or_accept_states_not_mergeable = @@ -129,7 
+129,7 @@ impl AbstractNFAMetadata { } } - pub fn alternate(&mut self, nfa: &AbstractNFAMetadata) -> Self { + pub(crate) fn alternate(&mut self, nfa: &AbstractNFAMetadata) -> Self { let self_start_state_not_mergeable = self.start.has_incoming_edges; let self_accepted_not_mergeable = self.accepted.iter().any(|s| s.has_outgoing_edges); diff --git a/src/regex/builder.rs b/src/regex/builder.rs index b1958a1..1b8f636 100644 --- a/src/regex/builder.rs +++ b/src/regex/builder.rs @@ -11,17 +11,18 @@ lazy_static! { } impl RegularExpression { - pub fn new(regex: &str) -> Result { - if regex.is_empty() { + /// Parses the provided pattern and returns the resulting `RegularExpression`. + pub fn new(pattern: &str) -> Result { + if pattern.is_empty() { return Ok(RegularExpression::new_empty_string()); } - if regex == "[]" { + if pattern == "[]" { return Ok(RegularExpression::new_empty()); } match ParserBuilder::new() .dot_matches_new_line(true) .build() - .parse(&Self::remove_flags(regex)) + .parse(&Self::remove_flags(pattern)) { Ok(hir) => Self::convert_to_regex(&hir), Err(err) => Err(EngineError::RegexSyntaxError(err.to_string())), @@ -32,6 +33,7 @@ RE_FLAG_DETECTION.replace_all(regex, "").to_string() } + /// Create a `RegularExpression` that matches all possible strings. pub fn new_total() -> Self { RegularExpression::Repetition( Box::new(RegularExpression::Character(CharRange::total())), @@ -40,10 +42,12 @@ ) } + /// Create a `RegularExpression` that matches the empty language. pub fn new_empty() -> Self { RegularExpression::Character(CharRange::empty()) } + /// Create a `RegularExpression` that only matches the empty string `""`. pub fn new_empty_string() -> Self { RegularExpression::Concat(VecDeque::new()) } diff --git a/src/regex/mod.rs b/src/regex/mod.rs index 26e4c7f..5c7ca16 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -90,6 +90,7 @@ impl Display for RegularExpression { } impl RegularExpression { + /// Checks if the current `RegularExpression` matches the empty language. pub fn is_empty(&self) -> bool { match self { RegularExpression::Alternation(alternation) => alternation.is_empty(), @@ -98,6 +99,7 @@ } } + /// Checks if the current `RegularExpression` only matches the empty string `""`. pub fn is_empty_string(&self) -> bool { match self { RegularExpression::Concat(concat) => concat.is_empty(), @@ -105,6 +107,7 @@ } } + /// Checks if the current `RegularExpression` matches all possible strings. pub fn is_total(&self) -> bool { match self { RegularExpression::Repetition(regular_expression, min, max_opt) => { @@ -121,6 +124,7 @@ } } + /// Convert the current `RegularExpression` to an equivalent `FastAutomaton`. pub fn to_automaton(&self) -> Result { ExecutionProfile::get().assert_max_number_of_states(self.get_number_of_states_in_nfa())?; diff --git a/src/regex/operation/concat.rs b/src/regex/operation/concat.rs index 6907d9b..ac699d8 100644 --- a/src/regex/operation/concat.rs +++ b/src/regex/operation/concat.rs @@ -1,6 +1,7 @@ use super::*; impl RegularExpression { + /// Returns a new `RegularExpression` representing the concatenation of `self` and `other`, using `append_back` to determine their order.
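+ ///
+ /// Illustrative sketch (imports elided; assumes `append_back = true` appends `other` after `self`, as in the builder):
+ /// ```ignore
+ /// let a = RegularExpression::new("a").unwrap();
+ /// let b = RegularExpression::new("b").unwrap();
+ /// assert_eq!(a.concat(&b, true).to_string(), "ab");
+ /// ```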
pub fn concat(&self, other: &RegularExpression, append_back: bool) -> RegularExpression { if self.is_empty() || other.is_empty() { return RegularExpression::new_empty(); diff --git a/src/regex/operation/repeat.rs b/src/regex/operation/repeat.rs index 00b9685..7da36bb 100644 --- a/src/regex/operation/repeat.rs +++ b/src/regex/operation/repeat.rs @@ -1,6 +1,7 @@ use super::*; impl RegularExpression { + /// Returns the repetition of the `RegularExpression`, between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. pub fn repeat(&self, min: u32, max_opt: Option) -> RegularExpression { if self.is_total() { return RegularExpression::new_total(); diff --git a/src/regex/operation/simplify.rs b/src/regex/operation/simplify.rs index ae87087..51f66b7 100644 --- a/src/regex/operation/simplify.rs +++ b/src/regex/operation/simplify.rs @@ -1,6 +1,7 @@ use super::*; impl RegularExpression { + /// Returns a simplified version of this regular expression by eliminating redundant constructs and applying canonical reductions. pub fn simplify(&self) -> Self { match self { RegularExpression::Character(_) => self.clone(), diff --git a/src/regex/operation/union.rs b/src/regex/operation/union.rs index 65b34f4..1c09e78 100644 --- a/src/regex/operation/union.rs +++ b/src/regex/operation/union.rs @@ -3,17 +3,17 @@ use std::collections::BTreeSet; use super::*; impl RegularExpression { + /// Returns a new `RegularExpression` representing the union of this expression with `other`. pub fn union(&self, other: &RegularExpression) -> RegularExpression { Self::union_all([self, other]) } - pub fn union_all<'a, I>(regexes: I) -> RegularExpression - where - I: IntoIterator, + /// Returns a `RegularExpression` formed by taking the union of all expressions in `patterns`. 
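+ ///
+ /// Illustrative sketch (imports elided; the rendered pattern format is assumed to match the README's union examples):
+ /// ```ignore
+ /// let ab = RegularExpression::new("ab").unwrap();
+ /// let cd = RegularExpression::new("cd").unwrap();
+ /// let union = RegularExpression::union_all([&ab, &cd]);
+ /// assert_eq!(union.to_string(), "(ab|cd)");
+ /// ```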
+ pub fn union_all<'a, I: IntoIterator>(patterns: I) -> RegularExpression { let mut result: Cow<'a, RegularExpression> = Cow::Owned(RegularExpression::new_empty()); - for other in regexes { + for other in patterns { result = result.union_(other); if result.is_total() { @@ -115,9 +115,7 @@ impl RegularExpression { RegularExpression::Alternation(alternate) } } else { - panic!( - "Not character and repetition {this_character:?} {that_repetition:?}" - ) + panic!("Not character and repetition {this_character:?} {that_repetition:?}") } } From 90c462b5c63d5089284187522035940901748b81 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sun, 3 Aug 2025 14:08:36 +0200 Subject: [PATCH 12/44] update --- README.md | 24 ++-- src/error/mod.rs | 18 --- src/execution_profile.rs | 4 +- src/fast_automaton/analyze/equivalence.rs | 8 +- src/fast_automaton/analyze/mod.rs | 47 ++++++- src/fast_automaton/convert/to_regex/mod.rs | 6 +- src/fast_automaton/spanning_set/mod.rs | 2 + src/lib.rs | 39 +++--- src/regex/analyze/affixes.rs | 1 + src/regex/operation/mod.rs | 2 +- src/tokenizer/embed_automaton.rs | 46 ++----- src/tokenizer/embed_regex.rs | 13 -- src/tokenizer/embed_regex_operations.rs | 119 ------------------ src/tokenizer/mod.rs | 1 - src/tokenizer/token/automaton_token.rs | 43 ------- src/tokenizer/token/mod.rs | 16 --- src/tokenizer/token/range_token.rs | 34 ----- src/tokenizer/token/regex_operations_token.rs | 64 ---------- src/tokenizer/token/regex_token.rs | 53 -------- tests/integration_tests.rs | 4 +- 20 files changed, 101 insertions(+), 443 deletions(-) delete mode 100644 src/tokenizer/embed_regex_operations.rs delete mode 100644 src/tokenizer/token/regex_operations_token.rs diff --git a/README.md b/README.md index a808caf..2bca560 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,6 @@ - [Term](#term) - [FastAutomaton](#fastautomaton) - [RegularExpression](#regularexpression) - - [Error Handling](#error-handling) - [Bound Execution](#bound-execution) - [Cross-Language Support](#cross-language-support) - [License](#license) @@ -37,29 +36,32 @@ let t2 = Term::from_pattern(".*xyz").unwrap(); // Concatenate let concat = t1.concat(&[t2]).unwrap(); -assert_eq!(concat.to_string(), "abc.*xyz"); +assert_eq!(concat.to_pattern().unwrap(), "abc.*xyz"); // Union let union = t1.union(&[Term::from_pattern("fgh").unwrap()]).unwrap(); -assert_eq!(union.to_string(), "(abc.*|fgh)"); +assert_eq!(union.to_pattern().unwrap(), "(abc.*|fgh)"); // Intersection let inter = Term::from_pattern("(ab|xy){2}") .unwrap() .intersection(&[Term::from_pattern(".*xy").unwrap()]) .unwrap(); // (ab|xy)xy -assert_eq!(inter.to_string(), "(ab|xy)xy"); +assert_eq!(inter.to_pattern().unwrap(), "(ab|xy)xy"); // Subtraction let diff = Term::from_pattern("a*") .unwrap() .subtraction(&Term::from_pattern("").unwrap()) .unwrap(); -assert_eq!(diff.to_string(), "a+"); +assert_eq!(diff.to_pattern().unwrap(), "a+"); // Repetition -let rep = Term::from_pattern("abc").unwrap().repeat(2, Some(4)).unwrap(); -assert_eq!(rep.to_string(), "(abc){2,4}"); +let rep = Term::from_pattern("abc") + .unwrap() + .repeat(2, Some(4)) + .unwrap(); +assert_eq!(rep.to_pattern().unwrap(), "(abc){2,4}"); // Analyze assert_eq!(rep.get_length(), (Some(6), Some(12))); @@ -75,7 +77,7 @@ println!("Some matches: {:?}", samples); // Equivalence & subset let a = Term::from_pattern("a+").unwrap(); let b = Term::from_pattern("a*").unwrap(); -assert!(!a.is_equivalent_of(&b).unwrap()); +assert!(!a.are_equivalent(&b).unwrap()); 
assert!(a.is_subset_of(&b).unwrap()); ``` @@ -121,7 +123,7 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re | Method | Return | Description | | -------- | ------- | ------- | | `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates the given count of strings matched by the given term. | -| `is_equivalent_of(&self, term: &Term)` | `Result` | Computes whether the current term and the given term are equivalent. Returns `true` if both terms accept the same language. | +| `are_equivalent(&self, term: &Term)` | `Result` | Computes whether the current term and the given term are equivalent. Returns `true` if both terms accept the same language. | | `is_subset_of(&self, term: &Term)` | `Result` | Computes whether the current term is a subset of the given term. Returns `true` if all strings matched by the current term are also matched by the given term. | | `is_empty(&self)` | `bool` | Checks if the current term matches the empty language. | | `is_total(&self)` | `bool` | Checks if the current term matches all possible strings. | @@ -220,7 +222,7 @@ This design allows us to perform unions, intersections, and complements of trans | `has_state(&self, state: State)` | `bool` | Returns `true` if the automaton contains at least one cycle. | | `to_regex(&self)` | `Option` | Try to convert the automaton to a `RegularExpression`. If it cannot find an equivalent pattern returns `None`. | | `has_intersection(&self, other: &FastAutomaton)` | `Result` | Returns `true` if the two automatons have a non-empty intersection. | -| `is_equivalent_of(&self, other: &FastAutomaton)` | `Result` | Computes whether the current `FastAutomaton` and the given `FastAutomaton` are equivalent. Returns `true` if both automata accept the same language. | +| `are_equivalent(&self, other: &FastAutomaton)` | `Result` | Computes whether the current `FastAutomaton` and the given `FastAutomaton` are equivalent. Returns `true` if both automata accept the same language. | | `is_subset_of(&self, other: &FastAutomaton)` | `Result` | Computes whether the current `FastAutomaton` is a subset of the given `FastAutomaton`. Returns `true` if all strings matched by the current `FastAutomaton` are also matched by the given `FastAutomaton`. | | `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of the possible matched strings. | | `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the provided term (i.e. the number of the possible matched strings). | @@ -253,8 +255,6 @@ This design allows us to perform unions, intersections, and complements of trans | `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of the possible matched strings. | | `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the provided term (i.e. the number of the possible matched strings). | -## Error Handling - ## Bound Execution By default, all operations run without limits. For heavy or untrusted patterns, use a thread local `ExecutionProfile` to cap execution time and maximum number of states in used automata. diff --git a/src/error/mod.rs b/src/error/mod.rs index e88d1e1..303c225 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -54,21 +54,3 @@ impl fmt::Display for EngineError { } impl std::error::Error for EngineError {} - -impl EngineError { - /// Determine if the error is a server error. - /// A server error should not be shown to the end user. 
- pub fn is_server_error(&self) -> bool { - match self { - EngineError::InvalidCharacterInRegex => false, - EngineError::OperationTimeOutError => false, - EngineError::AutomatonShouldBeDeterministic => true, - EngineError::AutomatonHasTooManyStates => false, - EngineError::RegexSyntaxError(_) => false, - EngineError::TokenError(_) => false, - EngineError::ConditionInvalidRange => true, - EngineError::ConditionIndexOutOfBound => true, - EngineError::CannotComputeAutomatonCardinality => false, - } - } -} diff --git a/src/execution_profile.rs b/src/execution_profile.rs index 3ba3a33..76c1b78 100644 --- a/src/execution_profile.rs +++ b/src/execution_profile.rs @@ -333,7 +333,7 @@ mod tests { Ok(()) } - #[test] + /*#[test] fn test_execution_timeout_intersection() -> Result<(), String> { let term1 = Term::from_pattern(".*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); let term2 = Term::from_pattern(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); @@ -359,5 +359,5 @@ mod tests { }); Ok(()) - } + }*/ } diff --git a/src/fast_automaton/analyze/equivalence.rs b/src/fast_automaton/analyze/equivalence.rs index 18a6f14..6483d68 100644 --- a/src/fast_automaton/analyze/equivalence.rs +++ b/src/fast_automaton/analyze/equivalence.rs @@ -4,7 +4,7 @@ use super::*; impl FastAutomaton { /// Computes whether the current `FastAutomaton` and the given `FastAutomaton` are equivalent. Returns `true` if both automata accept the same language. - pub fn is_equivalent_of(&self, other: &FastAutomaton) -> Result { + pub fn are_equivalent(&self, other: &FastAutomaton) -> Result { if self.is_empty() != other.is_empty() && self.is_total() != other.is_total() { return Ok(false); } else if self == other { @@ -72,14 +72,14 @@ mod tests { fn assert_equivalent(regex_1: &RegularExpression, regex_2: &RegularExpression, expected: bool) { println!("{regex_1} and {regex_2}"); let automaton_1 = regex_1.to_automaton().unwrap(); - assert_eq!(true, automaton_1.is_equivalent_of(&automaton_1).unwrap()); + assert_eq!(true, automaton_1.are_equivalent(&automaton_1).unwrap()); let automaton_2 = regex_2.to_automaton().unwrap(); - assert_eq!(true, automaton_2.is_equivalent_of(&automaton_2).unwrap()); + assert_eq!(true, automaton_2.are_equivalent(&automaton_2).unwrap()); assert_eq!( expected, - automaton_1.is_equivalent_of(&automaton_2).unwrap() + automaton_1.are_equivalent(&automaton_2).unwrap() ); } } diff --git a/src/fast_automaton/analyze/mod.rs b/src/fast_automaton/analyze/mod.rs index 7d5d7fa..2f220b0 100644 --- a/src/fast_automaton/analyze/mod.rs +++ b/src/fast_automaton/analyze/mod.rs @@ -10,11 +10,13 @@ mod length; mod subset; impl FastAutomaton { + /// Checks if the current `FastAutomaton` matches the empty language. #[inline] pub fn is_empty(&self) -> bool { self.accept_states.is_empty() } + /// Checks if the current `FastAutomaton` matches all possible strings. 
#[inline] pub fn is_total(&self) -> bool { if self.accept_states.contains(&self.start_state) { @@ -25,9 +27,12 @@ impl FastAutomaton { false } + /// Checks if the current `FastAutomaton` only match the empty string `""`. #[inline] pub fn is_empty_string(&self) -> bool { - self.accept_states.len() == 1 && self.accept_states.contains(&self.start_state) && self.state_in_degree(self.start_state) == 0 + self.accept_states.len() == 1 + && self.accept_states.contains(&self.start_state) + && self.state_in_degree(self.start_state) == 0 } /// Get a set of all reacheable states from the start state. @@ -68,8 +73,42 @@ impl FastAutomaton { } pub(crate) fn get_ranges(&self) -> Result, EngineError> { - self.spanning_set.get_spanning_ranges().map(|range| { - Condition::from_range(range, &self.spanning_set) - }).collect() + self.spanning_set + .get_spanning_ranges() + .map(|range| Condition::from_range(range, &self.spanning_set)) + .collect() + } +} + +#[cfg(test)] +mod tests { + + use crate::fast_automaton::FastAutomaton; + + #[test] + fn test_empty() -> Result<(), String> { + assert!(!FastAutomaton::new_total().is_empty()); + assert!(!FastAutomaton::new_empty_string().is_empty()); + assert!(FastAutomaton::new_empty().is_empty()); + + Ok(()) + } + + #[test] + fn test_empty_string() -> Result<(), String> { + assert!(!FastAutomaton::new_total().is_empty_string()); + assert!(FastAutomaton::new_empty_string().is_empty_string()); + assert!(!FastAutomaton::new_empty().is_empty_string()); + + Ok(()) + } + + #[test] + fn test_total() -> Result<(), String> { + assert!(FastAutomaton::new_total().is_total()); + assert!(!FastAutomaton::new_empty_string().is_total()); + assert!(!FastAutomaton::new_empty().is_total()); + + Ok(()) } } diff --git a/src/fast_automaton/convert/to_regex/mod.rs b/src/fast_automaton/convert/to_regex/mod.rs index 2469d03..3e6193c 100644 --- a/src/fast_automaton/convert/to_regex/mod.rs +++ b/src/fast_automaton/convert/to_regex/mod.rs @@ -252,7 +252,7 @@ impl FastAutomaton { if let Ok(regex) = graph?.convert_to_regex(&execution_profile) { let regex = regex?; match regex.to_automaton() { - Ok(automaton) => match self.is_equivalent_of(&automaton) { + Ok(automaton) => match self.are_equivalent(&automaton) { Ok(result) => { if !result { warn!( @@ -337,7 +337,7 @@ mod tests { println!("OUT (non deterministic): {}", output_regex); let output_automaton = output_regex.to_automaton().unwrap(); - assert!(input_automaton.is_equivalent_of(&output_automaton).unwrap()); + assert!(input_automaton.are_equivalent(&output_automaton).unwrap()); let input_automaton = input_automaton.determinize().unwrap(); @@ -347,7 +347,7 @@ mod tests { println!("OUT (deterministic) : {}", output_regex); let output_automaton = output_regex.to_automaton().unwrap(); - assert!(input_automaton.is_equivalent_of(&output_automaton).unwrap()); + assert!(input_automaton.are_equivalent(&output_automaton).unwrap()); } #[test] diff --git a/src/fast_automaton/spanning_set/mod.rs b/src/fast_automaton/spanning_set/mod.rs index 2e998b8..bfaefcb 100644 --- a/src/fast_automaton/spanning_set/mod.rs +++ b/src/fast_automaton/spanning_set/mod.rs @@ -63,6 +63,7 @@ impl SpanningSet { &self.1 } + /// Compute a new minimal spanning set by merging the provided spanning set. pub fn merge(&self, other: &Self) -> Self { let mut ranges = Vec::with_capacity(self.0.len() + other.0.len()); ranges.extend_from_slice(&self.0); @@ -71,6 +72,7 @@ impl SpanningSet { Self::compute_spanning_set(&ranges) } + /// Compute a new minimal spanning set for the provided ranges. 
pub fn compute_spanning_set(ranges: &[CharRange]) -> Self { let mut spanning_ranges: Vec = ranges.to_vec(); spanning_ranges.sort_unstable(); diff --git a/src/lib.rs b/src/lib.rs index 0dc2ec9..d177120 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -343,16 +343,16 @@ impl Term { /// let term1 = Term::from_pattern("(abc|de)").unwrap(); /// let term2 = Term::from_pattern("(abc|de)*").unwrap(); /// - /// assert!(!term1.is_equivalent_of(&term2).unwrap()); + /// assert!(!term1.are_equivalent(&term2).unwrap()); /// ``` - pub fn is_equivalent_of(&self, that: &Term) -> Result { + pub fn are_equivalent(&self, that: &Term) -> Result { if self == that { return Ok(true); } let automaton_1 = self.to_automaton()?; let automaton_2 = that.to_automaton()?; - automaton_1.is_equivalent_of(&automaton_2) + automaton_1.are_equivalent(&automaton_2) } /// Compute whether the current term is a subset of the given term. @@ -446,6 +446,11 @@ impl Term { }) } + /// Converts the current `Term` to a regular expression pattern. Returns `None` if the automaton cannot be converted. + pub fn to_pattern(&self) -> Option { + Some(self.to_regex()?.to_string()) + } + fn determinize_subtrahend<'a>( minuend: &FastAutomaton, subtrahend: &'a FastAutomaton, @@ -522,9 +527,9 @@ mod tests { let result = regex1.subtraction(®ex2); assert!(result.is_ok()); - let result = result.unwrap(); + let result = result.unwrap().to_pattern().unwrap(); assert_eq!( - Term::RegularExpression(RegularExpression::new("a+").unwrap()), + "a+", result ); @@ -538,10 +543,10 @@ mod tests { let result = regex1.subtraction(®ex2); assert!(result.is_ok()); - let result = result.unwrap(); + let result = result.unwrap().to_regex().unwrap().into_owned(); assert_eq!( Term::RegularExpression(RegularExpression::new("(xxx)*(x|xx)").unwrap()), - result + Term::RegularExpression(result) ); Ok(()) @@ -554,8 +559,8 @@ mod tests { let result = regex1.intersection(&vec![regex2]); assert!(result.is_ok()); - let result = result.unwrap(); - assert_eq!(Term::from_pattern("").unwrap(), result); + let result = result.unwrap().to_pattern().unwrap(); + assert_eq!("", result); Ok(()) } @@ -567,9 +572,9 @@ mod tests { let result = regex1.intersection(&vec![regex2]); assert!(result.is_ok()); - let result = result.unwrap(); + let result = result.unwrap().to_pattern().unwrap(); assert_eq!( - Term::RegularExpression(RegularExpression::new("(x{3})*").unwrap()), + "(x{3})*", result ); @@ -584,32 +589,32 @@ mod tests { // Concatenate let concat = t1.concat(&[t2]).unwrap(); - assert_eq!(concat.to_string(), "abc.*xyz"); + assert_eq!(concat.to_pattern().unwrap(), "abc.*xyz"); // Union let union = t1.union(&[Term::from_pattern("fgh").unwrap()]).unwrap(); // (abc.*|fgh) - assert_eq!(union.to_string(), "(abc.*|fgh)"); + assert_eq!(union.to_pattern().unwrap(), "(abc.*|fgh)"); // Intersection let inter = Term::from_pattern("(ab|xy){2}") .unwrap() .intersection(&[Term::from_pattern(".*xy").unwrap()]) .unwrap(); // (ab|xy)xy - assert_eq!(inter.to_string(), "(ab|xy)xy"); + assert_eq!(inter.to_pattern().unwrap(), "(ab|xy)xy"); // Subtraction let diff = Term::from_pattern("a*") .unwrap() .subtraction(&Term::from_pattern("").unwrap()) .unwrap(); - assert_eq!(diff.to_string(), "a+"); + assert_eq!(diff.to_pattern().unwrap(), "a+"); // Repetition let rep = Term::from_pattern("abc") .unwrap() .repeat(2, Some(4)) .unwrap(); // (abc){2,4} - assert_eq!(rep.to_string(), "(abc){2,4}"); + assert_eq!(rep.to_pattern().unwrap(), "(abc){2,4}"); // Analyze assert_eq!(rep.get_length(), (Some(6), Some(12))); @@ -625,7 +630,7 
@@ mod tests { // Equivalence & subset let a = Term::from_pattern("a+").unwrap(); let b = Term::from_pattern("a*").unwrap(); - assert!(!a.is_equivalent_of(&b).unwrap()); + assert!(!a.are_equivalent(&b).unwrap()); assert!(a.is_subset_of(&b).unwrap()); Ok(()) diff --git a/src/regex/analyze/affixes.rs b/src/regex/analyze/affixes.rs index 540580b..34aa401 100644 --- a/src/regex/analyze/affixes.rs +++ b/src/regex/analyze/affixes.rs @@ -285,6 +285,7 @@ mod tests { assert_regex_affix(true, "(ab|cd)x", "(ab|cd)y", "(ab|cd)", "x", "y"); assert_regex_affix(true, "a+", "a+b", "a+", "", "b"); + assert_regex_affix(true, "(ab|cd)", "(ab|cd)", "(ab|cd)", "", ""); Ok(()) } diff --git a/src/regex/operation/mod.rs b/src/regex/operation/mod.rs index ae7da22..f572238 100644 --- a/src/regex/operation/mod.rs +++ b/src/regex/operation/mod.rs @@ -180,6 +180,6 @@ mod tests { let result = got.to_automaton().unwrap(); - assert!(repeat.is_equivalent_of(&result).unwrap()); + assert!(repeat.are_equivalent(&result).unwrap()); } } diff --git a/src/tokenizer/embed_automaton.rs b/src/tokenizer/embed_automaton.rs index 602cb93..0838525 100644 --- a/src/tokenizer/embed_automaton.rs +++ b/src/tokenizer/embed_automaton.rs @@ -137,13 +137,13 @@ mod tests { #[test] fn test_tokenize() -> Result<(), String> { - assert_embedding_convertion_for_fair_and_ai("(a|b)"); - assert_embedding_convertion_for_fair_and_ai("(|a)"); - assert_embedding_convertion_for_fair_and_ai(".*ab"); - assert_embedding_convertion_for_fair_and_ai("toto"); - assert_embedding_convertion_for_fair_and_ai(".{2,3}"); - assert_embedding_convertion_for_fair_and_ai("q(ab|ca|ab|abc)x"); - assert_embedding_convertion_for_fair_and_ai(".*q(ab|ca|ab|abc)x"); + assert_embedding_convertion_for_fair("(a|b)"); + assert_embedding_convertion_for_fair("(|a)"); + assert_embedding_convertion_for_fair(".*ab"); + assert_embedding_convertion_for_fair("toto"); + assert_embedding_convertion_for_fair(".{2,3}"); + assert_embedding_convertion_for_fair("q(ab|ca|ab|abc)x"); + assert_embedding_convertion_for_fair(".*q(ab|ca|ab|abc)x"); assert_embedding_convertion_for_fair( "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q)", ); @@ -155,14 +155,10 @@ mod tests { } fn assert_embedding_convertion_for_fair(regex: &str) { - assert_embedding_convertion(regex, true); + assert_embedding_convertion(regex); } - fn assert_embedding_convertion_for_fair_and_ai(regex: &str) { - assert_embedding_convertion(regex, false); - } - - fn assert_embedding_convertion(regex: &str, ignore_ai: bool) { + fn assert_embedding_convertion(regex: &str) { let regex = RegularExpression::new(regex).unwrap(); println!("{}", regex); @@ -192,29 +188,5 @@ mod tests { .unwrap() .is_empty() ); - - if !ignore_ai { - // AI - let embedding_u8 = AutomatonToken::to_ai_tokens(&embedding).unwrap(); - let embedding: Vec = embedding_u8 - .iter() - .map(|&t| AutomatonToken::from_ai_token(t)) - .collect(); - - let unembedded_automaton = tokenizer.from_embedding(&embedding).unwrap(); - - assert!( - automaton - .subtraction(&unembedded_automaton) - .unwrap() - .is_empty() - ); - assert!( - unembedded_automaton - .subtraction(&automaton) - .unwrap() - .is_empty() - ); - } } } diff --git a/src/tokenizer/embed_regex.rs b/src/tokenizer/embed_regex.rs index 3e05757..fe73cab 100644 --- a/src/tokenizer/embed_regex.rs +++ b/src/tokenizer/embed_regex.rs @@ -290,18 +290,5 @@ mod tests { let unembedded_regex = tokenizer.from_regex_embedding(&embedding).unwrap(); assert_eq!(regex, unembedded_regex); - - // AI - let embedding_u8 
= RegexToken::to_ai_tokens(&embedding).unwrap(); - assert_eq!( - embedding, - embedding_u8 - .iter() - .map(|&t| RegexToken::from_ai_token(t)) - .collect::>() - ); - - let unembedded_regex = tokenizer.from_regex_embedding(&embedding).unwrap(); - assert_eq!(regex, unembedded_regex); } } diff --git a/src/tokenizer/embed_regex_operations.rs b/src/tokenizer/embed_regex_operations.rs deleted file mode 100644 index 4dcb19f..0000000 --- a/src/tokenizer/embed_regex_operations.rs +++ /dev/null @@ -1,119 +0,0 @@ -use token::TokenError; - -use crate::regex::RegularExpression; - -use self::token::regex_operations_token::RegexOperationsToken; - -use super::*; - -impl Tokenizer<'_> { - pub fn to_regex_operations_embedding( - &self, - regex_operations: &[(bool, RegularExpression)], - ) -> Vec { - let mut vec = vec![]; - - for (not, regex) in regex_operations { - if !vec.is_empty() { - vec.push(RegexOperationsToken::And); - } - if *not { - vec.push(RegexOperationsToken::Not); - } - - vec.extend( - self.to_regex_embedding(regex) - .into_iter() - .map(RegexOperationsToken::RegexToken), - ); - } - - vec - } - - pub fn from_regex_operations_embedding( - &self, - vec: &[RegexOperationsToken], - ) -> Result, TokenError> { - let mut operations = vec![]; - let mut current_regex_not = false; - let mut current_regex_token = vec![]; - for token in vec { - match token { - RegexOperationsToken::RegexToken(regex_token) => { - current_regex_token.push(*regex_token) - } - RegexOperationsToken::And => { - let regex = self.from_regex_embedding(¤t_regex_token)?; - operations.push((current_regex_not, regex)); - current_regex_not = false; - current_regex_token.clear(); - } - RegexOperationsToken::Not => current_regex_not = true, - RegexOperationsToken::Error => return Err(TokenError::UnknownToken), - }; - } - - if !current_regex_token.is_empty() { - let regex = self.from_regex_embedding(¤t_regex_token)?; - operations.push((current_regex_not, regex)); - } - - Ok(operations) - } -} - -#[cfg(test)] -mod tests { - use embed_regex_operations::token::Token; - - use crate::regex::RegularExpression; - - use super::*; - - #[test] - fn test_tokenize() -> Result<(), String> { - assert_embedding_convertion(&[(false, "(a|b)")]); - assert_embedding_convertion(&[(false, "(|a)")]); - assert_embedding_convertion(&[(false, ".*ab")]); - assert_embedding_convertion(&[(true, "toto")]); - assert_embedding_convertion(&[(false, ".{2,3}")]); - assert_embedding_convertion(&[(false, "q(abc?|ca)x")]); - assert_embedding_convertion(&[(false, ".*q(abc?|ca)x")]); - assert_embedding_convertion(&[(false, "(abc){3,6}")]); - assert_embedding_convertion(&[(true, "((|a)abd+){3}")]); - - assert_embedding_convertion(&[(false, ".*a.*"), (false, ".*b.*"), (true, ".*abc.*")]); - Ok(()) - } - - fn assert_embedding_convertion(operations: &[(bool, &str)]) { - let mut automaton = FastAutomaton::new_total(); - let operations: Vec<(bool, RegularExpression)> = operations - .iter() - .map(|(not, regex)| { - let regex = RegularExpression::new(regex).unwrap(); - automaton = automaton.intersection(®ex.to_automaton().unwrap()).unwrap(); - (*not, regex) - }) - .collect(); - - let tokenizer = Tokenizer::new(&automaton); - let embedding = tokenizer.to_regex_operations_embedding(&operations); - - // AI - let embedding_u8: Vec = RegexOperationsToken::to_ai_tokens(&embedding).unwrap(); - assert_eq!( - embedding, - embedding_u8 - .iter() - .map(|&t| RegexOperationsToken::from_ai_token(t)) - .collect::>() - ); - - let unembedded_operations = tokenizer - 
.from_regex_operations_embedding(&embedding) - .unwrap(); - assert_eq!(operations, unembedded_operations); - } -} diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 3273b0e..7c83c1a 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -11,7 +11,6 @@ use self::{range_tokenizer::RangeTokenizer, token::automaton_token::AutomatonTok mod embed_automaton; mod embed_regex; -mod embed_regex_operations; pub mod range_tokenizer; pub mod token; diff --git a/src/tokenizer/token/automaton_token.rs b/src/tokenizer/token/automaton_token.rs index 215ffed..e5f379c 100644 --- a/src/tokenizer/token/automaton_token.rs +++ b/src/tokenizer/token/automaton_token.rs @@ -24,15 +24,6 @@ impl PartialOrd for AutomatonToken { } impl AutomatonToken { - const TK_AI_RANGE: u8 = 0; - const TK_AI_STATE: u8 = Self::TK_AI_RANGE + RangeToken::AI_VOCABULARY_SIZE; - const TK_AI_ACCEPT_STATE: u8 = Self::TK_AI_STATE + Self::AI_MAX_NUMBER_OF_STATES; - const TK_AI_SEPARATOR_STATE: u8 = Self::TK_AI_ACCEPT_STATE + 1; - - pub const AI_MAX_NUMBER_OF_STATES: u8 = 100; - - pub const AI_VOCABULARY_SIZE: u8 = Self::TK_AI_SEPARATOR_STATE + 1; - const TK_FAIR_RANGE: u16 = 0; const TK_FAIR_STATE: u16 = Self::TK_FAIR_RANGE + RangeToken::FAIR_VOCABULARY_SIZE; const TK_FAIR_ACCEPT_STATE: u16 = Self::TK_FAIR_STATE + Self::FAIR_MAX_NUMBER_OF_STATES; @@ -44,40 +35,6 @@ impl AutomatonToken { } impl Token for AutomatonToken { - fn from_ai_token(token: u8) -> AutomatonToken { - if (Self::TK_AI_RANGE..Self::TK_AI_RANGE + RangeToken::AI_VOCABULARY_SIZE).contains(&token) - { - AutomatonToken::Range(RangeToken::from_ai_token(token)) - } else if (Self::TK_AI_STATE..Self::TK_AI_STATE + Self::AI_MAX_NUMBER_OF_STATES) - .contains(&token) - { - AutomatonToken::State((token - Self::TK_AI_STATE) as u16) - } else if token == Self::TK_AI_ACCEPT_STATE { - AutomatonToken::AcceptState - } else if token == Self::TK_AI_SEPARATOR_STATE { - AutomatonToken::SeparatorState - } else { - AutomatonToken::Error - } - } - - fn to_ai_token(&self) -> Result { - Ok(match self { - AutomatonToken::Range(r) => r.to_ai_token()?, - AutomatonToken::State(s) => { - let max = Self::AI_MAX_NUMBER_OF_STATES; - let s = *s as u8; - if s > max { - return Err(TokenError::TokenOutOfBound("State", max.into(), s.into())); - } - s + Self::TK_AI_STATE - } - AutomatonToken::AcceptState => Self::TK_AI_ACCEPT_STATE, - AutomatonToken::SeparatorState => Self::TK_AI_SEPARATOR_STATE, - AutomatonToken::Error => return Err(TokenError::UnknownToken), - }) - } - fn from_fair_token(token: u16) -> AutomatonToken { if (Self::TK_FAIR_RANGE..Self::TK_FAIR_RANGE + RangeToken::FAIR_VOCABULARY_SIZE) .contains(&token) diff --git a/src/tokenizer/token/mod.rs b/src/tokenizer/token/mod.rs index 2f28e32..4342be8 100644 --- a/src/tokenizer/token/mod.rs +++ b/src/tokenizer/token/mod.rs @@ -4,7 +4,6 @@ use super::*; pub mod automaton_token; pub mod range_token; -pub mod regex_operations_token; pub mod regex_token; #[derive(Debug, PartialEq, Eq)] @@ -28,21 +27,6 @@ impl Display for TokenError { } pub trait Token { - fn from_ai_token(token: u8) -> Self; - - fn to_ai_token(&self) -> Result; - - fn to_ai_tokens(tokens: &[Self]) -> Result, TokenError> - where - Self: Sized, - { - let mut vec = Vec::with_capacity(tokens.len()); - for token in tokens { - vec.push(token.to_ai_token()?); - } - Ok(vec) - } - fn from_fair_token(token: u16) -> Self; fn to_fair_token(&self) -> Result; diff --git a/src/tokenizer/token/range_token.rs b/src/tokenizer/token/range_token.rs index 62a1753..7876452 100644 --- 
a/src/tokenizer/token/range_token.rs +++ b/src/tokenizer/token/range_token.rs @@ -8,13 +8,6 @@ pub enum RangeToken { } impl RangeToken { - const TK_AI_TOTAL: u8 = 0; - const TK_AI_BASE: u8 = 1; - - pub const AI_MAX_NUMBER_OF_BASES: u8 = 10; - - pub const AI_VOCABULARY_SIZE: u8 = Self::TK_AI_BASE + Self::AI_MAX_NUMBER_OF_BASES + 1; - const TK_FAIR_TOTAL: u16 = 0; const TK_FAIR_BASE: u16 = 1; @@ -36,33 +29,6 @@ impl PartialOrd for RangeToken { } impl Token for RangeToken { - fn from_ai_token(token: u8) -> RangeToken { - if token == Self::TK_AI_TOTAL { - RangeToken::Total - } else if (Self::TK_AI_BASE..Self::TK_AI_BASE + Self::AI_MAX_NUMBER_OF_BASES) - .contains(&token) - { - RangeToken::Base((token - Self::TK_AI_BASE) as usize) - } else { - RangeToken::Error - } - } - - fn to_ai_token(&self) -> Result { - Ok(match self { - RangeToken::Total => Self::TK_AI_TOTAL, - RangeToken::Base(b) => { - let max = Self::AI_MAX_NUMBER_OF_BASES; - let b = *b as u8; - if b > max { - return Err(TokenError::TokenOutOfBound("Base", max.into(), b.into())); - } - b + Self::TK_AI_BASE - } - RangeToken::Error => return Err(TokenError::UnknownToken), - }) - } - fn from_fair_token(token: u16) -> RangeToken { if token == Self::TK_FAIR_TOTAL { RangeToken::Total diff --git a/src/tokenizer/token/regex_operations_token.rs b/src/tokenizer/token/regex_operations_token.rs deleted file mode 100644 index 1074f7f..0000000 --- a/src/tokenizer/token/regex_operations_token.rs +++ /dev/null @@ -1,64 +0,0 @@ -use self::regex_token::RegexToken; - -use super::*; - -#[derive(Debug, Eq, PartialEq, Clone, Copy)] -pub enum RegexOperationsToken { - RegexToken(RegexToken), - And, - Not, - Error, -} - -impl Ord for RegexOperationsToken { - fn cmp(&self, other: &Self) -> Ordering { - (self.to_ai_token().unwrap()).cmp(&other.to_ai_token().unwrap()) - } -} - -impl PartialOrd for RegexOperationsToken { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl RegexOperationsToken { - const TK_AI_REGEX_TOKEN: u8 = 0; - const TK_AI_AND: u8 = Self::TK_AI_REGEX_TOKEN + RegexToken::AI_VOCABULARY_SIZE; - const TK_AI_NOT: u8 = Self::TK_AI_AND + 1; - - pub const AI_VOCABULARY_SIZE: u8 = Self::TK_AI_NOT + 1; -} - -impl Token for RegexOperationsToken { - fn from_ai_token(token: u8) -> RegexOperationsToken { - if (Self::TK_AI_REGEX_TOKEN..Self::TK_AI_REGEX_TOKEN + RegexToken::AI_VOCABULARY_SIZE) - .contains(&token) - { - RegexOperationsToken::RegexToken(RegexToken::from_ai_token(token)) - } else if token == Self::TK_AI_AND { - RegexOperationsToken::And - } else if token == Self::TK_AI_NOT { - RegexOperationsToken::Not - } else { - RegexOperationsToken::Error - } - } - - fn to_ai_token(&self) -> Result { - Ok(match self { - RegexOperationsToken::RegexToken(regex_token) => regex_token.to_ai_token()?, - RegexOperationsToken::And => Self::TK_AI_AND, - RegexOperationsToken::Not => Self::TK_AI_NOT, - RegexOperationsToken::Error => return Err(TokenError::UnknownToken), - }) - } - - fn from_fair_token(_: u16) -> RegexOperationsToken { - panic!("A RegexOperationsToken does not have a FAIR representation.") - } - - fn to_fair_token(&self) -> Result { - panic!("A RegexOperationsToken does not have a FAIR representation.") - } -} diff --git a/src/tokenizer/token/regex_token.rs b/src/tokenizer/token/regex_token.rs index 2f4c2f2..bcb2e2b 100644 --- a/src/tokenizer/token/regex_token.rs +++ b/src/tokenizer/token/regex_token.rs @@ -26,18 +26,6 @@ impl PartialOrd for RegexToken { } impl RegexToken { - const TK_AI_RANGE: u8 = 0; - const 
TK_AI_START_GROUP: u8 = Self::TK_AI_RANGE + RangeToken::AI_VOCABULARY_SIZE; - const TK_AI_END_GROUP: u8 = Self::TK_AI_START_GROUP + 1; - const TK_AI_ALTERNATION: u8 = Self::TK_AI_END_GROUP + 1; - const TK_AI_REPETITION_NONE: u8 = Self::TK_AI_ALTERNATION + 1; - const TK_AI_REPETITION: u8 = Self::TK_AI_REPETITION_NONE + 1; - - pub const AI_MAX_NUMBER_OF_REPETITION: u8 = 10; - - pub const AI_VOCABULARY_SIZE: u8 = - Self::TK_AI_REPETITION + Self::AI_MAX_NUMBER_OF_REPETITION + 1; - const TK_FAIR_RANGE: u16 = 0; const TK_FAIR_START_GROUP: u16 = Self::TK_FAIR_RANGE + RangeToken::FAIR_VOCABULARY_SIZE; const TK_FAIR_END_GROUP: u16 = Self::TK_FAIR_START_GROUP + 1; @@ -52,47 +40,6 @@ impl RegexToken { } impl Token for RegexToken { - fn from_ai_token(token: u8) -> RegexToken { - if (Self::TK_AI_RANGE..Self::TK_AI_RANGE + RangeToken::AI_VOCABULARY_SIZE).contains(&token) - { - RegexToken::Range(RangeToken::from_ai_token(token)) - } else if token == Self::TK_AI_START_GROUP { - RegexToken::StartGroup - } else if token == Self::TK_AI_END_GROUP { - RegexToken::EndGroup - } else if token == Self::TK_AI_ALTERNATION { - RegexToken::Alternation - } else if token == Self::TK_AI_REPETITION_NONE { - RegexToken::RepetitionNone - } else if (Self::TK_AI_REPETITION - ..Self::TK_AI_REPETITION + Self::AI_MAX_NUMBER_OF_REPETITION) - .contains(&token) - { - RegexToken::Repetition((token - Self::TK_AI_REPETITION) as u16) - } else { - RegexToken::Error - } - } - - fn to_ai_token(&self) -> Result { - Ok(match self { - RegexToken::Range(r) => r.to_ai_token()?, - RegexToken::StartGroup => Self::TK_AI_START_GROUP, - RegexToken::EndGroup => Self::TK_AI_END_GROUP, - RegexToken::Alternation => Self::TK_AI_ALTERNATION, - RegexToken::RepetitionNone => Self::TK_AI_REPETITION_NONE, - RegexToken::Repetition(r) => { - let max = Self::AI_MAX_NUMBER_OF_REPETITION; - let r = *r as u8; - if r > max { - return Err(TokenError::TokenOutOfBound("Repetition", max.into(), r.into())); - } - r + Self::TK_AI_REPETITION - } - RegexToken::Error => return Err(TokenError::UnknownToken), - }) - } - fn from_fair_token(token: u16) -> RegexToken { if (Self::TK_FAIR_RANGE..Self::TK_FAIR_RANGE + RangeToken::FAIR_VOCABULARY_SIZE) .contains(&token) diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index 1e572a9..d1dd407 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -29,11 +29,11 @@ fn assert_regex(regex: &str) { assert!(automaton.is_subset_of(&determinized_automaton).unwrap()); assert!(determinized_automaton.is_subset_of(&automaton).unwrap()); - assert!(automaton.is_equivalent_of(&determinized_automaton).unwrap()); + assert!(automaton.are_equivalent(&determinized_automaton).unwrap()); let regex_from_automaton = automaton.to_regex().unwrap(); let automaton_from_regex = regex_from_automaton.to_automaton().unwrap(); - assert!(automaton.is_equivalent_of(&automaton_from_regex).unwrap()); + assert!(automaton.are_equivalent(&automaton_from_regex).unwrap()); } #[test] From 7884f7336817e1ab9b9d6796730b4612bdea5eb0 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sun, 3 Aug 2025 15:54:58 +0200 Subject: [PATCH 13/44] update most descriptions --- README.md | 168 ++++++++++--------- src/fast_automaton/analyze/cardinality.rs | 2 +- src/fast_automaton/analyze/equivalence.rs | 6 +- src/fast_automaton/analyze/length.rs | 2 +- src/fast_automaton/analyze/mod.rs | 8 +- src/fast_automaton/analyze/subset.rs | 4 +- src/fast_automaton/builder.rs | 43 +++-- 
src/fast_automaton/convert/to_regex/mod.rs | 10 +- src/fast_automaton/generate.rs | 7 +- src/fast_automaton/mod.rs | 30 ++-- src/fast_automaton/operation/concat.rs | 2 +- src/fast_automaton/operation/determinize.rs | 8 +- src/fast_automaton/operation/intersection.rs | 2 +- src/fast_automaton/operation/repeat.rs | 2 +- src/fast_automaton/operation/subtraction.rs | 2 +- src/fast_automaton/serializer.rs | 19 ++- src/lib.rs | 67 +++----- src/regex/analyze/mod.rs | 6 +- src/regex/builder.rs | 8 +- src/regex/mod.rs | 8 +- src/regex/operation/concat.rs | 2 +- src/regex/operation/repeat.rs | 2 +- src/regex/operation/simplify.rs | 2 +- src/regex/operation/union.rs | 4 +- src/tokenizer/embed_automaton.rs | 5 +- src/tokenizer/embed_regex.rs | 3 +- 26 files changed, 215 insertions(+), 207 deletions(-) diff --git a/README.md b/README.md index 2bca560..3b1084c 100644 --- a/README.md +++ b/README.md @@ -102,43 +102,44 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re #### Build | Method | Return | Description | | -------- | ------- | ------- | +| `from_automaton(automaton: FastAutomaton)` | `Term` | Creates a new `Term` holding the provided `FastAutomaton`. | +| `from_pattern(pattern: &str)` | `Result` | Parses the provided pattern and returns a new `Term` holding the resulting `RegularExpression`. | +| `from_regex(regex: RegularExpression)` | `Term` | Creates a new `Term` holding the provided `RegularExpression`. | | `new_empty()` | `Term` | Creates a term that matches the empty language. | +| `new_empty_string()` | `Term` | Creates a term that only matches the empty string `""`. | | `new_total()` | `Term` | Creates a term that matches all possible strings. | -| `new_empty_string()` | `Term` | Creates a term that only match the empty string `""`. | -| `from_pattern(pattern: &str)` | `Result` | Parses the provided pattern and return a new `Term` holding the resulting `RegularExpression`. | -| `from_pattern(regex: RegularExpression)` | `Term` | Creates a new `Term` holding the provided `RegularExpression`. | -| `from_automaton(automaton: FastAutomaton)` | `Term` | Creates a new `Term` holding the provided `FastAutomaton`. | #### Manipulate | Method | Return | Description | | -------- | ------- | ------- | -| `concat(&self, terms: &[Term])` | `Result` | Computes the concatenation of the given collection of terms. Returns the resulting term. | -| `union(&self, terms: &[Term])` | `Result` | Computes the union of the given collection of terms. Returns the resulting term. | -| `intersection(&self, terms: &[Term])` | `Result` | Computes the intersection of the given collection of terms. Returns the resulting term. | -| `subtraction(&self, subtrahend: &Term)` | `Result` | Computes the subtraction/difference of the two given terms. Returns the resulting term. | -| `difference(&self, subtrahend: &Term)` | `Result` | See `self.subtraction(subtrahend: &Term)`. | -| `repeat(&self, min: u32, max_opt: Option)` | `Result` | Returns the repetition of the current term, between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. | +| `concat(&self, terms: &[Term])` | `Result` | Computes the concatenation of the given terms. | +| `difference(&self, subtrahend: &Term)` | `Result` | Alias for `subtraction`. | +| `intersection(&self, terms: &[Term])` | `Result` | Computes the intersection of the given terms. 
| +| `repeat(&self, min: u32, max_opt: Option)` | `Result` | Computes the repetition of the current term between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. | +| `subtraction(&self, subtrahend: &Term)` | `Result` | Computes the difference between `self` and the given subtrahend. | +| `union(&self, terms: &[Term])` | `Result` | Computes the union of the given terms. | #### Analyze | Method | Return | Description | | -------- | ------- | ------- | -| `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates the given count of strings matched by the given term. | -| `are_equivalent(&self, term: &Term)` | `Result` | Computes whether the current term and the given term are equivalent. Returns `true` if both terms accept the same language. | -| `is_subset_of(&self, term: &Term)` | `Result` | Computes whether the current term is a subset of the given term. Returns `true` if all strings matched by the current term are also matched by the given term. | -| `is_empty(&self)` | `bool` | Checks if the current term matches the empty language. | -| `is_total(&self)` | `bool` | Checks if the current term matches all possible strings. | -| `is_empty_string(&self)` | `bool` | Checks if the current term only match the empty string `""`. | -| `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of the possible matched strings. | -| `get_cardinality()` | `Result, EngineError>` | Returns the cardinality of the provided term (i.e. the number of the possible matched strings). | -| `to_automaton(&self)` | `Result, EngineError>` | Converts the current `Term` to a `FastAutomaton`. | -| `to_regex(&self)` | `Option>` | Converts the current `Term` to a `RegularExpression`. Returns `None` if the automaton cannot be converted. | +| `are_equivalent(&self, term: &Term)` | `Result` | Returns `true` if both terms accept the same language. | +| `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates `count` strings matched by the term. | +| `get_cardinality()` | `Result, EngineError>` | Returns the cardinality of the term (i.e., the number of possible matched strings). | +| `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of matched strings. | +| `is_empty(&self)` | `bool` | Checks if the term matches the empty language. | +| `is_empty_string(&self)` | `bool` | Checks if the term matches only the empty string `""`. | +| `is_subset_of(&self, term: &Term)` | `Result` | Returns `true` if all strings matched by the current term are also matched by the given term. | +| `is_total(&self)` | `bool` | Checks if the term matches all possible strings. | +| `to_automaton(&self)` | `Result, EngineError>` | Converts the term to a `FastAutomaton`. | +| `to_pattern(&self)` | `Option` | Converts the term to a regular expression pattern; returns `None` if conversion isn’t possible. | +| `to_regex(&self)` | `Option>` | Converts the term to a RegularExpression; returns `None` if conversion isn’t possible. | ### FastAutomaton `FastAutomaton` is used to directly build, manipulate and analyze automata. To convert an automaton to a `RegularExpression` the method `to_regex()` can be used. Not all automaton can be converted to a regular expression. -When building or modifying an automaton you might come to use the method `add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition)`. This method accepts a `Condition` rather than a raw character set. 
To construct a Condition, call: To build a `Condition`, call: +When building or modifying an automaton you might come to use the method `add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition)`. This method accepts a `Condition` rather than a raw character set. To build a `Condition`, call: ```rust Condition::from_range(&range, &spanning_set); ``` @@ -164,68 +165,69 @@ This design allows us to perform unions, intersections, and complements of trans #### Build | Method | Return | Description | | -------- | ------- | ------- | -| `new_empty()` | `FastAutomaton` | Create an automaton that matches the empty language. | -| `new_total()` | `FastAutomaton` | Create an automaton that matches all possible strings. | -| `new_empty_string()` | `FastAutomaton` | Create an automaton that only match the empty string `""`. | -| `new_from_range(range: &CharRange)` | `Result` | Create an automaton that matches one of the characters in the provided `CharRange`. | -| `new_state(&mut self)` | `State` | Create a new state in the automaton and returns its identifier. | -| `accept(&mut self, state: State)` | | Make the automaton accept the provided state as a valid final state. | -| `add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition)` | | Create a new transition between the two provided states with the given condition, the provided condition must follow the same spanning set as the rest of the automaton. | -| `add_epsilon_transition(&mut self, from_state: State, to_state: State)` | | Create a new epsilon transition between the two provided states. | -| `remove_state(&mut self, state: State)` | | Remove the provided state from the automaton. Remove all the transitions it is connected to. Panic if the state is used as a start state. | -| `remove_states(&mut self, states: &IntSet)` | | Remove the provided states from the automaton. Remove all the transitions they are connected to. Panic if one of the state is used as a start state. | -| `apply_new_spanning_set(&mut self, new_spanning_set: &SpanningSet)` | `Result<(), EngineError>` | Apply the provided spanning set to the automaton and project all of its conditions on it. | +| `accept(&mut self, state: State)` | | Marks the provided state as an accepting (final) state. | +| `add_epsilon_transition(&mut self, from_state: State, to_state: State)` | | Creates a new epsilon transition between the two states. | +| `add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition)` | | Creates a new transition with the given condition; the condition must follow the automaton’s current spanning set. | +| `apply_new_spanning_set(&mut self, new_spanning_set: &SpanningSet)` | `Result<(), EngineError>` | Applies the provided spanning set and projects all existing conditions onto it. | +| `new_empty()` | `FastAutomaton` | Creates an automaton that matches the empty language. | +| `new_empty_string()` | `FastAutomaton` | Creates an automaton that only matches the empty string `""`. | +| `new_from_range(range: &CharRange)` | `Result` | Creates an automaton that matches one of the characters in the given `CharRange`. | +| `new_state(&mut self)` | `State` | Creates a new state and returns its identifier. | +| `new_total()` | `FastAutomaton` | Creates an automaton that matches all possible strings. | +| `remove_state(&mut self, state: State)` | | Removes the state and all its connected transitions; panics if it's a start state. 
| +| `remove_states(&mut self, states: &IntSet)` | | Removes the given states and their connected transitions; panics if any is a start state. | #### Manipulate | Method | Return | Description | | -------- | ------- | ------- | -| `union(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the union of `self` and `other`. | -| `union_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the union of all automatons in the given iterator. | -| `union_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the union of all automatons in the given parallel iterator. | +| `complement(&mut self)` | `Result<(), EngineError>` | Complements the automaton; it must be deterministic. | | `concat(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the concatenation of `self` and `other`. | -| `concat_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` representing the concatenation of `self` and `other`. | -| `determinize(&self)` | `Result` | Determinize the automaton and returns it as a new `FastAutomaton`. | +| `concat_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` representing the concatenation of all automata in the given iterator. | +| `determinize(&self)` | `Result, EngineError>` | Determinizes the automaton and returns the result as a new `FastAutomaton`. | | `intersection(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the intersection of `self` and `other`. | | `intersection_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the intersection of all automatons in the given iterator. | | `intersection_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the intersection of all automatons in the given parallel iterator. | -| `complement(&mut self)` | `Result<(), EngineError>` | Complement the automaton, the automaton needs to be deterministic. | +| `repeat(&self, min: u32, max_opt: Option)` | `Result` | Computes the repetition of the automaton between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. | | `subtraction(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the substraction of `self` and `other`. | -| `repeat(&self, min: u32, max_opt: Option)` | `Result` | Returns the repetition of the automaton, between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. | +| `union(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the union of `self` and `other`. | +| `union_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the union of all automatons in the given iterator. | +| `union_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the union of all automatons in the given parallel iterator. | #### Analyze | Method | Return | Description | | -------- | ------- | ------- | -| `is_empty(&self)` | `bool` | Checks if the current `FastAutomaton` matches the empty language. | -| `is_total(&self)` | `bool` | Checks if the current `FastAutomaton` matches all possible strings. | -| `is_empty_string(&self)` | `bool` | Checks if the current `FastAutomaton` only match the empty string `""`. 
| -| `get_reacheable_states(&self)` | `IntSet` | Get a set of all reacheable states from the start state. | -| `state_in_degree(&self, state: State)` | `usize` | Returns the number of transitions to the provided state. | -| `state_out_degree(&self, state: State)` | `usize` | Returns the number of transitions from the provided state. | -| `all_states_iter(&self)` | `impl Iterator` | Returns an iterator of the states of the automaton. | -| `all_states_vec(&self)` | `Vec` | Returns a vector containing the states of the automaton. | -| `direct_states_iter(&self, state: &State)` | `impl Iterator` | Returns an iterator over all states directly reachable from the given state in one transition. | -| `direct_states_vec(&self, state: &State)` | `Vec` | Returns a vector containing all states directly reachable from the given state in one transition. | -| `transitions_to_vec(&self, state: State)` | `Vec` | Returns a vector containing the transitions to the provided state. | -| `transitions_from_vec(&self, state: State)` | `Vec` | Returns a vector containing the transitions from the provided state. | -| `transitions_from_iter(&self, state: State)` | `impl Iterator` | Returns an iterator containing the transitions from the provided state. | -| `transitions_from_iter_mut(&mut self, state: State)` | `impl Iterator` | Returns a mutable iterator containing the transitions from the provided state. | -| `transitions_from_into_iter(&self, state: State)` | `impl Iterator` | Returns an owned iterator containing the transitions from the provided state. | -| `does_transition_exists(&self, from_state: State, to_state: State)` | `bool` | Returns `true` if there is a directed transition between the two provided states. | -| `get_condition(&self, from_state: State, to_state: State)` | `Option<&Condition>` | Get a reference of the directed transtion's condition between the two provided states. | -| `get_condition_mut(&mut self, from_state: State, to_state: State)` | `Option<&Condition>` | Get a mutable reference of the directed transtion's condition between the two provided states. | -| `get_start_state(&self)` | `State` | Returns the start state of the automaton. | -| `get_accept_states(&self)` | `&IntSet` | Get a reference to the set of accept (final) states of the automaton. | +| `all_states_iter(&self)` | `impl Iterator` | Returns an iterator over the automaton’s states. | +| `all_states_vec(&self)` | `Vec` | Returns a vector containing the automaton’s states. | +| `are_equivalent(&self, other: &FastAutomaton)` | `Result` | Returns `true` if both automata accept the same language. | +| `direct_states_iter(&self, state: &State)` | `impl Iterator` | Returns an iterator over states directly reachable from the given state in one transition. | +| `direct_states_vec(&self, state: &State)` | `Vec` | Returns a vector of states directly reachable from the given state in one transition. | +| `does_transition_exists(&self, from_state: State, to_state: State)` | `bool` | Returns `true` if there is a directed transition from `from_state` to `to_state`. | +| `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates `count` strings matched by the automaton. | +| `get_accept_states(&self)` | `&IntSet` | Returns a reference to the set of accept (final) states. | +| `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the automaton (i.e., the number of possible matched strings). 
| +| `get_condition(&self, from_state: State, to_state: State)` | `Option<&Condition>` | Returns a reference to the condition of the directed transition between the two states, if any. | +| `get_condition_mut(&mut self, from_state: State, to_state: State)` | `Option<&Condition>` | Returns a mutable reference to the condition of the directed transition between the two states, if any. | +| `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of matched strings. | +| `get_reacheable_states(&self)` | `IntSet` | Returns the set of all states reachable from the start state. | | `get_spanning_set(&self)` | `&SpanningSet` | Returns a reference to the automaton's spanning set. | -| `is_accepted(&self, state: &State)` | `bool` | Returns `true` if the given `state` is one of the automaton's accept states. | -| `is_determinitic(&self)` | `bool` | Returns `true` if the automaton is deterministic. | +| `get_start_state(&self)` | `State` | Returns the start state. | +| `has_intersection(&self, other: &FastAutomaton)` | `Result` | Returns `true` if the two automata have a non-empty intersection. | +| `has_state(&self, state: State)` | `bool` | Returns `true` if the automaton contains the given state. | +| `is_accepted(&self, state: &State)` | `bool` | Returns `true` if the given state is one of the accept states. | | `is_cyclic(&self)` | `bool` | Returns `true` if the automaton contains at least one cycle. | -| `has_state(&self, state: State)` | `bool` | Returns `true` if the automaton contains at least one cycle. | -| `to_regex(&self)` | `Option` | Try to convert the automaton to a `RegularExpression`. If it cannot find an equivalent pattern returns `None`. | -| `has_intersection(&self, other: &FastAutomaton)` | `Result` | Returns `true` if the two automatons have a non-empty intersection. | -| `are_equivalent(&self, other: &FastAutomaton)` | `Result` | Computes whether the current `FastAutomaton` and the given `FastAutomaton` are equivalent. Returns `true` if both automata accept the same language. | -| `is_subset_of(&self, other: &FastAutomaton)` | `Result` | Computes whether the current `FastAutomaton` is a subset of the given `FastAutomaton`. Returns `true` if all strings matched by the current `FastAutomaton` are also matched by the given `FastAutomaton`. | -| `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of the possible matched strings. | -| `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the provided term (i.e. the number of the possible matched strings). | +| `is_determinitic(&self)` | `bool` | Returns `true` if the automaton is deterministic. | +| `is_empty(&self)` | `bool` | Checks if the automaton matches the empty language. | +| `is_empty_string(&self)` | `bool` | Checks if the automaton only matches the empty string `""`. | +| `is_subset_of(&self, other: &FastAutomaton)` | `Result` | Returns `true` if all strings accepted by `self` are also accepted by `other`. | +| `is_total(&self)` | `bool` | Checks if the automaton matches all possible strings. | +| `state_in_degree(&self, state: State)` | `usize` | Returns the number of transitions to the provided state. | +| `state_out_degree(&self, state: State)` | `usize` | Returns the number of transitions from the provided state. | +| `to_regex(&self)` | `Option` | Attempts to convert the automaton to a `RegularExpression`; returns `None` if no equivalent pattern are found. 
| +| `transitions_from_into_iter(&self, state: State)` | `impl Iterator` | Returns an owned iterator over transitions from the given state. | +| `transitions_from_iter(&self, state: State)` | `impl Iterator` | Returns an iterator over transitions from the given state. | +| `transitions_from_iter_mut(&mut self, state: State)` | `impl Iterator` | Returns a mutable iterator over transitions from the given state. | +| `transitions_from_vec(&self, state: State)` | `Vec` | Returns a vector of transitions from the given state. | +| `transitions_to_vec(&self, state: State)` | `Vec` | Returns a vector of transitions to the given state. | ### RegularExpression @@ -235,25 +237,25 @@ This design allows us to perform unions, intersections, and complements of trans #### Build | Method | Return | Description | | -------- | ------- | ------- | -| `new(pattern: &str)` | `Result` | Parses the provided pattern and return the resulting `RegularExpression`. | -| `new_empty()` | `RegularExpression` | Create a `RegularExpression` that matches the empty language. | -| `new_total()` | `RegularExpression` | Create a `RegularExpression` that matches all possible strings. | -| `new_empty_string()` | `RegularExpression` | Create a `RegularExpression` that only match the empty string `""`. | -| `concat(&self, other: &RegularExpression, append_back: bool)` | `RegularExpression` | Returns a new `RegularExpression` representing the concatenation of `self` and `other`, using `append_back` to determine their order. | -| `repeat(&self, min: u32, max_opt: Option)` | `RegularExpression` | Returns the repetition of the `RegularExpression`, between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. | -| `union(&self, other: &RegularExpression)` | `RegularExpression` | Create a`RegularExpression` that only match the empty string `""`. | -| `union_all<'a, I: IntoIterator>(patterns: I)` | `RegularExpression` | Returns a `RegularExpression` formed by taking the union of all expressions in `patterns`. | -| `simplify(&self)` | `RegularExpression` | Returns a simplified version of this regular expression by eliminating redundant constructs and applying canonical reductions. | +| `concat(&self, other: &RegularExpression, append_back: bool)` | `RegularExpression` | Returns a new regular expression representing the concatenation of `self` and `other`; `append_back` determines their order. | +| `new(pattern: &str)` | `Result` | Parses the provided pattern and returns the resulting `RegularExpression`. | +| `new_empty()` | `RegularExpression` | Creates a regular expression that matches the empty language. | +| `new_empty_string()` | `RegularExpression` | Creates a regular expression that matches only the empty string `""`. | +| `new_total()` | `RegularExpression` | Creates a regular expression that matches all possible strings. | +| `repeat(&self, min: u32, max_opt: Option)` | `RegularExpression` | Returns the repetition of the expression between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. | +| `simplify(&self)` | `RegularExpression` | Returns a simplified version by eliminating redundant constructs and applying canonical reductions. | +| `union(&self, other: &RegularExpression)` | `RegularExpression` | Returns a regular expression matching the union of `self` and `other`. | +| `union_all<'a, I: IntoIterator>(patterns: I)` | `RegularExpression` | Returns a regular expression that is the union of all expressions in `patterns`. 
| #### Analyze | Method | Return | Description | | -------- | ------- | ------- | -| `is_empty(&self)` | `bool` | Checks if the current `RegularExpression` matches the empty language. | -| `is_total(&self)` | `bool` | Checks if the current `RegularExpression` matches all possible strings. | -| `is_empty_string(&self)` | `bool` | Checks if the current `RegularExpression` only match the empty string `""`. | -| `to_automaton(&self)` | `Result` | Convert the current `RegularExpression` to an equivalent `FastAutomaton`. | -| `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of the possible matched strings. | -| `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the provided term (i.e. the number of the possible matched strings). | +| `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the regular expression (i.e., the number of possible matched strings). | +| `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of possible matched strings. | +| `is_empty(&self)` | `bool` | Checks if the regular expression matches the empty language. | +| `is_empty_string(&self)` | `bool` | Checks if the regular expression only matches the empty string `""`. | +| `is_total(&self)` | `bool` | Checks if the regular expression matches all possible strings. | +| `to_automaton(&self)` | `Result` | Converts the regular expression to an equivalent `FastAutomaton`. | ## Bound Execution diff --git a/src/fast_automaton/analyze/cardinality.rs b/src/fast_automaton/analyze/cardinality.rs index 57a346a..7157bae 100644 --- a/src/fast_automaton/analyze/cardinality.rs +++ b/src/fast_automaton/analyze/cardinality.rs @@ -3,7 +3,7 @@ use std::hash::BuildHasherDefault; use super::*; impl FastAutomaton { - /// Returns the cardinality of the provided term (i.e. the number of the possible matched strings). + /// Returns the cardinality of the automaton (i.e., the number of possible matched strings). pub fn get_cardinality(&self) -> Option> { if self.is_empty() { return Some(Cardinality::Integer(0)); diff --git a/src/fast_automaton/analyze/equivalence.rs b/src/fast_automaton/analyze/equivalence.rs index 6483d68..32f2ccb 100644 --- a/src/fast_automaton/analyze/equivalence.rs +++ b/src/fast_automaton/analyze/equivalence.rs @@ -3,7 +3,7 @@ use crate::error::EngineError; use super::*; impl FastAutomaton { - /// Computes whether the current `FastAutomaton` and the given `FastAutomaton` are equivalent. Returns `true` if both automata accept the same language. + /// Returns `true` if both automata accept the same language. pub fn are_equivalent(&self, other: &FastAutomaton) -> Result { if self.is_empty() != other.is_empty() && self.is_total() != other.is_total() { return Ok(false); @@ -11,14 +11,14 @@ impl FastAutomaton { return Ok(true); } - let mut other_complement = other.determinize()?; + let mut other_complement = other.determinize()?.into_owned(); other_complement.complement()?; if self.has_intersection(&other_complement)? { return Ok(false); } - let mut self_complement = self.determinize()?; + let mut self_complement = self.determinize()?.into_owned(); self_complement.complement()?; Ok(!self_complement.has_intersection(other)?) 
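The hunk above reworks `are_equivalent` around the `Cow`-returning `determinize()`: each side is determinized, complemented, and checked for an empty intersection with the other. As a caller-side illustration of that pattern, here is a minimal sketch; the crate and module paths (`regexsolver::regex::RegularExpression`) are assumptions, and the `a+`/`a*` pair mirrors the equivalence test updated earlier in this series.

```rust
// Hedged sketch: crate and module paths are assumptions; the calls
// (`to_automaton`, `determinize`, `complement`, `has_intersection`,
// `are_equivalent`) follow the signatures shown in this patch.
use regexsolver::regex::RegularExpression;

fn main() {
    let a_plus = RegularExpression::new("a+").unwrap().to_automaton().unwrap();
    let a_star = RegularExpression::new("a*").unwrap().to_automaton().unwrap();

    // Complement-based argument, as in `are_equivalent` above:
    // "a+" has no intersection with the complement of "a*" (it is a subset)...
    let mut a_star_complement = a_star.determinize().unwrap().into_owned();
    a_star_complement.complement().unwrap();
    assert!(!a_plus.has_intersection(&a_star_complement).unwrap());

    // ...but the complement of "a+" still contains the empty string, which
    // "a*" accepts, so the two automata are not equivalent.
    let mut a_plus_complement = a_plus.determinize().unwrap().into_owned();
    a_plus_complement.complement().unwrap();
    assert!(a_star.has_intersection(&a_plus_complement).unwrap());

    // The renamed public predicate reaches the same conclusion directly.
    assert!(!a_plus.are_equivalent(&a_star).unwrap());
}
```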
diff --git a/src/fast_automaton/analyze/length.rs b/src/fast_automaton/analyze/length.rs index 638a93c..5ab7180 100644 --- a/src/fast_automaton/analyze/length.rs +++ b/src/fast_automaton/analyze/length.rs @@ -1,7 +1,7 @@ use super::*; impl FastAutomaton { - /// Returns the minimum and maximum length of the possible matched strings. + /// Returns the minimum and maximum length of matched strings. pub fn get_length(&self) -> (Option, Option) { if self.is_empty() { return (None, None); diff --git a/src/fast_automaton/analyze/mod.rs b/src/fast_automaton/analyze/mod.rs index 2f220b0..46b7c23 100644 --- a/src/fast_automaton/analyze/mod.rs +++ b/src/fast_automaton/analyze/mod.rs @@ -10,13 +10,13 @@ mod length; mod subset; impl FastAutomaton { - /// Checks if the current `FastAutomaton` matches the empty language. + /// Checks if the automaton matches the empty language. #[inline] pub fn is_empty(&self) -> bool { self.accept_states.is_empty() } - /// Checks if the current `FastAutomaton` matches all possible strings. + /// Checks if the automaton matches all possible strings. #[inline] pub fn is_total(&self) -> bool { if self.accept_states.contains(&self.start_state) { @@ -27,7 +27,7 @@ impl FastAutomaton { false } - /// Checks if the current `FastAutomaton` only match the empty string `""`. + /// Checks if the automaton only matches the empty string `""`. #[inline] pub fn is_empty_string(&self) -> bool { self.accept_states.len() == 1 @@ -35,7 +35,7 @@ impl FastAutomaton { && self.state_in_degree(self.start_state) == 0 } - /// Get a set of all reacheable states from the start state. + /// Returns the set of all states reachable from the start state. pub fn get_reacheable_states(&self) -> IntSet { let mut states_map: IntMap> = IntMap::with_capacity_and_hasher(self.transitions.len(), BuildHasherDefault::default()); diff --git a/src/fast_automaton/analyze/subset.rs b/src/fast_automaton/analyze/subset.rs index 6eb8888..e08a476 100644 --- a/src/fast_automaton/analyze/subset.rs +++ b/src/fast_automaton/analyze/subset.rs @@ -3,7 +3,7 @@ use crate::error::EngineError; use super::*; impl FastAutomaton { - /// Computes whether the current `FastAutomaton` is a subset of the given `FastAutomaton`. Returns `true` if all strings matched by the current `FastAutomaton` are also matched by the given `FastAutomaton`. + /// Returns `true` if all strings accepted by `self` are also accepted by `other`. pub fn is_subset_of(&self, other: &FastAutomaton) -> Result { if self.is_empty() || other.is_total() || self == other { return Ok(true); @@ -11,7 +11,7 @@ impl FastAutomaton { return Ok(false); } - let mut other = other.determinize()?; + let mut other = other.determinize()?.into_owned(); other.complement()?; Ok(!self.has_intersection(&other)?) diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index d6b69f4..8047152 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -5,7 +5,7 @@ use crate::error::EngineError; use super::*; impl FastAutomaton { - /// Create an automaton that matches the empty language. + /// Creates an automaton that matches the empty language. #[inline] pub fn new_empty() -> Self { Self { @@ -20,7 +20,7 @@ impl FastAutomaton { } } - /// Create an automaton that only match the empty string `""`. + /// Creates an automaton that only matches the empty string `""`. 
#[inline] pub fn new_empty_string() -> Self { let mut automaton = Self::new_empty(); @@ -28,7 +28,7 @@ impl FastAutomaton { automaton } - /// Create an automaton that matches all possible strings. + /// Creates an automaton that matches all possible strings. #[inline] pub fn new_total() -> Self { let mut automaton: FastAutomaton = Self::new_empty(); @@ -38,7 +38,7 @@ impl FastAutomaton { automaton } - /// Create an automaton that matches one of the characters in the provided `CharRange`. + /// Creates an automaton that matches one of the characters in the given `CharRange`. pub fn new_from_range(range: &CharRange) -> Result { let mut automaton = Self::new_empty(); if range.is_empty() { @@ -54,7 +54,7 @@ impl FastAutomaton { Ok(automaton) } - /// Create a new state in the automaton and returns its identifier. + /// Creates a new state and returns its identifier. #[inline] pub fn new_state(&mut self) -> State { if let Some(new_state) = self.removed_states.clone().iter().next() { @@ -66,14 +66,37 @@ impl FastAutomaton { } } - /// Make the automaton accept the provided state as a valid final state. + /// Marks the provided state as an accepting (final) state. #[inline] pub fn accept(&mut self, state: State) { self.assert_state_exists(state); self.accept_states.insert(state); } - /// Create a new transition between the two provided states with the given condition, the provided condition must follow the same spanning set as the rest of the automaton. + /// Creates a new transition with the given condition; the condition must follow the automaton’s current spanning set. + /// + /// This method accepts a `Condition` rather than a raw character set. To build a `Condition`, call: + /// ```rust,ignore + /// Condition::from_range(&range, &spanning_set); + /// ``` + /// where `spanning_set` is the automaton's current `SpanningSet`. The `CharRange` you pass must be fully covered by that spanning set. If it isn't, you have two options: + /// + /// 1. Merge an existing spanning set with another: + /// ```rust,ignore + /// let new_set = SpanningSet::merge(&old_set, &other_set); + /// ``` + /// + /// 2. Recompute from a list of ranges: + /// ```rust,ignore + /// let new_set = SpanningSet::compute_spanning_set(&[range_set1, range_set2, …]); + /// ``` + /// + /// After constructing `new_set`, apply it to the automaton: + /// ```rust,ignore + /// fast_automaton.apply_new_spanning_set(&new_set); + /// ``` + /// + /// This design allows us to perform unions, intersections, and complements of transition conditions in O(1) time, but it does add some complexity to automaton construction. For more details, you can check [this article](https://alexvbrdn.me/post/optimizing-transition-conditions-automaton-representation). pub fn add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition) { self.assert_state_exists(from_state); if from_state != to_state { @@ -111,7 +134,7 @@ impl FastAutomaton { }; } - /// Create a new epsilon transition between the two provided states. + /// Creates a new epsilon transition between the two states. pub fn add_epsilon_transition(&mut self, from_state: State, to_state: State) { if from_state == to_state { return; @@ -153,7 +176,7 @@ impl FastAutomaton { } } - /// Remove the provided state from the automaton. Remove all the transitions it is connected to. Panic if the state is used as a start state. + /// Removes the state and all its connected transitions; panics if it's a start state. 
pub fn remove_state(&mut self, state: State) { self.assert_state_exists(state); if self.start_state == state { @@ -223,7 +246,7 @@ impl FastAutomaton { } } - /// Apply the provided spanning set to the automaton and project all of its conditions on it. + /// Applies the provided spanning set and projects all existing conditions onto it. pub fn apply_new_spanning_set( &mut self, new_spanning_set: &SpanningSet, diff --git a/src/fast_automaton/convert/to_regex/mod.rs b/src/fast_automaton/convert/to_regex/mod.rs index 3e6193c..e99e506 100644 --- a/src/fast_automaton/convert/to_regex/mod.rs +++ b/src/fast_automaton/convert/to_regex/mod.rs @@ -240,9 +240,7 @@ impl StateEliminationAutomaton { } impl FastAutomaton { - /// Try to convert the current FastAutomaton to a RegularExpression. - /// If it cannot find an equivalent regex it returns None. - /// This method is still a work in progress. + /// Attempts to convert the automaton to a `RegularExpression`; returns `None` if no equivalent pattern are found. pub fn to_regex(&self) -> Option { if self.is_empty() { return Some(RegularExpression::new_empty()); @@ -359,9 +357,8 @@ mod tests { let automaton2 = RegularExpression::new("ab") .unwrap() .to_automaton() - .unwrap() - .determinize() .unwrap(); + let automaton2 = automaton2.determinize().unwrap(); let result = automaton1.subtraction(&automaton2).unwrap(); @@ -403,9 +400,8 @@ mod tests { let automaton2 = RegularExpression::new("(xxx)*") .unwrap() .to_automaton() - .unwrap() - .determinize() .unwrap(); + let automaton2 = automaton2.determinize().unwrap(); let result = automaton1.subtraction(&automaton2).unwrap(); result.to_dot(); diff --git a/src/fast_automaton/generate.rs b/src/fast_automaton/generate.rs index 7bbaf58..6cb0628 100644 --- a/src/fast_automaton/generate.rs +++ b/src/fast_automaton/generate.rs @@ -6,9 +6,10 @@ use ahash::AHashSet; use super::*; impl FastAutomaton { - pub fn generate_strings(&self, number: usize) -> Result, EngineError> { + /// Generates `count` strings matched by the automaton. + pub fn generate_strings(&self, number: usize) -> Result, EngineError> { if self.is_empty() { - return Ok(AHashSet::new()); + return Ok(Vec::new()); } let mut strings = AHashSet::with_capacity(cmp::min(number, 1000)); @@ -81,7 +82,7 @@ impl FastAutomaton { } } - Ok(strings) + Ok(strings.into_iter().collect()) } } diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index 4b2475b..30a09a9 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -98,19 +98,19 @@ impl FastAutomaton { self.transitions[state].len() } - /// Returns an iterator of the state of the automaton. + /// Returns an iterator over the automaton’s states. #[inline] pub fn all_states_iter(&self) -> impl Iterator + '_ { (0..self.transitions.len()).filter(|s| !self.removed_states.contains(s)) } - /// Returns a vector containing the states of the automaton. + /// Returns a vector containing the automaton’s states. #[inline] pub fn all_states_vec(&self) -> Vec { self.all_states_iter().collect() } - /// Returns an iterator over all states directly reachable from the given state in one transition. + /// Returns an iterator over states directly reachable from the given state in one transition. #[inline] pub fn direct_states_iter(&self, state: &State) -> impl Iterator + '_ { self.transitions[*state] @@ -119,7 +119,7 @@ impl FastAutomaton { .filter(|s| !self.removed_states.contains(s)) } - /// Returns a vector containing all states directly reachable from the given state in one transition. 
+ /// Returns a vector of states directly reachable from the given state in one transition. #[inline] pub fn direct_states_vec(&self, state: &State) -> Vec { self.direct_states_iter(state).collect() @@ -139,7 +139,7 @@ impl FastAutomaton { in_transitions } - /// Returns a vector containing the transitions from the provided state. + /// Returns a vector of transitions from the given state. #[inline] pub fn transitions_from_vec(&self, state: State) -> Vec { self.transitions[state] @@ -149,7 +149,7 @@ impl FastAutomaton { .collect() } - /// Returns an iterator containing the transitions from the provided state. + /// Returns an iterator over transitions from the given state. #[inline] pub fn transitions_from_iter( &self, @@ -161,7 +161,7 @@ impl FastAutomaton { .filter(|s| !self.removed_states.contains(s.1)) } - /// Returns a mutable iterator containing the transitions from the provided state. + /// Returns a mutable iterator over transitions from the given state. #[inline] pub fn transitions_from_iter_mut( &mut self, @@ -173,7 +173,7 @@ impl FastAutomaton { .filter(|s| !self.removed_states.contains(s.1)) } - /// Returns an owned iterator containing the transitions from the provided state. + /// Returns an owned iterator over transitions from the given state. #[inline] pub fn transitions_from_into_iter( &self, @@ -186,7 +186,7 @@ impl FastAutomaton { .filter(|(_, state)| !self.removed_states.contains(state)) } - /// Returns `true` if there is a directed transition between the two provided states. + /// Returns `true` if there is a directed transition from `from_state` to `to_state`. #[inline] pub fn does_transition_exists(&self, from_state: State, to_state: State) -> bool { if !self.has_state(from_state) || !self.has_state(to_state) { @@ -215,13 +215,13 @@ impl FastAutomaton { self.transitions.len() - self.removed_states.len() } - // Get a reference of the directed transtion's condition between the two provided states. + // Returns a reference to the condition of the directed transition between the two states, if any. #[inline] pub fn get_condition(&self, from_state: State, to_state: State) -> Option<&Condition> { self.transitions[from_state].get(&to_state) } - // Get a mutable reference of the directed transtion's condition between the two provided states. + // Returns a mutable reference to the condition of the directed transition between the two states, if any. #[inline] pub fn get_condition_mut( &mut self, @@ -231,13 +231,13 @@ impl FastAutomaton { self.transitions[from_state].get_mut(&to_state) } - /// Returns the start state of the automaton. + /// Returns the start state. #[inline] pub fn get_start_state(&self) -> State { self.start_state } - // Get a reference to the set of accept (final) states of the automaton. + // Returns a reference to the set of accept (final) states. #[inline] pub fn get_accept_states(&self) -> &IntSet { &self.accept_states @@ -249,7 +249,7 @@ impl FastAutomaton { &self.spanning_set } - /// Returns `true` if the given `state` is one of the automaton's accept states. + /// Returns `true` if the given state is one of the accept states. #[inline] pub fn is_accepted(&self, state: &State) -> bool { self.accept_states.contains(state) @@ -267,7 +267,7 @@ impl FastAutomaton { self.cyclic } - /// Returns `true` if the automaton has the provided state. + /// Returns `true` if the automaton contains the given state. 
#[inline] pub fn has_state(&self, state: State) -> bool { !(state >= self.transitions.len() || self.removed_states.contains(&state)) diff --git a/src/fast_automaton/operation/concat.rs b/src/fast_automaton/operation/concat.rs index 6fa9d1b..3ee4456 100644 --- a/src/fast_automaton/operation/concat.rs +++ b/src/fast_automaton/operation/concat.rs @@ -12,7 +12,7 @@ impl FastAutomaton { Self::concat_all([self, other]) } - /// Returns a new `FastAutomaton` that is the concatenation of all automatons in the given iterator. + /// Returns a new `FastAutomaton` representing the concatenation of all automata in the given iterator. pub fn concat_all<'a, I: IntoIterator>(automatons: I) -> Result { let mut new_automaton = FastAutomaton::new_empty_string(); diff --git a/src/fast_automaton/operation/determinize.rs b/src/fast_automaton/operation/determinize.rs index 55d8c46..8257c8f 100644 --- a/src/fast_automaton/operation/determinize.rs +++ b/src/fast_automaton/operation/determinize.rs @@ -5,10 +5,10 @@ use crate::{EngineError, execution_profile::ExecutionProfile}; use super::*; impl FastAutomaton { - /// Determinize the automaton and returns it as a new `FastAutomaton`. - pub fn determinize(&self) -> Result { + /// Determinizes the automaton and returns the result as a new `FastAutomaton`. + pub fn determinize(&self) -> Result, EngineError> { if self.deterministic { - return Ok(self.clone()); + return Ok(Cow::Borrowed(self)); } let execution_profile = ExecutionProfile::get(); @@ -66,7 +66,7 @@ impl FastAutomaton { new_states_to_add.clear(); } } - Ok(new_automaton) + Ok(Cow::Owned(new_automaton)) } fn simple_hash(list: &VecDeque) -> u64 { diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 6987c31..778e0e3 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -126,7 +126,7 @@ impl FastAutomaton { Ok(Cow::Owned(new_automaton)) } - // Returns `true` if the two automatons have a non-empty intersection. + // Returns `true` if the two automata have a non-empty intersection. pub fn has_intersection(&self, other: &FastAutomaton) -> Result { if self.is_empty() || other.is_empty() { return Ok(false); diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs index cc4fb76..8bbed81 100644 --- a/src/fast_automaton/operation/repeat.rs +++ b/src/fast_automaton/operation/repeat.rs @@ -1,7 +1,7 @@ use super::*; impl FastAutomaton { - // Returns the repetition of the automaton, between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. + // Computes the repetition of the automaton between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. pub fn repeat(&self, min: u32, max_opt: Option) -> Result { let mut automaton = self.clone(); if let Err(error) = automaton.repeat_mut(min, max_opt) { diff --git a/src/fast_automaton/operation/subtraction.rs b/src/fast_automaton/operation/subtraction.rs index e4406e8..d7adeef 100644 --- a/src/fast_automaton/operation/subtraction.rs +++ b/src/fast_automaton/operation/subtraction.rs @@ -43,7 +43,7 @@ impl FastAutomaton { Ok(()) } - /// Complement the automaton, the automaton needs to be deterministic. + /// Complements the automaton; it must be deterministic. 
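+ ///
+ /// A minimal sketch (not run as a doctest; assumes `new_total()` builds a deterministic single-state automaton, so it can be complemented directly):
+ /// ```rust,ignore
+ /// let mut automaton = FastAutomaton::new_total();
+ /// automaton.complement().unwrap();
+ /// // The complement of the total language is the empty language.
+ /// assert!(automaton.is_empty());
+ /// ```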
pub fn complement(&mut self) -> Result<(), EngineError> { self.totalize()?; diff --git a/src/fast_automaton/serializer.rs b/src/fast_automaton/serializer.rs index 017341b..d2dc30b 100644 --- a/src/fast_automaton/serializer.rs +++ b/src/fast_automaton/serializer.rs @@ -1,24 +1,24 @@ use super::*; +use crate::tokenizer::Tokenizer; use lazy_static::lazy_static; use rand::Rng; -use serde::{de, ser, Deserializer, Serializer}; use serde::{Deserialize, Serialize}; +use serde::{Deserializer, Serializer, de, ser}; use std::env; use z85::{decode, encode}; -use crate::tokenizer::Tokenizer; use sha2::{Digest, Sha256}; use aes_gcm_siv::{ - aead::{Aead, KeyInit}, Aes256GcmSiv, Nonce, + aead::{Aead, KeyInit}, }; +use flate2::Compression; use flate2::read::ZlibDecoder; use flate2::write::ZlibEncoder; -use flate2::Compression; use std::io::prelude::*; -use crate::tokenizer::token::{automaton_token::AutomatonToken, Token}; +use crate::tokenizer::token::{Token, automaton_token::AutomatonToken}; pub struct FastAutomatonReader { cipher: Aes256GcmSiv, @@ -171,7 +171,9 @@ mod tests { assert_serialization( "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,2}", ); - assert_serialization("(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])"); + assert_serialization( + "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])", + ); Ok(()) } @@ -203,9 +205,8 @@ mod tests { let automaton2 = RegularExpression::new("\\d+") .unwrap() .to_automaton() - .unwrap() - .determinize() .unwrap(); + let automaton2 = automaton2.determinize().unwrap(); let subtraction = automaton1.subtraction(&automaton2).unwrap(); @@ -219,7 +220,7 @@ mod tests { assert!(automaton.subtraction(&unserialized).unwrap().is_empty()); assert!(unserialized.subtraction(&automaton).unwrap().is_empty()); - + Ok(()) } } diff --git a/src/lib.rs b/src/lib.rs index d177120..0d92556 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -51,22 +51,22 @@ impl Display for Term { } impl Term { - /// Create a term that matches the empty language. + /// Creates a term that matches the empty language. pub fn new_empty() -> Self { Term::RegularExpression(RegularExpression::new_empty()) } - /// Create a term that matches all possible strings. + /// Creates a term that matches all possible strings. pub fn new_total() -> Self { Term::RegularExpression(RegularExpression::new_total()) } - /// Create a term that only match the empty string `""`. + /// Creates a term that only matches the empty string `""`. 
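+ ///
+ /// # Example:
+ ///
+ /// ```rust
+ /// use regexsolver::Term;
+ ///
+ /// // Minimal check: the resulting term matches only the empty string.
+ /// assert!(Term::new_empty_string().is_empty_string());
+ /// ```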
pub fn new_empty_string() -> Self { Term::RegularExpression(RegularExpression::new_empty_string()) } - /// Parse the provided pattern and return a new `Term` holding the resulting `RegularExpression`. + /// Parses the provided pattern and returns a new `Term` holding the resulting `RegularExpression`. /// /// # Example: /// @@ -79,18 +79,17 @@ impl Term { Ok(Term::RegularExpression(RegularExpression::new(pattern)?)) } - /// Create a new `Term` holding the provided `RegularExpression`. + /// Creates a new `Term` holding the provided `RegularExpression`. pub fn from_regex(regex: RegularExpression) -> Self { Term::RegularExpression(regex) } - /// Create a new `Term` holding the provided `FastAutomaton`. + /// Creates a new `Term` holding the provided `FastAutomaton`. pub fn from_automaton(automaton: FastAutomaton) -> Self { Term::Automaton(automaton) } - /// Compute the concatenation of the current term with the given list of terms. - /// Returns the resulting term. + /// Computes the concatenation of the given terms. /// /// # Example: /// @@ -143,8 +142,7 @@ impl Term { } } - /// Compute the union of the current term with the given collection of terms. - /// Returns the resulting term. + /// Computes the union of the given terms. /// /// # Example: /// @@ -206,8 +204,7 @@ impl Term { } } - /// Compute the intersection of the current term with the given collection of terms. - /// Returns the resulting term. + /// Computes the intersection of the given terms. /// /// # Example: /// @@ -244,8 +241,7 @@ impl Term { Ok(Term::Automaton(return_automaton)) } - /// Compute the subtraction of the current term and the given `subtrahend`. - /// Returns the resulting term. + /// Computes the difference between `self` and the given subtrahend. /// /// # Example: /// @@ -277,8 +273,7 @@ impl Term { self.subtraction(subtrahend) } - /// Returns the repetition of the current term, - /// between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. + /// Computes the repetition of the current term between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. /// /// # Example: /// @@ -311,7 +306,7 @@ impl Term { } } - /// Generate the given count of strings matched by the given term. + /// Generates `count` strings matched by the term. /// /// # Example: /// @@ -325,14 +320,9 @@ impl Term { /// assert_eq!(3, strings.len()); // ex: ["deabc", "dede", "abcde"] /// ``` pub fn generate_strings(&self, count: usize) -> Result, EngineError> { - Ok(self - .to_automaton()? - .generate_strings(count)? - .into_iter() - .collect()) + self.to_automaton()?.generate_strings(count) } - /// Compute whether the current term and the given term are equivalent. /// Returns `true` if both terms accept the same language. /// /// # Example: @@ -355,7 +345,6 @@ impl Term { automaton_1.are_equivalent(&automaton_2) } - /// Compute whether the current term is a subset of the given term. /// Returns `true` if all strings matched by the current term are also matched by the given term. /// /// # Example: @@ -378,7 +367,7 @@ impl Term { automaton_1.is_subset_of(&automaton_2) } - /// Check if the current term matches the empty language. + /// Checks if the term matches the empty language. pub fn is_empty(&self) -> bool { match self { Term::RegularExpression(regular_expression) => regular_expression.is_empty(), @@ -386,7 +375,7 @@ impl Term { } } - /// Check if the current term matches all possible strings. + /// Checks if the term matches all possible strings. 
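+ ///
+ /// # Example:
+ ///
+ /// ```rust
+ /// use regexsolver::Term;
+ ///
+ /// // In RegexSolver `.` matches any character, so `.*` describes the total language.
+ /// assert!(Term::from_pattern(".*").unwrap().is_total());
+ /// ```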
pub fn is_total(&self) -> bool { match self { Term::RegularExpression(regular_expression) => regular_expression.is_total(), @@ -394,7 +383,7 @@ impl Term { } } - /// Check if the current term only match the empty string `""`. + /// Checks if the term matches only the empty string `""`. pub fn is_empty_string(&self) -> bool { match self { Term::RegularExpression(regular_expression) => regular_expression.is_empty_string(), @@ -402,7 +391,7 @@ impl Term { } } - /// Returns the minimum and maximum length of the possible matched strings. + /// Returns the minimum and maximum length of matched strings. pub fn get_length(&self) -> (Option, Option) { match self { Term::RegularExpression(regex) => regex.get_length(), @@ -410,7 +399,7 @@ impl Term { } } - /// Returns the cardinality of the provided term (i.e. the number of the possible matched strings). + /// Returns the cardinality of the term (i.e., the number of possible matched strings). pub fn get_cardinality(&self) -> Result, EngineError> { match self { Term::RegularExpression(regex) => Ok(regex.get_cardinality()), @@ -430,7 +419,7 @@ impl Term { } } - /// Converts the current `Term` to a `FastAutomaton`. + /// Converts the term to a `FastAutomaton`. pub fn to_automaton(&self) -> Result, EngineError> { Ok(match self { Term::RegularExpression(regex) => Cow::Owned(regex.to_automaton()?), @@ -438,7 +427,7 @@ impl Term { }) } - /// Converts the current `Term` to a `RegularExpression`. Returns `None` if the automaton cannot be converted. + /// Converts the term to a RegularExpression; returns `None` if conversion isn’t possible. pub fn to_regex(&self) -> Option> { Some(match self { Term::RegularExpression(regex) => Cow::Borrowed(regex), @@ -446,7 +435,7 @@ impl Term { }) } - /// Converts the current `Term` to a regular expression pattern. Returns `None` if the automaton cannot be converted. + /// Converts the term to a regular expression pattern; returns `None` if conversion isn’t possible. pub fn to_pattern(&self) -> Option { Some(self.to_regex()?.to_string()) } @@ -458,9 +447,9 @@ impl Term { if subtrahend.is_determinitic() { Ok(Cow::Borrowed(subtrahend)) } else if !minuend.is_cyclic() && subtrahend.is_cyclic() { - Ok(Cow::Owned(minuend.intersection(subtrahend)?.determinize()?)) + Ok(Cow::Owned(minuend.intersection(subtrahend)?.determinize()?.into_owned())) } else { - Ok(Cow::Owned(subtrahend.determinize()?)) + Ok(subtrahend.determinize()?) } } @@ -528,10 +517,7 @@ mod tests { let result = regex1.subtraction(®ex2); assert!(result.is_ok()); let result = result.unwrap().to_pattern().unwrap(); - assert_eq!( - "a+", - result - ); + assert_eq!("a+", result); Ok(()) } @@ -573,10 +559,7 @@ mod tests { let result = regex1.intersection(&vec![regex2]); assert!(result.is_ok()); let result = result.unwrap().to_pattern().unwrap(); - assert_eq!( - "(x{3})*", - result - ); + assert_eq!("(x{3})*", result); Ok(()) } diff --git a/src/regex/analyze/mod.rs b/src/regex/analyze/mod.rs index 2b946cb..2ee4bc5 100644 --- a/src/regex/analyze/mod.rs +++ b/src/regex/analyze/mod.rs @@ -6,7 +6,7 @@ mod affixes; mod number_of_states; impl RegularExpression { - /// Returns the minimum and maximum length of the possible matched strings. + /// Returns the minimum and maximum length of possible matched strings. pub fn get_length(&self) -> (Option, Option) { match self { RegularExpression::Character(range) => { @@ -85,7 +85,7 @@ impl RegularExpression { } } - /// Returns the cardinality of the provided term (i.e. the number of the possible matched strings). 
+ /// Returns the cardinality of the regular expression (i.e., the number of possible matched strings). pub fn get_cardinality(&self) -> Cardinality { if self.is_empty() { return Cardinality::Integer(0); @@ -227,7 +227,7 @@ mod tests { let mut automaton = regex.to_automaton().unwrap(); if !automaton.is_cyclic() { - automaton = automaton.determinize().unwrap(); + automaton = automaton.determinize().unwrap().into_owned(); } //automaton.to_dot(); diff --git a/src/regex/builder.rs b/src/regex/builder.rs index 1b8f636..ae66725 100644 --- a/src/regex/builder.rs +++ b/src/regex/builder.rs @@ -11,7 +11,7 @@ lazy_static! { } impl RegularExpression { - /// Parses the provided pattern and return the resulting `RegularExpression`. + /// Parses the provided pattern and returns the resulting `RegularExpression`. pub fn new(pattern: &str) -> Result { if pattern.is_empty() { return Ok(RegularExpression::new_empty_string()); @@ -33,7 +33,7 @@ impl RegularExpression { RE_FLAG_DETECTION.replace_all(regex, "").to_string() } - /// Create a `RegularExpression` that matches all possible strings. + /// Creates a regular expression that matches all possible strings. pub fn new_total() -> Self { RegularExpression::Repetition( Box::new(RegularExpression::Character(CharRange::total())), @@ -42,12 +42,12 @@ impl RegularExpression { ) } - /// Create a `RegularExpression` that matches the empty language. + /// Creates a regular expression that matches the empty language. pub fn new_empty() -> Self { RegularExpression::Character(CharRange::empty()) } - /// Create a`RegularExpression` that only match the empty string `""`. + /// Creates a regular expression that matches only the empty string `""`. pub fn new_empty_string() -> Self { RegularExpression::Concat(VecDeque::new()) } diff --git a/src/regex/mod.rs b/src/regex/mod.rs index 5c7ca16..ba569e8 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -90,7 +90,7 @@ impl Display for RegularExpression { } impl RegularExpression { - /// Checks if the current `RegularExpression` matches the empty language. + /// Checks if the regular expression matches the empty language. pub fn is_empty(&self) -> bool { match self { RegularExpression::Alternation(alternation) => alternation.is_empty(), @@ -99,7 +99,7 @@ impl RegularExpression { } } - /// Checks if the current `RegularExpression` matches all possible strings. + /// Checks if the regular expression only matches the empty string `""`. pub fn is_empty_string(&self) -> bool { match self { RegularExpression::Concat(concat) => concat.is_empty(), @@ -107,7 +107,7 @@ impl RegularExpression { } } - /// Checks if the current `RegularExpression` only match the empty string `""`. + /// Checks if the regular expression matches all possible strings. pub fn is_total(&self) -> bool { match self { RegularExpression::Repetition(regular_expression, min, max_opt) => { @@ -124,7 +124,7 @@ impl RegularExpression { } } - /// Convert the current `RegularExpression` to an equivalent `FastAutomaton`. + /// Converts the regular expression to an equivalent `FastAutomaton`. 
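+ ///
+ /// A minimal sketch (not run as a doctest; error handling elided):
+ /// ```rust,ignore
+ /// let automaton = RegularExpression::new("a|b").unwrap().to_automaton().unwrap();
+ /// assert!(!automaton.is_empty());
+ /// ```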
pub fn to_automaton(&self) -> Result { ExecutionProfile::get().assert_max_number_of_states(self.get_number_of_states_in_nfa())?; diff --git a/src/regex/operation/concat.rs b/src/regex/operation/concat.rs index ac699d8..fe83c46 100644 --- a/src/regex/operation/concat.rs +++ b/src/regex/operation/concat.rs @@ -1,7 +1,7 @@ use super::*; impl RegularExpression { - /// Returns a new `RegularExpression` representing the concatenation of `self` and `other`, using `append_back` to determine their order. + /// Returns a new regular expression representing the concatenation of `self` and `other`; `append_back` determines their order. pub fn concat(&self, other: &RegularExpression, append_back: bool) -> RegularExpression { if self.is_empty() || other.is_empty() { return RegularExpression::new_empty(); diff --git a/src/regex/operation/repeat.rs b/src/regex/operation/repeat.rs index 7da36bb..181724f 100644 --- a/src/regex/operation/repeat.rs +++ b/src/regex/operation/repeat.rs @@ -1,7 +1,7 @@ use super::*; impl RegularExpression { - /// Returns the repetition of the `RegularExpression`, between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. + /// Returns the repetition of the expression between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. pub fn repeat(&self, min: u32, max_opt: Option) -> RegularExpression { if self.is_total() { return RegularExpression::new_total(); diff --git a/src/regex/operation/simplify.rs b/src/regex/operation/simplify.rs index 51f66b7..5156ce8 100644 --- a/src/regex/operation/simplify.rs +++ b/src/regex/operation/simplify.rs @@ -1,7 +1,7 @@ use super::*; impl RegularExpression { - /// Returns a simplified version of this regular expression by eliminating redundant constructs and applying canonical reductions. + /// Returns a simplified version by eliminating redundant constructs and applying canonical reductions. pub fn simplify(&self) -> Self { match self { RegularExpression::Character(_) => self.clone(), diff --git a/src/regex/operation/union.rs b/src/regex/operation/union.rs index 1c09e78..8e4f1f3 100644 --- a/src/regex/operation/union.rs +++ b/src/regex/operation/union.rs @@ -3,12 +3,12 @@ use std::collections::BTreeSet; use super::*; impl RegularExpression { - /// Returns a new `RegularExpression` representing the union of this expression with `other`. + /// Returns a regular expression matching the union of `self` and `other`. pub fn union(&self, other: &RegularExpression) -> RegularExpression { Self::union_all([self, other]) } - /// Returns a `RegularExpression` formed by taking the union of all expressions in `patterns`. + /// Returns a regular expression that is the union of all expressions in `patterns`. 
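+ ///
+ /// A minimal sketch (not run as a doctest; the exact rendering of the resulting pattern may differ):
+ /// ```rust,ignore
+ /// let a = RegularExpression::new("abc").unwrap();
+ /// let b = RegularExpression::new("def").unwrap();
+ /// let union = RegularExpression::union_all([&a, &b]); // e.g. "(abc|def)"
+ /// ```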
pub fn union_all<'a, I: IntoIterator>(patterns: I) -> RegularExpression { let mut result: Cow<'a, RegularExpression> = Cow::Owned(RegularExpression::new_empty()); diff --git a/src/tokenizer/embed_automaton.rs b/src/tokenizer/embed_automaton.rs index 0838525..40d0fcb 100644 --- a/src/tokenizer/embed_automaton.rs +++ b/src/tokenizer/embed_automaton.rs @@ -1,6 +1,6 @@ use token::TokenError; -use crate::{error::EngineError, fast_automaton::condition::Condition, CharRange}; +use crate::{CharRange, error::EngineError, fast_automaton::condition::Condition}; use self::token::range_token::RangeToken; @@ -162,7 +162,8 @@ mod tests { let regex = RegularExpression::new(regex).unwrap(); println!("{}", regex); - let automaton = regex.to_automaton().unwrap().determinize().unwrap(); + let automaton = regex.to_automaton().unwrap(); + let automaton = automaton.determinize().unwrap(); let tokenizer = Tokenizer::new(&automaton); let embedding = tokenizer.to_embedding(); diff --git a/src/tokenizer/embed_regex.rs b/src/tokenizer/embed_regex.rs index fe73cab..d9e6892 100644 --- a/src/tokenizer/embed_regex.rs +++ b/src/tokenizer/embed_regex.rs @@ -270,7 +270,8 @@ mod tests { let regex = RegularExpression::new(regex).unwrap(); println!("{}", regex); - let automaton = regex.to_automaton().unwrap().determinize().unwrap(); + let automaton = regex.to_automaton().unwrap(); + let automaton = automaton.determinize().unwrap(); //automaton.to_dot(); let tokenizer = Tokenizer::new(&automaton); From eb79826c1ac920f832c619c561d62673fde4bddd Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sun, 3 Aug 2025 16:12:59 +0200 Subject: [PATCH 14/44] fix bench --- benches/my_benchmark.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/benches/my_benchmark.rs b/benches/my_benchmark.rs index f2f9fdc..c35164a 100644 --- a/benches/my_benchmark.rs +++ b/benches/my_benchmark.rs @@ -1,4 +1,3 @@ -use ahash::AHashSet; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use regexsolver::{fast_automaton::FastAutomaton, regex::RegularExpression}; @@ -11,14 +10,14 @@ fn to_regex(automaton: &FastAutomaton) -> RegularExpression { } fn determinize(automaton: &FastAutomaton) -> FastAutomaton { - automaton.determinize().unwrap() + automaton.determinize().unwrap().into_owned() } fn intersection(automaton_1: &FastAutomaton, automaton_2: &FastAutomaton) -> FastAutomaton { automaton_1.intersection(automaton_2).unwrap() } -fn generate_strings(automaton: &FastAutomaton) -> AHashSet { +fn generate_strings(automaton: &FastAutomaton) -> Vec { automaton.generate_strings(2000).unwrap() } From 691a9727d834bc2c103670c15b53396aaab65284 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sun, 3 Aug 2025 16:13:07 +0200 Subject: [PATCH 15/44] fix docs test --- src/fast_automaton/builder.rs | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index 8047152..f2011e4 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -76,23 +76,37 @@ impl FastAutomaton { /// Creates a new transition with the given condition; the condition must follow the automaton’s current spanning set. /// /// This method accepts a `Condition` rather than a raw character set. 
To build a `Condition`, call: - /// ```rust,ignore + /// ```rust + /// # use regexsolver::CharRange; + /// # use regexsolver::fast_automaton::{condition::Condition, spanning_set::SpanningSet}; + /// # let range = CharRange::total(); + /// # let spanning_set = SpanningSet::new_total(); /// Condition::from_range(&range, &spanning_set); /// ``` /// where `spanning_set` is the automaton's current `SpanningSet`. The `CharRange` you pass must be fully covered by that spanning set. If it isn't, you have two options: /// /// 1. Merge an existing spanning set with another: - /// ```rust,ignore + /// ```rust + /// # use regexsolver::fast_automaton::spanning_set::SpanningSet; + /// # let old_set = SpanningSet::new_total(); + /// # let other_set = SpanningSet::new_total(); /// let new_set = SpanningSet::merge(&old_set, &other_set); /// ``` /// /// 2. Recompute from a list of ranges: - /// ```rust,ignore - /// let new_set = SpanningSet::compute_spanning_set(&[range_set1, range_set2, …]); + /// ```rust + /// # use regexsolver::CharRange; + /// # use regexsolver::fast_automaton::spanning_set::SpanningSet; + /// # let range_set1 = CharRange::total(); + /// # let range_set2 = CharRange::total(); + /// let new_set = SpanningSet::compute_spanning_set(&[range_set1, range_set2]); /// ``` /// /// After constructing `new_set`, apply it to the automaton: - /// ```rust,ignore + /// ```rust + /// # use regexsolver::fast_automaton::{FastAutomaton, spanning_set::SpanningSet}; + /// # let mut fast_automaton = FastAutomaton::new_total(); + /// # let new_set = SpanningSet::new_total(); /// fast_automaton.apply_new_spanning_set(&new_set); /// ``` /// From 4fe1d94be02c850117b644f17db5131e7092e390 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sun, 3 Aug 2025 16:22:23 +0200 Subject: [PATCH 16/44] update docs --- src/lib.rs | 157 +++++++++++++++++++++-------------------------------- 1 file changed, 61 insertions(+), 96 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 0d92556..75051db 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -30,6 +30,60 @@ pub type CharRange = RangeSet; /// Represents a term that can be either a regular expression or a finite automaton. This term can be manipulated with a wide range of operations. 
/// +/// ```rust +/// use regexsolver::Term; +/// +/// // Create terms from regex +/// let t1 = Term::from_pattern("abc.*").unwrap(); +/// let t2 = Term::from_pattern(".*xyz").unwrap(); +/// +/// // Concatenate +/// let concat = t1.concat(&[t2]).unwrap(); +/// assert_eq!(concat.to_pattern().unwrap(), "abc.*xyz"); +/// +/// // Union +/// let union = t1.union(&[Term::from_pattern("fgh").unwrap()]).unwrap(); +/// assert_eq!(union.to_pattern().unwrap(), "(abc.*|fgh)"); +/// +/// // Intersection +/// let inter = Term::from_pattern("(ab|xy){2}") +/// .unwrap() +/// .intersection(&[Term::from_pattern(".*xy").unwrap()]) +/// .unwrap(); // (ab|xy)xy +/// assert_eq!(inter.to_pattern().unwrap(), "(ab|xy)xy"); +/// +/// // Subtraction +/// let diff = Term::from_pattern("a*") +/// .unwrap() +/// .subtraction(&Term::from_pattern("").unwrap()) +/// .unwrap(); +/// assert_eq!(diff.to_pattern().unwrap(), "a+"); +/// +/// // Repetition +/// let rep = Term::from_pattern("abc") +/// .unwrap() +/// .repeat(2, Some(4)) +/// .unwrap(); +/// assert_eq!(rep.to_pattern().unwrap(), "(abc){2,4}"); +/// +/// // Analyze +/// assert_eq!(rep.get_length(), (Some(6), Some(12))); +/// assert!(!rep.is_empty()); +/// +/// // Generate examples +/// let samples = Term::from_pattern("(x|y){1,3}") +/// .unwrap() +/// .generate_strings(5) +/// .unwrap(); +/// println!("Some matches: {:?}", samples); +/// +/// // Equivalence & subset +/// let a = Term::from_pattern("a+").unwrap(); +/// let b = Term::from_pattern("a*").unwrap(); +/// assert!(!a.are_equivalent(&b).unwrap()); +/// assert!(a.is_subset_of(&b).unwrap()); +/// ``` +/// /// To put constraint and limitation on the execution of operations please refer to [`execution_profile::ExecutionProfile`]. #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Clone, PartialEq, Eq, Debug)] @@ -447,7 +501,12 @@ impl Term { if subtrahend.is_determinitic() { Ok(Cow::Borrowed(subtrahend)) } else if !minuend.is_cyclic() && subtrahend.is_cyclic() { - Ok(Cow::Owned(minuend.intersection(subtrahend)?.determinize()?.into_owned())) + Ok(Cow::Owned( + minuend + .intersection(subtrahend)? + .determinize()? + .into_owned(), + )) } else { Ok(subtrahend.determinize()?) 
} @@ -494,7 +553,7 @@ impl Term { #[cfg(test)] mod tests { - use crate::{execution_profile::ExecutionProfileBuilder, regex::RegularExpression}; + use crate::regex::RegularExpression; use super::*; @@ -563,98 +622,4 @@ mod tests { Ok(()) } - - #[test] - fn test_readme_code_1() -> Result<(), String> { - // Create terms from regex - let t1 = Term::from_pattern("abc.*").unwrap(); - let t2 = Term::from_pattern(".*xyz").unwrap(); - - // Concatenate - let concat = t1.concat(&[t2]).unwrap(); - assert_eq!(concat.to_pattern().unwrap(), "abc.*xyz"); - - // Union - let union = t1.union(&[Term::from_pattern("fgh").unwrap()]).unwrap(); // (abc.*|fgh) - assert_eq!(union.to_pattern().unwrap(), "(abc.*|fgh)"); - - // Intersection - let inter = Term::from_pattern("(ab|xy){2}") - .unwrap() - .intersection(&[Term::from_pattern(".*xy").unwrap()]) - .unwrap(); // (ab|xy)xy - assert_eq!(inter.to_pattern().unwrap(), "(ab|xy)xy"); - - // Subtraction - let diff = Term::from_pattern("a*") - .unwrap() - .subtraction(&Term::from_pattern("").unwrap()) - .unwrap(); - assert_eq!(diff.to_pattern().unwrap(), "a+"); - - // Repetition - let rep = Term::from_pattern("abc") - .unwrap() - .repeat(2, Some(4)) - .unwrap(); // (abc){2,4} - assert_eq!(rep.to_pattern().unwrap(), "(abc){2,4}"); - - // Analyze - assert_eq!(rep.get_length(), (Some(6), Some(12))); - assert!(!rep.is_empty()); - - // Generate examples - let samples = Term::from_pattern("(x|y){1,3}") - .unwrap() - .generate_strings(5) - .unwrap(); - println!("Some matches: {:?}", samples); - - // Equivalence & subset - let a = Term::from_pattern("a+").unwrap(); - let b = Term::from_pattern("a*").unwrap(); - assert!(!a.are_equivalent(&b).unwrap()); - assert!(a.is_subset_of(&b).unwrap()); - - Ok(()) - } - - #[test] - fn test_readme_code_2() -> Result<(), String> { - let term = Term::from_pattern(".*abc.*cdef.*sqdsqf.*").unwrap(); - - let execution_profile = ExecutionProfileBuilder::new() - .execution_timeout(5) // We set the limit (5ms) - .build(); - - // We run the operation with the defined limitation - execution_profile.run(|| { - assert_eq!( - EngineError::OperationTimeOutError, - term.generate_strings(1000).unwrap_err() - ); - }); - - Ok(()) - } - - #[test] - fn test_readme_code_3() -> Result<(), String> { - let term1 = Term::from_pattern(".*abcdef.*").unwrap(); - let term2 = Term::from_pattern(".*defabc.*").unwrap(); - - let execution_profile = ExecutionProfileBuilder::new() - .max_number_of_states(5) // We set the limit - .build(); - - // We run the operation with the defined limitation - execution_profile.run(|| { - assert_eq!( - EngineError::AutomatonHasTooManyStates, - term1.intersection(&[term2]).unwrap_err() - ); - }); - - Ok(()) - } } From a42c87a9ec1925e6d2194944fdc10cab1a0cd714 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Mon, 4 Aug 2025 15:23:20 +0200 Subject: [PATCH 17/44] Update README.md --- README.md | 164 ++++++++++++++++++++++++++---------------------------- 1 file changed, 80 insertions(+), 84 deletions(-) diff --git a/README.md b/README.md index 3b1084c..4406477 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # RegexSolver [![Crates.io Version](https://img.shields.io/crates/v/regexsolver)](https://crates.io/crates/regexsolver) -**RegexSolver** is a high-performance Rust library for building, combining, and analyzing regular expressions and finite automata. Ideal for constraint solvers, code generators, test-case generators, and any use case requiring rich regex/automaton operations. 
+**RegexSolver** is a high-performance Rust library for building, combining, and analyzing regular expressions and finite automata. Ideal for constraint solvers, code or test-case generators, and any system needing rich regex or automaton operations. ## Table of Contents @@ -29,75 +29,72 @@ regexsolver = "1" ```rust use regexsolver::Term; - -// Create terms from regex -let t1 = Term::from_pattern("abc.*").unwrap(); -let t2 = Term::from_pattern(".*xyz").unwrap(); - -// Concatenate -let concat = t1.concat(&[t2]).unwrap(); -assert_eq!(concat.to_pattern().unwrap(), "abc.*xyz"); - -// Union -let union = t1.union(&[Term::from_pattern("fgh").unwrap()]).unwrap(); -assert_eq!(union.to_pattern().unwrap(), "(abc.*|fgh)"); - -// Intersection -let inter = Term::from_pattern("(ab|xy){2}") - .unwrap() - .intersection(&[Term::from_pattern(".*xy").unwrap()]) - .unwrap(); // (ab|xy)xy -assert_eq!(inter.to_pattern().unwrap(), "(ab|xy)xy"); - -// Subtraction -let diff = Term::from_pattern("a*") - .unwrap() - .subtraction(&Term::from_pattern("").unwrap()) - .unwrap(); -assert_eq!(diff.to_pattern().unwrap(), "a+"); - -// Repetition -let rep = Term::from_pattern("abc") - .unwrap() - .repeat(2, Some(4)) - .unwrap(); -assert_eq!(rep.to_pattern().unwrap(), "(abc){2,4}"); - -// Analyze -assert_eq!(rep.get_length(), (Some(6), Some(12))); -assert!(!rep.is_empty()); - -// Generate examples -let samples = Term::from_pattern("(x|y){1,3}") - .unwrap() - .generate_strings(5) - .unwrap(); -println!("Some matches: {:?}", samples); - -// Equivalence & subset -let a = Term::from_pattern("a+").unwrap(); -let b = Term::from_pattern("a*").unwrap(); -assert!(!a.are_equivalent(&b).unwrap()); -assert!(a.is_subset_of(&b).unwrap()); +use regexsolver::error::EngineError; + +fn main() -> Result<(), EngineError> { + // Create terms from regex + let t1 = Term::from_pattern("abc.*")?; + let t2 = Term::from_pattern(".*xyz")?; + + // Concatenate + let concat = t1.concat(&[t2])?; + assert_eq!(concat.to_pattern().unwrap(), "abc.*xyz"); + + // Union + let union = t1.union(&[Term::from_pattern("fgh")?])?; + assert_eq!(union.to_pattern().unwrap(), "(abc.*|fgh)"); + + // Intersection + let inter = Term::from_pattern("(ab|xy){2}")? + .intersection(&[Term::from_pattern(".*xy")?])?; + assert_eq!(inter.to_pattern().unwrap(), "(ab|xy)xy"); + + // Difference + let diff = Term::from_pattern("a*")? + .difference(&Term::from_pattern("")?)?; + assert_eq!(diff.to_pattern().unwrap(), "a+"); + + // Repetition + let rep = Term::from_pattern("abc")? + .repeat(2, Some(4))?; + assert_eq!(rep.to_pattern().unwrap(), "(abc){2,4}"); + + // Analyze + assert_eq!(rep.get_length(), (Some(6), Some(12))); + assert!(!rep.is_empty()); + + // Generate examples + let samples = Term::from_pattern("(x|y){1,3}")? + .generate_strings(5)?; + println!("Some matches: {:?}", samples); + + // Equivalence & subset + let a = Term::from_pattern("a+")?; + let b = Term::from_pattern("a*")?; + assert!(!a.are_equivalent(&b)?); + assert!(a.is_subset_of(&b)?); + + Ok(()) +} ``` ## Key Concepts & Limitations RegexSolver supports a subset of regular expressions that adhere to the principles of regular languages. Here are the key characteristics and limitations of the regular expressions supported by RegexSolver: - **Anchored Expressions:** All regular expressions in RegexSolver are anchored. This means that the expressions are treated as if they start and end at the boundaries of the input text. For example, the expression `abc` will match the string "abc" but not "xabc" or "abcx". 
-- **Lookahead/Lookbehind:** RegexSolver does not support lookahead (`(?=...)`) or lookbehind (`(?<=...)`) assertions. Using them would return an error. +- **Lookahead/Lookbehind:** RegexSolver does not support lookahead (`(?=...)`) or lookbehind (`(?<=...)`) assertions. Using them returns an error. - **Greedy/Ungreedy Quantifiers:** The concept of ungreedy (`*?`, `+?`, `??`) quantifiers is not supported. All quantifiers are treated as greedy. For example, `a*` or `a*?` will match the longest possible sequence of "a"s. -- **Line Feed and Dot:** RegexSolver handle every characters the same way. The dot character `.` matches every possible unicode characters including the line feed (`\n`). +- **Line Feed and Dot:** RegexSolver handles all characters the same way. The dot `.` matches any Unicode character including line feed (`\n`). - **Pure Regular Expressions:** RegexSolver focuses on pure regular expressions as defined in regular language theory. This means features that extend beyond regular languages, such as backreferences (`\1`, `\2`, etc.), are not supported. Any use of backreference would return an error. -- **Empty Regular Expressions:** An empty regular expression is denoted by `[]`, which represents a pattern that matches no input, not even an empty string. +- **Empty Regular Expressions:** The empty language (matches no string) is represented by constructs like `[]` (empty character class). This is distinct from the empty string. -RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/regex_syntax/) library for parsing patterns. As a result, unsupported features supported by the parser will be parsed but ignored. This allows for some flexibility in writing regular expressions, but it is important to be aware of the unsupported features to avoid unexpected behavior. +RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/regex_syntax/) library for parsing patterns. Unsupported features are parsed but ignored; they do not raise an error unless they affect semantics that cannot be represented (e.g., backreferences). This allows for some flexibility in writing regular expressions, but it is important to be aware of the unsupported features to avoid unexpected behavior. ## API ### Term -`Term` is an enum designed to represent either a regular expression or a compiled automaton. This unified representation enables seamless and efficient execution of set operations across multiple instances. It's particularly valuable when working with both regular expressions and automata, allowing operations to be performed transparently regardless of the underlying representation. +`Term` is an enum designed to represent either a regular expression or a compiled automaton. This unified representation enables seamless and efficient execution of set operations across multiple instances. It's particularly valuable when working with both regular expressions and automata, allowing operations to be performed transparently regardless of the underlying representation. #### Build | Method | Return | Description | @@ -113,10 +110,9 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re | Method | Return | Description | | -------- | ------- | ------- | | `concat(&self, terms: &[Term])` | `Result` | Computes the concatenation of the given terms. | -| `difference(&self, subtrahend: &Term)` | `Result` | Alias for `subtraction`. | +| `difference(&self, other: &Term)` | `Result` | Computes the difference between `self` and `other`. 
| | `intersection(&self, terms: &[Term])` | `Result` | Computes the intersection of the given terms. | | `repeat(&self, min: u32, max_opt: Option)` | `Result` | Computes the repetition of the current term between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. | -| `subtraction(&self, subtrahend: &Term)` | `Result` | Computes the difference between `self` and the given subtrahend. | | `union(&self, terms: &[Term])` | `Result` | Computes the union of the given terms. | #### Analyze @@ -137,7 +133,7 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re ### FastAutomaton -`FastAutomaton` is used to directly build, manipulate and analyze automata. To convert an automaton to a `RegularExpression` the method `to_regex()` can be used. Not all automaton can be converted to a regular expression. +`FastAutomaton` is used to directly build, manipulate and analyze automata. To convert an automaton to a `RegularExpression` the method `to_regex()` can be used. Not all automata can be converted to a regular expression. When building or modifying an automaton you might come to use the method `add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition)`. This method accepts a `Condition` rather than a raw character set. To build a `Condition`, call: ```rust @@ -165,33 +161,33 @@ This design allows us to perform unions, intersections, and complements of trans #### Build | Method | Return | Description | | -------- | ------- | ------- | -| `accept(&mut self, state: State)` | | Marks the provided state as an accepting (final) state. | -| `add_epsilon_transition(&mut self, from_state: State, to_state: State)` | | Creates a new epsilon transition between the two states. | -| `add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition)` | | Creates a new transition with the given condition; the condition must follow the automaton’s current spanning set. | +| `accept(&mut self, state: State)` | `()` | Marks the provided state as an accepting (final) state. | +| `add_epsilon_transition(&mut self, from_state: State, to_state: State)` | `()` | Creates a new epsilon transition between the two states. | +| `add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition)` | `()` | Creates a new transition with the given condition; the condition must follow the automaton’s current spanning set. | | `apply_new_spanning_set(&mut self, new_spanning_set: &SpanningSet)` | `Result<(), EngineError>` | Applies the provided spanning set and projects all existing conditions onto it. | | `new_empty()` | `FastAutomaton` | Creates an automaton that matches the empty language. | | `new_empty_string()` | `FastAutomaton` | Creates an automaton that only matches the empty string `""`. | | `new_from_range(range: &CharRange)` | `Result` | Creates an automaton that matches one of the characters in the given `CharRange`. | | `new_state(&mut self)` | `State` | Creates a new state and returns its identifier. | | `new_total()` | `FastAutomaton` | Creates an automaton that matches all possible strings. | -| `remove_state(&mut self, state: State)` | | Removes the state and all its connected transitions; panics if it's a start state. | -| `remove_states(&mut self, states: &IntSet)` | | Removes the given states and their connected transitions; panics if any is a start state. | +| `remove_state(&mut self, state: State)` | `()` | Removes the state and all its connected transitions; panics if it's a start state. 
| +| `remove_states(&mut self, states: &IntSet)` | `()` | Removes the given states and their connected transitions; panics if any is a start state. | #### Manipulate | Method | Return | Description | | -------- | ------- | ------- | | `complement(&mut self)` | `Result<(), EngineError>` | Complements the automaton; it must be deterministic. | -| `concat(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the concatenation of `self` and `other`. | -| `concat_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` representing the concatenation of all automata in the given iterator. | -| `determinize(&self)` | `Result, EngineError>` | Determinizes the automaton and returns the result as a new `FastAutomaton`. | -| `intersection(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the intersection of `self` and `other`. | -| `intersection_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the intersection of all automatons in the given iterator. | -| `intersection_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the intersection of all automatons in the given parallel iterator. | +| `concat(&self, other: &FastAutomaton)` | `Result` | Computes the concatenation between `self` and `other`. | +| `concat_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Computes the concatenation of all automatons in the given iterator. | +| `determinize(&self)` | `Result, EngineError>` | Determinizes the automaton and returns the result. | +| `difference(&self, other: &FastAutomaton)` | `Result` | Computes the difference between `self` and `other`. | +| `intersection(&self, other: &FastAutomaton)` | `Result` | Computes the intersection between `self` and `other`. | +| `intersection_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Computes the intersection of all automatons in the given iterator. | +| `intersection_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Computes in parallel the intersection of all automatons in the given iterator. | | `repeat(&self, min: u32, max_opt: Option)` | `Result` | Computes the repetition of the automaton between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. | -| `subtraction(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the substraction of `self` and `other`. | -| `union(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the union of `self` and `other`. | -| `union_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the union of all automatons in the given iterator. | -| `union_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the union of all automatons in the given parallel iterator. | +| `union(&self, other: &FastAutomaton)` | `Result` | Computes the union between `self` and `other`. | +| `union_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Computes the union of all automatons in the given iterator. | +| `union_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Computes in parallel the union of all automatons in the given iterator. 
| #### Analyze | Method | Return | Description | @@ -208,14 +204,14 @@ This design allows us to perform unions, intersections, and complements of trans | `get_condition(&self, from_state: State, to_state: State)` | `Option<&Condition>` | Returns a reference to the condition of the directed transition between the two states, if any. | | `get_condition_mut(&mut self, from_state: State, to_state: State)` | `Option<&Condition>` | Returns a mutable reference to the condition of the directed transition between the two states, if any. | | `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of matched strings. | -| `get_reacheable_states(&self)` | `IntSet` | Returns the set of all states reachable from the start state. | +| `get_reachable_states(&self)` | `IntSet` | Returns the set of all states reachable from the start state. | | `get_spanning_set(&self)` | `&SpanningSet` | Returns a reference to the automaton's spanning set. | | `get_start_state(&self)` | `State` | Returns the start state. | | `has_intersection(&self, other: &FastAutomaton)` | `Result` | Returns `true` if the two automata have a non-empty intersection. | | `has_state(&self, state: State)` | `bool` | Returns `true` if the automaton contains the given state. | | `is_accepted(&self, state: &State)` | `bool` | Returns `true` if the given state is one of the accept states. | | `is_cyclic(&self)` | `bool` | Returns `true` if the automaton contains at least one cycle. | -| `is_determinitic(&self)` | `bool` | Returns `true` if the automaton is deterministic. | +| `is_deterministic(&self)` | `bool` | Returns `true` if the automaton is deterministic. | | `is_empty(&self)` | `bool` | Checks if the automaton matches the empty language. | | `is_empty_string(&self)` | `bool` | Checks if the automaton only matches the empty string `""`. | | `is_subset_of(&self, other: &FastAutomaton)` | `Result` | Returns `true` if all strings accepted by `self` are also accepted by `other`. | @@ -232,7 +228,7 @@ This design allows us to perform unions, intersections, and complements of trans ### RegularExpression -`RegularExpression` is used to directly build, manipulate and analyze regular expression patterns. Not all the set operations are available, for more advanced operation such as intersection, subtraction/difference and complement it is necessary to convert in to a `FastAutomaton` with the method `to_automaton()`. +`RegularExpression` is used to directly build, manipulate and analyze regular expression patterns. Not all the set operations are available, for more advanced operation such as intersection, subtraction/difference and complement it is necessary to convert into a `FastAutomaton` with the method `to_automaton()`. #### Build | Method | Return | Description | @@ -259,17 +255,17 @@ This design allows us to perform unions, intersections, and complements of trans ## Bound Execution -By default, all operations run without limits. For heavy or untrusted patterns, use a thread local `ExecutionProfile` to cap execution time and maximum number of states in used automata. +Use a thread-local `ExecutionProfile` to cap runtime or state explosion; hitting a limit returns a specific `EngineError`. 
### Time-Bounded Execution ```rust use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; -let term = Term::from_pattern(".*abc.*cdef.*sqdsqf.*").unwrap(); +let term = Term::from_pattern(".*abc.*cdef.*sqdsqf.*")?; let execution_profile = ExecutionProfileBuilder::new() - .execution_timeout(5) // We set the limit (5ms) + .execution_timeout(5) // limit in milliseconds .build(); // We run the operation with the defined limitation @@ -283,11 +279,11 @@ execution_profile.run(|| { ```rust use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; -let term1 = Term::from_pattern(".*abcdef.*").unwrap(); -let term2 = Term::from_pattern(".*defabc.*").unwrap(); +let term1 = Term::from_pattern(".*abcdef.*")?; +let term2 = Term::from_pattern(".*defabc.*")?; let execution_profile = ExecutionProfileBuilder::new() - .max_number_of_states(5) // We set the limit + .max_number_of_states(5) // we set the limit .build(); // We run the operation with the defined limitation @@ -304,7 +300,7 @@ If you want to use this library with other programming languages, we provide a w - [regexsolver-js](https://github.com/RegexSolver/regexsolver-js) - [regexsolver-python](https://github.com/RegexSolver/regexsolver-python) -For more information about how to use the wrappers, you can refer to our [getting started guide](https://docs.regexsolver.com/getting-started.html). +For more information about how to use the wrappers, you can refer to our [guide](https://docs.regexsolver.com/getting-started.html). ## License From aec5c39f78279aa22aa8cd63208aebaef8b598e9 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Mon, 4 Aug 2025 22:05:21 +0200 Subject: [PATCH 18/44] Update naming and docs --- Cargo.toml | 10 +- src/cardinality/mod.rs | 6 +- src/error/mod.rs | 3 + src/execution_profile.rs | 2 +- src/fast_automaton/builder.rs | 6 +- .../condition/fast_bit_vec/mod.rs | 6 +- src/fast_automaton/convert/to_regex/mod.rs | 8 +- src/fast_automaton/mod.rs | 2 +- src/fast_automaton/operation/concat.rs | 4 +- src/fast_automaton/operation/determinize.rs | 4 +- .../{subtraction.rs => difference.rs} | 4 +- src/fast_automaton/operation/intersection.rs | 6 +- src/fast_automaton/operation/mod.rs | 2 +- src/fast_automaton/operation/union.rs | 6 +- src/fast_automaton/serializer.rs | 122 ++++------------- src/fast_automaton/spanning_set/mod.rs | 16 +-- src/lib.rs | 128 +++++++++--------- src/regex/analyze/number_of_states.rs | 2 +- src/regex/builder.rs | 2 +- src/regex/mod.rs | 4 +- src/tokenizer/embed_automaton.rs | 4 +- tests/integration_tests.rs | 5 - 22 files changed, 139 insertions(+), 213 deletions(-) rename src/fast_automaton/operation/{subtraction.rs => difference.rs} (92%) diff --git a/Cargo.toml b/Cargo.toml index c691486..eb32825 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,15 +6,13 @@ authors = ["Alexandre van Beurden"] repository = "https://github.com/RegexSolver/regexsolver" license = "MIT" keywords = ["automaton", "intersection", "union", "difference", "regex"] -description = "Manipulate regex and automaton as if they were sets." 
+description = "High-performance Rust library for building, combining, and analyzing regular expressions and finite automata" readme = "README.md" [dependencies] serde = { version = "1.0", features = ["derive"], optional = true } ciborium = { version = "0.2.2", optional = true } z85 = { version = "3.0.5", optional = true } -aes-gcm-siv = { version = "0.11.1", optional = true } -sha2 = { version = "0.10.8", optional = true } flate2 = { version = "1.0.30", features = [ "zlib-ng", ], default-features = false, optional = true } @@ -35,14 +33,12 @@ serde_json = "1.0.114" [features] -default = ["serde"] -serde = [ +default = [] +serializable = [ "regex-charclass/serde", "dep:serde", "dep:ciborium", "dep:z85", - "dep:aes-gcm-siv", - "dep:sha2", "dep:flate2", ] diff --git a/src/cardinality/mod.rs b/src/cardinality/mod.rs index 08131e0..9adad1c 100644 --- a/src/cardinality/mod.rs +++ b/src/cardinality/mod.rs @@ -1,10 +1,10 @@ -#[cfg(feature = "serde")] +#[cfg(feature = "serializable")] use serde::{Deserialize, Serialize}; /// Represent a number. -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "serializable", derive(Serialize, Deserialize))] #[derive(PartialEq, Eq, Debug, Clone)] -#[cfg_attr(feature = "serde", serde(tag = "type", content = "value"))] +#[cfg_attr(feature = "serializable", serde(tag = "type", content = "value"))] pub enum Cardinality { /// An infinite number. Infinite, diff --git a/src/error/mod.rs b/src/error/mod.rs index 303c225..b960147 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -1,5 +1,6 @@ use std::fmt::{self}; +#[cfg(feature = "serializable")] use crate::tokenizer::token::TokenError; /// An error thrown by the engine. @@ -19,6 +20,7 @@ pub enum EngineError { ConditionInvalidRange, /// The provided index is out of bound of the condition. ConditionIndexOutOfBound, + #[cfg(feature = "serializable")] /// There is an error with one of the token. TokenError(TokenError), /// Computing the cardinality of the provided automaton failed. @@ -37,6 +39,7 @@ impl fmt::Display for EngineError { write!(f, "The automaton has too many states.") } EngineError::RegexSyntaxError(err) => write!(f, "{err}."), + #[cfg(feature = "serializable")] EngineError::TokenError(err) => write!(f, "{err}."), EngineError::ConditionInvalidRange => write!( f, diff --git a/src/execution_profile.rs b/src/execution_profile.rs index 76c1b78..cda0e11 100644 --- a/src/execution_profile.rs +++ b/src/execution_profile.rs @@ -131,7 +131,7 @@ impl ExecutionProfile { result } - /// Like [`run`], but does *not* reset its start time. Useful if you want to pass a profile state to a new thread. + /// Like [`ExecutionProfile::run`], but does *not* reset its start time. Useful if you want to pass a profile state to a new thread. pub fn apply(&self, f: F) -> R where F: FnOnce() -> R, diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index f2011e4..8bcc136 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -38,7 +38,7 @@ impl FastAutomaton { automaton } - /// Creates an automaton that matches one of the characters in the given `CharRange`. + /// Creates an automaton that matches one of the characters in the given [`CharRange`]. pub fn new_from_range(range: &CharRange) -> Result { let mut automaton = Self::new_empty(); if range.is_empty() { @@ -75,7 +75,7 @@ impl FastAutomaton { /// Creates a new transition with the given condition; the condition must follow the automaton’s current spanning set. 
/// - /// This method accepts a `Condition` rather than a raw character set. To build a `Condition`, call: + /// This method accepts a [`Condition`] rather than a raw character set. To build a [`Condition`], call: /// ```rust /// # use regexsolver::CharRange; /// # use regexsolver::fast_automaton::{condition::Condition, spanning_set::SpanningSet}; @@ -83,7 +83,7 @@ impl FastAutomaton { /// # let spanning_set = SpanningSet::new_total(); /// Condition::from_range(&range, &spanning_set); /// ``` - /// where `spanning_set` is the automaton's current `SpanningSet`. The `CharRange` you pass must be fully covered by that spanning set. If it isn't, you have two options: + /// where `spanning_set` is the automaton's current [`SpanningSet`]. The [`CharRange`] you pass must be fully covered by that spanning set. If it isn't, you have two options: /// /// 1. Merge an existing spanning set with another: /// ```rust diff --git a/src/fast_automaton/condition/fast_bit_vec/mod.rs b/src/fast_automaton/condition/fast_bit_vec/mod.rs index 82b0ead..a1b46c5 100644 --- a/src/fast_automaton/condition/fast_bit_vec/mod.rs +++ b/src/fast_automaton/condition/fast_bit_vec/mod.rs @@ -124,10 +124,10 @@ impl FastBitVec { } pub fn get_bits(&self) -> Vec { - let mut hot_bits = Vec::with_capacity(self.n); + let mut bits = Vec::with_capacity(self.n); for i in 0..self.n { - hot_bits.push(self.get(i).unwrap()); + bits.push(self.get(i).unwrap()); } - hot_bits + bits } } diff --git a/src/fast_automaton/convert/to_regex/mod.rs b/src/fast_automaton/convert/to_regex/mod.rs index e99e506..e8e7e8e 100644 --- a/src/fast_automaton/convert/to_regex/mod.rs +++ b/src/fast_automaton/convert/to_regex/mod.rs @@ -240,7 +240,7 @@ impl StateEliminationAutomaton { } impl FastAutomaton { - /// Attempts to convert the automaton to a `RegularExpression`; returns `None` if no equivalent pattern are found. + /// Attempts to convert the automaton to a [`RegularExpression`]; returns `None` if no equivalent pattern are found. 
pub fn to_regex(&self) -> Option { if self.is_empty() { return Some(RegularExpression::new_empty()); @@ -360,7 +360,7 @@ mod tests { .unwrap(); let automaton2 = automaton2.determinize().unwrap(); - let result = automaton1.subtraction(&automaton2).unwrap(); + let result = automaton1.difference(&automaton2).unwrap(); result.to_dot(); @@ -403,7 +403,7 @@ mod tests { .unwrap(); let automaton2 = automaton2.determinize().unwrap(); - let result = automaton1.subtraction(&automaton2).unwrap(); + let result = automaton1.difference(&automaton2).unwrap(); result.to_dot(); let result = result.to_regex().unwrap(); @@ -451,7 +451,7 @@ mod tests { .determinize() .unwrap(); - let result = automaton1.subtraction(&automaton2).unwrap(); + let result = automaton1.difference(&automaton2).unwrap(); result.to_dot(); let result = result.to_regex().unwrap(); diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index 30a09a9..908ed4e 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -26,7 +26,7 @@ pub mod condition; mod convert; mod generate; mod operation; -#[cfg(feature = "serde")] +#[cfg(feature = "serializable")] mod serializer; pub mod spanning_set; diff --git a/src/fast_automaton/operation/concat.rs b/src/fast_automaton/operation/concat.rs index 3ee4456..45654d3 100644 --- a/src/fast_automaton/operation/concat.rs +++ b/src/fast_automaton/operation/concat.rs @@ -7,12 +7,12 @@ use crate::error::EngineError; use super::*; impl FastAutomaton { - /// Returns a new `FastAutomaton` representing the concatenation of `self` and `other`. + /// Computes the concatenation between `self` and `other`. pub fn concat(&self, other: &FastAutomaton) -> Result { Self::concat_all([self, other]) } - /// Returns a new `FastAutomaton` representing the concatenation of all automata in the given iterator. + /// Computes the concatenation of all automatons in the given iterator. pub fn concat_all<'a, I: IntoIterator>(automatons: I) -> Result { let mut new_automaton = FastAutomaton::new_empty_string(); diff --git a/src/fast_automaton/operation/determinize.rs b/src/fast_automaton/operation/determinize.rs index 8257c8f..734f622 100644 --- a/src/fast_automaton/operation/determinize.rs +++ b/src/fast_automaton/operation/determinize.rs @@ -5,7 +5,7 @@ use crate::{EngineError, execution_profile::ExecutionProfile}; use super::*; impl FastAutomaton { - /// Determinizes the automaton and returns the result as a new `FastAutomaton`. + /// Determinizes the automaton and returns the result. pub fn determinize(&self) -> Result, EngineError> { if self.deterministic { return Ok(Cow::Borrowed(self)); @@ -128,7 +128,7 @@ mod tests { assert!(deterministic_automaton.is_determinitic()); assert!( automaton - .subtraction(&deterministic_automaton) + .difference(&deterministic_automaton) .unwrap() .is_empty() ); diff --git a/src/fast_automaton/operation/subtraction.rs b/src/fast_automaton/operation/difference.rs similarity index 92% rename from src/fast_automaton/operation/subtraction.rs rename to src/fast_automaton/operation/difference.rs index d7adeef..a7b8ecf 100644 --- a/src/fast_automaton/operation/subtraction.rs +++ b/src/fast_automaton/operation/difference.rs @@ -59,8 +59,8 @@ impl FastAutomaton { Ok(()) } - /// Returns a new `FastAutomaton` representing the substraction of `self` and `other`. - pub fn subtraction(&self, other: &FastAutomaton) -> Result { + /// Computes the difference between `self` and `other`. 
+ pub fn difference(&self, other: &FastAutomaton) -> Result { let mut complement = other.clone(); match complement.complement() { Ok(()) => self.intersection(&complement), diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 778e0e3..4f42859 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -12,12 +12,12 @@ use crate::{ use super::*; impl FastAutomaton { - /// Returns a new `FastAutomaton` representing the intersection of `self` and `other`. + /// Computes the intersection between `self` and `other`. pub fn intersection(&self, other: &FastAutomaton) -> Result { FastAutomaton::intersection_all([self, other]) } - /// Returns a new `FastAutomaton` that is the intersection of all automatons in the given iterator. + /// Computes the intersection of all automatons in the given iterator. pub fn intersection_all<'a, I: IntoIterator>(automatons: I) -> Result { let mut result: Cow<'a, FastAutomaton> = Cow::Owned(FastAutomaton::new_total()); @@ -33,7 +33,7 @@ impl FastAutomaton { Ok(result.into_owned()) } - /// Returns a new `FastAutomaton` that is the intersection of all automatons in the given parallel iterator. + /// Computes in parallel the intersection of all automatons in the given iterator. pub fn intersection_all_par<'a, I: IntoParallelIterator>(automatons: I) -> Result { let execution_profile = ExecutionProfile::get(); diff --git a/src/fast_automaton/operation/mod.rs b/src/fast_automaton/operation/mod.rs index a574a0e..54bcba3 100644 --- a/src/fast_automaton/operation/mod.rs +++ b/src/fast_automaton/operation/mod.rs @@ -8,7 +8,7 @@ mod union; mod concat; mod determinize; mod intersection; -mod subtraction; +mod difference; mod repeat; impl FastAutomaton { diff --git a/src/fast_automaton/operation/union.rs b/src/fast_automaton/operation/union.rs index d83e7de..e71f346 100644 --- a/src/fast_automaton/operation/union.rs +++ b/src/fast_automaton/operation/union.rs @@ -8,12 +8,12 @@ use crate::{error::EngineError, execution_profile::ExecutionProfile}; use super::*; impl FastAutomaton { - /// Returns a new `FastAutomaton` representing the union of `self` and `other`. + /// Computes the union between `self` and `other`. pub fn union(&self, other: &FastAutomaton) -> Result { Self::union_all([self, other]) } - /// Returns a new `FastAutomaton` that is the union of all automatons in the given iterator. + /// Computes the union of all automatons in the given iterator. pub fn union_all<'a, I: IntoIterator>(automatons: I) -> Result { let mut new_automaton = FastAutomaton::new_empty(); @@ -23,7 +23,7 @@ impl FastAutomaton { Ok(new_automaton) } - /// Returns a new `FastAutomaton` that is the union of all automatons in the given parallel iterator. + /// Computes in parallel the union of all automatons in the given iterator. 
pub fn union_all_par<'a, I: IntoParallelIterator>(automatons: I) -> Result { let execution_profile = ExecutionProfile::get(); diff --git a/src/fast_automaton/serializer.rs b/src/fast_automaton/serializer.rs index d2dc30b..29c41e3 100644 --- a/src/fast_automaton/serializer.rs +++ b/src/fast_automaton/serializer.rs @@ -1,18 +1,10 @@ use super::*; use crate::tokenizer::Tokenizer; -use lazy_static::lazy_static; -use rand::Rng; use serde::{Deserialize, Serialize}; use serde::{Deserializer, Serializer, de, ser}; -use std::env; -use z85::{decode, encode}; -use sha2::{Digest, Sha256}; +use z85::{decode, encode}; -use aes_gcm_siv::{ - Aes256GcmSiv, Nonce, - aead::{Aead, KeyInit}, -}; use flate2::Compression; use flate2::read::ZlibDecoder; use flate2::write::ZlibEncoder; @@ -20,34 +12,6 @@ use std::io::prelude::*; use crate::tokenizer::token::{Token, automaton_token::AutomatonToken}; -pub struct FastAutomatonReader { - cipher: Aes256GcmSiv, -} - -impl FastAutomatonReader { - pub fn new() -> Self { - let env_var = env::var("RS_FAIR_SECRET_KEY").unwrap_or("DEFAULT PASSKEY".to_string()); - let key = Sha256::digest(env_var.as_bytes()); - FastAutomatonReader { - cipher: Aes256GcmSiv::new(&key), - } - } - - pub fn random_nonce() -> [u8; 12] { - let mut nonce = [0u8; 12]; - rand::thread_rng().fill(&mut nonce); - nonce - } -} - -lazy_static! { - static ref SINGLETON_INSTANCE: FastAutomatonReader = FastAutomatonReader::new(); -} - -fn get_fast_automaton_reader() -> &'static FastAutomatonReader { - &SINGLETON_INSTANCE -} - #[derive(Serialize, Deserialize, Debug)] struct SerializedAutomaton(Vec, SpanningSet); @@ -67,22 +31,7 @@ impl serde::Serialize for FastAutomaton { return Err(ser::Error::custom(err.to_string())); } - serialized = compress_data(&serialized); - - let nonce = FastAutomatonReader::random_nonce(); - - match get_fast_automaton_reader() - .cipher - .encrypt(Nonce::from_slice(&nonce), serialized.as_ref()) - { - Ok(ciphertext) => { - let mut encrypted = Vec::from_iter(nonce); - encrypted.extend(ciphertext); - - serializer.serialize_str(&encode(&encrypted)) - } - Err(err) => Err(ser::Error::custom(err.to_string())), - } + serializer.serialize_str(&encode(compress_data(&serialized))) } Err(err) => Err(ser::Error::custom(err.to_string())), } @@ -96,38 +45,27 @@ impl<'de> serde::Deserialize<'de> for FastAutomaton { { match String::deserialize(deserializer) { Ok(decoded) => match decode(decoded) { - Ok(encrypted) => { - let nonce = &encrypted[0..12]; - let payload = encrypted[12..].to_vec(); - let cipher_result = get_fast_automaton_reader() - .cipher - .decrypt(Nonce::from_slice(nonce), payload.as_ref()); - - match cipher_result { - Ok(cipher_result) => { - let decrypted = decompress_data(&cipher_result); - - let automaton: Result< - SerializedAutomaton, - ciborium::de::Error, - > = ciborium::from_reader(&decrypted[..]); - match automaton { - Ok(automaton) => { - let mut temp_automaton = FastAutomaton::new_empty(); - temp_automaton.spanning_set = automaton.1; - let tokenizer = Tokenizer::new(&temp_automaton); - - match tokenizer.from_embedding( - &automaton - .0 - .into_iter() - .map(AutomatonToken::from_fair_token) - .collect::>(), - ) { - Ok(res) => Ok(res), - Err(err) => Err(de::Error::custom(err.to_string())), - } - } + Ok(compressed) => { + let payload = decompress_data(&compressed); + + let automaton: Result< + SerializedAutomaton, + ciborium::de::Error, + > = ciborium::from_reader(&payload[..]); + match automaton { + Ok(automaton) => { + let mut temp_automaton = FastAutomaton::new_empty(); + 
temp_automaton.spanning_set = automaton.1; + let tokenizer = Tokenizer::new(&temp_automaton); + + match tokenizer.from_embedding( + &automaton + .0 + .into_iter() + .map(AutomatonToken::from_fair_token) + .collect::>(), + ) { + Ok(res) => Ok(res), Err(err) => Err(de::Error::custom(err.to_string())), } } @@ -192,8 +130,8 @@ mod tests { let unserialized = unserialized.determinize().unwrap(); let automaton = automaton.determinize().unwrap(); - assert!(automaton.subtraction(&unserialized).unwrap().is_empty()); - assert!(unserialized.subtraction(&automaton).unwrap().is_empty()); + assert!(automaton.difference(&unserialized).unwrap().is_empty()); + assert!(unserialized.difference(&automaton).unwrap().is_empty()); } #[test] @@ -208,18 +146,18 @@ mod tests { .unwrap(); let automaton2 = automaton2.determinize().unwrap(); - let subtraction = automaton1.subtraction(&automaton2).unwrap(); + let difference = automaton1.difference(&automaton2).unwrap(); - let serialized = serde_json::to_string(&subtraction).unwrap(); + let serialized = serde_json::to_string(&difference).unwrap(); println!("{serialized}"); let unserialized: FastAutomaton = serde_json::from_str(&serialized).unwrap(); let unserialized = unserialized.determinize().unwrap(); - let automaton = subtraction.determinize().unwrap(); + let automaton = difference.determinize().unwrap(); - assert!(automaton.subtraction(&unserialized).unwrap().is_empty()); - assert!(unserialized.subtraction(&automaton).unwrap().is_empty()); + assert!(automaton.difference(&unserialized).unwrap().is_empty()); + assert!(unserialized.difference(&automaton).unwrap().is_empty()); Ok(()) } diff --git a/src/fast_automaton/spanning_set/mod.rs b/src/fast_automaton/spanning_set/mod.rs index bfaefcb..bdb9d9c 100644 --- a/src/fast_automaton/spanning_set/mod.rs +++ b/src/fast_automaton/spanning_set/mod.rs @@ -2,13 +2,13 @@ use std::slice::Iter; use ahash::AHashSet; -#[cfg(feature = "serde")] +#[cfg(feature = "serializable")] use serde::{Deserialize, Serialize}; use crate::CharRange; /// Contains a set of [`CharRange`] that span all the transition of a [`crate::FastAutomaton`]. 
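The spanning-set reduction patched further down in this file works by repeatedly splitting any two overlapping ranges into their intersection plus the two one-sided differences, until every range is disjoint. A minimal sketch of a single splitting step, assuming only the `regex-charclass` operations already used elsewhere in this patch (`new_from_range`, `intersection`, `difference`, `is_empty`):

```rust
use regex_charclass::char::Char;
use regexsolver::CharRange;

fn main() {
    // Two overlapping transition ranges: [a-f] and [d-z].
    let r1 = CharRange::new_from_range(Char::new('a')..=Char::new('f'));
    let r2 = CharRange::new_from_range(Char::new('d')..=Char::new('z'));

    // One splitting step: keep the shared part and the two leftovers.
    // The three pieces are pairwise disjoint and together cover r1 and r2.
    let shared = r1.intersection(&r2); // [d-f]
    let only_r1 = r1.difference(&r2);  // [a-c]
    let only_r2 = r2.difference(&r1);  // [g-z]

    assert!(!shared.is_empty());
    assert!(!only_r1.is_empty());
    assert!(!only_r2.is_empty());
}
```

Iterating this step until no two ranges overlap yields the disjoint spanning ranges against which every transition condition can then be expressed.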
-#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "serializable", derive(Serialize, Deserialize))] #[derive(Clone, Debug, PartialEq, Eq)] pub struct SpanningSet(Vec, CharRange); @@ -91,13 +91,13 @@ impl SpanningSet { let other_set = spanning_ranges.swap_remove(index); let intersection_set = set.intersection(&other_set); new_spanning_ranges.insert(intersection_set); - let subtraction_set = set.difference(&other_set); - if !subtraction_set.is_empty() { - new_spanning_ranges.insert(subtraction_set); + let difference_set = set.difference(&other_set); + if !difference_set.is_empty() { + new_spanning_ranges.insert(difference_set); } - let subtraction_set = other_set.difference(&set); - if !subtraction_set.is_empty() { - new_spanning_ranges.insert(subtraction_set); + let difference_set = other_set.difference(&set); + if !difference_set.is_empty() { + new_spanning_ranges.insert(difference_set); } changed = true; } else if !set.is_empty() { diff --git a/src/lib.rs b/src/lib.rs index 75051db..9dd5328 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,7 +12,7 @@ use nohash_hasher::NoHashHasher; use rayon::prelude::*; use regex::RegularExpression; use regex_charclass::{char::Char, irange::RangeSet}; -#[cfg(feature = "serde")] +#[cfg(feature = "serializable")] use serde::{Deserialize, Serialize}; use crate::execution_profile::ExecutionProfile; @@ -22,6 +22,7 @@ pub mod error; pub mod execution_profile; pub mod fast_automaton; pub mod regex; +#[cfg(feature = "serializable")] pub mod tokenizer; pub type IntMap = HashMap>>; @@ -30,68 +31,67 @@ pub type CharRange = RangeSet; /// Represents a term that can be either a regular expression or a finite automaton. This term can be manipulated with a wide range of operations. /// +/// # Example /// ```rust /// use regexsolver::Term; +/// use regexsolver::error::EngineError; /// -/// // Create terms from regex -/// let t1 = Term::from_pattern("abc.*").unwrap(); -/// let t2 = Term::from_pattern(".*xyz").unwrap(); +/// fn main() -> Result<(), EngineError> { +/// // Create terms from regex +/// let t1 = Term::from_pattern("abc.*")?; +/// let t2 = Term::from_pattern(".*xyz")?; /// -/// // Concatenate -/// let concat = t1.concat(&[t2]).unwrap(); -/// assert_eq!(concat.to_pattern().unwrap(), "abc.*xyz"); +/// // Concatenate +/// let concat = t1.concat(&[t2])?; +/// assert_eq!(concat.to_pattern().unwrap(), "abc.*xyz"); /// -/// // Union -/// let union = t1.union(&[Term::from_pattern("fgh").unwrap()]).unwrap(); -/// assert_eq!(union.to_pattern().unwrap(), "(abc.*|fgh)"); +/// // Union +/// let union = t1.union(&[Term::from_pattern("fgh")?])?; +/// assert_eq!(union.to_pattern().unwrap(), "(abc.*|fgh)"); /// -/// // Intersection -/// let inter = Term::from_pattern("(ab|xy){2}") -/// .unwrap() -/// .intersection(&[Term::from_pattern(".*xy").unwrap()]) -/// .unwrap(); // (ab|xy)xy -/// assert_eq!(inter.to_pattern().unwrap(), "(ab|xy)xy"); +/// // Intersection +/// let inter = Term::from_pattern("(ab|xy){2}")? +/// .intersection(&[Term::from_pattern(".*xy")?])?; +/// assert_eq!(inter.to_pattern().unwrap(), "(ab|xy)xy"); /// -/// // Subtraction -/// let diff = Term::from_pattern("a*") -/// .unwrap() -/// .subtraction(&Term::from_pattern("").unwrap()) -/// .unwrap(); -/// assert_eq!(diff.to_pattern().unwrap(), "a+"); +/// // Difference +/// let diff = Term::from_pattern("a*")? 
+/// .difference(&Term::from_pattern("")?)?; +/// assert_eq!(diff.to_pattern().unwrap(), "a+"); /// -/// // Repetition -/// let rep = Term::from_pattern("abc") -/// .unwrap() -/// .repeat(2, Some(4)) -/// .unwrap(); -/// assert_eq!(rep.to_pattern().unwrap(), "(abc){2,4}"); +/// // Repetition +/// let rep = Term::from_pattern("abc")? +/// .repeat(2, Some(4))?; +/// assert_eq!(rep.to_pattern().unwrap(), "(abc){2,4}"); /// -/// // Analyze -/// assert_eq!(rep.get_length(), (Some(6), Some(12))); -/// assert!(!rep.is_empty()); +/// // Analyze +/// assert_eq!(rep.get_length(), (Some(6), Some(12))); +/// assert!(!rep.is_empty()); /// -/// // Generate examples -/// let samples = Term::from_pattern("(x|y){1,3}") -/// .unwrap() -/// .generate_strings(5) -/// .unwrap(); -/// println!("Some matches: {:?}", samples); +/// // Generate examples +/// let samples = Term::from_pattern("(x|y){1,3}")? +/// .generate_strings(5)?; +/// println!("Some matches: {:?}", samples); /// -/// // Equivalence & subset -/// let a = Term::from_pattern("a+").unwrap(); -/// let b = Term::from_pattern("a*").unwrap(); -/// assert!(!a.are_equivalent(&b).unwrap()); -/// assert!(a.is_subset_of(&b).unwrap()); +/// // Equivalence & subset +/// let a = Term::from_pattern("a+")?; +/// let b = Term::from_pattern("a*")?; +/// assert!(!a.are_equivalent(&b)?); +/// assert!(a.is_subset_of(&b)?); +/// +/// Ok(()) +/// } +/// # main(); /// ``` /// -/// To put constraint and limitation on the execution of operations please refer to [`execution_profile::ExecutionProfile`]. -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +/// To put constraint and limitation on the execution of operations please refer to [`ExecutionProfile`]. +#[cfg_attr(feature = "serializable", derive(Serialize, Deserialize))] #[derive(Clone, PartialEq, Eq, Debug)] -#[cfg_attr(feature = "serde", serde(tag = "type", content = "value"))] +#[cfg_attr(feature = "serializable", serde(tag = "type", content = "value"))] pub enum Term { - #[cfg_attr(feature = "serde", serde(rename = "regex"))] + #[cfg_attr(feature = "serializable", serde(rename = "regex"))] RegularExpression(RegularExpression), - #[cfg_attr(feature = "serde", serde(rename = "fair"))] + #[cfg_attr(feature = "serializable", serde(rename = "fair"))] Automaton(FastAutomaton), } @@ -120,7 +120,7 @@ impl Term { Term::RegularExpression(RegularExpression::new_empty_string()) } - /// Parses the provided pattern and returns a new `Term` holding the resulting `RegularExpression`. + /// Parses the provided pattern and returns a new `Term` holding the resulting [`RegularExpression`]. /// /// # Example: /// @@ -133,12 +133,12 @@ impl Term { Ok(Term::RegularExpression(RegularExpression::new(pattern)?)) } - /// Creates a new `Term` holding the provided `RegularExpression`. + /// Creates a new `Term` holding the provided [`RegularExpression`]. pub fn from_regex(regex: RegularExpression) -> Self { Term::RegularExpression(regex) } - /// Creates a new `Term` holding the provided `FastAutomaton`. + /// Creates a new `Term` holding the provided [`FastAutomaton`]. pub fn from_automaton(automaton: FastAutomaton) -> Self { Term::Automaton(automaton) } @@ -295,7 +295,7 @@ impl Term { Ok(Term::Automaton(return_automaton)) } - /// Computes the difference between `self` and the given subtrahend. + /// Computes the difference between `self` and `other`. 
/// /// # Example: /// @@ -305,28 +305,22 @@ impl Term { /// let term1 = Term::from_pattern("(abc|de)").unwrap(); /// let term2 = Term::from_pattern("de").unwrap(); /// - /// let subtraction = term1.subtraction(&term2).unwrap(); + /// let difference = term1.difference(&term2).unwrap(); /// - /// if let Term::RegularExpression(regex) = subtraction { + /// if let Term::RegularExpression(regex) = difference { /// assert_eq!("abc", regex.to_string()); /// } /// ``` - pub fn subtraction(&self, subtrahend: &Term) -> Result { + pub fn difference(&self, other: &Term) -> Result { let minuend_automaton = self.to_automaton()?; - let subtrahend_automaton = subtrahend.to_automaton()?; + let subtrahend_automaton = other.to_automaton()?; let subtrahend_automaton = Self::determinize_subtrahend(&minuend_automaton, &subtrahend_automaton)?; - let return_automaton = minuend_automaton.subtraction(&subtrahend_automaton)?; + let return_automaton = minuend_automaton.difference(&subtrahend_automaton)?; Ok(Term::Automaton(return_automaton)) } - /// See [`Self::subtraction`]. - #[inline] - pub fn difference(&self, subtrahend: &Term) -> Result { - self.subtraction(subtrahend) - } - /// Computes the repetition of the current term between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. /// /// # Example: @@ -473,7 +467,7 @@ impl Term { } } - /// Converts the term to a `FastAutomaton`. + /// Converts the term to a [`FastAutomaton`]. pub fn to_automaton(&self) -> Result, EngineError> { Ok(match self { Term::RegularExpression(regex) => Cow::Owned(regex.to_automaton()?), @@ -569,11 +563,11 @@ mod tests { } #[test] - fn test_subtraction_1() -> Result<(), String> { + fn test_difference_1() -> Result<(), String> { let regex1 = Term::from_pattern("a*").unwrap(); let regex2 = Term::from_pattern("").unwrap(); - let result = regex1.subtraction(®ex2); + let result = regex1.difference(®ex2); assert!(result.is_ok()); let result = result.unwrap().to_pattern().unwrap(); assert_eq!("a+", result); @@ -582,11 +576,11 @@ mod tests { } #[test] - fn test_subtraction_2() -> Result<(), String> { + fn test_difference_2() -> Result<(), String> { let regex1 = Term::from_pattern("x*").unwrap(); let regex2 = Term::from_pattern("(xxx)*").unwrap(); - let result = regex1.subtraction(®ex2); + let result = regex1.difference(®ex2); assert!(result.is_ok()); let result = result.unwrap().to_regex().unwrap().into_owned(); assert_eq!( diff --git a/src/regex/analyze/number_of_states.rs b/src/regex/analyze/number_of_states.rs index e7460f8..8325456 100644 --- a/src/regex/analyze/number_of_states.rs +++ b/src/regex/analyze/number_of_states.rs @@ -162,7 +162,7 @@ impl AbstractNFAMetadata { } impl RegularExpression { - pub fn get_number_of_states_in_nfa(&self) -> usize { + pub(crate) fn get_number_of_states_in_nfa(&self) -> usize { self.evaluate_number_of_states_in_nfa().number_of_states } diff --git a/src/regex/builder.rs b/src/regex/builder.rs index ae66725..799c69d 100644 --- a/src/regex/builder.rs +++ b/src/regex/builder.rs @@ -11,7 +11,7 @@ lazy_static! { } impl RegularExpression { - /// Parses the provided pattern and returns the resulting `RegularExpression`. + /// Parses the provided pattern and returns the resulting [`RegularExpression`]. 
pub fn new(pattern: &str) -> Result { if pattern.is_empty() { return Ok(RegularExpression::new_empty_string()); diff --git a/src/regex/mod.rs b/src/regex/mod.rs index ba569e8..05908c0 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -11,7 +11,7 @@ use super::*; mod analyze; mod builder; mod operation; -#[cfg(feature = "serde")] +#[cfg(feature = "serializable")] mod serializer; /// Represent a regular expression. @@ -124,7 +124,7 @@ impl RegularExpression { } } - /// Converts the regular expression to an equivalent `FastAutomaton`. + /// Converts the regular expression to an equivalent [`FastAutomaton`]. pub fn to_automaton(&self) -> Result { ExecutionProfile::get().assert_max_number_of_states(self.get_number_of_states_in_nfa())?; diff --git a/src/tokenizer/embed_automaton.rs b/src/tokenizer/embed_automaton.rs index 40d0fcb..49733a0 100644 --- a/src/tokenizer/embed_automaton.rs +++ b/src/tokenizer/embed_automaton.rs @@ -179,13 +179,13 @@ mod tests { assert!( automaton - .subtraction(&unembedded_automaton) + .difference(&unembedded_automaton) .unwrap() .is_empty() ); assert!( unembedded_automaton - .subtraction(&automaton) + .difference(&automaton) .unwrap() .is_empty() ); diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index d1dd407..4dd7e47 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -16,11 +16,6 @@ fn assert_regex(regex: &str) { assert!(re.is_match(&string), "'{string}'"); } - assert_eq!( - automaton.get_number_of_states(), - regex.get_number_of_states_in_nfa() - ); - let determinized_automaton = automaton.determinize().unwrap(); let strings = determinized_automaton.generate_strings(500).unwrap(); for string in strings { From 19aef3f59013bf405b4a6c0d30e194a6182ca5e3 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Mon, 4 Aug 2025 22:13:16 +0200 Subject: [PATCH 19/44] improve test --- src/lib.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 9dd5328..c81d84d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -552,12 +552,13 @@ mod tests { use super::*; #[test] - fn test_details() -> Result<(), String> { + fn test_intersection() -> Result<(), String> { let regex1 = Term::from_pattern("a").unwrap(); let regex2 = Term::from_pattern("b").unwrap(); - let details = regex1.intersection(&vec![regex2]); - assert!(details.is_ok()); + let intersection = regex1.intersection(&vec![regex2]).unwrap(); + assert!(intersection.is_empty()); + assert_eq!("[]", intersection.to_pattern().unwrap()); Ok(()) } From 6878356d84714a0e3cd46d1f8f0d554798f896f5 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Wed, 6 Aug 2025 21:45:12 +0200 Subject: [PATCH 20/44] Fix bad repetition case --- src/fast_automaton/operation/repeat.rs | 27 +++++++++++-- tests/data/regex-todo.txt | 5 +++ tests/data/regex.txt | 53 +++++++++++++++++++++++++- 3 files changed, 80 insertions(+), 5 deletions(-) create mode 100644 tests/data/regex-todo.txt diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs index 8bbed81..b49207b 100644 --- a/src/fast_automaton/operation/repeat.rs +++ b/src/fast_automaton/operation/repeat.rs @@ -27,9 +27,7 @@ impl FastAutomaton { self.accept(new_state); } - for to_state in self.direct_states_vec(&self.start_state) { - self.add_epsilon_transition(new_state, to_state); - } + self.add_epsilon_transition(new_state, self.start_state); self.start_state = new_state; if 
max_opt.is_none() { @@ -63,7 +61,8 @@ impl FastAutomaton { && automaton_to_repeat.state_out_degree(accept_state) == 0 && automaton_to_repeat.state_in_degree(automaton_to_repeat.start_state) == 0 { - automaton_to_repeat.add_epsilon_transition(accept_state, automaton_to_repeat.start_state); + automaton_to_repeat + .add_epsilon_transition(accept_state, automaton_to_repeat.start_state); let old_start_state = automaton_to_repeat.start_state; automaton_to_repeat.start_state = accept_state; automaton_to_repeat.remove_state(old_start_state); @@ -106,3 +105,23 @@ impl FastAutomaton { Ok(()) } } + +#[cfg(test)] +mod tests { + use crate::regex::RegularExpression; + + #[test] + fn test_repeat_1() -> Result<(), String> { + let automaton = RegularExpression::new("(a*,a*)?") + .unwrap() + .to_automaton() + .unwrap(); + assert!(automaton.match_string("")); + assert!(automaton.match_string(",")); + assert!(automaton.match_string("aaa,")); + assert!(automaton.match_string("aaaa,aa")); + assert!(!automaton.match_string("a")); + assert!(!automaton.match_string("aa")); + Ok(()) + } +} diff --git a/tests/data/regex-todo.txt b/tests/data/regex-todo.txt new file mode 100644 index 0000000..1d97d13 --- /dev/null +++ b/tests/data/regex-todo.txt @@ -0,0 +1,5 @@ +#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3}) +\{(?:\s*"[^"]*"\s*:\s*(?:\d+|"(...)" )\s*)(?:,\s*"[^"]*"\s*:\s*(?:\d+|"(...)" )\s*)*\} +rgba?\(\s*(?:\d{1,3}\s*,\s*){2}\d{1,3}(?:\s*,\s*(?:0|1|0?\.\d+))?\s*\) +[+-]?(?:\d+\.\d*|\.\d+)(?:[eE][+-]?\d+)? +<\w+(?:\s+\w+(?:="[^"]*")?)*\s*/?> \ No newline at end of file diff --git a/tests/data/regex.txt b/tests/data/regex.txt index e5fb5df..3eebe62 100644 --- a/tests/data/regex.txt +++ b/tests/data/regex.txt @@ -1,3 +1,5 @@ +(a*,a*)? +(?:\s*,\s*(?:0|1|0?\.\d+))? [\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f] a{2,3} (abc|fg){2} @@ -19,4 +21,53 @@ a+(ba+)* [0-9]+[A-Z]* ù -^\d$ \ No newline at end of file +^\d$ +foo +bar? +baz+ +qux* +quux{3} +quuux{2,5} +quuuux{0,4} +.* +[aeiou] +[^aeiou] +[a-zA-Z0-9] +[\dA-Fa-f] +[\w&&[^_]] +[[:alpha:]]+ +[\p{L}]+ +[0-9]{2,4} +[01]?\d +[1-9][0-9]* +(cat|dog|mouse) +(?:red|green|blue){2} +(gr(a|e)y){1,3} +((ab|cd)ef)+ +(a(b(c|d)e)f)+ +(a|b(c|d(e|f))){2,3} +(?:abc){0,} +(?:abc){1,} +(?:abc){2,5} +a++ +\.\*\?\+\(\)\[\]\{\}\\\| +\u0041\u0042\u0043 +\p{Greek}+ +\p{Sc} +[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[A-Za-z]{2,} +\b((25[0-5]|2[0-4]\d|[01]?\d?\d)\.){3}(25[0-5]|2[0-4]\d|[01]?\d?\d)\b +https?://[^\s/$.?#][^\s]* +\d{4}/\d{2}/\d{2} +\d{1,2}:\d{2}(:\d{2})? 
+<([A-Za-z][A-Za-z0-9]*)\b[^>]*?/> +\{(?:[^{}]|\{[^{}]*\})*\} +\b(?:\d[ -]*?){13,16}\b +#([A-Fa-f0-9]{8}) +(a|b|c|d|e|f|g|h|i|j){5} +(?:"[^"]*"|[^,]*)(?:,(?:"[^"]*"|[^,]*))* +\b([0-9A-Fa-f]{2}:){5}[0-9A-Fa-f]{2}\b +\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b +[[:alnum:]&&[^0-9]] +[ \t]+ +[\r\n]+ +[^\t\r\n]+ \ No newline at end of file From 29697f8fab7a9d59578e8d9bcf300a2c8ba56725 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Fri, 8 Aug 2025 22:59:57 +0200 Subject: [PATCH 21/44] fix algo repeat --- src/regex/operation/mod.rs | 181 +---------------------- src/regex/operation/repeat.rs | 268 ++++++++++++++++++++++++++++------ tests/data/regex-todo.txt | 1 + 3 files changed, 227 insertions(+), 223 deletions(-) diff --git a/src/regex/operation/mod.rs b/src/regex/operation/mod.rs index f572238..7364d65 100644 --- a/src/regex/operation/mod.rs +++ b/src/regex/operation/mod.rs @@ -3,183 +3,4 @@ use super::*; mod concat; mod repeat; mod simplify; -mod union; - -#[cfg(test)] -mod tests { - - use regex_charclass::char::Char; - - use crate::{regex::RegularExpression, CharRange}; - - #[test] - fn test_parse_and_simplify() -> Result<(), String> { - assert_parse_and_simplify("(xxx)*", "(x{3})*"); - assert_parse_and_simplify("(x*){3}", "x*"); - assert_parse_and_simplify("(x+)?", "x*"); - assert_parse_and_simplify("(x?)+", "x*"); - assert_parse_and_simplify("(x{0,3})+", "x*"); - assert_parse_and_simplify("(x{2,3})+", "x{2,}"); - assert_parse_and_simplify("(x{7,9})+", "(x{7,9})+"); - assert_parse_and_simplify("(x+)*", "x*"); - assert_parse_and_simplify(".*abc", ".*abc"); - assert_parse_and_simplify(".*a(b|cd)", ".*a(b|cd)"); - assert_parse_and_simplify( - "a(bcfe|bcdg|mkv)*(abc){2,3}(abc){2}", - "a(bc(dg|fe)|mkv)*(abc){4,5}", - ); - assert_parse_and_simplify("((abc|fg)abc|(abc|fg)fg)", "(abc|fg){2}"); - assert_parse_and_simplify("(a{2}|a{3})", "a{2,3}"); - assert_parse_and_simplify("(a|b)", "[ab]"); - assert_parse_and_simplify("(ab|a|cd|b|ef)", "(b|ab?|cd|ef)"); - assert_parse_and_simplify("(ab|ab)", "ab"); - assert_parse_and_simplify("(ab)(ab)(ab)", "(ab){3}"); - assert_parse_and_simplify("aaaabbbbbccc", "a{4}b{5}c{3}"); - assert_parse_and_simplify("((ab))?(ab)(((ab)))((((ab)){3}))", "(ab){5,6}"); - assert_parse_and_simplify("(cd|ab)*(ab|cd)*", "(ab|cd)*"); - assert_parse_and_simplify(".*q(ab|ab|abc|ca)x", ".*q(abc?|ca)x"); - assert_parse_and_simplify( - "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}", - "(q|(a|ads|a{2}d)*abc.*def.*uif(x|ads|a{2}d)*abc.*oxs.*def(ads|ax|a{2}d)*abc.*def.*ksd){1,100}", - ); - - assert_parse_and_simplify("(a{2,4}){2,4}", "a{4,16}"); - Ok(()) - } - - fn assert_parse_and_simplify(regex: &str, regex_simplified: &str) { - let regex_parsed = RegularExpression::new(regex).unwrap(); - assert_eq!(regex_simplified, regex_parsed.to_string()); - } - - #[test] - fn test_repeat_simplify() -> Result<(), String> { - assert_repeat_simplify( - &CharRange::new_from_range(Char::new('a')..=Char::new('a')), - 2, - Some(2), - 3, - Some(3), - ); - - assert_repeat_simplify( - &CharRange::new_from_range(Char::new('a')..=Char::new('a')), - 2, - Some(2), - 2, - Some(4), - ); - - assert_repeat_simplify( - &CharRange::new_from_range(Char::new('a')..=Char::new('a')), - 3, - Some(3), - 0, - None, - ); - - assert_repeat_simplify( - &CharRange::new_from_range(Char::new('a')..=Char::new('a')), - 0, - Some(3), - 1, - None, - ); - - assert_repeat_simplify( - 
&CharRange::new_from_range(Char::new('a')..=Char::new('a')), - 1, - Some(2), - 1, - None, - ); - - assert_repeat_simplify( - &CharRange::new_from_range(Char::new('a')..=Char::new('a')), - 2, - Some(3), - 1, - None, - ); - - assert_repeat_simplify( - &CharRange::new_from_range(Char::new('a')..=Char::new('a')), - 3, - Some(4), - 1, - None, - ); - - assert_repeat_simplify( - &CharRange::new_from_range(Char::new('a')..=Char::new('a')), - 7, - Some(8), - 1, - None, - ); - - assert_repeat_simplify( - &CharRange::new_from_range(Char::new('a')..=Char::new('a')), - 0, - None, - 3, - Some(3), - ); - - assert_repeat_simplify( - &CharRange::new_from_range(Char::new('a')..=Char::new('a')), - 1, - None, - 0, - Some(1), - ); - - assert_repeat_simplify( - &CharRange::new_from_range(Char::new('a')..=Char::new('a')), - 0, - Some(1), - 1, - None, - ); - - assert_repeat_simplify( - &CharRange::new_from_range(Char::new('a')..=Char::new('a')), - 2, - Some(4), - 2, - Some(4), - ); - - Ok(()) - } - - fn assert_repeat_simplify( - range: &CharRange, - min1: u32, - max1: Option, - min2: u32, - max2: Option, - ) { - let repeat = RegularExpression::Repetition( - Box::new(RegularExpression::Repetition( - Box::new(RegularExpression::Character(range.clone())), - min1, - max1, - )), - min2, - max2, - ); - - let got = RegularExpression::new(&repeat.to_string()).unwrap(); - - println!("{} -> {}", repeat, got); - - let repeat = repeat.to_automaton().unwrap(); - - //repeat.to_dot(); - - let result = got.to_automaton().unwrap(); - - assert!(repeat.are_equivalent(&result).unwrap()); - } -} +mod union; \ No newline at end of file diff --git a/src/regex/operation/repeat.rs b/src/regex/operation/repeat.rs index 181724f..6503f75 100644 --- a/src/regex/operation/repeat.rs +++ b/src/regex/operation/repeat.rs @@ -2,67 +2,249 @@ use super::*; impl RegularExpression { /// Returns the repetition of the expression between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. 
- pub fn repeat(&self, min: u32, max_opt: Option) -> RegularExpression { + pub fn repeat(&self, o_min: u32, o_max_opt: Option) -> RegularExpression { if self.is_total() { return RegularExpression::new_total(); } else if self.is_empty() { return RegularExpression::new_empty(); } else if self.is_empty_string() { return Self::new_empty_string(); - } else if let Some(max) = max_opt { - if max < min || max == 0 { + } else if let Some(max) = o_max_opt { + if max < o_min || max == 0 { return RegularExpression::new_empty_string(); - } else if min == 1 && max == 1 { + } else if o_min == 1 && max == 1 { return self.clone(); } } match self { - RegularExpression::Repetition(regular_expression, o_min, o_max_opt) => { - let new_max = if let (Some(max), Some(o_max)) = (max_opt, o_max_opt) { - Some(max * o_max) + RegularExpression::Repetition(regular_expression, i_min, i_max_opt) => { + let new_max = if let (Some(o_max), Some(i_max)) = (o_max_opt, i_max_opt) { + Some(o_max * i_max) } else { None }; - let o_min = *o_min; - if let Some(o_max) = o_max_opt { - let o_max = *o_max; - if o_min <= 1 || max_opt.is_some() && max_opt.unwrap() == min { - RegularExpression::Repetition( - regular_expression.clone(), - min * o_min, - new_max, - ) - } else if o_min == o_max && o_min > 1 { - RegularExpression::Repetition(Box::new(self.clone()), min, max_opt) - } else { - let r = ((o_max as f64) - 1f64) / ((o_max as f64) - (o_min as f64)); - if r > cmp::max(2, min) as f64 { - return RegularExpression::Repetition( - Box::new(self.clone()), - min, - max_opt, - ); - } - - RegularExpression::Repetition( - regular_expression.clone(), - min * o_min, - new_max, - ) - } - } else if o_max_opt.is_none() - || max_opt.is_some() && (max_opt.unwrap() == min || max_opt.unwrap() == 1) - || o_max_opt.is_some() && o_max_opt.unwrap() == 1 - || max_opt.is_none() && o_min == 0 - { - RegularExpression::Repetition(regular_expression.clone(), min * o_min, new_max) + if Self::can_simplify_nested_repetition(*i_min, *i_max_opt, o_min, o_max_opt) { + RegularExpression::Repetition( + regular_expression.clone(), + o_min * i_min, + new_max, + ) } else { - RegularExpression::Repetition(Box::new(self.clone()), min, max_opt) + RegularExpression::Repetition(Box::new(self.clone()), o_min, o_max_opt) } } - _ => RegularExpression::Repetition(Box::new(self.clone()), min, max_opt), + _ => RegularExpression::Repetition(Box::new(self.clone()), o_min, o_max_opt), } } -} \ No newline at end of file + + fn can_simplify_nested_repetition( + i_min: u32, + i_max_opt: Option, + o_min: u32, + o_max_opt: Option, + ) -> bool { + if let Some(o_max) = o_max_opt { + if o_min == o_max { + return true; + } + } + + if let Some(i_max) = i_max_opt { + // We check if there is any gap by resolving: + // o_min * i_max >= (o_min + 1) * i_min - 1 + // <=> o_min * (i_max - i_min) >= i_min - 1 + o_min.saturating_mul(i_max.saturating_sub(i_min)) >= i_min.saturating_sub(1) + } else { + if o_min > 0 { true } else { i_min <= 1 } + } + } +} + +#[cfg(test)] +mod tests { + + use regex_charclass::char::Char; + + use crate::{CharRange, regex::RegularExpression}; + + #[test] + fn test_parse_and_simplify() -> Result<(), String> { + assert_parse_and_simplify("(xxx)*", "(x{3})*"); + assert_parse_and_simplify("(x*){3}", "x*"); + assert_parse_and_simplify("(x+)?", "x*"); + assert_parse_and_simplify("(x?)+", "x*"); + assert_parse_and_simplify("(x{0,3})+", "x*"); + assert_parse_and_simplify("(x{2,3})+", "x{2,}"); + assert_parse_and_simplify("(x{7,9})+", "(x{7,9})+"); + assert_parse_and_simplify("(x+)*", 
"x*"); + assert_parse_and_simplify(".*abc", ".*abc"); + assert_parse_and_simplify(".*a(b|cd)", ".*a(b|cd)"); + assert_parse_and_simplify( + "a(bcfe|bcdg|mkv)*(abc){2,3}(abc){2}", + "a(bc(dg|fe)|mkv)*(abc){4,5}", + ); + assert_parse_and_simplify("((abc|fg)abc|(abc|fg)fg)", "(abc|fg){2}"); + assert_parse_and_simplify("(a{2}|a{3})", "a{2,3}"); + assert_parse_and_simplify("(a|b)", "[ab]"); + assert_parse_and_simplify("(ab|a|cd|b|ef)", "(b|ab?|cd|ef)"); + assert_parse_and_simplify("(ab|ab)", "ab"); + assert_parse_and_simplify("(ab)(ab)(ab)", "(ab){3}"); + assert_parse_and_simplify("aaaabbbbbccc", "a{4}b{5}c{3}"); + assert_parse_and_simplify("((ab))?(ab)(((ab)))((((ab)){3}))", "(ab){5,6}"); + assert_parse_and_simplify("(cd|ab)*(ab|cd)*", "(ab|cd)*"); + assert_parse_and_simplify(".*q(ab|ab|abc|ca)x", ".*q(abc?|ca)x"); + assert_parse_and_simplify( + "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}", + "(q|(a|ads|a{2}d)*abc.*def.*uif(x|ads|a{2}d)*abc.*oxs.*def(ads|ax|a{2}d)*abc.*def.*ksd){1,100}", + ); + + assert_parse_and_simplify("(a{2,4}){2,4}", "a{4,16}"); + Ok(()) + } + + fn assert_parse_and_simplify(regex: &str, regex_simplified: &str) { + let regex_parsed = RegularExpression::new(regex).unwrap(); + assert_eq!(regex_simplified, regex_parsed.to_string()); + } + + #[test] + fn test_repeat_simplify() -> Result<(), String> { + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 2, + Some(2), + 3, + Some(3), + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 2, + Some(2), + 2, + Some(4), + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 3, + Some(3), + 0, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 0, + Some(3), + 1, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 1, + Some(2), + 1, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 2, + Some(3), + 1, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 3, + Some(4), + 1, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 7, + Some(8), + 1, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 0, + None, + 3, + Some(3), + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 1, + None, + 0, + Some(1), + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 0, + Some(1), + 1, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 2, + Some(4), + 2, + Some(4), + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 2, + Some(3), + 2, + Some(2), + ); + + Ok(()) + } + + fn assert_repeat_simplify( + range: &CharRange, + min1: u32, + max1: Option, + min2: u32, + max2: Option, + ) { + let repeat = RegularExpression::Repetition( + Box::new(RegularExpression::Repetition( + Box::new(RegularExpression::Character(range.clone())), + min1, + max1, + )), + min2, + max2, + ); + + let got = RegularExpression::new(&repeat.to_string()).unwrap(); + + println!("{} -> {}", repeat, got); + + let repeat = repeat.to_automaton().unwrap(); + + //repeat.to_dot(); + + let result = 
got.to_automaton().unwrap(); + + assert!(repeat.are_equivalent(&result).unwrap()); + } +} diff --git a/tests/data/regex-todo.txt b/tests/data/regex-todo.txt index 1d97d13..05849d6 100644 --- a/tests/data/regex-todo.txt +++ b/tests/data/regex-todo.txt @@ -1,3 +1,4 @@ +(a*,a*)* #([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3}) \{(?:\s*"[^"]*"\s*:\s*(?:\d+|"(...)" )\s*)(?:,\s*"[^"]*"\s*:\s*(?:\d+|"(...)" )\s*)*\} rgba?\(\s*(?:\d{1,3}\s*,\s*){2}\d{1,3}(?:\s*,\s*(?:0|1|0?\.\d+))?\s*\) From e24624e42900fcd7b698d3bd8af6a0dc0ad272d4 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sat, 9 Aug 2025 16:24:02 +0200 Subject: [PATCH 22/44] update serialization --- src/error/mod.rs | 2 +- src/fast_automaton/mod.rs | 2 +- .../{serializer.rs => serializer/mod.rs} | 31 +- .../serializer}/tokenizer/embed_automaton.rs | 21 +- .../serializer}/tokenizer/mod.rs | 11 +- .../serializer}/tokenizer/range_tokenizer.rs | 0 .../tokenizer/token/automaton_token.rs | 94 ++++++ .../serializer}/tokenizer/token/mod.rs | 20 +- .../serializer/tokenizer/token/range_token.rs | 55 ++++ src/lib.rs | 2 - src/regex/operation/repeat.rs | 5 +- src/tokenizer/embed_regex.rs | 295 ------------------ src/tokenizer/token/automaton_token.rs | 72 ----- src/tokenizer/token/range_token.rs | 58 ---- src/tokenizer/token/regex_token.rs | 84 ----- 15 files changed, 196 insertions(+), 556 deletions(-) rename src/fast_automaton/{serializer.rs => serializer/mod.rs} (82%) rename src/{ => fast_automaton/serializer}/tokenizer/embed_automaton.rs (90%) rename src/{ => fast_automaton/serializer}/tokenizer/mod.rs (90%) rename src/{ => fast_automaton/serializer}/tokenizer/range_tokenizer.rs (100%) create mode 100644 src/fast_automaton/serializer/tokenizer/token/automaton_token.rs rename src/{ => fast_automaton/serializer}/tokenizer/token/mod.rs (62%) create mode 100644 src/fast_automaton/serializer/tokenizer/token/range_token.rs delete mode 100644 src/tokenizer/embed_regex.rs delete mode 100644 src/tokenizer/token/automaton_token.rs delete mode 100644 src/tokenizer/token/range_token.rs delete mode 100644 src/tokenizer/token/regex_token.rs diff --git a/src/error/mod.rs b/src/error/mod.rs index b960147..448dfb9 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -1,7 +1,7 @@ use std::fmt::{self}; #[cfg(feature = "serializable")] -use crate::tokenizer::token::TokenError; +use crate::fast_automaton::serializer::tokenizer::token::TokenError; /// An error thrown by the engine. #[derive(Debug, PartialEq, Eq)] diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index 908ed4e..cfa4f68 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -27,7 +27,7 @@ mod convert; mod generate; mod operation; #[cfg(feature = "serializable")] -mod serializer; +pub mod serializer; pub mod spanning_set; /// Represent a finite state automaton. 
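The serializer diff that follows keeps calling the `compress_data`/`decompress_data` helpers, whose bodies are outside this patch. Given the `flate2` imports, they are presumably thin zlib wrappers; a minimal sketch of what they might look like (an assumption, not the crate's actual code):

```rust
use std::io::prelude::*;

use flate2::{Compression, read::ZlibDecoder, write::ZlibEncoder};

// Hypothetical bodies for the helpers referenced by the serializer below;
// the names match the call sites, the implementations are guessed.
fn compress_data(data: &[u8]) -> Vec<u8> {
    let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
    encoder.write_all(data).expect("writing to a Vec cannot fail");
    encoder.finish().expect("flushing to a Vec cannot fail")
}

fn decompress_data(data: &[u8]) -> Vec<u8> {
    let mut decoder = ZlibDecoder::new(data);
    let mut out = Vec::new();
    decoder.read_to_end(&mut out).expect("input must be a valid zlib stream");
    out
}

fn main() {
    let payload = b"serialized automaton bytes";
    let roundtrip = decompress_data(&compress_data(payload));
    assert_eq!(&payload[..], &roundtrip[..]);
}
```

With helpers of that shape, serialization reduces to CBOR-encode, compress, then z85-encode, and deserialization runs the same steps in reverse, which is exactly the structure of the `Serialize`/`Deserialize` impls below.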
diff --git a/src/fast_automaton/serializer.rs b/src/fast_automaton/serializer/mod.rs similarity index 82% rename from src/fast_automaton/serializer.rs rename to src/fast_automaton/serializer/mod.rs index 29c41e3..aa06df0 100644 --- a/src/fast_automaton/serializer.rs +++ b/src/fast_automaton/serializer/mod.rs @@ -1,5 +1,7 @@ +use crate::fast_automaton::serializer::tokenizer::token::automaton_token::AutomatonToken; +use crate::fast_automaton::serializer::tokenizer::Tokenizer; + use super::*; -use crate::tokenizer::Tokenizer; use serde::{Deserialize, Serialize}; use serde::{Deserializer, Serializer, de, ser}; @@ -10,10 +12,11 @@ use flate2::read::ZlibDecoder; use flate2::write::ZlibEncoder; use std::io::prelude::*; -use crate::tokenizer::token::{Token, automaton_token::AutomatonToken}; +#[cfg(feature = "serializable")] +pub mod tokenizer; #[derive(Serialize, Deserialize, Debug)] -struct SerializedAutomaton(Vec, SpanningSet); +struct SerializedAutomaton(Vec, SpanningSet, usize); impl serde::Serialize for FastAutomaton { fn serialize(&self, serializer: S) -> Result @@ -21,12 +24,17 @@ impl serde::Serialize for FastAutomaton { S: Serializer, { let tokenizer = Tokenizer::new(self); - match AutomatonToken::to_fair_tokens(&tokenizer.to_embedding()) { + let number_of_states = self.get_number_of_states(); + match AutomatonToken::to_tokens( + &tokenizer.to_embedding(), + self.get_spanning_set().get_number_of_spanning_ranges(), + number_of_states, + ) { Ok(tokens) => { let serialized_automaton = - SerializedAutomaton(tokens, self.get_spanning_set().clone()); + SerializedAutomaton(tokens, self.get_spanning_set().clone(), number_of_states); - let mut serialized = Vec::with_capacity(self.get_number_of_states() * 8); + let mut serialized = Vec::with_capacity(number_of_states * 8); if let Err(err) = ciborium::into_writer(&serialized_automaton, &mut serialized) { return Err(ser::Error::custom(err.to_string())); } @@ -56,13 +64,22 @@ impl<'de> serde::Deserialize<'de> for FastAutomaton { Ok(automaton) => { let mut temp_automaton = FastAutomaton::new_empty(); temp_automaton.spanning_set = automaton.1; + let number_of_states = automaton.2; + let number_of_bases = + temp_automaton.spanning_set.get_number_of_spanning_ranges(); let tokenizer = Tokenizer::new(&temp_automaton); match tokenizer.from_embedding( &automaton .0 .into_iter() - .map(AutomatonToken::from_fair_token) + .map(|t| { + AutomatonToken::from_token( + t, + number_of_bases, + number_of_states, + ) + }) .collect::>(), ) { Ok(res) => Ok(res), diff --git a/src/tokenizer/embed_automaton.rs b/src/fast_automaton/serializer/tokenizer/embed_automaton.rs similarity index 90% rename from src/tokenizer/embed_automaton.rs rename to src/fast_automaton/serializer/tokenizer/embed_automaton.rs index 49733a0..825ea7e 100644 --- a/src/tokenizer/embed_automaton.rs +++ b/src/fast_automaton/serializer/tokenizer/embed_automaton.rs @@ -1,6 +1,6 @@ use token::TokenError; -use crate::{CharRange, error::EngineError, fast_automaton::condition::Condition}; +use crate::{error::EngineError, fast_automaton::{condition::Condition, serializer::tokenizer::token::automaton_token::AutomatonToken}, CharRange}; use self::token::range_token::RangeToken; @@ -77,7 +77,7 @@ impl Tokenizer<'_> { range = range.union(self.range_tokenizer.token_to_range(r).unwrap()); } AutomatonToken::State(s) => { - while !automaton.has_state((*s).into()) { + while !automaton.has_state(*s) { automaton.new_state(); } if let Some(fs) = from_state { @@ -85,9 +85,9 @@ impl Tokenizer<'_> { 
Self::apply_transition(&mut automaton, fs, ts, &range)?; range = CharRange::empty(); } - to_state = Some((*s).into()); + to_state = Some(*s); } else { - from_state = Some((*s).into()); + from_state = Some(*s); } } AutomatonToken::AcceptState => { @@ -129,8 +129,6 @@ impl Tokenizer<'_> { #[cfg(test)] mod tests { - use embed_automaton::token::Token; - use crate::regex::RegularExpression; use super::*; @@ -168,11 +166,14 @@ mod tests { let tokenizer = Tokenizer::new(&automaton); let embedding = tokenizer.to_embedding(); - // FAIR - let embedding_u16 = AutomatonToken::to_fair_tokens(&embedding).unwrap(); - let embedding: Vec = embedding_u16 + let number_of_bases = automaton.get_spanning_set().get_number_of_spanning_ranges(); + let number_of_states = automaton.get_number_of_states(); + + let embedding_usize = + AutomatonToken::to_tokens(&embedding, number_of_bases, number_of_states).unwrap(); + let embedding: Vec = embedding_usize .iter() - .map(|&t| AutomatonToken::from_fair_token(t)) + .map(|&t| AutomatonToken::from_token(t, number_of_bases, number_of_states)) .collect(); let unembedded_automaton = tokenizer.from_embedding(&embedding).unwrap(); diff --git a/src/tokenizer/mod.rs b/src/fast_automaton/serializer/tokenizer/mod.rs similarity index 90% rename from src/tokenizer/mod.rs rename to src/fast_automaton/serializer/tokenizer/mod.rs index 7c83c1a..95ccb18 100644 --- a/src/tokenizer/mod.rs +++ b/src/fast_automaton/serializer/tokenizer/mod.rs @@ -1,16 +1,15 @@ use std::{cmp::Ordering, collections::VecDeque, vec}; -use ahash::HashMapExt; +use crate::fast_automaton::serializer::tokenizer::range_tokenizer::RangeTokenizer; use crate::fast_automaton::spanning_set::SpanningSet; use crate::{ - fast_automaton::{FastAutomaton, State}, IntMap, IntSet, + fast_automaton::{FastAutomaton, State}, }; +use ahash::HashMapExt; -use self::{range_tokenizer::RangeTokenizer, token::automaton_token::AutomatonToken}; mod embed_automaton; -mod embed_regex; pub mod range_tokenizer; pub mod token; @@ -18,7 +17,7 @@ pub mod token; pub struct Tokenizer<'a> { range_tokenizer: RangeTokenizer<'a>, automaton: &'a FastAutomaton, - state_to_token: IntMap, + state_to_token: IntMap, } impl Tokenizer<'_> { @@ -28,7 +27,7 @@ impl Tokenizer<'_> { worklist.push_front(automaton.get_start_state()); - let mut state_counter: u16 = 0; + let mut state_counter = 0; let mut state_to_token = IntMap::with_capacity(automaton.get_number_of_states()); while let Some(current_state) = worklist.pop_back() { diff --git a/src/tokenizer/range_tokenizer.rs b/src/fast_automaton/serializer/tokenizer/range_tokenizer.rs similarity index 100% rename from src/tokenizer/range_tokenizer.rs rename to src/fast_automaton/serializer/tokenizer/range_tokenizer.rs diff --git a/src/fast_automaton/serializer/tokenizer/token/automaton_token.rs b/src/fast_automaton/serializer/tokenizer/token/automaton_token.rs new file mode 100644 index 0000000..2e68ded --- /dev/null +++ b/src/fast_automaton/serializer/tokenizer/token/automaton_token.rs @@ -0,0 +1,94 @@ +use self::range_token::RangeToken; + +use super::*; + +#[derive(Debug, Eq, PartialEq, Clone, Copy)] +pub enum AutomatonToken { + Range(RangeToken), + State(usize), + AcceptState, + SeparatorState, + Error, +} + +impl Ord for AutomatonToken { + fn cmp(&self, other: &Self) -> Ordering { + match (self, other) { + (AutomatonToken::Range(a), AutomatonToken::Range(b)) => a.cmp(b), + (AutomatonToken::Range(_), _) => Ordering::Less, + (_, AutomatonToken::Range(_)) => Ordering::Greater, + + (AutomatonToken::State(a), 
AutomatonToken::State(b)) => a.cmp(b), + (AutomatonToken::State(_), _) => Ordering::Less, + (_, AutomatonToken::State(_)) => Ordering::Greater, + + (AutomatonToken::AcceptState, AutomatonToken::AcceptState) => Ordering::Equal, + (AutomatonToken::AcceptState, _) => Ordering::Less, + (_, AutomatonToken::AcceptState) => Ordering::Greater, + + (AutomatonToken::SeparatorState, AutomatonToken::SeparatorState) => Ordering::Equal, + (AutomatonToken::SeparatorState, _) => Ordering::Less, + (_, AutomatonToken::SeparatorState) => Ordering::Greater, + + (AutomatonToken::Error, AutomatonToken::Error) => Ordering::Equal, + } + } +} + +impl PartialOrd for AutomatonToken { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl AutomatonToken { + pub fn from_token( + token: usize, + number_of_bases: usize, + number_of_states: usize, + ) -> AutomatonToken { + let states = number_of_bases + 1; + let accept_state = states + number_of_states; + let separator_state = accept_state + 1; + if (0..states).contains(&token) { + AutomatonToken::Range(RangeToken::from_token(token, number_of_bases)) + } else if (states..accept_state).contains(&token) { + AutomatonToken::State(token - states) + } else if token == accept_state { + AutomatonToken::AcceptState + } else if token == separator_state { + AutomatonToken::SeparatorState + } else { + AutomatonToken::Error + } + } + + pub fn to_token( + &self, + number_of_bases: usize, + number_of_states: usize, + ) -> Result { + let states = number_of_bases + 1; + let accept_state = states + number_of_states; + let separator_state = accept_state + 1; + Ok(match self { + AutomatonToken::Range(r) => r.to_token(number_of_bases)?, + AutomatonToken::State(s) => s + states, + AutomatonToken::AcceptState => accept_state, + AutomatonToken::SeparatorState => separator_state, + AutomatonToken::Error => return Err(TokenError::UnknownToken), + }) + } + + pub fn to_tokens( + tokens: &[Self], + number_of_bases: usize, + number_of_states: usize, + ) -> Result, TokenError> { + let mut vec = Vec::with_capacity(tokens.len()); + for token in tokens { + vec.push(token.to_token(number_of_bases, number_of_states)?); + } + Ok(vec) + } +} diff --git a/src/tokenizer/token/mod.rs b/src/fast_automaton/serializer/tokenizer/token/mod.rs similarity index 62% rename from src/tokenizer/token/mod.rs rename to src/fast_automaton/serializer/tokenizer/token/mod.rs index 4342be8..c510dd4 100644 --- a/src/tokenizer/token/mod.rs +++ b/src/fast_automaton/serializer/tokenizer/token/mod.rs @@ -4,7 +4,6 @@ use super::*; pub mod automaton_token; pub mod range_token; -pub mod regex_token; #[derive(Debug, PartialEq, Eq)] pub enum TokenError { @@ -24,21 +23,4 @@ impl Display for TokenError { TokenError::SyntaxError => write!(f, "SyntaxError"), } } -} - -pub trait Token { - fn from_fair_token(token: u16) -> Self; - - fn to_fair_token(&self) -> Result; - - fn to_fair_tokens(tokens: &[Self]) -> Result, TokenError> - where - Self: Sized, - { - let mut vec = Vec::with_capacity(tokens.len()); - for token in tokens { - vec.push(token.to_fair_token()?); - } - Ok(vec) - } -} +} \ No newline at end of file diff --git a/src/fast_automaton/serializer/tokenizer/token/range_token.rs b/src/fast_automaton/serializer/tokenizer/token/range_token.rs new file mode 100644 index 0000000..20ed515 --- /dev/null +++ b/src/fast_automaton/serializer/tokenizer/token/range_token.rs @@ -0,0 +1,55 @@ +use super::*; + +#[derive(Debug, Eq, PartialEq, Clone, Copy)] +pub enum RangeToken { + Total, + Base(usize), + Error, +} + 
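For readers of `AutomatonToken::from_token`/`to_token` above: tokens are flat `usize` indices, laid out as range tokens first, then state tokens, then the two marker tokens. A small standalone illustration of that layout (not part of the crate):

```rust
// Layout assumed by the conversion above, for number_of_bases B and number_of_states S:
//   0               -> Range(Total)
//   1..=B           -> Range(Base(index - 1))
//   B+1 .. B+1+S    -> State(index - (B + 1))
//   B+1+S           -> AcceptState
//   B+2+S           -> SeparatorState
//   anything larger -> Error
fn describe_token(token: usize, number_of_bases: usize, number_of_states: usize) -> &'static str {
    let first_state = number_of_bases + 1;
    let accept_state = first_state + number_of_states;
    let separator_state = accept_state + 1;
    match token {
        0 => "Range(Total)",
        t if t < first_state => "Range(Base)",
        t if t < accept_state => "State",
        t if t == accept_state => "AcceptState",
        t if t == separator_state => "SeparatorState",
        _ => "Error",
    }
}

fn main() {
    // With number_of_bases = 3 and number_of_states = 4:
    assert_eq!(describe_token(0, 3, 4), "Range(Total)");
    assert_eq!(describe_token(3, 3, 4), "Range(Base)");  // Base(2)
    assert_eq!(describe_token(4, 3, 4), "State");        // State(0)
    assert_eq!(describe_token(8, 3, 4), "AcceptState");
    assert_eq!(describe_token(9, 3, 4), "SeparatorState");
    assert_eq!(describe_token(42, 3, 4), "Error");
}
```

The `RangeToken` conversion below uses the same first slice of indices: 0 for the total range and 1 through `number_of_bases` for the individual spanning ranges.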
+impl Ord for RangeToken { + fn cmp(&self, other: &Self) -> Ordering { + match (self, other) { + (RangeToken::Total, RangeToken::Total) => Ordering::Equal, + (RangeToken::Total, _) => Ordering::Less, + (_, RangeToken::Total) => Ordering::Greater, + (RangeToken::Base(a), RangeToken::Base(b)) => a.cmp(b), + (RangeToken::Base(_), RangeToken::Error) => Ordering::Less, + (RangeToken::Error, RangeToken::Base(_)) => Ordering::Greater, + (RangeToken::Error, RangeToken::Error) => Ordering::Equal, + } + } +} + +impl PartialOrd for RangeToken { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl RangeToken { + pub fn from_token(token: usize, number_of_bases: usize) -> RangeToken { + let max_number_of_bases = number_of_bases + 1; + if token == 0 { + RangeToken::Total + } else if (1..max_number_of_bases).contains(&token) { + RangeToken::Base(token - 1) + } else { + RangeToken::Error + } + } + + pub fn to_token(&self, number_of_bases: usize) -> Result { + let max_number_of_bases = number_of_bases + 1; + Ok(match self { + RangeToken::Total => 0, + RangeToken::Base(b) => { + if *b > max_number_of_bases { + return Err(TokenError::TokenOutOfBound("Base", max_number_of_bases, *b)); + } + b + 1 + } + RangeToken::Error => return Err(TokenError::UnknownToken), + }) + } +} diff --git a/src/lib.rs b/src/lib.rs index c81d84d..e4119e2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -22,8 +22,6 @@ pub mod error; pub mod execution_profile; pub mod fast_automaton; pub mod regex; -#[cfg(feature = "serializable")] -pub mod tokenizer; pub type IntMap = HashMap>>; pub type IntSet = HashSet>>; diff --git a/src/regex/operation/repeat.rs b/src/regex/operation/repeat.rs index 6503f75..86ddbe5 100644 --- a/src/regex/operation/repeat.rs +++ b/src/regex/operation/repeat.rs @@ -39,6 +39,7 @@ impl RegularExpression { } } + /// Evaluate if the repetition `(r{i_min,i_max_opt}){o_min,o_max_opt}` can be simplified to `r{i_min*o_min,i_max_opt*o_max_opt}`. 
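Before the implementation that follows, a concrete check of the documented condition. `gap_free` below just restates the inequality used for the bounded-inner case; the expected outcomes are the ones asserted by the simplification tests in this patch:

```rust
/// Same inequality as in `can_simplify_nested_repetition` when the inner maximum is bounded:
/// consecutive repetition blocks must overlap or touch, otherwise some lengths are skipped.
/// (Illustration only; assumes i_max >= i_min.)
fn gap_free(i_min: u32, i_max: u32, o_min: u32) -> bool {
    o_min * (i_max - i_min) >= i_min.saturating_sub(1)
}

fn main() {
    assert!(gap_free(2, 3, 1));  // (x{2,3})+     -> x{2,}   : blocks 2..=3, 4..=6, 6..=9, ...
    assert!(!gap_free(7, 9, 1)); // (x{7,9})+     stays      : 7..=9 then 14..=18 skips 10..=13
    assert!(gap_free(2, 4, 2));  // (a{2,4}){2,4} -> a{4,16} : 4..=8, 6..=12, 8..=16 overlap
}
```

The separate `o_min == o_max` early return covers exact outer counts such as `(a{2,3}){2}`, where there is only a single block of lengths (here 4..=6), so flattening to `a{4,6}` is always valid.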
fn can_simplify_nested_repetition( i_min: u32, i_max_opt: Option, @@ -56,8 +57,10 @@ impl RegularExpression { // o_min * i_max >= (o_min + 1) * i_min - 1 // <=> o_min * (i_max - i_min) >= i_min - 1 o_min.saturating_mul(i_max.saturating_sub(i_min)) >= i_min.saturating_sub(1) + } else if o_min > 0 { + true } else { - if o_min > 0 { true } else { i_min <= 1 } + i_min <= 1 } } } diff --git a/src/tokenizer/embed_regex.rs b/src/tokenizer/embed_regex.rs deleted file mode 100644 index d9e6892..0000000 --- a/src/tokenizer/embed_regex.rs +++ /dev/null @@ -1,295 +0,0 @@ -use token::TokenError; - -use crate::{regex::RegularExpression, CharRange}; - -use self::token::regex_token::RegexToken; - -use super::*; - -impl Tokenizer<'_> { - pub fn to_regex_embedding(&self, regex: &RegularExpression) -> Vec { - let mut vec = self.to_regex_embedding_vec(regex); - - Self::append_counter_if_necessary(&mut vec); - - vec - } - - fn append_counter_if_necessary(vec: &mut Vec) { - if let Some(last) = vec.last() { - match last { - RegexToken::RepetitionNone => {} - RegexToken::Repetition(_) => {} - RegexToken::EndGroup => {} - RegexToken::StartGroup => {} - RegexToken::Alternation => {} - RegexToken::Error => todo!(), - _ => { - vec.push(RegexToken::Repetition(1)); - } - }; - } - } - - fn to_regex_embedding_vec(&self, regex: &RegularExpression) -> Vec { - let mut vec = vec![]; - - match regex { - RegularExpression::Character(range) => { - self.range_tokenizer - .range_to_embedding(range) - .unwrap() - .into_iter() - .for_each(|t| vec.push(RegexToken::Range(t))); - } - RegularExpression::Repetition(regex, min, max_opt) => { - if matches!( - **regex, - RegularExpression::Repetition(_, _, _) | RegularExpression::Concat(_) - ) { - vec.push(RegexToken::StartGroup); - vec.extend(self.to_regex_embedding_vec(regex)); - vec.push(RegexToken::EndGroup); - } else { - vec.extend(self.to_regex_embedding_vec(regex)); - } - - vec.push(RegexToken::Repetition(*min as u16)); - - if let Some(max) = max_opt { - if max != min { - vec.push(RegexToken::Repetition(*max as u16)); - } - } else { - vec.push(RegexToken::RepetitionNone); - } - } - RegularExpression::Concat(elements) => { - for element in elements { - vec.extend(self.to_regex_embedding_vec(element)); - Self::append_counter_if_necessary(&mut vec); - } - } - RegularExpression::Alternation(elements) => { - vec.push(RegexToken::StartGroup); - - for i in 0..elements.len() { - let element = &elements[i]; - vec.extend(self.to_regex_embedding_vec(element)); - Self::append_counter_if_necessary(&mut vec); - if i < elements.len() - 1 { - vec.push(RegexToken::Alternation); - } - } - - vec.push(RegexToken::EndGroup); - } - } - - vec - } - - pub fn from_regex_embedding( - &self, - vec: &[RegexToken], - ) -> Result { - let mut regex_groups = vec![(RegularExpression::new_empty_string(), false)]; - let mut current_range: Option = None; - let mut current_min = None; - for i in 0..vec.len() { - let token = vec[i]; - let current_group = regex_groups.len() - 1; - match token { - RegexToken::Range(range_token) => { - let range = self.range_tokenizer.token_to_range(&range_token).unwrap(); - if let Some(curr_range) = ¤t_range { - current_range = Some(curr_range.union(range)); - } else { - current_range = Some(range.clone()); - } - } - RegexToken::StartGroup => { - regex_groups.push((RegularExpression::new_empty_string(), false)); - } - RegexToken::EndGroup => { - if current_group == 0 { - return Err(TokenError::SyntaxError); - } - if i == vec.len() - 1 || !matches!(vec[i + 1], RegexToken::Repetition(_)) { 
- let alternation: bool = regex_groups[current_group].1; - Self::pop_regex_group(&mut regex_groups, &None, &None); - if alternation { - Self::pop_regex_group(&mut regex_groups, &None, &None); - } - } - } - RegexToken::Alternation => { - if regex_groups[current_group].1 { - Self::pop_regex_group(&mut regex_groups, &None, &None); - } - regex_groups.push((RegularExpression::new_empty_string(), true)); - } - RegexToken::RepetitionNone => { - if current_min.is_some() { - if let Some(range) = ¤t_range { - Self::add_regex( - &mut regex_groups, - ¤t_min, - &None, - &RegularExpression::Character(range.clone()), - false, - ); - current_range = None; - } else { - Self::pop_regex_group(&mut regex_groups, ¤t_min, &None); - } - current_min = None; - } else { - return Err(TokenError::SyntaxError); - } - } - RegexToken::Repetition(count) => { - if current_min.is_some() - || i == vec.len() - 1 - || !matches!(vec[i + 1], RegexToken::Repetition(_)) - && !matches!(vec[i + 1], RegexToken::RepetitionNone) - { - let min; - let max; - if current_min.is_some() { - min = current_min; - max = Some(count as u32); - } else { - min = Some(count as u32); - max = Some(count as u32); - } - if let Some(range) = ¤t_range { - Self::add_regex( - &mut regex_groups, - &min, - &max, - &RegularExpression::Character(range.clone()), - false, - ); - current_range = None; - } else { - Self::pop_regex_group(&mut regex_groups, &min, &max); - } - current_min = None; - } else { - current_min = Some(count as u32); - } - } - _ => return Err(TokenError::UnknownToken), - }; - } - - Ok(regex_groups[0].0.clone()) - } - - fn pop_regex_group( - regex_groups: &mut Vec<(RegularExpression, bool)>, - current_min: &Option, - current_max: &Option, - ) -> bool { - if regex_groups.len() <= 1 { - return false; - } - - let popped_group = regex_groups.pop().unwrap(); - Self::add_regex( - regex_groups, - current_min, - current_max, - &popped_group.0, - popped_group.1, - ); - true - } - - fn add_regex( - regex_groups: &mut [(RegularExpression, bool)], - current_min: &Option, - current_max: &Option, - regex: &RegularExpression, - alternation: bool, - ) { - let current_group = regex_groups.len() - 1; - let regex_to_use = if let Some(min) = current_min { - if min == &1 && current_max.is_some() { - if current_max.unwrap() == 1 { - regex.clone() - } else { - RegularExpression::Repetition(Box::new(regex.clone()), *min, *current_max) - } - } else { - RegularExpression::Repetition(Box::new(regex.clone()), *min, *current_max) - } - } else { - regex.clone() - }; - - if alternation { - regex_groups[current_group].0 = regex_groups[current_group].0.union(®ex_to_use); - } else { - regex_groups[current_group].0 = - regex_groups[current_group].0.concat(®ex_to_use, true); - } - } -} - -#[cfg(test)] -mod tests { - use embed_regex::token::Token; - - use crate::regex::RegularExpression; - - use super::*; - - #[test] - fn test_tokenize() -> Result<(), String> { - assert_embedding_convertion(".*"); - assert_embedding_convertion("(a|b)"); - assert_embedding_convertion("(|a)"); - assert_embedding_convertion(".*ab"); - assert_embedding_convertion("[a-e]{3}"); - assert_embedding_convertion("[a-e]{3}efg"); - assert_embedding_convertion("toto"); - assert_embedding_convertion(".{2,3}"); - assert_embedding_convertion("q(abc?|ca)x"); - assert_embedding_convertion(".*q(abc?|ca)x"); - assert_embedding_convertion("(abc){3,6}"); - assert_embedding_convertion("((|a)abd+){3}"); - /*assert_embedding_convertion( - "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q)", 
- );*/ - Ok(()) - } - - fn assert_embedding_convertion(regex: &str) { - let regex = RegularExpression::new(regex).unwrap(); - println!("{}", regex); - - let automaton = regex.to_automaton().unwrap(); - let automaton = automaton.determinize().unwrap(); - //automaton.to_dot(); - - let tokenizer = Tokenizer::new(&automaton); - let embedding = tokenizer.to_regex_embedding(®ex); - - //println!("{:?}", embedding); - - // FAIR - let embedding_u16 = RegexToken::to_fair_tokens(&embedding).unwrap(); - assert_eq!( - embedding, - embedding_u16 - .iter() - .map(|&t| RegexToken::from_fair_token(t)) - .collect::>() - ); - - let unembedded_regex = tokenizer.from_regex_embedding(&embedding).unwrap(); - assert_eq!(regex, unembedded_regex); - } -} diff --git a/src/tokenizer/token/automaton_token.rs b/src/tokenizer/token/automaton_token.rs deleted file mode 100644 index e5f379c..0000000 --- a/src/tokenizer/token/automaton_token.rs +++ /dev/null @@ -1,72 +0,0 @@ -use self::range_token::RangeToken; - -use super::*; - -#[derive(Debug, Eq, PartialEq, Clone, Copy)] -pub enum AutomatonToken { - Range(RangeToken), - State(u16), - AcceptState, - SeparatorState, - Error, -} - -impl Ord for AutomatonToken { - fn cmp(&self, other: &Self) -> Ordering { - (self.to_fair_token().unwrap()).cmp(&other.to_fair_token().unwrap()) - } -} - -impl PartialOrd for AutomatonToken { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl AutomatonToken { - const TK_FAIR_RANGE: u16 = 0; - const TK_FAIR_STATE: u16 = Self::TK_FAIR_RANGE + RangeToken::FAIR_VOCABULARY_SIZE; - const TK_FAIR_ACCEPT_STATE: u16 = Self::TK_FAIR_STATE + Self::FAIR_MAX_NUMBER_OF_STATES; - const TK_FAIR_SEPARATOR_STATE: u16 = Self::TK_FAIR_ACCEPT_STATE + 1; - - pub const FAIR_MAX_NUMBER_OF_STATES: u16 = 65_000; - - pub const FAIR_VOCABULARY_SIZE: u16 = Self::TK_FAIR_SEPARATOR_STATE + 1; -} - -impl Token for AutomatonToken { - fn from_fair_token(token: u16) -> AutomatonToken { - if (Self::TK_FAIR_RANGE..Self::TK_FAIR_RANGE + RangeToken::FAIR_VOCABULARY_SIZE) - .contains(&token) - { - AutomatonToken::Range(RangeToken::from_fair_token(token)) - } else if (Self::TK_FAIR_STATE..Self::TK_FAIR_STATE + Self::FAIR_MAX_NUMBER_OF_STATES) - .contains(&token) - { - AutomatonToken::State(token - Self::TK_FAIR_STATE) - } else if token == Self::TK_FAIR_ACCEPT_STATE { - AutomatonToken::AcceptState - } else if token == Self::TK_FAIR_SEPARATOR_STATE { - AutomatonToken::SeparatorState - } else { - AutomatonToken::Error - } - } - - fn to_fair_token(&self) -> Result { - Ok(match self { - AutomatonToken::Range(r) => r.to_fair_token()?, - AutomatonToken::State(s) => { - let max = Self::FAIR_MAX_NUMBER_OF_STATES; - let s = *s; - if s > max { - return Err(TokenError::TokenOutOfBound("State", max.into(), s.into())); - } - s + Self::TK_FAIR_STATE - } - AutomatonToken::AcceptState => Self::TK_FAIR_ACCEPT_STATE, - AutomatonToken::SeparatorState => Self::TK_FAIR_SEPARATOR_STATE, - AutomatonToken::Error => return Err(TokenError::UnknownToken), - }) - } -} diff --git a/src/tokenizer/token/range_token.rs b/src/tokenizer/token/range_token.rs deleted file mode 100644 index 7876452..0000000 --- a/src/tokenizer/token/range_token.rs +++ /dev/null @@ -1,58 +0,0 @@ -use super::*; - -#[derive(Debug, Eq, PartialEq, Clone, Copy)] -pub enum RangeToken { - Total, - Base(usize), - Error, -} - -impl RangeToken { - const TK_FAIR_TOTAL: u16 = 0; - const TK_FAIR_BASE: u16 = 1; - - pub const FAIR_MAX_NUMBER_OF_BASES: u16 = 127; - - pub const FAIR_VOCABULARY_SIZE: u16 = 
Self::TK_FAIR_BASE + Self::FAIR_MAX_NUMBER_OF_BASES + 1; -} - -impl Ord for RangeToken { - fn cmp(&self, other: &Self) -> Ordering { - (self.to_fair_token().unwrap()).cmp(&other.to_fair_token().unwrap()) - } -} - -impl PartialOrd for RangeToken { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Token for RangeToken { - fn from_fair_token(token: u16) -> RangeToken { - if token == Self::TK_FAIR_TOTAL { - RangeToken::Total - } else if (Self::TK_FAIR_BASE..Self::TK_FAIR_BASE + Self::FAIR_MAX_NUMBER_OF_BASES) - .contains(&token) - { - RangeToken::Base((token - Self::TK_FAIR_BASE) as usize) - } else { - RangeToken::Error - } - } - - fn to_fair_token(&self) -> Result { - Ok(match self { - RangeToken::Total => Self::TK_FAIR_TOTAL, - RangeToken::Base(b) => { - let max = Self::FAIR_MAX_NUMBER_OF_BASES; - let b = *b as u16; - if b > max { - return Err(TokenError::TokenOutOfBound("Base", max.into(), b.into())); - } - b + Self::TK_FAIR_BASE - } - RangeToken::Error => return Err(TokenError::UnknownToken), - }) - } -} diff --git a/src/tokenizer/token/regex_token.rs b/src/tokenizer/token/regex_token.rs deleted file mode 100644 index bcb2e2b..0000000 --- a/src/tokenizer/token/regex_token.rs +++ /dev/null @@ -1,84 +0,0 @@ -use self::range_token::RangeToken; - -use super::*; - -#[derive(Debug, Eq, PartialEq, Clone, Copy)] -pub enum RegexToken { - Range(RangeToken), - StartGroup, - EndGroup, - Alternation, - RepetitionNone, - Repetition(u16), - Error, -} - -impl Ord for RegexToken { - fn cmp(&self, other: &Self) -> Ordering { - (self.to_fair_token().unwrap()).cmp(&other.to_fair_token().unwrap()) - } -} - -impl PartialOrd for RegexToken { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl RegexToken { - const TK_FAIR_RANGE: u16 = 0; - const TK_FAIR_START_GROUP: u16 = Self::TK_FAIR_RANGE + RangeToken::FAIR_VOCABULARY_SIZE; - const TK_FAIR_END_GROUP: u16 = Self::TK_FAIR_START_GROUP + 1; - const TK_FAIR_ALTERNATION: u16 = Self::TK_FAIR_END_GROUP + 1; - const TK_FAIR_REPETITION_NONE: u16 = Self::TK_FAIR_ALTERNATION + 1; - const TK_FAIR_REPETITION: u16 = Self::TK_FAIR_REPETITION_NONE + 1; - - pub const FAIR_MAX_NUMBER_OF_REPETITION: u16 = 1024; - - pub const FAIR_VOCABULARY_SIZE: u16 = - Self::TK_FAIR_REPETITION + Self::FAIR_MAX_NUMBER_OF_REPETITION + 1; -} - -impl Token for RegexToken { - fn from_fair_token(token: u16) -> RegexToken { - if (Self::TK_FAIR_RANGE..Self::TK_FAIR_RANGE + RangeToken::FAIR_VOCABULARY_SIZE) - .contains(&token) - { - RegexToken::Range(RangeToken::from_fair_token(token)) - } else if token == Self::TK_FAIR_START_GROUP { - RegexToken::StartGroup - } else if token == Self::TK_FAIR_END_GROUP { - RegexToken::EndGroup - } else if token == Self::TK_FAIR_ALTERNATION { - RegexToken::Alternation - } else if token == Self::TK_FAIR_REPETITION_NONE { - RegexToken::RepetitionNone - } else if (Self::TK_FAIR_REPETITION - ..Self::TK_FAIR_REPETITION + Self::FAIR_MAX_NUMBER_OF_REPETITION) - .contains(&token) - { - RegexToken::Repetition(token - Self::TK_FAIR_REPETITION) - } else { - RegexToken::Error - } - } - - fn to_fair_token(&self) -> Result { - Ok(match self { - RegexToken::Range(r) => r.to_fair_token()?, - RegexToken::StartGroup => Self::TK_FAIR_START_GROUP, - RegexToken::EndGroup => Self::TK_FAIR_END_GROUP, - RegexToken::Alternation => Self::TK_FAIR_ALTERNATION, - RegexToken::RepetitionNone => Self::TK_FAIR_REPETITION_NONE, - RegexToken::Repetition(r) => { - let max = Self::FAIR_MAX_NUMBER_OF_REPETITION; - let r = *r; - if r 
> max { - return Err(TokenError::TokenOutOfBound("Repetition", max.into(), r.into())); - } - r + Self::TK_FAIR_REPETITION - } - RegexToken::Error => return Err(TokenError::UnknownToken), - }) - } -} From f874caad390cbe97872660b2a11ab58be4ca25d1 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Mon, 11 Aug 2025 21:51:29 +0200 Subject: [PATCH 23/44] remove some errors --- src/error/mod.rs | 16 -------------- src/fast_automaton/analyze/cardinality.rs | 17 +++++++------- src/fast_automaton/condition/converter.rs | 14 +++++------- .../condition/fast_bit_vec/mod.rs | 12 +++++----- src/fast_automaton/condition/mod.rs | 22 ++++++++++--------- src/fast_automaton/operation/difference.rs | 5 ++--- src/lib.rs | 18 +++++---------- src/regex/analyze/mod.rs | 2 +- 8 files changed, 38 insertions(+), 68 deletions(-) diff --git a/src/error/mod.rs b/src/error/mod.rs index 448dfb9..29052b4 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -10,21 +10,15 @@ pub enum EngineError { InvalidCharacterInRegex, /// The operation took too much time. OperationTimeOutError, - /// The given automaton should be deterministic. - AutomatonShouldBeDeterministic, /// The automaton has too many states. AutomatonHasTooManyStates, /// The regular expression can not be parsed. RegexSyntaxError(String), /// The provided range can not be built from the spanning set. ConditionInvalidRange, - /// The provided index is out of bound of the condition. - ConditionIndexOutOfBound, #[cfg(feature = "serializable")] /// There is an error with one of the token. TokenError(TokenError), - /// Computing the cardinality of the provided automaton failed. - CannotComputeAutomatonCardinality, } impl fmt::Display for EngineError { @@ -32,9 +26,6 @@ impl fmt::Display for EngineError { match self { EngineError::InvalidCharacterInRegex => write!(f, "Invalid character used in regex."), EngineError::OperationTimeOutError => write!(f, "The operation took too much time."), - EngineError::AutomatonShouldBeDeterministic => { - write!(f, "The given automaton should be deterministic.") - } EngineError::AutomatonHasTooManyStates => { write!(f, "The automaton has too many states.") } @@ -45,13 +36,6 @@ impl fmt::Display for EngineError { f, "The provided range can not be built from the spanning set." ), - EngineError::ConditionIndexOutOfBound => { - write!(f, "The provided index is out of bound of the condition.") - } - EngineError::CannotComputeAutomatonCardinality => write!( - f, - "Computing the cardinality of the provided automaton failed." - ), } } } diff --git a/src/fast_automaton/analyze/cardinality.rs b/src/fast_automaton/analyze/cardinality.rs index 7157bae..5741e77 100644 --- a/src/fast_automaton/analyze/cardinality.rs +++ b/src/fast_automaton/analyze/cardinality.rs @@ -4,18 +4,17 @@ use super::*; impl FastAutomaton { /// Returns the cardinality of the automaton (i.e., the number of possible matched strings). 
- pub fn get_cardinality(&self) -> Option> { + pub fn get_cardinality(&self) -> Cardinality { if self.is_empty() { - return Some(Cardinality::Integer(0)); + return Cardinality::Integer(0); } else if self.cyclic || self.is_total() { - return Some(Cardinality::Infinite); - } else if !self.deterministic { - return None; + return Cardinality::Infinite; } + assert!(self.is_determinitic(), "The automaton should be deterministic."); let topologically_sorted_states = self.topological_sorted_states(); if topologically_sorted_states.is_none() { - return Some(Cardinality::Infinite); + return Cardinality::Infinite; } let topologically_sorted_states = topologically_sorted_states.unwrap(); @@ -41,7 +40,7 @@ impl FastAutomaton { } } - return Some(Cardinality::BigInteger); + return Cardinality::BigInteger; } } } @@ -53,10 +52,10 @@ impl FastAutomaton { temp_cardinality = add; continue; } - return Some(Cardinality::BigInteger); + return Cardinality::BigInteger; } } - Some(Cardinality::Integer(temp_cardinality)) + Cardinality::Integer(temp_cardinality) } fn topological_sorted_states(&self) -> Option> { diff --git a/src/fast_automaton/condition/converter.rs b/src/fast_automaton/condition/converter.rs index 503d6ce..9fabd11 100644 --- a/src/fast_automaton/condition/converter.rs +++ b/src/fast_automaton/condition/converter.rs @@ -59,14 +59,10 @@ impl<'a, 'b> ConditionConverter<'a, 'b> { pub fn convert(&self, condition: &Condition) -> Result { let mut new_condition = Condition::empty(self.to_spanning_set); for (from_index, to_indexes) in self.equivalence_map.iter().enumerate() { - if let Some(has) = condition.0.get(from_index) { - if has && !to_indexes.is_empty() { - to_indexes.iter().for_each(|&to_index| { - new_condition.0.set(to_index, true); - }); - } - } else { - return Err(EngineError::ConditionIndexOutOfBound); + if condition.0.get(from_index) && !to_indexes.is_empty() { + to_indexes.iter().for_each(|&to_index| { + new_condition.0.set(to_index, true); + }); } } @@ -86,8 +82,8 @@ impl<'a, 'b> ConditionConverter<'a, 'b> { #[cfg(test)] mod tests { - use regex_charclass::{char::Char, irange::{range::AnyRange}}; use crate::CharRange; + use regex_charclass::{char::Char, irange::range::AnyRange}; use super::*; diff --git a/src/fast_automaton/condition/fast_bit_vec/mod.rs b/src/fast_automaton/condition/fast_bit_vec/mod.rs index a1b46c5..9c85a43 100644 --- a/src/fast_automaton/condition/fast_bit_vec/mod.rs +++ b/src/fast_automaton/condition/fast_bit_vec/mod.rs @@ -7,7 +7,7 @@ pub struct FastBitVec { impl std::fmt::Display for FastBitVec { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { for i in 0..self.n { - let bit = if self.get(i).unwrap() { 1 } else { 0 }; + let bit = if self.get(i) { 1 } else { 0 }; write!(f, "{bit}")?; } Ok(()) @@ -48,13 +48,11 @@ impl FastBitVec { } #[inline] - pub fn get(&self, i: usize) -> Option { - if i >= self.n { - return None; - } + pub fn get(&self, i: usize) -> bool { + assert!(i < self.n, "The provided bit index is out of bound."); let w = i / 64; let b = i % 64; - self.bits.get(w).map(|&block| (block & (1 << b)) != 0) + (self.bits[w] & (1 << b)) != 0 } #[inline] @@ -126,7 +124,7 @@ impl FastBitVec { pub fn get_bits(&self) -> Vec { let mut bits = Vec::with_capacity(self.n); for i in 0..self.n { - bits.push(self.get(i).unwrap()); + bits.push(self.get(i)); } bits } diff --git a/src/fast_automaton/condition/mod.rs b/src/fast_automaton/condition/mod.rs index 08a439b..122ccc9 100644 --- a/src/fast_automaton/condition/mod.rs +++ b/src/fast_automaton/condition/mod.rs @@ 
-1,9 +1,9 @@ use std::hash::Hash; use fast_bit_vec::FastBitVec; -use regex_charclass::{char::Char, CharacterClass}; +use regex_charclass::{CharacterClass, char::Char}; -use crate::{error::EngineError, CharRange}; +use crate::{CharRange, error::EngineError}; use super::spanning_set::SpanningSet; pub mod converter; @@ -76,12 +76,8 @@ impl Condition { .iter() .enumerate() { - if let Some(has) = self.0.get(i) { - if has { - range = range.union(base); - } - } else { - return Err(EngineError::ConditionIndexOutOfBound); + if self.0.get(i) { + range = range.union(base); } } @@ -193,11 +189,17 @@ mod tests { let empty = Condition::empty(&spanning_set); //println!("{empty}"); assert!(empty.is_empty()); - assert_eq!(vec![false, false, false, false], empty.get_binary_representation()); + assert_eq!( + vec![false, false, false, false], + empty.get_binary_representation() + ); let total = Condition::total(&spanning_set); //println!("{total}"); assert!(total.is_total()); - assert_eq!(vec![true, true, true, true], total.get_binary_representation()); + assert_eq!( + vec![true, true, true, true], + total.get_binary_representation() + ); assert_eq!(CharRange::empty(), empty.to_range(&spanning_set).unwrap()); assert_eq!(CharRange::total(), total.to_range(&spanning_set).unwrap()); diff --git a/src/fast_automaton/operation/difference.rs b/src/fast_automaton/operation/difference.rs index a7b8ecf..acb63e9 100644 --- a/src/fast_automaton/operation/difference.rs +++ b/src/fast_automaton/operation/difference.rs @@ -6,9 +6,8 @@ use super::*; impl FastAutomaton { fn totalize(&mut self) -> Result<(), EngineError> { - if !self.is_determinitic() { - return Err(EngineError::AutomatonShouldBeDeterministic); - } + assert!(self.is_determinitic(), "The automaton should be deterministic."); + let crash_state = self.new_state(); let mut transitions_to_crash_state: IntMap = IntMap::with_capacity_and_hasher( diff --git a/src/lib.rs b/src/lib.rs index e4119e2..9c8e6ad 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -449,19 +449,11 @@ impl Term { pub fn get_cardinality(&self) -> Result, EngineError> { match self { Term::RegularExpression(regex) => Ok(regex.get_cardinality()), - Term::Automaton(automaton) => { - let cardinality = if !automaton.is_determinitic() { - automaton.determinize()?.get_cardinality() - } else { - automaton.get_cardinality() - }; - - if let Some(cardinality) = cardinality { - Ok(cardinality) - } else { - Err(EngineError::CannotComputeAutomatonCardinality) - } - } + Term::Automaton(automaton) => Ok(if !automaton.is_determinitic() { + automaton.determinize()?.get_cardinality() + } else { + automaton.get_cardinality() + }), } } diff --git a/src/regex/analyze/mod.rs b/src/regex/analyze/mod.rs index 2ee4bc5..f5d1975 100644 --- a/src/regex/analyze/mod.rs +++ b/src/regex/analyze/mod.rs @@ -232,7 +232,7 @@ mod tests { //automaton.to_dot(); - let expected = automaton.get_cardinality().unwrap(); + let expected = automaton.get_cardinality(); assert_eq!(expected, cardinality); } From c2cc84234182704b941002e66fc878f78ce97c4e Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Tue, 16 Sep 2025 21:06:52 +0200 Subject: [PATCH 24/44] Change regex convertion algo --- src/fast_automaton/analyze/cardinality.rs | 6 +- src/fast_automaton/analyze/equivalence.rs | 28 +- src/fast_automaton/analyze/length.rs | 4 +- src/fast_automaton/analyze/mod.rs | 6 +- src/fast_automaton/analyze/subset.rs | 34 +- src/fast_automaton/builder.rs | 27 +- .../convert/to_regex/builder/scc.rs | 207 ---------- 
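The `get_cardinality` rework above now returns `Cardinality` directly: `Integer(0)` for the empty automaton, `Infinite` when the automaton is cyclic or total, and otherwise a count accumulated over a topological order of the states (only fragments of that accumulation are visible in the hunks; on overflow it falls back to `Cardinality::BigInteger`). A minimal sketch of the counting idea on an acyclic, deterministic automaton, using invented helper names and plain integers instead of `Cardinality` (illustration only):

    use std::collections::HashMap;

    // Count the strings accepted by an acyclic DFA by dynamic programming over a
    // topological order. Determinism matters: it guarantees that distinct paths
    // spell distinct strings, which is why the patched code asserts it.
    // `transitions[s]` maps a destination state to the number of characters
    // labelling that edge (a stand-in for the crate's `Condition`).
    fn count_strings(
        order: &[usize],
        transitions: &[HashMap<usize, u128>],
        accept: &[bool],
        start: usize,
    ) -> u128 {
        let mut paths = vec![0u128; transitions.len()]; // strings reaching each state
        paths[start] = 1;
        let mut total = 0;
        for &s in order {
            if accept[s] {
                total += paths[s];
            }
            for (&t, &chars) in &transitions[s] {
                paths[t] += paths[s] * chars;
            }
        }
        total
    }

    fn main() {
        // "[ab]c|d": 0 -[ab]-> 1 -c-> 2 and 0 -d-> 2, accepting in state 2.
        let transitions = vec![
            HashMap::from([(1, 2u128), (2, 1u128)]),
            HashMap::from([(2, 1u128)]),
            HashMap::new(),
        ];
        let accept = [false, false, true];
        assert_eq!(3, count_strings(&[0, 1, 2], &transitions, &accept, 0));
    }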
src/fast_automaton/convert/to_regex/mod.rs | 378 +++--------------- .../mod.rs => state_elimination/builder.rs} | 103 +++-- .../to_regex/state_elimination/eliminate.rs | 118 ++++++ .../convert/to_regex/state_elimination/mod.rs | 121 ++++++ .../convert/to_regex/transform.rs | 208 ---------- .../convert/to_regex/transform/mod.rs | 16 + .../to_regex/transform/shape/dotstar.rs | 172 ++++++++ .../convert/to_regex/transform/shape/mod.rs | 1 + src/fast_automaton/generate.rs | 4 +- src/fast_automaton/mod.rs | 64 +-- src/fast_automaton/operation/concat.rs | 72 ++-- src/fast_automaton/operation/determinize.rs | 6 +- src/fast_automaton/operation/difference.rs | 8 +- src/fast_automaton/operation/intersection.rs | 99 ++--- src/fast_automaton/operation/mod.rs | 6 +- src/fast_automaton/operation/repeat.rs | 8 +- src/fast_automaton/operation/union.rs | 203 +++++++--- src/fast_automaton/serializer/mod.rs | 6 +- .../serializer/tokenizer/embed_automaton.rs | 24 +- src/lib.rs | 46 +-- src/regex/builder.rs | 40 +- src/regex/mod.rs | 90 +++++ src/regex/operation/concat.rs | 31 +- src/regex/operation/repeat.rs | 2 +- src/regex/operation/union.rs | 44 +- tests/integration_tests.rs | 10 +- 33 files changed, 1018 insertions(+), 1174 deletions(-) delete mode 100644 src/fast_automaton/convert/to_regex/builder/scc.rs rename src/fast_automaton/convert/to_regex/{builder/mod.rs => state_elimination/builder.rs} (62%) create mode 100644 src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs create mode 100644 src/fast_automaton/convert/to_regex/state_elimination/mod.rs delete mode 100644 src/fast_automaton/convert/to_regex/transform.rs create mode 100644 src/fast_automaton/convert/to_regex/transform/mod.rs create mode 100644 src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs create mode 100644 src/fast_automaton/convert/to_regex/transform/shape/mod.rs diff --git a/src/fast_automaton/analyze/cardinality.rs b/src/fast_automaton/analyze/cardinality.rs index 5741e77..ccad761 100644 --- a/src/fast_automaton/analyze/cardinality.rs +++ b/src/fast_automaton/analyze/cardinality.rs @@ -65,9 +65,9 @@ impl FastAutomaton { let mut queue = VecDeque::with_capacity(len); let mut order = Vec::with_capacity(len); - for from_state in &self.all_states_vec() { + for from_state in &self.states_vec() { in_degree.entry(*from_state).or_insert(0); - for to_state in self.direct_states_iter(from_state) { + for to_state in self.direct_states(from_state) { *in_degree.entry(to_state).or_insert(0) += 1; } } @@ -80,7 +80,7 @@ impl FastAutomaton { while let Some(from_state) = queue.pop_front() { order.push(from_state); - for to_state in self.direct_states_iter(&from_state) { + for to_state in self.direct_states(&from_state) { *in_degree.entry(to_state).or_default() -= 1; if in_degree[&to_state] == 0 { diff --git a/src/fast_automaton/analyze/equivalence.rs b/src/fast_automaton/analyze/equivalence.rs index 32f2ccb..3f70711 100644 --- a/src/fast_automaton/analyze/equivalence.rs +++ b/src/fast_automaton/analyze/equivalence.rs @@ -4,7 +4,7 @@ use super::*; impl FastAutomaton { /// Returns `true` if both automata accept the same language. 
- pub fn are_equivalent(&self, other: &FastAutomaton) -> Result { + pub fn equivalent(&self, other: &FastAutomaton) -> Result { if self.is_empty() != other.is_empty() && self.is_total() != other.is_total() { return Ok(false); } else if self == other { @@ -44,26 +44,26 @@ mod tests { false, ); - let regex_1 = RegularExpression::new("cd").unwrap(); - let regex_2 = RegularExpression::new("cd").unwrap(); + let regex_1 = RegularExpression::parse("cd", false).unwrap(); + let regex_2 = RegularExpression::parse("cd", false).unwrap(); assert_equivalent(®ex_1, ®ex_2, true); - let regex_1 = RegularExpression::new("test.*other").unwrap(); - let regex_2 = RegularExpression::new("test.*othew").unwrap(); + let regex_1 = RegularExpression::parse("test.*other", false).unwrap(); + let regex_2 = RegularExpression::parse("test.*othew", false).unwrap(); assert_equivalent(®ex_1, ®ex_2, false); - let regex_1 = RegularExpression::new("test.{0,50}other").unwrap(); - let regex_2 = RegularExpression::new("test.{0,49}other").unwrap(); + let regex_1 = RegularExpression::parse("test.{0,50}other", false).unwrap(); + let regex_2 = RegularExpression::parse("test.{0,49}other", false).unwrap(); assert_equivalent(®ex_1, ®ex_2, false); - let regex_1 = RegularExpression::new("[0]").unwrap(); - let regex_2 = RegularExpression::new("[01]").unwrap(); + let regex_1 = RegularExpression::parse("[0]", false).unwrap(); + let regex_2 = RegularExpression::parse("[01]", false).unwrap(); assert_equivalent(®ex_1, ®ex_2, false); - let regex_1 = RegularExpression::new("(b+a+)*").unwrap(); - let regex_2 = RegularExpression::new("(b[a-b]*a)?").unwrap(); + let regex_1 = RegularExpression::parse("(b+a+)*", false).unwrap(); + let regex_2 = RegularExpression::parse("(b[a-b]*a)?", false).unwrap(); assert_equivalent(®ex_1, ®ex_2, true); Ok(()) @@ -72,14 +72,14 @@ mod tests { fn assert_equivalent(regex_1: &RegularExpression, regex_2: &RegularExpression, expected: bool) { println!("{regex_1} and {regex_2}"); let automaton_1 = regex_1.to_automaton().unwrap(); - assert_eq!(true, automaton_1.are_equivalent(&automaton_1).unwrap()); + assert_eq!(true, automaton_1.equivalent(&automaton_1).unwrap()); let automaton_2 = regex_2.to_automaton().unwrap(); - assert_eq!(true, automaton_2.are_equivalent(&automaton_2).unwrap()); + assert_eq!(true, automaton_2.equivalent(&automaton_2).unwrap()); assert_eq!( expected, - automaton_1.are_equivalent(&automaton_2).unwrap() + automaton_1.equivalent(&automaton_2).unwrap() ); } } diff --git a/src/fast_automaton/analyze/length.rs b/src/fast_automaton/analyze/length.rs index 5ab7180..bbec964 100644 --- a/src/fast_automaton/analyze/length.rs +++ b/src/fast_automaton/analyze/length.rs @@ -27,7 +27,7 @@ impl FastAutomaton { } seen.insert(state); - for to_state in self.direct_states_iter(&state) { + for to_state in self.direct_states(&state) { if to_state == state || seen.contains(&to_state) { is_infinite = true; continue; @@ -54,7 +54,7 @@ impl FastAutomaton { } seen.insert(state); - for to_state in self.direct_states_iter(&state) { + for to_state in self.direct_states(&state) { if to_state == state || seen.contains(&to_state) { max = None; break; diff --git a/src/fast_automaton/analyze/mod.rs b/src/fast_automaton/analyze/mod.rs index 46b7c23..3902460 100644 --- a/src/fast_automaton/analyze/mod.rs +++ b/src/fast_automaton/analyze/mod.rs @@ -32,15 +32,15 @@ impl FastAutomaton { pub fn is_empty_string(&self) -> bool { self.accept_states.len() == 1 && self.accept_states.contains(&self.start_state) - && 
self.state_in_degree(self.start_state) == 0 + && self.in_degree(self.start_state) == 0 } /// Returns the set of all states reachable from the start state. pub fn get_reacheable_states(&self) -> IntSet { let mut states_map: IntMap> = IntMap::with_capacity_and_hasher(self.transitions.len(), BuildHasherDefault::default()); - for from_state in self.all_states_iter() { - for (condition, to_state) in self.transitions_from_iter(from_state) { + for from_state in self.states() { + for (condition, to_state) in self.transitions_from(from_state) { if condition.is_empty() { continue; } diff --git a/src/fast_automaton/analyze/subset.rs b/src/fast_automaton/analyze/subset.rs index e08a476..e4ca7d6 100644 --- a/src/fast_automaton/analyze/subset.rs +++ b/src/fast_automaton/analyze/subset.rs @@ -4,7 +4,7 @@ use super::*; impl FastAutomaton { /// Returns `true` if all strings accepted by `self` are also accepted by `other`. - pub fn is_subset_of(&self, other: &FastAutomaton) -> Result { + pub fn subset(&self, other: &FastAutomaton) -> Result { if self.is_empty() || other.is_total() || self == other { return Ok(true); } else if other.is_empty() || self.is_total() { @@ -39,33 +39,33 @@ mod tests { true, ); - let regex1 = RegularExpression::new("test.*other").unwrap(); - let regex2 = RegularExpression::new("test.*othew").unwrap(); + let regex1 = RegularExpression::parse("test.*other", false).unwrap(); + let regex2 = RegularExpression::parse("test.*othew", false).unwrap(); assert_subset(®ex1, ®ex2, false, false); - let regex1 = RegularExpression::new("test.{0,50}other").unwrap(); - let regex2 = RegularExpression::new("test.{0,49}other").unwrap(); + let regex1 = RegularExpression::parse("test.{0,50}other", false).unwrap(); + let regex2 = RegularExpression::parse("test.{0,49}other", false).unwrap(); assert_subset(®ex1, ®ex2, false, true); - let regex1 = RegularExpression::new("(abc|def)").unwrap(); - let regex2 = RegularExpression::new("(abc|def|xyz)").unwrap(); + let regex1 = RegularExpression::parse("(abc|def)", false).unwrap(); + let regex2 = RegularExpression::parse("(abc|def|xyz)", false).unwrap(); assert_subset(®ex1, ®ex2, true, false); - let regex1 = RegularExpression::new("[0]").unwrap(); - let regex2 = RegularExpression::new("[01]").unwrap(); + let regex1 = RegularExpression::parse("[0]", false).unwrap(); + let regex2 = RegularExpression::parse("[01]", false).unwrap(); assert_subset(®ex1, ®ex2, true, false); - let regex1 = RegularExpression::new("a.*b.*c.*").unwrap(); - let regex2 = RegularExpression::new("a.*b.*").unwrap(); + let regex1 = RegularExpression::parse("a.*b.*c.*", false).unwrap(); + let regex2 = RegularExpression::parse("a.*b.*", false).unwrap(); assert_subset(®ex1, ®ex2, true, false); - let regex1 = RegularExpression::new("1..").unwrap(); - let regex2 = RegularExpression::new("...").unwrap(); + let regex1 = RegularExpression::parse("1..", false).unwrap(); + let regex2 = RegularExpression::parse("...", false).unwrap(); assert_subset(®ex1, ®ex2, true, false); @@ -80,18 +80,18 @@ mod tests { ) { println!("{regex_1} and {regex_2}"); let automaton_1 = regex_1.to_automaton().unwrap(); - assert_eq!(true, automaton_1.is_subset_of(&automaton_1).unwrap()); + assert_eq!(true, automaton_1.subset(&automaton_1).unwrap()); let automaton_2 = regex_2.to_automaton().unwrap(); - assert_eq!(true, automaton_2.is_subset_of(&automaton_2).unwrap()); + assert_eq!(true, automaton_2.subset(&automaton_2).unwrap()); assert_eq!( expected_1_2, - automaton_1.is_subset_of(&automaton_2).unwrap() + 
automaton_1.subset(&automaton_2).unwrap() ); assert_eq!( expected_2_1, - automaton_2.is_subset_of(&automaton_1).unwrap() + automaton_2.subset(&automaton_1).unwrap() ); } } diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index 8bcc136..da16808 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -122,7 +122,7 @@ impl FastAutomaton { if self.deterministic { let mut deterministic = true; - for (condition, state) in self.transitions_from_iter(from_state) { + for (condition, state) in self.transitions_from(from_state) { if state == &to_state { continue; } @@ -159,12 +159,15 @@ impl FastAutomaton { self.accept_states.insert(from_state); } - let transitions_to: Vec<_> = self.transitions_from_into_iter(&to_state).collect(); + let transitions_to: Vec<_> = self + .transitions_from(to_state) + .map(|(cond, to_state)| (cond.clone(), *to_state)) + .collect(); for (cond, state) in transitions_to { if self.deterministic { let mut deterministic = true; - for (c, s) in self.transitions_from_iter(from_state) { + for (c, s) in self.transitions_from(from_state) { if state == *s { continue; } @@ -190,6 +193,19 @@ impl FastAutomaton { } } + pub fn remove_transition(&mut self, from_state: State, to_state: State) { + self.assert_state_exists(from_state); + if from_state != to_state { + self.assert_state_exists(to_state); + } + + self.transitions_in + .entry(to_state) + .or_default() + .remove(&from_state); + self.transitions[from_state].remove(&to_state); + } + /// Removes the state and all its connected transitions; panics if it's a start state. pub fn remove_state(&mut self, state: State) { self.assert_state_exists(state); @@ -269,7 +285,7 @@ impl FastAutomaton { return Ok(()); } let condition_converter = ConditionConverter::new(&self.spanning_set, new_spanning_set)?; - for from_state in &self.all_states_vec() { + for from_state in &self.states_vec() { for to_state in self.direct_states_vec(from_state) { match self.transitions[*from_state].entry(to_state) { Entry::Occupied(mut o) => { @@ -296,6 +312,7 @@ impl FastAutomaton { #[inline] pub(crate) fn apply_model(&mut self, model: &FastAutomaton) { self.transitions = model.transitions.clone(); + self.transitions_in = model.transitions_in.clone(); self.start_state = model.start_state; self.accept_states = model.accept_states.clone(); self.removed_states = model.removed_states.clone(); @@ -320,7 +337,7 @@ mod tests { } fn assert_regex_build_deterministic_automaton(regex: &str, deterministic: bool) { - let automaton = RegularExpression::new(regex) + let automaton = RegularExpression::parse(regex, false) .unwrap() .to_automaton() .unwrap(); diff --git a/src/fast_automaton/convert/to_regex/builder/scc.rs b/src/fast_automaton/convert/to_regex/builder/scc.rs deleted file mode 100644 index c99cbc5..0000000 --- a/src/fast_automaton/convert/to_regex/builder/scc.rs +++ /dev/null @@ -1,207 +0,0 @@ -use super::*; - -impl StateEliminationAutomaton { - pub fn identify_and_apply_components(&mut self) -> Result<(), EngineError> { - let mut index = 0; - let mut stack = Vec::new(); - let mut indices = vec![-1; self.transitions.len()]; - let mut lowlink = vec![-1; self.transitions.len()]; - let mut on_stack = vec![false; self.transitions.len()]; - let mut scc = Vec::new(); - - for state in self.states_iter() { - if self.removed_states.contains(&state) { - continue; - } - if indices[state] == -1 { - self.strongconnect( - state, - &mut index, - &mut stack, - &mut indices, - &mut lowlink, - &mut on_stack, - &mut scc, - ); - } - } - 
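The two renames above keep their original semantics: `equivalent` (formerly `are_equivalent`) checks that two automata accept the same language, and `subset` (formerly `is_subset_of`) checks language inclusion. A short usage sketch mirroring the updated tests; the crate path `regexsolver` is an assumption, and `RegularExpression::parse` is called with `false` as its second argument exactly as the tests do:

    use regexsolver::regex::RegularExpression;

    fn main() {
        // Same pattern pair as in the equivalence test above; the two patterns
        // accept the same language.
        let a = RegularExpression::parse("(b+a+)*", false)
            .unwrap()
            .to_automaton()
            .unwrap();
        let b = RegularExpression::parse("(b[a-b]*a)?", false)
            .unwrap()
            .to_automaton()
            .unwrap();

        assert!(a.equivalent(&b).unwrap());
        // Equivalent languages include each other, so both inclusions hold.
        assert!(a.subset(&b).unwrap());
        assert!(b.subset(&a).unwrap());
    }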
- let scc = scc - .into_iter() - .filter(|states| { - let first_state = states.iter().next().unwrap(); - let self_loop = if let Some(transitions_in) = self.transitions_in.get(first_state) { - transitions_in.contains(first_state) - } else { - false - }; - states.len() != 1 || self_loop - }) - .collect::>(); - - for component in scc { - self.build_component(&component)?; - } - - self.cyclic = false; - - Ok(()) - } - - #[allow(clippy::too_many_arguments)] - fn strongconnect( - &self, - v: usize, - index: &mut usize, - stack: &mut Vec, - indices: &mut Vec, - lowlink: &mut Vec, - on_stack: &mut Vec, - scc: &mut Vec>, - ) { - indices[v] = *index as i32; - lowlink[v] = *index as i32; - *index += 1; - stack.push(v); - on_stack[v] = true; - - if let Some(neighbors) = self.transitions.get(v) { - for &w in neighbors.keys() { - if indices[w] == -1 { - self.strongconnect(w, index, stack, indices, lowlink, on_stack, scc); - lowlink[v] = lowlink[v].min(lowlink[w]); - } else if on_stack[w] { - lowlink[v] = lowlink[v].min(indices[w]); - } - } - } - - if lowlink[v] == indices[v] { - let mut component = Vec::new(); - while let Some(w) = stack.pop() { - on_stack[w] = false; - component.push(w); - if w == v { - break; - } - } - scc.push(component); - } - } - - fn build_component(&mut self, states: &[usize]) -> Result<(), EngineError> { - let state_set = states.iter().copied().collect::>(); - let mut start_states = IntMap::new(); - let mut accept_states = IntMap::new(); - - let mut state_elimination_automaton = StateEliminationAutomaton { - start_state: 0, // start_state is not set yet - accept_state: 0, // accept_state is not set yet - transitions: Vec::with_capacity(states.len()), - transitions_in: IntMap::with_capacity(states.len()), - removed_states: IntSet::new(), - cyclic: true, - }; - - let mut states_map = IntMap::with_capacity(states.len()); - for from_state in states { - if *from_state == self.accept_state { - self.accept_state = self.new_state(); - self.add_transition_to(*from_state, self.accept_state, GraphTransition::Epsilon); - } - if *from_state == self.start_state { - self.start_state = self.new_state(); - self.add_transition_to(self.start_state, *from_state, GraphTransition::Epsilon); - } - let from_state_new = *states_map - .entry(*from_state) - .or_insert_with(|| state_elimination_automaton.new_state()); - for (to_state, transition) in self.transitions_from_state_enumerate_iter(from_state) { - if !state_set.contains(to_state) { - accept_states - .entry(*to_state) - .or_insert_with(Vec::new) - .push((from_state_new, transition.clone())); - continue; - } - - let to_state_new = *states_map - .entry(*to_state) - .or_insert_with(|| state_elimination_automaton.new_state()); - - state_elimination_automaton.add_transition_to( - from_state_new, - to_state_new, - transition.clone(), - ); - } - - for (parent_state, transition) in self.in_transitions_vec(*from_state) { - if !state_set.contains(&parent_state) { - start_states - .entry(from_state_new) - .or_insert_with(Vec::new) - .push((parent_state, transition.clone())); - } - } - } - - for state in states { - self.remove_state(*state); - } - - for (start_state, parent_states) in &start_states { - for (parent_state, transition) in parent_states { - let new_parent_state = if !transition.is_empty_string() { - let new_parent_state = self.new_state(); - - self.add_transition_to(*parent_state, new_parent_state, transition.clone()); - new_parent_state - } else { - *parent_state - }; - for (target_state, accept_states_transition) in &accept_states { - let mut 
new_automaton = state_elimination_automaton.clone(); - - let target_state = if accept_states_transition.len() > 1 { - new_automaton.accept_state = new_automaton.new_state(); - for (accept_state, transition) in accept_states_transition { - new_automaton.add_transition_to( - *accept_state, - new_automaton.accept_state, - transition.clone(), - ); - } - *target_state - } else { - let (accept_state, transition) = - accept_states_transition.iter().next().unwrap(); - - new_automaton.accept_state = *accept_state; - if !transition.is_empty_string() { - let new_target_state = self.new_state(); - self.add_transition_to( - new_target_state, - *target_state, - transition.clone(), - ); - new_target_state - } else { - *target_state - } - }; - - new_automaton.start_state = *start_state; - - self.add_transition_to( - new_parent_state, - target_state, - GraphTransition::Graph(new_automaton), - ); - } - } - } - - Ok(()) - } -} diff --git a/src/fast_automaton/convert/to_regex/mod.rs b/src/fast_automaton/convert/to_regex/mod.rs index e8e7e8e..10a530e 100644 --- a/src/fast_automaton/convert/to_regex/mod.rs +++ b/src/fast_automaton/convert/to_regex/mod.rs @@ -1,288 +1,12 @@ -use std::{ - collections::{VecDeque, hash_map::Entry}, - fmt::Display, -}; - -use ahash::{HashMapExt, HashSetExt}; -use log::warn; - -use crate::{error::EngineError, execution_profile::ExecutionProfile, regex::RegularExpression}; - use super::*; -mod builder; +mod state_elimination; mod transform; -#[derive(Clone, Debug)] -enum GraphTransition { - Graph(StateEliminationAutomaton), - Weight(T), - Epsilon, -} - -impl GraphTransition { - pub fn is_empty_string(&self) -> bool { - matches!(self, GraphTransition::Epsilon) - } - - pub fn get_weight(&self) -> Option<&T> { - if let GraphTransition::Weight(weight) = self { - Some(weight) - } else { - None - } - } -} - -#[derive(Clone, Debug)] -struct StateEliminationAutomaton { - start_state: usize, - accept_state: usize, - transitions: Vec>>, - transitions_in: IntMap>, - removed_states: IntSet, - cyclic: bool, -} - -impl Display for StateEliminationAutomaton { - fn fmt(&self, sb: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - self.to_graph_dot(sb, None) - } -} - -impl StateEliminationAutomaton { - //#[cfg(test)] - #[allow(dead_code)] - #[inline] - pub fn to_dot(&self) { - println!("{self}"); - } - - #[inline] - fn to_graph_dot( - &self, - sb: &mut std::fmt::Formatter<'_>, - prefix: Option<&str>, - ) -> std::fmt::Result { - let is_subgraph; - let indent; - let prefix = if let Some(prefix) = prefix { - writeln!(sb, "\tsubgraph cluster_{prefix} {{")?; - writeln!(sb, "\t\tlabel = \"{prefix} - cyclic={}\";", self.cyclic)?; - indent = "\t"; - is_subgraph = true; - prefix - } else { - writeln!(sb, "digraph Automaton {{")?; - writeln!(sb, "\trankdir = LR;")?; - writeln!(sb, "\tlabel = \"cyclic={}\";", self.cyclic)?; - indent = ""; - is_subgraph = false; - "" - }; - - for from_state in self.states_iter() { - let from_state_with_prefix = if is_subgraph { - format!("S{prefix}_{from_state}") - } else { - format!("S{from_state}") - }; - - write!(sb, "{indent}\t{from_state_with_prefix}")?; - if !is_subgraph && self.accept_state == from_state { - writeln!(sb, "\t[shape=doublecircle,label=\"{from_state}\"];")?; - } else { - writeln!(sb, "{indent}\t[shape=circle,label=\"{from_state}\"];")?; - } - - if !is_subgraph && self.start_state == from_state { - writeln!(sb, "\tinitial [shape=plaintext,label=\"\"];")?; - writeln!(sb, "\tinitial -> {from_state_with_prefix}")?; - } - for (to_state, weight) in 
self.transitions_from_state_enumerate_iter(&from_state) { - let to_state_with_prefix = if is_subgraph { - format!("S{prefix}_{to_state}") - } else { - format!("S{to_state}") - }; - - match weight { - GraphTransition::Graph(state_elimination_automaton) => { - let subgraph_prefix = if is_subgraph { - format!("{prefix}_{from_state}_{to_state}") - } else { - format!("{from_state}_{to_state}") - }; - state_elimination_automaton.to_graph_dot(sb, Some(&subgraph_prefix))?; - writeln!(sb)?; - let subgraph_start_state = format!( - "S{subgraph_prefix}_{}", - state_elimination_automaton.start_state - ); - writeln!( - sb, - "{indent}\t{from_state_with_prefix} -> {subgraph_start_state} [label=\"ε\"]" - )?; - - let subgraph_accept_state = format!( - "S{subgraph_prefix}_{}", - state_elimination_automaton.accept_state - ); - writeln!( - sb, - "{indent}\t{subgraph_accept_state} -> {to_state_with_prefix} [label=\"ε\"]" - ) - } - GraphTransition::Weight(range) => { - writeln!( - sb, - "{indent}\t{} -> {} [label=\"{}\"]", - from_state_with_prefix, - to_state_with_prefix, - RegularExpression::Character(range.clone()) - .to_string() - .replace('\\', "\\\\") - .replace('"', "\\\"") - ) - } - GraphTransition::Epsilon => writeln!( - sb, - "{indent}\t{from_state_with_prefix} -> {to_state_with_prefix} [label=\"ε\"]" - ), - }?; - } - } - write!(sb, "{indent}}}") - } - - #[inline] - pub fn states_iter(&self) -> impl Iterator + '_ { - (0..self.transitions.len()).filter(|s| !self.removed_states.contains(s)) - } - - #[inline] - pub fn transitions_from_state_enumerate_iter( - &self, - from_state: &State, - ) -> impl Iterator)> { - self.transitions[*from_state] - .iter() - .filter(|s| !self.removed_states.contains(s.0)) - } - - #[inline] - pub fn transitions_from_state_vec(&self, from_state: &State) -> Vec { - self.transitions[*from_state] - .keys() - .filter(|s| !self.removed_states.contains(s)) - .copied() - .collect() - } - - pub fn in_transitions_vec(&self, to_state: State) -> Vec<(State, GraphTransition)> { - let mut in_transitions = vec![]; - for from_state in self.transitions_in.get(&to_state).unwrap_or(&IntSet::new()) { - for (state, transition) in self.transitions_from_state_enumerate_iter(from_state) { - if to_state == *state { - in_transitions.push((*from_state, transition.clone())); - } - } - } - in_transitions - } - - pub fn states_topo_vec(&self) -> Vec { - if self.cyclic { - panic!("The graph has a cycle"); - } - - let mut in_degree: IntMap = self - .transitions_in - .iter() - .map(|(state, parents)| (*state, parents.len())) - .collect(); - - let mut worklist: VecDeque = VecDeque::new(); - for (&state, °ree) in &in_degree { - if degree == 0 { - worklist.push_back(state); - } - } - - let mut sorted_order = Vec::with_capacity(self.get_number_of_states()); - while let Some(state) = worklist.pop_front() { - sorted_order.push(state); - - if let Some(neighbors) = self.transitions.get(state) { - let neighbors = neighbors.keys(); - for &neighbor in neighbors { - if let Some(degree) = in_degree.get_mut(&neighbor) { - *degree -= 1; - if *degree == 0 { - worklist.push_back(neighbor); - } - } - } - } - } - - if sorted_order.len() == self.get_number_of_states() { - sorted_order - } else { - panic!("The graph has a cycle"); - } - } - - #[inline] - pub fn get_number_of_states(&self) -> usize { - self.transitions.len() - self.removed_states.len() - } -} - impl FastAutomaton { - /// Attempts to convert the automaton to a [`RegularExpression`]; returns `None` if no equivalent pattern are found. 
- pub fn to_regex(&self) -> Option { - if self.is_empty() { - return Some(RegularExpression::new_empty()); - } - let execution_profile = ExecutionProfile::get(); - if let Ok(graph) = StateEliminationAutomaton::new(self) { - if let Ok(regex) = graph?.convert_to_regex(&execution_profile) { - let regex = regex?; - match regex.to_automaton() { - Ok(automaton) => match self.are_equivalent(&automaton) { - Ok(result) => { - if !result { - warn!( - "The automaton is not equivalent to the generated regex; automaton={self}, regex={regex}" - ); - None - } else { - Some(regex) - } - } - Err(err) => { - warn!( - "Engine error while checking for equivalence ({err}); automaton={self}, regex={regex}" - ); - None - } - }, - Err(err) => { - if let crate::error::EngineError::RegexSyntaxError(err) = err { - warn!( - "The generated regex cannot be converted to automaton to be checked for equivalence ({err}); automaton={self}, regex={regex}" - ); - } - None - } - } - } else { - None - } - } else { - None - } + pub fn to_regex(&self) -> RegularExpression { + let transformed_automaton = transform::transform(self); + state_elimination::convert_to_regex(&transformed_automaton) } } @@ -290,8 +14,24 @@ impl FastAutomaton { mod tests { use super::*; + #[test] + fn test_convert_t() -> Result<(), String> { + assert_convert("abc.*def.*uif(ab|de)"); + + Ok(()) + } + #[test] fn test_convert() -> Result<(), String> { + + assert_convert(".*u(ab|de)"); + assert_convert(".*sf.*uif(ab|de)"); + + assert_convert("(a+|,)*"); + assert_convert("((ab)*,(cd)*)*"); + assert_convert("(a*,a*,a*)*"); + assert_convert("(a*,a*)*"); + assert_convert("(ac|ads|a)*"); assert_convert(".*sf"); assert_convert(".*sf.*uif(ab|de)"); @@ -325,36 +65,33 @@ mod tests { } fn assert_convert(regex: &str) { - let input_regex = RegularExpression::new(regex).unwrap(); + let input_regex = RegularExpression::parse(regex, false).unwrap(); println!("IN : {}", input_regex); let input_automaton = input_regex.to_automaton().unwrap(); - //input_automaton.to_dot(); - - let output_regex = input_automaton.to_regex().unwrap(); + let output_regex = input_automaton.to_regex(); println!("OUT (non deterministic): {}", output_regex); let output_automaton = output_regex.to_automaton().unwrap(); - assert!(input_automaton.are_equivalent(&output_automaton).unwrap()); + assert!(input_automaton.equivalent(&output_automaton).unwrap()); let input_automaton = input_automaton.determinize().unwrap(); - //input_automaton.to_dot(); - let output_regex = input_automaton.to_regex().unwrap(); + let output_regex = input_automaton.to_regex(); println!("OUT (deterministic) : {}", output_regex); let output_automaton = output_regex.to_automaton().unwrap(); - assert!(input_automaton.are_equivalent(&output_automaton).unwrap()); + assert!(input_automaton.equivalent(&output_automaton).unwrap()); } #[test] fn test_convert_after_operation_1() -> Result<(), String> { - let automaton1 = RegularExpression::new("(ab|cd)") + let automaton1 = RegularExpression::parse("(ab|cd)", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("ab") + let automaton2 = RegularExpression::parse("ab", false) .unwrap() .to_automaton() .unwrap(); @@ -362,9 +99,9 @@ mod tests { let result = automaton1.difference(&automaton2).unwrap(); - result.to_dot(); + result.print_dot(); - let output_regex = result.to_regex().unwrap(); + let output_regex = result.to_regex(); assert_eq!("cd", output_regex.to_string()); Ok(()) @@ -372,20 +109,20 @@ mod tests { #[test] fn test_convert_after_operation_2() -> 
Result<(), String> { - let automaton1 = RegularExpression::new("a*") + let automaton1 = RegularExpression::parse("a*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("b*") + let automaton2 = RegularExpression::parse("b*", false) .unwrap() .to_automaton() .unwrap(); let result = automaton1.intersection(&automaton2).unwrap(); - result.to_dot(); + result.print_dot(); - let output_regex = result.to_regex().unwrap(); + let output_regex = result.to_regex(); assert_eq!("", output_regex.to_string()); Ok(()) @@ -393,71 +130,72 @@ mod tests { #[test] fn test_convert_after_operation_3() -> Result<(), String> { - let automaton1 = RegularExpression::new("x*") + let automaton1 = RegularExpression::parse("x*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("(xxx)*") + let automaton2 = RegularExpression::parse("(xxx)*", false) .unwrap() .to_automaton() .unwrap(); let automaton2 = automaton2.determinize().unwrap(); let result = automaton1.difference(&automaton2).unwrap(); - result.to_dot(); + result.print_dot(); - let result = result.to_regex().unwrap(); + let result = result.to_regex(); - assert_eq!("(x{3})*x{1,2}", result.to_string()); + assert_eq!("x(x{3})*x?", result.to_string()); Ok(()) } #[test] fn test_convert_after_operation_4() -> Result<(), String> { - let automaton1 = RegularExpression::new(".*abc.*") + let automaton1 = RegularExpression::parse(".*abc.*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new(".*def.*") + let automaton2 = RegularExpression::parse(".*def.*", false) .unwrap() .to_automaton() .unwrap(); let result = automaton1.intersection(&automaton2).unwrap(); - let result = result.to_regex().unwrap(); + let result = result.to_regex(); assert_eq!(".*(abc.*def|def.*abc).*", result.to_string()); Ok(()) } - /*#[test] - fn test_convert_after_operation_5() -> Result<(), String> { - if std::env::var_os("RUST_LOG").is_none() { - std::env::set_var("RUST_LOG", "regexsolver=debug"); - } - env_logger::init(); - - let automaton1 = RegularExpression::new(".*abc.*") + #[test] + fn test_automaton() -> Result<(), String> { + let automaton = RegularExpression::parse("a*ba*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new(".*def.*") + automaton.print_dot(); + + let automaton1 = RegularExpression::parse("(a*ba*)*", false) .unwrap() .to_automaton() - .unwrap() - .determinize() .unwrap(); + automaton1.print_dot(); - let result = automaton1.difference(&automaton2).unwrap(); - result.to_dot(); + automaton1.determinize().unwrap().print_dot(); + + // (a*b[ab]*)? 
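With the reworked pipeline above, `to_regex` no longer returns an `Option` and no longer re-checks equivalence against the input automaton: it applies the `transform` pass and then runs state elimination. A usage sketch that reproduces `test_convert_after_operation_4` from the hunk above (the crate path `regexsolver` is an assumption):

    use regexsolver::regex::RegularExpression;

    fn main() {
        let a = RegularExpression::parse(".*abc.*", false)
            .unwrap()
            .to_automaton()
            .unwrap();
        let b = RegularExpression::parse(".*def.*", false)
            .unwrap()
            .to_automaton()
            .unwrap();

        // The intersection keeps strings containing both "abc" and "def";
        // converting back yields the pattern asserted in the test above.
        let both = a.intersection(&b).unwrap();
        assert_eq!(".*(abc.*def|def.*abc).*", both.to_regex().to_string());
    }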
+ // a*b+a+b+ - let result = result.to_regex().unwrap(); + let automaton2 = RegularExpression::parse("(a*b[ab]*)?", false) + .unwrap() + .to_automaton() + .unwrap(); - assert_eq!("(x{3})*x{1,2}", result.to_string()); + assert!(automaton1.equivalent(&automaton2).unwrap()); Ok(()) - }*/ + } } diff --git a/src/fast_automaton/convert/to_regex/builder/mod.rs b/src/fast_automaton/convert/to_regex/state_elimination/builder.rs similarity index 62% rename from src/fast_automaton/convert/to_regex/builder/mod.rs rename to src/fast_automaton/convert/to_regex/state_elimination/builder.rs index 0790851..54cec00 100644 --- a/src/fast_automaton/convert/to_regex/builder/mod.rs +++ b/src/fast_automaton/convert/to_regex/state_elimination/builder.rs @@ -1,48 +1,64 @@ -use super::*; - -mod scc; +use ahash::HashMapExt; -impl StateEliminationAutomaton { - pub fn new(automaton: &FastAutomaton) -> Result, EngineError> { - if automaton.is_empty() { - return Ok(None); - } +use super::*; - let mut state_elimination_automaton = StateEliminationAutomaton { +impl Gnfa { + pub(super) fn from_automaton(automaton: &FastAutomaton) -> Gnfa { + let mut state_elimination_automaton = Gnfa { start_state: 0, // start_state is not set yet accept_state: 0, // accept_state is not set yet transitions: Vec::with_capacity(automaton.get_number_of_states()), transitions_in: IntMap::with_capacity(automaton.get_number_of_states()), - removed_states: IntSet::new(), - cyclic: false, + removed_states: IntSet::with_capacity(automaton.get_number_of_states()), + empty: false }; + if automaton.is_empty() { + state_elimination_automaton.empty = true; + return state_elimination_automaton; + } + let mut states_map = IntMap::with_capacity(automaton.get_number_of_states()); - for from_state in automaton.all_states_iter() { + for from_state in automaton.states() { let new_from_state = *states_map .entry(from_state) .or_insert_with(|| state_elimination_automaton.new_state()); - for (condition, to_state) in - automaton.transitions_from_iter(from_state) - { + for (condition, to_state) in automaton.transitions_from(from_state) { let new_to_state = *states_map .entry(*to_state) .or_insert_with(|| state_elimination_automaton.new_state()); - state_elimination_automaton.add_transition_to( + state_elimination_automaton.add_transition( new_from_state, new_to_state, - GraphTransition::Weight(condition.to_range(automaton.get_spanning_set())?), + RegularExpression::Character( + condition.to_range(automaton.get_spanning_set()).unwrap(), + ), ); } } - state_elimination_automaton.start_state = - *states_map.get(&automaton.get_start_state()).unwrap(); // We finally set start_state + if automaton.in_degree(automaton.get_start_state()) == 0 { + // If the start state does not have any incoming state we just set it + state_elimination_automaton.start_state = + *states_map.get(&automaton.get_start_state()).unwrap(); + } else { + // If not we create a new state that will be the new start state + state_elimination_automaton.start_state = state_elimination_automaton.new_state(); + + let previous_start_state = *states_map.get(&automaton.get_start_state()).unwrap(); + // We add an empty string transition to the new start state + state_elimination_automaton.add_transition( + state_elimination_automaton.start_state, + previous_start_state, + RegularExpression::new_empty_string(), + ); + } - if automaton.get_accept_states().len() == 1 { - // If there is only one accept state with just set it + let accept_state = *automaton.get_accept_states().iter().next().unwrap(); + if 
automaton.get_accept_states().len() == 1 && automaton.out_degree(accept_state) == 0 { + // If there is only one accept state we just set it state_elimination_automaton.accept_state = *states_map .get(automaton.get_accept_states().iter().next().unwrap()) .unwrap(); @@ -52,19 +68,18 @@ impl StateEliminationAutomaton { for accept_state in automaton.get_accept_states() { let accept_state = *states_map.get(accept_state).unwrap(); // We add an empty string transition to the new accept state - state_elimination_automaton.add_transition_to( + state_elimination_automaton.add_transition( accept_state, state_elimination_automaton.accept_state, - GraphTransition::Epsilon, + RegularExpression::new_empty_string(), ); } } - state_elimination_automaton.identify_and_apply_components()?; - //state_elimination_automaton.to_dot(); - Ok(Some(state_elimination_automaton)) + + state_elimination_automaton } - pub fn new_state(&mut self) -> usize { + fn new_state(&mut self) -> usize { if let Some(new_state) = self.removed_states.clone().iter().next() { self.removed_states.remove(new_state); self.transitions_in.insert(*new_state, IntSet::new()); @@ -78,7 +93,7 @@ impl StateEliminationAutomaton { } #[inline] - pub fn has_state(&self, state: State) -> bool { + pub(super) fn has_state(&self, state: State) -> bool { !(state >= self.transitions.len() || self.removed_states.contains(&state)) } @@ -89,11 +104,11 @@ impl StateEliminationAutomaton { } } - pub fn add_transition_to( + pub(crate) fn add_transition( &mut self, from_state: State, to_state: State, - transition: GraphTransition, + transition: RegularExpression, ) { self.assert_state_exists(from_state); if from_state != to_state { @@ -106,13 +121,8 @@ impl StateEliminationAutomaton { .insert(from_state); match self.transitions[from_state].entry(to_state) { Entry::Occupied(mut o) => { - if let (GraphTransition::Weight(current_regex), GraphTransition::Weight(regex)) = - (o.get(), transition) - { - o.insert(GraphTransition::Weight(current_regex.union(®ex))); - } else { - panic!("Cannot add transition"); - } + //o.insert(RegularExpression::Alternation(vec![transition, o.get().clone()])); + o.insert(transition.union(o.get())); } Entry::Vacant(v) => { v.insert(transition); @@ -120,7 +130,7 @@ impl StateEliminationAutomaton { }; } - pub fn remove_state(&mut self, state: State) { + pub(super) fn remove_state(&mut self, state: State) { self.assert_state_exists(state); if self.start_state == state || self.accept_state == state { panic!( @@ -149,21 +159,4 @@ impl StateEliminationAutomaton { transitions.remove(&state); } } - - pub fn remove_transition(&mut self, from_state: State, to_state: State) { - self.assert_state_exists(from_state); - if from_state != to_state { - self.assert_state_exists(to_state); - } - - if let Some(from_states) = self.transitions_in.get_mut(&to_state) { - from_states.remove(&from_state); - } - - self.transitions[from_state].remove(&to_state); - } - - pub fn get_transition(&self, from_state: State, to_state: State) -> Option<&GraphTransition> { - self.transitions.get(from_state)?.get(&to_state) - } } diff --git a/src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs b/src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs new file mode 100644 index 0000000..d528f1b --- /dev/null +++ b/src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs @@ -0,0 +1,118 @@ +use super::*; + +impl Gnfa { + pub(super) fn convert(&mut self) -> RegularExpression { + if self.empty { + return RegularExpression::new_empty(); + } + + while 
let Some(state) = self.get_next_state_to_eliminate() { + self.eliminate_state(state); + } + + self.get_transition(self.start_state, self.accept_state) + .cloned() + .unwrap_or(RegularExpression::new_empty_string()) + } + + fn get_next_state_to_eliminate(&self) -> Option { + let mut best_state: Option = None; + let mut best_score: u128 = u128::MAX; + + for state in self.all_states_iter() { + if state == self.start_state || state == self.accept_state { + continue; + } + + let preds = self.transitions_to_vec(state); + let succs = self.transitions_from_vec(state); + + let in_deg = preds.len() as u128; + let out_deg = succs.len() as u128; + + if in_deg == 0 || out_deg == 0 { + let score = state as u128 & 0xFF; + if score < best_score { + best_score = score; + best_state = Some(state); + } + continue; + } + + let mut score: u128 = in_deg * out_deg; + + if self.has_self_loop(state) { + score = score + (score >> 1); + } + + let mut label_cost: u128 = 0; + + for (_, regex) in &preds { + label_cost += regex.evaluate_complexity() as u128; + } + for (regex, _) in &succs { + label_cost += regex.evaluate_complexity() as u128; + } + if let Some(re) = self.get_transition(state, state) { + label_cost += (re.evaluate_complexity() as u128) * 2; + } + + score = score.saturating_mul(1).saturating_add(label_cost); + + let tie = state as u128 & 0xFFFF; + let score = score.saturating_add(tie); + + if score < best_score { + best_score = score; + best_state = Some(state); + } + } + + best_state + } + + fn eliminate_state(&mut self, k: usize) { + if self.removed_states.contains(&k) { + return; + } + + let in_states = self + .transitions_in + .get(&k) + .unwrap() + .iter() + .cloned() + .filter(|&s| s != k) + .collect::>(); + let out_states = self.transitions[k] + .keys() + .cloned() + .filter(|&s| s != k) + .collect::>(); + + for p in in_states { + for &q in &out_states { + self.bridge(p, k, q); + } + } + + self.remove_state(k); + } + + fn bridge(&mut self, p: usize, k: usize, q: usize) { + let rpk = self.get_transition(p, k); + let rkk = self.get_transition(k, k); + let rkq = self.get_transition(k, q); + + if let (Some(rpk), Some(rkq)) = (rpk, rkq) { + let mut regex = rpk.clone(); + if let Some(rkk) = rkk { + //regex = RegularExpression::Concat(VecDeque::from_iter(vec![regex, RegularExpression::Repetition(Box::new(rkk.clone()), 0, None)])); + regex = regex.concat(&rkk.repeat(0, None), true); + } + //regex = RegularExpression::Concat(VecDeque::from_iter(vec![regex, rkq.clone()])); + regex = regex.concat(rkq, true); + self.add_transition(p, q, regex); + } + } +} diff --git a/src/fast_automaton/convert/to_regex/state_elimination/mod.rs b/src/fast_automaton/convert/to_regex/state_elimination/mod.rs new file mode 100644 index 0000000..023d6b1 --- /dev/null +++ b/src/fast_automaton/convert/to_regex/state_elimination/mod.rs @@ -0,0 +1,121 @@ +use super::*; + +mod builder; +mod eliminate; + +struct Gnfa { + start_state: usize, + accept_state: usize, + transitions: Vec>, + transitions_in: IntMap>, + removed_states: IntSet, + empty: bool, +} + +impl Display for Gnfa { + fn fmt(&self, sb: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(sb, "digraph GNFA {{")?; + writeln!(sb, "\trankdir = LR;")?; + for from_state in self.all_states_iter() { + write!(sb, "\t{from_state}")?; + if self.accept_state == from_state { + writeln!(sb, "\t[shape=doublecircle,label=\"{from_state}\"];")?; + } else { + writeln!(sb, "\t[shape=circle,label=\"{from_state}\"];")?; + } + + if self.start_state == from_state { + writeln!(sb, 
"\tinitial [shape=plaintext,label=\"\"];")?; + writeln!(sb, "\tinitial -> {from_state}")?; + } + for (regex, to_state) in self.transitions_from_vec(from_state) { + writeln!(sb, "\t{from_state} -> {to_state} [label=\"{regex}\"]")?; + } + } + write!(sb, "}}") + } +} + +impl Gnfa { + fn get_transition(&self, from_state: State, to_state: State) -> Option<&RegularExpression> { + self.transitions.get(from_state)?.get(&to_state) + } + + #[inline] + fn all_states_iter(&self) -> impl Iterator + '_ { + (0..self.transitions.len()).filter(|s| !self.removed_states.contains(s)) + } + + fn transitions_to_vec(&self, state: State) -> Vec<(State, RegularExpression)> { + let mut in_transitions = vec![]; + for from_state in self.transitions_in.get(&state).unwrap_or(&IntSet::new()) { + for (condition, to_state) in self.transitions_from_vec(*from_state) { + if to_state == state { + in_transitions.push((*from_state, condition)); + break; + } + } + } + in_transitions + } + + #[inline] + fn transitions_from_vec(&self, state: State) -> Vec<(RegularExpression, State)> { + self.transitions[state] + .iter() + .map(|(s, c)| (c.clone(), *s)) + .filter(|s| !self.removed_states.contains(&s.1)) + .collect() + } + + #[inline] + fn has_self_loop(&self, state: State) -> bool { + self.get_transition(state, state).is_some() + } +} + +pub(super) fn convert_to_regex(automaton: &FastAutomaton) -> RegularExpression { + let mut gnfa = Gnfa::from_automaton(automaton); + gnfa.convert() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_state_elimination() -> Result<(), String> { + test_correct("abc"); + test_correct(".*de"); + test_correct(".*def"); + test_correct("(a*ba*)*"); + test_correct(".*u(ab|d)"); + test_correct(".*u(ab|de)"); + Ok(()) + } + + fn test_correct(pattern: &str) { + println!("Pattern: {pattern}"); + + let automaton = RegularExpression::new(pattern) + .unwrap() + .to_automaton() + .unwrap(); + + let regex = Gnfa::from_automaton(&automaton).convert(); + println!("-> {regex}"); + + let new_automaton = regex.to_automaton().unwrap(); + + assert!(automaton.equivalent(&new_automaton).unwrap()); + + let automaton = automaton.determinize().unwrap().into_owned(); + + let regex = Gnfa::from_automaton(&automaton).convert(); + println!("-> {regex}"); + + let new_automaton = regex.to_automaton().unwrap(); + + assert!(automaton.equivalent(&new_automaton).unwrap()); + } +} diff --git a/src/fast_automaton/convert/to_regex/transform.rs b/src/fast_automaton/convert/to_regex/transform.rs deleted file mode 100644 index 4498578..0000000 --- a/src/fast_automaton/convert/to_regex/transform.rs +++ /dev/null @@ -1,208 +0,0 @@ -use std::hash::BuildHasherDefault; - -use crate::execution_profile::ExecutionProfile; - -use super::*; - -impl StateEliminationAutomaton { - pub fn convert_to_regex( - &self, - execution_profile: &ExecutionProfile, - ) -> Result, EngineError> { - if self.cyclic { - return self.convert_graph_to_regex(execution_profile); - } - execution_profile.assert_not_timed_out()?; - - let mut regex_map: IntMap = IntMap::with_capacity_and_hasher( - self.get_number_of_states(), - BuildHasherDefault::default(), - ); - regex_map.insert(self.start_state, RegularExpression::new_empty_string()); - for from_state in self.states_topo_vec() { - let current_regex = if let Some(current_regex) = regex_map.get(&from_state) { - current_regex.clone() - } else { - RegularExpression::new_empty_string() - }; - if let Some(transitions) = self.transitions.get(from_state) { - for (to_state, transition) in transitions { - let 
transition_regex = match transition { - GraphTransition::Graph(graph) => { - if let Some(regex) = graph.convert_graph_to_regex(execution_profile)? { - regex - } else { - return Ok(None); - } - } - GraphTransition::Weight(range) => { - RegularExpression::Character(range.clone()) - } - GraphTransition::Epsilon => RegularExpression::new_empty_string(), - }; - let new_regex = current_regex.concat(&transition_regex, true); - match regex_map.entry(*to_state) { - Entry::Occupied(mut o) => { - o.insert(new_regex.union(o.get()).simplify()); - } - Entry::Vacant(v) => { - v.insert(new_regex); - } - }; - } - } - } - - Ok(regex_map.get(&self.accept_state).cloned()) - } - - fn convert_graph_to_regex( - &self, - execution_profile: &ExecutionProfile, - ) -> Result, EngineError> { - execution_profile.assert_not_timed_out()?; - if let Some(regex) = self.convert_shape_dot_star(execution_profile)? { - return Ok(Some(regex)); - } else if let Some(regex) = self.convert_shape_self_loop(execution_profile)? { - return Ok(Some(regex)); - } - Ok(None) - } - - /// We try to idenfify the regex following the shape: - /// A*B - fn convert_shape_dot_star( - &self, - execution_profile: &ExecutionProfile, - ) -> Result, EngineError> { - if self.get_number_of_states() < 2 { - return Ok(None); - } - //self.to_dot(); - let mut dot_value = - if let Some(dot_value) = self.get_transition(self.start_state, self.start_state) { - if let Some(dot_value) = dot_value.get_weight() { - dot_value.clone() - } else { - return Ok(None); - } - } else { - return Ok(None); - }; - - for state in self.states_iter() { - if state == self.start_state { - continue; - } - let weight = if let Some(weight) = self.get_transition(state, self.start_state) { - if let Some(weight) = weight.get_weight() { - weight - } else { - return Ok(None); - } - } else if state == self.accept_state { - continue; - } else { - return Ok(None); - }; - - if !dot_value.contains_all(weight) { - return Ok(None); - } - } - - let mut graph = self.clone(); - - for (from_state, transition) in graph.in_transitions_vec(graph.start_state) { - let weight = if let Some(weight) = transition.get_weight() { - weight - } else { - return Ok(None); - }; - dot_value = dot_value.union(weight); - graph.remove_transition(from_state, graph.start_state); - } - - let mut worklist = VecDeque::new(); - let mut seen = IntSet::with_capacity(graph.get_number_of_states()); - - worklist.push_back(graph.start_state); - seen.insert(self.start_state); - - while let Some(from_state) = worklist.pop_front() { - for to_state in graph.transitions_from_state_vec(&from_state) { - let transition = - if let Some(transition) = graph.get_transition(from_state, to_state) { - transition - } else { - return Ok(None); - }; - let weight = if let Some(weight) = transition.get_weight() { - weight - } else { - continue; - }; - dot_value = dot_value.union(weight); - if seen.contains(&to_state) { - if graph.accept_state != to_state || to_state == from_state { - graph.remove_transition(from_state, to_state); - } - } else { - seen.insert(to_state); - worklist.push_back(to_state); - } - } - } - - graph.add_transition_to( - self.start_state, - self.start_state, - GraphTransition::Weight(dot_value), - ); - - graph.identify_and_apply_components()?; - graph.convert_to_regex(execution_profile) - } - - /// We try to identify the regex following the shape: - /// A*B - fn convert_shape_self_loop( - &self, - execution_profile: &ExecutionProfile, - ) -> Result, EngineError> { - let mut graph = self.clone(); - - graph.accept_state = 
graph.new_state(); - - for (from_state, transition) in graph.in_transitions_vec(self.start_state) { - graph.remove_transition(from_state, self.start_state); - - graph.add_transition_to(from_state, graph.accept_state, transition); - } - - graph.identify_and_apply_components()?; - - let a_part = if let Some(a_part) = graph.convert_to_regex(execution_profile)? { - a_part - } else { - return Ok(None); - }; - - let mut graph = self.clone(); - - for (from_state, _) in graph.in_transitions_vec(self.start_state) { - graph.remove_transition(from_state, self.start_state); - } - - graph.identify_and_apply_components()?; - let b_part = if let Some(b_part) = graph.convert_to_regex(execution_profile)? { - b_part - } else { - return Ok(None); - }; - - let regex = a_part.repeat(0, None).concat(&b_part, true); - - Ok(Some(regex)) - } -} diff --git a/src/fast_automaton/convert/to_regex/transform/mod.rs b/src/fast_automaton/convert/to_regex/transform/mod.rs new file mode 100644 index 0000000..552222e --- /dev/null +++ b/src/fast_automaton/convert/to_regex/transform/mod.rs @@ -0,0 +1,16 @@ +use crate::fast_automaton::{ + FastAutomaton, convert::to_regex::transform::shape::dotstar::dot_star, +}; + +mod shape; + +const TRANSFORM_FUNCTION: &[fn(&FastAutomaton) -> FastAutomaton] = &[dot_star]; + +pub fn transform(automaton: &FastAutomaton) -> FastAutomaton { + let mut automaton = automaton.clone(); + for transform in TRANSFORM_FUNCTION { + automaton = transform(&automaton); + } + + automaton +} diff --git a/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs b/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs new file mode 100644 index 0000000..bf5e682 --- /dev/null +++ b/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs @@ -0,0 +1,172 @@ +use nohash_hasher::IntSet; + +use crate::fast_automaton::{FastAutomaton, State, condition::Condition}; + +pub(crate) fn dot_star(automaton: &FastAutomaton) -> FastAutomaton { + let components = identify_and_apply_components(automaton); + + let mut automaton = automaton.clone(); + for component in components { + dot_star_component(&mut automaton, &component); + } + + automaton +} + +fn dot_star_component(automaton: &mut FastAutomaton, component: &IntSet) { + let mut start_state = if component.contains(&automaton.start_state) { + Some(automaton.start_state) + } else { + None + }; + for &state in component { + for (from_state, _) in automaton.transitions_to_vec(state) { + if !component.contains(&from_state) { + if start_state.is_none() { + start_state = Some(state); + } else { + // Only one start state possible + return; + } + } + } + } + + if start_state.is_none() { + // Only one start state possible + return; + } + let start_state = start_state.unwrap(); + + let mut first_hop = automaton + .direct_states(&start_state) + .filter(|&s| s != start_state) + .collect::>(); + let mut states_to_remove = vec![]; + + for state in &first_hop { + let transitions = automaton.transitions_to_vec(*state); + if !transitions.iter().all(|(_, c)| *c == transitions[0].1) { + // Some condition(s) to a given first hop state are not the same. 
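+                // so this component cannot be rewritten with a single self-loop; leave it untouched.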
+ return; + } + + if transitions.len() != component.len() { + states_to_remove.push(*state); + } + } + + states_to_remove.iter().for_each(|s| { + first_hop.remove(s); + }); + + let mut out_condition = None; + for &state in component { + let mut has_transition_to_start_state = false; + + let mut this_condition = Condition::empty(automaton.get_spanning_set()); + for (condition, &to_state) in automaton.transitions_from(state) { + if to_state == start_state { + has_transition_to_start_state = true; + } + + this_condition = this_condition.union(&condition); + } + if !has_transition_to_start_state { + // Some state(s) do not have transition to the start state. + return; + } + + if let Some(condition) = &out_condition { + if &this_condition != condition { + // The union of outcoming condition for some states are not identical + return; + } + } else { + out_condition = Some(this_condition); + } + } + + automaton.add_transition(start_state, start_state, &out_condition.unwrap()); + for &state in component { + for to_state in automaton.direct_states_vec(&state) { + if !component.contains(&to_state) { + continue; + } + + if state != start_state && (to_state == start_state || first_hop.contains(&to_state)) { + automaton.remove_transition(state, to_state); + } + } + } + for state in states_to_remove { + automaton.remove_state(state); + } +} + +pub fn identify_and_apply_components(automaton: &FastAutomaton) -> Vec> { + let mut index = 0; + let mut stack = Vec::new(); + let mut indices = vec![-1; automaton.transitions.len()]; + let mut lowlink = vec![-1; automaton.transitions.len()]; + let mut on_stack = vec![false; automaton.transitions.len()]; + let mut scc = Vec::new(); + + for state in automaton.states() { + if indices[state] == -1 { + strongconnect( + automaton, + state, + &mut index, + &mut stack, + &mut indices, + &mut lowlink, + &mut on_stack, + &mut scc, + ); + } + } + + scc.into_iter() + .filter(|states| states.len() != 1) + .collect::>() +} + +#[allow(clippy::too_many_arguments)] +fn strongconnect( + automaton: &FastAutomaton, + v: usize, + index: &mut usize, + stack: &mut Vec, + indices: &mut Vec, + lowlink: &mut Vec, + on_stack: &mut Vec, + scc: &mut Vec>, +) { + indices[v] = *index as i32; + lowlink[v] = *index as i32; + *index += 1; + stack.push(v); + on_stack[v] = true; + + for w in automaton.direct_states(&v) { + if indices[w] == -1 { + strongconnect(automaton, w, index, stack, indices, lowlink, on_stack, scc); + lowlink[v] = lowlink[v].min(lowlink[w]); + } else if on_stack[w] { + lowlink[v] = lowlink[v].min(indices[w]); + } + } + + if lowlink[v] == indices[v] { + let mut component = IntSet::default(); + while let Some(w) = stack.pop() { + on_stack[w] = false; + component.insert(w); + if w == v { + break; + } + } + scc.push(component); + } +} diff --git a/src/fast_automaton/convert/to_regex/transform/shape/mod.rs b/src/fast_automaton/convert/to_regex/transform/shape/mod.rs new file mode 100644 index 0000000..5c83bf6 --- /dev/null +++ b/src/fast_automaton/convert/to_regex/transform/shape/mod.rs @@ -0,0 +1 @@ +pub(super) mod dotstar; \ No newline at end of file diff --git a/src/fast_automaton/generate.rs b/src/fast_automaton/generate.rs index 6cb0628..7532309 100644 --- a/src/fast_automaton/generate.rs +++ b/src/fast_automaton/generate.rs @@ -58,7 +58,7 @@ impl FastAutomaton { break; } } - for (cond, to_state) in self.transitions_from_iter(state) { + for (cond, to_state) in self.transitions_from(state) { execution_profile.assert_not_timed_out()?; let range = match ranges_cache.entry(cond) { 
Entry::Occupied(o) => o.get().clone(), @@ -114,7 +114,7 @@ mod tests { fn assert_generate_strings(regex: &str, number: usize) { println!(":{}", regex); - let automaton = RegularExpression::new(regex) + let automaton = RegularExpression::parse(regex, false) .unwrap() .to_automaton() .unwrap(); diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index cfa4f68..7bf0313 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -47,7 +47,7 @@ impl Display for FastAutomaton { fn fmt(&self, sb: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { writeln!(sb, "digraph Automaton {{")?; writeln!(sb, "\trankdir = LR;")?; - for from_state in self.all_states_iter() { + for from_state in self.states() { write!(sb, "\t{from_state}")?; if self.accept_states.contains(&from_state) { writeln!(sb, "\t[shape=doublecircle,label=\"{from_state}\"];")?; @@ -59,7 +59,7 @@ impl Display for FastAutomaton { writeln!(sb, "\tinitial [shape=plaintext,label=\"\"];")?; writeln!(sb, "\tinitial -> {from_state}")?; } - for (cond, to_state) in self.transitions_from_iter(from_state) { + for (cond, to_state) in self.transitions_from(from_state) { writeln!( sb, "\t{from_state} -> {to_state} [label=\"{}\"]", @@ -85,7 +85,7 @@ impl FastAutomaton { /// Returns the number of transitions to the provided state. #[inline] - pub fn state_in_degree(&self, state: State) -> usize { + pub fn in_degree(&self, state: State) -> usize { self.transitions_in .get(&state) .unwrap_or(&IntSet::new()) @@ -94,25 +94,25 @@ impl FastAutomaton { /// Returns the number of transitions from the provided state. #[inline] - pub fn state_out_degree(&self, state: State) -> usize { + pub fn out_degree(&self, state: State) -> usize { self.transitions[state].len() } /// Returns an iterator over the automaton’s states. #[inline] - pub fn all_states_iter(&self) -> impl Iterator + '_ { + pub fn states(&self) -> impl Iterator + '_ { (0..self.transitions.len()).filter(|s| !self.removed_states.contains(s)) } /// Returns a vector containing the automaton’s states. #[inline] - pub fn all_states_vec(&self) -> Vec { - self.all_states_iter().collect() + pub fn states_vec(&self) -> Vec { + self.states().collect() } /// Returns an iterator over states directly reachable from the given state in one transition. #[inline] - pub fn direct_states_iter(&self, state: &State) -> impl Iterator + '_ { + pub fn direct_states(&self, state: &State) -> impl Iterator + '_ { self.transitions[*state] .keys() .cloned() @@ -122,7 +122,7 @@ impl FastAutomaton { /// Returns a vector of states directly reachable from the given state in one transition. #[inline] pub fn direct_states_vec(&self, state: &State) -> Vec { - self.direct_states_iter(state).collect() + self.direct_states(state).collect() } /// Returns a vector containing the transitions to the provided state. @@ -151,7 +151,7 @@ impl FastAutomaton { /// Returns an iterator over transitions from the given state. #[inline] - pub fn transitions_from_iter( + pub fn transitions_from( &self, state: State, ) -> impl Iterator { @@ -161,31 +161,6 @@ impl FastAutomaton { .filter(|s| !self.removed_states.contains(s.1)) } - /// Returns a mutable iterator over transitions from the given state. - #[inline] - pub fn transitions_from_iter_mut( - &mut self, - state: &State, - ) -> impl Iterator { - self.transitions[*state] - .iter_mut() - .map(|(s, c)| (c, s)) - .filter(|s| !self.removed_states.contains(s.1)) - } - - /// Returns an owned iterator over transitions from the given state. 
- #[inline] - pub fn transitions_from_into_iter( - &self, - state: &State, - ) -> impl Iterator + '_ { - self.transitions[*state] - .clone() - .into_iter() - .map(|(s, c)| (c, s)) - .filter(|(_, state)| !self.removed_states.contains(state)) - } - /// Returns `true` if there is a directed transition from `from_state` to `to_state`. #[inline] pub fn does_transition_exists(&self, from_state: State, to_state: State) -> bool { @@ -221,16 +196,6 @@ impl FastAutomaton { self.transitions[from_state].get(&to_state) } - // Returns a mutable reference to the condition of the directed transition between the two states, if any. - #[inline] - pub fn get_condition_mut( - &mut self, - from_state: State, - to_state: State, - ) -> Option<&mut Condition> { - self.transitions[from_state].get_mut(&to_state) - } - /// Returns the start state. #[inline] pub fn get_start_state(&self) -> State { @@ -285,7 +250,7 @@ impl FastAutomaton { continue; } let curr_char = input.chars().nth(position).unwrap() as u32; - for (cond, to_state) in self.transitions_from_iter(*current_state) { + for (cond, to_state) in self.transitions_from(*current_state) { if cond.has_character(&curr_char, &self.spanning_set).unwrap() { if position + 1 == input.len() { if self.accept_states.contains(to_state) { @@ -301,7 +266,12 @@ impl FastAutomaton { } #[inline] - pub fn to_dot(&self) { + pub fn as_dot(&self) -> String { + format!("{self}") + } + + #[inline] + pub fn print_dot(&self) { println!("{self}"); } } diff --git a/src/fast_automaton/operation/concat.rs b/src/fast_automaton/operation/concat.rs index 45654d3..6318c61 100644 --- a/src/fast_automaton/operation/concat.rs +++ b/src/fast_automaton/operation/concat.rs @@ -41,12 +41,12 @@ impl FastAutomaton { BuildHasherDefault::default(), ); - let start_state_and_accept_states_not_mergeable = other.state_in_degree(other.start_state) > 0 + let start_state_and_accept_states_not_mergeable = other.in_degree(other.start_state) > 0 && self .accept_states .iter() .cloned() - .any(|s| self.state_out_degree(s) > 0); + .any(|s| self.out_degree(s) > 0); let accept_states = self.accept_states.iter().cloned().collect::>(); @@ -67,7 +67,7 @@ impl FastAutomaton { } } - for from_state in other.all_states_iter() { + for from_state in other.states() { let new_from_states = match new_states.entry(from_state) { Entry::Occupied(o) => { vec![*o.get()] @@ -86,7 +86,7 @@ impl FastAutomaton { } }; - for (condition, to_state) in other.transitions_from_iter(from_state) { + for (condition, to_state) in other.transitions_from(from_state) { let new_to_states = match new_states.entry(*to_state) { Entry::Occupied(o) => { vec![*o.get()] @@ -135,12 +135,12 @@ mod tests { #[test] fn test_simple_concatenation_regex() -> Result<(), String> { - let automaton = RegularExpression::new("abc") + let automaton = RegularExpression::parse("abc", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(automaton.match_string("abc")); assert!(!automaton.match_string("abcd")); assert!(!automaton.match_string("ab")); @@ -150,7 +150,7 @@ mod tests { #[test] fn test_simple_concat_alternation_regex() -> Result<(), String> { - let automaton = RegularExpression::new("0101(abc|ac|aaa)") + let automaton = RegularExpression::parse("0101(abc|ac|aaa)", false) .unwrap() .to_automaton() .unwrap(); @@ -170,7 +170,7 @@ mod tests { #[test] fn test_simple_concat_repeat_regex() -> Result<(), String> { - let automaton = RegularExpression::new("A+B*") + let automaton = RegularExpression::parse("A+B*", false) .unwrap() 
.to_automaton() .unwrap(); @@ -185,7 +185,7 @@ mod tests { #[test] fn test_simple_repeat_regex_01() -> Result<(), String> { - let automaton = RegularExpression::new("a+") + let automaton = RegularExpression::parse("a+", false) .unwrap() .to_automaton() .unwrap(); @@ -200,7 +200,7 @@ mod tests { #[test] fn test_simple_repeat_regex_02() -> Result<(), String> { - let automaton = RegularExpression::new("a*c") + let automaton = RegularExpression::parse("a*c", false) .unwrap() .to_automaton() .unwrap(); @@ -214,11 +214,11 @@ mod tests { #[test] fn test_simple_repeat_regex_03() -> Result<(), String> { - let automaton = RegularExpression::new("(ab){3,4}") + let automaton = RegularExpression::parse("(ab){3,4}", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(automaton.match_string("ababab")); assert!(automaton.match_string("abababab")); assert!(!automaton.match_string("ab")); @@ -229,11 +229,11 @@ mod tests { #[test] fn test_simple_repeat_regex_04() -> Result<(), String> { - let automaton = RegularExpression::new("a{3,}") + let automaton = RegularExpression::parse("a{3,}", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(automaton.match_string("aaa")); assert!(automaton.match_string("aaaaa")); assert!(!automaton.match_string("a")); @@ -243,11 +243,11 @@ mod tests { #[test] fn test_simple_repeat_regex_05() -> Result<(), String> { - let automaton = RegularExpression::new("a?") + let automaton = RegularExpression::parse("a?", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(automaton.match_string("")); assert!(automaton.match_string("a")); assert!(!automaton.match_string("aa")); @@ -257,11 +257,11 @@ mod tests { #[test] fn test_simple_repeat_regex_06() -> Result<(), String> { - let automaton = RegularExpression::new("a{0,2}") + let automaton = RegularExpression::parse("a{0,2}", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(automaton.match_string("")); assert!(automaton.match_string("a")); assert!(automaton.match_string("aa")); @@ -272,11 +272,11 @@ mod tests { #[test] fn test_simple_repeat_regex_07() -> Result<(), String> { - let automaton = RegularExpression::new("a{1,3}") + let automaton = RegularExpression::parse("a{1,3}", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(!automaton.match_string("")); assert!(automaton.match_string("a")); assert!(automaton.match_string("aa")); @@ -287,11 +287,11 @@ mod tests { #[test] fn test_simple_repeat_regex_08() -> Result<(), String> { - let automaton = RegularExpression::new("a+(ba+)*") + let automaton = RegularExpression::parse("a+(ba+)*", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(!automaton.match_string("")); assert!(!automaton.match_string("aab")); assert!(automaton.match_string("a")); @@ -306,11 +306,11 @@ mod tests { #[test] fn test_simple_repeat_regex_09() -> Result<(), String> { - let automaton = RegularExpression::new("(ac|ads|a)*") + let automaton = RegularExpression::parse("(ac|ads|a)*", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(automaton.match_string("")); assert!(automaton.match_string("ac")); assert!(automaton.match_string("ads")); @@ -328,11 +328,11 @@ mod tests { #[test] fn test_simple_repeat_regex_10() -> Result<(), String> { - let automaton = RegularExpression::new("(ef|ads|a)+") + 
let automaton = RegularExpression::parse("(ef|ads|a)+", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(!automaton.match_string("")); assert!(automaton.match_string("ef")); assert!(automaton.match_string("ads")); @@ -350,11 +350,11 @@ mod tests { #[test] fn test_simple_repeat_regex_11() -> Result<(), String> { - let automaton = RegularExpression::new("(a|bc)*") + let automaton = RegularExpression::parse("(a|bc)*", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(automaton.match_string("")); assert!(automaton.match_string("a")); assert!(automaton.match_string("bc")); @@ -367,11 +367,11 @@ mod tests { #[test] fn test_simple_repeat_regex_12() -> Result<(), String> { - let automaton = RegularExpression::new("([ab]*a)?") + let automaton = RegularExpression::parse("([ab]*a)?", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(automaton.match_string("")); assert!(automaton.match_string("a")); assert!(automaton.match_string("aa")); @@ -385,11 +385,11 @@ mod tests { #[test] fn test_simple_repeat_regex_13() -> Result<(), String> { - let automaton = RegularExpression::new("([ab]*a)*") + let automaton = RegularExpression::parse("([ab]*a)*", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(automaton.match_string("")); assert!(automaton.match_string("a")); assert!(automaton.match_string("aa")); @@ -403,22 +403,22 @@ mod tests { #[test] fn test_simple_repeat_right_number_of_states_1() -> Result<(), String> { - let automaton = RegularExpression::new("a*") + let automaton = RegularExpression::parse("a*", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert_eq!(1, automaton.get_number_of_states()); Ok(()) } #[test] fn test_simple_concat_right_number_of_states_2() -> Result<(), String> { - let automaton = RegularExpression::new("(a*bc)") + let automaton = RegularExpression::parse("(a*bc)", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert_eq!(3, automaton.get_number_of_states()); Ok(()) } diff --git a/src/fast_automaton/operation/determinize.rs b/src/fast_automaton/operation/determinize.rs index 734f622..73f8464 100644 --- a/src/fast_automaton/operation/determinize.rs +++ b/src/fast_automaton/operation/determinize.rs @@ -40,7 +40,7 @@ impl FastAutomaton { for base in &ranges { for from_state in &states { - for (cond, to_state) in self.transitions_from_iter(*from_state) { + for (cond, to_state) in self.transitions_from(*from_state) { if cond.has_intersection(base) { match new_states_to_add.binary_search(to_state) { Ok(_) => {} // element already in vector @ `pos` @@ -84,7 +84,7 @@ mod tests { #[test] fn test_determinize_1() -> Result<(), String> { - let automaton = RegularExpression::new(".*ab") + let automaton = RegularExpression::parse(".*ab", false) .unwrap() .to_automaton() .unwrap(); @@ -113,7 +113,7 @@ mod tests { fn assert_determinization(regex: &str) { println!(":{}", regex); - let automaton = RegularExpression::new(regex) + let automaton = RegularExpression::parse(regex, false) .unwrap() .to_automaton() .unwrap(); diff --git a/src/fast_automaton/operation/difference.rs b/src/fast_automaton/operation/difference.rs index acb63e9..4e2bd22 100644 --- a/src/fast_automaton/operation/difference.rs +++ b/src/fast_automaton/operation/difference.rs @@ -16,9 +16,9 @@ impl FastAutomaton { ); let mut ranges = 
Vec::with_capacity(self.get_number_of_states()); - for from_state in self.all_states_iter() { + for from_state in self.states() { let mut new_condition = Condition::empty(&self.spanning_set); - for (condition, _) in self.transitions_from_iter(from_state) { + for (condition, _) in self.transitions_from(from_state) { new_condition = new_condition.union(condition); ranges.push(condition.to_range(self.get_spanning_set())?); } @@ -36,7 +36,7 @@ impl FastAutomaton { let new_spanning_set = SpanningSet::compute_spanning_set(&ranges); self.apply_new_spanning_set(&new_spanning_set)?; - if self.state_in_degree(crash_state) == 1 { + if self.in_degree(crash_state) == 1 { self.remove_state(crash_state); } Ok(()) @@ -47,7 +47,7 @@ impl FastAutomaton { self.totalize()?; let mut new_accept_states = IntSet::default(); - for state in self.all_states_iter() { + for state in self.states() { if self.accept_states.contains(&state) { continue; } diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 4f42859..694d66c 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -4,10 +4,7 @@ use rayon::prelude::*; use condition::converter::ConditionConverter; -use crate::{ - error::EngineError, - execution_profile::{ExecutionProfile}, -}; +use crate::{error::EngineError, execution_profile::ExecutionProfile}; use super::*; @@ -18,8 +15,9 @@ impl FastAutomaton { } /// Computes the intersection of all automatons in the given iterator. - pub fn intersection_all<'a, I: IntoIterator>(automatons: I) -> Result - { + pub fn intersection_all<'a, I: IntoIterator>( + automatons: I, + ) -> Result { let mut result: Cow<'a, FastAutomaton> = Cow::Owned(FastAutomaton::new_total()); for automaton in automatons { @@ -34,25 +32,27 @@ impl FastAutomaton { } /// Computes in parallel the intersection of all automatons in the given iterator. 
- pub fn intersection_all_par<'a, I: IntoParallelIterator>(automatons: I) -> Result - { + pub fn intersection_all_par<'a, I: IntoParallelIterator>( + automatons: I, + ) -> Result { let execution_profile = ExecutionProfile::get(); let total = FastAutomaton::new_total(); - automatons.into_par_iter() - .try_fold( - || total.clone(), - |acc, next| { - execution_profile.apply(|| Ok(acc.intersection_internal(next)?.into_owned())) - }, - ) - .try_reduce( - || total.clone(), - |acc, next| { - execution_profile.apply(|| Ok(acc.intersection_internal(&next)?.into_owned())) - }, - ) + automatons + .into_par_iter() + .try_fold( + || total.clone(), + |acc, next| { + execution_profile.apply(|| Ok(acc.intersection_internal(next)?.into_owned())) + }, + ) + .try_reduce( + || total.clone(), + |acc, next| { + execution_profile.apply(|| Ok(acc.intersection_internal(&next)?.into_owned())) + }, + ) } fn intersection_internal<'a>( @@ -197,7 +197,7 @@ impl FastAutomaton { condition_converter: &ConditionConverter, ) -> Result, EngineError> { let transitions_1: Result, EngineError> = self - .transitions_from_iter(state) + .transitions_from(state) .map(|(c, &s)| match condition_converter.convert(c) { Ok(condition) => Ok((condition, s)), Err(err) => Err(err), @@ -210,15 +210,15 @@ impl FastAutomaton { #[cfg(test)] mod tests { - use crate::{fast_automaton::FastAutomaton, regex::RegularExpression}; + use crate::regex::RegularExpression; #[test] fn test_simple_intersection_regex_1() -> Result<(), String> { - let automaton1 = RegularExpression::new("(abc|ac|aaa)") + let automaton1 = RegularExpression::parse("(abc|ac|aaa)", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("(abcd|ac|aba)") + let automaton2 = RegularExpression::parse("(abcd|ac|aba)", false) .unwrap() .to_automaton() .unwrap(); @@ -234,11 +234,11 @@ mod tests { #[test] fn test_simple_intersection_regex_2() -> Result<(), String> { - let automaton1 = RegularExpression::new("a*") + let automaton1 = RegularExpression::parse("a*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("b*") + let automaton2 = RegularExpression::parse("b*", false) .unwrap() .to_automaton() .unwrap(); @@ -252,11 +252,11 @@ mod tests { #[test] fn test_simple_intersection_regex_3() -> Result<(), String> { - let automaton1 = RegularExpression::new("x*") + let automaton1 = RegularExpression::parse("x*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("(xxx)*") + let automaton2 = RegularExpression::parse("(xxx)*", false) .unwrap() .to_automaton() .unwrap(); @@ -272,11 +272,11 @@ mod tests { #[test] fn test_complex_intersection_regex_1() -> Result<(), String> { - let automaton1 = RegularExpression::new(".*(abc|ac|aaa)") + let automaton1 = RegularExpression::parse(".*(abc|ac|aaa)", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("(abcd|ac|aba)") + let automaton2 = RegularExpression::parse("(abcd|ac|aba)", false) .unwrap() .to_automaton() .unwrap(); @@ -293,16 +293,16 @@ mod tests { #[test] fn test_complex_intersection_regex_2() -> Result<(), String> { - let automaton1 = 
RegularExpression::new("(?:[a-z0-9]+(?:\\.[a-z0-9]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])") + let automaton1 = RegularExpression::parse("(?:[a-z0-9]+(?:\\.[a-z0-9]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])", false) .unwrap() .to_automaton().unwrap(); - let automaton2 = RegularExpression::new("avb@.*") + let automaton2 = RegularExpression::parse("avb@.*", false) .unwrap() .to_automaton() .unwrap(); - automaton1.to_dot(); - automaton2.to_dot(); + automaton1.print_dot(); + automaton2.print_dot(); let intersection = automaton1.intersection(&automaton2).unwrap(); assert!(!intersection.is_empty()); @@ -310,33 +310,4 @@ mod tests { assert!(intersection.match_string("avb@gmail.com")); Ok(()) } - - #[test] - fn test_intersection_par() -> Result<(), String> { - let c = 14; - let mut automaton_list = Vec::with_capacity(c); - - for i in 0..c { - automaton_list.push( - RegularExpression::new(&format!(".*{i}.*")) - .unwrap() - .to_automaton() - .unwrap(), - ) - } - - // FastAutomaton::intersection_all(automaton_list.iter().collect::>()); - - // 3.76 - // 4.47 - // 3.84 - - let _ = FastAutomaton::intersection_all_par(automaton_list.iter().collect::>()); - - // 0.59 - // 0.55 - // 0.53 - - Ok(()) - } } diff --git a/src/fast_automaton/operation/mod.rs b/src/fast_automaton/operation/mod.rs index 54bcba3..b21d25f 100644 --- a/src/fast_automaton/operation/mod.rs +++ b/src/fast_automaton/operation/mod.rs @@ -17,7 +17,7 @@ impl FastAutomaton { let reacheable_states = self.get_reacheable_states(); let mut dead_states = IntSet::default(); - for from_state in self.all_states_iter() { + for from_state in self.states() { if !reacheable_states.contains(&from_state) { dead_states.insert(from_state); } @@ -35,11 +35,11 @@ mod tests { #[test] fn test_remove_dead_states() -> Result<(), String> { - let automaton1 = RegularExpression::new("(abc|ac|aaa)") + let automaton1 = RegularExpression::parse("(abc|ac|aaa)", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("(abcd|ac|aba)") + let automaton2 = RegularExpression::parse("(abcd|ac|aba)", false) .unwrap() .to_automaton() .unwrap(); diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs index b49207b..cdfd36b 100644 --- a/src/fast_automaton/operation/repeat.rs +++ b/src/fast_automaton/operation/repeat.rs @@ -21,7 +21,7 @@ impl FastAutomaton { let automaton_to_repeat = self.clone(); - if min == 0 && self.state_in_degree(self.start_state) != 0 { + if min == 0 && self.in_degree(self.start_state) != 0 { let new_state = self.new_state(); if self.is_accepted(&self.start_state) { self.accept(new_state); @@ -58,8 +58,8 @@ impl FastAutomaton { let accept_state = *automaton_to_repeat.accept_states.iter().next().unwrap(); if 
automaton_to_repeat.accept_states.len() == 1 - && automaton_to_repeat.state_out_degree(accept_state) == 0 - && automaton_to_repeat.state_in_degree(automaton_to_repeat.start_state) == 0 + && automaton_to_repeat.out_degree(accept_state) == 0 + && automaton_to_repeat.in_degree(automaton_to_repeat.start_state) == 0 { automaton_to_repeat .add_epsilon_transition(accept_state, automaton_to_repeat.start_state); @@ -112,7 +112,7 @@ mod tests { #[test] fn test_repeat_1() -> Result<(), String> { - let automaton = RegularExpression::new("(a*,a*)?") + let automaton = RegularExpression::parse("(a*,a*)?", false) .unwrap() .to_automaton() .unwrap(); diff --git a/src/fast_automaton/operation/union.rs b/src/fast_automaton/operation/union.rs index e71f346..8e80b39 100644 --- a/src/fast_automaton/operation/union.rs +++ b/src/fast_automaton/operation/union.rs @@ -14,8 +14,9 @@ impl FastAutomaton { } /// Computes the union of all automatons in the given iterator. - pub fn union_all<'a, I: IntoIterator>(automatons: I) -> Result - { + pub fn union_all<'a, I: IntoIterator>( + automatons: I, + ) -> Result { let mut new_automaton = FastAutomaton::new_empty(); for automaton in automatons { new_automaton.union_mut(automaton)?; @@ -24,30 +25,33 @@ impl FastAutomaton { } /// Computes in parallel the union of all automatons in the given iterator. - pub fn union_all_par<'a, I: IntoParallelIterator>(automatons: I) -> Result - { + pub fn union_all_par<'a, I: IntoParallelIterator>( + automatons: I, + ) -> Result { let execution_profile = ExecutionProfile::get(); let empty = FastAutomaton::new_empty(); - automatons.into_par_iter() - .try_fold( - || empty.clone(), - |mut acc, next| { - execution_profile.apply(|| { - acc.union_mut(next)?; - Ok(acc) - }) - }, - ).try_reduce( - || empty.clone(), - |mut acc, next| { - execution_profile.apply(|| { - acc.union_mut(&next)?; - Ok(acc) - }) - }, - ) + automatons + .into_par_iter() + .try_fold( + || empty.clone(), + |mut acc, next| { + execution_profile.apply(|| { + acc.union_mut(next)?; + Ok(acc) + }) + }, + ) + .try_reduce( + || empty.clone(), + |mut acc, next| { + execution_profile.apply(|| { + acc.union_mut(&next)?; + Ok(acc) + }) + }, + ) } fn prepare_start_states( @@ -56,9 +60,13 @@ impl FastAutomaton { new_states: &mut IntMap, condition_converter: &ConditionConverter, ) -> Result, EngineError> { - let mut imcomplete_states = IntSet::with_capacity(other.state_out_degree(other.start_state) + 1); - let self_start_state_in_degree = self.state_in_degree(self.start_state); - let other_start_state_in_degree = other.state_in_degree(other.start_state); + let mut imcomplete_states = + IntSet::with_capacity(other.out_degree(other.start_state) + 1); + if other.is_accepted(&other.start_state) { + self.accept(self.start_state); + } + let self_start_state_in_degree = self.in_degree(self.start_state); + let other_start_state_in_degree = other.in_degree(other.start_state); if self_start_state_in_degree == 0 && other_start_state_in_degree == 0 { // The start states can be the same state without any consequence new_states.insert(other.start_state, self.start_state); @@ -66,29 +74,22 @@ impl FastAutomaton { } else { if self_start_state_in_degree != 0 { let new_state = self.new_state(); - if self.is_accepted(&self.start_state) { - self.accept(new_state); - } - for (cond, to_state) in self.transitions_from_vec(self.start_state) - { - self.add_transition(new_state, to_state, &cond); - } + self.add_epsilon_transition(new_state, self.start_state); self.start_state = new_state; + 
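+                // The fresh start state has no incoming transitions, so the other automaton's start state can be merged into it safely.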
new_states.insert(other.start_state, self.start_state); + imcomplete_states.insert(self.start_state); } if other_start_state_in_degree != 0 { let new_state = self.new_state(); if other.is_accepted(&other.start_state) { self.accept(new_state); - self.accept(self.start_state); } new_states.insert(other.start_state, new_state); imcomplete_states.insert(new_state); - for (cond, other_to_state) in - other.transitions_from_vec(other.start_state) - { + for (cond, other_to_state) in other.transitions_from_vec(other.start_state) { let cond = condition_converter.convert(&cond)?; let to_state = match new_states.entry(other_to_state) { Entry::Occupied(o) => *o.get(), @@ -114,13 +115,13 @@ impl FastAutomaton { ) { let mut self_accept_states_without_outgoing_edges = vec![]; for &state in &self.accept_states { - if self.state_out_degree(state) == 0 && !imcomplete_states.contains(&state) { + if self.out_degree(state) == 0 && !imcomplete_states.contains(&state) { self_accept_states_without_outgoing_edges.push(state); } } let accept_state_without_outgoing_edges = match self_accept_states_without_outgoing_edges.len() { - 1 => self_accept_states_without_outgoing_edges[0], + 1 => Some(self_accept_states_without_outgoing_edges[0]), n if n > 1 => { let new_state = self.new_state(); self.accept(new_state); @@ -131,24 +132,23 @@ impl FastAutomaton { } self.remove_state(accept_state); } - new_state - } - _ => { - let new_state = self.new_state(); - self.accept(new_state); - new_state + Some(new_state) } + _ => None, }; for &state in &other.accept_states { - if other.state_out_degree(state) == 0 { - new_states - .entry(state) - .or_insert(accept_state_without_outgoing_edges); - } else if new_states.get(&state).is_none() { - let new_accept_state = self.new_state(); - self.accept(new_accept_state); - new_states.insert(state, new_accept_state); + match accept_state_without_outgoing_edges { + Some(accept_state) if other.out_degree(state) == 0 => { + new_states.entry(state).or_insert(accept_state); + } + _ => { + if new_states.get(&state).is_none() { + let new_accept_state = self.new_state(); + self.accept(new_accept_state); + new_states.insert(state, new_accept_state); + } + } } } } @@ -181,7 +181,7 @@ impl FastAutomaton { self.prepare_start_states(other, &mut new_states, &condition_converter)?; self.prepare_accept_states(other, &mut new_states, &imcomplete_states); - for from_state in other.all_states_iter() { + for from_state in other.states() { let new_from_state = match new_states.entry(from_state) { Entry::Occupied(o) => *o.get(), Entry::Vacant(v) => { @@ -190,7 +190,7 @@ impl FastAutomaton { new_state } }; - for (condition, to_state) in other.transitions_from_iter(from_state) { + for (condition, to_state) in other.transitions_from(from_state) { let new_condition = condition_converter.convert(condition)?; let new_to_state = match new_states.entry(*to_state) { Entry::Occupied(o) => *o.get(), @@ -214,7 +214,7 @@ mod tests { #[test] fn test_simple_alternation_regex_1() -> Result<(), String> { - let automaton = RegularExpression::new("(abc|ac|aaa)") + let automaton = RegularExpression::parse("(abc|ac|aaa)", false) .unwrap() .to_automaton() .unwrap(); @@ -233,11 +233,11 @@ mod tests { #[test] fn test_simple_alternation_regex_2() -> Result<(), String> { - let automaton = RegularExpression::new("(b?|b{2})") + let automaton = RegularExpression::parse("(b?|b{2})", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(automaton.match_string("")); 
assert!(automaton.match_string("b")); assert!(automaton.match_string("bb")); @@ -248,11 +248,27 @@ mod tests { #[test] fn test_simple_alternation_regex_3() -> Result<(), String> { - let automaton = RegularExpression::new("((a|bc)*|d)") + let automaton = RegularExpression::parse("((a|bc)*|d)", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.match_string("")); + assert!(automaton.match_string("a")); + assert!(automaton.match_string("abcaaabcbc")); + assert!(automaton.match_string("d")); + assert!(!automaton.match_string("ad")); + assert!(!automaton.match_string("abcd")); + Ok(()) + } + + #[test] + fn test_simple_alternation_regex_3b() -> Result<(), String> { + let automaton = RegularExpression::parse("(d|(a|bc)*)", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(automaton.match_string("")); assert!(automaton.match_string("a")); assert!(automaton.match_string("abcaaabcbc")); @@ -262,13 +278,30 @@ mod tests { Ok(()) } + #[test] + fn test_simple_alternation_regex_3t() -> Result<(), String> { + let automaton = RegularExpression::parse("(d*|(a|bc)*)", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.match_string("")); + assert!(automaton.match_string("a")); + assert!(automaton.match_string("abcaaabcbc")); + assert!(automaton.match_string("d")); + assert!(automaton.match_string("ddd")); + assert!(!automaton.match_string("ad")); + assert!(!automaton.match_string("abcd")); + Ok(()) + } + #[test] fn test_simple_alternation_regex_4() -> Result<(), String> { - let automaton = RegularExpression::new("(a+(ba+)*|ca*c)") + let automaton = RegularExpression::parse("(a+(ba+)*|ca*c)", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(automaton.match_string("cc")); assert!(automaton.match_string("caaac")); assert!(automaton.match_string("a")); @@ -278,11 +311,11 @@ mod tests { #[test] fn test_simple_alternation_regex_5() -> Result<(), String> { - let automaton = RegularExpression::new("((aad|ads|a)*|q)") + let automaton = RegularExpression::parse("((aad|ads|a)*|q)", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(automaton.match_string("q")); assert!(automaton.match_string("aad")); assert!(automaton.match_string("ads")); @@ -294,4 +327,48 @@ mod tests { assert!(!automaton.match_string("qq")); Ok(()) } + + #[test] + fn test_simple_alternation_regex_6() -> Result<(), String> { + let automaton = RegularExpression::parse("(ab|)", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.match_string("ab")); + assert!(automaton.match_string("")); + assert!(!automaton.match_string("a")); + assert!(!automaton.match_string("b")); + assert!(!automaton.match_string("aab")); + Ok(()) + } + + #[test] + fn test_simple_alternation_regex_7() -> Result<(), String> { + let automaton = RegularExpression::parse("(d|a?|ab)", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.match_string("a")); + assert!(automaton.match_string("d")); + assert!(automaton.match_string("ab")); + assert!(automaton.match_string("")); + Ok(()) + } + + #[test] + fn test_simple_alternation_regex_8() -> Result<(), String> { + let automaton = RegularExpression::parse("((d|a?|ab)u)*", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.match_string("au")); + assert!(automaton.match_string("du")); 
+ assert!(automaton.match_string("abu")); + assert!(automaton.match_string("u")); + assert!(automaton.match_string("")); + Ok(()) + } } diff --git a/src/fast_automaton/serializer/mod.rs b/src/fast_automaton/serializer/mod.rs index aa06df0..7a40bae 100644 --- a/src/fast_automaton/serializer/mod.rs +++ b/src/fast_automaton/serializer/mod.rs @@ -134,7 +134,7 @@ mod tests { } fn assert_serialization(regex: &str) { - let regex = RegularExpression::new(regex).unwrap(); + let regex = RegularExpression::parse(regex, false).unwrap(); println!("{regex}"); let automaton = regex.to_automaton().unwrap(); @@ -153,11 +153,11 @@ mod tests { #[test] fn test_serialization_case_1() -> Result<(), String> { - let automaton1 = RegularExpression::new(".*") + let automaton1 = RegularExpression::parse(".*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("\\d+") + let automaton2 = RegularExpression::parse("\\d+", false) .unwrap() .to_automaton() .unwrap(); diff --git a/src/fast_automaton/serializer/tokenizer/embed_automaton.rs b/src/fast_automaton/serializer/tokenizer/embed_automaton.rs index 825ea7e..429c008 100644 --- a/src/fast_automaton/serializer/tokenizer/embed_automaton.rs +++ b/src/fast_automaton/serializer/tokenizer/embed_automaton.rs @@ -135,29 +135,25 @@ mod tests { #[test] fn test_tokenize() -> Result<(), String> { - assert_embedding_convertion_for_fair("(a|b)"); - assert_embedding_convertion_for_fair("(|a)"); - assert_embedding_convertion_for_fair(".*ab"); - assert_embedding_convertion_for_fair("toto"); - assert_embedding_convertion_for_fair(".{2,3}"); - assert_embedding_convertion_for_fair("q(ab|ca|ab|abc)x"); - assert_embedding_convertion_for_fair(".*q(ab|ca|ab|abc)x"); - assert_embedding_convertion_for_fair( + assert_embedding_convertion("(a|b)"); + assert_embedding_convertion("(|a)"); + assert_embedding_convertion(".*ab"); + assert_embedding_convertion("toto"); + assert_embedding_convertion(".{2,3}"); + assert_embedding_convertion("q(ab|ca|ab|abc)x"); + assert_embedding_convertion(".*q(ab|ca|ab|abc)x"); + assert_embedding_convertion( "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q)", ); - assert_embedding_convertion_for_fair( + assert_embedding_convertion( "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])", ); Ok(()) } - fn assert_embedding_convertion_for_fair(regex: &str) { - assert_embedding_convertion(regex); - } - fn assert_embedding_convertion(regex: &str) { - let regex = RegularExpression::new(regex).unwrap(); + let regex = RegularExpression::parse(regex, false).unwrap(); println!("{}", regex); let automaton = regex.to_automaton().unwrap(); diff --git a/src/lib.rs b/src/lib.rs index 9c8e6ad..ea1bb71 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -41,26 +41,26 @@ pub type CharRange = RangeSet; /// /// // Concatenate /// let concat = t1.concat(&[t2])?; -/// assert_eq!(concat.to_pattern().unwrap(), "abc.*xyz"); +/// assert_eq!(concat.to_pattern(), "abc.*xyz"); /// /// // Union /// let union = t1.union(&[Term::from_pattern("fgh")?])?; -/// assert_eq!(union.to_pattern().unwrap(), "(abc.*|fgh)"); 
+/// assert_eq!(union.to_pattern(), "(abc.*|fgh)"); /// /// // Intersection /// let inter = Term::from_pattern("(ab|xy){2}")? /// .intersection(&[Term::from_pattern(".*xy")?])?; -/// assert_eq!(inter.to_pattern().unwrap(), "(ab|xy)xy"); +/// assert_eq!(inter.to_pattern(), "(ab|xy)xy"); /// /// // Difference /// let diff = Term::from_pattern("a*")? /// .difference(&Term::from_pattern("")?)?; -/// assert_eq!(diff.to_pattern().unwrap(), "a+"); +/// assert_eq!(diff.to_pattern(), "a+"); /// /// // Repetition /// let rep = Term::from_pattern("abc")? /// .repeat(2, Some(4))?; -/// assert_eq!(rep.to_pattern().unwrap(), "(abc){2,4}"); +/// assert_eq!(rep.to_pattern(), "(abc){2,4}"); /// /// // Analyze /// assert_eq!(rep.get_length(), (Some(6), Some(12))); @@ -388,7 +388,7 @@ impl Term { let automaton_1 = self.to_automaton()?; let automaton_2 = that.to_automaton()?; - automaton_1.are_equivalent(&automaton_2) + automaton_1.equivalent(&automaton_2) } /// Returns `true` if all strings matched by the current term are also matched by the given term. @@ -410,7 +410,7 @@ impl Term { let automaton_1 = self.to_automaton()?; let automaton_2 = that.to_automaton()?; - automaton_1.is_subset_of(&automaton_2) + automaton_1.subset(&automaton_2) } /// Checks if the term matches the empty language. @@ -465,17 +465,17 @@ impl Term { }) } - /// Converts the term to a RegularExpression; returns `None` if conversion isn’t possible. - pub fn to_regex(&self) -> Option> { - Some(match self { + /// Converts the term to a RegularExpression. + pub fn to_regex(&self) -> Cow { + match self { Term::RegularExpression(regex) => Cow::Borrowed(regex), - Term::Automaton(automaton) => Cow::Owned(automaton.to_regex()?), - }) + Term::Automaton(automaton) => Cow::Owned(automaton.to_regex()), + } } - /// Converts the term to a regular expression pattern; returns `None` if conversion isn’t possible. - pub fn to_pattern(&self) -> Option { - Some(self.to_regex()?.to_string()) + /// Converts the term to a regular expression pattern. 
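+    /// Unlike the previous API, this no longer returns an `Option`: converting an automaton to a pattern always succeeds.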
+ pub fn to_pattern(&self) -> String { + self.to_regex().to_string() } fn determinize_subtrahend<'a>( @@ -523,12 +523,12 @@ impl Term { fn get_regexes<'a>(&'a self, terms: &'a [Term]) -> Option>> { let mut regex_list = Vec::with_capacity(terms.len() + 1); - regex_list.push(self.to_regex()?); + regex_list.push(self.to_regex()); let mut terms_regexes = terms .iter() .map(Term::to_regex) - .collect::>>()?; + .collect::>(); regex_list.append(&mut terms_regexes); Some(regex_list) @@ -548,7 +548,7 @@ mod tests { let intersection = regex1.intersection(&vec![regex2]).unwrap(); assert!(intersection.is_empty()); - assert_eq!("[]", intersection.to_pattern().unwrap()); + assert_eq!("[]", intersection.to_pattern()); Ok(()) } @@ -560,7 +560,7 @@ mod tests { let result = regex1.difference(®ex2); assert!(result.is_ok()); - let result = result.unwrap().to_pattern().unwrap(); + let result = result.unwrap().to_pattern(); assert_eq!("a+", result); Ok(()) @@ -573,9 +573,9 @@ mod tests { let result = regex1.difference(®ex2); assert!(result.is_ok()); - let result = result.unwrap().to_regex().unwrap().into_owned(); + let result = result.unwrap().to_regex().into_owned(); assert_eq!( - Term::RegularExpression(RegularExpression::new("(xxx)*(x|xx)").unwrap()), + Term::RegularExpression(RegularExpression::new("x(x{3})*x?").unwrap()), Term::RegularExpression(result) ); @@ -589,7 +589,7 @@ mod tests { let result = regex1.intersection(&vec![regex2]); assert!(result.is_ok()); - let result = result.unwrap().to_pattern().unwrap(); + let result = result.unwrap().to_pattern(); assert_eq!("", result); Ok(()) @@ -602,7 +602,7 @@ mod tests { let result = regex1.intersection(&vec![regex2]); assert!(result.is_ok()); - let result = result.unwrap().to_pattern().unwrap(); + let result = result.unwrap().to_pattern(); assert_eq!("(x{3})*", result); Ok(()) diff --git a/src/regex/builder.rs b/src/regex/builder.rs index 799c69d..e5c8f9b 100644 --- a/src/regex/builder.rs +++ b/src/regex/builder.rs @@ -11,8 +11,13 @@ lazy_static! { } impl RegularExpression { - /// Parses the provided pattern and returns the resulting [`RegularExpression`]. + /// Parses and simplify the provided pattern and returns the resulting [`RegularExpression`]. pub fn new(pattern: &str) -> Result { + Self::parse(pattern, true) + } + + /// Parses the provided pattern and returns the resulting [`RegularExpression`]. 
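+    /// When `simplify` is `false`, the expression tree mirrors the pattern as written and no algebraic simplification is applied.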
+ pub fn parse(pattern: &str, simplify: bool) -> Result { if pattern.is_empty() { return Ok(RegularExpression::new_empty_string()); } @@ -24,7 +29,7 @@ impl RegularExpression { .build() .parse(&Self::remove_flags(pattern)) { - Ok(hir) => Self::convert_to_regex(&hir), + Ok(hir) => Self::convert_to_regex(&hir, simplify), Err(err) => Err(EngineError::RegexSyntaxError(err.to_string())), } } @@ -52,7 +57,7 @@ impl RegularExpression { RegularExpression::Concat(VecDeque::new()) } - fn convert_to_regex(hir: &Hir) -> Result { + fn convert_to_regex(hir: &Hir, simplify: bool) -> Result { match hir.kind() { HirKind::Empty => Ok(RegularExpression::new_empty_string()), HirKind::Literal(literal) => { @@ -84,15 +89,26 @@ impl RegularExpression { HirKind::Look(_) => Ok(RegularExpression::new_empty_string()), HirKind::Repetition(repetition) => { let (min, max) = (repetition.min, repetition.max); - Self::convert_to_regex(&repetition.sub).map(|v| v.repeat(min, max)) + let regex = Self::convert_to_regex(&repetition.sub, simplify)?; + Ok(if simplify { + regex.repeat(min, max) + } else { + RegularExpression::Repetition(Box::new(regex), min, max) + }) } - HirKind::Capture(capture) => Self::convert_to_regex(&capture.sub), + HirKind::Capture(capture) => Self::convert_to_regex(&capture.sub, simplify), HirKind::Concat(concat) => { let mut concat_regex = RegularExpression::Concat(VecDeque::with_capacity(concat.len())); for c in concat { - let concat_value = Self::convert_to_regex(c)?; - concat_regex = concat_regex.concat(&concat_value, true); + let concat_value = Self::convert_to_regex(c, simplify)?; + if simplify { + concat_regex = concat_regex.concat(&concat_value, true); + } else if let RegularExpression::Concat(values) = concat_regex { + let mut values = values.clone(); + values.push_back(concat_value); + concat_regex = RegularExpression::Concat(values); + } } Ok(concat_regex) } @@ -100,8 +116,14 @@ impl RegularExpression { let mut alternation_regex = RegularExpression::Alternation(Vec::with_capacity(alternation.len())); for a in alternation { - let alternation_value = Self::convert_to_regex(a)?; - alternation_regex = alternation_regex.union(&alternation_value); + let alternation_value = Self::convert_to_regex(a, simplify)?; + if simplify { + alternation_regex = alternation_regex.union(&alternation_value); + } else if let RegularExpression::Alternation(values) = alternation_regex { + let mut values = values.clone(); + values.push(alternation_value); + alternation_regex = RegularExpression::Alternation(values); + } } Ok(alternation_regex) } diff --git a/src/regex/mod.rs b/src/regex/mod.rs index 05908c0..2f5062f 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -151,6 +151,96 @@ impl RegularExpression { } } } + + pub fn evaluate_complexity(&self) -> f64 { + let (score, depth, _) = self.eval_inner(); + score + Self::depth_penalty(depth) + } + + // returns: (score, max_depth, contains_repetition) + fn eval_inner(&self) -> (f64, usize, bool) { + match self { + RegularExpression::Character(range) => { + let len = range.to_regex().len() as f64; + // small, capped cost for raw length + let base = 1.0 + 0.05 * len.min(40.0); + (base, 1, false) + } + + RegularExpression::Repetition(inner, min, max_opt) => { + let (inner_score, inner_depth, inner_has_rep) = inner.eval_inner(); + + // multipliers tuned for readability impact + let mut m = match max_opt { + None => 1.6, // open upper bound like a+ or a{m,} + Some(max) if max > min => 1.3, // variable upper bound a{m,n} + Some(max) if max == min && *min > 1 => 1.1, // 
exact count a{n} + _ => 1.0, // a{1} or degenerate + }; + + // nested quantifiers like (?:...+)+ are harder + if inner_has_rep { + m *= 1.5; + } + + (inner_score * m, inner_depth + 1, true) + } + + RegularExpression::Concat(items) => { + let mut sum = 0.0; + let mut max_depth = 0usize; + let mut has_rep = false; + + for (i, it) in items.iter().enumerate() { + let (s, d, h) = it.eval_inner(); + sum += s; + if i > 0 { + // tiny discount: linear sequences are relatively easy to read + sum *= 0.98; + } + if d > max_depth { + max_depth = d; + } + has_rep |= h; + } + + (sum, max_depth + 1, has_rep) + } + + RegularExpression::Alternation(branches) => { + if branches.is_empty() { + return (0.0, 1, false); + } + let mut sum = 0.0; + let mut max_depth = 0usize; + let mut has_rep = false; + + for b in branches { + let (s, d, h) = b.eval_inner(); + sum += s; + if d > max_depth { + max_depth = d; + } + has_rep |= h; + } + + // branching cost: more alternatives = harder to scan + let k = branches.len() as f64; + let multiplier = 1.0 + 0.15 * (k - 1.0); + + (sum * multiplier, max_depth + 1, has_rep) + } + } + } + + fn depth_penalty(depth: usize) -> f64 { + // no penalty up to depth 2, then quadratic growth + if depth <= 2 { + 0.0 + } else { + ((depth - 2) as f64).powi(2) * 0.8 + } + } } #[cfg(test)] diff --git a/src/regex/operation/concat.rs b/src/regex/operation/concat.rs index fe83c46..4e346f8 100644 --- a/src/regex/operation/concat.rs +++ b/src/regex/operation/concat.rs @@ -179,17 +179,9 @@ impl RegularExpression { } else { None }; - Some(RegularExpression::Repetition( - this_regex.clone(), - new_min, - new_max_opt, - )) + Some(this_regex.repeat(new_min, new_max_opt)) } else { - Some(RegularExpression::Repetition( - Box::new(this.clone()), - 2, - Some(2), - )) + Some(this.repeat(2, Some(2))) } } else if let ( RegularExpression::Repetition(this_regex, this_min, this_max_opt), @@ -204,11 +196,8 @@ impl RegularExpression { } else { None }; - Some(RegularExpression::Repetition( - this_regex.clone(), - new_min, - new_max_opt, - )) + + Some(this_regex.repeat(new_min, new_max_opt)) } else if let ( RegularExpression::Character(this_range), RegularExpression::Character(that_range), @@ -227,11 +216,7 @@ impl RegularExpression { if **this_regex == *that { let new_min = this_min + 1; let new_max_opt = this_max_opt.as_ref().map(|this_max| this_max + 1); - Some(RegularExpression::Repetition( - this_regex.clone(), - new_min, - new_max_opt, - )) + Some(this_regex.repeat(new_min, new_max_opt)) } else { None } @@ -239,11 +224,7 @@ impl RegularExpression { if **that_regex == *this { let new_min = that_min + 1; let new_max_opt = that_max_opt.as_ref().map(|this_max| this_max + 1); - Some(RegularExpression::Repetition( - that_regex.clone(), - new_min, - new_max_opt, - )) + Some(that_regex.repeat(new_min, new_max_opt)) } else { None } diff --git a/src/regex/operation/repeat.rs b/src/regex/operation/repeat.rs index 86ddbe5..c5578ca 100644 --- a/src/regex/operation/repeat.rs +++ b/src/regex/operation/repeat.rs @@ -248,6 +248,6 @@ mod tests { let result = got.to_automaton().unwrap(); - assert!(repeat.are_equivalent(&result).unwrap()); + assert!(repeat.equivalent(&result).unwrap()); } } diff --git a/src/regex/operation/union.rs b/src/regex/operation/union.rs index 8e4f1f3..ee6abee 100644 --- a/src/regex/operation/union.rs +++ b/src/regex/operation/union.rs @@ -9,8 +9,9 @@ impl RegularExpression { } /// Returns a regular expression that is the union of all expressions in `patterns`. 
- pub fn union_all<'a, I: IntoIterator>(patterns: I) -> RegularExpression - { + pub fn union_all<'a, I: IntoIterator>( + patterns: I, + ) -> RegularExpression { let mut result: Cow<'a, RegularExpression> = Cow::Owned(RegularExpression::new_empty()); for other in patterns { @@ -104,11 +105,7 @@ impl RegularExpression { ) = (this_character, that_repetition) { if this_character == &**that_regex && *that_min <= 2 { - RegularExpression::Repetition( - that_regex.clone(), - cmp::min(1, *that_min), - *that_max_opt, - ) + that_regex.repeat(cmp::min(1, *that_min), *that_max_opt) } else { let mut alternate = vec![this_character.clone(), that_repetition.clone()]; alternate.sort_unstable(); @@ -139,18 +136,10 @@ impl RegularExpression { self_regex.union_(&other_regex) } } else { - Cow::Owned(RegularExpression::Repetition( - Box::new(self_regex), - 0, - Some(1), - )) + Cow::Owned(self_regex.repeat(0, Some(1))) } } else if !other_regex.is_empty_string() { - Cow::Owned(RegularExpression::Repetition( - Box::new(other_regex), - 0, - Some(1), - )) + Cow::Owned(other_regex.repeat(0, Some(1))) } else { Cow::Owned(RegularExpression::new_empty_string()) }; @@ -228,11 +217,7 @@ impl RegularExpression { ) = (this_concat, that_repetition) { if this_concat == &**that_regex && *that_min <= 2 { - RegularExpression::Repetition( - that_regex.clone(), - cmp::min(1, *that_min), - *that_max_opt, - ) + that_regex.repeat(cmp::min(1, *that_min), *that_max_opt) } else { Self::opunion_common_affixes(this_concat, that_repetition) } @@ -288,18 +273,13 @@ impl RegularExpression { || this_max + 1 == *that_min || that_max + 1 == *this_min { - return RegularExpression::Repetition( - this_regex.clone(), + return this_regex.repeat( cmp::min(*this_min, *that_min), Some(cmp::max(*this_max, *that_max)), ); } } else { - return RegularExpression::Repetition( - this_regex.clone(), - cmp::min(*this_min, *that_min), - None, - ); + return this_regex.repeat(cmp::min(*this_min, *that_min), None); } } @@ -321,11 +301,7 @@ impl RegularExpression { ) = (this_repetition, that_alternation) { if that_alternation == &**this_regex && *this_min <= 2 { - RegularExpression::Repetition( - this_regex.clone(), - cmp::min(1, *this_min), - *this_max_opt, - ) + this_regex.repeat(cmp::min(1, *this_min), *this_max_opt) } else { let mut set = BTreeSet::new(); diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index 4dd7e47..9609c8f 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -22,13 +22,13 @@ fn assert_regex(regex: &str) { assert!(re.is_match(&string), "'{string}'"); } - assert!(automaton.is_subset_of(&determinized_automaton).unwrap()); - assert!(determinized_automaton.is_subset_of(&automaton).unwrap()); - assert!(automaton.are_equivalent(&determinized_automaton).unwrap()); + assert!(automaton.subset(&determinized_automaton).unwrap()); + assert!(determinized_automaton.subset(&automaton).unwrap()); + assert!(automaton.equivalent(&determinized_automaton).unwrap()); - let regex_from_automaton = automaton.to_regex().unwrap(); + let regex_from_automaton = automaton.to_regex(); let automaton_from_regex = regex_from_automaton.to_automaton().unwrap(); - assert!(automaton.are_equivalent(&automaton_from_regex).unwrap()); + assert!(automaton.equivalent(&automaton_from_regex).unwrap()); } #[test] From 135cca6cd5a9f44c6319ae74996f2d9f49001d9f Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Tue, 16 Sep 2025 21:09:59 +0200 Subject: [PATCH 25/44] update tests --- 
benches/my_benchmark.rs | 2 +- tests/data/regex-todo.txt | 6 ------ tests/data/regex.txt | 8 +++++++- tests/integration_tests.rs | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/benches/my_benchmark.rs b/benches/my_benchmark.rs index c35164a..71898ec 100644 --- a/benches/my_benchmark.rs +++ b/benches/my_benchmark.rs @@ -6,7 +6,7 @@ fn parse_regex(regex: &str) -> RegularExpression { } fn to_regex(automaton: &FastAutomaton) -> RegularExpression { - automaton.to_regex().unwrap() + automaton.to_regex() } fn determinize(automaton: &FastAutomaton) -> FastAutomaton { diff --git a/tests/data/regex-todo.txt b/tests/data/regex-todo.txt index 05849d6..e69de29 100644 --- a/tests/data/regex-todo.txt +++ b/tests/data/regex-todo.txt @@ -1,6 +0,0 @@ -(a*,a*)* -#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3}) -\{(?:\s*"[^"]*"\s*:\s*(?:\d+|"(...)" )\s*)(?:,\s*"[^"]*"\s*:\s*(?:\d+|"(...)" )\s*)*\} -rgba?\(\s*(?:\d{1,3}\s*,\s*){2}\d{1,3}(?:\s*,\s*(?:0|1|0?\.\d+))?\s*\) -[+-]?(?:\d+\.\d*|\.\d+)(?:[eE][+-]?\d+)? -<\w+(?:\s+\w+(?:="[^"]*")?)*\s*/?> \ No newline at end of file diff --git a/tests/data/regex.txt b/tests/data/regex.txt index 3eebe62..31aa829 100644 --- a/tests/data/regex.txt +++ b/tests/data/regex.txt @@ -70,4 +70,10 @@ https?://[^\s/$.?#][^\s]* [[:alnum:]&&[^0-9]] [ \t]+ [\r\n]+ -[^\t\r\n]+ \ No newline at end of file +[^\t\r\n]+ +(a*,a*)* +#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3}) +\{(?:\s*"[^"]*"\s*:\s*(?:\d+|"(...)" )\s*)(?:,\s*"[^"]*"\s*:\s*(?:\d+|"(...)" )\s*)*\} +rgba?\(\s*(?:\d{1,3}\s*,\s*){2}\d{1,3}(?:\s*,\s*(?:0|1|0?\.\d+))?\s*\) +[+-]?(?:\d+\.\d*|\.\d+)(?:[eE][+-]?\d+)? +<\w+(?:\s+\w+(?:="[^"]*")?)*\s*/?> \ No newline at end of file diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index 9609c8f..319f261 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -9,7 +9,7 @@ use regexsolver::regex::RegularExpression; fn assert_regex(regex: &str) { let re = Regex::new(&format!("(?s)^{}$", regex)).unwrap(); - let regex = RegularExpression::new(regex).unwrap(); + let regex = RegularExpression::parse(regex, true).unwrap(); let automaton = regex.to_automaton().unwrap(); let strings = automaton.generate_strings(500).unwrap(); for string in strings { From c3d800a998f82c7caa8413e5d1113e60928ba282 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Tue, 16 Sep 2025 21:12:42 +0200 Subject: [PATCH 26/44] fix clippy --- src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs b/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs index bf5e682..33142ca 100644 --- a/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs +++ b/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs @@ -70,7 +70,7 @@ fn dot_star_component(automaton: &mut FastAutomaton, component: &IntSet) has_transition_to_start_state = true; } - this_condition = this_condition.union(&condition); + this_condition = this_condition.union(condition); } if !has_transition_to_start_state { // Some state(s) do not have transition to the start state. 
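
A minimal usage sketch of the API surface reshaped by the commits above (`RegularExpression::parse` with an explicit simplify flag, `union_all`, `Term::to_pattern` returning `String`, and the new `evaluate_complexity` heuristic). This is not part of the patch series: it assumes the crate-root paths used elsewhere in the repository (`regexsolver::Term`, `regexsolver::regex::RegularExpression`) and that `union_all` accepts an iterator yielding `&RegularExpression`.

use regexsolver::{regex::RegularExpression, Term};

fn api_sketch() {
    // `parse` keeps the raw structure when `simplify` is false;
    // `new(pattern)` is now shorthand for `parse(pattern, true)`.
    let raw = RegularExpression::parse("(a|b)(a|b)", false).unwrap();
    let simplified = RegularExpression::new("(a|b)(a|b)").unwrap();

    // `union_all` folds an iterator of expressions into a single union.
    let union = RegularExpression::union_all([&raw, &simplified]);

    // `Term::to_pattern` now returns a `String` directly instead of an `Option`.
    let term = Term::from_regex(union);
    println!("union pattern: {}", term.to_pattern());

    // `evaluate_complexity` is a heuristic readability score: nested
    // quantifiers and deeper nesting score higher than flat literals.
    let flat = RegularExpression::new("abc").unwrap();
    let nested = RegularExpression::new("(a+b){2,}").unwrap();
    println!(
        "complexity: {} (flat) vs {} (nested)",
        flat.evaluate_complexity(),
        nested.evaluate_complexity()
    );
}
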
From afbacc6941d046d444aeb0e75d3092bba6fe037d Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Tue, 16 Sep 2025 21:24:28 +0200 Subject: [PATCH 27/44] add test --- .../convert/to_regex/transform/mod.rs | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/fast_automaton/convert/to_regex/transform/mod.rs b/src/fast_automaton/convert/to_regex/transform/mod.rs index 552222e..643e6ba 100644 --- a/src/fast_automaton/convert/to_regex/transform/mod.rs +++ b/src/fast_automaton/convert/to_regex/transform/mod.rs @@ -14,3 +14,34 @@ pub fn transform(automaton: &FastAutomaton) -> FastAutomaton { automaton } + +#[cfg(test)] +mod tests { + use crate::{ + fast_automaton::convert::to_regex::transform::transform, regex::RegularExpression, + }; + + #[test] + fn test_equivalence() -> Result<(), String> { + assert_equivalent("abc"); + assert_equivalent(".*abc"); + assert_equivalent(".*abc.*def"); + assert_equivalent(".*abc.*def(ab|fr)"); + assert_equivalent(".*abc.*def(ab|fr).*mpa"); + + Ok(()) + } + + fn assert_equivalent(pattern: &str) { + let before = RegularExpression::parse(pattern, false) + .unwrap() + .to_automaton() + .unwrap(); + + let before = before.determinize().unwrap(); + + let after = transform(&before); + + assert!(before.equivalent(&after).unwrap()); + } +} From 7b576f72dcda1f65fd1c2bc77f465197235b7482 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Tue, 16 Sep 2025 21:41:29 +0200 Subject: [PATCH 28/44] update readme --- README.md | 46 ++++++++++++++++++++++------------------------ src/lib.rs | 12 ++++++------ 2 files changed, 28 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 4406477..b3baf47 100644 --- a/README.md +++ b/README.md @@ -38,26 +38,26 @@ fn main() -> Result<(), EngineError> { // Concatenate let concat = t1.concat(&[t2])?; - assert_eq!(concat.to_pattern().unwrap(), "abc.*xyz"); + assert_eq!(concat.to_pattern(), "abc.*xyz"); // Union let union = t1.union(&[Term::from_pattern("fgh")?])?; - assert_eq!(union.to_pattern().unwrap(), "(abc.*|fgh)"); + assert_eq!(union.to_pattern(), "(abc.*|fgh)"); // Intersection let inter = Term::from_pattern("(ab|xy){2}")? .intersection(&[Term::from_pattern(".*xy")?])?; - assert_eq!(inter.to_pattern().unwrap(), "(ab|xy)xy"); + assert_eq!(inter.to_pattern(), "(ab|xy)xy"); // Difference let diff = Term::from_pattern("a*")? .difference(&Term::from_pattern("")?)?; - assert_eq!(diff.to_pattern().unwrap(), "a+"); + assert_eq!(diff.to_pattern(), "a+"); // Repetition let rep = Term::from_pattern("abc")? .repeat(2, Some(4))?; - assert_eq!(rep.to_pattern().unwrap(), "(abc){2,4}"); + assert_eq!(rep.to_pattern(), "(abc){2,4}"); // Analyze assert_eq!(rep.get_length(), (Some(6), Some(12))); @@ -71,8 +71,8 @@ fn main() -> Result<(), EngineError> { // Equivalence & subset let a = Term::from_pattern("a+")?; let b = Term::from_pattern("a*")?; - assert!(!a.are_equivalent(&b)?); - assert!(a.is_subset_of(&b)?); + assert!(!a.equivalent(&b)?); + assert!(a.subset(&b)?); Ok(()) } @@ -118,17 +118,17 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re #### Analyze | Method | Return | Description | | -------- | ------- | ------- | -| `are_equivalent(&self, term: &Term)` | `Result` | Returns `true` if both terms accept the same language. | +| `equivalent(&self, term: &Term)` | `Result` | Returns `true` if both terms accept the same language. 
| | `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates `count` strings matched by the term. | | `get_cardinality()` | `Result, EngineError>` | Returns the cardinality of the term (i.e., the number of possible matched strings). | | `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of matched strings. | | `is_empty(&self)` | `bool` | Checks if the term matches the empty language. | | `is_empty_string(&self)` | `bool` | Checks if the term matches only the empty string `""`. | -| `is_subset_of(&self, term: &Term)` | `Result` | Returns `true` if all strings matched by the current term are also matched by the given term. | | `is_total(&self)` | `bool` | Checks if the term matches all possible strings. | +| `subset(&self, term: &Term)` | `Result` | Returns `true` if all strings matched by the current term are also matched by the given term. | | `to_automaton(&self)` | `Result, EngineError>` | Converts the term to a `FastAutomaton`. | -| `to_pattern(&self)` | `Option` | Converts the term to a regular expression pattern; returns `None` if conversion isn’t possible. | -| `to_regex(&self)` | `Option>` | Converts the term to a RegularExpression; returns `None` if conversion isn’t possible. | +| `to_pattern(&self)` | `String` | Converts the term to a regular expression pattern. | +| `to_regex(&self)` | `Cow` | Converts the term to a RegularExpression. | ### FastAutomaton @@ -192,17 +192,16 @@ This design allows us to perform unions, intersections, and complements of trans #### Analyze | Method | Return | Description | | -------- | ------- | ------- | -| `all_states_iter(&self)` | `impl Iterator` | Returns an iterator over the automaton’s states. | -| `all_states_vec(&self)` | `Vec` | Returns a vector containing the automaton’s states. | -| `are_equivalent(&self, other: &FastAutomaton)` | `Result` | Returns `true` if both automata accept the same language. | -| `direct_states_iter(&self, state: &State)` | `impl Iterator` | Returns an iterator over states directly reachable from the given state in one transition. | +| `states(&self)` | `impl Iterator` | Returns an iterator over the automaton’s states. | +| `states_vec(&self)` | `Vec` | Returns a vector containing the automaton’s states. | +| `direct_states(&self, state: &State)` | `impl Iterator` | Returns an iterator over states directly reachable from the given state in one transition. | | `direct_states_vec(&self, state: &State)` | `Vec` | Returns a vector of states directly reachable from the given state in one transition. | | `does_transition_exists(&self, from_state: State, to_state: State)` | `bool` | Returns `true` if there is a directed transition from `from_state` to `to_state`. | +| `equivalent(&self, other: &FastAutomaton)` | `Result` | Returns `true` if both automata accept the same language. | | `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates `count` strings matched by the automaton. | | `get_accept_states(&self)` | `&IntSet` | Returns a reference to the set of accept (final) states. | | `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the automaton (i.e., the number of possible matched strings). | | `get_condition(&self, from_state: State, to_state: State)` | `Option<&Condition>` | Returns a reference to the condition of the directed transition between the two states, if any. 
| -| `get_condition_mut(&mut self, from_state: State, to_state: State)` | `Option<&Condition>` | Returns a mutable reference to the condition of the directed transition between the two states, if any. | | `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of matched strings. | | `get_reachable_states(&self)` | `IntSet` | Returns the set of all states reachable from the start state. | | `get_spanning_set(&self)` | `&SpanningSet` | Returns a reference to the automaton's spanning set. | @@ -214,14 +213,12 @@ This design allows us to perform unions, intersections, and complements of trans | `is_deterministic(&self)` | `bool` | Returns `true` if the automaton is deterministic. | | `is_empty(&self)` | `bool` | Checks if the automaton matches the empty language. | | `is_empty_string(&self)` | `bool` | Checks if the automaton only matches the empty string `""`. | -| `is_subset_of(&self, other: &FastAutomaton)` | `Result` | Returns `true` if all strings accepted by `self` are also accepted by `other`. | | `is_total(&self)` | `bool` | Checks if the automaton matches all possible strings. | -| `state_in_degree(&self, state: State)` | `usize` | Returns the number of transitions to the provided state. | -| `state_out_degree(&self, state: State)` | `usize` | Returns the number of transitions from the provided state. | -| `to_regex(&self)` | `Option` | Attempts to convert the automaton to a `RegularExpression`; returns `None` if no equivalent pattern are found. | -| `transitions_from_into_iter(&self, state: State)` | `impl Iterator` | Returns an owned iterator over transitions from the given state. | -| `transitions_from_iter(&self, state: State)` | `impl Iterator` | Returns an iterator over transitions from the given state. | -| `transitions_from_iter_mut(&mut self, state: State)` | `impl Iterator` | Returns a mutable iterator over transitions from the given state. | +| `in_degree(&self, state: State)` | `usize` | Returns the number of transitions to the provided state. | +| `out_degree(&self, state: State)` | `usize` | Returns the number of transitions from the provided state. | +| `to_regex(&self)` | `RegularExpression` | Convert the automaton to a `RegularExpression`. | +| `subset(&self, other: &FastAutomaton)` | `Result` | Returns `true` if all strings accepted by `self` are also accepted by `other`. | +| `transitions_from(&self, state: State)` | `impl Iterator` | Returns an iterator over transitions from the given state. | | `transitions_from_vec(&self, state: State)` | `Vec` | Returns a vector of transitions from the given state. | | `transitions_to_vec(&self, state: State)` | `Vec` | Returns a vector of transitions to the given state. | @@ -234,10 +231,11 @@ This design allows us to perform unions, intersections, and complements of trans | Method | Return | Description | | -------- | ------- | ------- | | `concat(&self, other: &RegularExpression, append_back: bool)` | `RegularExpression` | Returns a new regular expression representing the concatenation of `self` and `other`; `append_back` determines their order. | -| `new(pattern: &str)` | `Result` | Parses the provided pattern and returns the resulting `RegularExpression`. | +| `new(pattern: &str)` | `Result` | Parses and simplifies the provided pattern and returns the resulting `RegularExpression`. | | `new_empty()` | `RegularExpression` | Creates a regular expression that matches the empty language. 
| | `new_empty_string()` | `RegularExpression` | Creates a regular expression that matches only the empty string `""`. | | `new_total()` | `RegularExpression` | Creates a regular expression that matches all possible strings. | +| `parse(pattern: &str, simplify: bool)` | `Result` | Parses the given regular expression pattern and returns a corresponding `RegularExpression`. If simplify is `true`, the expression is simplified during parsing. | | `repeat(&self, min: u32, max_opt: Option)` | `RegularExpression` | Returns the repetition of the expression between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. | | `simplify(&self)` | `RegularExpression` | Returns a simplified version by eliminating redundant constructs and applying canonical reductions. | | `union(&self, other: &RegularExpression)` | `RegularExpression` | Returns a regular expression matching the union of `self` and `other`. | diff --git a/src/lib.rs b/src/lib.rs index ea1bb71..5c52410 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -74,8 +74,8 @@ pub type CharRange = RangeSet; /// // Equivalence & subset /// let a = Term::from_pattern("a+")?; /// let b = Term::from_pattern("a*")?; -/// assert!(!a.are_equivalent(&b)?); -/// assert!(a.is_subset_of(&b)?); +/// assert!(!a.equivalent(&b)?); +/// assert!(a.subset(&b)?); /// /// Ok(()) /// } @@ -379,9 +379,9 @@ impl Term { /// let term1 = Term::from_pattern("(abc|de)").unwrap(); /// let term2 = Term::from_pattern("(abc|de)*").unwrap(); /// - /// assert!(!term1.are_equivalent(&term2).unwrap()); + /// assert!(!term1.equivalent(&term2).unwrap()); /// ``` - pub fn are_equivalent(&self, that: &Term) -> Result { + pub fn equivalent(&self, that: &Term) -> Result { if self == that { return Ok(true); } @@ -401,9 +401,9 @@ impl Term { /// let term1 = Term::from_pattern("de").unwrap(); /// let term2 = Term::from_pattern("(abc|de)").unwrap(); /// - /// assert!(term1.is_subset_of(&term2).unwrap()); + /// assert!(term1.subset(&term2).unwrap()); /// ``` - pub fn is_subset_of(&self, that: &Term) -> Result { + pub fn subset(&self, that: &Term) -> Result { if self == that { return Ok(true); } From 0a0d91ba57cc3bff05aff3b80365fbba006451d5 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Tue, 16 Sep 2025 21:55:00 +0200 Subject: [PATCH 29/44] update readme --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index b3baf47..2afb543 100644 --- a/README.md +++ b/README.md @@ -94,13 +94,13 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re ### Term -`Term` is an enum designed to represent either a regular expression or a compiled automaton. This unified representation enables seamless and efficient execution of set operations across multiple instances. It's particularly valuable when working with both regular expressions and automata, allowing operations to be performed transparently regardless of the underlying representation. +`Term` is an enum designed to represent either a regular expression or an automaton. Used when working with both regular expressions and automata, allowing operations to be performed transparently regardless of the underlying representation. #### Build | Method | Return | Description | | -------- | ------- | ------- | | `from_automaton(automaton: FastAutomaton)` | `Term` | Creates a new `Term` holding the provided `FastAutomaton`. 
| -| `from_pattern(pattern: &str)` | `Result` | Parses the provided pattern and returns a new `Term` holding the resulting `RegularExpression`. | +| `from_pattern(pattern: &str)` | `Result` | Parses and simplifies the provided pattern and returns a new `Term` holding the resulting `RegularExpression`. | | `from_regex(regex: RegularExpression)` | `Term` | Creates a new `Term` holding the provided `RegularExpression`. | | `new_empty()` | `Term` | Creates a term that matches the empty language. | | `new_empty_string()` | `Term` | Creates a term that only matches the empty string `""`. | @@ -133,7 +133,7 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re ### FastAutomaton -`FastAutomaton` is used to directly build, manipulate and analyze automata. To convert an automaton to a `RegularExpression` the method `to_regex()` can be used. Not all automata can be converted to a regular expression. +`FastAutomaton` is used to directly build, manipulate and analyze automata. To convert an automaton to a `RegularExpression` the method `to_regex()` can be used. When building or modifying an automaton you might come to use the method `add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition)`. This method accepts a `Condition` rather than a raw character set. To build a `Condition`, call: ```rust @@ -192,8 +192,6 @@ This design allows us to perform unions, intersections, and complements of trans #### Analyze | Method | Return | Description | | -------- | ------- | ------- | -| `states(&self)` | `impl Iterator` | Returns an iterator over the automaton’s states. | -| `states_vec(&self)` | `Vec` | Returns a vector containing the automaton’s states. | | `direct_states(&self, state: &State)` | `impl Iterator` | Returns an iterator over states directly reachable from the given state in one transition. | | `direct_states_vec(&self, state: &State)` | `Vec` | Returns a vector of states directly reachable from the given state in one transition. | | `does_transition_exists(&self, from_state: State, to_state: State)` | `bool` | Returns `true` if there is a directed transition from `from_state` to `to_state`. | @@ -216,8 +214,10 @@ This design allows us to perform unions, intersections, and complements of trans | `is_total(&self)` | `bool` | Checks if the automaton matches all possible strings. | | `in_degree(&self, state: State)` | `usize` | Returns the number of transitions to the provided state. | | `out_degree(&self, state: State)` | `usize` | Returns the number of transitions from the provided state. | -| `to_regex(&self)` | `RegularExpression` | Convert the automaton to a `RegularExpression`. | +| `states(&self)` | `impl Iterator` | Returns an iterator over the automaton’s states. | +| `states_vec(&self)` | `Vec` | Returns a vector containing the automaton’s states. | | `subset(&self, other: &FastAutomaton)` | `Result` | Returns `true` if all strings accepted by `self` are also accepted by `other`. | +| `to_regex(&self)` | `RegularExpression` | Convert the automaton to a `RegularExpression`. | | `transitions_from(&self, state: State)` | `impl Iterator` | Returns an iterator over transitions from the given state. | | `transitions_from_vec(&self, state: State)` | `Vec` | Returns a vector of transitions from the given state. | | `transitions_to_vec(&self, state: State)` | `Vec` | Returns a vector of transitions to the given state. 
| From 05b68020137c2777d6076bac8fe713f8c4babc8f Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Tue, 16 Sep 2025 21:59:05 +0200 Subject: [PATCH 30/44] update method signature --- README.md | 2 +- src/fast_automaton/builder.rs | 8 ++++---- src/regex/mod.rs | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 2afb543..72e10fa 100644 --- a/README.md +++ b/README.md @@ -167,7 +167,7 @@ This design allows us to perform unions, intersections, and complements of trans | `apply_new_spanning_set(&mut self, new_spanning_set: &SpanningSet)` | `Result<(), EngineError>` | Applies the provided spanning set and projects all existing conditions onto it. | | `new_empty()` | `FastAutomaton` | Creates an automaton that matches the empty language. | | `new_empty_string()` | `FastAutomaton` | Creates an automaton that only matches the empty string `""`. | -| `new_from_range(range: &CharRange)` | `Result` | Creates an automaton that matches one of the characters in the given `CharRange`. | +| `new_from_range(range: &CharRange)` | `FastAutomaton` | Creates an automaton that matches one of the characters in the given `CharRange`. | | `new_state(&mut self)` | `State` | Creates a new state and returns its identifier. | | `new_total()` | `FastAutomaton` | Creates an automaton that matches all possible strings. | | `remove_state(&mut self, state: State)` | `()` | Removes the state and all its connected transitions; panics if it's a start state. | diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index da16808..4346fd2 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -39,19 +39,19 @@ impl FastAutomaton { } /// Creates an automaton that matches one of the characters in the given [`CharRange`]. - pub fn new_from_range(range: &CharRange) -> Result { + pub fn new_from_range(range: &CharRange) -> Self { let mut automaton = Self::new_empty(); if range.is_empty() { - return Ok(automaton); + return automaton; } let new_state = automaton.new_state(); let spanning_set = SpanningSet::compute_spanning_set(&[range.clone()]); - let condition = Condition::from_range(range, &spanning_set)?; + let condition = Condition::from_range(range, &spanning_set).expect("The spanning set should be valid"); automaton.spanning_set = spanning_set; automaton.add_transition(0, new_state, &condition); automaton.accept(new_state); - Ok(automaton) + automaton } /// Creates a new state and returns its identifier. 
diff --git a/src/regex/mod.rs b/src/regex/mod.rs index 2f5062f..d3d2b14 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -129,7 +129,7 @@ impl RegularExpression { ExecutionProfile::get().assert_max_number_of_states(self.get_number_of_states_in_nfa())?; match self { - RegularExpression::Character(range) => FastAutomaton::new_from_range(range), + RegularExpression::Character(range) => Ok(FastAutomaton::new_from_range(range)), RegularExpression::Repetition(regular_expression, min, max_opt) => { let mut automaton = regular_expression.to_automaton()?; automaton.repeat_mut(*min, *max_opt)?; From a2dc371580d5b1f2fef367582ec731821fc5f919 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Wed, 17 Sep 2025 22:06:09 +0200 Subject: [PATCH 31/44] add concat all for regex --- README.md | 4 +-- src/regex/mod.rs | 10 +++---- src/regex/operation/concat.rs | 51 +++++++++++++++++------------------ 3 files changed, 31 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 72e10fa..aa9505d 100644 --- a/README.md +++ b/README.md @@ -227,10 +227,11 @@ This design allows us to perform unions, intersections, and complements of trans `RegularExpression` is used to directly build, manipulate and analyze regular expression patterns. Not all the set operations are available, for more advanced operation such as intersection, subtraction/difference and complement it is necessary to convert into a `FastAutomaton` with the method `to_automaton()`. -#### Build +#### Build/Manipulate | Method | Return | Description | | -------- | ------- | ------- | | `concat(&self, other: &RegularExpression, append_back: bool)` | `RegularExpression` | Returns a new regular expression representing the concatenation of `self` and `other`; `append_back` determines their order. | +| `concat_all<'a, I: IntoIterator>(patterns: I)` | `RegularExpression` | Returns a regular expression that is the concatenation of all expressions in `patterns`. | | `new(pattern: &str)` | `Result` | Parses and simplifies the provided pattern and returns the resulting `RegularExpression`. | | `new_empty()` | `RegularExpression` | Creates a regular expression that matches the empty language. | | `new_empty_string()` | `RegularExpression` | Creates a regular expression that matches only the empty string `""`. 
| @@ -292,7 +293,6 @@ execution_profile.run(|| { ## Cross-Language Support - If you want to use this library with other programming languages, we provide a wide range of wrappers: - [regexsolver-java](https://github.com/RegexSolver/regexsolver-java) - [regexsolver-js](https://github.com/RegexSolver/regexsolver-js) diff --git a/src/regex/mod.rs b/src/regex/mod.rs index d3d2b14..44feecd 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -172,13 +172,13 @@ impl RegularExpression { // multipliers tuned for readability impact let mut m = match max_opt { - None => 1.6, // open upper bound like a+ or a{m,} - Some(max) if max > min => 1.3, // variable upper bound a{m,n} - Some(max) if max == min && *min > 1 => 1.1, // exact count a{n} - _ => 1.0, // a{1} or degenerate + None => 1.6, + Some(max) if max > min => 1.3, + Some(max) if max == min && *min > 1 => 1.1, + _ => 1.0, }; - // nested quantifiers like (?:...+)+ are harder + // nested quantifiers like (...+)+ are harder if inner_has_rep { m *= 1.5; } diff --git a/src/regex/operation/concat.rs b/src/regex/operation/concat.rs index 4e346f8..9cefb01 100644 --- a/src/regex/operation/concat.rs +++ b/src/regex/operation/concat.rs @@ -1,6 +1,19 @@ use super::*; impl RegularExpression { + /// Returns a regular expression that is the concatenation of all expressions in `patterns`. + pub fn concat_all<'a, I: IntoIterator>( + patterns: I, + ) -> RegularExpression { + let mut result = RegularExpression::new_empty_string(); + + for other in patterns { + result = result.concat(other, true); + } + + result + } + /// Returns a new regular expression representing the concatenation of `self` and `other`; `append_back` determines their order. pub fn concat(&self, other: &RegularExpression, append_back: bool) -> RegularExpression { if self.is_empty() || other.is_empty() { @@ -11,35 +24,19 @@ impl RegularExpression { return self.clone(); } - match (self, other) { + let (front, back) = if append_back { + (self, other) + } else { + (other, self) + }; + + match (front, back) { (RegularExpression::Concat(_), RegularExpression::Concat(_)) => { - if append_back { - Self::opconcat_concat_and_concat(self, other) - } else { - Self::opconcat_concat_and_concat(other, self) - } - } - (RegularExpression::Concat(_), _) => { - if append_back { - Self::opconcat_concat_and_other(self, other) - } else { - Self::opconcat_other_and_concat(other, self) - } - } - (_, RegularExpression::Concat(_)) => { - if append_back { - Self::opconcat_other_and_concat(self, other) - } else { - Self::opconcat_concat_and_other(other, self) - } - } - (_, _) => { - if append_back { - Self::opconcat_other_and_other(self, other) - } else { - Self::opconcat_other_and_other(other, self) - } + Self::opconcat_concat_and_concat(front, back) } + (RegularExpression::Concat(_), _) => Self::opconcat_concat_and_other(front, back), + (_, RegularExpression::Concat(_)) => Self::opconcat_other_and_concat(front, back), + (_, _) => Self::opconcat_other_and_other(front, back), } } From 7afac6256ba76c3d3c376f2d4a4ccbe1e1c41939 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Fri, 19 Sep 2025 20:48:21 +0200 Subject: [PATCH 32/44] update docs --- README.md | 31 ++++++++++++-------- src/fast_automaton/builder.rs | 5 ++-- src/fast_automaton/mod.rs | 29 ++++++++---------- src/fast_automaton/operation/intersection.rs | 4 +-- src/fast_automaton/operation/repeat.rs | 2 +- src/lib.rs | 14 ++++----- src/regex/builder.rs | 4 +-- src/regex/mod.rs | 3 +- 
src/regex/operation/repeat.rs | 20 ++++++------- 9 files changed, 58 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index aa9505d..1173ea8 100644 --- a/README.md +++ b/README.md @@ -120,7 +120,7 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re | -------- | ------- | ------- | | `equivalent(&self, term: &Term)` | `Result` | Returns `true` if both terms accept the same language. | | `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates `count` strings matched by the term. | -| `get_cardinality()` | `Result, EngineError>` | Returns the cardinality of the term (i.e., the number of possible matched strings). | +| `get_cardinality(&self)` | `Result, EngineError>` | Returns the cardinality of the term (i.e., the number of possible matched strings). | | `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of matched strings. | | `is_empty(&self)` | `bool` | Checks if the term matches the empty language. | | `is_empty_string(&self)` | `bool` | Checks if the term matches only the empty string `""`. | @@ -130,7 +130,6 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re | `to_pattern(&self)` | `String` | Converts the term to a regular expression pattern. | | `to_regex(&self)` | `Cow` | Converts the term to a RegularExpression. | - ### FastAutomaton `FastAutomaton` is used to directly build, manipulate and analyze automata. To convert an automaton to a `RegularExpression` the method `to_regex()` can be used. @@ -170,8 +169,9 @@ This design allows us to perform unions, intersections, and complements of trans | `new_from_range(range: &CharRange)` | `FastAutomaton` | Creates an automaton that matches one of the characters in the given `CharRange`. | | `new_state(&mut self)` | `State` | Creates a new state and returns its identifier. | | `new_total()` | `FastAutomaton` | Creates an automaton that matches all possible strings. | -| `remove_state(&mut self, state: State)` | `()` | Removes the state and all its connected transitions; panics if it's a start state. | +| `remove_state(&mut self, state: State)` | `()` | Removes the state and its connected transitions; panics if it's a start state. | | `remove_states(&mut self, states: &IntSet)` | `()` | Removes the given states and their connected transitions; panics if any is a start state. | +| `remove_transition(&mut self, from_state: State, to_state: State)` | `()` | Removes the transition between the two provided states if it exists. | #### Manipulate | Method | Return | Description | @@ -181,6 +181,7 @@ This design allows us to perform unions, intersections, and complements of trans | `concat_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Computes the concatenation of all automatons in the given iterator. | | `determinize(&self)` | `Result, EngineError>` | Determinizes the automaton and returns the result. | | `difference(&self, other: &FastAutomaton)` | `Result` | Computes the difference between `self` and `other`. | +| `has_intersection(&self, other: &FastAutomaton)` | `Result` | Returns `true` if the two automata have a non-empty intersection. | | `intersection(&self, other: &FastAutomaton)` | `Result` | Computes the intersection between `self` and `other`. | | `intersection_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Computes the intersection of all automatons in the given iterator. 
| | `intersection_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Computes in parallel the intersection of all automatons in the given iterator. | @@ -192,35 +193,37 @@ This design allows us to perform unions, intersections, and complements of trans #### Analyze | Method | Return | Description | | -------- | ------- | ------- | +| `as_dot(&self)` | `String` | Returns the automaton's DOT representation. | | `direct_states(&self, state: &State)` | `impl Iterator` | Returns an iterator over states directly reachable from the given state in one transition. | | `direct_states_vec(&self, state: &State)` | `Vec` | Returns a vector of states directly reachable from the given state in one transition. | | `does_transition_exists(&self, from_state: State, to_state: State)` | `bool` | Returns `true` if there is a directed transition from `from_state` to `to_state`. | | `equivalent(&self, other: &FastAutomaton)` | `Result` | Returns `true` if both automata accept the same language. | -| `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates `count` strings matched by the automaton. | +| `generate_strings(&self, number: usize)` | `Result, EngineError>` | Generates `count` strings matched by the automaton. | | `get_accept_states(&self)` | `&IntSet` | Returns a reference to the set of accept (final) states. | | `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the automaton (i.e., the number of possible matched strings). | | `get_condition(&self, from_state: State, to_state: State)` | `Option<&Condition>` | Returns a reference to the condition of the directed transition between the two states, if any. | | `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of matched strings. | -| `get_reachable_states(&self)` | `IntSet` | Returns the set of all states reachable from the start state. | +| `get_number_of_states(&self)` | `usize` | Returns the number of states in the automaton. | +| `get_reacheable_states(&self)` | `IntSet` | Returns the set of all states reachable from the start state. | | `get_spanning_set(&self)` | `&SpanningSet` | Returns a reference to the automaton's spanning set. | | `get_start_state(&self)` | `State` | Returns the start state. | -| `has_intersection(&self, other: &FastAutomaton)` | `Result` | Returns `true` if the two automata have a non-empty intersection. | | `has_state(&self, state: State)` | `bool` | Returns `true` if the automaton contains the given state. | +| `in_degree(&self, state: State)` | `usize` | Returns the number of transitions to the provided state. | | `is_accepted(&self, state: &State)` | `bool` | Returns `true` if the given state is one of the accept states. | | `is_cyclic(&self)` | `bool` | Returns `true` if the automaton contains at least one cycle. | -| `is_deterministic(&self)` | `bool` | Returns `true` if the automaton is deterministic. | +| `is_determinitic(&self)` | `bool` | Returns `true` if the automaton is deterministic. | | `is_empty(&self)` | `bool` | Checks if the automaton matches the empty language. | | `is_empty_string(&self)` | `bool` | Checks if the automaton only matches the empty string `""`. | | `is_total(&self)` | `bool` | Checks if the automaton matches all possible strings. | -| `in_degree(&self, state: State)` | `usize` | Returns the number of transitions to the provided state. | +| `match_string(&self, string: &str)` | `bool` | Returns `true` if the automaton matches the given string. 
| | `out_degree(&self, state: State)` | `usize` | Returns the number of transitions from the provided state. | +| `print_dot(&self)` | `()` | Prints the automaton's DOT representation. | | `states(&self)` | `impl Iterator` | Returns an iterator over the automaton’s states. | | `states_vec(&self)` | `Vec` | Returns a vector containing the automaton’s states. | | `subset(&self, other: &FastAutomaton)` | `Result` | Returns `true` if all strings accepted by `self` are also accepted by `other`. | -| `to_regex(&self)` | `RegularExpression` | Convert the automaton to a `RegularExpression`. | | `transitions_from(&self, state: State)` | `impl Iterator` | Returns an iterator over transitions from the given state. | -| `transitions_from_vec(&self, state: State)` | `Vec` | Returns a vector of transitions from the given state. | -| `transitions_to_vec(&self, state: State)` | `Vec` | Returns a vector of transitions to the given state. | +| `transitions_from_vec(&self, state: State)` | `Vec<(Condition, State)>` | Returns a vector of transitions from the given state. | +| `transitions_to_vec(&self, state: State)` | `Vec<(State, Condition)>` | Returns a vector of transitions to the given state. | ### RegularExpression @@ -236,8 +239,8 @@ This design allows us to perform unions, intersections, and complements of trans | `new_empty()` | `RegularExpression` | Creates a regular expression that matches the empty language. | | `new_empty_string()` | `RegularExpression` | Creates a regular expression that matches only the empty string `""`. | | `new_total()` | `RegularExpression` | Creates a regular expression that matches all possible strings. | -| `parse(pattern: &str, simplify: bool)` | `Result` | Parses the given regular expression pattern and returns a corresponding `RegularExpression`. If simplify is `true`, the expression is simplified during parsing. | -| `repeat(&self, min: u32, max_opt: Option)` | `RegularExpression` | Returns the repetition of the expression between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. | +| `parse(pattern: &str, simplify: bool)` | `Result` | Parses the provided pattern and returns the resulting `RegularExpression`. If simplify is `true`, the expression is simplified during parsing. | +| `repeat(&self, min: u32, max_opt: Option)` | `RegularExpression` | Computes the repetition of the automaton between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. | | `simplify(&self)` | `RegularExpression` | Returns a simplified version by eliminating redundant constructs and applying canonical reductions. | | `union(&self, other: &RegularExpression)` | `RegularExpression` | Returns a regular expression matching the union of `self` and `other`. | | `union_all<'a, I: IntoIterator>(patterns: I)` | `RegularExpression` | Returns a regular expression that is the union of all expressions in `patterns`. | @@ -245,6 +248,7 @@ This design allows us to perform unions, intersections, and complements of trans #### Analyze | Method | Return | Description | | -------- | ------- | ------- | +| `evaluate_complexity(&self)` | `f64` | Returns a heuristic score for the readability of the pattern. | | `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the regular expression (i.e., the number of possible matched strings). | | `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of possible matched strings. | | `is_empty(&self)` | `bool` | Checks if the regular expression matches the empty language. 
| @@ -252,6 +256,7 @@ This design allows us to perform unions, intersections, and complements of trans | `is_total(&self)` | `bool` | Checks if the regular expression matches all possible strings. | | `to_automaton(&self)` | `Result` | Converts the regular expression to an equivalent `FastAutomaton`. | + ## Bound Execution Use a thread-local `ExecutionProfile` to cap runtime or state explosion; hitting a limit returns a specific `EngineError`. diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index 4346fd2..c12b366 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -193,6 +193,7 @@ impl FastAutomaton { } } + /// Removes the transition between the two provided states if it exists. pub fn remove_transition(&mut self, from_state: State, to_state: State) { self.assert_state_exists(from_state); if from_state != to_state { @@ -206,7 +207,7 @@ impl FastAutomaton { self.transitions[from_state].remove(&to_state); } - /// Removes the state and all its connected transitions; panics if it's a start state. + /// Removes the state and its connected transitions; panics if it's a start state. pub fn remove_state(&mut self, state: State) { self.assert_state_exists(state); if self.start_state == state { @@ -236,7 +237,7 @@ impl FastAutomaton { } } - /// Remove the provided states from the automaton. Remove all the transitions they are connected to. Panic if one of the state is used as a start state. + /// Removes the given states and their connected transitions; panics if any is a start state. pub fn remove_states(&mut self, states: &IntSet) { self.accept_states.retain(|e| !states.contains(e)); diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index 7bf0313..c78604a 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -14,12 +14,6 @@ pub(crate) type Transitions = IntMap; /// The identifier of state in an [`FastAutomaton`] pub type State = usize; -/// A tuple containing the condition of a transition to a state. -pub type TransitionTo = (Condition, State); - -/// A tuple containing the condition of a transition from a state. -pub type TransitionFrom = (State, Condition); - mod analyze; mod builder; pub mod condition; @@ -125,8 +119,8 @@ impl FastAutomaton { self.direct_states(state).collect() } - /// Returns a vector containing the transitions to the provided state. - pub fn transitions_to_vec(&self, state: State) -> Vec { + /// Returns a vector of transitions to the given state. + pub fn transitions_to_vec(&self, state: State) -> Vec<(State, Condition)> { let mut in_transitions = vec![]; for from_state in self.transitions_in.get(&state).unwrap_or(&IntSet::new()) { for (condition, to_state) in self.transitions_from_vec(*from_state) { @@ -141,7 +135,7 @@ impl FastAutomaton { /// Returns a vector of transitions from the given state. #[inline] - pub fn transitions_from_vec(&self, state: State) -> Vec { + pub fn transitions_from_vec(&self, state: State) -> Vec<(Condition, State)> { self.transitions[state] .iter() .map(|(s, c)| (c.clone(), *s)) @@ -184,13 +178,13 @@ impl FastAutomaton { .collect() } - // Returns the number of states in the automaton. + /// Returns the number of states in the automaton. #[inline] pub fn get_number_of_states(&self) -> usize { self.transitions.len() - self.removed_states.len() } - // Returns a reference to the condition of the directed transition between the two states, if any. + /// Returns a reference to the condition of the directed transition between the two states, if any. 
#[inline] pub fn get_condition(&self, from_state: State, to_state: State) -> Option<&Condition> { self.transitions[from_state].get(&to_state) @@ -202,7 +196,7 @@ impl FastAutomaton { self.start_state } - // Returns a reference to the set of accept (final) states. + /// Returns a reference to the set of accept (final) states. #[inline] pub fn get_accept_states(&self) -> &IntSet { &self.accept_states @@ -238,21 +232,22 @@ impl FastAutomaton { !(state >= self.transitions.len() || self.removed_states.contains(&state)) } - pub fn match_string(&self, input: &str) -> bool { + /// Returns `true` if the automaton matches the given string. + pub fn match_string(&self, string: &str) -> bool { let mut worklist = VecDeque::with_capacity(self.get_number_of_states()); worklist.push_back((0, &self.start_state)); while let Some((position, current_state)) = worklist.pop_back() { - if input.len() == position { + if string.len() == position { if self.accept_states.contains(current_state) { return true; } continue; } - let curr_char = input.chars().nth(position).unwrap() as u32; + let curr_char = string.chars().nth(position).unwrap() as u32; for (cond, to_state) in self.transitions_from(*current_state) { if cond.has_character(&curr_char, &self.spanning_set).unwrap() { - if position + 1 == input.len() { + if position + 1 == string.len() { if self.accept_states.contains(to_state) { return true; } @@ -265,11 +260,13 @@ impl FastAutomaton { false } + /// Returns the automaton's DOT representation. #[inline] pub fn as_dot(&self) -> String { format!("{self}") } + /// Prints the automaton's DOT representation. #[inline] pub fn print_dot(&self) { println!("{self}"); diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 694d66c..b79ebab 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -126,7 +126,7 @@ impl FastAutomaton { Ok(Cow::Owned(new_automaton)) } - // Returns `true` if the two automata have a non-empty intersection. + /// Returns `true` if the two automata have a non-empty intersection. pub fn has_intersection(&self, other: &FastAutomaton) -> Result { if self.is_empty() || other.is_empty() { return Ok(false); @@ -195,7 +195,7 @@ impl FastAutomaton { &self, state: State, condition_converter: &ConditionConverter, - ) -> Result, EngineError> { + ) -> Result, EngineError> { let transitions_1: Result, EngineError> = self .transitions_from(state) .map(|(c, &s)| match condition_converter.convert(c) { diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs index cdfd36b..c7ff2d8 100644 --- a/src/fast_automaton/operation/repeat.rs +++ b/src/fast_automaton/operation/repeat.rs @@ -1,7 +1,7 @@ use super::*; impl FastAutomaton { - // Computes the repetition of the automaton between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. + /// Computes the repetition of the automaton between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. pub fn repeat(&self, min: u32, max_opt: Option) -> Result { let mut automaton = self.clone(); if let Err(error) = automaton.repeat_mut(min, max_opt) { diff --git a/src/lib.rs b/src/lib.rs index 5c52410..3d750f0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -118,7 +118,7 @@ impl Term { Term::RegularExpression(RegularExpression::new_empty_string()) } - /// Parses the provided pattern and returns a new `Term` holding the resulting [`RegularExpression`]. 
+ /// Parses and simplifies the provided pattern and returns a new [`Term`] holding the resulting [`RegularExpression`]. /// /// # Example: /// @@ -381,13 +381,13 @@ impl Term { /// /// assert!(!term1.equivalent(&term2).unwrap()); /// ``` - pub fn equivalent(&self, that: &Term) -> Result { - if self == that { + pub fn equivalent(&self, term: &Term) -> Result { + if self == term { return Ok(true); } let automaton_1 = self.to_automaton()?; - let automaton_2 = that.to_automaton()?; + let automaton_2 = term.to_automaton()?; automaton_1.equivalent(&automaton_2) } @@ -403,13 +403,13 @@ impl Term { /// /// assert!(term1.subset(&term2).unwrap()); /// ``` - pub fn subset(&self, that: &Term) -> Result { - if self == that { + pub fn subset(&self, term: &Term) -> Result { + if self == term { return Ok(true); } let automaton_1 = self.to_automaton()?; - let automaton_2 = that.to_automaton()?; + let automaton_2 = term.to_automaton()?; automaton_1.subset(&automaton_2) } diff --git a/src/regex/builder.rs b/src/regex/builder.rs index e5c8f9b..6d00471 100644 --- a/src/regex/builder.rs +++ b/src/regex/builder.rs @@ -11,12 +11,12 @@ lazy_static! { } impl RegularExpression { - /// Parses and simplify the provided pattern and returns the resulting [`RegularExpression`]. + /// Parses and simplifies the provided pattern and returns the resulting [`RegularExpression`]. pub fn new(pattern: &str) -> Result { Self::parse(pattern, true) } - /// Parses the provided pattern and returns the resulting [`RegularExpression`]. + /// Parses the provided pattern and returns the resulting [`RegularExpression`]. If simplify is `true`, the expression is simplified during parsing. pub fn parse(pattern: &str, simplify: bool) -> Result { if pattern.is_empty() { return Ok(RegularExpression::new_empty_string()); diff --git a/src/regex/mod.rs b/src/regex/mod.rs index 44feecd..a7682f6 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -152,12 +152,13 @@ impl RegularExpression { } } + /// Returns a heuristic score for the readability of the pattern. pub fn evaluate_complexity(&self) -> f64 { let (score, depth, _) = self.eval_inner(); score + Self::depth_penalty(depth) } - // returns: (score, max_depth, contains_repetition) + /// Returns: (score, max_depth, contains_repetition) fn eval_inner(&self) -> (f64, usize, bool) { match self { RegularExpression::Character(range) => { diff --git a/src/regex/operation/repeat.rs b/src/regex/operation/repeat.rs index c5578ca..235f4f1 100644 --- a/src/regex/operation/repeat.rs +++ b/src/regex/operation/repeat.rs @@ -1,41 +1,41 @@ use super::*; impl RegularExpression { - /// Returns the repetition of the expression between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. - pub fn repeat(&self, o_min: u32, o_max_opt: Option) -> RegularExpression { + /// Computes the repetition of the expression between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded.
+ pub fn repeat(&self, min: u32, max_opt: Option) -> RegularExpression { if self.is_total() { return RegularExpression::new_total(); } else if self.is_empty() { return RegularExpression::new_empty(); } else if self.is_empty_string() { return Self::new_empty_string(); - } else if let Some(max) = o_max_opt { - if max < o_min || max == 0 { + } else if let Some(max) = max_opt { + if max < min || max == 0 { return RegularExpression::new_empty_string(); - } else if o_min == 1 && max == 1 { + } else if min == 1 && max == 1 { return self.clone(); } } match self { RegularExpression::Repetition(regular_expression, i_min, i_max_opt) => { - let new_max = if let (Some(o_max), Some(i_max)) = (o_max_opt, i_max_opt) { + let new_max = if let (Some(o_max), Some(i_max)) = (max_opt, i_max_opt) { Some(o_max * i_max) } else { None }; - if Self::can_simplify_nested_repetition(*i_min, *i_max_opt, o_min, o_max_opt) { + if Self::can_simplify_nested_repetition(*i_min, *i_max_opt, min, max_opt) { RegularExpression::Repetition( regular_expression.clone(), - o_min * i_min, + min * i_min, new_max, ) } else { - RegularExpression::Repetition(Box::new(self.clone()), o_min, o_max_opt) + RegularExpression::Repetition(Box::new(self.clone()), min, max_opt) } } - _ => RegularExpression::Repetition(Box::new(self.clone()), o_min, o_max_opt), + _ => RegularExpression::Repetition(Box::new(self.clone()), min, max_opt), } } From 2119ea6880a53bfa93ff8dc1a994f36a52127ff6 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Fri, 19 Sep 2025 20:58:18 +0200 Subject: [PATCH 33/44] update doc --- README.md | 2 +- src/regex/builder.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1173ea8..c299a26 100644 --- a/README.md +++ b/README.md @@ -239,7 +239,7 @@ This design allows us to perform unions, intersections, and complements of trans | `new_empty()` | `RegularExpression` | Creates a regular expression that matches the empty language. | | `new_empty_string()` | `RegularExpression` | Creates a regular expression that matches only the empty string `""`. | | `new_total()` | `RegularExpression` | Creates a regular expression that matches all possible strings. | -| `parse(pattern: &str, simplify: bool)` | `Result` | Parses the provided pattern and returns the resulting `RegularExpression`. If simplify is `true`, the expression is simplified during parsing. | +| `parse(pattern: &str, simplify: bool)` | `Result` | Parses the provided pattern and returns the resulting `RegularExpression`. If `simplify` is `true`, the expression is simplified during parsing. | | `repeat(&self, min: u32, max_opt: Option)` | `RegularExpression` | Computes the repetition of the automaton between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. | | `simplify(&self)` | `RegularExpression` | Returns a simplified version by eliminating redundant constructs and applying canonical reductions. | | `union(&self, other: &RegularExpression)` | `RegularExpression` | Returns a regular expression matching the union of `self` and `other`. | diff --git a/src/regex/builder.rs b/src/regex/builder.rs index 6d00471..7681b8e 100644 --- a/src/regex/builder.rs +++ b/src/regex/builder.rs @@ -16,7 +16,7 @@ impl RegularExpression { Self::parse(pattern, true) } - /// Parses the provided pattern and returns the resulting [`RegularExpression`]. If simplify is `true`, the expression is simplified during parsing. 
+ /// Parses the provided pattern and returns the resulting [`RegularExpression`]. If `simplify` is `true`, the expression is simplified during parsing. pub fn parse(pattern: &str, simplify: bool) -> Result { if pattern.is_empty() { return Ok(RegularExpression::new_empty_string()); From 0315e7d7fcbfbea88bf7723154b922b894889b67 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Fri, 19 Sep 2025 21:25:55 +0200 Subject: [PATCH 34/44] additional updates --- README.md | 26 +- src/fast_automaton/analyze/cardinality.rs | 2 +- src/fast_automaton/analyze/mod.rs | 2 +- src/fast_automaton/builder.rs | 2 +- src/fast_automaton/generate.rs | 12 +- src/fast_automaton/mod.rs | 6 +- src/fast_automaton/operation/concat.rs | 226 +++++++++--------- src/fast_automaton/operation/determinize.rs | 4 +- src/fast_automaton/operation/difference.rs | 2 +- src/fast_automaton/operation/intersection.rs | 52 ++-- src/fast_automaton/operation/mod.rs | 4 +- src/fast_automaton/operation/repeat.rs | 12 +- src/fast_automaton/operation/union.rs | 134 +++++------ .../serializer/tokenizer/embed_automaton.rs | 2 +- .../serializer/tokenizer/mod.rs | 2 +- src/lib.rs | 4 +- src/regex/builder.rs | 16 +- 17 files changed, 254 insertions(+), 254 deletions(-) diff --git a/README.md b/README.md index c299a26..93f116f 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # RegexSolver [![Crates.io Version](https://img.shields.io/crates/v/regexsolver)](https://crates.io/crates/regexsolver) -**RegexSolver** is a high-performance Rust library for building, combining, and analyzing regular expressions and finite automata. Ideal for constraint solvers, code or test-case generators, and any system needing rich regex or automaton operations. +**RegexSolver** is a Rust library for building, combining, and analyzing regular expressions and finite automata. It is designed for constraint solvers, test generators, and other systems that need advanced regex and automaton operations. ## Table of Contents @@ -81,11 +81,11 @@ fn main() -> Result<(), EngineError> { ## Key Concepts & Limitations RegexSolver supports a subset of regular expressions that adhere to the principles of regular languages. Here are the key characteristics and limitations of the regular expressions supported by RegexSolver: -- **Anchored Expressions:** All regular expressions in RegexSolver are anchored. This means that the expressions are treated as if they start and end at the boundaries of the input text. For example, the expression `abc` will match the string "abc" but not "xabc" or "abcx". - **Lookahead/Lookbehind:** RegexSolver does not support lookahead (`(?=...)`) or lookbehind (`(?<=...)`) assertions. Using them returns an error. +- **Pure Regular Expressions:** RegexSolver focuses on pure regular expressions as defined in regular language theory. This means features that extend beyond regular languages, such as backreferences (`\1`, `\2`, etc.), are not supported. Any use of backreference would return an error. - **Greedy/Ungreedy Quantifiers:** The concept of ungreedy (`*?`, `+?`, `??`) quantifiers is not supported. All quantifiers are treated as greedy. For example, `a*` or `a*?` will match the longest possible sequence of "a"s. - **Line Feed and Dot:** RegexSolver handles all characters the same way. The dot `.` matches any Unicode character including line feed (`\n`). -- **Pure Regular Expressions:** RegexSolver focuses on pure regular expressions as defined in regular language theory. 
This means features that extend beyond regular languages, such as backreferences (`\1`, `\2`, etc.), are not supported. Any use of backreference would return an error. +- **Anchored Expressions:** All regular expressions in RegexSolver are anchored. This means that the expressions are treated as if they start and end at the boundaries of the input text. For example, the expression `abc` will match the string "abc" but not "xabc" or "abcx". - **Empty Regular Expressions:** The empty language (matches no string) is represented by constructs like `[]` (empty character class). This is distinct from the empty string. RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/regex_syntax/) library for parsing patterns. Unsupported features are parsed but ignored; they do not raise an error unless they affect semantics that cannot be represented (e.g., backreferences). This allows for some flexibility in writing regular expressions, but it is important to be aware of the unsupported features to avoid unexpected behavior. @@ -178,17 +178,17 @@ This design allows us to perform unions, intersections, and complements of trans | -------- | ------- | ------- | | `complement(&mut self)` | `Result<(), EngineError>` | Complements the automaton; it must be deterministic. | | `concat(&self, other: &FastAutomaton)` | `Result` | Computes the concatenation between `self` and `other`. | -| `concat_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Computes the concatenation of all automatons in the given iterator. | +| `concat_all<'a, I: IntoIterator>(automata: I)` | `Result` | Computes the concatenation of all automata in the given iterator. | | `determinize(&self)` | `Result, EngineError>` | Determinizes the automaton and returns the result. | | `difference(&self, other: &FastAutomaton)` | `Result` | Computes the difference between `self` and `other`. | | `has_intersection(&self, other: &FastAutomaton)` | `Result` | Returns `true` if the two automata have a non-empty intersection. | | `intersection(&self, other: &FastAutomaton)` | `Result` | Computes the intersection between `self` and `other`. | -| `intersection_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Computes the intersection of all automatons in the given iterator. | -| `intersection_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Computes in parallel the intersection of all automatons in the given iterator. | +| `intersection_all<'a, I: IntoIterator>(automata: I)` | `Result` | Computes the intersection of all automata in the given iterator. | +| `intersection_all_par<'a, I: IntoParallelIterator>(automata: I)` | `Result` | Computes in parallel the intersection of all automata in the given iterator. | | `repeat(&self, min: u32, max_opt: Option)` | `Result` | Computes the repetition of the automaton between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. | | `union(&self, other: &FastAutomaton)` | `Result` | Computes the union between `self` and `other`. | -| `union_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Computes the union of all automatons in the given iterator. | -| `union_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Computes in parallel the union of all automatons in the given iterator. | +| `union_all<'a, I: IntoIterator>(automata: I)` | `Result` | Computes the union of all automata in the given iterator. 
| +| `union_all_par<'a, I: IntoParallelIterator>(automata: I)` | `Result` | Computes in parallel the union of all automata in the given iterator. | #### Analyze | Method | Return | Description | @@ -196,26 +196,26 @@ This design allows us to perform unions, intersections, and complements of trans | `as_dot(&self)` | `String` | Returns the automaton's DOT representation. | | `direct_states(&self, state: &State)` | `impl Iterator` | Returns an iterator over states directly reachable from the given state in one transition. | | `direct_states_vec(&self, state: &State)` | `Vec` | Returns a vector of states directly reachable from the given state in one transition. | -| `does_transition_exists(&self, from_state: State, to_state: State)` | `bool` | Returns `true` if there is a directed transition from `from_state` to `to_state`. | | `equivalent(&self, other: &FastAutomaton)` | `Result` | Returns `true` if both automata accept the same language. | -| `generate_strings(&self, number: usize)` | `Result, EngineError>` | Generates `count` strings matched by the automaton. | +| `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates `count` strings matched by the term. | | `get_accept_states(&self)` | `&IntSet` | Returns a reference to the set of accept (final) states. | | `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the automaton (i.e., the number of possible matched strings). | | `get_condition(&self, from_state: State, to_state: State)` | `Option<&Condition>` | Returns a reference to the condition of the directed transition between the two states, if any. | | `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of matched strings. | | `get_number_of_states(&self)` | `usize` | Returns the number of states in the automaton. | -| `get_reacheable_states(&self)` | `IntSet` | Returns the set of all states reachable from the start state. | +| `get_reachable_states(&self)` | `IntSet` | Returns the set of all states reachable from the start state. | | `get_spanning_set(&self)` | `&SpanningSet` | Returns a reference to the automaton's spanning set. | | `get_start_state(&self)` | `State` | Returns the start state. | | `has_state(&self, state: State)` | `bool` | Returns `true` if the automaton contains the given state. | +| `has_transition(&self, from_state: State, to_state: State)` | `bool` | Returns `true` if there is a directed transition from `from_state` to `to_state`. | | `in_degree(&self, state: State)` | `usize` | Returns the number of transitions to the provided state. | | `is_accepted(&self, state: &State)` | `bool` | Returns `true` if the given state is one of the accept states. | | `is_cyclic(&self)` | `bool` | Returns `true` if the automaton contains at least one cycle. | -| `is_determinitic(&self)` | `bool` | Returns `true` if the automaton is deterministic. | +| `is_deterministic(&self)` | `bool` | Returns `true` if the automaton is deterministic. | | `is_empty(&self)` | `bool` | Checks if the automaton matches the empty language. | | `is_empty_string(&self)` | `bool` | Checks if the automaton only matches the empty string `""`. | +| `is_match(&self, string: &str)` | `bool` | Returns `true` if the automaton matches the given string. | | `is_total(&self)` | `bool` | Checks if the automaton matches all possible strings. | -| `match_string(&self, string: &str)` | `bool` | Returns `true` if the automaton matches the given string. 
| | `out_degree(&self, state: State)` | `usize` | Returns the number of transitions from the provided state. | | `print_dot(&self)` | `()` | Prints the automaton's DOT representation. | | `states(&self)` | `impl Iterator` | Returns an iterator over the automaton’s states. | diff --git a/src/fast_automaton/analyze/cardinality.rs b/src/fast_automaton/analyze/cardinality.rs index ccad761..4bbf9b7 100644 --- a/src/fast_automaton/analyze/cardinality.rs +++ b/src/fast_automaton/analyze/cardinality.rs @@ -10,7 +10,7 @@ impl FastAutomaton { } else if self.cyclic || self.is_total() { return Cardinality::Infinite; } - assert!(self.is_determinitic(), "The automaton should be deterministic."); + assert!(self.is_deterministic(), "The automaton should be deterministic."); let topologically_sorted_states = self.topological_sorted_states(); if topologically_sorted_states.is_none() { diff --git a/src/fast_automaton/analyze/mod.rs b/src/fast_automaton/analyze/mod.rs index 3902460..dbc05f4 100644 --- a/src/fast_automaton/analyze/mod.rs +++ b/src/fast_automaton/analyze/mod.rs @@ -36,7 +36,7 @@ impl FastAutomaton { } /// Returns the set of all states reachable from the start state. - pub fn get_reacheable_states(&self) -> IntSet { + pub fn get_reachable_states(&self) -> IntSet { let mut states_map: IntMap> = IntMap::with_capacity_and_hasher(self.transitions.len(), BuildHasherDefault::default()); for from_state in self.states() { diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index c12b366..16f647b 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -342,6 +342,6 @@ mod tests { .unwrap() .to_automaton() .unwrap(); - assert_eq!(deterministic, automaton.is_determinitic()); + assert_eq!(deterministic, automaton.is_deterministic()); } } diff --git a/src/fast_automaton/generate.rs b/src/fast_automaton/generate.rs index 7532309..3e60f29 100644 --- a/src/fast_automaton/generate.rs +++ b/src/fast_automaton/generate.rs @@ -7,12 +7,12 @@ use super::*; impl FastAutomaton { /// Generates `count` strings matched by the automaton. 
- pub fn generate_strings(&self, number: usize) -> Result, EngineError> { + pub fn generate_strings(&self, count: usize) -> Result, EngineError> { if self.is_empty() { return Ok(Vec::new()); } - let mut strings = AHashSet::with_capacity(cmp::min(number, 1000)); + let mut strings = AHashSet::with_capacity(cmp::min(count, 1000)); let execution_profile = ExecutionProfile::get(); @@ -20,8 +20,8 @@ impl FastAutomaton { AHashMap::with_capacity(self.get_number_of_states()); let mut worklist: VecDeque<(Vec, usize)> = - VecDeque::with_capacity(cmp::min(number, 1000)); - let mut visited = AHashSet::with_capacity(cmp::min(number, 1000)); + VecDeque::with_capacity(cmp::min(count, 1000)); + let mut visited = AHashSet::with_capacity(cmp::min(count, 1000)); worklist.push_back((vec![], self.start_state)); while let Some((ranges, state)) = worklist.pop_front() { @@ -31,7 +31,7 @@ impl FastAutomaton { } else { let mut end = false; let mut ranges_iter: Vec<_> = ranges.iter().map(|range| range.iter()).collect(); - while strings.len() < number { + while strings.len() < count { execution_profile.assert_not_timed_out()?; let mut string = vec![]; for i in 0..ranges.len() { @@ -54,7 +54,7 @@ impl FastAutomaton { } } - if strings.len() == number { + if strings.len() == count { break; } } diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index c78604a..94ff5f4 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -157,7 +157,7 @@ impl FastAutomaton { /// Returns `true` if there is a directed transition from `from_state` to `to_state`. #[inline] - pub fn does_transition_exists(&self, from_state: State, to_state: State) -> bool { + pub fn has_transition(&self, from_state: State, to_state: State) -> bool { if !self.has_state(from_state) || !self.has_state(to_state) { return false; } @@ -216,7 +216,7 @@ impl FastAutomaton { /// Returns `true` if the automaton is deterministic. #[inline] - pub fn is_determinitic(&self) -> bool { + pub fn is_deterministic(&self) -> bool { self.deterministic } @@ -233,7 +233,7 @@ impl FastAutomaton { } /// Returns `true` if the automaton matches the given string. - pub fn match_string(&self, string: &str) -> bool { + pub fn is_match(&self, string: &str) -> bool { let mut worklist = VecDeque::with_capacity(self.get_number_of_states()); worklist.push_back((0, &self.start_state)); diff --git a/src/fast_automaton/operation/concat.rs b/src/fast_automaton/operation/concat.rs index 6318c61..7eac73f 100644 --- a/src/fast_automaton/operation/concat.rs +++ b/src/fast_automaton/operation/concat.rs @@ -12,11 +12,11 @@ impl FastAutomaton { Self::concat_all([self, other]) } - /// Computes the concatenation of all automatons in the given iterator. - pub fn concat_all<'a, I: IntoIterator>(automatons: I) -> Result + /// Computes the concatenation of all automata in the given iterator. 
+ pub fn concat_all<'a, I: IntoIterator>(automata: I) -> Result { let mut new_automaton = FastAutomaton::new_empty_string(); - for automaton in automatons { + for automaton in automata { new_automaton.concat_mut(automaton)?; } @@ -141,10 +141,10 @@ mod tests { .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("abc")); - assert!(!automaton.match_string("abcd")); - assert!(!automaton.match_string("ab")); - assert!(!automaton.match_string("")); + assert!(automaton.is_match("abc")); + assert!(!automaton.is_match("abcd")); + assert!(!automaton.is_match("ab")); + assert!(!automaton.is_match("")); Ok(()) } @@ -154,17 +154,17 @@ mod tests { .unwrap() .to_automaton() .unwrap(); - assert!(automaton.match_string("0101abc")); - assert!(automaton.match_string("0101ac")); - assert!(automaton.match_string("0101aaa")); - assert!(!automaton.match_string("abc")); - assert!(!automaton.match_string("0101abcd")); - assert!(!automaton.match_string("ab")); - assert!(!automaton.match_string("acc")); - assert!(!automaton.match_string("a")); - assert!(!automaton.match_string("aaaa")); - assert!(!automaton.match_string("aa")); - assert!(!automaton.match_string("")); + assert!(automaton.is_match("0101abc")); + assert!(automaton.is_match("0101ac")); + assert!(automaton.is_match("0101aaa")); + assert!(!automaton.is_match("abc")); + assert!(!automaton.is_match("0101abcd")); + assert!(!automaton.is_match("ab")); + assert!(!automaton.is_match("acc")); + assert!(!automaton.is_match("a")); + assert!(!automaton.is_match("aaaa")); + assert!(!automaton.is_match("aa")); + assert!(!automaton.is_match("")); Ok(()) } @@ -174,12 +174,12 @@ mod tests { .unwrap() .to_automaton() .unwrap(); - assert!(automaton.match_string("AAABBB")); - assert!(automaton.match_string("AA")); - assert!(automaton.match_string("AB")); - assert!(!automaton.match_string("B")); - assert!(!automaton.match_string("ABA")); - assert!(!automaton.match_string("")); + assert!(automaton.is_match("AAABBB")); + assert!(automaton.is_match("AA")); + assert!(automaton.is_match("AB")); + assert!(!automaton.is_match("B")); + assert!(!automaton.is_match("ABA")); + assert!(!automaton.is_match("")); Ok(()) } @@ -189,11 +189,11 @@ mod tests { .unwrap() .to_automaton() .unwrap(); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aa")); - assert!(automaton.match_string("aaaaaaa")); - assert!(!automaton.match_string("ab")); - assert!(!automaton.match_string("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aa")); + assert!(automaton.is_match("aaaaaaa")); + assert!(!automaton.is_match("ab")); + assert!(!automaton.is_match("")); Ok(()) } @@ -204,11 +204,11 @@ mod tests { .unwrap() .to_automaton() .unwrap(); - assert!(automaton.match_string("c")); - assert!(automaton.match_string("ac")); - assert!(automaton.match_string("aac")); - assert!(automaton.match_string("aaaaaaac")); - assert!(!automaton.match_string("abc")); + assert!(automaton.is_match("c")); + assert!(automaton.is_match("ac")); + assert!(automaton.is_match("aac")); + assert!(automaton.is_match("aaaaaaac")); + assert!(!automaton.is_match("abc")); Ok(()) } @@ -219,11 +219,11 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("ababab")); - assert!(automaton.match_string("abababab")); - assert!(!automaton.match_string("ab")); - assert!(!automaton.match_string("abab")); - assert!(!automaton.match_string("ababababab")); + assert!(automaton.is_match("ababab")); + assert!(automaton.is_match("abababab")); + 
assert!(!automaton.is_match("ab")); + assert!(!automaton.is_match("abab")); + assert!(!automaton.is_match("ababababab")); Ok(()) } @@ -234,10 +234,10 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("aaa")); - assert!(automaton.match_string("aaaaa")); - assert!(!automaton.match_string("a")); - assert!(!automaton.match_string("aa")); + assert!(automaton.is_match("aaa")); + assert!(automaton.is_match("aaaaa")); + assert!(!automaton.is_match("a")); + assert!(!automaton.is_match("aa")); Ok(()) } @@ -248,10 +248,10 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(!automaton.match_string("aa")); - assert!(!automaton.match_string("aaa")); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(!automaton.is_match("aa")); + assert!(!automaton.is_match("aaa")); Ok(()) } @@ -262,11 +262,11 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aa")); - assert!(!automaton.match_string("aaa")); - assert!(!automaton.match_string("aaaa")); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aa")); + assert!(!automaton.is_match("aaa")); + assert!(!automaton.is_match("aaaa")); Ok(()) } @@ -277,11 +277,11 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(!automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aa")); - assert!(automaton.match_string("aaa")); - assert!(!automaton.match_string("aaaa")); + assert!(!automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aa")); + assert!(automaton.is_match("aaa")); + assert!(!automaton.is_match("aaaa")); Ok(()) } @@ -292,15 +292,15 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(!automaton.match_string("")); - assert!(!automaton.match_string("aab")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aaa")); - assert!(automaton.match_string("aba")); - assert!(automaton.match_string("aaba")); - assert!(automaton.match_string("aabaaa")); - assert!(automaton.match_string("aaabaaabaaba")); - assert!(!automaton.match_string("aaabbaa")); + assert!(!automaton.is_match("")); + assert!(!automaton.is_match("aab")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aaa")); + assert!(automaton.is_match("aba")); + assert!(automaton.is_match("aaba")); + assert!(automaton.is_match("aabaaa")); + assert!(automaton.is_match("aaabaaabaaba")); + assert!(!automaton.is_match("aaabbaa")); Ok(()) } @@ -311,18 +311,18 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("ac")); - assert!(automaton.match_string("ads")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("acaadsac")); - assert!(automaton.match_string("adsaaaaaaaacaa")); - assert!(!automaton.match_string("as")); - assert!(!automaton.match_string("ad")); - assert!(!automaton.match_string("c")); - assert!(!automaton.match_string("ds")); - assert!(!automaton.match_string("d")); - assert!(!automaton.match_string("s")); + assert!(automaton.is_match("")); + assert!(automaton.is_match("ac")); + assert!(automaton.is_match("ads")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("acaadsac")); + 
assert!(automaton.is_match("adsaaaaaaaacaa")); + assert!(!automaton.is_match("as")); + assert!(!automaton.is_match("ad")); + assert!(!automaton.is_match("c")); + assert!(!automaton.is_match("ds")); + assert!(!automaton.is_match("d")); + assert!(!automaton.is_match("s")); Ok(()) } @@ -333,18 +333,18 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(!automaton.match_string("")); - assert!(automaton.match_string("ef")); - assert!(automaton.match_string("ads")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("efadsa")); - assert!(automaton.match_string("aaadsefef")); - assert!(!automaton.match_string("as")); - assert!(!automaton.match_string("ad")); - assert!(!automaton.match_string("e")); - assert!(!automaton.match_string("ds")); - assert!(!automaton.match_string("d")); - assert!(!automaton.match_string("s")); + assert!(!automaton.is_match("")); + assert!(automaton.is_match("ef")); + assert!(automaton.is_match("ads")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("efadsa")); + assert!(automaton.is_match("aaadsefef")); + assert!(!automaton.is_match("as")); + assert!(!automaton.is_match("ad")); + assert!(!automaton.is_match("e")); + assert!(!automaton.is_match("ds")); + assert!(!automaton.is_match("d")); + assert!(!automaton.is_match("s")); Ok(()) } @@ -355,13 +355,13 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("bc")); - assert!(automaton.match_string("abcbca")); - assert!(automaton.match_string("bcabcbcaaaa")); - assert!(!automaton.match_string("b")); - assert!(!automaton.match_string("c")); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("bc")); + assert!(automaton.is_match("abcbca")); + assert!(automaton.is_match("bcabcbcaaaa")); + assert!(!automaton.is_match("b")); + assert!(!automaton.is_match("c")); Ok(()) } @@ -372,14 +372,14 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aa")); - assert!(automaton.match_string("ba")); - assert!(automaton.match_string("aba")); - assert!(automaton.match_string("abbaabbaba")); - assert!(!automaton.match_string("b")); - assert!(!automaton.match_string("abab")); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aa")); + assert!(automaton.is_match("ba")); + assert!(automaton.is_match("aba")); + assert!(automaton.is_match("abbaabbaba")); + assert!(!automaton.is_match("b")); + assert!(!automaton.is_match("abab")); Ok(()) } @@ -390,14 +390,14 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aa")); - assert!(automaton.match_string("ba")); - assert!(automaton.match_string("aba")); - assert!(automaton.match_string("abbaabbaba")); - assert!(!automaton.match_string("b")); - assert!(!automaton.match_string("abab")); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aa")); + assert!(automaton.is_match("ba")); + assert!(automaton.is_match("aba")); + assert!(automaton.is_match("abbaabbaba")); + assert!(!automaton.is_match("b")); + assert!(!automaton.is_match("abab")); Ok(()) } diff --git a/src/fast_automaton/operation/determinize.rs 
b/src/fast_automaton/operation/determinize.rs index 73f8464..0217834 100644 --- a/src/fast_automaton/operation/determinize.rs +++ b/src/fast_automaton/operation/determinize.rs @@ -91,7 +91,7 @@ mod tests { let deterministic_automaton = automaton.determinize().unwrap(); - assert!(deterministic_automaton.is_determinitic()); + assert!(deterministic_automaton.is_deterministic()); Ok(()) } @@ -125,7 +125,7 @@ mod tests { "States After: {}", deterministic_automaton.get_number_of_states() ); - assert!(deterministic_automaton.is_determinitic()); + assert!(deterministic_automaton.is_deterministic()); assert!( automaton .difference(&deterministic_automaton) diff --git a/src/fast_automaton/operation/difference.rs b/src/fast_automaton/operation/difference.rs index 4e2bd22..59ca6ed 100644 --- a/src/fast_automaton/operation/difference.rs +++ b/src/fast_automaton/operation/difference.rs @@ -6,7 +6,7 @@ use super::*; impl FastAutomaton { fn totalize(&mut self) -> Result<(), EngineError> { - assert!(self.is_determinitic(), "The automaton should be deterministic."); + assert!(self.is_deterministic(), "The automaton should be deterministic."); let crash_state = self.new_state(); let mut transitions_to_crash_state: IntMap = diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index b79ebab..e373c55 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -14,13 +14,13 @@ impl FastAutomaton { FastAutomaton::intersection_all([self, other]) } - /// Computes the intersection of all automatons in the given iterator. + /// Computes the intersection of all automata in the given iterator. pub fn intersection_all<'a, I: IntoIterator>( - automatons: I, + automata: I, ) -> Result { let mut result: Cow<'a, FastAutomaton> = Cow::Owned(FastAutomaton::new_total()); - for automaton in automatons { + for automaton in automata { result = result.intersection_internal(automaton)?; if result.is_empty() { @@ -31,15 +31,15 @@ impl FastAutomaton { Ok(result.into_owned()) } - /// Computes in parallel the intersection of all automatons in the given iterator. + /// Computes in parallel the intersection of all automata in the given iterator. 
pub fn intersection_all_par<'a, I: IntoParallelIterator>( - automatons: I, + automata: I, ) -> Result { let execution_profile = ExecutionProfile::get(); let total = FastAutomaton::new_total(); - automatons + automata .into_par_iter() .try_fold( || total.clone(), @@ -224,11 +224,11 @@ mod tests { .unwrap(); let intersection = automaton1.intersection(&automaton2).unwrap(); - assert!(intersection.match_string("ac")); - assert!(!intersection.match_string("abc")); - assert!(!intersection.match_string("aaa")); - assert!(!intersection.match_string("abcd")); - assert!(!intersection.match_string("aba")); + assert!(intersection.is_match("ac")); + assert!(!intersection.is_match("abc")); + assert!(!intersection.is_match("aaa")); + assert!(!intersection.is_match("abcd")); + assert!(!intersection.is_match("aba")); Ok(()) } @@ -244,9 +244,9 @@ mod tests { .unwrap(); let intersection = automaton1.intersection(&automaton2).unwrap(); - assert!(intersection.match_string("")); - assert!(!intersection.match_string("a")); - assert!(!intersection.match_string("b")); + assert!(intersection.is_match("")); + assert!(!intersection.is_match("a")); + assert!(!intersection.is_match("b")); Ok(()) } @@ -262,11 +262,11 @@ mod tests { .unwrap(); let intersection = automaton1.intersection(&automaton2).unwrap(); - assert!(intersection.match_string("")); - assert!(intersection.match_string("xxx")); - assert!(intersection.match_string("xxxxxx")); - assert!(!intersection.match_string("xx")); - assert!(!intersection.match_string("xxxx")); + assert!(intersection.is_match("")); + assert!(intersection.is_match("xxx")); + assert!(intersection.is_match("xxxxxx")); + assert!(!intersection.is_match("xx")); + assert!(!intersection.is_match("xxxx")); Ok(()) } @@ -282,12 +282,12 @@ mod tests { .unwrap(); let intersection = automaton1.intersection(&automaton2).unwrap(); - assert!(intersection.match_string("ac")); - assert!(!intersection.match_string("aaac")); - assert!(!intersection.match_string("abc")); - assert!(!intersection.match_string("aaa")); - assert!(!intersection.match_string("abcd")); - assert!(!intersection.match_string("aba")); + assert!(intersection.is_match("ac")); + assert!(!intersection.is_match("aaac")); + assert!(!intersection.is_match("abc")); + assert!(!intersection.is_match("aaa")); + assert!(!intersection.is_match("abcd")); + assert!(!intersection.is_match("aba")); Ok(()) } @@ -307,7 +307,7 @@ mod tests { assert!(!intersection.is_empty()); - assert!(intersection.match_string("avb@gmail.com")); + assert!(intersection.is_match("avb@gmail.com")); Ok(()) } } diff --git a/src/fast_automaton/operation/mod.rs b/src/fast_automaton/operation/mod.rs index b21d25f..0805011 100644 --- a/src/fast_automaton/operation/mod.rs +++ b/src/fast_automaton/operation/mod.rs @@ -14,7 +14,7 @@ mod repeat; impl FastAutomaton { pub(crate) fn remove_dead_transitions(&mut self) { if !self.is_empty() { - let reacheable_states = self.get_reacheable_states(); + let reacheable_states = self.get_reachable_states(); let mut dead_states = IntSet::default(); for from_state in self.states() { @@ -45,7 +45,7 @@ mod tests { .unwrap(); let intersection = automaton1.intersection(&automaton2).unwrap(); assert_eq!(3, intersection.get_number_of_states()); - assert_eq!(3, intersection.get_reacheable_states().len()); + assert_eq!(3, intersection.get_reachable_states().len()); Ok(()) } } diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs index c7ff2d8..b94b328 100644 --- a/src/fast_automaton/operation/repeat.rs +++ 
b/src/fast_automaton/operation/repeat.rs @@ -116,12 +116,12 @@ mod tests { .unwrap() .to_automaton() .unwrap(); - assert!(automaton.match_string("")); - assert!(automaton.match_string(",")); - assert!(automaton.match_string("aaa,")); - assert!(automaton.match_string("aaaa,aa")); - assert!(!automaton.match_string("a")); - assert!(!automaton.match_string("aa")); + assert!(automaton.is_match("")); + assert!(automaton.is_match(",")); + assert!(automaton.is_match("aaa,")); + assert!(automaton.is_match("aaaa,aa")); + assert!(!automaton.is_match("a")); + assert!(!automaton.is_match("aa")); Ok(()) } } diff --git a/src/fast_automaton/operation/union.rs b/src/fast_automaton/operation/union.rs index 8e80b39..db2f62c 100644 --- a/src/fast_automaton/operation/union.rs +++ b/src/fast_automaton/operation/union.rs @@ -13,26 +13,26 @@ impl FastAutomaton { Self::union_all([self, other]) } - /// Computes the union of all automatons in the given iterator. + /// Computes the union of all automata in the given iterator. pub fn union_all<'a, I: IntoIterator>( - automatons: I, + automata: I, ) -> Result { let mut new_automaton = FastAutomaton::new_empty(); - for automaton in automatons { + for automaton in automata { new_automaton.union_mut(automaton)?; } Ok(new_automaton) } - /// Computes in parallel the union of all automatons in the given iterator. + /// Computes in parallel the union of all automata in the given iterator. pub fn union_all_par<'a, I: IntoParallelIterator>( - automatons: I, + automata: I, ) -> Result { let execution_profile = ExecutionProfile::get(); let empty = FastAutomaton::new_empty(); - automatons + automata .into_par_iter() .try_fold( || empty.clone(), @@ -218,16 +218,16 @@ mod tests { .unwrap() .to_automaton() .unwrap(); - assert!(automaton.match_string("abc")); - assert!(automaton.match_string("ac")); - assert!(automaton.match_string("aaa")); - assert!(!automaton.match_string("abcd")); - assert!(!automaton.match_string("ab")); - assert!(!automaton.match_string("acc")); - assert!(!automaton.match_string("a")); - assert!(!automaton.match_string("aaaa")); - assert!(!automaton.match_string("aa")); - assert!(!automaton.match_string("")); + assert!(automaton.is_match("abc")); + assert!(automaton.is_match("ac")); + assert!(automaton.is_match("aaa")); + assert!(!automaton.is_match("abcd")); + assert!(!automaton.is_match("ab")); + assert!(!automaton.is_match("acc")); + assert!(!automaton.is_match("a")); + assert!(!automaton.is_match("aaaa")); + assert!(!automaton.is_match("aa")); + assert!(!automaton.is_match("")); Ok(()) } @@ -238,11 +238,11 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("b")); - assert!(automaton.match_string("bb")); - assert!(!automaton.match_string("bbb")); - assert!(!automaton.match_string("bbbb")); + assert!(automaton.is_match("")); + assert!(automaton.is_match("b")); + assert!(automaton.is_match("bb")); + assert!(!automaton.is_match("bbb")); + assert!(!automaton.is_match("bbbb")); Ok(()) } @@ -253,12 +253,12 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("abcaaabcbc")); - assert!(automaton.match_string("d")); - assert!(!automaton.match_string("ad")); - assert!(!automaton.match_string("abcd")); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("abcaaabcbc")); + assert!(automaton.is_match("d")); + 
assert!(!automaton.is_match("ad")); + assert!(!automaton.is_match("abcd")); Ok(()) } @@ -269,12 +269,12 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("abcaaabcbc")); - assert!(automaton.match_string("d")); - assert!(!automaton.match_string("ad")); - assert!(!automaton.match_string("abcd")); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("abcaaabcbc")); + assert!(automaton.is_match("d")); + assert!(!automaton.is_match("ad")); + assert!(!automaton.is_match("abcd")); Ok(()) } @@ -285,13 +285,13 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("abcaaabcbc")); - assert!(automaton.match_string("d")); - assert!(automaton.match_string("ddd")); - assert!(!automaton.match_string("ad")); - assert!(!automaton.match_string("abcd")); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("abcaaabcbc")); + assert!(automaton.is_match("d")); + assert!(automaton.is_match("ddd")); + assert!(!automaton.is_match("ad")); + assert!(!automaton.is_match("abcd")); Ok(()) } @@ -302,10 +302,10 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("cc")); - assert!(automaton.match_string("caaac")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aababa")); + assert!(automaton.is_match("cc")); + assert!(automaton.is_match("caaac")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aababa")); Ok(()) } @@ -316,15 +316,15 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("q")); - assert!(automaton.match_string("aad")); - assert!(automaton.match_string("ads")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aadadsaaa")); - assert!(!automaton.match_string("aaaas")); - assert!(!automaton.match_string("ad")); - assert!(!automaton.match_string("adsq")); - assert!(!automaton.match_string("qq")); + assert!(automaton.is_match("q")); + assert!(automaton.is_match("aad")); + assert!(automaton.is_match("ads")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aadadsaaa")); + assert!(!automaton.is_match("aaaas")); + assert!(!automaton.is_match("ad")); + assert!(!automaton.is_match("adsq")); + assert!(!automaton.is_match("qq")); Ok(()) } @@ -335,11 +335,11 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("ab")); - assert!(automaton.match_string("")); - assert!(!automaton.match_string("a")); - assert!(!automaton.match_string("b")); - assert!(!automaton.match_string("aab")); + assert!(automaton.is_match("ab")); + assert!(automaton.is_match("")); + assert!(!automaton.is_match("a")); + assert!(!automaton.is_match("b")); + assert!(!automaton.is_match("aab")); Ok(()) } @@ -350,10 +350,10 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("d")); - assert!(automaton.match_string("ab")); - assert!(automaton.match_string("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("d")); + assert!(automaton.is_match("ab")); + assert!(automaton.is_match("")); Ok(()) } @@ -364,11 +364,11 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - 
assert!(automaton.match_string("au")); - assert!(automaton.match_string("du")); - assert!(automaton.match_string("abu")); - assert!(automaton.match_string("u")); - assert!(automaton.match_string("")); + assert!(automaton.is_match("au")); + assert!(automaton.is_match("du")); + assert!(automaton.is_match("abu")); + assert!(automaton.is_match("u")); + assert!(automaton.is_match("")); Ok(()) } } diff --git a/src/fast_automaton/serializer/tokenizer/embed_automaton.rs b/src/fast_automaton/serializer/tokenizer/embed_automaton.rs index 429c008..3074a0f 100644 --- a/src/fast_automaton/serializer/tokenizer/embed_automaton.rs +++ b/src/fast_automaton/serializer/tokenizer/embed_automaton.rs @@ -32,7 +32,7 @@ impl Tokenizer<'_> { vec.push(AutomatonToken::AcceptState) } - for (condition, to_state) in self.automaton.transitions_from_iter(current_state) { + for (condition, to_state) in self.automaton.transitions_from(current_state) { if condition.is_empty() { continue; } diff --git a/src/fast_automaton/serializer/tokenizer/mod.rs b/src/fast_automaton/serializer/tokenizer/mod.rs index 95ccb18..8dccca2 100644 --- a/src/fast_automaton/serializer/tokenizer/mod.rs +++ b/src/fast_automaton/serializer/tokenizer/mod.rs @@ -39,7 +39,7 @@ impl Tokenizer<'_> { state_counter += 1; automaton - .transitions_from_iter(current_state) + .transitions_from(current_state) .filter(|(c, _)| !c.is_empty()) .for_each(|(_, to_state)| { if !seen.contains(to_state) { diff --git a/src/lib.rs b/src/lib.rs index 3d750f0..dcff59b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -449,7 +449,7 @@ impl Term { pub fn get_cardinality(&self) -> Result, EngineError> { match self { Term::RegularExpression(regex) => Ok(regex.get_cardinality()), - Term::Automaton(automaton) => Ok(if !automaton.is_determinitic() { + Term::Automaton(automaton) => Ok(if !automaton.is_deterministic() { automaton.determinize()?.get_cardinality() } else { automaton.get_cardinality() @@ -482,7 +482,7 @@ impl Term { minuend: &FastAutomaton, subtrahend: &'a FastAutomaton, ) -> Result, EngineError> { - if subtrahend.is_determinitic() { + if subtrahend.is_deterministic() { Ok(Cow::Borrowed(subtrahend)) } else if !minuend.is_cyclic() && subtrahend.is_cyclic() { Ok(Cow::Owned( diff --git a/src/regex/builder.rs b/src/regex/builder.rs index 7681b8e..727bcfe 100644 --- a/src/regex/builder.rs +++ b/src/regex/builder.rs @@ -272,22 +272,22 @@ mod tests { let regex_parsed = RegularExpression::new(".").unwrap(); let automaton = regex_parsed.to_automaton().unwrap(); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("\t")); - assert!(automaton.match_string("\n")); - assert!(automaton.match_string("\r")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("\t")); + assert!(automaton.is_match("\n")); + assert!(automaton.is_match("\r")); let regex_parsed = RegularExpression::new("(?i)a").unwrap(); let automaton = regex_parsed.to_automaton().unwrap(); - assert!(automaton.match_string("a")); - assert!(!automaton.match_string("A")); + assert!(automaton.is_match("a")); + assert!(!automaton.is_match("A")); let regex_parsed = RegularExpression::new("a(?i)a(?-s).").unwrap(); let automaton = regex_parsed.to_automaton().unwrap(); - assert!(automaton.match_string("aa\n")); - assert!(!automaton.match_string("aAb")); + assert!(automaton.is_match("aa\n")); + assert!(!automaton.is_match("aAb")); assert!(RegularExpression::new("\\1").is_err()); Ok(()) From 1fd6bfcdbec4bf2434ca3a357207f266f266c1bb Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden 
<1949482+alexvbrdn@users.noreply.github.com> Date: Fri, 19 Sep 2025 21:32:33 +0200 Subject: [PATCH 35/44] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 93f116f..0a960fb 100644 --- a/README.md +++ b/README.md @@ -81,11 +81,11 @@ fn main() -> Result<(), EngineError> { ## Key Concepts & Limitations RegexSolver supports a subset of regular expressions that adhere to the principles of regular languages. Here are the key characteristics and limitations of the regular expressions supported by RegexSolver: +- **Anchored Expressions:** All regular expressions in RegexSolver are anchored. This means that the expressions are treated as if they start and end at the boundaries of the input text. For example, the expression `abc` will match the string "abc" but not "xabc" or "abcx". - **Lookahead/Lookbehind:** RegexSolver does not support lookahead (`(?=...)`) or lookbehind (`(?<=...)`) assertions. Using them returns an error. - **Pure Regular Expressions:** RegexSolver focuses on pure regular expressions as defined in regular language theory. This means features that extend beyond regular languages, such as backreferences (`\1`, `\2`, etc.), are not supported. Any use of backreference would return an error. - **Greedy/Ungreedy Quantifiers:** The concept of ungreedy (`*?`, `+?`, `??`) quantifiers is not supported. All quantifiers are treated as greedy. For example, `a*` or `a*?` will match the longest possible sequence of "a"s. - **Line Feed and Dot:** RegexSolver handles all characters the same way. The dot `.` matches any Unicode character including line feed (`\n`). -- **Anchored Expressions:** All regular expressions in RegexSolver are anchored. This means that the expressions are treated as if they start and end at the boundaries of the input text. For example, the expression `abc` will match the string "abc" but not "xabc" or "abcx". - **Empty Regular Expressions:** The empty language (matches no string) is represented by constructs like `[]` (empty character class). This is distinct from the empty string. RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/regex_syntax/) library for parsing patterns. Unsupported features are parsed but ignored; they do not raise an error unless they affect semantics that cannot be represented (e.g., backreferences). This allows for some flexibility in writing regular expressions, but it is important to be aware of the unsupported features to avoid unexpected behavior. From 499735d77afc09060aa46affe37f89637715cf6e Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sat, 20 Sep 2025 21:17:45 +0200 Subject: [PATCH 36/44] update docs --- README.md | 5 +++-- src/fast_automaton/convert/to_regex/mod.rs | 1 + src/lib.rs | 2 +- tests/data/regex-todo.txt | 0 4 files changed, 5 insertions(+), 3 deletions(-) delete mode 100644 tests/data/regex-todo.txt diff --git a/README.md b/README.md index 0a960fb..69025a2 100644 --- a/README.md +++ b/README.md @@ -128,7 +128,7 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re | `subset(&self, term: &Term)` | `Result` | Returns `true` if all strings matched by the current term are also matched by the given term. | | `to_automaton(&self)` | `Result, EngineError>` | Converts the term to a `FastAutomaton`. | | `to_pattern(&self)` | `String` | Converts the term to a regular expression pattern. 
| -| `to_regex(&self)` | `Cow` | Converts the term to a RegularExpression. | +| `to_regex(&self)` | `Cow` | Converts the term to a `RegularExpression`. | ### FastAutomaton @@ -197,7 +197,7 @@ This design allows us to perform unions, intersections, and complements of trans | `direct_states(&self, state: &State)` | `impl Iterator` | Returns an iterator over states directly reachable from the given state in one transition. | | `direct_states_vec(&self, state: &State)` | `Vec` | Returns a vector of states directly reachable from the given state in one transition. | | `equivalent(&self, other: &FastAutomaton)` | `Result` | Returns `true` if both automata accept the same language. | -| `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates `count` strings matched by the term. | +| `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates `count` strings matched by the automaton. | | `get_accept_states(&self)` | `&IntSet` | Returns a reference to the set of accept (final) states. | | `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the automaton (i.e., the number of possible matched strings). | | `get_condition(&self, from_state: State, to_state: State)` | `Option<&Condition>` | Returns a reference to the condition of the directed transition between the two states, if any. | @@ -221,6 +221,7 @@ This design allows us to perform unions, intersections, and complements of trans | `states(&self)` | `impl Iterator` | Returns an iterator over the automaton’s states. | | `states_vec(&self)` | `Vec` | Returns a vector containing the automaton’s states. | | `subset(&self, other: &FastAutomaton)` | `Result` | Returns `true` if all strings accepted by `self` are also accepted by `other`. | +| `to_regex(&self)` | `RegularExpression` | Converts the automaton to a `RegularExpression`. | | `transitions_from(&self, state: State)` | `impl Iterator` | Returns an iterator over transitions from the given state. | | `transitions_from_vec(&self, state: State)` | `Vec<(Condition, State)>` | Returns a vector of transitions from the given state. | | `transitions_to_vec(&self, state: State)` | `Vec<(State, Condition)>` | Returns a vector of transitions to the given state. | diff --git a/src/fast_automaton/convert/to_regex/mod.rs b/src/fast_automaton/convert/to_regex/mod.rs index 10a530e..6378449 100644 --- a/src/fast_automaton/convert/to_regex/mod.rs +++ b/src/fast_automaton/convert/to_regex/mod.rs @@ -4,6 +4,7 @@ mod state_elimination; mod transform; impl FastAutomaton { + /// Converts the automaton to a [`RegularExpression`]. pub fn to_regex(&self) -> RegularExpression { let transformed_automaton = transform::transform(self); state_elimination::convert_to_regex(&transformed_automaton) diff --git a/src/lib.rs b/src/lib.rs index dcff59b..de45599 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -465,7 +465,7 @@ impl Term { }) } - /// Converts the term to a RegularExpression.
pub fn to_regex(&self) -> Cow { match self { Term::RegularExpression(regex) => Cow::Borrowed(regex), diff --git a/tests/data/regex-todo.txt b/tests/data/regex-todo.txt deleted file mode 100644 index e69de29..0000000 From b67597dbd58ff080587689dab173f1107db4c246 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sun, 21 Sep 2025 08:38:35 +0200 Subject: [PATCH 37/44] update method signatures --- README.md | 6 +++--- src/fast_automaton/analyze/cardinality.rs | 6 +++--- src/fast_automaton/analyze/length.rs | 4 ++-- src/fast_automaton/builder.rs | 4 ++-- .../convert/to_regex/transform/shape/dotstar.rs | 6 +++--- src/fast_automaton/mod.rs | 10 +++++----- src/fast_automaton/operation/repeat.rs | 2 +- src/fast_automaton/operation/union.rs | 4 ++-- 8 files changed, 21 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 69025a2..f45837d 100644 --- a/README.md +++ b/README.md @@ -194,8 +194,8 @@ This design allows us to perform unions, intersections, and complements of trans | Method | Return | Description | | -------- | ------- | ------- | | `as_dot(&self)` | `String` | Returns the automaton's DOT representation. | -| `direct_states(&self, state: &State)` | `impl Iterator` | Returns an iterator over states directly reachable from the given state in one transition. | -| `direct_states_vec(&self, state: &State)` | `Vec` | Returns a vector of states directly reachable from the given state in one transition. | +| `direct_states(&self, state: State)` | `impl Iterator` | Returns an iterator over states directly reachable from the given state in one transition. | +| `direct_states_vec(&self, state: State)` | `Vec` | Returns a vector of states directly reachable from the given state in one transition. | | `equivalent(&self, other: &FastAutomaton)` | `Result` | Returns `true` if both automata accept the same language. | | `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates `count` strings matched by the automaton. | | `get_accept_states(&self)` | `&IntSet` | Returns a reference to the set of accept (final) states. | @@ -209,7 +209,7 @@ This design allows us to perform unions, intersections, and complements of trans | `has_state(&self, state: State)` | `bool` | Returns `true` if the automaton contains the given state. | | `has_transition(&self, from_state: State, to_state: State)` | `bool` | Returns `true` if there is a directed transition from `from_state` to `to_state`. | | `in_degree(&self, state: State)` | `usize` | Returns the number of transitions to the provided state. | -| `is_accepted(&self, state: &State)` | `bool` | Returns `true` if the given state is one of the accept states. | +| `is_accepted(&self, state: State)` | `bool` | Returns `true` if the given state is one of the accept states. | | `is_cyclic(&self)` | `bool` | Returns `true` if the automaton contains at least one cycle. | | `is_deterministic(&self)` | `bool` | Returns `true` if the automaton is deterministic. | | `is_empty(&self)` | `bool` | Checks if the automaton matches the empty language. 
| diff --git a/src/fast_automaton/analyze/cardinality.rs b/src/fast_automaton/analyze/cardinality.rs index 4bbf9b7..ec5f514 100644 --- a/src/fast_automaton/analyze/cardinality.rs +++ b/src/fast_automaton/analyze/cardinality.rs @@ -65,8 +65,8 @@ impl FastAutomaton { let mut queue = VecDeque::with_capacity(len); let mut order = Vec::with_capacity(len); - for from_state in &self.states_vec() { - in_degree.entry(*from_state).or_insert(0); + for &from_state in &self.states_vec() { + in_degree.entry(from_state).or_insert(0); for to_state in self.direct_states(from_state) { *in_degree.entry(to_state).or_insert(0) += 1; } @@ -80,7 +80,7 @@ impl FastAutomaton { while let Some(from_state) = queue.pop_front() { order.push(from_state); - for to_state in self.direct_states(&from_state) { + for to_state in self.direct_states(from_state) { *in_degree.entry(to_state).or_default() -= 1; if in_degree[&to_state] == 0 { diff --git a/src/fast_automaton/analyze/length.rs b/src/fast_automaton/analyze/length.rs index bbec964..c753908 100644 --- a/src/fast_automaton/analyze/length.rs +++ b/src/fast_automaton/analyze/length.rs @@ -27,7 +27,7 @@ impl FastAutomaton { } seen.insert(state); - for to_state in self.direct_states(&state) { + for to_state in self.direct_states(state) { if to_state == state || seen.contains(&to_state) { is_infinite = true; continue; @@ -54,7 +54,7 @@ impl FastAutomaton { } seen.insert(state); - for to_state in self.direct_states(&state) { + for to_state in self.direct_states(state) { if to_state == state || seen.contains(&to_state) { max = None; break; diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index 16f647b..8b8f844 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -286,9 +286,9 @@ impl FastAutomaton { return Ok(()); } let condition_converter = ConditionConverter::new(&self.spanning_set, new_spanning_set)?; - for from_state in &self.states_vec() { + for &from_state in &self.states_vec() { for to_state in self.direct_states_vec(from_state) { - match self.transitions[*from_state].entry(to_state) { + match self.transitions[from_state].entry(to_state) { Entry::Occupied(mut o) => { o.insert(condition_converter.convert(o.get())?); } diff --git a/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs b/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs index 33142ca..6c91106 100644 --- a/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs +++ b/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs @@ -39,7 +39,7 @@ fn dot_star_component(automaton: &mut FastAutomaton, component: &IntSet) let start_state = start_state.unwrap(); let mut first_hop = automaton - .direct_states(&start_state) + .direct_states(start_state) .filter(|&s| s != start_state) .collect::>(); let mut states_to_remove = vec![]; @@ -89,7 +89,7 @@ fn dot_star_component(automaton: &mut FastAutomaton, component: &IntSet) automaton.add_transition(start_state, start_state, &out_condition.unwrap()); for &state in component { - for to_state in automaton.direct_states_vec(&state) { + for to_state in automaton.direct_states_vec(state) { if !component.contains(&to_state) { continue; } @@ -149,7 +149,7 @@ fn strongconnect( stack.push(v); on_stack[v] = true; - for w in automaton.direct_states(&v) { + for w in automaton.direct_states(v) { if indices[w] == -1 { strongconnect(automaton, w, index, stack, indices, lowlink, on_stack, scc); lowlink[v] = lowlink[v].min(lowlink[w]); diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs 
index 94ff5f4..10b269c 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -106,8 +106,8 @@ impl FastAutomaton { /// Returns an iterator over states directly reachable from the given state in one transition. #[inline] - pub fn direct_states(&self, state: &State) -> impl Iterator + '_ { - self.transitions[*state] + pub fn direct_states(&self, state: State) -> impl Iterator + '_ { + self.transitions[state] .keys() .cloned() .filter(|s| !self.removed_states.contains(s)) @@ -115,7 +115,7 @@ impl FastAutomaton { /// Returns a vector of states directly reachable from the given state in one transition. #[inline] - pub fn direct_states_vec(&self, state: &State) -> Vec { + pub fn direct_states_vec(&self, state: State) -> Vec { self.direct_states(state).collect() } @@ -210,8 +210,8 @@ impl FastAutomaton { /// Returns `true` if the given state is one of the accept states. #[inline] - pub fn is_accepted(&self, state: &State) -> bool { - self.accept_states.contains(state) + pub fn is_accepted(&self, state: State) -> bool { + self.accept_states.contains(&state) } /// Returns `true` if the automaton is deterministic. diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs index b94b328..f9e256d 100644 --- a/src/fast_automaton/operation/repeat.rs +++ b/src/fast_automaton/operation/repeat.rs @@ -23,7 +23,7 @@ impl FastAutomaton { if min == 0 && self.in_degree(self.start_state) != 0 { let new_state = self.new_state(); - if self.is_accepted(&self.start_state) { + if self.is_accepted(self.start_state) { self.accept(new_state); } diff --git a/src/fast_automaton/operation/union.rs b/src/fast_automaton/operation/union.rs index db2f62c..96480ea 100644 --- a/src/fast_automaton/operation/union.rs +++ b/src/fast_automaton/operation/union.rs @@ -62,7 +62,7 @@ impl FastAutomaton { ) -> Result, EngineError> { let mut imcomplete_states = IntSet::with_capacity(other.out_degree(other.start_state) + 1); - if other.is_accepted(&other.start_state) { + if other.is_accepted(other.start_state) { self.accept(self.start_state); } let self_start_state_in_degree = self.in_degree(self.start_state); @@ -82,7 +82,7 @@ impl FastAutomaton { } if other_start_state_in_degree != 0 { let new_state = self.new_state(); - if other.is_accepted(&other.start_state) { + if other.is_accepted(other.start_state) { self.accept(new_state); } From c243522264bd8f4774551d9b69e9a3326b28e871 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sun, 21 Sep 2025 20:02:22 +0200 Subject: [PATCH 38/44] fix failed build --- src/fast_automaton/serializer/tokenizer/embed_automaton.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fast_automaton/serializer/tokenizer/embed_automaton.rs b/src/fast_automaton/serializer/tokenizer/embed_automaton.rs index 3074a0f..5cb2c5b 100644 --- a/src/fast_automaton/serializer/tokenizer/embed_automaton.rs +++ b/src/fast_automaton/serializer/tokenizer/embed_automaton.rs @@ -27,7 +27,7 @@ impl Tokenizer<'_> { AutomatonToken::State(*self.state_to_token.get(¤t_state).unwrap()); vec.push(embedded_state); - if self.automaton.is_accepted(¤t_state) { + if self.automaton.is_accepted(current_state) { // accept state vec.push(AutomatonToken::AcceptState) } From 02852f967f334c40979b7328a27fbdaffb3beaec Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Wed, 24 Sep 2025 21:57:28 +0200 Subject: [PATCH 39/44] fix serialization --- src/cardinality/mod.rs | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cardinality/mod.rs b/src/cardinality/mod.rs index 9adad1c..54bdcde 100644 --- a/src/cardinality/mod.rs +++ b/src/cardinality/mod.rs @@ -4,7 +4,7 @@ use serde::{Deserialize, Serialize}; /// Represent a number. #[cfg_attr(feature = "serializable", derive(Serialize, Deserialize))] #[derive(PartialEq, Eq, Debug, Clone)] -#[cfg_attr(feature = "serializable", serde(tag = "type", content = "value"))] +#[cfg_attr(feature = "serializable", serde(tag = "type", content = "value", rename_all = "camelCase"))] pub enum Cardinality { /// An infinite number. Infinite, From 9a1266f3f3752fa6eb3cfbb6e8f0dbec37c47279 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Thu, 2 Oct 2025 21:37:03 +0200 Subject: [PATCH 40/44] Huge improvements in generate strings --- src/fast_automaton/generate.rs | 171 +++++++++++++++++++++------------ 1 file changed, 112 insertions(+), 59 deletions(-) diff --git a/src/fast_automaton/generate.rs b/src/fast_automaton/generate.rs index 3e60f29..e0d5ae3 100644 --- a/src/fast_automaton/generate.rs +++ b/src/fast_automaton/generate.rs @@ -1,5 +1,3 @@ -use std::cmp; - use crate::{EngineError, execution_profile::ExecutionProfile}; use ahash::AHashSet; @@ -9,80 +7,119 @@ impl FastAutomaton { /// Generates `count` strings matched by the automaton. pub fn generate_strings(&self, count: usize) -> Result, EngineError> { if self.is_empty() { - return Ok(Vec::new()); + return Ok(vec![]); } - let mut strings = AHashSet::with_capacity(cmp::min(count, 1000)); + let (min, max) = self.get_length(); + let max_len = if let Some(max) = max { + max + } else { + let min = min.expect("A non empty automaton should have a minimum length"); + min.saturating_add(100) + } as usize; let execution_profile = ExecutionProfile::get(); - let mut ranges_cache: AHashMap<&Condition, CharRange> = - AHashMap::with_capacity(self.get_number_of_states()); + let mut ranges_cache = AHashMap::with_capacity(self.get_number_of_states()); + let mut strings = AHashSet::with_capacity(count); + let mut visited = AHashSet::with_capacity(self.get_number_of_states()); + let mut q = VecDeque::with_capacity(self.get_number_of_states()); + q.push_back((self.get_start_state(), vec![], 0u64)); + while let Some((state, ranges, h)) = q.pop_front() { + execution_profile.assert_not_timed_out()?; - let mut worklist: VecDeque<(Vec, usize)> = - VecDeque::with_capacity(cmp::min(count, 1000)); - let mut visited = AHashSet::with_capacity(cmp::min(count, 1000)); + if ranges.len() > max_len { + continue; + } - worklist.push_back((vec![], self.start_state)); - while let Some((ranges, state)) = worklist.pop_front() { - if self.accept_states.contains(&state) { + if self.is_accepted(state) { if ranges.is_empty() { strings.insert(String::new()); } else { - let mut end = false; - let mut ranges_iter: Vec<_> = ranges.iter().map(|range| range.iter()).collect(); - while strings.len() < count { - execution_profile.assert_not_timed_out()?; - let mut string = vec![]; - for i in 0..ranges.len() { - if let Some(character) = ranges_iter[i].next() { - string.push(character); - } else { - ranges_iter[i] = ranges[i].iter(); - if i + 1 < ranges.len() { - string.push(ranges_iter[i].next().unwrap()); - } else { - end = true; - break; - } - } - } - if end { - break; - } - strings.insert(string.into_iter().map(|c| c.to_char()).collect()); - } + Self::ranges_to_strings(&mut strings, &ranges, count, &execution_profile)?; } - if strings.len() == count { + if 
strings.len() >= count { break; } } - for (cond, to_state) in self.transitions_from(state) { + + for (cond, &to_state) in self.transitions_from(state) { + let hash = + Self::path_mix(h, Self::mix64(state as u64 ^ Self::mix64(to_state as u64))); + + if visited.insert((to_state, ranges.len() + 1, hash)) { + let mut new_ranges = ranges.clone(); + new_ranges.push( + ranges_cache + .entry(cond) + .or_insert_with(|| cond.to_range(&self.spanning_set).unwrap()) + .clone(), + ); + + q.push_back((to_state, new_ranges, hash)); + } + } + } + let mut strings: Vec = strings.into_iter().collect(); + strings.sort_unstable_by(|a, b| a.len().cmp(&b.len()).then_with(|| a.cmp(b))); + Ok(strings) + } + + pub fn ranges_to_strings( + strings: &mut AHashSet, + ranges: &Vec, + count: usize, + execution_profile: &ExecutionProfile, + ) -> Result<(), EngineError> { + let n = count - strings.len(); + if n == 0 { + return Ok(()); + } + + let mut end = false; + let mut out: Vec = Vec::with_capacity(n); + out.push(String::with_capacity(ranges.len())); + for r in ranges { + let mut next = Vec::with_capacity(n); + for prefix in out.into_iter() { execution_profile.assert_not_timed_out()?; - let range = match ranges_cache.entry(cond) { - Entry::Occupied(o) => o.get().clone(), - Entry::Vacant(v) => { - let range = cond.to_range(&self.spanning_set)?; - v.insert(range.clone()); - range + for ch in r.clone().iter() { + let mut s = prefix.clone(); + s.push(ch.to_char()); + next.push(s); + if next.len() == n { + end = true; + break; } - }; - if range.is_empty() { - continue; } - let mut new_ranges = ranges.clone(); - new_ranges.push(range); - let element = (new_ranges, *to_state); - - if !visited.contains(&element) { - visited.insert(element.clone()); - worklist.push_back(element); + if end { + end = false; + break; } } + out = next; + if out.is_empty() { + break; + } } + strings.extend(out); + Ok(()) + } + + #[inline] + fn mix64(mut x: u64) -> u64 { + // splitmix64 + x = x.wrapping_add(0x9E3779B97F4A7C15); + let mut z = x; + z = (z ^ (z >> 30)).wrapping_mul(0xBF58476D1CE4E5B9); + z = (z ^ (z >> 27)).wrapping_mul(0x94D049BB133111EB); + z ^ (z >> 31) + } - Ok(strings.into_iter().collect()) + #[inline] + fn path_mix(h: u64, x: u64) -> u64 { + h.wrapping_mul(0x9E3779B97F4A7C15).rotate_left(7) ^ x } } @@ -94,8 +131,24 @@ mod tests { #[test] fn test_generate_strings() -> Result<(), String> { + assert_generate_strings("a{100}[a-z]", 100); + assert_generate_strings("(ab|cd)e", 100); + assert_generate_strings("[a-z]+", 100); + assert_generate_strings("[a-z]+@", 100); assert_generate_strings("ù", 1000); + assert_generate_strings("[0-9]+[A-Z]*", 500); + assert_generate_strings("a+(ba+)*", 200); + assert_generate_strings("((a|bc)*|d)", 200); + assert_generate_strings(".*", 50); + assert_generate_strings("(ac|ads|a)*", 200); + assert_generate_strings("((aad|ads|a)*|q)", 200); + + assert_generate_strings( + r"john[!#-'\*\+\-/-9=\?\^-\u{007e}]*(\.[!#-'\*\+\-/-9=\?\^-\u{007e}](\.?[!#-'\*\+\-/-9=\?\^-\u{007e}])*)?\.?doe@example\.com", + 1000, + ); + assert_generate_strings("(?:A+(?:\\.[AB]+)*|\"(?:C|\\\\D)*\")@", 500); assert_generate_strings( "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@", @@ -109,6 +162,7 @@ mod tests { assert_generate_strings("((aad|ads|a)*|q)", 200); assert_generate_strings("((aad|ads|a)*abc.*uif(aad|ads|x)*|q)", 1000); 
//((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,5} + Ok(()) } @@ -118,23 +172,22 @@ mod tests { .unwrap() .to_automaton() .unwrap(); - println!("{}", automaton.get_number_of_states()); + //println!("{}", automaton.get_number_of_states()); //automaton.to_dot(); let re = Regex::new(&format!("(?s)^{}$", regex)).unwrap(); let strings = automaton.generate_strings(number).unwrap(); - let mut strings: Vec<_> = strings.iter().collect(); - strings.sort_unstable(); println!("nb of strings: {}/{}", strings.len(), number); assert!(number >= strings.len()); for string in strings { - if !re.is_match(string) { + // println!("{string}"); + if !re.is_match(&string) { for byte in string.as_bytes() { print!("{:02x} ", byte); } panic!("'{string}'") } - assert!(re.is_match(string), "'{string}'"); + assert!(re.is_match(&string), "'{string}'"); } } } From fb5eb1a72dcb6bc390c1a9a828e15be0fbd6505c Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Fri, 3 Oct 2025 21:20:34 +0200 Subject: [PATCH 41/44] Fix bad implementation of to_embedding --- src/fast_automaton/serializer/tokenizer/embed_automaton.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/fast_automaton/serializer/tokenizer/embed_automaton.rs b/src/fast_automaton/serializer/tokenizer/embed_automaton.rs index 5cb2c5b..bfe8197 100644 --- a/src/fast_automaton/serializer/tokenizer/embed_automaton.rs +++ b/src/fast_automaton/serializer/tokenizer/embed_automaton.rs @@ -16,11 +16,13 @@ impl Tokenizer<'_> { worklist.push_front(self.automaton.get_start_state()); while let Some(current_state) = worklist.pop_back() { + if !seen.insert(current_state) { + continue; + } if !vec.is_empty() { // separator vec.push(AutomatonToken::SeparatorState) } - seen.insert(current_state); // state let embedded_state = From 1e4980b201bd4a549a82d495e49a74e6f209f9a1 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Fri, 3 Oct 2025 21:47:53 +0200 Subject: [PATCH 42/44] improve assert_not_timed_out clock cycle --- src/execution_profile.rs | 64 +++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/src/execution_profile.rs b/src/execution_profile.rs index cda0e11..b845261 100644 --- a/src/execution_profile.rs +++ b/src/execution_profile.rs @@ -1,4 +1,7 @@ -use std::{cell::RefCell, time::SystemTime}; +use std::{ + cell::RefCell, + time::{Duration, Instant}, +}; use crate::error::EngineError; @@ -41,10 +44,10 @@ use crate::error::EngineError; pub struct ExecutionProfile { /// The maximum number of states that a non-determinitic finite automaton can hold, this is checked during the convertion of regular expression to automaton. max_number_of_states: Option, - /// Timestamp of when the execution has started, if this value is not set the operations will never timeout. - start_execution_time: Option, /// The longest time in milliseconds that an operation execution can last, there are no guaranties that the exact time will be respected. - execution_timeout: Option, + execution_timeout: Option, + /// The time after when a [`EngineError::OperationTimeOutError`] should be thrown. + execution_deadline: Option, } impl PartialEq for ExecutionProfile { @@ -66,15 +69,8 @@ impl ExecutionProfile { /// /// Return [`EngineError::OperationTimeOutError`] otherwise. 
pub(crate) fn assert_not_timed_out(&self) -> Result<(), EngineError> { - if let (Some(start), Some(execution_timeout)) = - (self.start_execution_time, self.execution_timeout) - { - let run_duration = SystemTime::now() - .duration_since(start) - .expect("Time went backwards") - .as_millis(); - - if run_duration > execution_timeout { + if let Some(execution_deadline) = self.execution_deadline { + if Instant::now() > execution_deadline { Err(EngineError::OperationTimeOutError) } else { Ok(()) @@ -101,7 +97,7 @@ impl ExecutionProfile { Ok(()) } - pub fn with_execution_timeout(mut self, execution_timeout_in_ms: u128) -> Self { + pub fn with_execution_timeout(mut self, execution_timeout_in_ms: u64) -> Self { self.execution_timeout = Some(execution_timeout_in_ms); self } @@ -123,7 +119,9 @@ impl ExecutionProfile { let initial_execution_profile = ThreadLocalParams::get_execution_profile(); let mut execution_profile = self.clone(); - execution_profile.start_execution_time = Some(SystemTime::now()); + if let Some(execution_timeout) = execution_profile.execution_timeout { + execution_profile.execution_deadline = Some(Instant::now() + Duration::from_millis(execution_timeout)); + } ThreadLocalParams::set_execution_profile(&execution_profile); let result = f(); @@ -149,7 +147,7 @@ pub struct ExecutionProfileBuilder { /// The maximum number of states that a non-determinitic finite automaton can hold, this is checked during the convertion of regular expression to automaton. max_number_of_states: Option, /// The longest time in milliseconds that an operation execution can last, there are no guaranties that the exact time will be respected. - execution_timeout: Option, + execution_timeout: Option, } impl Default for ExecutionProfileBuilder { fn default() -> Self { @@ -165,7 +163,7 @@ impl ExecutionProfileBuilder { } } - pub fn execution_timeout(mut self, execution_timeout_in_ms: u128) -> Self { + pub fn execution_timeout(mut self, execution_timeout_in_ms: u64) -> Self { self.execution_timeout = Some(execution_timeout_in_ms); self } @@ -179,7 +177,7 @@ impl ExecutionProfileBuilder { ExecutionProfile { max_number_of_states: self.max_number_of_states, execution_timeout: self.execution_timeout, - start_execution_time: None, + execution_deadline: None, } } } @@ -188,8 +186,8 @@ struct ThreadLocalParams; impl ThreadLocalParams { thread_local! { static MAX_NUMBER_OF_STATES: RefCell> = const { RefCell::new(None) }; - static START_EXECUTION_TIME: RefCell> = const { RefCell::new(None) }; - static EXECUTION_TIMEOUT: RefCell> = const { RefCell::new(None) }; + static EXECUTION_DEADLINE: RefCell> = const { RefCell::new(None) }; + static EXECUTION_TIMEOUT: RefCell> = const { RefCell::new(None) }; } /// Store on the current thread [`ExecutionProfile`]. 
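// Sketch (not part of the patch): the change above replaces the SystemTime-based
// "start time + elapsed milliseconds" check with a monotonic deadline computed
// once when the profile is applied, so the hot-path check is a single Instant
// comparison. A minimal standalone illustration of that pattern; the `Deadline`
// type and `example_loop` below are hypothetical names, not part of this crate.
use std::time::{Duration, Instant};

struct Deadline(Option<Instant>);

impl Deadline {
    /// Compute the deadline once, when the bounded operation starts.
    fn new(timeout_ms: Option<u64>) -> Self {
        Deadline(timeout_ms.map(|ms| Instant::now() + Duration::from_millis(ms)))
    }

    /// Cheap enough to call inside tight automaton loops; `None` never times out.
    fn expired(&self) -> bool {
        self.0.is_some_and(|deadline| Instant::now() > deadline)
    }
}

fn example_loop() {
    let deadline = Deadline::new(Some(50));
    while !deadline.expired() {
        // perform one bounded unit of work, then re-check the deadline
        break;
    }
}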
@@ -198,8 +196,8 @@ impl ThreadLocalParams { *cell.borrow_mut() = profile.max_number_of_states; }); - ThreadLocalParams::START_EXECUTION_TIME.with(|cell| { - *cell.borrow_mut() = profile.start_execution_time; + ThreadLocalParams::EXECUTION_DEADLINE.with(|cell| { + *cell.borrow_mut() = profile.execution_deadline; }); ThreadLocalParams::EXECUTION_TIMEOUT.with(|cell| { @@ -211,11 +209,11 @@ impl ThreadLocalParams { ThreadLocalParams::MAX_NUMBER_OF_STATES.with(|cell| *cell.borrow()) } - fn get_start_execution_time() -> Option { - ThreadLocalParams::START_EXECUTION_TIME.with(|cell| *cell.borrow()) + fn get_execution_deadline() -> Option { + ThreadLocalParams::EXECUTION_DEADLINE.with(|cell| *cell.borrow()) } - fn get_execution_timeout() -> Option { + fn get_execution_timeout() -> Option { ThreadLocalParams::EXECUTION_TIMEOUT.with(|cell| *cell.borrow()) } @@ -223,7 +221,7 @@ impl ThreadLocalParams { fn get_execution_profile() -> ExecutionProfile { ExecutionProfile { max_number_of_states: Self::get_max_number_of_states(), - start_execution_time: Self::get_start_execution_time(), + execution_deadline: Self::get_execution_deadline(), execution_timeout: Self::get_execution_timeout(), } } @@ -283,7 +281,7 @@ mod tests { let term = Term::from_pattern(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); let execution_timeout_in_ms = 10; - let start_time = SystemTime::now(); + let start_time = Instant::now(); ExecutionProfileBuilder::new() .execution_timeout(execution_timeout_in_ms) .build() @@ -293,13 +291,12 @@ mod tests { term.generate_strings(100).unwrap_err() ); - let run_duration = SystemTime::now() + let run_duration = Instant::now() .duration_since(start_time) - .expect("Time went backwards") .as_millis(); println!("{run_duration}"); - assert!(run_duration <= execution_timeout_in_ms + 50); + assert!(run_duration <= (execution_timeout_in_ms + 50) as u128); }); Ok(()) @@ -311,7 +308,7 @@ mod tests { let term2 = Term::from_pattern(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); let execution_timeout_in_ms = 50; - let start_time = SystemTime::now(); + let start_time = Instant::now(); ExecutionProfileBuilder::new() .execution_timeout(execution_timeout_in_ms) .build() @@ -321,13 +318,12 @@ mod tests { term1.difference(&term2).unwrap_err() ); - let run_duration = SystemTime::now() + let run_duration = Instant::now() .duration_since(start_time) - .expect("Time went backwards") .as_millis(); println!("{run_duration}"); - assert!(run_duration <= execution_timeout_in_ms + 25); + assert!(run_duration <= (execution_timeout_in_ms + 25) as u128); }); Ok(()) From 68e87c416bf83f6ece73d6fd5f75ea6b17c35b46 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Wed, 8 Oct 2025 20:36:30 +0200 Subject: [PATCH 43/44] Parallelize state selection for elimination --- src/fast_automaton/convert/to_regex/mod.rs | 7 -- .../to_regex/state_elimination/eliminate.rs | 83 +++++++++---------- 2 files changed, 37 insertions(+), 53 deletions(-) diff --git a/src/fast_automaton/convert/to_regex/mod.rs b/src/fast_automaton/convert/to_regex/mod.rs index 6378449..fbe36ce 100644 --- a/src/fast_automaton/convert/to_regex/mod.rs +++ b/src/fast_automaton/convert/to_regex/mod.rs @@ -15,13 +15,6 @@ impl FastAutomaton { mod tests { use super::*; - #[test] - fn test_convert_t() -> Result<(), String> { - assert_convert("abc.*def.*uif(ab|de)"); - - Ok(()) - } - #[test] fn test_convert() -> Result<(), String> { diff --git 
a/src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs b/src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs index d528f1b..c587a99 100644 --- a/src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs +++ b/src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs @@ -16,59 +16,50 @@ impl Gnfa { } fn get_next_state_to_eliminate(&self) -> Option { - let mut best_state: Option = None; - let mut best_score: u128 = u128::MAX; - - for state in self.all_states_iter() { - if state == self.start_state || state == self.accept_state { - continue; - } - - let preds = self.transitions_to_vec(state); - let succs = self.transitions_from_vec(state); - - let in_deg = preds.len() as u128; - let out_deg = succs.len() as u128; - - if in_deg == 0 || out_deg == 0 { - let score = state as u128 & 0xFF; - if score < best_score { - best_score = score; - best_state = Some(state); + let states: Vec = self + .all_states_iter() + .filter(|&s| s != self.start_state && s != self.accept_state) + .collect(); + + states + .into_par_iter() + .filter_map(|state| { + let preds = self.transitions_to_vec(state); + let succs = self.transitions_from_vec(state); + + let in_deg = preds.len() as u128; + let out_deg = succs.len() as u128; + + if in_deg == 0 || out_deg == 0 { + let score = (state as u128) & 0xFF; + return Some((score, state)); } - continue; - } - - let mut score: u128 = in_deg * out_deg; - - if self.has_self_loop(state) { - score = score + (score >> 1); - } - let mut label_cost: u128 = 0; + let mut score: u128 = in_deg * out_deg; - for (_, regex) in &preds { - label_cost += regex.evaluate_complexity() as u128; - } - for (regex, _) in &succs { - label_cost += regex.evaluate_complexity() as u128; - } - if let Some(re) = self.get_transition(state, state) { - label_cost += (re.evaluate_complexity() as u128) * 2; - } + if self.has_self_loop(state) { + score = score + (score >> 1); + } - score = score.saturating_mul(1).saturating_add(label_cost); + let mut label_cost: u128 = 0; - let tie = state as u128 & 0xFFFF; - let score = score.saturating_add(tie); + for (_, regex) in &preds { + label_cost += regex.evaluate_complexity() as u128; + } + for (regex, _) in &succs { + label_cost += regex.evaluate_complexity() as u128; + } + if let Some(re) = self.get_transition(state, state) { + label_cost += (re.evaluate_complexity() as u128) * 2; + } - if score < best_score { - best_score = score; - best_state = Some(state); - } - } + score = score.saturating_add(label_cost); - best_state + let tie = (state as u128) & 0xFFFF; + Some((score.saturating_add(tie), state)) + }) + .reduce_with(|a, b| if a.0 < b.0 { a } else { b }) + .map(|(_, state)| state) } fn eliminate_state(&mut self, k: usize) { From 863fdce9d8bdc4e2ab0b86011b16bd4612a292d1 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Wed, 8 Oct 2025 20:38:33 +0200 Subject: [PATCH 44/44] Fix misuse of hashmap for determinize --- Cargo.toml | 1 + src/fast_automaton/analyze/mod.rs | 2 +- src/fast_automaton/operation/determinize.rs | 84 ++++++++------------- src/fast_automaton/operation/mod.rs | 8 +- 4 files changed, 38 insertions(+), 57 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index eb32825..0cf5afd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,6 +25,7 @@ regex = "1.10.3" regex-syntax = "0.8.5" regex-charclass = { version = "1.0.3" } rayon = "1.10.0" +bit-set = "0.8.0" [dev-dependencies] criterion = { version = "0.5", features = ["html_reports"] } diff --git 
a/src/fast_automaton/analyze/mod.rs b/src/fast_automaton/analyze/mod.rs index dbc05f4..700a2d9 100644 --- a/src/fast_automaton/analyze/mod.rs +++ b/src/fast_automaton/analyze/mod.rs @@ -72,7 +72,7 @@ impl FastAutomaton { live } - pub(crate) fn get_ranges(&self) -> Result, EngineError> { + pub(crate) fn get_spanning_bases(&self) -> Result, EngineError> { self.spanning_set .get_spanning_ranges() .map(|range| Condition::from_range(range, &self.spanning_set)) diff --git a/src/fast_automaton/operation/determinize.rs b/src/fast_automaton/operation/determinize.rs index 0217834..6a578eb 100644 --- a/src/fast_automaton/operation/determinize.rs +++ b/src/fast_automaton/operation/determinize.rs @@ -1,4 +1,4 @@ -use ahash::HashMapExt; +use bit_set::BitSet; use crate::{EngineError, execution_profile::ExecutionProfile}; @@ -12,90 +12,73 @@ impl FastAutomaton { } let execution_profile = ExecutionProfile::get(); - let ranges = self.get_ranges()?; - - let initial_vec = VecDeque::from(vec![self.start_state]); + let bases = self.get_spanning_bases()?; let mut worklist = VecDeque::with_capacity(self.get_number_of_states()); let map_capacity = (self.get_number_of_states() as f64 / 0.75).ceil() as usize; - let mut new_states = IntMap::with_capacity(map_capacity); + let mut new_states = AHashMap::with_capacity(map_capacity); + + let mut accept_states = BitSet::new(); + for &state in &self.accept_states { + accept_states.insert(state); + } let mut new_automaton = FastAutomaton::new_empty(); new_automaton.spanning_set = self.spanning_set.clone(); - worklist.push_back((vec![self.start_state], new_automaton.start_state)); - new_states.insert(Self::simple_hash(&initial_vec), new_automaton.start_state); + let mut initial_state = BitSet::new(); + initial_state.insert(self.start_state); + + worklist.push_back((initial_state.clone(), new_automaton.start_state)); + new_states.insert(initial_state, new_automaton.start_state); - let mut new_states_to_add = VecDeque::with_capacity(self.get_number_of_states()); + let mut new_states_to_add = BitSet::new(); while let Some((states, r)) = worklist.pop_front() { execution_profile.assert_not_timed_out()?; - for state in &states { - if self.accept_states.contains(state) { - new_automaton.accept_states.insert(r); - break; - } + if !states.is_disjoint(&accept_states) { + new_automaton.accept_states.insert(r); } - for base in &ranges { + for base in &bases { for from_state in &states { - for (cond, to_state) in self.transitions_from(*from_state) { + for (cond, to_state) in self.transitions_from(from_state) { if cond.has_intersection(base) { - match new_states_to_add.binary_search(to_state) { - Ok(_) => {} // element already in vector @ `pos` - Err(pos) => new_states_to_add.insert(pos, *to_state), - }; + new_states_to_add.insert(*to_state); } } } if !new_states_to_add.is_empty() { - let q = match new_states.entry(Self::simple_hash(&new_states_to_add)) { - Entry::Occupied(o) => *o.get(), + match new_states.entry(new_states_to_add.clone()) { + Entry::Occupied(o) => { + let q = *o.get(); + + new_states_to_add.clear(); + + new_automaton.add_transition(r, q, base); + } Entry::Vacant(v) => { let new_q = new_automaton.new_state(); - worklist - .push_back((new_states_to_add.iter().cloned().collect(), new_q)); v.insert(new_q); - new_q + + let new_states = std::mem::take(&mut new_states_to_add); + worklist.push_back((new_states, new_q)); + + new_automaton.add_transition(r, new_q, base); } }; - - new_automaton.add_transition(r, q, base); } - new_states_to_add.clear(); } } 
Ok(Cow::Owned(new_automaton)) } - - fn simple_hash(list: &VecDeque) -> u64 { - let mut hasher = AHasher::default(); - for &item in list { - hasher.write_usize(item); - } - hasher.finish() - } } #[cfg(test)] mod tests { use crate::regex::RegularExpression; - #[test] - fn test_determinize_1() -> Result<(), String> { - let automaton = RegularExpression::parse(".*ab", false) - .unwrap() - .to_automaton() - .unwrap(); - - let deterministic_automaton = automaton.determinize().unwrap(); - - assert!(deterministic_automaton.is_deterministic()); - - Ok(()) - } - #[test] fn test_determinize_regex() -> Result<(), String> { assert_determinization("(aad|ads|a)"); @@ -117,8 +100,6 @@ mod tests { .unwrap() .to_automaton() .unwrap(); - //automaton.compute_determinization_cost(); - //println!("Determinization Cost: {:?}", automaton.determinisation_cost); println!("States Before: {}", automaton.get_number_of_states()); let deterministic_automaton = automaton.determinize().unwrap(); println!( @@ -126,6 +107,7 @@ mod tests { deterministic_automaton.get_number_of_states() ); assert!(deterministic_automaton.is_deterministic()); + //deterministic_automaton.print_dot(); assert!( automaton .difference(&deterministic_automaton) diff --git a/src/fast_automaton/operation/mod.rs b/src/fast_automaton/operation/mod.rs index 0805011..37241a8 100644 --- a/src/fast_automaton/operation/mod.rs +++ b/src/fast_automaton/operation/mod.rs @@ -1,15 +1,13 @@ -use std::{cmp, hash::Hasher}; - -use ahash::AHasher; +use std::cmp; use super::*; -mod union; mod concat; mod determinize; -mod intersection; mod difference; +mod intersection; mod repeat; +mod union; impl FastAutomaton { pub(crate) fn remove_dead_transitions(&mut self) {
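// Sketch (not part of the patch): the determinize fix above keys the subset map
// by the state set itself (a BitSet) instead of by a precomputed u64 fingerprint
// (`simple_hash`), so two distinct subsets can never be silently merged by a
// hash collision. A simplified, std-only illustration of that keying choice;
// `BTreeSet<usize>` stands in for the crate's BitSet, per-symbol splitting is
// omitted, and the `subset_ids` helper is a hypothetical name.
use std::collections::{BTreeSet, HashMap, VecDeque};

fn subset_ids(start: usize, edges: &HashMap<usize, Vec<usize>>) -> HashMap<BTreeSet<usize>, usize> {
    let mut ids: HashMap<BTreeSet<usize>, usize> = HashMap::new();
    let mut worklist: VecDeque<BTreeSet<usize>> = VecDeque::new();

    let initial = BTreeSet::from([start]);
    ids.insert(initial.clone(), 0);
    worklist.push_back(initial);

    while let Some(subset) = worklist.pop_front() {
        // Successor subset of every state reachable in one step from `subset`.
        let next: BTreeSet<usize> = subset
            .iter()
            .flat_map(|s| edges.get(s).cloned().unwrap_or_default())
            .collect();
        if next.is_empty() || ids.contains_key(&next) {
            continue;
        }
        // The full set is the key: no collision can conflate distinct subsets.
        let id = ids.len();
        ids.insert(next.clone(), id);
        worklist.push_back(next);
    }
    ids
}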