From 5fcb2ad3e33637276ab98b9eb523be1e805784b5 Mon Sep 17 00:00:00 2001 From: Jason Lee Date: Fri, 23 Dec 2022 20:06:31 +0800 Subject: [PATCH] Improve insensitive to support (string | ident). --- derive/tests/grammar.pest | 2 + derive/tests/grammar.rs | 27 ++++++++++++ generator/src/generator.rs | 30 ++++++++----- meta/src/ast.rs | 6 +-- meta/src/grammar.pest | 2 +- meta/src/optimizer/concatenator.rs | 21 +++++++++- meta/src/optimizer/mod.rs | 27 +++++++++--- meta/src/parser.rs | 67 ++++++++++++++++++++++++++---- vm/src/lib.rs | 12 +++++- 9 files changed, 166 insertions(+), 28 deletions(-) diff --git a/derive/tests/grammar.pest b/derive/tests/grammar.pest index 126f112d..7ae9173b 100644 --- a/derive/tests/grammar.pest +++ b/derive/tests/grammar.pest @@ -9,6 +9,8 @@ string = { "abc" } insensitive = { ^"abc" } +tag_name = _{ ('a'..'z')+ } +insensitive_ident = { ^tag_name } range = { '0'..'9' } ident = { string } pos_pred = { &string } diff --git a/derive/tests/grammar.rs b/derive/tests/grammar.rs index cf5a4a60..e0e8a54e 100644 --- a/derive/tests/grammar.rs +++ b/derive/tests/grammar.rs @@ -41,6 +41,33 @@ fn insensitive() { insensitive(0, 3) ] }; + + parses_to! { + parser: GrammarParser, + input: "html", + rule: Rule::insensitive_ident, + tokens: [ + insensitive_ident(0, 4) + ] + }; + + parses_to! { + parser: GrammarParser, + input: "HTML", + rule: Rule::insensitive_ident, + tokens: [ + insensitive_ident(0, 4) + ] + }; + + parses_to! { + parser: GrammarParser, + input: "Body", + rule: Rule::insensitive_ident, + tokens: [ + insensitive_ident(0, 4) + ] + }; } #[test] diff --git a/generator/src/generator.rs b/generator/src/generator.rs index 9fd7a0e0..f90a94ba 100644 --- a/generator/src/generator.rs +++ b/generator/src/generator.rs @@ -348,11 +348,14 @@ fn generate_expr(expr: OptimizedExpr) -> TokenStream { state.match_string(#string) } } - OptimizedExpr::Insens(string) => { - quote! { - state.match_insensitive(#string) + OptimizedExpr::Insens(insens) => match insens.as_ref() { + OptimizedExpr::Str(string) => { + quote! { + state.match_insensitive(#string) + } } - } + _ => generate_expr(*insens), + }, OptimizedExpr::Range(start, end) => { let start = start.chars().next().unwrap(); let end = end.chars().next().unwrap(); @@ -494,11 +497,14 @@ fn generate_expr_atomic(expr: OptimizedExpr) -> TokenStream { state.match_string(#string) } } - OptimizedExpr::Insens(string) => { - quote! { - state.match_insensitive(#string) + OptimizedExpr::Insens(insens) => match insens.as_ref() { + OptimizedExpr::Str(string) => { + quote! { + state.match_insensitive(#string) + } } - } + _ => generate_expr_atomic(*insens), + }, OptimizedExpr::Range(start, end) => { let start = start.chars().next().unwrap(); let end = end.chars().next().unwrap(); @@ -827,7 +833,9 @@ mod tests { Box::new(OptimizedExpr::Range("a".to_owned(), "b".to_owned())), Box::new(OptimizedExpr::Seq( Box::new(OptimizedExpr::NegPred(Box::new(OptimizedExpr::Rep( - Box::new(OptimizedExpr::Insens("b".to_owned())), + Box::new(OptimizedExpr::Insens(Box::new(OptimizedExpr::Str( + "b".to_owned(), + )))), )))), Box::new(OptimizedExpr::PosPred(Box::new(OptimizedExpr::Opt( Box::new(OptimizedExpr::Rep(Box::new(OptimizedExpr::Choice( @@ -914,7 +922,9 @@ mod tests { Box::new(OptimizedExpr::Range("a".to_owned(), "b".to_owned())), Box::new(OptimizedExpr::Seq( Box::new(OptimizedExpr::NegPred(Box::new(OptimizedExpr::Rep( - Box::new(OptimizedExpr::Insens("b".to_owned())), + Box::new(OptimizedExpr::Insens(Box::new(OptimizedExpr::Str( + "b".to_owned(), + )))), )))), Box::new(OptimizedExpr::PosPred(Box::new(OptimizedExpr::Opt( Box::new(OptimizedExpr::Rep(Box::new(OptimizedExpr::Choice( diff --git a/meta/src/ast.rs b/meta/src/ast.rs index ffac8ea7..85cfbd0f 100644 --- a/meta/src/ast.rs +++ b/meta/src/ast.rs @@ -53,8 +53,8 @@ pub enum RuleType { pub enum Expr { /// Matches an exact string, e.g. `"a"` Str(String), - /// Matches an exact string, case insensitively (ASCII only), e.g. `^"a"` - Insens(String), + /// Matches an exact string or rule with the given name, case insensitively (ASCII only), e.g. `^"a"` or `^ASCII_ALPHA` + Insens(Box), /// Matches one character in the range, e.g. `'a'..'z'` Range(String, String), /// Matches the rule with the given name, e.g. `a` @@ -327,7 +327,7 @@ mod tests { )), Box::new(Expr::PosPred(Box::new(Expr::NegPred(Box::new(Expr::Rep( Box::new(Expr::RepOnce(Box::new(Expr::Opt(Box::new(Expr::Choice( - Box::new(Expr::Insens("c".to_owned())), + Box::new(Expr::Insens(Box::new(Expr::Str("c".to_owned())))), Box::new(Expr::Push(Box::new(Expr::Range( "'d'".to_owned(), "'e'".to_owned(), diff --git a/meta/src/grammar.pest b/meta/src/grammar.pest index 282ca35b..673591e1 100644 --- a/meta/src/grammar.pest +++ b/meta/src/grammar.pest @@ -77,7 +77,7 @@ alpha = _{ 'a'..'z' | 'A'..'Z' } alpha_num = _{ alpha | '0'..'9' } string = ${ quote ~ inner_str ~ quote } -insensitive_string = { "^" ~ string } +insensitive_string = { "^" ~ (string | identifier) } range = { character ~ range_operator ~ character } character = ${ single_quote ~ inner_chr ~ single_quote } diff --git a/meta/src/optimizer/concatenator.rs b/meta/src/optimizer/concatenator.rs index 31d3aa53..adb63d5c 100644 --- a/meta/src/optimizer/concatenator.rs +++ b/meta/src/optimizer/concatenator.rs @@ -9,6 +9,13 @@ use crate::ast::*; +fn convert_expr(expr: Box) -> Option { + match *expr { + Expr::Str(string) => Some(string), + _ => None, + } +} + pub fn concatenate(rule: Rule) -> Rule { let Rule { name, ty, expr } = rule; Rule { @@ -20,7 +27,19 @@ pub fn concatenate(rule: Rule) -> Rule { match expr { Expr::Seq(lhs, rhs) => match (*lhs, *rhs) { (Expr::Str(lhs), Expr::Str(rhs)) => Expr::Str(lhs + &rhs), - (Expr::Insens(lhs), Expr::Insens(rhs)) => Expr::Insens(lhs + &rhs), + (Expr::Insens(lhs), Expr::Insens(rhs)) => { + let lhs_str = convert_expr(lhs.clone()); + let rhs_str = convert_expr(rhs.clone()); + + if lhs_str.is_none() || rhs_str.is_none() { + return Expr::Seq( + Box::new(Expr::Insens(lhs)), + Box::new(Expr::Insens(rhs)), + ); + } + + Expr::Insens(Box::new(Expr::Str(lhs_str.unwrap() + &rhs_str.unwrap()))) + } (lhs, rhs) => Expr::Seq(Box::new(lhs), Box::new(rhs)), }, expr => expr, diff --git a/meta/src/optimizer/mod.rs b/meta/src/optimizer/mod.rs index 2038753b..5cb90e2f 100644 --- a/meta/src/optimizer/mod.rs +++ b/meta/src/optimizer/mod.rs @@ -20,6 +20,17 @@ macro_rules! box_tree { ($expr:expr) => ($expr); } +// box_tree!(Seq( +// Seq( +// Insens(Box::new(Ident(String::from("a")))), +// Insens(Box::new(Ident(String::from("b")))) +// ), +// Seq( +// Insens(Box::new(Ident(String::from("c")))), +// Insens(Box::new(Ident(String::from("d")))) +// ) +// )), + mod concatenator; mod factorizer; mod lister; @@ -52,7 +63,7 @@ fn rule_to_optimized_rule(rule: Rule) -> OptimizedRule { fn to_optimized(expr: Expr) -> OptimizedExpr { match expr { Expr::Str(string) => OptimizedExpr::Str(string), - Expr::Insens(string) => OptimizedExpr::Insens(string), + Expr::Insens(expr) => OptimizedExpr::Insens(Box::new(to_optimized(*expr))), Expr::Range(start, end) => OptimizedExpr::Range(start, end), Expr::Ident(ident) => OptimizedExpr::Ident(ident), Expr::PeekSlice(start, end) => OptimizedExpr::PeekSlice(start, end), @@ -107,7 +118,7 @@ pub enum OptimizedExpr { /// Matches an exact string, e.g. `"a"` Str(String), /// Matches an exact string, case insensitively (ASCII only), e.g. `^"a"` - Insens(String), + Insens(Box), /// Matches one character in the range, e.g. `'a'..'z'` Range(String, String), /// Matches the rule with the given name, e.g. `a` @@ -486,15 +497,21 @@ mod tests { name: "rule".to_owned(), ty: RuleType::Atomic, expr: box_tree!(Seq( - Seq(Insens(String::from("a")), Insens(String::from("b"))), - Seq(Insens(String::from("c")), Insens(String::from("d"))) + Seq( + Insens(Box::new(Str(String::from("a")))), + Insens(Box::new(Str(String::from("b")))) + ), + Seq( + Insens(Box::new(Str(String::from("c")))), + Insens(Box::new(Str(String::from("d")))) + ) )), }] }; let concatenated = vec![OptimizedRule { name: "rule".to_owned(), ty: RuleType::Atomic, - expr: OptimizedExpr::Insens(String::from("abcd")), + expr: OptimizedExpr::Insens(Box::new(OptimizedExpr::Str("abcd".to_owned()))), }]; assert_eq!(optimize(rules), concatenated); diff --git a/meta/src/parser.rs b/meta/src/parser.rs index fc0224b3..2a853809 100644 --- a/meta/src/parser.rs +++ b/meta/src/parser.rs @@ -133,7 +133,7 @@ pub enum ParserExpr<'i> { /// Matches an exact string, e.g. `"a"` Str(String), /// Matches an exact string, case insensitively (ASCII only), e.g. `^"a"` - Insens(String), + Insens(Box>), /// Matches one character in the range, e.g. `'a'..'z'` Range(String, String), /// Matches the rule with the given name, e.g. `a` @@ -175,7 +175,7 @@ fn convert_rule(rule: ParserRule<'_>) -> AstRule { fn convert_node(node: ParserNode<'_>) -> Expr { match node.expr { ParserExpr::Str(string) => Expr::Str(string), - ParserExpr::Insens(string) => Expr::Insens(string), + ParserExpr::Insens(node) => Expr::Insens(Box::new(convert_node(*node))), ParserExpr::Range(start, end) => Expr::Range(start, end), ParserExpr::Ident(ident) => Expr::Ident(ident), ParserExpr::PeekSlice(start, end) => Expr::PeekSlice(start, end), @@ -296,6 +296,46 @@ fn consume_rules_with_spans( .collect() } +fn consume_insensitive_string( + pairs: Peekable>, +) -> Result, Vec>> { + let mut pairs = pairs.peekable(); + let pair = pairs.next().unwrap(); + + let node = match pair.as_rule() { + Rule::string => { + let span = pair.as_span(); + let string = unescape(span.as_str()).expect("incorrect string literal"); + // Remove quote + let string_val = string[1..string.len() - 1].to_owned(); + + ParserNode { + expr: ParserExpr::Str(string_val), + span, + } + } + Rule::identifier => { + let span = pair.as_span(); + let string = span.as_str().to_owned(); + + ParserNode { + expr: ParserExpr::Ident(string), + span, + } + } + _ => { + return Err(vec![Error::new_from_span( + ErrorVariant::CustomError { + message: "Expected string or identifier".to_owned(), + }, + pair.as_span(), + )]) + } + }; + + Ok(node) +} + fn consume_expr<'i>( pairs: Peekable>, pratt: &PrattParser, @@ -389,10 +429,12 @@ fn consume_expr<'i>( } } Rule::insensitive_string => { - let string = unescape(pair.as_str()).expect("incorrect string literal"); + let node = + consume_insensitive_string(pair.clone().into_inner().peekable())?; + ParserNode { - expr: ParserExpr::Insens(string[2..string.len() - 1].to_owned()), - span: pair.clone().as_span(), + expr: ParserExpr::Insens(Box::new(node)), + span: pair.as_span(), } } Rule::range => { @@ -1019,6 +1061,17 @@ mod tests { ]) ] }; + + parses_to! { + parser: PestParser, + input: "^ASCII_ALPHA", + rule: Rule::insensitive_string, + tokens: [ + insensitive_string(0, 12, [ + identifier(1, 12) + ]) + ] + }; } #[test] @@ -1263,7 +1316,7 @@ mod tests { parser: PestParser, input: "a = { ^ }", rule: Rule::grammar_rules, - positives: vec![Rule::quote], + positives: vec![Rule::identifier, Rule::quote], negatives: vec![], pos: 8 }; @@ -1340,7 +1393,7 @@ mod tests { )), Box::new(Expr::NegPred(Box::new(Expr::Rep(Box::new(Expr::Opt( Box::new(Expr::Choice( - Box::new(Expr::Insens("c".to_owned())), + Box::new(Expr::Insens(Box::new(Expr::Str("c".to_owned())))), Box::new(Expr::Push(Box::new(Expr::Range( "d".to_owned(), "e".to_owned() diff --git a/vm/src/lib.rs b/vm/src/lib.rs index 48391870..7d264c9e 100644 --- a/vm/src/lib.rs +++ b/vm/src/lib.rs @@ -186,7 +186,17 @@ impl Vm { ) -> ParseResult>> { match *expr { OptimizedExpr::Str(ref string) => state.match_string(string), - OptimizedExpr::Insens(ref string) => state.match_insensitive(string), + OptimizedExpr::Insens(ref insens) => match insens.as_ref() { + OptimizedExpr::Str(ref string) => { + std::println!("------------- match_insensitive: {}", string); + return state.match_insensitive(string); + } + OptimizedExpr::Ident(ref name) => { + std::println!("------------- match_insensitive Ident: {}", name); + return self.parse_rule(name, state); + } + _ => panic!("invalid insensitive expr only support string | ident"), + }, OptimizedExpr::Range(ref start, ref end) => { let start = start.chars().next().expect("empty char literal"); let end = end.chars().next().expect("empty char literal");