Skip to content

Commit 4fae9af

Browse files
feat: implemented non-greedy matching
1 parent 45cb796 commit 4fae9af

File tree

8 files changed

+219
-93
lines changed

8 files changed

+219
-93
lines changed

assembly/__spec_tests__/generated.spec.ts

+150 -70
Large diffs are not rendered by default.

assembly/__tests__/quantifiers.spec.ts

+24 -1
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,9 @@ it("matches empty strings", () => {
77

88
it("zero or one", () => {
99
expectMatch("a?", ["a"]);
10-
// expectNotMatch("a?", ["bc"]);
10+
let match = exec("a?", "bc");
11+
expect(match).not.toBeNull();
12+
expect(match.matches[0]).toStrictEqual("");
1113
});
1214

1315
it("one or more", () => {
@@ -35,3 +37,24 @@ it("one or more is greedy", () => {
3537
expect(match).not.toBeNull();
3638
expect(match.matches[0]).toStrictEqual("aaaaa");
3739
});
40+
41+
describe("non-greedy", () => {
42+
it("one or more supports non-greedy mode", () => {
43+
let match = exec("[a-c]+?b", "abb");
44+
expect(match).not.toBeNull();
45+
expect(match.matches[0]).toStrictEqual("ab");
46+
});
47+
48+
it("zero or more supports non-greedy mode", () => {
49+
let match = exec("[a-c]*?b", "abb");
50+
expect(match).not.toBeNull();
51+
expect(match.matches[0]).toStrictEqual("ab");
52+
});
53+
54+
it("zero or one supports non-greedy mode", () => {
55+
expectMatch("a?", ["a"]);
56+
let match = exec("a?", "bc");
57+
expect(match).not.toBeNull();
58+
expect(match.matches[0]).toStrictEqual("");
59+
});
60+
});

assembly/nfa/nfa.ts

+22 -14
Original file line number | Diff line number | Diff line change
@@ -117,15 +117,20 @@ function union(first: Automata, second: Automata): Automata {
117117
return new Automata(start, end);
118118
}
119119

120-
function closure(nfa: Automata): Automata {
120+
function closure(nfa: Automata, greedy: bool): Automata {
121121
const start = new State();
122122
const end = new State();
123-
// to ensure greedy matches, the epsilon transitions that loop-back
124-
// need to be first in the list
125-
start.transitions.push(nfa.start);
126-
start.transitions.push(end);
127-
nfa.end.transitions.push(nfa.start);
128-
nfa.end.transitions.push(end);
123+
if (greedy) {
124+
nfa.end.transitions.push(nfa.start);
125+
nfa.end.transitions.push(end);
126+
start.transitions.push(nfa.start);
127+
start.transitions.push(end);
128+
} else {
129+
nfa.end.transitions.push(end);
130+
nfa.end.transitions.push(nfa.start);
131+
start.transitions.push(end);
132+
start.transitions.push(nfa.start);
133+
}
129134
return new Automata(start, end);
130135
}
131136

@@ -138,14 +143,17 @@ function zeroOrOne(nfa: Automata): Automata {
138143
return new Automata(start, end);
139144
}
140145

141-
function oneOrMore(nfa: Automata): Automata {
146+
function oneOrMore(nfa: Automata, greedy: bool): Automata {
142147
const start = new State();
143148
const end = new State();
144149
start.transitions.push(nfa.start);
145-
// to ensure greedy matches, the epsilon transitions that loop-back
146-
// need to be first in the list
147-
nfa.end.transitions.push(nfa.start);
148-
nfa.end.transitions.push(end);
150+
if (greedy) {
151+
nfa.end.transitions.push(nfa.start);
152+
nfa.end.transitions.push(end);
153+
} else {
154+
nfa.end.transitions.push(end);
155+
nfa.end.transitions.push(nfa.start);
156+
}
149157
return new Automata(start, end);
150158
}
151159

@@ -176,9 +184,9 @@ class AutomataFactor {
176184
if (quantifier == Char.Question) {
177185
return zeroOrOne(automata);
178186
} else if (quantifier == Char.Plus) {
179-
return oneOrMore(automata);
187+
return oneOrMore(automata, node.greedy);
180188
} else if (quantifier == Char.Asterisk) {
181-
return closure(automata);
189+
return closure(automata, node.greedy);
182190
} else {
183191
throw new Error(
184192
"unsupported quantifier - " + String.fromCharCode(quantifier)

assembly/parser/node.ts

+5 -1
Original file line number | Diff line number | Diff line change
@@ -137,7 +137,11 @@ export class CharacterClassNode extends Node {
137137
}
138138

139139
export class RepetitionNode extends Node {
140-
constructor(public expression: Node, public quantifier: Char) {
140+
constructor(
141+
public expression: Node,
142+
public quantifier: Char,
143+
public greedy: bool = true
144+
) {
141145
super(NodeType.Repetition);
142146
}
143147

assembly/parser/parser.ts

+7 -2
Original file line number | Diff line number | Diff line change
@@ -243,8 +243,13 @@ export class Parser {
243243
}
244244
} else if (isQuantifier(token)) {
245245
const expression = nodes.pop();
246-
nodes.push(new RepetitionNode(expression, token));
247-
this.eatToken();
246+
const quantifier = this.eatToken();
247+
let greedy = true;
248+
if (this.iterator.current == Char.Question) {
249+
greedy = false;
250+
this.eatToken();
251+
}
252+
nodes.push(new RepetitionNode(expression, quantifier, greedy));
248253
// @ts-ignore
249254
} else if (token == Char.LeftSquareBracket) {
250255
nodes.push(this.parseCharacterSet());

spec/pcre-1.dat

+1 -1
Original file line number | Diff line number | Diff line change
@@ -481,7 +481,7 @@ E$ckv (?: [\\040\\t] | \\(\n\
481481
(?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\
482482
\\) )* > # trailing >\n\
483483
# name and address\n\
484-
) (?: [\\040\\t] | \\(\n\
484+
) (?: [\\040\\t] | \\(\n\^[ab]{1,3}(ab*?|b)
485485
(?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\
486486
\\) )* # optional trailing comment\n\
487487
Alan Other <[email protected]> (0,25)

spec/test-generator.js

+8 -2
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,8 @@ const knownIssues = {
1313
1103,
1414
...range(1185, 1188),
1515
...range(1095, 1098),
16+
...range(487, 494),
17+
...range(1077, 1082),
1618
],
1719
"issues with repeated capture groups": [
1820
262,
@@ -21,6 +23,10 @@ const knownIssues = {
2123
1391,
2224
1392,
2325
],
26+
"lazy quantifiers should still yield the longest overall regex match": [
27+
...range(141, 143),
28+
1288,
29+
],
2430
"test contains an octal escape sequence": [1102],
2531
"requires triage": [
2632
1087,
@@ -103,8 +109,8 @@ lines.forEach((line, index) => {
103109
return;
104110
}
105111

106-
if (["*?", "??", "+?", "}?"].some((f) => regex.includes(f))) {
107-
testCase += `xit("line: ${index} - lazy quantifiers are not supported", () => { });`;
112+
if (["}?"].some((f) => regex.includes(f))) {
113+
testCase += `xit("line: ${index} - lazy range repitition quantifiers are not supported", () => { });`;
108114
return;
109115
}
110116

ts/index.ts

+2 -2
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@ globalAny.log = console.log;
55

66
import { RegExp } from "../assembly/regexp";
77

8-
const regexObj = new RegExp("^\\s");
9-
const match = regexObj.exec("\040abc");
8+
const regexObj = new RegExp(".*?");
9+
const match = regexObj.exec("abc");
1010

1111
console.log(match);

0 commit comments

Comments
 (0)