Skip to content

Commit a5f529d

Browse files
committed
Update CFGGuide to use outlines.fsm.parsing. Enable generate.cfg
1 parent d78041e commit a5f529d

23 files changed

+1488
-651
lines changed

benchmarks/bench_cfg_guide.py

+62
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import random
2+
3+
from transformers import AutoTokenizer
4+
5+
import outlines.grammars
6+
from outlines.caching import cache_disabled
7+
from outlines.fsm.guide import CFGGuide
8+
from outlines.models.transformers import TransformerTokenizer
9+
10+
from .common import ensure_numba_compiled
11+
12+
# Seed the module-level RNG so the benchmark's shuffled token orderings
# are reproducible across runs.
random.seed(42)
15+
def get_tiny_tokenizer():
    """Load a tiny HF test tokenizer (~1000-token vocabulary) wrapped for Outlines."""
    hf_tokenizer = AutoTokenizer.from_pretrained(
        "hf-internal-testing/tiny-random-gpt2"
    )
    return TransformerTokenizer(hf_tokenizer)
20+
21+
22+
# Grammars exercised by the benchmark suite, keyed by ASV parameter name.
benched_grammars = dict(
    json=outlines.grammars.json,
    arithmetic=outlines.grammars.arithmetic,
)
26+
27+
28+
class CFGGuideBenchmark:
    """ASV benchmark suite for ``CFGGuide`` construction and guided generation.

    Parametrized over the grammars in ``benched_grammars``; ASV calls
    ``setup`` and every benchmark method with the same parameter value,
    so all of them take ``grammar_name``.
    """

    params = benched_grammars.keys()

    def setup(self, grammar_name):
        """Build the shared tokenizer and a prebuilt guide for *grammar_name*."""
        self.tokenizer = get_tiny_tokenizer()
        ensure_numba_compiled(
            self.tokenizer
        )  # numba not currently used, but will be in the future
        self.prebuilt_cfg_guide = CFGGuide(
            benched_grammars[grammar_name], self.tokenizer
        )

    @staticmethod
    def _run_random_cfg(guide):
        """Drive *guide* through 40 randomly sampled, grammar-valid tokens."""
        state = guide.initial_state

        # NOTE(review): assumes ``guide.tokenizer.vocabulary`` is an int
        # vocabulary size usable with range() — confirm against the
        # TransformerTokenizer interface.
        for _ in range(40):
            # simulate ordering of logits top prob to lowest prob
            token_ids = list(range(guide.tokenizer.vocabulary))
            random.shuffle(token_ids)
            # simulate sampling and state update
            next_token_id = next(guide.iter_valid_token_ids(state, token_ids))
            state = guide.get_next_state(state, next_token_id)

    @cache_disabled()
    def time_cfg_guide_setup(self, grammar_name):
        """Time constructing a CFGGuide from scratch for *grammar_name*."""
        CFGGuide(benched_grammars[grammar_name], self.tokenizer)

    @cache_disabled()
    def time_cfg_guide_run(self, grammar_name):
        """Time random guided generation using the prebuilt guide."""
        self._run_random_cfg(self.prebuilt_cfg_guide)

    @cache_disabled()
    def peakmem_cfg_guide_run(self, grammar_name):
        """Measure peak memory of random guided generation with the prebuilt guide."""
        self._run_random_cfg(self.prebuilt_cfg_guide)

docs/reference/creating_grammars.md

+99
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
# Overview
2+
3+
Outlines allows the use of [Lark](https://github.com/lark-parser/lark) grammars to guide generation. These grammars are used to construct parsers that filter out incompatible tokens during the generation process. The result is a generation that adheres to the grammar's production rules.
4+
5+
# Primer on Creating Grammars
6+
7+
To create grammars for Outlines, a solid understanding of Lark grammars is necessary. Here's how you can get started:
8+
9+
- Read Lark's grammars documentations [here](https://lark-parser.readthedocs.io/en/latest/grammar.html).
10+
- Review Outlines' existing grammars [here](/outlines/grammars).
11+
12+
13+
# Compatibility With Outlines
14+
15+
It's important to note that not all Lark grammars work with Outlines. Changes may be necessary to ensure compatibility.
16+
17+
### LALR(1) Parser
18+
19+
Outlines utilizes Lark's LALR(1) parser, meaning the grammar must be unambiguous at least up to the next token (one token lookahead). Read Lark's official LALR(1) parser documentation [here](https://lark-parser.readthedocs.io/en/stable/parsers.html#lalr-1).
20+
21+
If your grammar is ambiguous, you will receive the following error at runtime:
22+
23+
```
24+
GrammarError: Reduce/Reduce collision in Terminal('B') between the following rules:
25+
```
26+
27+
### Regex Terminal Restrictions
28+
29+
Outlines converts terminals to finite state machines using the [Interegular](https://github.com/MegaIng/interegular/) library. Not all regular expressions work with Interegular; mitigation is described in the subsections which follow.
30+
31+
32+
#### Avoid Lookarounds
33+
34+
Examples of removing lookaround while maintaining the same functionality
35+
36+
##### Example: Escaped String
37+
38+
From Outlines' modified `ESCAPED_STRING` in [common.lark](/outlines/grammars/common.lark).
39+
40+
Before:
41+
```
42+
_STRING_INNER: /.*?/
43+
_STRING_ESC_INNER: _STRING_INNER /(?<!\\)(\\\\)*?/
44+
45+
ESCAPED_STRING : "\"" _STRING_ESC_INNER "\""
46+
```
47+
48+
After:
49+
```
50+
_NON_CONTROL_CHAR: /([^"\\\x00-\x1F\x7F-\x9F])/
51+
_ESCAPED_CHAR: /\\/ (_NON_CONTROL_CHAR | /\\/ | /"/)
52+
ESCAPED_STRING_INNER: _NON_CONTROL_CHAR | _ESCAPED_CHAR
53+
ESCAPED_STRING: /"/ ESCAPED_STRING_INNER* /"/
54+
```
55+
56+
#### Avoid Backreferences
57+
58+
Backreferences, for example `([ab]*)\1`, cannot be simulated by a finite state machine, and will result in an error if used.
59+
60+
# Creating a Valid Grammar
61+
62+
You can use Outlines' test suite to verify your grammar.
63+
64+
### 1) Create Your Grammar
65+
66+
Create your grammar file named `your_new_grammar.lark`, adhering to the guidelines provided above. Add it to `outlines/grammars/` (ensure attribution is included and license is compatible).
67+
68+
Update `outlines/grammars.py` with a line including your grammar.
69+
70+
### 2) Test Your Grammar
71+
72+
Test grammar for false negatives, ensure sample grammars can be generated:
73+
- Add valid example outputs which are compliant with the grammar to `tests/benchmark/cfg_samples/your_new_grammar/`
74+
- Run the tests for your grammar via `pytest -s tests/fsm/test_cfg_guide.py::test_cfg_grammar_sample -k "your_new_grammar"`
75+
76+
Test grammar for false positives, ensure invalid outputs aren't generated.
77+
78+
Currently there isn't a builtin false positive testing utility. It is recommended you smoke test via
79+
```
80+
from outlines import models, generate, grammars
81+
model = models.transformers("mistralai/Mistral-7B-v0.1")
82+
generator = generate.cfg(model, grammars.your_new_grammar)
83+
result = generator(<your prompt to generate output for your grammar>)
84+
print(result)
85+
```
86+
87+
# Converting
88+
There are a few tools available for converting from other grammars to lark. These tools serve as a starting point. However, you will typically need to make additional adjustments to ensure full compatibility and proper functioning within Outlines.
89+
90+
Tools:
91+
- Lark's built-in "Nearley-to-Lark" converter https://lark-parser.readthedocs.io/en/latest/tools.html
92+
- Convert ANTLR4 to Lark (Note, most antlr4 grammars are not LALR(1) compatible, so will require additional tweaking) https://github.com/kaby76/Domemtech.Trash/blob/main/src/trconvert/readme.md
93+
- Extract EBNF from Yacc files https://www.bottlecaps.de/rr/ui
94+
95+
Reference Grammars:
96+
- Github Lark Grammars https://github.com/search?q=path%3A*.lark&type=code
97+
- Github Nearley Grammars https://github.com/search?q=path%3A*.ne+%22-%3E%22&type=code
98+
- Antlr4 grammars https://github.com/antlr/grammars-v4/
99+
- Grammar zoo https://slebok.github.io/zoo/index.html#html

mkdocs.yml

+1
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ nav:
124124
- JSON (function calling): reference/json.md
125125
- JSON mode: reference/json_mode.md
126126
- Grammar: reference/cfg.md
127+
- Creating Grammars: reference/creating_grammars.md
127128
- Custom FSM operations: reference/custom_fsm_ops.md
128129
- Utilities:
129130
- Serve with vLLM: reference/serve/vllm.md

outlines/fsm/fsm.py

+1-23
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import warnings
22
from typing import TYPE_CHECKING, Iterable, NewType, Optional
33

4-
from outlines.fsm.guide import CFGGuide, RegexGuide, StopAtEOSGuide
4+
from outlines.fsm.guide import RegexGuide, StopAtEOSGuide
55

66
if TYPE_CHECKING:
77
from outlines.models.tokenizer import Tokenizer
@@ -45,25 +45,3 @@ def allowed_token_ids(self, state: FSMState) -> Optional[Iterable[int]]:
4545

4646
def next_state(self, state: FSMState, token_id: int) -> FSMState:
    """Return the FSM state reached from *state* after emitting *token_id*."""
    updated = self.get_next_state(state, token_id)
    return FSMState(updated)
48-
49-
50-
class CFGFSM(CFGGuide):
51-
"""FSM to generate text that is in the language of a context-free grammar."""
52-
53-
def __init__(self, cfg_string: str, tokenizer):
54-
warnings.warn(
55-
UserWarning(
56-
"The `CFGFSM` interface is deprecated and will be removed on 2024-06-01. Please use `CFGGuide` instead."
57-
)
58-
)
59-
super().__init__(cfg_string, tokenizer)
60-
61-
def allowed_token_ids(self, state: FSMState) -> Optional[Iterable[int]]:
62-
return self.get_next_instruction(state).tokens
63-
64-
def next_state(self, state: FSMState, token_id: int) -> FSMState:
65-
return FSMState(self.get_next_state(state, token_id))
66-
67-
def copy(self) -> "CFGFSM":
68-
"""Create a copy of the FSM."""
69-
return CFGFSM(self.cfg_string, self.tokenizer)

0 commit comments

Comments
 (0)