From 0a1da141be05528937bd42236cc8f40b3e5cba12 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?=
Date: Tue, 13 Aug 2024 11:32:43 +0200
Subject: [PATCH] Add lm-format-enforcer benchmark

---
 .github/workflows/tests.yml | 18 ++++++++++++
 pyproject.toml              |  7 ++++-
 setup.cfg                   |  6 ++++
 src/lfe.py                  | 58 +++++++++++++++++++++++++++++++++++++
 4 files changed, 88 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/tests.yml
 create mode 100644 setup.cfg
 create mode 100644 src/lfe.py

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 0000000..bf25b9c
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,18 @@
+name: Tests
+
+on:
+  pull_request:
+    branches: [main]
+  push:
+    branches: [main]
+
+jobs:
+  style:
+    name: Check the code style
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    - uses: actions/setup-python@v4
+      with:
+        python-version: "3.10"
+    - uses: pre-commit/action@v3.0.0
diff --git a/pyproject.toml b/pyproject.toml
index 4f6bbf7..f32ab38 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,4 +4,9 @@ version = "0.1"
 authors = [{"name" = "The Outlines developers"}]
 description = "A benchmarking suite for structured generation libraries."
 requires-python = ">=3.10"
-dependencies = ["outlines==0.0.46", "transformers==4.44.0", "torch==2.4.0"]
+dependencies = [
+    "lm-format-enforcer==0.10.6",
+    "outlines==0.0.46",
+    "torch==2.4.0",
+    "transformers==4.44.0",
+]
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..b057e8a
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,6 @@
+[flake8]
+max-line-length = 88
+select = C,E,F,W
+ignore = E203,E231,E501,E741,W503,W504,C901,E731
+per-file-ignores =
+  **/__init__.py:F401,F403
diff --git a/src/lfe.py b/src/lfe.py
new file mode 100644
index 0000000..019ea0c
--- /dev/null
+++ b/src/lfe.py
@@ -0,0 +1,58 @@
+"""Benchmark the lm-format-enforcer library."""
+from lmformatenforcer import RegexParser, TokenEnforcer
+from lmformatenforcer.integrations.transformers import (
+    build_token_enforcer_tokenizer_data,
+)
+from transformers import AutoTokenizer
+
+models = [
+    "meta-llama/Llama-2-7b-hf",  # 32,000 tokens vocabulary
+    "gpt2",  # 50,257 tokens vocabulary
+    "meta-llama/Meta-Llama-3.1-8B-Instruct",  # 128,256 tokens vocabulary
+    "google/gemma-2-2b-it",  # 256,128 tokens vocabulary
+]
+
+# Each case pairs a regular expression with a sample string that matches it.
+case = [(r"\d{3}-\d{2}-\d{4}", "203-22-1234")]
+
+
+class LMFormatEnforcer:
+    """Benchmark lm-format-enforcer's allowed-token lookup for a regex.
+
+    `params`, `param_names` and `timeout` are class-level settings,
+    presumably read by an asv-style benchmark runner (TODO confirm).
+
+    """
+
+    params = [models, case]
+    param_names = ["model", "regex"]
+    timeout = 600
+
+    def setup(self, model, _):
+        """Set up the benchmark.
+
+        We convert the tokenizer during set up as this only
+        needs to be done once for a given model.
+
+        """
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model, clean_up_tokenization_spaces=True
+        )
+        self.tokenizer_data = build_token_enforcer_tokenizer_data(self.tokenizer)
+
+    def time_lfe(self, _, regex):
+        """Query the allowed tokens after each token of the example string.
+
+        A fresh parser and token enforcer are built on every call, so their
+        construction is part of what this method does.
+
+        """
+        regex_string, regex_example = regex
+        regex_example_tokens = self.tokenizer.encode(regex_example)
+
+        parser = RegexParser(regex_string)
+        token_enforcer = TokenEnforcer(self.tokenizer_data, parser)
+
+        # Feed the enforcer every non-empty prefix of the tokenized example.
+        for end in range(1, len(regex_example_tokens) + 1):
+            token_enforcer.get_allowed_tokens(regex_example_tokens[:end])