Skip to content

Fix null byte #21

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
72 changes: 72 additions & 0 deletions .github/workflows/asv_benchmark_pr.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Run the ASV benchmark suite against a PR and post the comparison vs. main
# as a (created-or-updated) PR comment.
name: Benchmark PR

on:
  pull_request:
    branches: [main]

permissions:
  contents: read  # Read access for repository contents
  pull-requests: write  # Write access for pull requests (comment posting)

env:
  PYTHON_VERSION: "3.10"
  WORKING_DIR: ${{ github.workspace }}/benchmarks

jobs:
  benchmark-pr:
    runs-on: ubuntu-latest

    defaults:
      run:
        # All run steps execute from the benchmarks/ directory, where
        # asv.conf.json lives.
        working-directory: ${{ env.WORKING_DIR }}

    steps:

      - name: Checkout repository
        uses: actions/checkout@v3
        with:
          # Full history is required so asv can check out and build both
          # the PR HEAD and the upstream main commit.
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install asv virtualenv lf-asv-formatter

      - name: Create ASV machine config file
        run: asv machine --machine gh-runner --yes

      - name: Save comparison of PR against main branch
        run: |
          # prepare main branch for comparison
          git remote add upstream https://github.com/${{ github.repository }}.git
          git fetch upstream main

          # Run benchmarks, writing comment contents to ./output
          asv continuous upstream/main HEAD \
            --factor 1.1 --sort ratio --split --interleave-rounds -a repeat=3
          asv compare upstream/main HEAD --factor 1.1 --sort ratio --split | tee output
          # BUG FIX: "$(echo asv --version)" expanded to the literal string
          # "asv --version"; run the command itself to get the real version.
          python -m lf_asv_formatter --asv_version "$(asv --version)"
          printf "Benchmark Suite Results:\n\n" >> comment_body
          cat output >> comment_body

      # from https://github.com/hombit/load_ztfdr_for_tape/blob/9acf7c83/.github/workflows/asv-pr.yml
      - name: Find benchmarks comment
        uses: peter-evans/find-comment@v2
        id: find-comment
        with:
          issue-number: ${{ github.event.pull_request.number }}
          comment-author: 'github-actions[bot]'
          body-includes: Benchmark Suite Results

      - name: Create or update benchmarks comment
        uses: peter-evans/create-or-update-comment@v3
        with:
          # Empty comment-id (no prior comment found) makes the action create
          # a new comment instead of editing one.
          comment-id: ${{ steps.find-comment.outputs.comment-id }}
          issue-number: ${{ github.event.pull_request.number }}
          body-path: ${{ env.WORKING_DIR }}/comment_body
          edit-mode: replace
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ docs/build
.idea/
*.gguf
.venv
benchmarks/results
Empty file added benchmarks/__init__.py
Empty file.
20 changes: 20 additions & 0 deletions benchmarks/asv.conf.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"version": 1,
"project": "Outlines",
"project_url": "https://outlines-dev.github.io/outlines/",
"repo": "..",
"branches": [
"HEAD"
],
"build_command": [
"pip install .[test]",
"python -m build --wheel -o {build_cache_dir} {build_dir}"
],
"environment_type": "virtualenv",
"show_commit_url": "https://github.com/lapp0/outlines/commit/",
"benchmark_dir": ".",
"env_dir": "env",
"results_dir": "results",
"html_dir": "html",
"build_cache_size": 8
}
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
import pytest

import outlines

outlines.disable_cache()

from outlines.fsm.guide import RegexGuide # noqa: E402
from outlines.fsm.json_schema import build_regex_from_schema # noqa: E402

from .common import ( # noqa: E402
clear_outlines_cache,
ensure_numba_compiled,
setup_tokenizer,
)

simple_schema = """{
"$defs": {
"Armor": {
Expand Down Expand Up @@ -63,30 +67,21 @@
"required": ["id", "work", "recording_artists"]
}"""


schemas = dict(simple_schema=simple_schema, complex_schema=complex_schema)


@pytest.mark.parametrize("schema_name", schemas.keys())
def test_benchmark_json_schema_to_regex(benchmark, ensure_numba_compiled, schema_name):
"""Benchmark convert json schema to regex"""
schema = schemas[schema_name]
benchmark.pedantic(
build_regex_from_schema,
args=(schema,),
rounds=8,
)
class JsonSchemaBenchmark:
    """ASV benchmark: time converting a JSON schema to a regex pattern."""

    # ASV runs each benchmark once per entry in `params`; the value is
    # passed as `schema_name` to setup() and the time_* methods.
    params = schemas.keys()

    def setup(self, schema_name):
        # Start from a cold cache and a ready tokenizer; pre-compile the
        # numba helpers so their JIT cost is not attributed to the benchmark.
        clear_outlines_cache()
        self.tokenizer = setup_tokenizer()
        self.schema = schemas[schema_name]
        ensure_numba_compiled(self.tokenizer)

    def time_json_schema_to_regex(self, schema_name):
        # Benchmark convert json schema to regex.
        build_regex_from_schema(self.schema)

@pytest.mark.parametrize("schema_name", schemas.keys())
def test_benchmark_json_schema_to_fsm(
benchmark, tokenizer, ensure_numba_compiled, schema_name
):
"""Benchmark compile json schema as FSM"""
schema = schemas[schema_name]
regex = build_regex_from_schema(schema)
benchmark.pedantic(
RegexGuide,
args=(regex, tokenizer),
rounds=8,
)
    def time_json_schema_to_fsm(self, schema_name):
        # Benchmark the full schema -> regex -> RegexGuide (FSM) pipeline.
        regex = build_regex_from_schema(self.schema)
        RegexGuide(regex, self.tokenizer)
37 changes: 37 additions & 0 deletions benchmarks/bench_numba_compile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import importlib

import interegular
import numba

import outlines

from .common import clear_outlines_cache, setup_tokenizer

outlines.disable_cache()


class NumbaCompileBenchmark:
    """ASV benchmark: time numba JIT compilation of the FSM index builder.

    Disabling numba's on-disk cache and reloading the module forces a fresh
    compile on every run, so the measurement is the compile time itself.
    """

    def setup(self):
        clear_outlines_cache()
        from outlines.fsm import regex

        self.tokenizer = setup_tokenizer()
        self.regex = regex
        original_njit = numba.njit

        # Wrap numba.njit so every jitted function is built with cache=False,
        # preventing numba from reusing a previously compiled artifact.
        def mock_njit(*args, **kwargs):
            kwargs["cache"] = False
            return original_njit(*args, **kwargs)

        self.original_njit = original_njit
        numba.njit = mock_njit
        # NOTE: the reload must happen AFTER the monkeypatch so the module's
        # decorated functions are re-created through mock_njit.
        importlib.reload(self.regex)
        self.regex_pattern, _ = self.regex.make_deterministic_fsm(
            interegular.parse_pattern("a").to_fsm().reduce()
        )

    def teardown(self):
        # Restore the real numba.njit so other benchmarks are unaffected.
        numba.njit = self.original_njit

    def time_compile_numba(self):
        self.regex.create_fsm_index_tokenizer(self.regex_pattern, self.tokenizer)
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pytest

import outlines

from .common import clear_outlines_cache, ensure_numba_compiled, setup_tokenizer

outlines.disable_cache()

from outlines.fsm.guide import RegexGuide # noqa: E402
Expand All @@ -19,14 +19,27 @@
}


@pytest.mark.parametrize("regex_name", regex_samples.keys())
def test_benchmark_regex_to_fsm(
benchmark, tokenizer, ensure_numba_compiled, regex_name
):
"""Benchmark converting regex to FSM"""
regex_str = regex_samples[regex_name]
benchmark.pedantic(
RegexGuide,
args=(regex_str, tokenizer),
rounds=8,
)
class RegexGuideBenchmark:
    """ASV benchmark: time building a RegexGuide from each sample pattern."""

    # One benchmark run per sample regex; the key is passed as `pattern_name`.
    params = regex_samples.keys()

    def setup(self, pattern_name):
        # Cold cache + pre-compiled numba helpers so only guide construction
        # is measured.
        clear_outlines_cache()
        self.tokenizer = setup_tokenizer()
        ensure_numba_compiled(self.tokenizer)
        self.pattern = regex_samples[pattern_name]

    def time_regex_to_guide(self, pattern_name):
        RegexGuide(self.pattern, self.tokenizer)


class MemoryRegexGuideBenchmark:
    """ASV benchmark: peak memory of building a RegexGuide.

    Restricted to two representative patterns since peakmem runs are
    more expensive than timing runs.
    """

    params = ["simple_phone", "complex_span_constrained_relation_extraction"]

    def setup(self, pattern_name):
        clear_outlines_cache()
        self.tokenizer = setup_tokenizer()
        ensure_numba_compiled(self.tokenizer)
        self.pattern = regex_samples[pattern_name]

    def peakmem_regex_to_guide(self, pattern_name):
        # ASV records the peak resident memory reached during this call.
        RegexGuide(self.pattern, self.tokenizer)
10 changes: 6 additions & 4 deletions tests/benchmark/conftest.py → benchmarks/common.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
import pytest
from transformers import AutoTokenizer

import outlines.caching
from outlines.fsm.guide import RegexGuide
from outlines.models.transformers import TransformerTokenizer


@pytest.fixture
def tokenizer():
def clear_outlines_cache():
    """Clear the on-disk outlines cache so benchmarks start cold."""
    outlines.caching.clear_cache()


def setup_tokenizer():
    """Return a TransformerTokenizer wrapping the pretrained gpt2 tokenizer."""
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    return TransformerTokenizer(tokenizer)


@pytest.fixture
def ensure_numba_compiled(tokenizer):
    # Building a guide for a trivial pattern forces the numba-jitted FSM
    # helpers to compile once, keeping JIT cost out of timed benchmarks.
    RegexGuide("a", tokenizer)
    return True
Loading
Loading