Skip to content

Commit 16482f9

Browse files
authored
Add Github action for time regressions (#4261)
1. Adds a new action for testing the time to partition over a set number of documents. 2. Changes from time.time() to time.perf_counter()
1 parent afbda95 commit 16482f9

File tree

5 files changed

+392
-1
lines changed

5 files changed

+392
-1
lines changed
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
name: Partition Benchmark

# Runs on every PR targeting main to detect regressions.
# Can also be triggered manually to establish or inspect a new baseline.
on:
  pull_request:
    branches: [main]
  workflow_dispatch:

permissions:
  contents: read

env:
  NLTK_DATA: ${{ github.workspace }}/nltk_data
  PYTHON_VERSION: "3.12"
  # Number of times to run the full benchmark suite.
  NUM_ITERATIONS: "3"
  # 20% threshold for now and tune later
  REGRESSION_THRESHOLD: "0.20"
  # Increment to change cache key when benchmark-affecting dependencies are
  # updated, to ensure clean slate runs.
  CACHE_VERSION: "v2"
  # S3 location for metrics – matches core-product convention.
  S3_METRICS_BUCKET_KEY: utic-metrics/ci-metrics
  S3_BENCHMARK_PATH: open-source/partition-benchmark/benchmark_best.json

jobs:
  # Warms the dependency cache so the benchmark job starts from a primed state.
  setup:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: ./.github/actions/base-cache
        with:
          python-version: ${{ env.PYTHON_VERSION }}

  benchmark:
    name: Measure and compare partition() runtime
    runs-on: ubuntu-latest
    needs: [setup]

    steps:
      - uses: actions/checkout@v4

      - uses: ./.github/actions/base-cache
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install system dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y libmagic-dev poppler-utils libreoffice
          sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
          sudo apt-get update
          sudo apt-get install -y tesseract-ocr tesseract-ocr-kor

      - name: Restore HuggingFace model cache
        uses: actions/cache/restore@v4
        with:
          path: ~/.cache/huggingface
          key: hf-models-${{ runner.os }}-${{ env.CACHE_VERSION }}-${{ github.sha }}
          restore-keys: |
            hf-models-${{ runner.os }}-${{ env.CACHE_VERSION }}-
            hf-models-${{ runner.os }}-

      # NUM_ITERATIONS is inherited from the workflow-level env block; no
      # step-level re-declaration is needed.
      - name: Run partition benchmark
        run: |
          uv run --no-sync python scripts/performance/benchmark_partition.py \
            benchmark_results.json

      - name: Save HuggingFace model cache
        uses: actions/cache/save@v4
        with:
          path: ~/.cache/huggingface
          key: hf-models-${{ runner.os }}-${{ env.CACHE_VERSION }}-${{ github.sha }}

      # continue-on-error: the baseline may not exist yet on the first run;
      # compare_benchmark.py handles a missing best file.
      - name: Download previous best from S3
        continue-on-error: true
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.S3_EVAL_ACCESS_KEY }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_EVAL_SECRET_KEY }}
        run: |
          aws s3 cp \
            "s3://${{ env.S3_METRICS_BUCKET_KEY }}/${{ env.S3_BENCHMARK_PATH }}" \
            benchmark_best.json

      - name: Compare results against stored best
        id: compare
        run: |
          uv run --no-sync python scripts/performance/compare_benchmark.py \
            benchmark_results.json \
            benchmark_best.json \
            ${{ env.REGRESSION_THRESHOLD }}

      # Only push when the compare step actually recorded a new best; an
      # unconditional upload would rewrite the unchanged baseline on every PR.
      - name: Upload best result to S3
        if: steps.compare.outputs.new_best == 'true'
        continue-on-error: true
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.S3_EVAL_ACCESS_KEY }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_EVAL_SECRET_KEY }}
        run: |
          aws s3 cp \
            benchmark_best.json \
            "s3://${{ env.S3_METRICS_BUCKET_KEY }}/${{ env.S3_BENCHMARK_PATH }}"

      - name: Upload benchmark artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results-${{ github.sha }}
          path: |
            benchmark_results.json
            benchmark_best.json
          retention-days: 30

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
## 0.21.4
2+
- Add a GitHub Action for testing time regressions
3+
14
## 0.21.3
25

36
### Enhancements
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
#!/usr/bin/env python3
"""Measure partition() runtime over a fixed set of representative example-docs files.

Follows the same conventions as the existing scripts/performance tooling:
- PDFs and images are run with strategy="hi_res".
- Everything else is run with strategy="fast".
- Each file is timed over NUM_ITERATIONS runs (after a warmup) and the
  average is recorded, matching time_partition.py behaviour.

Writes a JSON file mapping each file to its average runtime, plus a ``__total__``
key with the wall-clock total. An optional positional argument sets the output
path (default: scripts/performance/partition-speed-test/benchmark_results.json).

Also writes the total duration to $GITHUB_OUTPUT as ``duration=<seconds>``.

Usage:
    uv run --no-sync python scripts/performance/benchmark_partition.py [output.json]

Environment variables:
    NUM_ITERATIONS    number of timed iterations per file (default: 1)
"""

from __future__ import annotations

import json
import logging
import os
import sys
import time
from pathlib import Path

from unstructured.partition.auto import partition

logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)


# (path relative to the repo root, partition strategy) pairs to benchmark.
BENCHMARK_FILES: list[tuple[str, str]] = [
    # PDFs - hi_res
    ("example-docs/pdf/a1977-backus-p21.pdf", "hi_res"),
    ("example-docs/pdf/copy-protected.pdf", "hi_res"),
    ("example-docs/pdf/reliance.pdf", "hi_res"),
    ("example-docs/pdf/pdf-with-ocr-text.pdf", "hi_res"),
    # Images - hi_res
    ("example-docs/double-column-A.jpg", "hi_res"),
    ("example-docs/double-column-B.jpg", "hi_res"),
    ("example-docs/embedded-images-tables.jpg", "hi_res"),
    # Other document types - fast
    ("example-docs/contains-pictures.docx", "fast"),
    ("example-docs/example-10k-1p.html", "fast"),
    ("example-docs/science-exploration-1p.pptx", "fast"),
]

# Timed iterations per file; the CI workflow overrides this via the environment.
NUM_ITERATIONS: int = int(os.environ.get("NUM_ITERATIONS", "1"))

# Default output location when no positional argument is given.
DEFAULT_OUTPUT = Path(__file__).parent / "partition-speed-test" / "benchmark_results.json"
57+
58+
59+
def _warmup(filepath: str) -> None:
    """Prime the process (lazy imports, model loads) before timing a file.

    Mirrors warm_up_process() in time_partition.py: prefers a small file in
    warmup-docs/ with the same suffix as *filepath*, falling back to the
    target file itself when no warmup variant exists.
    """
    candidate = Path(__file__).parent / "warmup-docs" / f"warmup{Path(filepath).suffix}"
    partition(str(candidate) if candidate.exists() else filepath, strategy="fast")
70+
71+
72+
def _measure(filepath: str, strategy: str, iterations: int) -> float:
    """Return the average wall-clock seconds to partition *filepath*.

    Identical logic to time_partition.measure_execution_time(): each
    iteration is timed with perf_counter and the mean is returned.
    """
    elapsed = 0.0
    for _ in range(iterations):
        start = time.perf_counter()
        partition(filepath, strategy=strategy)
        stop = time.perf_counter()
        elapsed += stop - start
    return elapsed / iterations
84+
85+
86+
def _set_github_output(key: str, value: str) -> None:
87+
"""Write key=value to $GITHUB_OUTPUT when running in Actions."""
88+
gho = os.environ.get("GITHUB_OUTPUT")
89+
if gho:
90+
with open(gho, "a") as fh:
91+
fh.write(f"{key}={value}\n")
92+
93+
94+
def main() -> None:
    """Run the benchmark suite and write per-file/total runtimes as JSON.

    Output path comes from argv[1] when given, else DEFAULT_OUTPUT. Missing
    benchmark files are skipped with a warning rather than failing the run.
    """
    output_path = Path(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_OUTPUT
    # scripts/performance/benchmark_partition.py -> repo root is three levels up.
    repo_root = Path(__file__).resolve().parent.parent.parent

    banner = "=" * 60
    logger.info(banner)
    logger.info(f"Partition benchmark (NUM_ITERATIONS={NUM_ITERATIONS})")
    logger.info(banner)

    results: dict[str, float] = {}
    suite_start = time.perf_counter()

    for rel_path, strategy in BENCHMARK_FILES:
        doc_path = repo_root / rel_path
        if not doc_path.exists():
            logger.warning(f"  WARNING: {rel_path} not found – skipping.")
            continue

        logger.info(f"  {rel_path} (strategy={strategy}, iterations={NUM_ITERATIONS})")
        _warmup(str(doc_path))
        average = _measure(str(doc_path), strategy, NUM_ITERATIONS)
        results[rel_path] = round(average, 4)
        logger.info(f"    avg {average:.2f}s")

    total_seconds = round(time.perf_counter() - suite_start, 2)
    results["__total__"] = total_seconds

    logger.info(f"\nTotal wall-clock time: {total_seconds}s")

    # Write JSON results file (consumed by compare_benchmark.py)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(results, indent=2) + "\n")
    logger.info(f"Results written to {output_path}")

    # Also expose total as a GitHub Actions step output
    _set_github_output("duration", str(int(total_seconds)))


if __name__ == "__main__":
    main()
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
#!/usr/bin/env python3
"""Compare current benchmark results against the stored best runtime.

Usage:
    uv run --no-sync python scripts/performance/compare_benchmark.py \
        benchmark_results.json \
        benchmark_best.json \
        [threshold]

current.json    JSON produced by benchmark_partition.py for this run
best.json       JSON produced by a previous run (the stored best); may not
                exist yet on the very first run
threshold       Float regression allowance, e.g. 0.20 for 20% (default 0.20)
"""

from __future__ import annotations

import json
import logging
import math
import os
import sys
from pathlib import Path

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger = logging.getLogger(__name__)
27+
28+
29+
def _github_output(key: str, value: str) -> None:
30+
"""Write a key=value pair to $GITHUB_OUTPUT when running in Actions."""
31+
gho = os.environ.get("GITHUB_OUTPUT")
32+
if gho:
33+
with open(gho, "a") as fh:
34+
fh.write(f"{key}={value}\n")
35+
36+
37+
def _fmt(seconds: float) -> str:
38+
"""Format a duration, handling NaN for files missing from one side."""
39+
if math.isnan(seconds):
40+
return " n/a"
41+
return f"{seconds:7.2f}s"
42+
43+
44+
def _pct_diff(current: float, best: float) -> str:
45+
if best == 0:
46+
return " n/a"
47+
diff = (current - best) / best * 100
48+
sign = "+" if diff >= 0 else ""
49+
return f"{sign}{diff:.1f}%"
50+
51+
52+
def main() -> None:
    """Compare the current run against the stored best and gate on regression.

    Exit codes:
        0 - pass: baseline created, new best recorded, or within threshold
        1 - fail: total runtime regressed beyond the threshold
        2 - usage error (missing arguments)

    Side effects: may overwrite best.json with the current results, and writes
    ``new_best`` / ``regression`` step outputs for the calling workflow.
    """
    if len(sys.argv) < 3:
        print(__doc__, file=sys.stderr)
        sys.exit(2)

    current_path = Path(sys.argv[1])
    best_path = Path(sys.argv[2])
    threshold = float(sys.argv[3]) if len(sys.argv) > 3 else 0.20

    current: dict[str, float] = json.loads(current_path.read_text())
    current_total: float = current["__total__"]

    # Load the stored best, treating a missing *or unreadable* file as "no
    # baseline yet" so a corrupt/partial S3 download re-baselines instead of
    # crashing the job with JSONDecodeError/KeyError.
    best: dict[str, float] | None = None
    if best_path.exists():
        try:
            candidate = json.loads(best_path.read_text())
            float(candidate["__total__"])  # validate the sentinel is present and numeric
            best = candidate
        except (json.JSONDecodeError, KeyError, TypeError, ValueError):
            logger.warning("Stored best file is unreadable - treating it as missing.")

    if best is None:
        logger.info("No stored best found – saving current run as the baseline.")
        logger.info(f"  Total: {current_total:.2f}s")
        best_path.parent.mkdir(parents=True, exist_ok=True)
        best_path.write_text(current_path.read_text())
        _github_output("new_best", "true")
        _github_output("regression", "false")
        sys.exit(0)

    best_total: float = best["__total__"]
    limit: float = best_total * (1.0 + threshold)

    # Collect all file keys (exclude the __total__ sentinel)
    all_files = sorted((set(current.keys()) | set(best.keys())) - {"__total__"})

    col_w = max((len(f) for f in all_files), default=40) + 2
    header = f"{'File':<{col_w}} {'Current':>9} {'Best':>9} {'Delta':>8}"
    logger.info("=" * len(header))
    logger.info("Partition benchmark comparison")
    logger.info("=" * len(header))
    logger.info(header)
    logger.info("-" * len(header))

    # Files missing on one side show up as NaN and render as "n/a".
    for fname in all_files:
        c = current.get(fname, float("nan"))
        b = best.get(fname, float("nan"))
        logger.info(f"{fname:<{col_w}} {_fmt(c)} {_fmt(b)} {_pct_diff(c, b):>8}")

    logger.info("-" * len(header))
    logger.info(
        f"{'TOTAL':<{col_w}} {_fmt(current_total)} {_fmt(best_total)}"
        f" {_pct_diff(current_total, best_total):>8}"
    )
    logger.info("")
    logger.info(f"Threshold : {threshold * 100:.0f}% (fail if current > {limit:.2f}s)")
    logger.info("")

    # fail on regression beyond threshold
    if current_total > limit:
        excess_pct = (current_total - best_total) / best_total * 100
        logger.error(
            f"FAIL: current runtime {current_total:.2f}s exceeds best "
            f"{best_total:.2f}s by {excess_pct:.1f}% "
            f"(threshold {threshold * 100:.0f}%, limit {limit:.2f}s)."
        )
        _github_output("new_best", "false")
        _github_output("regression", "true")
        sys.exit(1)

    # pass: current is within threshold of best; update best if current is faster
    if current_total < best_total:
        improvement_pct = (best_total - current_total) / best_total * 100
        logger.info(
            f"PASS (new best): {current_total:.2f}s is {improvement_pct:.1f}% "
            f"faster than the previous best {best_total:.2f}s – updating in S3."
        )
        best_path.write_text(current_path.read_text())
        _github_output("new_best", "true")
    else:
        slack_pct = (current_total - best_total) / best_total * 100
        logger.info(
            f"PASS: {current_total:.2f}s is {slack_pct:.1f}% slower than best "
            f"{best_total:.2f}s (within {threshold * 100:.0f}% threshold)."
        )
        _github_output("new_best", "false")

    _github_output("regression", "false")
    sys.exit(0)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)