Skip to content

Commit 16482f9

Browse files
authored
Add Github action for time regressions (#4261)
1. Adds a new action for testing the time to partition over a set number of documents. 2. Changes from time.time() to time.perf_counter()
1 parent afbda95 commit 16482f9

File tree

5 files changed

+392
-1
lines changed

5 files changed

+392
-1
lines changed
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
name: Partition Benchmark

# Runs on every PR targeting main to detect regressions.
# Can also be triggered manually to establish or inspect a new baseline.
on:
  pull_request:
    branches: [main]
  workflow_dispatch:

permissions:
  contents: read

env:
  NLTK_DATA: ${{ github.workspace }}/nltk_data
  PYTHON_VERSION: "3.12"
  # Number of times to run the full benchmark suite.
  NUM_ITERATIONS: "3"
  # 20% threshold for now and tune later
  REGRESSION_THRESHOLD: "0.20"
  # Increment to change cache key when benchmark-affecting dependencies are
  # updated, to ensure clean slate runs.
  CACHE_VERSION: "v2"
  # S3 location for metrics – matches core-product convention.
  S3_METRICS_BUCKET_KEY: utic-metrics/ci-metrics
  S3_BENCHMARK_PATH: open-source/partition-benchmark/benchmark_best.json

jobs:
  # Warms the dependency cache so the benchmark job starts from a primed state.
  setup:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: ./.github/actions/base-cache
        with:
          python-version: ${{ env.PYTHON_VERSION }}

  benchmark:
    name: Measure and compare partition() runtime
    runs-on: ubuntu-latest
    needs: [setup]

    steps:
      - uses: actions/checkout@v4

      - uses: ./.github/actions/base-cache
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install system dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y libmagic-dev poppler-utils libreoffice
          sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
          sudo apt-get update
          sudo apt-get install -y tesseract-ocr tesseract-ocr-kor

      - name: Restore HuggingFace model cache
        uses: actions/cache/restore@v4
        with:
          path: ~/.cache/huggingface
          key: hf-models-${{ runner.os }}-${{ env.CACHE_VERSION }}-${{ github.sha }}
          restore-keys: |
            hf-models-${{ runner.os }}-${{ env.CACHE_VERSION }}-
            hf-models-${{ runner.os }}-

      # NUM_ITERATIONS is inherited from the workflow-level env block; no
      # step-level re-declaration is needed.
      - name: Run partition benchmark
        run: |
          uv run --no-sync python scripts/performance/benchmark_partition.py \
            benchmark_results.json

      - name: Save HuggingFace model cache
        uses: actions/cache/save@v4
        with:
          path: ~/.cache/huggingface
          key: hf-models-${{ runner.os }}-${{ env.CACHE_VERSION }}-${{ github.sha }}

      # continue-on-error: the baseline may not exist yet on the first run;
      # compare_benchmark.py handles a missing best file.
      - name: Download previous best from S3
        continue-on-error: true
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.S3_EVAL_ACCESS_KEY }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_EVAL_SECRET_KEY }}
        run: |
          aws s3 cp \
            "s3://${{ env.S3_METRICS_BUCKET_KEY }}/${{ env.S3_BENCHMARK_PATH }}" \
            benchmark_best.json

      - name: Compare results against stored best
        id: compare
        run: |
          uv run --no-sync python scripts/performance/compare_benchmark.py \
            benchmark_results.json \
            benchmark_best.json \
            ${{ env.REGRESSION_THRESHOLD }}

      # Only push when the compare step actually recorded a new best; an
      # unconditional upload would rewrite the unchanged baseline on every PR.
      - name: Upload best result to S3
        if: steps.compare.outputs.new_best == 'true'
        continue-on-error: true
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.S3_EVAL_ACCESS_KEY }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_EVAL_SECRET_KEY }}
        run: |
          aws s3 cp \
            benchmark_best.json \
            "s3://${{ env.S3_METRICS_BUCKET_KEY }}/${{ env.S3_BENCHMARK_PATH }}"

      - name: Upload benchmark artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results-${{ github.sha }}
          path: |
            benchmark_results.json
            benchmark_best.json
          retention-days: 30

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
## 0.21.4
2+
- Add a GitHub Action for testing time regressions
3+
14
## 0.21.3
25

36
### Enhancements
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
#!/usr/bin/env python3
"""Measure partition() runtime over a fixed set of representative example-docs files.

Follows the same conventions as the existing scripts/performance tooling:
- PDFs and images are run with strategy="hi_res".
- Everything else is run with strategy="fast".
- Each file is timed over NUM_ITERATIONS runs (after a warmup) and the
  average is recorded, matching time_partition.py behaviour.

Writes a JSON file mapping each file to its average runtime, plus a ``__total__``
key with the wall-clock total. An optional positional argument sets the output
path (default: scripts/performance/partition-speed-test/benchmark_results.json).

Also writes the total duration to $GITHUB_OUTPUT as ``duration=<seconds>``.

Usage:
    uv run --no-sync python scripts/performance/benchmark_partition.py [output.json]

Environment variables:
    NUM_ITERATIONS    number of timed iterations per file (default: 1)
"""

from __future__ import annotations

import json
import logging
import os
import sys
import time
from pathlib import Path

from unstructured.partition.auto import partition

logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)


# (path relative to the repo root, partition strategy) pairs to benchmark.
BENCHMARK_FILES: list[tuple[str, str]] = [
    # PDFs - hi_res
    ("example-docs/pdf/a1977-backus-p21.pdf", "hi_res"),
    ("example-docs/pdf/copy-protected.pdf", "hi_res"),
    ("example-docs/pdf/reliance.pdf", "hi_res"),
    ("example-docs/pdf/pdf-with-ocr-text.pdf", "hi_res"),
    # Images - hi_res
    ("example-docs/double-column-A.jpg", "hi_res"),
    ("example-docs/double-column-B.jpg", "hi_res"),
    ("example-docs/embedded-images-tables.jpg", "hi_res"),
    # Other document types - fast
    ("example-docs/contains-pictures.docx", "fast"),
    ("example-docs/example-10k-1p.html", "fast"),
    ("example-docs/science-exploration-1p.pptx", "fast"),
]

# Timed iterations per file; the CI workflow overrides this via the environment.
NUM_ITERATIONS: int = int(os.environ.get("NUM_ITERATIONS", "1"))

# Default output location when no positional argument is given.
DEFAULT_OUTPUT = Path(__file__).parent / "partition-speed-test" / "benchmark_results.json"
57+
58+
59+
def _warmup(filepath: str) -> None:
    """Prime the process (lazy imports, model loads) before timing a file.

    Mirrors warm_up_process() in time_partition.py: prefers a small file in
    warmup-docs/ with the same suffix as *filepath*, falling back to the
    target file itself when no warmup variant exists.
    """
    candidate = Path(__file__).parent / "warmup-docs" / f"warmup{Path(filepath).suffix}"
    partition(str(candidate) if candidate.exists() else filepath, strategy="fast")
70+
71+
72+
def _measure(filepath: str, strategy: str, iterations: int) -> float:
    """Return the average wall-clock seconds to partition *filepath*.

    Identical logic to time_partition.measure_execution_time(): each
    iteration is timed with perf_counter and the mean is returned.
    """
    elapsed = 0.0
    for _ in range(iterations):
        start = time.perf_counter()
        partition(filepath, strategy=strategy)
        stop = time.perf_counter()
        elapsed += stop - start
    return elapsed / iterations
84+
85+
86+
def _set_github_output(key: str, value: str) -> None:
87+
"""Write key=value to $GITHUB_OUTPUT when running in Actions."""
88+
gho = os.environ.get("GITHUB_OUTPUT")
89+
if gho:
90+
with open(gho, "a") as fh:
91+
fh.write(f"{key}={value}\n")
92+
93+
94+
def main() -> None:
    """Run the benchmark suite and write per-file/total runtimes as JSON.

    Output path comes from argv[1] when given, else DEFAULT_OUTPUT. Missing
    benchmark files are skipped with a warning rather than failing the run.
    """
    output_path = Path(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_OUTPUT
    # scripts/performance/benchmark_partition.py -> repo root is three levels up.
    repo_root = Path(__file__).resolve().parent.parent.parent

    banner = "=" * 60
    logger.info(banner)
    logger.info(f"Partition benchmark (NUM_ITERATIONS={NUM_ITERATIONS})")
    logger.info(banner)

    results: dict[str, float] = {}
    suite_start = time.perf_counter()

    for rel_path, strategy in BENCHMARK_FILES:
        doc_path = repo_root / rel_path
        if not doc_path.exists():
            logger.warning(f"  WARNING: {rel_path} not found – skipping.")
            continue

        logger.info(f"  {rel_path} (strategy={strategy}, iterations={NUM_ITERATIONS})")
        _warmup(str(doc_path))
        average = _measure(str(doc_path), strategy, NUM_ITERATIONS)
        results[rel_path] = round(average, 4)
        logger.info(f"    avg {average:.2f}s")

    total_seconds = round(time.perf_counter() - suite_start, 2)
    results["__total__"] = total_seconds

    logger.info(f"\nTotal wall-clock time: {total_seconds}s")

    # Write JSON results file (consumed by compare_benchmark.py)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(results, indent=2) + "\n")
    logger.info(f"Results written to {output_path}")

    # Also expose total as a GitHub Actions step output
    _set_github_output("duration", str(int(total_seconds)))


if __name__ == "__main__":
    main()
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
#!/usr/bin/env python3
"""Compare current benchmark results against the stored best runtime.

Usage:
    uv run --no-sync python scripts/performance/compare_benchmark.py \
        benchmark_results.json \
        benchmark_best.json \
        [threshold]

current.json    JSON produced by benchmark_partition.py for this run
best.json       JSON produced by a previous run (the stored best); may not
                exist yet on the very first run
threshold       Float regression allowance, e.g. 0.20 for 20% (default 0.20)
"""

from __future__ import annotations

import json
import logging
import math
import os
import sys
from pathlib import Path

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger = logging.getLogger(__name__)
27+
28+
29+
def _github_output(key: str, value: str) -> None:
30+
"""Write a key=value pair to $GITHUB_OUTPUT when running in Actions."""
31+
gho = os.environ.get("GITHUB_OUTPUT")
32+
if gho:
33+
with open(gho, "a") as fh:
34+
fh.write(f"{key}={value}\n")
35+
36+
37+
def _fmt(seconds: float) -> str:
38+
"""Format a duration, handling NaN for files missing from one side."""
39+
if math.isnan(seconds):
40+
return " n/a"
41+
return f"{seconds:7.2f}s"
42+
43+
44+
def _pct_diff(current: float, best: float) -> str:
45+
if best == 0:
46+
return " n/a"
47+
diff = (current - best) / best * 100
48+
sign = "+" if diff >= 0 else ""
49+
return f"{sign}{diff:.1f}%"
50+
51+
52+
def main() -> None:
    """Compare the current run against the stored best and gate on regression.

    Exit codes:
        0 - pass: baseline created, new best recorded, or within threshold
        1 - fail: total runtime regressed beyond the threshold
        2 - usage error (missing arguments)

    Side effects: may overwrite best.json with the current results, and writes
    ``new_best`` / ``regression`` step outputs for the calling workflow.
    """
    if len(sys.argv) < 3:
        print(__doc__, file=sys.stderr)
        sys.exit(2)

    current_path = Path(sys.argv[1])
    best_path = Path(sys.argv[2])
    threshold = float(sys.argv[3]) if len(sys.argv) > 3 else 0.20

    current: dict[str, float] = json.loads(current_path.read_text())
    current_total: float = current["__total__"]

    # Load the stored best, treating a missing *or unreadable* file as "no
    # baseline yet" so a corrupt/partial S3 download re-baselines instead of
    # crashing the job with JSONDecodeError/KeyError.
    best: dict[str, float] | None = None
    if best_path.exists():
        try:
            candidate = json.loads(best_path.read_text())
            float(candidate["__total__"])  # validate the sentinel is present and numeric
            best = candidate
        except (json.JSONDecodeError, KeyError, TypeError, ValueError):
            logger.warning("Stored best file is unreadable - treating it as missing.")

    if best is None:
        logger.info("No stored best found – saving current run as the baseline.")
        logger.info(f"  Total: {current_total:.2f}s")
        best_path.parent.mkdir(parents=True, exist_ok=True)
        best_path.write_text(current_path.read_text())
        _github_output("new_best", "true")
        _github_output("regression", "false")
        sys.exit(0)

    best_total: float = best["__total__"]
    limit: float = best_total * (1.0 + threshold)

    # Collect all file keys (exclude the __total__ sentinel)
    all_files = sorted((set(current.keys()) | set(best.keys())) - {"__total__"})

    col_w = max((len(f) for f in all_files), default=40) + 2
    header = f"{'File':<{col_w}} {'Current':>9} {'Best':>9} {'Delta':>8}"
    logger.info("=" * len(header))
    logger.info("Partition benchmark comparison")
    logger.info("=" * len(header))
    logger.info(header)
    logger.info("-" * len(header))

    # Files missing on one side show up as NaN and render as "n/a".
    for fname in all_files:
        c = current.get(fname, float("nan"))
        b = best.get(fname, float("nan"))
        logger.info(f"{fname:<{col_w}} {_fmt(c)} {_fmt(b)} {_pct_diff(c, b):>8}")

    logger.info("-" * len(header))
    logger.info(
        f"{'TOTAL':<{col_w}} {_fmt(current_total)} {_fmt(best_total)}"
        f" {_pct_diff(current_total, best_total):>8}"
    )
    logger.info("")
    logger.info(f"Threshold : {threshold * 100:.0f}% (fail if current > {limit:.2f}s)")
    logger.info("")

    # fail on regression beyond threshold
    if current_total > limit:
        excess_pct = (current_total - best_total) / best_total * 100
        logger.error(
            f"FAIL: current runtime {current_total:.2f}s exceeds best "
            f"{best_total:.2f}s by {excess_pct:.1f}% "
            f"(threshold {threshold * 100:.0f}%, limit {limit:.2f}s)."
        )
        _github_output("new_best", "false")
        _github_output("regression", "true")
        sys.exit(1)

    # pass: current is within threshold of best; update best if current is faster
    if current_total < best_total:
        improvement_pct = (best_total - current_total) / best_total * 100
        logger.info(
            f"PASS (new best): {current_total:.2f}s is {improvement_pct:.1f}% "
            f"faster than the previous best {best_total:.2f}s – updating in S3."
        )
        best_path.write_text(current_path.read_text())
        _github_output("new_best", "true")
    else:
        slack_pct = (current_total - best_total) / best_total * 100
        logger.info(
            f"PASS: {current_total:.2f}s is {slack_pct:.1f}% slower than best "
            f"{best_total:.2f}s (within {threshold * 100:.0f}% threshold)."
        )
        _github_output("new_best", "false")

    _github_output("regression", "false")
    sys.exit(0)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)