Skip to content

fix: handle multipart MIME attachments in partition_email #124

fix: handle multipart MIME attachments in partition_email

fix: handle multipart MIME attachments in partition_email #124

name: Partition Benchmark

# Runs on every PR targeting main to detect regressions.
# Can also be triggered manually to establish or inspect a new baseline.
on:
  pull_request:
    branches: [main]
  workflow_dispatch:
# Restrict the workflow token to read-only access to repository contents.
permissions:
  contents: read
# Workflow-level environment, visible to every job and step below.
env:
  NLTK_DATA: ${{ github.workspace }}/nltk_data
  PYTHON_VERSION: "3.12"
  # Number of times to run the full benchmark suite.
  NUM_ITERATIONS: "3"
  # Regression threshold handed to compare_benchmark.py: 0.20 (20%) for now; tune later.
  REGRESSION_THRESHOLD: "0.20"
  # Part of the HuggingFace model cache key. Increment it when benchmark-affecting
  # dependencies are updated, so that runs start from a clean slate.
  CACHE_VERSION: "v2"
  # S3 location for metrics – matches core-product convention.
  S3_METRICS_BUCKET_KEY: utic-metrics/ci-metrics
  S3_BENCHMARK_PATH: open-source/partition-benchmark/benchmark_best.json
jobs:
  # Runs the repository-local base-cache action before the benchmark job starts.
  setup:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: ./.github/actions/base-cache
        with:
          python-version: ${{ env.PYTHON_VERSION }}

  # Runs the benchmark script, compares its output with the stored best result,
  # and publishes both JSON files.
  benchmark:
    name: Measure and compare partition() runtime
    runs-on: ubuntu-latest
    needs: [setup]
    steps:
      - uses: actions/checkout@v4
      - uses: ./.github/actions/base-cache
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install system dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y libmagic-dev poppler-utils libreoffice
          sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
          sudo apt-get update
          sudo apt-get install -y tesseract-ocr tesseract-ocr-kor

      # The primary key includes the commit SHA; the restore key falls back to the
      # newest cache saved under the same CACHE_VERSION. There is deliberately no
      # version-less fallback: restore keys are prefix matches, so a bare
      # "hf-models-<os>-" key would also restore caches from older CACHE_VERSIONs
      # and bumping CACHE_VERSION would never yield a clean slate.
      - name: Restore HuggingFace model cache
        uses: actions/cache/restore@v4
        with:
          path: ~/.cache/huggingface
          key: hf-models-${{ runner.os }}-${{ env.CACHE_VERSION }}-${{ github.sha }}
          restore-keys: |
            hf-models-${{ runner.os }}-${{ env.CACHE_VERSION }}-

      - name: Run partition benchmark
        env:
          NUM_ITERATIONS: ${{ env.NUM_ITERATIONS }}
        run: |
          uv run --no-sync python scripts/performance/benchmark_partition.py \
            benchmark_results.json

      # Saved under the same key that the restore step looks up.
      - name: Save HuggingFace model cache
        uses: actions/cache/save@v4
        with:
          path: ~/.cache/huggingface
          key: hf-models-${{ runner.os }}-${{ env.CACHE_VERSION }}-${{ github.sha }}

      # Best effort: a failed download does not fail the job.
      # NOTE(review): presumably compare_benchmark.py copes with a missing
      # benchmark_best.json (e.g. on the very first run) - confirm.
      - name: Download previous best from S3
        continue-on-error: true
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.S3_EVAL_ACCESS_KEY }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_EVAL_SECRET_KEY }}
        run: |
          aws s3 cp \
            "s3://${{ env.S3_METRICS_BUCKET_KEY }}/${{ env.S3_BENCHMARK_PATH }}" \
            benchmark_best.json

      # Arguments: current results, stored best, regression threshold.
      - name: Compare results against stored best
        id: compare
        run: |
          uv run --no-sync python scripts/performance/compare_benchmark.py \
            benchmark_results.json \
            benchmark_best.json \
            ${{ env.REGRESSION_THRESHOLD }}

      # Best effort: a failed upload does not fail the job.
      - name: Upload best result to S3
        continue-on-error: true
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.S3_EVAL_ACCESS_KEY }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_EVAL_SECRET_KEY }}
        run: |
          aws s3 cp \
            benchmark_best.json \
            "s3://${{ env.S3_METRICS_BUCKET_KEY }}/${{ env.S3_BENCHMARK_PATH }}"

      # always(): keep the JSON files for inspection even when an earlier step failed.
      - name: Upload benchmark artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results-${{ github.sha }}
          path: |
            benchmark_results.json
            benchmark_best.json
          retention-days: 30