-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMakefile
More file actions
94 lines (74 loc) · 3.49 KB
/
Copy pathMakefile
File metadata and controls
94 lines (74 loc) · 3.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# Interpreter used by the non-Docker targets: the project-local virtualenv's Python.
PYTHON := .venv/bin/python
# Benchmark entry-point script (relative path, resolved from the directory make runs in).
RUNNER := eval/parallel_benchmark.py
# ── Extractor: always OpenRouter (fast, accurate)
# ── Answer: Gemini Flash via OpenRouter (default) or set ANSWER_MODEL for local
# EXTRACT_ENV is prefixed to benchmark recipes as a shell environment assignment,
# so it only affects the single command it precedes.
# NOTE(review): the benchmark-local target documents OLLAMA_MODEL and sets
# ANSWER_PROVIDER, not ANSWER_MODEL — confirm which variable name is current.
EXTRACT_ENV := EXTRACTOR_PROVIDER=openrouter
# ---------------------------------------------------------------------------
# Benchmark runs
# ---------------------------------------------------------------------------
## Full benchmark: all 500 examples, variants E and A, 8 workers (~1.5 hours with OpenRouter extractor)
# One flag group per continuation line; the shell still sees a single command.
benchmark:
	$(EXTRACT_ENV) $(PYTHON) $(RUNNER) \
		--variant E A \
		--workers 8
## Fast development loop: Variant E only, limited to 60 examples, 6 workers (~10 min)
benchmark-quick:
	$(EXTRACT_ENV) $(PYTHON) $(RUNNER) \
		--variant E \
		--workers 6 \
		--n 60
## Full 500-example run restricted to Variant E, 8 workers
benchmark-e:
	$(EXTRACT_ENV) $(PYTHON) $(RUNNER) \
		--variant E \
		--workers 8
## Variant F (consolidation), 8 workers — intended to be run once Variant E has been validated
benchmark-f:
	$(EXTRACT_ENV) $(PYTHON) $(RUNNER) \
		--variant F \
		--workers 8
## Answer with a local LLM via Ollama (proves model-agnosticism) — set OLLAMA_MODEL before calling
## e.g.: OLLAMA_MODEL=llama3.1 make benchmark-local
# The extractor settings from EXTRACT_ENV are kept; only ANSWER_PROVIDER is
# switched to ollama for this one command. Runs Variant E with 4 workers.
benchmark-local:
	$(EXTRACT_ENV) ANSWER_PROVIDER=ollama \
		$(PYTHON) $(RUNNER) \
		--variant E \
		--workers 4
# ---------------------------------------------------------------------------
# Results
# ---------------------------------------------------------------------------
## List all previous runs
# Delegates entirely to the runner's --show-runs flag; no extractor environment is set.
runs:
	$(PYTHON) $(RUNNER) --show-runs
## Re-score the most recent run with F1 (no API calls)
# "Most recent" is the newest-by-mtime directory matching eval/results/runs/*/
# (ls -dt sorts newest first). The whole recipe is one shell command joined
# with backslashes so that the `latest` shell variable survives across lines.
# Exits 1 with "No runs found" when the glob matches nothing.
rescore-f1:
	@latest=$$(ls -dt eval/results/runs/*/ 2>/dev/null | head -n 1); \
	if [ -z "$$latest" ]; then echo "No runs found"; exit 1; fi; \
	echo "Re-scoring $$latest with F1..."; \
	$(PYTHON) $(RUNNER) --rescore f1 --run-dir "$$latest"  # quoted: path may contain spaces
## Print results from the most recent run
# Picks the newest-by-mtime directory matching eval/results/runs/*/ and hands
# it to the runner's --show-run flag. The recipe is a single shell command
# (backslash-joined) so the `latest` shell variable is visible on every line.
# Exits 1 with "No runs found" when no run directory exists.
results:
	@latest=$$(ls -dt eval/results/runs/*/ 2>/dev/null | head -n 1); \
	if [ -z "$$latest" ]; then echo "No runs found"; exit 1; fi; \
	$(PYTHON) $(RUNNER) --show-run "$$latest"  # quoted: path may contain spaces
# ---------------------------------------------------------------------------
# Maintenance
# ---------------------------------------------------------------------------
# Number of newest run directories that clean-old-runs preserves.
# Override per invocation, e.g.: make clean-old-runs KEEP_RUNS=10
KEEP_RUNS ?= 5
## Remove all but the last KEEP_RUNS runs (default 5; keeps disk tidy)
# Runs are ordered newest-first by mtime (ls -dt); everything after the first
# KEEP_RUNS entries is deleted. KEEP_RUNS is validated first because an empty
# or non-numeric value would otherwise make `tail` start at line 1 (deleting
# every run) or fail. Directories are removed through a quoted `read` loop so
# names containing spaces, quotes or backslashes are handled literally.
clean-old-runs:
	@case "$(KEEP_RUNS)" in \
		''|*[!0-9]*) echo "KEEP_RUNS must be a non-negative integer" >&2; exit 1 ;; \
	esac; \
	count=$$(ls -dt eval/results/runs/*/ 2>/dev/null | wc -l | tr -d ' '); \
	echo "Found $$count runs"; \
	ls -dt eval/results/runs/*/ 2>/dev/null | tail -n +$$(( $(KEEP_RUNS) + 1 )) | \
	while IFS= read -r run_dir; do rm -rf -- "$$run_dir"; done && \
	echo "Kept last $(KEEP_RUNS) runs"
## Re-index codebase in jcodemunch after changes
# This target only prints a reminder; the indexing itself is triggered from
# Claude Code. Plain `echo` is used so the hint works even when the
# virtualenv interpreter has not been created yet.
index:
	@echo "Run: mcp__jcodemunch__index_folder in Claude Code"
# ---------------------------------------------------------------------------
# Docker
# ---------------------------------------------------------------------------
## Build the Docker image (re-run after requirements.txt changes)
# Builds whatever services the project's compose file defines; takes no arguments here.
docker-build:
	docker compose build
## Smoke test inside Docker: 8 examples, 2 workers, E + A (~2 min)
# Runs the shared $(RUNNER) script with the `python` found inside the
# `benchmark` compose service (not the host $(PYTHON)); --rm removes the
# container once the command exits.
# NOTE(review): unlike the host targets, EXTRACTOR_PROVIDER is not set here —
# presumably the compose service supplies it; confirm.
docker-test:
	docker compose run --rm benchmark \
		python $(RUNNER) --variant E A --n 8 --workers 2
## Full 500-example benchmark inside Docker (same as `make benchmark` but isolated)
# Same runner flags as the host `benchmark` target (variants E and A, 8 workers),
# executed with the `python` inside the `benchmark` compose service. The script
# path comes from $(RUNNER) so host and Docker targets cannot drift apart.
# NOTE(review): EXTRACTOR_PROVIDER is not set on this command line —
# presumably provided by the compose service; confirm.
docker-benchmark:
	docker compose run --rm benchmark \
		python $(RUNNER) --variant E A --workers 8
## Quick dev run inside Docker: 60 examples, E only
# Mirrors the flags of the host `benchmark-quick` target (Variant E, 6 workers,
# --n 60) but runs in a throwaway container of the `benchmark` compose service.
# The script path is taken from $(RUNNER) rather than repeated literally.
# NOTE(review): EXTRACTOR_PROVIDER is not set on this command line —
# presumably provided by the compose service; confirm.
docker-quick:
	docker compose run --rm benchmark \
		python $(RUNNER) --variant E --workers 6 --n 60
# Every target in this file is a command, not a file to build, so all of them
# are declared phony. Repeated .PHONY lines accumulate; one line per section.
.PHONY: benchmark benchmark-quick benchmark-e benchmark-f benchmark-local
.PHONY: runs rescore-f1 results
.PHONY: clean-old-runs index
.PHONY: docker-build docker-test docker-benchmark docker-quick