Skip to content
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
c844ba3
Wayback Slider Working Version
nnt-git13 Oct 6, 2025
5cbe40a
Fixed Calendar and Unit Test Added
nnt-git13 Oct 14, 2025
af483bc
Merge branch 'master' into new_webUROP_2
KartikP Oct 14, 2025
30f465e
Fixed Unit test
nnt-git13 Oct 20, 2025
23fa0e3
Merge branch 'new_webUROP_2' of https://github.com/brain-score/brain-…
nnt-git13 Oct 20, 2025
9fe5e01
Merge branch 'master' into new_webUROP_2
KartikP Oct 20, 2025
c265f0b
Headless_True
nnt-git13 Oct 20, 2025
214d2a4
Remove Sleep Time
nnt-git13 Oct 20, 2025
8564da2
Small change for date box
nnt-git13 Nov 4, 2025
0d0534a
Merge branch 'master' into new_webUROP_2
KartikP Nov 12, 2025
60f5895
Wayback Filter Added to Export
nnt-git13 Nov 20, 2025
0fe88d1
Merge branch 'new_webUROP_2' of https://github.com/brain-score/brain-…
nnt-git13 Nov 20, 2025
dc1bc21
Merge branch 'master' into new_webUROP_2
KartikP Nov 20, 2025
cb23dc2
Addressed code-review changes
nnt-git13 Dec 8, 2025
4538d62
Merge branch 'new_webUROP_2' of https://github.com/brain-score/brain-…
nnt-git13 Dec 8, 2025
5a6cfab
Try testing without ranks
nnt-git13 Dec 9, 2025
dc4146c
clean up
KartikP Dec 15, 2025
55f8070
reset slider handle after reset button press
KartikP Dec 15, 2025
761389f
Move wayback timestamp slider to first column and adjust input box width
KartikP Dec 15, 2025
0509932
Merge branch 'master' into new_webUROP_2
KartikP Dec 15, 2025
64a47cf
Disable start_timestamp. Make it conditional so we can re-enable if n…
KartikP Dec 15, 2025
52fe74b
Fix input box overflow in col container
KartikP Dec 15, 2025
92a7bd0
re-introduce whitespace
KartikP Dec 15, 2025
95c3c64
fix blank lines
KartikP Dec 15, 2025
71f2776
Update tests after web_test update
KartikP Dec 15, 2025
09b3ed1
Merge timestamp fields from kp/add-timestamp-to-scores into mv.sql
KartikP Dec 15, 2025
e7fde84
upper bound calendar input with checks
KartikP Dec 17, 2025
25378ed
parseURLFilters() and setRangerValues() correctly
KartikP Dec 17, 2025
64a47cc
Update benchmark counts
KartikP Dec 17, 2025
6fdadd6
use color-utils from #485 to rescale colors after wayback
KartikP Dec 17, 2025
5cd6128
move depth calculation outside of model row loop
KartikP Dec 17, 2025
1d379bc
fix color and aggregation calculation
KartikP Dec 17, 2025
ca450a9
Optimization improvement:
KartikP Dec 17, 2025
2baba6d
fix export to exclude hidden benchmark and sort by fscore
KartikP Dec 17, 2025
0602e43
Only exclude benchmarks hidden by wayback filtering, not where …
KartikP Dec 18, 2025
fab43a9
Merge remote-tracking branch 'origin/master' into new_webUROP_2
KartikP Dec 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions benchmarks/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,8 +273,8 @@ class Score(models.Model):
score_raw = models.FloatField(default=0, null=True)
score_ceiled = models.FloatField(default=0, null=True)
error = models.FloatField(default=0, null=True)
start_timestamp = models.DateTimeField(blank=True)
end_timestamp = models.DateTimeField(auto_now_add=True, blank=True, null=True)
start_timestamp = models.DateTimeField(null=True, blank=True)
end_timestamp = models.DateTimeField(null=True, blank=True)
comment = models.CharField(max_length=1000, null=True)

def __repr__(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,11 @@
<span class="text-wrapper">Advanced Filters</span>
</button>
{% endif %}

<button id="exportCsvButton" class="btn-secondary" style="background-color: #47B7DE; color: white; border: none; white-space: nowrap" title="Export leaderboard data and plugin metadata">
<i class="fa-solid fa-download"></i> Export
</button>

{% if domain == "vision" %}
<button id="tutorialBtn" style="margin-left: 8px;" title="Interactive walkthrough of the leaderboard">
<i class="fa-solid fa-info"></i>
Expand Down Expand Up @@ -215,6 +215,27 @@ <h4>Benchmark Properties</h4>
</div>
</div>
</div>
<!-- Wayback Timestamp Section -->
<div class="metadata-section" id="waybackTimestampSection" style="display: none;">
<div class="filter-group" id="waybackTimestampFilter">
<div class="filter-header">
<label>Wayback Timestamp</label>
<div class="range-values">
<input type="date" class="range-input-min" id="waybackDateMin">
<span class="range-separator">to</span>
<input type="date" class="range-input-max" id="waybackDateMax">
</div>
</div>
<div class="range-filter dual-handle">
<div class="slider-container" data-min="0" data-max="2000000000">
<div class="slider-track"></div>
<div class="slider-range"></div>
<div class="slider-handle handle-min" data-value="0"></div>
<div class="slider-handle handle-max" data-value="2000000000"></div>
</div>
</div>
</div>
</div>
</div>
</div>

Expand Down
106 changes: 106 additions & 0 deletions benchmarks/tests/compare_brain_score_leaderboards.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#!/usr/bin/env python3
"""
compare_brain_score_leaderboards.py

Renders local + archived Brain-Score leaderboards with Playwright,
extracts models + average scores, computes differences, and flags
significant changes (|diff| > 0.01).
"""

import pandas as pd
from datetime import datetime
from playwright.sync_api import sync_playwright

# === CONFIG ===
# Local dev-server leaderboard with all numeric filters opened wide and the
# wayback window pinned (Unix seconds; presumably Aug 2020 – Aug 2024 to match
# the archive snapshot below — TODO confirm the exact dates).
LOCAL_URL = (
    "http://localhost:8000/vision/leaderboard/"
    "?min_param_count=0&max_param_count=900"
    "&min_model_size=0&max_model_size=4000"
    "&min_score=0&max_score=1"
    "&min_stimuli_count=0&max_stimuli_count=51000"
    "&min_wayback_timestamp=1598486400"
    "&max_wayback_timestamp=1724457600"
)
# Wayback Machine snapshot the local leaderboard is compared against.
ARCHIVE_URL = "https://web.archive.org/web/20240824001822/https://www.brain-score.org/vision/"
# CSV report written by main() with per-model score differences.
OUTPUT_FILE = "brain_score_leaderboard_differences.csv"


def scrape_leaderboard(page, label):
    """Extract model names and scores from rendered leaderboard, adapting to unknown column IDs."""
    print(f"🔎 Parsing {label} leaderboard ...")
    row_selector = "div.ag-center-cols-container div.ag-row"
    page.wait_for_selector(row_selector, timeout=60000)

    # Probe the first rendered row to discover which col-ids this grid uses.
    probe_row = page.query_selector(row_selector)
    if not probe_row:
        print(f"⚠️ No rows found in {label} leaderboard.")
        return pd.DataFrame()

    col_ids = []
    for cell in probe_row.query_selector_all("div[col-id]"):
        cid = cell.get_attribute("col-id")
        if cid:
            col_ids.append(cid)
    print(f"🧩 {label} detected column IDs: {col_ids[:10]} ...")

    # Heuristically pick the first column that looks like the model name and
    # the first that looks like the average score.
    model_col = None
    avg_col = None
    for cid in col_ids:
        lowered = cid.lower()
        if model_col is None and "model" in lowered:
            model_col = cid
        if avg_col is None and ("average" in lowered or "vision_v0" in lowered):
            avg_col = cid

    if not model_col or not avg_col:
        print(f"❌ Could not identify model or average score columns for {label}.")
        return pd.DataFrame()

    score_key = f"score_{label}"
    records = []
    for row in page.query_selector_all(row_selector):
        name_cell = row.query_selector(f"div[col-id='{model_col}']")
        score_cell = row.query_selector(f"div[col-id='{avg_col}']")
        if not (name_cell and score_cell):
            continue
        model_name = name_cell.inner_text().strip()
        raw_score = score_cell.inner_text().strip()
        try:
            parsed = float(raw_score)
        except ValueError:
            # Non-numeric cells (e.g. "X" placeholders) are skipped.
            continue
        records.append({"model": model_name, score_key: parsed})

    frame = pd.DataFrame(records)
    print(f"✅ Parsed {len(frame)} models from {label} leaderboard")
    return frame


def main():
    """Render both leaderboards, diff their average scores, and write a CSV report."""
    print(f"🧠 Brain-Score Leaderboard Comparison — {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(headless=True)
        page = browser.new_page()

        # Local leaderboard
        print(f"🌐 Fetching Local leaderboard: {LOCAL_URL}")
        page.goto(LOCAL_URL, timeout=90000)
        local_df = scrape_leaderboard(page, "local")

        # Archived leaderboard
        print(f"\n🌐 Fetching Archive leaderboard: {ARCHIVE_URL}")
        page.goto(ARCHIVE_URL, timeout=120000)
        archive_df = scrape_leaderboard(page, "archive")

        browser.close()

    if local_df.empty or archive_df.empty:
        print("\n❌ One of the leaderboards could not be parsed. Check detected column IDs above.")
        return

    # Outer merge keeps models present in only one leaderboard; their diff is
    # NaN, which compares False against the threshold and maps to "NO".
    merged = pd.merge(local_df, archive_df, on="model", how="outer")
    merged["diff"] = merged["score_local"] - merged["score_archive"]
    flagged = merged["diff"].abs() > 0.01
    merged["significant_change"] = flagged.map({True: "YES", False: "NO"})

    merged.to_csv(OUTPUT_FILE, index=False)
    print(f"\n✅ Comparison complete. Saved to: {OUTPUT_FILE}\n")
    print("📊 Preview:")
    print(merged.head(10).to_string(index=False))


if __name__ == "__main__":
    # Allow running directly: python compare_brain_score_leaderboards.py
    main()
69 changes: 68 additions & 1 deletion benchmarks/tests/test_ag_grid.py
Original file line number Diff line number Diff line change
Expand Up @@ -982,6 +982,73 @@ def test_stimuli_count_filter(self, page):
assert actual_models == expected_models, f"Expected models {expected_models}, got {actual_models}"
assert actual_scores == expected_scores, f"Expected scores {expected_scores}, got {actual_scores}"

def test_wayback_timestamp_filter(self, page):
    """
    Verifies wayback timestamp filtering by directly driving the filter logic:

    1) Opens the Advanced Filtering panel.
    2) Sets the maximum wayback timestamp input to Aug 12th 2024 in JS and
       updates window.activeFilters.max_wayback_timestamp accordingly (the
       minimum input is left at its default, expected to be Aug 27th 2020).
    3) Calls applyCombinedFilters() to re-apply the filter pipeline.
    4) Waits for the grid to repaint.
    5) Asserts that the date inputs read "2020-08-27" and "2024-08-12".
    6) Extracts and verifies the top-5 rows (ranks, model names, global
       scores) match expectations.
    """
    # 1) open the panel
    page.click('#advancedFilterBtn')
    page.wait_for_selector('#waybackDateMin', state='visible')
    page.wait_for_selector('#waybackDateMax', state='visible')

    # 2) set the max input, then rerun the filter pipeline in JS
    page.evaluate("""
        () => {
            const maxInput = document.getElementById('waybackDateMax');
            maxInput.value = "2024-08-12";
            maxInput.dispatchEvent(new Event('input', { bubbles: true }));
            maxInput.dispatchEvent(new Event('change', { bubbles: true }));
            window.activeFilters.max_wayback_timestamp = Math.floor(new Date("2024-08-12").getTime() / 1000);
            applyCombinedFilters();
        }
    """)

    # 3) give the grid a moment to re-filter
    page.wait_for_timeout(500)

    # 4) verify both UI inputs reflect the expected filter window
    min_val = page.evaluate("document.getElementById('waybackDateMin')?.value")
    max_val = page.evaluate("document.getElementById('waybackDateMax')?.value")

    print(f"✅ WaybackDateMin input: {min_val}")
    print(f"✅ WaybackDateMax input: {max_val}")

    expected_min_val = "2020-08-27"
    expected_max_val = "2024-08-12"

    # UI check (YYYY-MM-DD date strings)
    assert min_val == expected_min_val, f"❌ Min input mismatch: {min_val}"
    assert max_val == expected_max_val, f"❌ Max input mismatch: {max_val}"

    # 5) verify leaderboard contents after filter
    expected_ranks = [8, 8, 13, 20, 26]
    expected_models = [
        "cvt_cvt-w24-384-in22k_finetuned-in1k_4",
        "resnext101_32x8d_wsl",
        "resnext101_32x48d_wsl",
        "effnetb1_272x240",
        "effnetb1_cutmixpatch_augmix_robust32_avge4e7_manylayers_324x288",
    ]
    expected_scores = ["0.43", "0.43", "0.41", "0.40", "0.39"]

    actual_ranks = page.locator('.ag-cell[col-id="rank"]').all_text_contents()[:5]
    actual_models = page.locator('.ag-cell[col-id="model"] a').all_text_contents()[:5]
    actual_scores = page.locator('.ag-cell[col-id="average_vision_v0"]').all_text_contents()[:5]

    assert actual_ranks == [str(r) for r in expected_ranks], f"Expected ranks {expected_ranks}, got {actual_ranks}"
    assert actual_models == expected_models, f"Expected models {expected_models}, got {actual_models}"
    assert actual_scores == expected_scores, f"Expected scores {expected_scores}, got {actual_scores}"

def test_copy_bibtex_button_all(self, page):
"""
Expand Down Expand Up @@ -1125,4 +1192,4 @@ def test_search_bar_filters_models_by_name(self, page):
# Compare results
assert actual_ranks == [str(r) for r in expected_ranks], f"Ranks: {actual_ranks}"
assert actual_models == expected_models, f"Models: {actual_models}"
assert actual_scores == expected_scores, f"Scores: {actual_scores}"
assert actual_scores == expected_scores, f"Scores: {actual_scores}"
19 changes: 18 additions & 1 deletion benchmarks/views/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from benchmarks.models import Score, FinalBenchmarkContext, FinalModelContext, Reference
from ..utils import cache_get_context
from datetime import datetime

_logger = logging.getLogger(__name__)

Expand All @@ -33,7 +34,23 @@
color_suffix = '_color'
color_None = '#e0e1e2'


def get_datetime_range(models):
    """Return the min/max ``end_timestamp`` across all model scores.

    Parameters
    ----------
    models : iterable
        Objects exposing a ``scores`` attribute — either None or a list of
        dicts, each optionally carrying an ISO-8601 ``end_timestamp`` string.

    Returns
    -------
    dict | None
        ``{"min": iso_string, "max": iso_string}`` over all parseable
        timestamps, or None when no valid timestamp exists.
    """
    timestamps = []
    for model in models:
        for score in (model.scores or []):
            ts = score.get("end_timestamp")
            if not ts:
                continue
            try:
                timestamps.append(datetime.fromisoformat(ts))
            except (TypeError, ValueError):
                # Skip malformed or non-string timestamps rather than
                # failing the whole page render.
                pass
    if not timestamps:
        return None
    return {
        "min": min(timestamps).isoformat(),
        "max": max(timestamps).isoformat(),
    }

def get_base_model_query(domain="vision"):
"""Get the base model query for a domain before any filtering"""
Expand Down
19 changes: 18 additions & 1 deletion benchmarks/views/leaderboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
from django.views.decorators.cache import cache_page
from django.db.models import Model
logger = logging.getLogger(__name__)
from .index import get_context, get_datetime_range # Add get_datetime_range import
from datetime import datetime
import pytz

def json_serializable(obj):
"""Recursively convert NumPy and other types to Python native types"""
Expand Down Expand Up @@ -337,7 +340,8 @@ def get_ag_grid_context(user=None, domain="vision", benchmark_filter=None, model
'raw': score.get('score_raw'),
'error': score.get('error'),
'color': score.get('color'),
'complete': score.get('is_complete', True)
'complete': score.get('is_complete', True),
'timestamp': score.get('end_timestamp')
}
row_data.append(rd)

Expand Down Expand Up @@ -471,6 +475,19 @@ def get_priority(field):
}
}

# Compute datetime range for wayback timestamp filter
datetime_range = get_datetime_range(context['models'])
if datetime_range:
# Parse the timestamps to get Unix timestamps for the slider
min_timestamp = datetime.fromisoformat(datetime_range['min'])
max_timestamp = datetime.fromisoformat(datetime_range['max'])
filter_options['datetime_range'] = {
'min': datetime_range['min'],
'max': datetime_range['max'],
'min_unix': int(min_timestamp.timestamp()),
'max_unix': int(max_timestamp.timestamp())
}

# 4) Attach JSON-serialized data to template context
stimuli_map = {}
data_map = {}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

.range-input-min,
.range-input-max
width: 60px
width: 95px
padding: 4px 6px
border: 1px solid #ced4da
border-radius: 4px
Expand Down Expand Up @@ -196,4 +196,4 @@
border-radius: 50%
top: 50%
left: 50%
transform: translate(-50%, -50%)
transform: translate(-50%, -50%)
Loading