Skip to content
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
c844ba3
Wayback Slider Working Version
nnt-git13 Oct 6, 2025
5cbe40a
Fixed Calendar and Unit Test Added
nnt-git13 Oct 14, 2025
af483bc
Merge branch 'master' into new_webUROP_2
KartikP Oct 14, 2025
30f465e
Fixed Unit test
nnt-git13 Oct 20, 2025
23fa0e3
Merge branch 'new_webUROP_2' of https://github.com/brain-score/brain-…
nnt-git13 Oct 20, 2025
9fe5e01
Merge branch 'master' into new_webUROP_2
KartikP Oct 20, 2025
c265f0b
Headless_True
nnt-git13 Oct 20, 2025
214d2a4
Remove Sleep Time
nnt-git13 Oct 20, 2025
8564da2
Small change for date box
nnt-git13 Nov 4, 2025
0d0534a
Merge branch 'master' into new_webUROP_2
KartikP Nov 12, 2025
60f5895
Wayback Filter Added to Export
nnt-git13 Nov 20, 2025
0fe88d1
Merge branch 'new_webUROP_2' of https://github.com/brain-score/brain-…
nnt-git13 Nov 20, 2025
dc1bc21
Merge branch 'master' into new_webUROP_2
KartikP Nov 20, 2025
cb23dc2
Addressed code-review changes
nnt-git13 Dec 8, 2025
4538d62
Merge branch 'new_webUROP_2' of https://github.com/brain-score/brain-…
nnt-git13 Dec 8, 2025
5a6cfab
Try testing without ranks
nnt-git13 Dec 9, 2025
dc4146c
clean up
KartikP Dec 15, 2025
55f8070
reset slider handle after reset button press
KartikP Dec 15, 2025
761389f
Move wayback timestamp slider to first column and adjust input box width
KartikP Dec 15, 2025
0509932
Merge branch 'master' into new_webUROP_2
KartikP Dec 15, 2025
64a47cf
Disable start_timestamp. Make it conditional so we can re-enable if n…
KartikP Dec 15, 2025
52fe74b
Fix input box overflow in col container
KartikP Dec 15, 2025
92a7bd0
re-introduce whitespace
KartikP Dec 15, 2025
95c3c64
fix blank lines
KartikP Dec 15, 2025
71f2776
Update tests after web_test update
KartikP Dec 15, 2025
09b3ed1
Merge timestamp fields from kp/add-timestamp-to-scores into mv.sql
KartikP Dec 15, 2025
e7fde84
upper bound calendar input with checks
KartikP Dec 17, 2025
25378ed
parseURLFilters() and setRangerValues() correctly
KartikP Dec 17, 2025
64a47cc
Update benchmark counts
KartikP Dec 17, 2025
6fdadd6
use color-utils from #485 to rescale colors after wayback
KartikP Dec 17, 2025
5cd6128
move depth calculation outside of model row loop
KartikP Dec 17, 2025
1d379bc
fix color and aggregation calculation
KartikP Dec 17, 2025
ca450a9
Optimization improvement:
KartikP Dec 17, 2025
2baba6d
fix export to exclude hidden benchmark and sort by fscore
KartikP Dec 17, 2025
0602e43
Only exclude benchmarks hidden by wayback filtering, not where …
KartikP Dec 18, 2025
fab43a9
Merge remote-tracking branch 'origin/master' into new_webUROP_2
KartikP Dec 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions benchmarks/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,8 +273,8 @@ class Score(models.Model):
score_raw = models.FloatField(default=0, null=True)
score_ceiled = models.FloatField(default=0, null=True)
error = models.FloatField(default=0, null=True)
start_timestamp = models.DateTimeField(blank=True)
end_timestamp = models.DateTimeField(auto_now_add=True, blank=True, null=True)
start_timestamp = models.DateTimeField(null=True, blank=True)
end_timestamp = models.DateTimeField(null=True, blank=True)
comment = models.CharField(max_length=1000, null=True)

def __repr__(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,11 @@
<span class="text-wrapper">Advanced Filters</span>
</button>
{% endif %}

<button id="exportCsvButton" class="btn-secondary" style="background-color: #47B7DE; color: white; border: none; white-space: nowrap" title="Export leaderboard data and plugin metadata">
<i class="fa-solid fa-download"></i> Export
</button>

{% if domain == "vision" %}
<button id="tutorialBtn" style="margin-left: 8px;" title="Interactive walkthrough of the leaderboard">
<i class="fa-solid fa-info"></i>
Expand Down Expand Up @@ -215,6 +215,27 @@ <h4>Benchmark Properties</h4>
</div>
</div>
</div>
<!-- Wayback Timestamp Section -->
<div class="metadata-section" id="waybackTimestampSection" style="display: none;">
<div class="filter-group" id="waybackTimestampFilter">
<div class="filter-header">
<label>Wayback Timestamp</label>
<div class="range-values">
<input type="date" class="range-input-min" id="waybackDateMin">
<span class="range-separator">to</span>
<input type="date" class="range-input-max" id="waybackDateMax">
</div>
</div>
<div class="range-filter dual-handle">
<div class="slider-container" data-min="0" data-max="2000000000">
<div class="slider-track"></div>
<div class="slider-range"></div>
<div class="slider-handle handle-min" data-value="0"></div>
<div class="slider-handle handle-max" data-value="2000000000"></div>
</div>
</div>
</div>
</div>
</div>
</div>

Expand Down
106 changes: 106 additions & 0 deletions benchmarks/tests/compare_brain_score_leaderboards.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#!/usr/bin/env python3
"""
compare_brain_score_leaderboards.py

Renders local + archived Brain-Score leaderboards with Playwright,
extracts models + average scores, computes differences, and flags
significant changes (|diff| > 0.01).
"""

import pandas as pd
from datetime import datetime
from playwright.sync_api import sync_playwright

# === CONFIG ===
# Local dev-server leaderboard with all numeric filters opened wide and the
# wayback window pinned (Unix seconds; presumably Aug 2020 – Aug 2024 to match
# the archive snapshot below — TODO confirm the exact dates).
LOCAL_URL = (
    "http://localhost:8000/vision/leaderboard/"
    "?min_param_count=0&max_param_count=900"
    "&min_model_size=0&max_model_size=4000"
    "&min_score=0&max_score=1"
    "&min_stimuli_count=0&max_stimuli_count=51000"
    "&min_wayback_timestamp=1598486400"
    "&max_wayback_timestamp=1724457600"
)
# Wayback Machine snapshot the local leaderboard is compared against.
ARCHIVE_URL = "https://web.archive.org/web/20240824001822/https://www.brain-score.org/vision/"
# CSV report written by main() with per-model score differences.
OUTPUT_FILE = "brain_score_leaderboard_differences.csv"


def scrape_leaderboard(page, label):
    """Extract model names and scores from rendered leaderboard, adapting to unknown column IDs."""
    print(f"🔎 Parsing {label} leaderboard ...")
    row_selector = "div.ag-center-cols-container div.ag-row"
    page.wait_for_selector(row_selector, timeout=60000)

    # Probe the first rendered row to discover which col-ids this grid uses.
    probe_row = page.query_selector(row_selector)
    if not probe_row:
        print(f"⚠️ No rows found in {label} leaderboard.")
        return pd.DataFrame()

    col_ids = []
    for cell in probe_row.query_selector_all("div[col-id]"):
        cid = cell.get_attribute("col-id")
        if cid:
            col_ids.append(cid)
    print(f"🧩 {label} detected column IDs: {col_ids[:10]} ...")

    # Heuristically pick the first column that looks like the model name and
    # the first that looks like the average score.
    model_col = None
    avg_col = None
    for cid in col_ids:
        lowered = cid.lower()
        if model_col is None and "model" in lowered:
            model_col = cid
        if avg_col is None and ("average" in lowered or "vision_v0" in lowered):
            avg_col = cid

    if not model_col or not avg_col:
        print(f"❌ Could not identify model or average score columns for {label}.")
        return pd.DataFrame()

    score_key = f"score_{label}"
    records = []
    for row in page.query_selector_all(row_selector):
        name_cell = row.query_selector(f"div[col-id='{model_col}']")
        score_cell = row.query_selector(f"div[col-id='{avg_col}']")
        if not (name_cell and score_cell):
            continue
        model_name = name_cell.inner_text().strip()
        raw_score = score_cell.inner_text().strip()
        try:
            parsed = float(raw_score)
        except ValueError:
            # Non-numeric cells (e.g. "X" placeholders) are skipped.
            continue
        records.append({"model": model_name, score_key: parsed})

    frame = pd.DataFrame(records)
    print(f"✅ Parsed {len(frame)} models from {label} leaderboard")
    return frame


def main():
    """Render both leaderboards, diff their average scores, and write a CSV report."""
    print(f"🧠 Brain-Score Leaderboard Comparison — {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(headless=True)
        page = browser.new_page()

        # Local leaderboard
        print(f"🌐 Fetching Local leaderboard: {LOCAL_URL}")
        page.goto(LOCAL_URL, timeout=90000)
        local_df = scrape_leaderboard(page, "local")

        # Archived leaderboard
        print(f"\n🌐 Fetching Archive leaderboard: {ARCHIVE_URL}")
        page.goto(ARCHIVE_URL, timeout=120000)
        archive_df = scrape_leaderboard(page, "archive")

        browser.close()

    if local_df.empty or archive_df.empty:
        print("\n❌ One of the leaderboards could not be parsed. Check detected column IDs above.")
        return

    # Outer merge keeps models present in only one leaderboard; their diff is
    # NaN, which compares False against the threshold and maps to "NO".
    merged = pd.merge(local_df, archive_df, on="model", how="outer")
    merged["diff"] = merged["score_local"] - merged["score_archive"]
    flagged = merged["diff"].abs() > 0.01
    merged["significant_change"] = flagged.map({True: "YES", False: "NO"})

    merged.to_csv(OUTPUT_FILE, index=False)
    print(f"\n✅ Comparison complete. Saved to: {OUTPUT_FILE}\n")
    print("📊 Preview:")
    print(merged.head(10).to_string(index=False))


if __name__ == "__main__":
    # Allow running directly: python compare_brain_score_leaderboards.py
    main()
69 changes: 68 additions & 1 deletion benchmarks/tests/test_ag_grid.py
Original file line number Diff line number Diff line change
Expand Up @@ -982,6 +982,73 @@ def test_stimuli_count_filter(self, page):
assert actual_models == expected_models, f"Expected models {expected_models}, got {actual_models}"
assert actual_scores == expected_scores, f"Expected scores {expected_scores}, got {actual_scores}"

def test_wayback_timestamp_filter(self, page):
    """
    Verifies wayback timestamp filtering by directly driving the filter logic:

    1) Opens the Advanced Filtering panel.
    2) Sets the maximum wayback timestamp input to Aug 12th 2024 in JS and
       updates window.activeFilters.max_wayback_timestamp accordingly (the
       minimum input is left at its default, expected to be Aug 27th 2020).
    3) Calls applyCombinedFilters() to re-apply the filter pipeline.
    4) Waits for the grid to repaint.
    5) Asserts that the date inputs read "2020-08-27" and "2024-08-12".
    6) Extracts and verifies the top-5 rows (ranks, model names, global
       scores) match expectations.
    """
    # 1) open the panel
    page.click('#advancedFilterBtn')
    page.wait_for_selector('#waybackDateMin', state='visible')
    page.wait_for_selector('#waybackDateMax', state='visible')

    # 2) set the max input, then rerun the filter pipeline in JS
    page.evaluate("""
        () => {
            const maxInput = document.getElementById('waybackDateMax');
            maxInput.value = "2024-08-12";
            maxInput.dispatchEvent(new Event('input', { bubbles: true }));
            maxInput.dispatchEvent(new Event('change', { bubbles: true }));
            window.activeFilters.max_wayback_timestamp = Math.floor(new Date("2024-08-12").getTime() / 1000);
            applyCombinedFilters();
        }
    """)

    # 3) give the grid a moment to re-filter
    page.wait_for_timeout(500)

    # 4) verify both UI inputs reflect the expected filter window
    min_val = page.evaluate("document.getElementById('waybackDateMin')?.value")
    max_val = page.evaluate("document.getElementById('waybackDateMax')?.value")

    print(f"✅ WaybackDateMin input: {min_val}")
    print(f"✅ WaybackDateMax input: {max_val}")

    expected_min_val = "2020-08-27"
    expected_max_val = "2024-08-12"

    # UI check (YYYY-MM-DD date strings)
    assert min_val == expected_min_val, f"❌ Min input mismatch: {min_val}"
    assert max_val == expected_max_val, f"❌ Max input mismatch: {max_val}"

    # 5) verify leaderboard contents after filter
    expected_ranks = [8, 8, 13, 20, 26]
    expected_models = [
        "cvt_cvt-w24-384-in22k_finetuned-in1k_4",
        "resnext101_32x8d_wsl",
        "resnext101_32x48d_wsl",
        "effnetb1_272x240",
        "effnetb1_cutmixpatch_augmix_robust32_avge4e7_manylayers_324x288",
    ]
    expected_scores = ["0.43", "0.43", "0.41", "0.40", "0.39"]

    actual_ranks = page.locator('.ag-cell[col-id="rank"]').all_text_contents()[:5]
    actual_models = page.locator('.ag-cell[col-id="model"] a').all_text_contents()[:5]
    actual_scores = page.locator('.ag-cell[col-id="average_vision_v0"]').all_text_contents()[:5]

    assert actual_ranks == [str(r) for r in expected_ranks], f"Expected ranks {expected_ranks}, got {actual_ranks}"
    assert actual_models == expected_models, f"Expected models {expected_models}, got {actual_models}"
    assert actual_scores == expected_scores, f"Expected scores {expected_scores}, got {actual_scores}"

def test_copy_bibtex_button_all(self, page):
"""
Expand Down Expand Up @@ -1125,4 +1192,4 @@ def test_search_bar_filters_models_by_name(self, page):
# Compare results
assert actual_ranks == [str(r) for r in expected_ranks], f"Ranks: {actual_ranks}"
assert actual_models == expected_models, f"Models: {actual_models}"
assert actual_scores == expected_scores, f"Scores: {actual_scores}"
assert actual_scores == expected_scores, f"Scores: {actual_scores}"
19 changes: 18 additions & 1 deletion benchmarks/views/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from benchmarks.models import Score, FinalBenchmarkContext, FinalModelContext, Reference
from ..utils import cache_get_context
from datetime import datetime

_logger = logging.getLogger(__name__)

Expand All @@ -33,7 +34,23 @@
color_suffix = '_color'
color_None = '#e0e1e2'


def get_datetime_range(models):
    """Return the min/max ``end_timestamp`` across all model scores.

    Parameters
    ----------
    models : iterable
        Objects exposing a ``scores`` attribute — either None or a list of
        dicts, each optionally carrying an ISO-8601 ``end_timestamp`` string.

    Returns
    -------
    dict | None
        ``{"min": iso_string, "max": iso_string}`` over all parseable
        timestamps, or None when no valid timestamp exists.
    """
    timestamps = []
    for model in models:
        for score in (model.scores or []):
            ts = score.get("end_timestamp")
            if not ts:
                continue
            try:
                timestamps.append(datetime.fromisoformat(ts))
            except (TypeError, ValueError):
                # Skip malformed or non-string timestamps rather than
                # failing the whole page render.
                pass
    if not timestamps:
        return None
    return {
        "min": min(timestamps).isoformat(),
        "max": max(timestamps).isoformat(),
    }

def get_base_model_query(domain="vision"):
"""Get the base model query for a domain before any filtering"""
Expand Down
19 changes: 18 additions & 1 deletion benchmarks/views/leaderboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
from django.views.decorators.cache import cache_page
from django.db.models import Model
logger = logging.getLogger(__name__)
from .index import get_context, get_datetime_range # Add get_datetime_range import
from datetime import datetime
import pytz

def json_serializable(obj):
"""Recursively convert NumPy and other types to Python native types"""
Expand Down Expand Up @@ -337,7 +340,8 @@ def get_ag_grid_context(user=None, domain="vision", benchmark_filter=None, model
'raw': score.get('score_raw'),
'error': score.get('error'),
'color': score.get('color'),
'complete': score.get('is_complete', True)
'complete': score.get('is_complete', True),
'timestamp': score.get('end_timestamp')
}
row_data.append(rd)

Expand Down Expand Up @@ -471,6 +475,19 @@ def get_priority(field):
}
}

# Compute datetime range for wayback timestamp filter
datetime_range = get_datetime_range(context['models'])
if datetime_range:
# Parse the timestamps to get Unix timestamps for the slider
min_timestamp = datetime.fromisoformat(datetime_range['min'])
max_timestamp = datetime.fromisoformat(datetime_range['max'])
filter_options['datetime_range'] = {
'min': datetime_range['min'],
'max': datetime_range['max'],
'min_unix': int(min_timestamp.timestamp()),
'max_unix': int(max_timestamp.timestamp())
}

# 4) Attach JSON-serialized data to template context
stimuli_map = {}
data_map = {}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

.range-input-min,
.range-input-max
width: 60px
width: 95px
padding: 4px 6px
border: 1px solid #ced4da
border-radius: 4px
Expand Down Expand Up @@ -196,4 +196,4 @@
border-radius: 50%
top: 50%
left: 50%
transform: translate(-50%, -50%)
transform: translate(-50%, -50%)
Loading