diff --git a/benchmarks/benchmark_config.py b/benchmarks/benchmark_config.py
index f11ae2b93..d18073eff 100644
--- a/benchmarks/benchmark_config.py
+++ b/benchmarks/benchmark_config.py
@@ -30,11 +30,11 @@
EXCLUDED_BENCHMARKS = [
-    # Coggan to possibly be excluded in future:
+    # Coggan benchmarks are excluded by default:
- # "tong.Coggan2024_fMRI.V1-rdm_v1",
- # "tong.Coggan2024_fMRI.V2-rdm_v1",
- # "tong.Coggan2024_fMRI.V4-rdm_v1",
- # "tong.Coggan2024_fMRI.IT-rdm_v1",
- # "tong.Coggan2024_behavior-ConditionWiseAccuracySimilarity_v1"
+ "tong.Coggan2024_fMRI.V1-rdm_v1",
+ "tong.Coggan2024_fMRI.V2-rdm_v1",
+ "tong.Coggan2024_fMRI.V4-rdm_v1",
+ "tong.Coggan2024_fMRI.IT-rdm_v1",
+ "tong.Coggan2024_behavior-ConditionWiseAccuracySimilarity_v1"
]
# Domain-specific exclusions (if needed in the future)
diff --git a/benchmarks/templates/benchmarks/leaderboard/ag-grid-leaderboard-content.html b/benchmarks/templates/benchmarks/leaderboard/ag-grid-leaderboard-content.html
index cb24b97b5..4c316d62d 100644
--- a/benchmarks/templates/benchmarks/leaderboard/ag-grid-leaderboard-content.html
+++ b/benchmarks/templates/benchmarks/leaderboard/ag-grid-leaderboard-content.html
@@ -240,6 +240,7 @@
Benchmark Properties
{% endif %}
+
diff --git a/static/benchmarks/js/leaderboard/core/grid-initialization.js b/static/benchmarks/js/leaderboard/core/grid-initialization.js
index df488196b..b033e4c38 100644
--- a/static/benchmarks/js/leaderboard/core/grid-initialization.js
+++ b/static/benchmarks/js/leaderboard/core/grid-initialization.js
@@ -215,7 +215,8 @@ function initializeGrid(rowData, columnDefs, benchmarkGroups) {
state: [
{ colId: 'runnable_status', hide: false },
{ colId: 'filtered_score', hide: true },
- { colId: 'average_vision_v0', hide: false }
+ { colId: 'average_vision_v0', hide: false },
+ { colId: 'rank', sort: 'asc' }
]
});
diff --git a/static/benchmarks/js/leaderboard/core/template-initialization.js b/static/benchmarks/js/leaderboard/core/template-initialization.js
index b48b4f5bf..8f7bdb5c0 100644
--- a/static/benchmarks/js/leaderboard/core/template-initialization.js
+++ b/static/benchmarks/js/leaderboard/core/template-initialization.js
@@ -231,9 +231,268 @@ function setupFilters() {
window.filteredOutBenchmarks.add(cb.value);
}
});
+
+ // Recalculate baseline scores to exclude the default-excluded benchmarks
+ // This ensures the Global Score on initial load correctly excludes these benchmarks
+ if (window.excludedBenchmarks && window.excludedBenchmarks.size > 0) {
+ setTimeout(() => {
+ recalculateBaselineScores();
+ }, 50);
+ }
}, 20);
}
+/**
+ * Recalculate the baseline scores so the default-excluded benchmarks are left out.
+ *
+ * Mutates window.originalRowData in place: excluded leaf cells become 'X', every
+ * parent is re-averaged from its remaining children, average_vision_v0 (the Global
+ * Score) is rebuilt from the neural and behavior categories, then parent colors and
+ * ranks are refreshed and the grid is updated. Before the first modification a deep
+ * copy is kept in window.trueOriginalRowData so the untouched values can be restored
+ * when a benchmark is re-added.
+ */
+function recalculateBaselineScores() {
+    if (!window.originalRowData || !window.benchmarkTree || !window.excludedBenchmarks) return;
+
+    const hierarchyUtils = window.LeaderboardHierarchyUtils;
+    const hierarchyMap = window.buildHierarchyFromTree(window.benchmarkTree);
+    // The set that is applied is the current filter state; window.excludedBenchmarks
+    // is only checked for presence in the guard above.
+    const excludedBenchmarks = window.filteredOutBenchmarks || new Set();
+    const EXCLUDED_COLOR = '#e0e1e2';
+    const GLOBAL_SCORE_ID = 'average_vision_v0';
+    const visionCategories = ['neural_vision_v0', 'behavior_vision_v0'];
+
+    // Store the true original values BEFORE modifying; only done on the first call
+    if (!window.trueOriginalRowData) {
+        window.trueOriginalRowData = JSON.parse(JSON.stringify(window.originalRowData));
+    }
+
+    // The processing order depends only on the hierarchy, so it is computed once
+    // instead of once per row. Lower depth first: children before their parents.
+    const allBenchmarkIds = Array.from(hierarchyMap.keys());
+    const benchmarksByDepth = allBenchmarkIds
+        .map(id => ({ id, depth: hierarchyUtils.getDepthLevel(id, hierarchyMap) }))
+        .sort((a, b) => a.depth - b.depth);
+
+    // Skip rule: explicitly excluded, or the entire subtree is excluded.
+    // Row-independent as well, so the answer is memoized per benchmark id.
+    const skipCache = new Map();
+    const isSkipped = id => {
+        if (!skipCache.has(id)) {
+            skipCache.set(id, excludedBenchmarks.has(id) ||
+                hierarchyUtils.isFullyExcluded(id, hierarchyMap, excludedBenchmarks));
+        }
+        return skipCache.get(id);
+    };
+    const isValidScore = score => score !== null && score !== undefined &&
+        score !== '' && score !== 'X' && !isNaN(parseFloat(score));
+    const mean = scores => scores.reduce((a, b) => a + b, 0) / scores.length;
+    const markExcluded = cell => ({ ...cell, value: 'X', color: EXCLUDED_COLOR });
+
+    window.originalRowData.forEach(row => {
+        benchmarksByDepth.forEach(({ id: benchmarkId }) => {
+            const children = hierarchyMap.get(benchmarkId) || [];
+
+            if (children.length === 0) {
+                // Leaf benchmark: mark as X if excluded
+                if (excludedBenchmarks.has(benchmarkId) && row[benchmarkId]) {
+                    row[benchmarkId] = markExcluded(row[benchmarkId]);
+                }
+                return;
+            }
+
+            // Parent benchmark: average the non-skipped children present in this row
+            const childValues = children
+                .filter(childId => !isSkipped(childId) && row[childId])
+                .map(childId => row[childId].value);
+            const hasAnyValidScores = childValues.some(isValidScore);
+
+            const childScores = [];
+            childValues.forEach(childScore => {
+                if (isValidScore(childScore)) {
+                    childScores.push(parseFloat(childScore));
+                } else if (hasAnyValidScores && (childScore === 'X' || childScore === '')) {
+                    // Treat X as 0 only if there are other valid scores (normal X, not fully excluded)
+                    childScores.push(0);
+                }
+            });
+
+            if (!row[benchmarkId]) return;
+            if (childScores.length === 0 || !hasAnyValidScores) {
+                // Nothing left to average: the column drops out
+                row[benchmarkId] = markExcluded(row[benchmarkId]);
+            } else {
+                row[benchmarkId] = {
+                    ...row[benchmarkId],
+                    value: parseFloat(mean(childScores).toFixed(3))
+                };
+            }
+        });
+
+        // Global Score: mean of the non-skipped vision categories
+        const categoryScores = [];
+        visionCategories.forEach(category => {
+            if (isSkipped(category) || !row[category]) return;
+            const score = row[category].value;
+            if (score === null || score === undefined || score === '') return;
+            const numVal = typeof score === 'string' ? parseFloat(score) : score;
+            // 'X' (model has no data) and any other non-numeric value count as 0
+            categoryScores.push(isNaN(numVal) ? 0 : numVal);
+        });
+
+        if (row[GLOBAL_SCORE_ID]) {
+            if (categoryScores.length > 0) {
+                row[GLOBAL_SCORE_ID] = {
+                    ...row[GLOBAL_SCORE_ID],
+                    value: parseFloat(mean(categoryScores).toFixed(3))
+                };
+            } else {
+                // All categories are excluded - mark global score as X
+                row[GLOBAL_SCORE_ID] = markExcluded(row[GLOBAL_SCORE_ID]);
+            }
+        }
+    });
+
+    // Recalculate colors for parent benchmarks based on the NEW score distribution,
+    // so colors reflect the min/max ranges after exclusions
+    const parentBenchmarkIds = allBenchmarkIds.filter(
+        bid => (hierarchyMap.get(bid) || []).length > 0
+    );
+
+    // Also include the global score if it is not already a parent in the hierarchy
+    if (!parentBenchmarkIds.includes(GLOBAL_SCORE_ID) &&
+        window.originalRowData.length > 0 &&
+        window.originalRowData[0][GLOBAL_SCORE_ID]) {
+        parentBenchmarkIds.push(GLOBAL_SCORE_ID);
+    }
+
+    const colorUtils = window.LeaderboardColorUtils;
+    if (colorUtils && colorUtils.recalculateColorsForBenchmark) {
+        parentBenchmarkIds.forEach(benchmarkId => {
+            colorUtils.recalculateColorsForBenchmark(window.originalRowData, benchmarkId, hierarchyMap);
+        });
+    }
+
+    // Recalculate ranks based on the new global scores
+    recalculateRanks(window.originalRowData);
+
+    // Update the grid with recalculated data
+    if (window.globalGridApi) {
+        window.globalGridApi.setGridOption('rowData', window.originalRowData);
+        window.globalGridApi.refreshCells({ force: true });
+    }
+}
+
+/**
+ * Recalculate ranks from the current global scores (average_vision_v0).
+ *
+ * Writes `rank` onto every row in place using competition ranking (1, 1, 3, ...):
+ * rows are ordered by score descending, rows whose scores are equal after rounding
+ * to 2 decimals share a rank, and every row without a numeric score ('X', empty,
+ * null, undefined, NaN or a missing cell) shares the rank right after the last
+ * scored row. The order of rowData itself is not changed.
+ *
+ * @param {Array<Object>} rowData - Model rows; each may carry average_vision_v0.value
+ * @returns {void} Modifies rowData in place
+ */
+function recalculateRanks(rowData) {
+    if (!rowData || rowData.length === 0) return;
+
+    // Pair each row with its parsed global score (null when it has none)
+    const entries = rowData.map(row => {
+        const val = row.average_vision_v0 ? row.average_vision_v0.value : null;
+        let score = null;
+        if (val !== 'X' && val !== '' && val !== null && val !== undefined) {
+            const numVal = typeof val === 'string' ? parseFloat(val) : val;
+            if (!isNaN(numVal)) score = numVal;
+        }
+        // String(): the row.id fallback may be a number, and localeCompare needs a string
+        return { row, score, modelName: String(row.model?.name || row.id || '') };
+    });
+
+    // Scored rows, highest score first; model name breaks exact ties
+    const scored = entries.filter(entry => entry.score !== null);
+    scored.sort((a, b) => {
+        if (b.score !== a.score) return b.score - a.score;
+        return a.modelName.localeCompare(b.modelName);
+    });
+
+    // Ties are detected on the score rounded to 2 decimals (matching display format)
+    let previousRounded = null;
+    let previousRank = 0;
+    scored.forEach((entry, index) => {
+        const rounded = Math.round(entry.score * 100) / 100;
+        // A new score takes its 1-based position; a tie reuses the previous rank
+        const rank = (index > 0 && rounded === previousRounded) ? previousRank : index + 1;
+        entry.row.rank = rank;
+        previousRounded = rounded;
+        previousRank = rank;
+    });
+
+    // Rows without a score all share the rank that follows the scored rows
+    const unscoredRank = scored.length + 1;
+    entries.forEach(entry => {
+        if (entry.score === null) entry.row.rank = unscoredRank;
+    });
+}
+
function setupEventHandlers() {
// Setup filter action buttons
const resetBtn = document.getElementById('resetAllFiltersBtn');
diff --git a/static/benchmarks/js/leaderboard/filters/filter-coordinator.js b/static/benchmarks/js/leaderboard/filters/filter-coordinator.js
index e419d5e94..feb32c37d 100644
--- a/static/benchmarks/js/leaderboard/filters/filter-coordinator.js
+++ b/static/benchmarks/js/leaderboard/filters/filter-coordinator.js
@@ -320,13 +320,25 @@ function updateFilteredScores(rowData) {
const workingRowData = rowData.map(row => ({ ...row }));
// First restore original data for all columns
+ // For benchmarks that were default-excluded but are now re-added, use trueOriginalRowData
workingRowData.forEach((row) => {
const originalRow = window.originalRowData.find(origRow => origRow.id === row.id);
+ const trueOriginalRow = window.trueOriginalRowData?.find(origRow => origRow.id === row.id);
if (!originalRow) return;
Object.keys(originalRow).forEach(key => {
if (key !== 'model' && key !== 'rank' && originalRow[key] && typeof originalRow[key] === 'object') {
- row[key] = { ...originalRow[key] };
+ // Check if this is a re-added benchmark (was default-excluded, now included)
+ const isDefaultExcluded = window.excludedBenchmarks && window.excludedBenchmarks.has(key);
+ const isCurrentlyIncluded = !excludedBenchmarks.has(key);
+ const isReAdded = isDefaultExcluded && isCurrentlyIncluded;
+
+ if (isReAdded && trueOriginalRow && trueOriginalRow[key]) {
+ // Restore from true original (before baseline recalculation)
+ row[key] = { ...trueOriginalRow[key] };
+ } else {
+ row[key] = { ...originalRow[key] };
+ }
}
});
});
@@ -336,20 +348,10 @@ function updateFilteredScores(rowData) {
const originalRow = window.originalRowData.find(origRow => origRow.id === row.id);
if (!originalRow) return;
- function getDepthLevel(benchmarkId, visited = new Set()) {
- if (visited.has(benchmarkId)) return 0;
- visited.add(benchmarkId);
-
- const children = hierarchyMap.get(benchmarkId) || [];
- if (children.length === 0) return 0;
-
- const maxChildDepth = Math.max(...children.map(child => getDepthLevel(child, new Set(visited))));
- return maxChildDepth + 1;
- }
-
+ // Process benchmarks from leaves up to parents (using shared utility)
const allBenchmarkIds = Array.from(hierarchyMap.keys());
const benchmarksByDepth = allBenchmarkIds
- .map(id => ({ id, depth: getDepthLevel(id) }))
+ .map(id => ({ id, depth: window.LeaderboardHierarchyUtils.getDepthLevel(id, hierarchyMap) }))
.sort((a, b) => a.depth - b.depth);
benchmarksByDepth.forEach(({ id: benchmarkId }) => {
@@ -364,13 +366,15 @@ function updateFilteredScores(rowData) {
};
}
} else {
+ // Parent benchmark: recalculate average from non-excluded children (using shared utility)
const childScores = [];
-
-
- // First pass collect all children and determine if mixed valid/invalid scores
const childInfo = [];
+
children.forEach(childId => {
- if (!excludedBenchmarks.has(childId) && row[childId]) {
+ // Skip if child is explicitly excluded OR if its entire subtree is excluded
+ if (excludedBenchmarks.has(childId) || window.LeaderboardHierarchyUtils.isFullyExcluded(childId, hierarchyMap, excludedBenchmarks)) return;
+
+ if (row[childId]) {
const childScore = row[childId].value;
const hasValidScore = childScore !== null && childScore !== undefined &&
childScore !== '' && childScore !== 'X' &&
@@ -379,50 +383,19 @@ function updateFilteredScores(rowData) {
}
});
- // Check if any valid scores among the children
const hasAnyValidScores = childInfo.some(info => info.hasValidScore);
- // Second pass, build the scores array
- childInfo.forEach(({ childId, childScore, hasValidScore }) => {
+ childInfo.forEach(({ childScore, hasValidScore }) => {
if (hasValidScore) {
- // Include valid numeric scores
- const numVal = parseFloat(childScore);
- childScores.push(numVal);
+ childScores.push(parseFloat(childScore));
} else if (hasAnyValidScores && (childScore === 'X' || childScore === '')) {
- // If we have some valid scores, treat X/empty as 0
+ // Treat X as 0 only if there are other valid scores (normal X, not fully excluded)
childScores.push(0);
}
- // If no valid scores exist at all, skip everything (childScores will be empty)
});
-
- // Check if we should drop out this parent column
- let shouldDropOut = false;
-
- if (childScores.length === 0) {
- // No children available at all
- shouldDropOut = true;
- } else {
- // Check if all non-excluded children are X or 0
- let validChildrenCount = 0;
- let nonZeroChildrenCount = 0;
-
- children.forEach(childId => {
- if (!excludedBenchmarks.has(childId) && row[childId]) {
- validChildrenCount++;
- const childScore = row[childId].value;
- if (childScore !== null && childScore !== undefined && childScore !== '' && childScore !== 'X') {
- const numVal = typeof childScore === 'string' ? parseFloat(childScore) : childScore;
- if (!isNaN(numVal) && numVal > 0) {
- nonZeroChildrenCount++;
- }
- }
- }
- });
-
- // Drop out if no valid children or all valid children are 0/X
- shouldDropOut = validChildrenCount === 0 || nonZeroChildrenCount === 0;
- }
+ // Determine if this column should be dropped out
+ let shouldDropOut = childScores.length === 0 || !hasAnyValidScores;
if (shouldDropOut) {
row[benchmarkId] = {
@@ -440,35 +413,25 @@ function updateFilteredScores(rowData) {
}
});
- // Calculate global filtered score
+ // Calculate global filtered score (using shared utility)
const visionCategories = ['neural_vision_v0', 'behavior_vision_v0'];
const categoryScores = [];
visionCategories.forEach(category => {
- const isExcluded = excludedBenchmarks.has(category);
-
- // Check if this column would be visible (not dropped out)
- let isColumnVisible = true;
- if (window.getFilteredLeafCount && typeof window.getFilteredLeafCount === 'function') {
- const leafCount = window.getFilteredLeafCount(category);
- if (leafCount === 0) {
- isColumnVisible = false; // Column is dropped out
- }
- }
+ // Skip if explicitly excluded OR if its entire subtree is excluded
+ if (excludedBenchmarks.has(category) || window.LeaderboardHierarchyUtils.isFullyExcluded(category, hierarchyMap, excludedBenchmarks)) return;
- // Only include in filtered score if column is visible and not excluded
- if (row[category] && !isExcluded && isColumnVisible) {
+ if (row[category]) {
const score = row[category].value;
if (score !== null && score !== undefined && score !== '') {
if (score === 'X') {
- // Treat X as 0 in filtered score calculation (but only if column is visible)
+ // Treat X as 0 (normal X, model has no data)
categoryScores.push(0);
} else {
const numVal = typeof score === 'string' ? parseFloat(score) : score;
if (!isNaN(numVal)) {
categoryScores.push(numVal);
} else {
- // Treat non-numeric values as 0
categoryScores.push(0);
}
}
@@ -489,29 +452,49 @@ function updateFilteredScores(rowData) {
const allBenchmarkIds = Array.from(hierarchyMap.keys());
const recalculatedBenchmarks = new Set();
+    // Helper: mark every ancestor of targetId as recalculated. Parents already in the set
+    // are skipped (no repeated walks), which also ends the recursion on a cyclic hierarchy.
+    function markAncestorsRecalculated(targetId) {
+        allBenchmarkIds.forEach(parentId => {
+            if (recalculatedBenchmarks.has(parentId)) return;
+            if (!(hierarchyMap.get(parentId) || []).includes(targetId)) return;
+            recalculatedBenchmarks.add(parentId);
+            markAncestorsRecalculated(parentId);
+        });
+    }
+
allBenchmarkIds.forEach(benchmarkId => {
const children = hierarchyMap.get(benchmarkId) || [];
if (children.length > 0) {
- // Only count as excluded if it's manually filtered (not a default-excluded benchmark)
- const hasManuallyExcludedChildren = children.some(childId => {
- const isExcluded = excludedBenchmarks.has(childId);
+ // Check for children that deviate from their default state
+ const hasChildrenDeviatingFromDefault = children.some(childId => {
+ const isCurrentlyExcluded = excludedBenchmarks.has(childId);
const isDefaultExcluded = window.excludedBenchmarks && window.excludedBenchmarks.has(childId);
- // Only count if excluded AND not a default-excluded benchmark
- return isExcluded && !isDefaultExcluded;
+
+ // Case 1: Normal benchmark that was manually excluded (unchecked)
+ // isCurrentlyExcluded = true, isDefaultExcluded = false
+ const wasManuallyExcluded = isCurrentlyExcluded && !isDefaultExcluded;
+
+ // Case 2: Excluded benchmark that was re-added (checked)
+ // isCurrentlyExcluded = false, isDefaultExcluded = true
+ const wasReAdded = !isCurrentlyExcluded && isDefaultExcluded;
+
+ return wasManuallyExcluded || wasReAdded;
});
- if (hasManuallyExcludedChildren) {
+
+ if (hasChildrenDeviatingFromDefault) {
+ recalculatedBenchmarks.add(benchmarkId);
+ markAncestorsRecalculated(benchmarkId);
+ }
+ } else {
+ // Leaf benchmark: check if it itself deviates from default
+ const isCurrentlyExcluded = excludedBenchmarks.has(benchmarkId);
+ const isDefaultExcluded = window.excludedBenchmarks && window.excludedBenchmarks.has(benchmarkId);
+
+ // If this leaf was re-added (default excluded but now included), mark it for blue coloring
+ if (!isCurrentlyExcluded && isDefaultExcluded) {
recalculatedBenchmarks.add(benchmarkId);
-
- function markAncestorsRecalculated(targetId) {
- allBenchmarkIds.forEach(parentId => {
- const parentChildren = hierarchyMap.get(parentId) || [];
- if (parentChildren.includes(targetId)) {
- recalculatedBenchmarks.add(parentId);
- markAncestorsRecalculated(parentId);
- }
- });
- }
markAncestorsRecalculated(benchmarkId);
}
}
@@ -625,28 +608,38 @@ function toggleFilteredScoreColumn(gridApi) {
hasStimuliFiltering
);
- const uncheckedCheckboxes = document.querySelectorAll('#benchmarkFilterPanel input[type="checkbox"]:not(:checked)');
- let hasNonEngineeringBenchmarkFilters = false;
+ let hasBenchmarkDeviationsFromDefault = false;
- // Only check for non-engineering benchmark filters if the benchmark panel is ready
+ // Only check for benchmark deviations if the benchmark panel is ready
const benchmarkPanel = document.getElementById('benchmarkFilterPanel');
if (benchmarkPanel && benchmarkPanel.children.length > 0) {
- uncheckedCheckboxes.forEach(checkbox => {
- const engineeringNode = document.querySelector('input[value="engineering_vision_v0"]')?.closest('.benchmark-node');
+ const allCheckboxes = document.querySelectorAll('#benchmarkFilterPanel input[type="checkbox"]');
+ const engineeringNode = document.querySelector('input[value="engineering_vision_v0"]')?.closest('.benchmark-node');
+
+ allCheckboxes.forEach(checkbox => {
const isEngineeringChild = engineeringNode && engineeringNode.contains(checkbox);
const isEngineeringParent = checkbox.value === 'engineering_vision_v0';
+ // Skip engineering benchmarks (they don't affect global score)
+ if (isEngineeringChild || isEngineeringParent) {
+ return;
+ }
+
// Check if this is an excluded benchmark (default unchecked)
const isExcludedBenchmark = checkbox.dataset.excluded === 'true';
-
- // Only count as a manual filter if it's not engineering and not a default-excluded benchmark
- if (!isEngineeringChild && !isEngineeringParent && !isExcludedBenchmark) {
- hasNonEngineeringBenchmarkFilters = true;
+
+ // Expected default state: checked if NOT excluded, unchecked if excluded
+ const expectedDefaultState = !isExcludedBenchmark;
+ const actualState = checkbox.checked;
+
+ // If the current state differs from the expected default, it's a modification
+ if (actualState !== expectedDefaultState) {
+ hasBenchmarkDeviationsFromDefault = true;
}
});
}
- const shouldShowFilteredScore = hasNonEngineeringBenchmarkFilters || hasBenchmarkMetadataFilters;
+ const shouldShowFilteredScore = hasBenchmarkDeviationsFromDefault || hasBenchmarkMetadataFilters;
if (shouldShowFilteredScore) {
// First, make column visible
diff --git a/static/benchmarks/js/leaderboard/utilities/color-utils.js b/static/benchmarks/js/leaderboard/utilities/color-utils.js
new file mode 100644
index 000000000..0f3cf6c71
--- /dev/null
+++ b/static/benchmarks/js/leaderboard/utilities/color-utils.js
@@ -0,0 +1,190 @@
+// Color calculation utilities
+// Replicates the SQL representative_color_sql_precomputed function logic
+
+// Precomputed color arrays matching SQL (101 colors each)
+const REDGREEN_COLORS = [
+ '#ff0000', '#ff0000', '#ff0000', '#ff0000', '#fe0600', '#fe0600', '#fd0d01', '#fd0d01', '#fc1301', '#fb1901',
+ '#fb1901', '#fa1f02', '#f92502', '#f92502', '#f82b02', '#f73103', '#f73103', '#f63703', '#f53d03', '#f44204',
+ '#f44204', '#f44804', '#f34d04', '#f25305', '#f15805', '#f15805', '#f05e05', '#ef6306', '#ee6806', '#ed6e06',
+ '#ec7307', '#eb7807', '#ea7d07', '#e98208', '#e88708', '#e88708', '#e78c08', '#e69109', '#e69509', '#e59a09',
+ '#e49f0a', '#e3a30a', '#e2a80a', '#e1ac0a', '#e0b10b', '#dfb50b', '#deb90b', '#ddbe0c', '#dcc20c', '#dcc60c',
+ '#dbca0d', '#d9d20d', '#d8d60d', '#d4d70e', '#cfd60e', '#c9d50e', '#c4d40f', '#bed40f', '#b9d30f', '#b4d20f',
+ '#aed110', '#a4cf10', '#9fce10', '#9acd11', '#95cc11', '#90cc11', '#8bcb11', '#86ca12', '#7dc812', '#78c712',
+ '#74c613', '#6fc613', '#6ac513', '#66c413', '#5dc214', '#59c114', '#55c014', '#51c015', '#48be15', '#44bd15',
+ '#40bc16', '#3cbb16', '#38bb16', '#31b917', '#2db817', '#29b717', '#26b617', '#1eb518', '#1bb418', '#18b319',
+ '#18b21c', '#19b124', '#19b028', '#19af2b', '#19ad32', '#1aad36', '#1aac39', '#1aaa40', '#1aa943', '#1ba947',
+ '#1ba84a'
+];
+
+const GRAY_COLORS = [
+ '#f2f2f2', '#f2f2f2', '#f2f2f2', '#f2f2f2', '#f0f0f0', '#f0f0f0', '#eeeeee', '#eeeeee', '#ededed', '#ebebeb',
+ '#ebebeb', '#e9e9e9', '#e7e7e7', '#e7e7e7', '#e6e6e6', '#e4e4e4', '#e4e4e4', '#e2e2e2', '#e0e0e0', '#dedede',
+ '#dedede', '#dddddd', '#dbdbdb', '#d9d9d9', '#d7d7d7', '#d7d7d7', '#d6d6d6', '#d4d4d4', '#d2d2d2', '#d0d0d0',
+ '#cecece', '#cdcdcd', '#cbcbcb', '#c9c9c9', '#c7c7c7', '#c7c7c7', '#c5c5c5', '#c4c4c4', '#c2c2c2', '#c0c0c0',
+ '#bebebe', '#bdbdbd', '#bbbbbb', '#b9b9b9', '#b7b7b7', '#b5b5b5', '#b4b4b4', '#b2b2b2', '#b0b0b0', '#aeaeae',
+ '#adadad', '#a9a9a9', '#a7a7a7', '#a5a5a5', '#a4a4a4', '#a2a2a2', '#a0a0a0', '#9e9e9e', '#9d9d9d', '#9b9b9b',
+ '#999999', '#959595', '#949494', '#929292', '#909090', '#8e8e8e', '#8d8d8d', '#8b8b8b', '#878787', '#858585',
+ '#848484', '#828282', '#808080', '#7e7e7e', '#7b7b7b', '#797979', '#777777', '#757575', '#727272', '#707070',
+ '#6e6e6e', '#6c6c6c', '#6b6b6b', '#676767', '#656565', '#646464', '#626262', '#5e5e5e', '#5c5c5c', '#5b5b5b',
+ '#595959', '#555555', '#545454', '#525252', '#4e4e4e', '#4c4c4c', '#4b4b4b', '#474747', '#454545', '#444444',
+ '#424242'
+];
+
+const COLOR_NONE = '#e0e1e2'; // Neutral grey returned for null/NaN/empty values
+const GAMMA = 0.5; // Normalized score is raised to 1/GAMMA to stretch high-end differences
+
+/**
+ * Calculate the representative color for a score value.
+ * Replicates the SQL representative_color_sql_precomputed function.
+ *
+ * The score is normalized against [minValue, maxValue], raised to 1/GAMMA, scaled
+ * by 0.8 and used as an index into a 101-entry palette; the alpha channel grows
+ * linearly from 0.1 at minValue to 1.0 at maxValue.
+ *
+ * @param {number} value - The score value
+ * @param {number} minValue - Minimum value in the distribution
+ * @param {number} maxValue - Maximum value in the distribution
+ * @param {string} rootParent - Root parent identifier (e.g., 'engineering_vision_v0');
+ *     identifiers containing 'engineering' use the gray palette, all others red-green
+ * @returns {string} CSS declarations "background-color: rgb(...); background-color: rgba(...);",
+ *     or "background-color: #e0e1e2;" when the value cannot be placed in the range
+ */
+function calculateRepresentativeColor(value, minValue, maxValue, rootParent) {
+    // Return neutral grey if value is null, NaN, or otherwise not numeric
+    if (value === null || value === undefined || value === '' || isNaN(value)) {
+        return `background-color: ${COLOR_NONE};`;
+    }
+
+    const range = maxValue - minValue;
+
+    // Normalize the input value between 0 and 1 (a flat distribution sits in the middle)
+    let normalizedValue = range === 0 ? 0.5 : (value - minValue) / range;
+    if (Number.isNaN(normalizedValue)) {
+        // Unusable bounds (undefined, NaN, both infinite): without this guard the
+        // palette index below would be NaN and the hex lookup would throw.
+        return `background-color: ${COLOR_NONE};`;
+    }
+    normalizedValue = Math.max(0, Math.min(1, normalizedValue));
+
+    // Apply gamma correction to emphasize differences at the top-end,
+    // then scale down the result (0.8 factor)
+    normalizedValue = Math.pow(normalizedValue, 1.0 / GAMMA);
+    normalizedValue = 0.8 * normalizedValue;
+
+    // Palette index (0-100)
+    const idx = Math.min(100, Math.floor(100 * normalizedValue));
+
+    // Determine color palette based on root parent (non-string ids fall back to red-green)
+    const isEngineering = typeof rootParent === 'string' &&
+        rootParent.toLowerCase().includes('engineering');
+    const colorHex = isEngineering ? GRAY_COLORS[idx] : REDGREEN_COLORS[idx];
+
+    // Extract RGB values from the '#rrggbb' hex color
+    const r = parseInt(colorHex.substring(1, 3), 16);
+    const g = parseInt(colorHex.substring(3, 5), 16);
+    const b = parseInt(colorHex.substring(5, 7), 16);
+
+    // Alpha: linear interpolation from 0.1 (at min) to 1.0 (at max)
+    let alpha = 1.0;
+    if (range !== 0) {
+        const slope = -0.9 / (minValue - maxValue);
+        const intercept = 0.1 - slope * minValue;
+        alpha = slope * value + intercept;
+    }
+    alpha = Math.max(0, Math.min(1, alpha));
+
+    // Build CSS color string: opaque rgb() first as a fallback, then rgba()
+    const fallbackColor = `rgb(${r}, ${g}, ${b})`;
+    const rgbaColor = `rgba(${r}, ${g}, ${b}, ${alpha.toFixed(2)})`;
+
+    return `background-color: ${fallbackColor}; background-color: ${rgbaColor};`;
+}
+
+/**
+ * Recolor one benchmark column from the current spread of its scores.
+ * Used when excluded benchmarks change the column's min/max range.
+ *
+ * @param {Array} rowData - Array of model row data
+ * @param {string} benchmarkId - Benchmark identifier to recalculate colors for
+ * @param {Map} hierarchyMap - Benchmark hierarchy map (parent id -> child ids)
+ * @returns {void} Modifies rowData in place
+ */
+function recalculateColorsForBenchmark(rowData, benchmarkId, hierarchyMap) {
+    // Gather every cell of this column that holds a numeric score
+    const scoredCells = [];
+    rowData.forEach(row => {
+        const cell = row[benchmarkId];
+        if (!cell || cell.value === 'X' || cell.value === null) return;
+        const score = typeof cell.value === 'string' ? parseFloat(cell.value) : cell.value;
+        if (!isNaN(score)) scoredCells.push({ cell, score });
+    });
+
+    if (scoredCells.length === 0) {
+        return; // No valid scores to calculate colors for
+    }
+
+    const scores = scoredCells.map(entry => entry.score);
+    const minValue = Math.min(...scores);
+    const maxValue = Math.max(...scores);
+
+    // Climb from the benchmark to its root; the root selects the color palette
+    const seen = new Set();
+    let rootParent = null;
+    let cursor = benchmarkId;
+    while (cursor && !seen.has(cursor)) {
+        seen.add(cursor);
+        // First hierarchy entry that lists the current id as a child, if any
+        const parentEntry = Array.from(hierarchyMap.entries())
+            .find(([, childIds]) => childIds.includes(cursor));
+        if (!parentEntry) {
+            // Nothing lists it as a child: this is a root
+            rootParent = cursor;
+            break;
+        }
+        cursor = parentEntry[0];
+    }
+
+    // Fallback when no root was found: infer the palette from benchmarkId itself,
+    // defaulting to neural for non-engineering benchmarks
+    if (!rootParent) {
+        rootParent = benchmarkId.toLowerCase().includes('engineering')
+            ? 'engineering_vision_v0'
+            : 'neural_vision_v0';
+    }
+
+    // Assign the recalculated color to each scored cell
+    scoredCells.forEach(({ cell, score }) => {
+        cell.color = calculateRepresentativeColor(score, minValue, maxValue, rootParent);
+    });
+}
+
+// Public API: expose the color helpers on window
+window.LeaderboardColorUtils = {
+    calculateRepresentativeColor: calculateRepresentativeColor,
+    recalculateColorsForBenchmark: recalculateColorsForBenchmark
+};
+
diff --git a/static/benchmarks/js/leaderboard/utilities/hierarchy-utils.js b/static/benchmarks/js/leaderboard/utilities/hierarchy-utils.js
index 169ac5bc3..fdef3831f 100644
--- a/static/benchmarks/js/leaderboard/utilities/hierarchy-utils.js
+++ b/static/benchmarks/js/leaderboard/utilities/hierarchy-utils.js
@@ -141,6 +141,15 @@ function isParentBenchmark(benchmarkId, hierarchyMap) {
return children.length > 0;
}
+// Report whether a benchmark's whole subtree is excluded: true when the benchmark itself
+// is in excludedSet, or when it has children and every one of them is fully excluded.
+function isFullyExcluded(benchmarkId, hierarchyMap, excludedSet) {
+    if (excludedSet.has(benchmarkId)) return true;
+    const childIds = hierarchyMap.get(benchmarkId) || [];
+    // A leaf that reaches this point is not in the set, so only a non-empty child list can qualify
+    return childIds.length > 0 && childIds.every(childId => isFullyExcluded(childId, hierarchyMap, excludedSet));
+}
+
// Get depth level of a benchmark in the hierarchy
function getDepthLevel(benchmarkId, hierarchyMap, visited = new Set()) {
if (visited.has(benchmarkId)) return 0;
@@ -253,6 +262,7 @@ window.LeaderboardHierarchyUtils = {
findParent,
isLeafBenchmark,
isParentBenchmark,
+ isFullyExcluded,
getDepthLevel,
getBenchmarksAtDepth,
getBenchmarkPath,