diff --git a/benchmarks/benchmark_config.py b/benchmarks/benchmark_config.py index f11ae2b93..d18073eff 100644 --- a/benchmarks/benchmark_config.py +++ b/benchmarks/benchmark_config.py @@ -30,11 +30,11 @@ EXCLUDED_BENCHMARKS = [ # Coggan to possibly be excluded in future: - # "tong.Coggan2024_fMRI.V1-rdm_v1", - # "tong.Coggan2024_fMRI.V2-rdm_v1", - # "tong.Coggan2024_fMRI.V4-rdm_v1", - # "tong.Coggan2024_fMRI.IT-rdm_v1", - # "tong.Coggan2024_behavior-ConditionWiseAccuracySimilarity_v1" + "tong.Coggan2024_fMRI.V1-rdm_v1", + "tong.Coggan2024_fMRI.V2-rdm_v1", + "tong.Coggan2024_fMRI.V4-rdm_v1", + "tong.Coggan2024_fMRI.IT-rdm_v1", + "tong.Coggan2024_behavior-ConditionWiseAccuracySimilarity_v1" ] # Domain-specific exclusions (if needed in the future) diff --git a/benchmarks/templates/benchmarks/leaderboard/ag-grid-leaderboard-content.html b/benchmarks/templates/benchmarks/leaderboard/ag-grid-leaderboard-content.html index cb24b97b5..4c316d62d 100644 --- a/benchmarks/templates/benchmarks/leaderboard/ag-grid-leaderboard-content.html +++ b/benchmarks/templates/benchmarks/leaderboard/ag-grid-leaderboard-content.html @@ -240,6 +240,7 @@

Benchmark Properties

{% endif %} + diff --git a/static/benchmarks/js/leaderboard/core/grid-initialization.js b/static/benchmarks/js/leaderboard/core/grid-initialization.js index df488196b..b033e4c38 100644 --- a/static/benchmarks/js/leaderboard/core/grid-initialization.js +++ b/static/benchmarks/js/leaderboard/core/grid-initialization.js @@ -215,7 +215,8 @@ function initializeGrid(rowData, columnDefs, benchmarkGroups) { state: [ { colId: 'runnable_status', hide: false }, { colId: 'filtered_score', hide: true }, - { colId: 'average_vision_v0', hide: false } + { colId: 'average_vision_v0', hide: false }, + { colId: 'rank', sort: 'asc' } ] }); diff --git a/static/benchmarks/js/leaderboard/core/template-initialization.js b/static/benchmarks/js/leaderboard/core/template-initialization.js index b48b4f5bf..8f7bdb5c0 100644 --- a/static/benchmarks/js/leaderboard/core/template-initialization.js +++ b/static/benchmarks/js/leaderboard/core/template-initialization.js @@ -231,9 +231,268 @@ function setupFilters() { window.filteredOutBenchmarks.add(cb.value); } }); + + // Recalculate baseline scores to exclude the default-excluded benchmarks + // This ensures the Global Score on initial load correctly excludes these benchmarks + if (window.excludedBenchmarks && window.excludedBenchmarks.size > 0) { + setTimeout(() => { + recalculateBaselineScores(); + }, 50); + } }, 20); } +// Recalculate baseline scores so excluded benchmarks drop out of every parent average and the Global Score (NOTE(review): the set actually consulted is window.filteredOutBenchmarks; window.excludedBenchmarks is only checked as a guard - confirm both hold the same ids on initial load) +// This modifies originalRowData in place, then recolors parent columns, recomputes ranks and pushes the rows to the grid, so that the "baseline" Global Score is correct +// It also deep-copies the untouched rows into window.trueOriginalRowData (first call only) so they can be restored when benchmarks are re-added +function recalculateBaselineScores() { + if (!window.originalRowData || !window.benchmarkTree || !window.excludedBenchmarks) return; + + const hierarchyMap = window.buildHierarchyFromTree(window.benchmarkTree); + const excludedBenchmarks = window.filteredOutBenchmarks || new Set(); + + // Store the true original values BEFORE modifying (for restoring when benchmarks are 
re-added) + // Only do this once on initial load + if (!window.trueOriginalRowData) { + window.trueOriginalRowData = JSON.parse(JSON.stringify(window.originalRowData)); + } + + // Recalculate scores for each row (NOTE(review): benchmarksByDepth below depends only on hierarchyMap, so it is rebuilt identically for every row and could be hoisted out of this loop) + window.originalRowData.forEach(row => { + // Process benchmarks from leaves up to parents (using shared utility) + const allBenchmarkIds = Array.from(hierarchyMap.keys()); + const benchmarksByDepth = allBenchmarkIds + .map(id => ({ id, depth: window.LeaderboardHierarchyUtils.getDepthLevel(id, hierarchyMap) })) + .sort((a, b) => a.depth - b.depth); + + benchmarksByDepth.forEach(({ id: benchmarkId }) => { + const children = hierarchyMap.get(benchmarkId) || []; + + if (children.length === 0) { + // Leaf benchmark: mark as X if excluded + if (excludedBenchmarks.has(benchmarkId) && row[benchmarkId]) { + row[benchmarkId] = { + ...row[benchmarkId], + value: 'X', + color: '#e0e1e2' + }; + } + } else { + // Parent benchmark: recalculate average from non-excluded children (using shared utility) + const childScores = []; + const childInfo = []; + + children.forEach(childId => { + // Skip if child is explicitly excluded OR if its entire subtree is excluded + if (excludedBenchmarks.has(childId) || window.LeaderboardHierarchyUtils.isFullyExcluded(childId, hierarchyMap, excludedBenchmarks)) return; + + if (row[childId]) { + const childScore = row[childId].value; + const hasValidScore = childScore !== null && childScore !== undefined && + childScore !== '' && childScore !== 'X' && + !isNaN(parseFloat(childScore)); + childInfo.push({ childId, childScore, hasValidScore }); + } + }); + + const hasAnyValidScores = childInfo.some(info => info.hasValidScore); + + childInfo.forEach(({ childScore, hasValidScore }) => { + if (hasValidScore) { + childScores.push(parseFloat(childScore)); + } else if (hasAnyValidScores && (childScore === 'X' || childScore === '')) { + // Treat X as 0 only if there are other valid scores (normal X, not fully excluded) + childScores.push(0); + } + }); 
+ + // Determine if this column should be dropped out + let shouldDropOut = childScores.length === 0 || !hasAnyValidScores; + + if (shouldDropOut) { + if (row[benchmarkId]) { + row[benchmarkId] = { + ...row[benchmarkId], + value: 'X', + color: '#e0e1e2' + }; + } + } else if (row[benchmarkId]) { + const average = childScores.reduce((a, b) => a + b, 0) / childScores.length; + row[benchmarkId] = { + ...row[benchmarkId], + value: parseFloat(average.toFixed(3)) + }; + } + } + }); + + // Recalculate average_vision_v0 (Global Score) from neural and behavior (using shared utility) + const visionCategories = ['neural_vision_v0', 'behavior_vision_v0']; + const categoryScores = []; + + visionCategories.forEach(category => { + // Skip if explicitly excluded OR if its entire subtree is excluded + if (excludedBenchmarks.has(category) || window.LeaderboardHierarchyUtils.isFullyExcluded(category, hierarchyMap, excludedBenchmarks)) return; + + if (row[category]) { + const score = row[category].value; + if (score !== null && score !== undefined && score !== '') { + if (score === 'X') { + // Treat X as 0 (normal X, model has no data) + categoryScores.push(0); + } else { + const numVal = typeof score === 'string' ? 
parseFloat(score) : score; + if (!isNaN(numVal)) { + categoryScores.push(numVal); + } else { + categoryScores.push(0); + } + } + } + } + }); + + if (row['average_vision_v0']) { + if (categoryScores.length > 0) { + const globalAverage = categoryScores.reduce((a, b) => a + b, 0) / categoryScores.length; + row['average_vision_v0'] = { + ...row['average_vision_v0'], + value: parseFloat(globalAverage.toFixed(3)) + }; + } else { + // All categories are excluded - mark global score as X + row['average_vision_v0'] = { + ...row['average_vision_v0'], + value: 'X', + color: '#e0e1e2' + }; + } + } + }); + + // Recalculate colors for parent benchmarks based on NEW score distribution + // This ensures colors reflect the new min/max ranges after exclusions + const allBenchmarkIds = Array.from(hierarchyMap.keys()); + const parentBenchmarkIds = allBenchmarkIds.filter(bid => { + const children = hierarchyMap.get(bid) || []; + return children.length > 0; // Only parent benchmarks + }); + + // Also include global score (average_vision_v0) if it exists + const globalScoreId = 'average_vision_v0'; + if (!parentBenchmarkIds.includes(globalScoreId) && + window.originalRowData.length > 0 && + window.originalRowData[0][globalScoreId]) { + parentBenchmarkIds.push(globalScoreId); + } + + // Recalculate colors for each parent benchmark + parentBenchmarkIds.forEach(benchmarkId => { + if (window.LeaderboardColorUtils && window.LeaderboardColorUtils.recalculateColorsForBenchmark) { + window.LeaderboardColorUtils.recalculateColorsForBenchmark( + window.originalRowData, + benchmarkId, + hierarchyMap + ); + } + }); + + // Recalculate ranks based on the new global scores + recalculateRanks(window.originalRowData); + + // Update the grid with recalculated data + if (window.globalGridApi) { + window.globalGridApi.setGridOption('rowData', window.originalRowData); + window.globalGridApi.refreshCells({ force: true }); + } +} + +// Recalculate ranks based on the current global scores (average_vision_v0) 
+// This ensures ranks reflect the recalculated scores after excluding benchmarks +function recalculateRanks(rowData) { + if (!rowData || rowData.length === 0) return; + + // Extract global scores for each model + const modelScores = rowData.map(row => { + const globalScore = row.average_vision_v0; + let score = null; + let isX = false; + + if (globalScore) { + const val = globalScore.value; + if (val === 'X' || val === '' || val === null || val === undefined) { + isX = true; + } else { + const numVal = typeof val === 'string' ? parseFloat(val) : val; + if (!isNaN(numVal)) { + score = numVal; + } else { + isX = true; + } + } + } else { + isX = true; + } + + return { row, score, isX, modelName: row.model?.name || row.id || '' }; + }); + + // Sort: valid scores descending, then X at the bottom + modelScores.sort((a, b) => { + // X values go to the bottom + if (a.isX && !b.isX) return 1; + if (!a.isX && b.isX) return -1; + if (a.isX && b.isX) return a.modelName.localeCompare(b.modelName); + + // Sort by score descending + if (b.score !== a.score) return b.score - a.score; + + // Tiebreaker: model name + return a.modelName.localeCompare(b.modelName); + }); + + // Assign ranks based on rounded scores (2 decimal places, matching display) + let currentRank = 1; + let previousRoundedScore = null; + let tiedCount = 0; + + modelScores.forEach((item, index) => { + if (item.isX) { + // X rows are skipped here; they all receive the shared xRank after this loop (last valid rank plus the number of models tied at it) + return; + } + + // Round to 2 decimal places for comparison (matching display format) + const roundedScore = Math.round(item.score * 100) / 100; + + // Compare rounded scores for tie detection + if (index === 0 || roundedScore !== previousRoundedScore) { + if (tiedCount > 0) { + currentRank += tiedCount; + } + tiedCount = 1; + item.row.rank = currentRank; + } else { + // Tie - same rounded score, use same rank as previous + tiedCount++; + item.row.rank = modelScores[index - 1].row.rank; + } + + previousRoundedScore = roundedScore; + 
}); + + // Assign rank to all X models (after all valid ones): last valid rank plus the number of models tied at it, which is 1 when no model has a valid score + const xRank = currentRank + tiedCount; + modelScores.forEach(item => { + if (item.isX) { + item.row.rank = xRank; + } + }); + + console.log('[recalculateRanks] Recalculated ranks for', rowData.length, 'models'); +} + function setupEventHandlers() { // Setup filter action buttons const resetBtn = document.getElementById('resetAllFiltersBtn'); diff --git a/static/benchmarks/js/leaderboard/filters/filter-coordinator.js b/static/benchmarks/js/leaderboard/filters/filter-coordinator.js index e419d5e94..feb32c37d 100644 --- a/static/benchmarks/js/leaderboard/filters/filter-coordinator.js +++ b/static/benchmarks/js/leaderboard/filters/filter-coordinator.js @@ -320,13 +320,25 @@ function updateFilteredScores(rowData) { const workingRowData = rowData.map(row => ({ ...row })); // First restore original data for all columns + // For benchmarks that were default-excluded but are now re-added, use trueOriginalRowData workingRowData.forEach((row) => { const originalRow = window.originalRowData.find(origRow => origRow.id === row.id); + const trueOriginalRow = window.trueOriginalRowData?.find(origRow => origRow.id === row.id); if (!originalRow) return; Object.keys(originalRow).forEach(key => { if (key !== 'model' && key !== 'rank' && originalRow[key] && typeof originalRow[key] === 'object') { - row[key] = { ...originalRow[key] }; + // Check if this is a re-added benchmark (was default-excluded, now included) + const isDefaultExcluded = window.excludedBenchmarks && window.excludedBenchmarks.has(key); + const isCurrentlyIncluded = !excludedBenchmarks.has(key); + const isReAdded = isDefaultExcluded && isCurrentlyIncluded; + + if (isReAdded && trueOriginalRow && trueOriginalRow[key]) { + // Restore from true original (before baseline recalculation) + row[key] = { ...trueOriginalRow[key] }; + } else { + row[key] = { ...originalRow[key] }; + } } }); }); @@ -336,20 +348,10 @@ function updateFilteredScores(rowData) { const 
originalRow = window.originalRowData.find(origRow => origRow.id === row.id); if (!originalRow) return; - function getDepthLevel(benchmarkId, visited = new Set()) { - if (visited.has(benchmarkId)) return 0; - visited.add(benchmarkId); - - const children = hierarchyMap.get(benchmarkId) || []; - if (children.length === 0) return 0; - - const maxChildDepth = Math.max(...children.map(child => getDepthLevel(child, new Set(visited)))); - return maxChildDepth + 1; - } - + // Process benchmarks from leaves up to parents (using shared utility) const allBenchmarkIds = Array.from(hierarchyMap.keys()); const benchmarksByDepth = allBenchmarkIds - .map(id => ({ id, depth: getDepthLevel(id) })) + .map(id => ({ id, depth: window.LeaderboardHierarchyUtils.getDepthLevel(id, hierarchyMap) })) .sort((a, b) => a.depth - b.depth); benchmarksByDepth.forEach(({ id: benchmarkId }) => { @@ -364,13 +366,15 @@ function updateFilteredScores(rowData) { }; } } else { + // Parent benchmark: recalculate average from non-excluded children (using shared utility) const childScores = []; - - - // First pass collect all children and determine if mixed valid/invalid scores const childInfo = []; + children.forEach(childId => { - if (!excludedBenchmarks.has(childId) && row[childId]) { + // Skip if child is explicitly excluded OR if its entire subtree is excluded + if (excludedBenchmarks.has(childId) || window.LeaderboardHierarchyUtils.isFullyExcluded(childId, hierarchyMap, excludedBenchmarks)) return; + + if (row[childId]) { const childScore = row[childId].value; const hasValidScore = childScore !== null && childScore !== undefined && childScore !== '' && childScore !== 'X' && @@ -379,50 +383,19 @@ function updateFilteredScores(rowData) { } }); - // Check if any valid scores among the children const hasAnyValidScores = childInfo.some(info => info.hasValidScore); - // Second pass, build the scores array - childInfo.forEach(({ childId, childScore, hasValidScore }) => { + childInfo.forEach(({ childScore, 
hasValidScore }) => { if (hasValidScore) { - // Include valid numeric scores - const numVal = parseFloat(childScore); - childScores.push(numVal); + childScores.push(parseFloat(childScore)); } else if (hasAnyValidScores && (childScore === 'X' || childScore === '')) { - // If we have some valid scores, treat X/empty as 0 + // Treat X as 0 only if there are other valid scores (normal X, not fully excluded) childScores.push(0); } - // If no valid scores exist at all, skip everything (childScores will be empty) }); - - // Check if we should drop out this parent column - let shouldDropOut = false; - - if (childScores.length === 0) { - // No children available at all - shouldDropOut = true; - } else { - // Check if all non-excluded children are X or 0 - let validChildrenCount = 0; - let nonZeroChildrenCount = 0; - - children.forEach(childId => { - if (!excludedBenchmarks.has(childId) && row[childId]) { - validChildrenCount++; - const childScore = row[childId].value; - if (childScore !== null && childScore !== undefined && childScore !== '' && childScore !== 'X') { - const numVal = typeof childScore === 'string' ? 
parseFloat(childScore) : childScore; - if (!isNaN(numVal) && numVal > 0) { - nonZeroChildrenCount++; - } - } - } - }); - - // Drop out if no valid children or all valid children are 0/X - shouldDropOut = validChildrenCount === 0 || nonZeroChildrenCount === 0; - } + // Determine if this column should be dropped out + let shouldDropOut = childScores.length === 0 || !hasAnyValidScores; if (shouldDropOut) { row[benchmarkId] = { @@ -440,35 +413,25 @@ function updateFilteredScores(rowData) { } }); - // Calculate global filtered score + // Calculate global filtered score (using shared utility) const visionCategories = ['neural_vision_v0', 'behavior_vision_v0']; const categoryScores = []; visionCategories.forEach(category => { - const isExcluded = excludedBenchmarks.has(category); - - // Check if this column would be visible (not dropped out) - let isColumnVisible = true; - if (window.getFilteredLeafCount && typeof window.getFilteredLeafCount === 'function') { - const leafCount = window.getFilteredLeafCount(category); - if (leafCount === 0) { - isColumnVisible = false; // Column is dropped out - } - } + // Skip if explicitly excluded OR if its entire subtree is excluded + if (excludedBenchmarks.has(category) || window.LeaderboardHierarchyUtils.isFullyExcluded(category, hierarchyMap, excludedBenchmarks)) return; - // Only include in filtered score if column is visible and not excluded - if (row[category] && !isExcluded && isColumnVisible) { + if (row[category]) { const score = row[category].value; if (score !== null && score !== undefined && score !== '') { if (score === 'X') { - // Treat X as 0 in filtered score calculation (but only if column is visible) + // Treat X as 0 (normal X, model has no data) categoryScores.push(0); } else { const numVal = typeof score === 'string' ? 
parseFloat(score) : score; if (!isNaN(numVal)) { categoryScores.push(numVal); } else { - // Treat non-numeric values as 0 categoryScores.push(0); } } @@ -489,29 +452,49 @@ function updateFilteredScores(rowData) { const allBenchmarkIds = Array.from(hierarchyMap.keys()); const recalculatedBenchmarks = new Set(); + // Helper function to mark ancestors as recalculated + function markAncestorsRecalculated(targetId) { + allBenchmarkIds.forEach(parentId => { + const parentChildren = hierarchyMap.get(parentId) || []; + if (parentChildren.includes(targetId)) { + recalculatedBenchmarks.add(parentId); + markAncestorsRecalculated(parentId); + } + }); + } + allBenchmarkIds.forEach(benchmarkId => { const children = hierarchyMap.get(benchmarkId) || []; if (children.length > 0) { - // Only count as excluded if it's manually filtered (not a default-excluded benchmark) - const hasManuallyExcludedChildren = children.some(childId => { - const isExcluded = excludedBenchmarks.has(childId); + // Check for children that deviate from their default state + const hasChildrenDeviatingFromDefault = children.some(childId => { + const isCurrentlyExcluded = excludedBenchmarks.has(childId); const isDefaultExcluded = window.excludedBenchmarks && window.excludedBenchmarks.has(childId); - // Only count if excluded AND not a default-excluded benchmark - return isExcluded && !isDefaultExcluded; + + // Case 1: Normal benchmark that was manually excluded (unchecked) + // isCurrentlyExcluded = true, isDefaultExcluded = false + const wasManuallyExcluded = isCurrentlyExcluded && !isDefaultExcluded; + + // Case 2: Excluded benchmark that was re-added (checked) + // isCurrentlyExcluded = false, isDefaultExcluded = true + const wasReAdded = !isCurrentlyExcluded && isDefaultExcluded; + + return wasManuallyExcluded || wasReAdded; }); - if (hasManuallyExcludedChildren) { + + if (hasChildrenDeviatingFromDefault) { + recalculatedBenchmarks.add(benchmarkId); + markAncestorsRecalculated(benchmarkId); + } + } else { + 
// Leaf benchmark: check if it itself deviates from default + const isCurrentlyExcluded = excludedBenchmarks.has(benchmarkId); + const isDefaultExcluded = window.excludedBenchmarks && window.excludedBenchmarks.has(benchmarkId); + + // If this leaf was re-added (default excluded but now included), mark it for blue coloring + if (!isCurrentlyExcluded && isDefaultExcluded) { recalculatedBenchmarks.add(benchmarkId); - - function markAncestorsRecalculated(targetId) { - allBenchmarkIds.forEach(parentId => { - const parentChildren = hierarchyMap.get(parentId) || []; - if (parentChildren.includes(targetId)) { - recalculatedBenchmarks.add(parentId); - markAncestorsRecalculated(parentId); - } - }); - } markAncestorsRecalculated(benchmarkId); } } @@ -625,28 +608,38 @@ function toggleFilteredScoreColumn(gridApi) { hasStimuliFiltering ); - const uncheckedCheckboxes = document.querySelectorAll('#benchmarkFilterPanel input[type="checkbox"]:not(:checked)'); - let hasNonEngineeringBenchmarkFilters = false; + let hasBenchmarkDeviationsFromDefault = false; - // Only check for non-engineering benchmark filters if the benchmark panel is ready + // Only check for benchmark deviations if the benchmark panel is ready const benchmarkPanel = document.getElementById('benchmarkFilterPanel'); if (benchmarkPanel && benchmarkPanel.children.length > 0) { - uncheckedCheckboxes.forEach(checkbox => { - const engineeringNode = document.querySelector('input[value="engineering_vision_v0"]')?.closest('.benchmark-node'); + const allCheckboxes = document.querySelectorAll('#benchmarkFilterPanel input[type="checkbox"]'); + const engineeringNode = document.querySelector('input[value="engineering_vision_v0"]')?.closest('.benchmark-node'); + + allCheckboxes.forEach(checkbox => { const isEngineeringChild = engineeringNode && engineeringNode.contains(checkbox); const isEngineeringParent = checkbox.value === 'engineering_vision_v0'; + // Skip engineering benchmarks (they don't affect global score) + if 
(isEngineeringChild || isEngineeringParent) { + return; + } + // Check if this is an excluded benchmark (default unchecked) const isExcludedBenchmark = checkbox.dataset.excluded === 'true'; - - // Only count as a manual filter if it's not engineering and not a default-excluded benchmark - if (!isEngineeringChild && !isEngineeringParent && !isExcludedBenchmark) { - hasNonEngineeringBenchmarkFilters = true; + + // Expected default state: checked if NOT excluded, unchecked if excluded + const expectedDefaultState = !isExcludedBenchmark; + const actualState = checkbox.checked; + + // If the current state differs from the expected default, it's a modification + if (actualState !== expectedDefaultState) { + hasBenchmarkDeviationsFromDefault = true; } }); } - const shouldShowFilteredScore = hasNonEngineeringBenchmarkFilters || hasBenchmarkMetadataFilters; + const shouldShowFilteredScore = hasBenchmarkDeviationsFromDefault || hasBenchmarkMetadataFilters; if (shouldShowFilteredScore) { // First, make column visible diff --git a/static/benchmarks/js/leaderboard/utilities/color-utils.js b/static/benchmarks/js/leaderboard/utilities/color-utils.js new file mode 100644 index 000000000..0f3cf6c71 --- /dev/null +++ b/static/benchmarks/js/leaderboard/utilities/color-utils.js @@ -0,0 +1,190 @@ +// Color calculation utilities +// Replicates the SQL representative_color_sql_precomputed function logic + +// Precomputed color arrays matching SQL (101 colors each) +const REDGREEN_COLORS = [ + '#ff0000', '#ff0000', '#ff0000', '#ff0000', '#fe0600', '#fe0600', '#fd0d01', '#fd0d01', '#fc1301', '#fb1901', + '#fb1901', '#fa1f02', '#f92502', '#f92502', '#f82b02', '#f73103', '#f73103', '#f63703', '#f53d03', '#f44204', + '#f44204', '#f44804', '#f34d04', '#f25305', '#f15805', '#f15805', '#f05e05', '#ef6306', '#ee6806', '#ed6e06', + '#ec7307', '#eb7807', '#ea7d07', '#e98208', '#e88708', '#e88708', '#e78c08', '#e69109', '#e69509', '#e59a09', + '#e49f0a', '#e3a30a', '#e2a80a', '#e1ac0a', '#e0b10b', 
'#dfb50b', '#deb90b', '#ddbe0c', '#dcc20c', '#dcc60c', + '#dbca0d', '#d9d20d', '#d8d60d', '#d4d70e', '#cfd60e', '#c9d50e', '#c4d40f', '#bed40f', '#b9d30f', '#b4d20f', + '#aed110', '#a4cf10', '#9fce10', '#9acd11', '#95cc11', '#90cc11', '#8bcb11', '#86ca12', '#7dc812', '#78c712', + '#74c613', '#6fc613', '#6ac513', '#66c413', '#5dc214', '#59c114', '#55c014', '#51c015', '#48be15', '#44bd15', + '#40bc16', '#3cbb16', '#38bb16', '#31b917', '#2db817', '#29b717', '#26b617', '#1eb518', '#1bb418', '#18b319', + '#18b21c', '#19b124', '#19b028', '#19af2b', '#19ad32', '#1aad36', '#1aac39', '#1aaa40', '#1aa943', '#1ba947', + '#1ba84a' +]; + +const GRAY_COLORS = [ + '#f2f2f2', '#f2f2f2', '#f2f2f2', '#f2f2f2', '#f0f0f0', '#f0f0f0', '#eeeeee', '#eeeeee', '#ededed', '#ebebeb', + '#ebebeb', '#e9e9e9', '#e7e7e7', '#e7e7e7', '#e6e6e6', '#e4e4e4', '#e4e4e4', '#e2e2e2', '#e0e0e0', '#dedede', + '#dedede', '#dddddd', '#dbdbdb', '#d9d9d9', '#d7d7d7', '#d7d7d7', '#d6d6d6', '#d4d4d4', '#d2d2d2', '#d0d0d0', + '#cecece', '#cdcdcd', '#cbcbcb', '#c9c9c9', '#c7c7c7', '#c7c7c7', '#c5c5c5', '#c4c4c4', '#c2c2c2', '#c0c0c0', + '#bebebe', '#bdbdbd', '#bbbbbb', '#b9b9b9', '#b7b7b7', '#b5b5b5', '#b4b4b4', '#b2b2b2', '#b0b0b0', '#aeaeae', + '#adadad', '#a9a9a9', '#a7a7a7', '#a5a5a5', '#a4a4a4', '#a2a2a2', '#a0a0a0', '#9e9e9e', '#9d9d9d', '#9b9b9b', + '#999999', '#959595', '#949494', '#929292', '#909090', '#8e8e8e', '#8d8d8d', '#8b8b8b', '#878787', '#858585', + '#848484', '#828282', '#808080', '#7e7e7e', '#7b7b7b', '#797979', '#777777', '#757575', '#727272', '#707070', + '#6e6e6e', '#6c6c6c', '#6b6b6b', '#676767', '#656565', '#646464', '#626262', '#5e5e5e', '#5c5c5c', '#5b5b5b', + '#595959', '#555555', '#545454', '#525252', '#4e4e4e', '#4c4c4c', '#4b4b4b', '#474747', '#454545', '#444444', + '#424242' +]; + +const COLOR_NONE = '#e0e1e2'; +const GAMMA = 0.5; // Gamma value to stretch high-end differences + +/** + * Calculate representative color for a score value + * Replicates the SQL 
representative_color_sql_precomputed function + * + * @param {number} value - The score value + * @param {number} minValue - Minimum value in the distribution + * @param {number} maxValue - Maximum value in the distribution + * @param {string} rootParent - Root parent identifier (e.g., 'engineering_vision_v0') + * @returns {string} CSS color string in format "background-color: rgb(...); background-color: rgba(...);" + */ +function calculateRepresentativeColor(value, minValue, maxValue, rootParent) { + // Return neutral grey (a single background-color declaration using COLOR_NONE, without the rgb/rgba pair) if value is null, undefined, '' or anything isNaN() rejects + if (value === null || value === undefined || isNaN(value) || value === 'NaN' || value === '') { + return `background-color: ${COLOR_NONE};`; + } + + // Normalize the input value between 0 and 1 + let normalizedValue; + if (maxValue - minValue === 0) { + normalizedValue = 0.5; + } else { + normalizedValue = (value - minValue) / (maxValue - minValue); + } + normalizedValue = Math.max(0, Math.min(1, normalizedValue)); + + // Apply gamma correction to emphasize differences at the top-end + normalizedValue = Math.pow(normalizedValue, 1.0 / GAMMA); + + // Scale down the normalized value (0.8 factor) + normalizedValue = 0.8 * normalizedValue; + normalizedValue = Math.max(0, Math.min(1, normalizedValue)); + + // Get color array index; the 0.8 scaling above caps it at 80 for finite inputs, so the idx > 100 guard below is purely defensive + let idx = Math.floor(100 * normalizedValue); + if (idx > 100) { + idx = 100; + } + + // Determine color palette based on root parent + const isEngineering = rootParent && rootParent.toLowerCase().includes('engineering'); + const colorHex = isEngineering ? 
GRAY_COLORS[idx] : REDGREEN_COLORS[idx]; + + // Extract RGB values from hex color + const r = parseInt(colorHex.substring(1, 3), 16); + const g = parseInt(colorHex.substring(3, 5), 16); + const b = parseInt(colorHex.substring(5, 7), 16); + + // Calculate alpha based on value position + let alpha; + if (maxValue - minValue === 0) { + alpha = 1.0; + } else { + // Linear interpolation: alpha ranges from 0.1 (at min) to 1.0 (at max) + // slope = -0.9 / (min_value - max_value) + // intercept = 0.1 - slope * min_value + const slope = -0.9 / (minValue - maxValue); + const intercept = 0.1 - slope * minValue; + alpha = slope * value + intercept; + } + alpha = Math.max(0, Math.min(1, alpha)); + + // Build CSS color string: the opaque rgb declaration comes first as a fallback, the rgba declaration second + const fallbackColor = `rgb(${Math.round(r)}, ${Math.round(g)}, ${Math.round(b)})`; + const rgbaColor = `rgba(${r}, ${g}, ${b}, ${alpha.toFixed(2)})`; + + return `background-color: ${fallbackColor}; background-color: ${rgbaColor};`; +} + +/** + * Recalculate colors for a single benchmark column across all rows, based on the new score distribution + * This is used when excluded benchmarks change the min/max ranges + * + * @param {Array} rowData - Array of model row data + * @param {string} benchmarkId - Benchmark identifier to recalculate colors for + * @param {Map} hierarchyMap - Benchmark hierarchy map + * @returns {void} Modifies rowData in place + */ +function recalculateColorsForBenchmark(rowData, benchmarkId, hierarchyMap) { + // Collect all values for this benchmark across all models + const values = []; + rowData.forEach(row => { + if (row[benchmarkId] && row[benchmarkId].value !== 'X' && row[benchmarkId].value !== null) { + const val = row[benchmarkId].value; + const numVal = typeof val === 'string' ? 
parseFloat(val) : val; + if (!isNaN(numVal)) { + values.push(numVal); + } + } + }); + + if (values.length === 0) { + return; // No valid scores to calculate colors for + } + + const minValue = Math.min(...values); + const maxValue = Math.max(...values); + + // Determine root parent for color palette selection + // Find the root parent by traversing up the hierarchy + let rootParent = null; + let currentId = benchmarkId; + const visited = new Set(); + + // Traverse up the hierarchy to find root + while (currentId && !visited.has(currentId)) { + visited.add(currentId); + + // Check if currentId is a root (no parent in hierarchy) + let hasParent = false; + for (const [parentId, children] of hierarchyMap.entries()) { + if (children.includes(currentId)) { + currentId = parentId; + hasParent = true; + break; + } + } + + if (!hasParent) { + // This is a root + rootParent = currentId; + break; + } + } + + // Fallback: rootParent is still unset when the walk above stopped on a falsy or already-visited id (e.g. a cycle in hierarchyMap), so infer it from benchmarkId + if (!rootParent) { + // Only benchmarkId itself is checked for 'engineering'; ancestors are not inspected here + const checkId = benchmarkId.toLowerCase(); + if (checkId.includes('engineering')) { + rootParent = 'engineering_vision_v0'; + } else { + // Default to neural for non-engineering benchmarks + rootParent = 'neural_vision_v0'; + } + } + + // Recalculate colors for each model + rowData.forEach(row => { + if (row[benchmarkId] && row[benchmarkId].value !== 'X' && row[benchmarkId].value !== null) { + const val = row[benchmarkId].value; + const numVal = typeof val === 'string' ? 
parseFloat(val) : val; + if (!isNaN(numVal)) { + const color = calculateRepresentativeColor(numVal, minValue, maxValue, rootParent); + row[benchmarkId].color = color; + } + } + }); +} + +// Export functions +window.LeaderboardColorUtils = { + calculateRepresentativeColor, + recalculateColorsForBenchmark +}; + diff --git a/static/benchmarks/js/leaderboard/utilities/hierarchy-utils.js b/static/benchmarks/js/leaderboard/utilities/hierarchy-utils.js index 169ac5bc3..fdef3831f 100644 --- a/static/benchmarks/js/leaderboard/utilities/hierarchy-utils.js +++ b/static/benchmarks/js/leaderboard/utilities/hierarchy-utils.js @@ -141,6 +141,15 @@ function isParentBenchmark(benchmarkId, hierarchyMap) { return children.length > 0; } +// Check if a benchmark subtree is fully excluded +// A benchmark is fully excluded if it is in the excluded set, or if it has children and every child is itself fully excluded (recursive; a leaf that is not in the set is never excluded). NOTE(review): unlike getDepthLevel there is no visited-set here, so a cyclic hierarchyMap would recurse without bound +function isFullyExcluded(benchmarkId, hierarchyMap, excludedSet) { + if (excludedSet.has(benchmarkId)) return true; + const children = hierarchyMap.get(benchmarkId) || []; + if (children.length === 0) return excludedSet.has(benchmarkId); + return children.every(child => isFullyExcluded(child, hierarchyMap, excludedSet)); +} + // Get depth level of a benchmark in the hierarchy function getDepthLevel(benchmarkId, hierarchyMap, visited = new Set()) { if (visited.has(benchmarkId)) return 0; @@ -253,6 +262,7 @@ window.LeaderboardHierarchyUtils = { findParent, isLeafBenchmark, isParentBenchmark, + isFullyExcluded, getDepthLevel, getBenchmarksAtDepth, getBenchmarkPath,