Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 3 additions & 9 deletions .env
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,9 @@ OPENAI_API_KEY='<YOUR_KEY>'
HF_ACCESS_TOKEN='<YOUR_KEY>'
HF_TOKEN = '<YOUR_KEY>'

# Volcengine API Configuration
VOLCENGINE_ACCESS_TOKEN='<YOUR_KEY>'

# DeepSeek API Configuration
DEEPSEEK_API_KEY='<YOUR_KEY>'

# NotDiamond API Configuration
NOTDIAMOND_API_KEY='<YOUR_KEY>'

# Anthropic API Configuration
ANTHROPIC_API_KEY='<YOUR_KEY>'

Expand All @@ -35,9 +29,6 @@ AZURE_API_KEY = "<YOUR_KEY>"
# Replicate API Configuration
REPLICATE_API_TOKEN = "<YOUR_KEY>"

# Clarifai API Configuration
CLARIFAI_PAT = "<YOUR_KEY>"

# AWS
AWS_ACCESS_KEY_ID = "<YOUR_KEY>"
AWS_SECRET_ACCESS_KEY = "<YOUR_KEY>"
Expand All @@ -46,3 +37,6 @@ AWS_BEARER_TOKEN_BEDROCK = "<YOUR_KEY>="

# xAI
XAI_API_KEY = "xai-<YOUR_KEY>"

# Zhipu AI API Configuration
ZHIPU_API_KEY = "<YOUR_KEY>"
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ __pycache__/
outputs/
results/

# datasets
dataset/

# Logs
*.log
logs/
Expand Down
285 changes: 176 additions & 109 deletions README.md

Large diffs are not rendered by default.

202 changes: 202 additions & 0 deletions cached_results/claude-3-haiku-20240307.jsonl

Large diffs are not rendered by default.

202 changes: 202 additions & 0 deletions cached_results/gemini-2.0-flash-001.jsonl

Large diffs are not rendered by default.

203 changes: 203 additions & 0 deletions cached_results/gpt-4o-mini.jsonl

Large diffs are not rendered by default.

2 changes: 0 additions & 2 deletions cached_results/gpt-4o.jsonl

This file was deleted.

202 changes: 202 additions & 0 deletions cached_results/mistral-medium.jsonl

Large diffs are not rendered by default.

Binary file added images/leaderboard.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/routerarena_diagram.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/routerarena_logo_8.jpeg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
33 changes: 27 additions & 6 deletions llm_evaluation/evaluate_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,11 +98,11 @@ def load_all_data(self):
print("Loading ground truth data...")
try:
# Load data directly without LiveCodeBench dependency
from datasets import load_dataset
from datasets import load_from_disk
import pandas as pd

# Load the router eval benchmark dataset
router_eval_bench = load_dataset("louielu02/RouterEvalBenchmark")["full"]
router_eval_bench = load_from_disk("./dataset/routerarena")
router_eval_bench_df = pd.DataFrame(router_eval_bench)

# Convert to the expected format
Expand Down Expand Up @@ -143,11 +143,32 @@ def load_dataset_configs(self):

def load_cost_config(self):
"""Load cost configuration from model_cost/cost.json"""
cost_file = "/home/jy101/yifan/RouterArena/model_cost/cost.json"
# Try multiple possible paths for cost file
possible_paths = [
"./model_cost/cost.json",
"../model_cost/cost.json",
"model_cost/cost.json",
]

cost_file = None
for path in possible_paths:
if os.path.exists(path):
cost_file = path
break

if not cost_file:
print(
f"Warning: Could not find cost configuration file. Tried: {possible_paths}"
)
self.cost_config = {}
return

try:
with open(cost_file, "r") as f:
self.cost_config = json.load(f)
print(f"Loaded cost configuration for {len(self.cost_config)} models")
print(
f"Loaded cost configuration for {len(self.cost_config)} models from {cost_file}"
)
except Exception as e:
print(f"Warning: Could not load cost configuration from {cost_file}: {e}")
self.cost_config = {}
Expand Down Expand Up @@ -418,7 +439,7 @@ def evaluate_model(self, model_name: str, rerun=False) -> Dict[str, Any]:
print(f"Evaluation completed. Evaluated {evaluated_count} new entries.")
return self._compile_final_results(universal_model_name, cached_results)

def _get_ground_truth(self, global_index: str, dataset_name: str) -> Optional[str]:
def _get_ground_truth(self, global_index: str, dataset_name: str) -> Optional[Any]:
"""Get ground truth for a specific global_index from the dataset."""
# Load dataset if not already loaded
if self.all_data is None:
Expand Down Expand Up @@ -461,7 +482,7 @@ def _get_ground_truth(self, global_index: str, dataset_name: str) -> Optional[st
return None

def _evaluate_single_entry(
self, generated_answer: str, ground_truth: str, scorer, dataset_name: str
self, generated_answer: str, ground_truth: Any, scorer, dataset_name: str
) -> tuple:
"""Evaluate a single entry using the appropriate scorer."""
try:
Expand Down
2 changes: 1 addition & 1 deletion llm_evaluation/livecodebench_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -615,7 +615,7 @@ def restore_original_references():
setattr(shutil, func_name, original_func)

# Restore 'subprocess' functions
setattr(subprocess, "Popen", originals["subprocess"]["Popen"]) # type: ignore[misc]
setattr(subprocess, "Popen", originals["subprocess"]["Popen"])

# Restore sys modules
for module_name, original_module in originals["sys_modules"].items():
Expand Down
Loading