RouteWorks · jiarong0907 · Nov 3, 2025 · Oct 27, 2025 · Oct 27, 2025 · Oct 29, 2025
diff --git a/.env b/.env
@@ -5,15 +5,9 @@ OPENAI_API_KEY='<YOUR_KEY>'
 HF_ACCESS_TOKEN='<YOUR_KEY>'
 HF_TOKEN = '<YOUR_KEY>'
 
-# Volcengine API Configuration
-VOLCENGINE_ACCESS_TOKEN='<YOUR_KEY>'
-
 # DeepSeek API Configuration
 DEEPSEEK_API_KEY='<YOUR_KEY>'
 
-# NotDiamond API Configuration
-NOTDIAMOND_API_KEY='<YOUR_KEY>'
-
 # Anthropic API Configuration
 ANTHROPIC_API_KEY='<YOUR_KEY>'
 
@@ -35,9 +29,6 @@ AZURE_API_KEY = "<YOUR_KEY>"
 # Replicate API Configuration
 REPLICATE_API_TOKEN = "<YOUR_KEY>"
 
-# Clarifai API Configuration
-CLARIFAI_PAT = "<YOUR_KEY>"
-
 # AWS
 AWS_ACCESS_KEY_ID = "<YOUR_KEY>"
 AWS_SECRET_ACCESS_KEY = "<YOUR_KEY>"
@@ -46,3 +37,6 @@ AWS_BEARER_TOKEN_BEDROCK = "<YOUR_KEY>="
 
 # xAI
 XAI_API_KEY = "xai-<YOUR_KEY>"
+
+# Zhipu API Configuration
+ZHIPU_API_KEY = "<YOUR_KEY>"
diff --git a/.gitignore b/.gitignore
@@ -7,6 +7,9 @@ __pycache__/
 outputs/
 results/
 
+# Datasets
+dataset/
+
 # Logs
 *.log
 logs/

diff --git a/README.md b/README.md
diff --git a/cached_results/claude-3-haiku-20240307.jsonl b/cached_results/claude-3-haiku-20240307.jsonl
diff --git a/cached_results/gemini-2.0-flash-001.jsonl b/cached_results/gemini-2.0-flash-001.jsonl
diff --git a/cached_results/gpt-4o-mini.jsonl b/cached_results/gpt-4o-mini.jsonl
diff --git a/cached_results/gpt-4o.jsonl b/cached_results/gpt-4o.jsonl
diff --git a/cached_results/mistral-medium.jsonl b/cached_results/mistral-medium.jsonl
diff --git a/images/leaderboard.png b/images/leaderboard.png
diff --git a/images/routerarena_diagram.png b/images/routerarena_diagram.png
diff --git a/images/routerarena_logo_8.jpeg b/images/routerarena_logo_8.jpeg
diff --git a/llm_evaluation/evaluate_models.py b/llm_evaluation/evaluate_models.py
@@ -98,11 +98,11 @@ def load_all_data(self):
         print("Loading ground truth data...")
         try:
             # Load data directly without LiveCodeBench dependency
-            from datasets import load_dataset
+            from datasets import load_from_disk
             import pandas as pd
 
             # Load the router eval benchmark dataset
-            router_eval_bench = load_dataset("louielu02/RouterEvalBenchmark")["full"]
+            router_eval_bench = load_from_disk("./dataset/routerarena")
             router_eval_bench_df = pd.DataFrame(router_eval_bench)
 
             # Convert to the expected format
@@ -143,11 +143,32 @@ def load_dataset_configs(self):
 
     def load_cost_config(self):
         """Load cost configuration from model_cost/cost.json"""
-        cost_file = "/home/jy101/yifan/RouterArena/model_cost/cost.json"
+        # Try multiple possible paths for cost file
+        possible_paths = [
+            "./model_cost/cost.json",
+            "../model_cost/cost.json",
+            "model_cost/cost.json",
+        ]
+
+        cost_file = None
+        for path in possible_paths:
+            if os.path.exists(path):
+                cost_file = path
+                break
+
+        if not cost_file:
+            print(
+                f"Warning: Could not find cost configuration file. Tried: {possible_paths}"
+            )
+            self.cost_config = {}
+            return
+
         try:
             with open(cost_file, "r") as f:
                 self.cost_config = json.load(f)
-            print(f"Loaded cost configuration for {len(self.cost_config)} models")
+            print(
+                f"Loaded cost configuration for {len(self.cost_config)} models from {cost_file}"
+            )
         except Exception as e:
             print(f"Warning: Could not load cost configuration from {cost_file}: {e}")
             self.cost_config = {}
@@ -418,7 +439,7 @@ def evaluate_model(self, model_name: str, rerun=False) -> Dict[str, Any]:
         print(f"Evaluation completed. Evaluated {evaluated_count} new entries.")
         return self._compile_final_results(universal_model_name, cached_results)
 
-    def _get_ground_truth(self, global_index: str, dataset_name: str) -> Optional[str]:
+    def _get_ground_truth(self, global_index: str, dataset_name: str) -> Optional[Any]:
         """Get ground truth for a specific global_index from the dataset."""
         # Load dataset if not already loaded
         if self.all_data is None:
@@ -461,7 +482,7 @@ def _get_ground_truth(self, global_index: str, dataset_name: str) -> Optional[st
         return None
 
     def _evaluate_single_entry(
-        self, generated_answer: str, ground_truth: str, scorer, dataset_name: str
+        self, generated_answer: str, ground_truth: Any, scorer, dataset_name: str
     ) -> tuple:
         """Evaluate a single entry using the appropriate scorer."""
         try:

diff --git a/llm_evaluation/livecodebench_util.py b/llm_evaluation/livecodebench_util.py
@@ -615,7 +615,7 @@ def restore_original_references():
         setattr(shutil, func_name, original_func)
 
     # Restore 'subprocess' functions
-    setattr(subprocess, "Popen", originals["subprocess"]["Popen"])  # type: ignore[misc]
+    setattr(subprocess, "Popen", originals["subprocess"]["Popen"])
 
     # Restore sys modules
     for module_name, original_module in originals["sys_modules"].items():