RouteWorks
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 0 deletions b/‎.gitignore‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎cached_results/claude-3-haiku-20240307.jsonl‎
Lines changed: 809 additions & 202 deletions b/‎cached_results/claude-3-haiku-20240307.jsonl‎
Lines changed: 809 additions & 202 deletions
diff --git a/‎cached_results/gemini-2.0-flash-001.jsonl‎
Lines changed: 809 additions & 202 deletions b/‎cached_results/gemini-2.0-flash-001.jsonl‎
Lines changed: 809 additions & 202 deletions
diff --git a/‎cached_results/gpt-4o-mini.jsonl‎
Lines changed: 8400 additions & 203 deletions b/‎cached_results/gpt-4o-mini.jsonl‎
Lines changed: 8400 additions & 203 deletions
diff --git a/‎cached_results/mistral-medium.jsonl‎
Lines changed: 0 additions & 202 deletions b/‎cached_results/mistral-medium.jsonl‎
Lines changed: 0 additions & 202 deletions
diff --git a/‎llm_evaluation/run.py‎
Lines changed: 50 additions & 20 deletions b/‎llm_evaluation/run.py‎
Lines changed: 50 additions & 20 deletions
diff --git a/‎router_inference/config/glm-4-air-router.json‎
Lines changed: 8 additions & 0 deletions b/‎router_inference/config/glm-4-air-router.json‎
Lines changed: 8 additions & 0 deletions
@@ -14,6 +14,8 @@ dataset/
 *.log
 logs/
 /cached_results2/
+.pr_worktrees/
+pr_evaluations/
 
 # Environment files
 .venv
 
@@ -30,6 +30,15 @@
 # Add parent directory to path for imports
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
 
+# Load environment variables from .env file if it exists
+try:
+    from dotenv import load_dotenv
+
+    load_dotenv()
+except ImportError:
+    # dotenv is optional: if it is not installed, the .env file is simply not loaded
+    pass
+
 from universal_model_names import ModelNameManager
 
 # Import model evaluator from current directory
@@ -119,39 +128,58 @@ def save_predictions_file(predictions: List[Dict[str, Any]], router_name: str) -
 
 def load_ground_truth_dataset(split: str) -> Dict[str, Dict[str, Any]]:
     """
-    Load ground truth dataset based on split from local disk.
+    Load ground truth dataset based on split from local disk or private repo.
+
+    For "full" split: If HF_TOKEN or HUGGING_FACE_HUB_TOKEN is set, tries RouteWorks/RouterEvalBenchmark
+    (private repo with answers) first. If no token is set or that load fails, loads from local disk (public dataset without answers).
 
     Args:
         split: Dataset split ("sub_10" for testing or "full" for submission)
 
     Returns:
         Dictionary mapping global_index to ground truth data
     """
-    from datasets import load_from_disk  # type: ignore[import-not-found,import-untyped]
+    from datasets import load_dataset, load_from_disk  # type: ignore[import-not-found,import-untyped]
     import pandas as pd  # type: ignore[import-untyped]
 
     if split not in ["sub_10", "full"]:
         raise ValueError(f"Invalid split: {split}. Must be 'sub_10' or 'full'")
 
-    logger.info(f"Loading ground truth dataset (split: {split}) from local disk...")
+    router_eval_bench_df = None
+    hf_token = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_HUB_TOKEN")
 
-    # Load the RouterArena dataset from local disk
-    dataset_path = "./dataset/routerarena"
-    if split == "sub_10":
-        dataset_path = "./dataset/routerarena_10"
-    if not os.path.exists(dataset_path):
-        raise FileNotFoundError(
-            f"Dataset not found at {dataset_path}. "
-            f"Please run the following command to download the dataset: python scripts/process_datasets/prep_datasets.py"
-        )
-
-    router_arena_dataset = load_from_disk(dataset_path)
+    # For "full" split, try private repo first if token is available
+    if split == "full" and hf_token:
+        logger.info("Loading full dataset with answers from private repo...")
+        try:
+            router_arena_dataset = load_dataset(
+                "RouteWorks/RouterEvalBenchmark",
+                split="full",
+                token=hf_token,
+            )
+            router_eval_bench_df = pd.DataFrame(router_arena_dataset)
+            logger.info("Successfully loaded from private repo.")
+        except Exception as e:
+            logger.warning(
+                f"Could not load from private repo: {e}. Falling back to local dataset."
+            )
 
-    router_eval_bench_df = pd.DataFrame(router_arena_dataset)
+    # Load from local disk if not already loaded
+    if router_eval_bench_df is None:
+        dataset_path = (
+            "./dataset/routerarena_10" if split == "sub_10" else "./dataset/routerarena"
+        )
+        if not os.path.exists(dataset_path):
+            raise FileNotFoundError(
+                f"Dataset not found at {dataset_path}. "
+                f"Please run: python scripts/process_datasets/prep_datasets.py"
+            )
+        logger.info(f"Loading dataset from {dataset_path}...")
+        router_arena_dataset = load_from_disk(dataset_path)
+        router_eval_bench_df = pd.DataFrame(router_arena_dataset)
 
-    # Check if we have answers for the "full" split
+    # Verify answers exist for "full" split
     if split == "full":
-        # Sample a few rows to check if answers are empty
         sample_size = min(100, len(router_eval_bench_df))
         sample_answers = router_eval_bench_df.head(sample_size)["Answer"]
         has_answers = any(
@@ -166,9 +194,11 @@ def load_ground_truth_dataset(split: str) -> Dict[str, Dict[str, Any]]:
             logger.error("")
             logger.error("To submit predictions for the full dataset evaluation:")
             logger.error("1. Generate predictions for the full dataset")
-            logger.error("2. Create an issue in the RouterArena repository")
-            logger.error("3. Upload your predictions file")
-            logger.error("4. We will run the official evaluation for you")
+            logger.error("2. Create a pull request to the RouterArena repository")
+            logger.error("3. Include your predictions file in the PR")
+            logger.error(
+                "4. The official evaluation would be automatically conducted for you"
+            )
             logger.error("=" * 80)
             raise ValueError(
                 "The 'full' split does not have ground truth answers. "
 
@@ -0,0 +1,8 @@
+{
+    "pipeline_params": {
+        "router_name": "glm-4-air-router",
+        "models": [
+            "glm-4-air"
+        ]
+    }
+}