From 7bbbbc2416067c3b4f92895c15dff223555ccfed Mon Sep 17 00:00:00 2001
From: jiarong0907 <xingjiarong0907@gmail.com>
Date: Fri, 13 Feb 2026 23:57:44 -0600
Subject: [PATCH 1/3] test comment triggered CI

---
 router_inference/predictions/vllm-sr.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/router_inference/predictions/vllm-sr.json b/router_inference/predictions/vllm-sr.json
index deaceee..272d4c6 100644
--- a/router_inference/predictions/vllm-sr.json
+++ b/router_inference/predictions/vllm-sr.json
@@ -2,7 +2,7 @@
   {
     "global index": "ArcMMLU_655",
     "prompt": "Please read the following multiple-choice questions and provide the most likely correct answer based on the options given.\n\nContext: None\n\nQuestion: The average life expectancy of the most of firm is around ( ) years.\n\nOptions: \nA. 10\nB. 20\nC. 30\nD. 40\n\n\nProvide the correct letter choice in \\boxed{X}, where X is the correct letter choice. Keep the explanation or feedback within 3 sentences.",
-    "prediction": "gemini-2.0-flash-001",
+    "prediction": "gpt-4o-mini",
     "generated_result": {
       "generated_answer": "The correct answer is difficult to know without more context about what kind of \"firm\" is being referred to. However, a commonly cited statistic suggests that the average lifespan of S&P 500 companies is decreasing, with estimates around 20 years, so that makes B plausible. Therefore, the best answer among the options is:\n\n\\boxed{B}\n",
       "success": true,

From 001bde8c63c88d4d48b52fbc55036f50782b620f Mon Sep 17 00:00:00 2001
From: jiarong0907 <xingjiarong0907@gmail.com>
Date: Sat, 14 Feb 2026 00:07:17 -0600
Subject: [PATCH 2/3] change the robustness predictions file

---
 router_inference/predictions/vllm-sr-robustness.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/router_inference/predictions/vllm-sr-robustness.json b/router_inference/predictions/vllm-sr-robustness.json
index 5b3f2b3..d984a1f 100644
--- a/router_inference/predictions/vllm-sr-robustness.json
+++ b/router_inference/predictions/vllm-sr-robustness.json
@@ -2,7 +2,7 @@
   {
     "global index": "AIME_112",
     "prompt": "Deliver a thorough resolution of the ensuing arithmetic puzzle in a sequential manner.\n\nScenario: Absent\n\nInquiry: With an inexhaustible stockpile of coins valued at 1-cent, 10-cents, and 25-cents, Silas is on a quest to assemble a coin collection summing to $ N $ cents, where $ N $ is a positive integer figure. He employs what's known as the **greedy method**, opting each time for the highest denomination that doesn't push his collection's sum beyond $ N $. Illustratively, for 42 cents, Silas opts for a 25-cent coin, followed by a 10-cent, and then seven 1-cent coins. However, this assortment of nine coins surpasses the required count to reach 42 cents; in fact, selecting four 10-cent coins and two 1-cent coins sums up the same with only six coins.\n\nTypically, the greedy method prevails for a specific $ N $ if no alternate assemblage of 1-cent, 10-cent, and 25-cent coins sums to $ N $ cents using a smaller number of coins than the greedy method's set. Ascertain the quantity of values for $ N $ spanning from 1 to 1000 inclusive where the greedy method is successful.\n\nSubmit your conclusive response in the format \\boxed{{}}, with the precise mathematical expression or numeral encased within the braces. For instance: \\boxed{{42}}. Ensure your elucidation is explicit, succinct, and confined to three sentences.",
-    "prediction": "gemini-2.0-flash-001",
+    "prediction": "gpt-4o-mini",
     "generated_result": null,
     "cost": null,
     "accuracy": null,

From 18ab1b7120794d7d6bbb74c4bc5757176eb5b870 Mon Sep 17 00:00:00 2001
From: jiarong0907 <xingjiarong0907@gmail.com>
Date: Sat, 14 Feb 2026 00:15:58 -0600
Subject: [PATCH 3/3] Publish a PR check run for /evaluate runs

---
 .github/workflows/pr-evaluation.yml | 56 +++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/.github/workflows/pr-evaluation.yml b/.github/workflows/pr-evaluation.yml
index cbc05e4..b34998b 100644
--- a/.github/workflows/pr-evaluation.yml
+++ b/.github/workflows/pr-evaluation.yml
@@ -17,6 +17,7 @@ jobs:
       )
     runs-on: self-hosted
     permissions:
+      checks: write
       contents: read
       pull-requests: write
     steps:
@@ -46,6 +47,26 @@ jobs:
             core.setOutput('base_sha', pr.data.base.sha);
             core.setOutput('number', pr.data.number);
 
+      # Open a check run on the PR head commit so the comment-triggered evaluation shows up on the PR.
+      - name: Create in-progress PR check
+        id: checkrun
+        uses: actions/github-script@v7
+        env:
+          HEAD_SHA: ${{ steps.pr.outputs.head_sha }}
+        with:
+          script: |
+            const result = await github.rest.checks.create({
+              ...context.repo,
+              name: 'Router Submission Evaluation (/evaluate)',
+              head_sha: process.env.HEAD_SHA, // passed via env rather than interpolated into the script text
+              status: 'in_progress',
+              started_at: new Date().toISOString(),
+              // context.serverUrl keeps the link valid on GitHub Enterprise Server as well as github.com.
+              details_url: `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`,
+              output: { title: 'Evaluation started', summary: 'Router evaluation was triggered via `/evaluate` and is now running.' }
+            });
+            core.setOutput('id', String(result.data.id));
+
       - name: Checkout base repository (for evaluation scripts)
         uses: actions/checkout@v4
         with:
@@ -264,3 +285,38 @@ jobs:
               comment_id: context.payload.comment.id,
               content: 'rocket'
             });
+
+      # Close the check run opened by the `checkrun` step; always() makes this run after failures and cancellations too.
+      - name: Complete PR check
+        if: ${{ always() && steps.checkrun.outputs.id != '' }}
+        uses: actions/github-script@v7
+        env:
+          CHECK_RUN_ID: ${{ steps.checkrun.outputs.id }}
+          DETECTED_ROUTER: ${{ steps.detect.outputs.router }}
+          DETECT_OUTCOME: ${{ steps.detect.outcome }}
+          EVALUATE_OUTCOME: ${{ steps.evaluate.outcome }}
+          JOB_STATUS: ${{ job.status }}
+        with:
+          script: |
+            // Defaults describe a clean evaluation; the branches below override them.
+            let conclusion = 'success', title = 'Evaluation completed', summary = 'Router evaluation finished successfully.';
+            if (process.env.JOB_STATUS === 'cancelled') {
+              conclusion = 'cancelled'; // a cancelled run is not an evaluation failure
+              title = 'Evaluation cancelled';
+              summary = 'The workflow run was cancelled before the evaluation finished.';
+            } else if (!process.env.DETECTED_ROUTER) {
+              const detectOk = process.env.DETECT_OUTCOME === 'success'; // detect ran fine but reported no router file
+              conclusion = detectOk ? 'neutral' : 'failure';
+              title = detectOk ? 'No router file detected' : 'Evaluation setup failed';
+              summary = detectOk ? 'No changed prediction file was detected for this PR, so evaluation was skipped.' : 'Failed while detecting prediction files for this PR.';
+            } else if (process.env.EVALUATE_OUTCOME !== 'success') {
+              conclusion = 'failure';
+              title = 'Evaluation failed';
+              summary = 'The evaluation step failed. Check this workflow run logs for details.';
+            }
+            await github.rest.checks.update({
+              ...context.repo,
+              check_run_id: Number(process.env.CHECK_RUN_ID), // passed via env rather than interpolated into the script text
+              status: 'completed', conclusion, completed_at: new Date().toISOString(),
+              output: { title, summary }
+            });