From 7bbbbc2416067c3b4f92895c15dff223555ccfed Mon Sep 17 00:00:00 2001 From: jiarong0907 Date: Fri, 13 Feb 2026 23:57:44 -0600 Subject: [PATCH 1/3] test comment triggered CI --- router_inference/predictions/vllm-sr.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/router_inference/predictions/vllm-sr.json b/router_inference/predictions/vllm-sr.json index deaceee..272d4c6 100644 --- a/router_inference/predictions/vllm-sr.json +++ b/router_inference/predictions/vllm-sr.json @@ -2,7 +2,7 @@ { "global index": "ArcMMLU_655", "prompt": "Please read the following multiple-choice questions and provide the most likely correct answer based on the options given.\n\nContext: None\n\nQuestion: The average life expectancy of the most of firm is around ( ) years.\n\nOptions: \nA. 10\nB. 20\nC. 30\nD. 40\n\n\nProvide the correct letter choice in \\boxed{X}, where X is the correct letter choice. Keep the explanation or feedback within 3 sentences.", - "prediction": "gemini-2.0-flash-001", + "prediction": "gpt-4o-mini", "generated_result": { "generated_answer": "The correct answer is difficult to know without more context about what kind of \"firm\" is being referred to. However, a commonly cited statistic suggests that the average lifespan of S&P 500 companies is decreasing, with estimates around 20 years, so that makes B plausible. 
Therefore, the best answer among the options is:\n\n\\boxed{B}\n", "success": true, From 001bde8c63c88d4d48b52fbc55036f50782b620f Mon Sep 17 00:00:00 2001 From: jiarong0907 Date: Sat, 14 Feb 2026 00:07:17 -0600 Subject: [PATCH 2/3] change the robust file --- router_inference/predictions/vllm-sr-robustness.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/router_inference/predictions/vllm-sr-robustness.json b/router_inference/predictions/vllm-sr-robustness.json index 5b3f2b3..d984a1f 100644 --- a/router_inference/predictions/vllm-sr-robustness.json +++ b/router_inference/predictions/vllm-sr-robustness.json @@ -2,7 +2,7 @@ { "global index": "AIME_112", "prompt": "Deliver a thorough resolution of the ensuing arithmetic puzzle in a sequential manner.\n\nScenario: Absent\n\nInquiry: With an inexhaustible stockpile of coins valued at 1-cent, 10-cents, and 25-cents, Silas is on a quest to assemble a coin collection summing to $ N $ cents, where $ N $ is a positive integer figure. He employs what's known as the **greedy method**, opting each time for the highest denomination that doesn't push his collection's sum beyond $ N $. Illustratively, for 42 cents, Silas opts for a 25-cent coin, followed by a 10-cent, and then seven 1-cent coins. However, this assortment of nine coins surpasses the required count to reach 42 cents; in fact, selecting four 10-cent coins and two 1-cent coins sums up the same with only six coins.\n\nTypically, the greedy method prevails for a specific $ N $ if no alternate assemblage of 1-cent, 10-cent, and 25-cent coins sums to $ N $ cents using a smaller number of coins than the greedy method's set. Ascertain the quantity of values for $ N $ spanning from 1 to 1000 inclusive where the greedy method is successful.\n\nSubmit your conclusive response in the format \\boxed{{}}, with the precise mathematical expression or numeral encased within the braces. For instance: \\boxed{{42}}. 
Ensure your elucidation is explicit, succinct, and confined to three sentences.", - "prediction": "gemini-2.0-flash-001", + "prediction": "gpt-4o-mini", "generated_result": null, "cost": null, "accuracy": null, From 18ab1b7120794d7d6bbb74c4bc5757176eb5b870 Mon Sep 17 00:00:00 2001 From: jiarong0907 Date: Sat, 14 Feb 2026 00:15:58 -0600 Subject: [PATCH 3/3] Trigger checks --- .github/workflows/pr-evaluation.yml | 56 +++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/.github/workflows/pr-evaluation.yml b/.github/workflows/pr-evaluation.yml index cbc05e4..b34998b 100644 --- a/.github/workflows/pr-evaluation.yml +++ b/.github/workflows/pr-evaluation.yml @@ -17,6 +17,7 @@ jobs: ) runs-on: self-hosted permissions: + checks: write contents: read pull-requests: write steps: @@ -46,6 +47,26 @@ jobs: core.setOutput('base_sha', pr.data.base.sha); core.setOutput('number', pr.data.number); + - name: Create in-progress PR check + id: checkrun + uses: actions/github-script@v7 + with: + script: | + const result = await github.rest.checks.create({ + owner: context.repo.owner, + repo: context.repo.repo, + name: 'Router Submission Evaluation (/evaluate)', + head_sha: '${{ steps.pr.outputs.head_sha }}', + status: 'in_progress', + started_at: new Date().toISOString(), + details_url: `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`, + output: { + title: 'Evaluation started', + summary: 'Router evaluation was triggered via `/evaluate` and is now running.' 
+ } + }); + core.setOutput('id', String(result.data.id)); + - name: Checkout base repository (for evaluation scripts) uses: actions/checkout@v4 with: @@ -264,3 +285,38 @@ jobs: comment_id: context.payload.comment.id, content: 'rocket' }); + + - name: Complete PR check + if: ${{ always() && steps.checkrun.outputs.id != '' }} + uses: actions/github-script@v7 + env: + DETECTED_ROUTER: ${{ steps.detect.outputs.router }} + DETECT_OUTCOME: ${{ steps.detect.outcome }} + EVALUATE_OUTCOME: ${{ steps.evaluate.outcome }} + with: + script: | + let conclusion = 'success'; + let title = 'Evaluation completed'; + let summary = 'Router evaluation finished successfully.'; + + if (!process.env.DETECTED_ROUTER) { + conclusion = process.env.DETECT_OUTCOME === 'success' ? 'neutral' : 'failure'; + title = process.env.DETECT_OUTCOME === 'success' ? 'No router file detected' : 'Evaluation setup failed'; + summary = process.env.DETECT_OUTCOME === 'success' + ? 'No changed prediction file was detected for this PR, so evaluation was skipped.' + : 'Failed while detecting prediction files for this PR.'; + } else if (process.env.EVALUATE_OUTCOME !== 'success') { + conclusion = 'failure'; + title = 'Evaluation failed'; + summary = `The evaluation step did not succeed (outcome: ${process.env.EVALUATE_OUTCOME || 'unknown'}). Check this workflow run's logs for details.`; + } + + await github.rest.checks.update({ + owner: context.repo.owner, + repo: context.repo.repo, + check_run_id: Number('${{ steps.checkrun.outputs.id }}'), + status: 'completed', + conclusion, + completed_at: new Date().toISOString(), + output: { title, summary } + });