Add Mistral Large 3 to nightly CI tests #14459
@@ -0,0 +1,105 @@
import os
import unittest
from types import SimpleNamespace

from nightly_utils import NightlyBenchmarkRunner

from sglang.srt.utils import kill_process_tree
from sglang.test.ci.ci_register import register_cuda_ci
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    _parse_int_list_env,
    popen_launch_server,
)

register_cuda_ci(est_time=600, suite="nightly-8-gpu-b200", nightly=True)

MISTRAL_LARGE3_MODEL_PATH = "mistralai/Mistral-Large-3-675B-Instruct-2512"
PROFILE_DIR = "performance_profiles_mistral_large3"


class TestNightlyMistralLarge3Performance(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Set environment variable to disable JIT DeepGemm
        os.environ["SGLANG_ENABLE_JIT_DEEPGEMM"] = "0"

        cls.model = MISTRAL_LARGE3_MODEL_PATH
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.batch_sizes = [1, 1, 8, 16, 64]
        cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096"))
        cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_OUTPUT_LENS", "512"))

        # Mistral-Large-3-675B requires TP=8 and the trtllm_mla attention backend
        cls.other_args = [
            "--tp",
            "8",
            "--attention-backend",
            "trtllm_mla",
            "--model-loader-extra-config",
            '{"enable_multithread_load": true}',
            "--chat-template",
            "mistral",
        ]

        cls.runner = NightlyBenchmarkRunner(PROFILE_DIR, cls.__name__, cls.base_url)
        cls.runner.setup_profile_directory()

    @classmethod
    def tearDownClass(cls):
        # Clean up environment variable
        if "SGLANG_ENABLE_JIT_DEEPGEMM" in os.environ:
            del os.environ["SGLANG_ENABLE_JIT_DEEPGEMM"]

    def test_bench_one_batch(self):
Collaborator
Can we also add an accuracy test for it?

Author
Yes, I can do that. What score threshold should we expect? How do we determine this value?

Author
Just added a test_accuracy_mgsm method using the same eval framework as the other text models. The initial threshold is set to 0.90 as a placeholder; I will calibrate it after observing the model's actual performance.

Author
MGSM accuracy for mistralai/Mistral-Large-3-675B-Instruct-2512: 0.972
        results, success = self.runner.run_benchmark_for_model(
            model_path=self.model,
            batch_sizes=self.batch_sizes,
            input_lens=self.input_lens,
            output_lens=self.output_lens,
            other_args=self.other_args,
        )

        self.runner.add_report(results)
        self.runner.write_final_report()

        if not success:
            raise AssertionError(
                f"Benchmark failed for {self.model}. Check the logs for details."
            )

    def test_accuracy_mgsm(self):
        """Run MGSM accuracy evaluation for Mistral Large 3."""
        process = popen_launch_server(
            model=self.model,
            base_url=self.base_url,
            other_args=self.other_args,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
        )

        try:
            args = SimpleNamespace(
                base_url=self.base_url,
                model=self.model,
                eval_name="mgsm_en",
                num_examples=None,
                num_threads=1024,
            )
            metrics = run_eval(args)
            print(f"MGSM accuracy for {self.model}: {metrics['score']}")

            # Placeholder threshold - adjust after first successful run
            expected_threshold = 0.90
            self.assertGreaterEqual(
                metrics["score"],
                expected_threshold,
                f"MGSM accuracy {metrics['score']} below threshold {expected_threshold}",
            )
        finally:
            kill_process_tree(process.pid)


if __name__ == "__main__":
    unittest.main()
Collaborator
Why do we need this change for the Mistral Large 3 model?

Author
It's a general bug fix discovered while debugging the Mistral test. When model weights are corrupted or missing, all 8 TP ranks detect the issue simultaneously and race to delete the cache directory. One rank succeeds; the others fail with "No such file or directory" errors. This was causing CI failures.

The fix adds a file lock so that only one rank performs validation and cleanup at a time. Other models simply happened to have complete caches, so the bug was never triggered.
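For reference, here is a minimal sketch of the locking pattern described in that comment. It is not the actual patch; it assumes the third-party filelock package is available, and the helper names (validate_and_clean_cache, _cache_is_complete) are hypothetical.

import os
import shutil

from filelock import FileLock  # assumption: the `filelock` package is installed


def validate_and_clean_cache(cache_dir: str) -> None:
    """Validate a weight-cache directory and remove it if incomplete, safely across TP ranks."""
    lock_path = cache_dir + ".lock"
    # All ranks funnel through one inter-process lock, so only one rank at a time
    # inspects the cache and (if needed) deletes it; the others wait instead of
    # racing on shutil.rmtree and hitting "No such file or directory".
    with FileLock(lock_path):
        if os.path.isdir(cache_dir) and not _cache_is_complete(cache_dir):
            shutil.rmtree(cache_dir, ignore_errors=True)


def _cache_is_complete(cache_dir: str) -> bool:
    # Hypothetical completeness check; stands in for whatever validation the loader does.
    return bool(os.listdir(cache_dir))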