chore: add e2e test for reasoning_effort for gpt-oss model #3421
Merged

Changes shown from 2 of 7 commits:
bccf2bb  chore: add e2e test for reasoning_effort for gpt-oss model  (zhongdaor-nv)
a0eaef1  black  (zhongdaor-nv)
fb03309  copy DynamoFrontendProcess rather than import it  (zhongdaor-nv)
db92873  Merge branch 'main' into zhongdaor/test-gpt-oss-reasoning-effort  (zhongdaor-nv)
56115ff  black  (zhongdaor-nv)
d2628e3  Merge branch 'main' into zhongdaor/test-gpt-oss-reasoning-effort  (zhongdaor-nv)
fb4c278  Merge branch 'main' into zhongdaor/test-gpt-oss-reasoning-effort  (zhongdaor-nv)
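For reference before the full listing: the knob this test exercises is the reasoning_effort value passed through chat_template_args on an otherwise ordinary chat completion request to the Dynamo frontend. Below is a minimal sketch of such a request, reusing the endpoint and field names from the diff; the model id is a placeholder for whatever tests.utils.constants.GPT_OSS resolves to, and a frontend plus GPT-OSS worker are assumed to already be serving.

import requests

# Hypothetical standalone request mirroring the payload built in the test below.
payload = {
    "model": "<GPT_OSS model id>",  # placeholder; the test uses tests.utils.constants.GPT_OSS
    "messages": [{"role": "user", "content": "Outline a Mars habitat design."}],
    "max_tokens": 2000,
    # "low" and "high" are the two values the test compares.
    "chat_template_args": {"reasoning_effort": "high"},
}

response = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json=payload,
    timeout=180,
)
print(response.json().get("usage", {}).get("reasoning_tokens"))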
tests/frontend/reasoning_effort/test_reasoning_effort.py (201 additions, 0 deletions)
@@ -0,0 +1,201 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""End-to-end tests covering reasoning effort behaviour."""

from __future__ import annotations

import logging
import os
import shutil
from typing import Any, Dict, Optional, Tuple

import pytest
import requests

from tests.fault_tolerance.test_request_migration import DynamoFrontendProcess
from tests.utils.constants import GPT_OSS
from tests.utils.managed_process import ManagedProcess
from tests.utils.payloads import check_models_api

logger = logging.getLogger(__name__)

REASONING_TEST_MODEL = GPT_OSS


class GPTOSSWorkerProcess(ManagedProcess):
    """Worker process for GPT-OSS model."""

    def __init__(self, request, worker_id: str = "reasoning-worker"):
        self.worker_id = worker_id

        command = [
            "python3",
            "-m",
            "dynamo.vllm",
            "--model",
            REASONING_TEST_MODEL,
            "--enforce-eager",
            "--dyn-tool-call-parser",
            "harmony",
            "--dyn-reasoning-parser",
            "gpt_oss",
        ]

        env = os.environ.copy()
        env["DYN_LOG"] = "debug"
        env["DYN_SYSTEM_ENABLED"] = "true"
        env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]'
        env["DYN_SYSTEM_PORT"] = "8083"

        log_dir = f"{request.node.name}_{worker_id}"

        try:
            shutil.rmtree(log_dir)
        except FileNotFoundError:
            pass

        super().__init__(
            command=command,
            env=env,
            health_check_urls=[
                ("http://localhost:8000/v1/models", check_models_api),
                ("http://localhost:8083/health", self.is_ready),
            ],
            timeout=300,
            display_output=True,
            terminate_existing=False,
            stragglers=["VLLM::EngineCore"],
            straggler_commands=["-m dynamo.vllm"],
            log_dir=log_dir,
        )

    def is_ready(self, response) -> bool:
        try:
            status = (response.json() or {}).get("status")
        except ValueError:
            logger.warning("%s health response is not valid JSON", self.worker_id)
            return False

        is_ready = status == "ready"
        if is_ready:
            logger.info("%s status is ready", self.worker_id)
        else:
            logger.warning("%s status is not ready: %s", self.worker_id, status)
        return is_ready


def _send_chat_request(
    prompt: str,
    reasoning_effort: str,
    timeout: int = 180,
) -> requests.Response:
    """Send a chat completion request with a specific reasoning effort."""

    payload: Dict[str, Any] = {
        "model": REASONING_TEST_MODEL,
        "messages": [
            {
                "role": "user",
                "content": prompt,
            }
        ],
        "max_tokens": 2000,
        "chat_template_args": {"reasoning_effort": reasoning_effort},
    }

    headers = {"Content-Type": "application/json"}

    logger.info(
        "Sending chat completion request with reasoning effort '%s'", reasoning_effort
    )
    response = requests.post(
        "http://localhost:8000/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    logger.info(
        "Received response for reasoning effort '%s' with status code %s",
        reasoning_effort,
        response.status_code,
    )
    return response


def _extract_reasoning_metrics(data: Dict[str, Any]) -> Tuple[str, Optional[int]]:
    """Return the reasoning content and optional reasoning token count from a response."""
    choices = data.get("choices") or []
    if not choices:
        raise AssertionError(f"Response missing choices: {data}")

    message = choices[0].get("message") or {}
    reasoning_text = (message.get("reasoning_content") or "").strip()

    usage_block = data.get("usage") or {}
    tokens = usage_block.get("reasoning_tokens")
    reasoning_tokens: Optional[int] = tokens if isinstance(tokens, int) else None

    if not reasoning_text:
        raise AssertionError(f"Response missing reasoning content: {data}")

    return reasoning_text, reasoning_tokens


def _validate_chat_response(response: requests.Response) -> Dict[str, Any]:
    """Ensure the chat completion response is well-formed and return its payload."""
    assert (
        response.status_code == 200
    ), f"Chat request failed with status {response.status_code}: {response.text}"
    payload = response.json()
    if "choices" not in payload:
        raise AssertionError(f"Chat response missing 'choices': {payload}")
    return payload


@pytest.mark.vllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.model(REASONING_TEST_MODEL)
def test_reasoning_effort(request, runtime_services, predownload_models) -> None:
    """High reasoning effort should yield more detailed reasoning than low effort."""

    prompt = (
        "Outline the critical steps and trade-offs when designing a Mars habitat. "
        "Focus on life-support, energy, and redundancy considerations."
    )

    with DynamoFrontendProcess(request):
        logger.info("Frontend started for reasoning effort test")

        with GPTOSSWorkerProcess(request):
            logger.info("Worker started for reasoning effort test")

            low_response = _send_chat_request(prompt, reasoning_effort="low")
            low_payload = _validate_chat_response(low_response)
            low_reasoning_text, low_reasoning_tokens = _extract_reasoning_metrics(
                low_payload
            )

            high_response = _send_chat_request(prompt, reasoning_effort="high")
            high_payload = _validate_chat_response(high_response)
            high_reasoning_text, high_reasoning_tokens = _extract_reasoning_metrics(
                high_payload
            )

            logger.info(
                "Low effort reasoning tokens: %s, High effort reasoning tokens: %s",
                low_reasoning_tokens,
                high_reasoning_tokens,
            )

            if low_reasoning_tokens is not None and high_reasoning_tokens is not None:
                assert high_reasoning_tokens >= low_reasoning_tokens, (
                    "Expected high reasoning effort to use at least as many reasoning tokens "
                    f"as low effort (low={low_reasoning_tokens}, high={high_reasoning_tokens})"
                )
            else:
                assert len(high_reasoning_text) > len(low_reasoning_text), (
                    "Expected high reasoning effort response to include longer reasoning "
                    "content than low effort"
                )
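As a follow-up note on the assertion logic: the comparison the test performs can also be replayed by hand on two parsed response bodies. The sketch below is illustrative only and not part of the diff; it assumes the caller supplies the JSON of a low-effort and a high-effort completion obtained as above, and the helper names are made up for this example.

from typing import Optional, Tuple

def reasoning_stats(data: dict) -> Tuple[int, Optional[int]]:
    """Return (length of reasoning_content, reasoning_tokens if reported)."""
    message = data["choices"][0]["message"]
    text = (message.get("reasoning_content") or "").strip()
    tokens = (data.get("usage") or {}).get("reasoning_tokens")
    return len(text), tokens if isinstance(tokens, int) else None

def compare_reasoning(low_data: dict, high_data: dict) -> None:
    # low_data / high_data: parsed bodies from reasoning_effort="low" and "high" runs.
    low_len, low_tokens = reasoning_stats(low_data)
    high_len, high_tokens = reasoning_stats(high_data)

    # Prefer reported token counts when the backend returns them for both responses;
    # otherwise fall back to comparing reasoning text length, as the test does.
    if low_tokens is not None and high_tokens is not None:
        assert high_tokens >= low_tokens
    else:
        assert high_len > low_len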