-
Notifications
You must be signed in to change notification settings - Fork 24
Support reasoning summary models in AzureOpenAIEvalClient #216
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 8 commits
Commits
Show all changes
10 commits
Select commit
Hold shift + click to select a range
bfab9a4
feat: support reasoning summary in AzureOpenAIEvalClient
taniokay c6ae1c4
feat: add stack trace
taniokay 43977cf
fix: config propagation
taniokay 9516f23
Update src/langcheck/metrics/eval_clients/_openai.py
taniokay ce02ed2
fix: remove top_logprobs from dispatch args
taniokay 8409f03
add docstrings
taniokay 8d826a3
Update src/langcheck/metrics/eval_clients/_openai.py
taniokay 0ae2cbd
fix: indentation
taniokay 422884c
fix: only allow logprobs for non reasoning models
taniokay d98842e
bump the version to 0.10.0.dev12
taniokay File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,12 +2,14 @@ | |
|
|
||
| import asyncio | ||
| import os | ||
| import traceback | ||
| import warnings | ||
| from typing import Any, Literal | ||
|
|
||
| import torch | ||
| from openai import AsyncAzureOpenAI, AsyncOpenAI, AzureOpenAI, OpenAI | ||
| from openai.types.create_embedding_response import CreateEmbeddingResponse | ||
| from openai.types.shared_params import Reasoning, ReasoningEffort | ||
| from pydantic import BaseModel | ||
|
|
||
| from langcheck.metrics.eval_clients.eval_response import ( | ||
|
|
@@ -30,6 +32,10 @@ def __init__( | |
| openai_args: dict[str, str] | None = None, | ||
| *, | ||
| use_async: bool = False, | ||
| use_reasoning_summary: bool = False, | ||
|
taniokay marked this conversation as resolved.
|
||
| reasoning_effort: ReasoningEffort = "medium", | ||
| reasoning_summary: Literal["auto", "concise", "detailed"] | ||
| | None = "auto", | ||
| system_prompt: str | None = None, | ||
| extractor: Extractor | None = None, | ||
| ): | ||
|
|
@@ -44,6 +50,15 @@ def __init__( | |
| `client.chat.completions.create` function. | ||
| use_async: If True, the async client will be used. Defaults to | ||
| False. | ||
| use_reasoning_summary: Whether to use reasoning summary. | ||
| NOTE: Please make sure that the model and API version support | ||
| reasoning summary. | ||
| https://platform.openai.com/docs/models | ||
| https://learn.microsoft.com/en-us/azure/ai-foundry/openai/how-to/reasoning#api--feature-support | ||
| reasoning_effort: How many reasoning tokens to generate. | ||
| This is only used when `use_reasoning_summary` is True. | ||
| reasoning_summary: The level of detail of the summarizer. | ||
| This is only used when `use_reasoning_summary` is True. | ||
| system_prompt (Optional): The system prompt to use. If not provided, | ||
| no system prompt will be used. | ||
| extractor (Optional): The extractor to use. If not provided, the | ||
|
|
@@ -77,6 +92,13 @@ def __init__( | |
| self._openai_args = openai_args | ||
| self._system_prompt = system_prompt | ||
|
|
||
| self._reasoning_effort: ReasoningEffort = ( | ||
| reasoning_effort if use_reasoning_summary else None | ||
| ) | ||
| self._reasoning_summary: ( | ||
| Literal["auto", "concise", "detailed"] | None | ||
| ) = reasoning_summary if use_reasoning_summary else None | ||
|
|
||
| if extractor is None: | ||
| self._extractor = OpenAIExtractor( | ||
| openai_client=self._client, | ||
|
|
@@ -86,6 +108,41 @@ def __init__( | |
| else: | ||
| self._extractor = extractor | ||
|
|
||
| def _dispatch( | ||
| self, | ||
| messages: list[dict[str, str]], | ||
| seed: int | None = None, | ||
| config: dict[str, str] | None = None, | ||
| ) -> Any: | ||
| """Dispatch the API call to the OpenAI API.""" | ||
| if self._reasoning_summary is None: | ||
| return self._client.chat.completions.create( | ||
| messages=messages, # type: ignore | ||
| seed=seed, | ||
| **config, | ||
| ) | ||
| else: | ||
| # To use reasoning summary, we must use the Responses API | ||
| # instead of Chat Completions API. | ||
| # https://platform.openai.com/docs/guides/reasoning#reasoning-summaries | ||
|
|
||
| include = [] | ||
|
|
||
| reasoning: Reasoning = { | ||
| "effort": self._reasoning_effort, | ||
| "summary": self._reasoning_summary, | ||
| } | ||
|
|
||
| # seed and logprobs are not supported in responses API. | ||
| return self._client.responses.create( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

Ah nice, I was wondering if we set this properly to avoid logging prompts.
||
| input=messages, # type: ignore | ||
| include=include, | ||
| store=False, | ||
| reasoning=reasoning, | ||
| truncation="auto", | ||
| **config, | ||
| ) | ||
|
|
||
| def _call_api( | ||
| self, | ||
| prompts: list[str], | ||
|
|
@@ -100,7 +157,11 @@ def _call_api_with_exception_filter(model_input: dict[str, Any]) -> Any: | |
| if model_input is None: | ||
| return None | ||
| try: | ||
| return self._client.chat.completions.create(**model_input) | ||
| return self._dispatch( | ||
| model_input["messages"], | ||
| model_input["seed"], | ||
| config=config, | ||
| ) | ||
|
taniokay marked this conversation as resolved.
|
||
| except Exception as e: | ||
| return e | ||
|
|
||
|
|
@@ -114,7 +175,6 @@ def _call_api_with_exception_filter(model_input: dict[str, Any]) -> Any: | |
| "messages": system_message | ||
| + [{"role": "user", "content": prompt}], | ||
| "seed": i, | ||
| **config, | ||
| } | ||
| for i, prompt in enumerate(prompts) | ||
| ] | ||
|
|
@@ -124,8 +184,10 @@ def _call_api_with_exception_filter(model_input: dict[str, Any]) -> Any: | |
| async def _call_async_api() -> list[Any]: | ||
| responses = await asyncio.gather( | ||
| *map( | ||
| lambda model_input: self._client.chat.completions.create( | ||
| **model_input | ||
| lambda model_input: self._dispatch( | ||
| model_input["messages"], | ||
| model_input["seed"], | ||
| config=config, | ||
| ), | ||
|
taniokay marked this conversation as resolved.
|
||
| model_inputs, | ||
| ), | ||
|
|
@@ -150,6 +212,7 @@ async def _call_async_api() -> list[Any]: | |
| "OpenAI failed to return an assessment corresponding to " | ||
| f"{i}th prompt: {response}" | ||
| ) | ||
| traceback.print_exception(response) | ||
| responses[i] = None | ||
| return responses | ||
|
|
||
|
|
@@ -185,11 +248,39 @@ def get_text_responses( | |
| tqdm_description=tqdm_description, | ||
| system_prompt=self._system_prompt, | ||
| ) | ||
| response_texts = [ | ||
| response.choices[0].message.content if response else None | ||
| for response in responses | ||
| ] | ||
|
|
||
| response_texts = [] | ||
| for response in responses: | ||
| if not response: | ||
| response_texts.append(None) | ||
| continue | ||
| # Use the Responses API only when a reasoning summary is required. | ||
| # Otherwise, use the Chat Completions API. | ||
| if self._reasoning_summary is None: | ||
| content = response.choices[0].message.content | ||
| else: | ||
| content = None | ||
| summaries = [] | ||
|
|
||
| for output in response.output: | ||
| if hasattr(output, "summary"): | ||
| if output.summary == []: | ||
| print( | ||
| "Reasoning summary is empty. " | ||
| "This may happen even if model supports reasoning summary." | ||
| ) | ||
| continue | ||
|
|
||
| # Summary can be a list of summaries | ||
| summaries.extend([s.text for s in output.summary]) | ||
| elif hasattr(output, "content"): | ||
| content = output.content[0].text | ||
|
|
||
| if content is not None and summaries: | ||
| summaries_str = "\n\n".join(summaries) | ||
| content += f"\n\n**Reasoning Summary:**\n\n{summaries_str}" | ||
|
|
||
| response_texts.append(content) | ||
|
kennysong marked this conversation as resolved.
taniokay marked this conversation as resolved.
|
||
| # Token usage is not supported in OpenAIEvalClient | ||
| # If you need token usage, please use LiteLLMEvalClient instead. | ||
| return ResponsesWithMetadata(response_texts, None) | ||
|
|
@@ -425,6 +516,7 @@ def _call_api_with_exception_filter( | |
| "OpenAI failed to return an assessment corresponding to " | ||
| f"{i}th prompt: {response}" | ||
| ) | ||
| traceback.print_exception(response) | ||
| responses[i] = None | ||
|
|
||
| assessments = [ | ||
|
|
@@ -454,6 +546,10 @@ def __init__( | |
| openai_args: dict[str, str] | None = None, | ||
| *, | ||
| use_async: bool = False, | ||
| use_reasoning_summary: bool = False, | ||
| reasoning_effort: ReasoningEffort = "medium", | ||
| reasoning_summary: Literal["auto", "concise", "detailed"] | ||
|
taniokay marked this conversation as resolved.
|
||
| | None = "auto", | ||
| system_prompt: str | None = None, | ||
| extractor: Extractor | None = None, | ||
| ): | ||
|
|
@@ -473,6 +569,15 @@ def __init__( | |
| openai_args (Optional): dict of additional args to pass in to the | ||
| `client.chat.completions.create` function. | ||
| use_async (Optional): If True, the async client will be used. | ||
| use_reasoning_summary: Whether to use reasoning summary. | ||
| NOTE: Please make sure that the model and API version support | ||
| reasoning summary. | ||
| https://platform.openai.com/docs/models | ||
| https://learn.microsoft.com/en-us/azure/ai-foundry/openai/how-to/reasoning#api--feature-support | ||
| reasoning_effort: How many reasoning tokens to generate. | ||
| This is only used when `use_reasoning_summary` is True. | ||
| reasoning_summary: The level of detail of the summarizer. | ||
| This is only used when `use_reasoning_summary` is True. | ||
| system_prompt (Optional): The system prompt to use. If not provided, | ||
| no system prompt will be used. | ||
| extractor (Optional): The extractor to use. If not provided, the | ||
|
|
@@ -541,6 +646,13 @@ def __init__( | |
| self._openai_args = openai_args or {} | ||
| self._system_prompt = system_prompt | ||
|
|
||
| self._reasoning_effort: ReasoningEffort = ( | ||
| reasoning_effort if use_reasoning_summary else None | ||
| ) | ||
| self._reasoning_summary: ( | ||
| Literal["auto", "concise", "detailed"] | None | ||
| ) = reasoning_summary if use_reasoning_summary else None | ||
|
|
||
| if self._text_model_name is not None: | ||
| self._openai_args["model"] = self._text_model_name | ||
|
|
||
|
|
||
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's also bump the version in this PR!
https://langcheck.readthedocs.io/en/latest/contributing.html#publishing