Skip to content
3 changes: 3 additions & 0 deletions src/langcheck/metrics/eval_clients/_litellm.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import asyncio
import traceback
from typing import Any, Literal

import instructor
Expand Down Expand Up @@ -223,6 +224,7 @@ async def _gather():
f"Failed to return an assessment corresponding to {i}th prompt: "
f"{response}"
)
traceback.print_exception(response)
responses[i] = None
return responses

Expand Down Expand Up @@ -530,6 +532,7 @@ def _call_api_with_exception_filter(
f"Failed to return an assessment for the {i}th prompt: "
f"{response}"
)
traceback.print_exception(response)
responses[i] = None

assessments = [
Expand Down
128 changes: 120 additions & 8 deletions src/langcheck/metrics/eval_clients/_openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

import asyncio
import os
import traceback
import warnings
from typing import Any, Literal

import torch
from openai import AsyncAzureOpenAI, AsyncOpenAI, AzureOpenAI, OpenAI
from openai.types.create_embedding_response import CreateEmbeddingResponse
from openai.types.shared_params import Reasoning, ReasoningEffort
from pydantic import BaseModel

from langcheck.metrics.eval_clients.eval_response import (
Expand All @@ -30,6 +32,10 @@ def __init__(
openai_args: dict[str, str] | None = None,
*,
use_async: bool = False,
use_reasoning_summary: bool = False,
Comment thread
taniokay marked this conversation as resolved.
reasoning_effort: ReasoningEffort = "medium",
reasoning_summary: Literal["auto", "concise", "detailed"]
| None = "auto",
system_prompt: str | None = None,
extractor: Extractor | None = None,
):
Expand All @@ -44,6 +50,15 @@ def __init__(
`client.chat.completions.create` function.
use_async: If True, the async client will be used. Defaults to
False.
use_reasoning_summary: Whether to use reasoning summary.
NOTE: Please make sure that the model and API version support
reasoning summary.
https://platform.openai.com/docs/models
https://learn.microsoft.com/en-us/azure/ai-foundry/openai/how-to/reasoning#api--feature-support
reasoning_effort: How many reasoning tokens to generate.
This is only used when `use_reasoning_summary` is True.
reasoning_summary: The level of detail of the summarizer.
This is only used when `use_reasoning_summary` is True.
system_prompt (Optional): The system prompt to use. If not provided,
no system prompt will be used.
extractor (Optional): The extractor to use. If not provided, the
Expand Down Expand Up @@ -77,6 +92,13 @@ def __init__(
self._openai_args = openai_args
self._system_prompt = system_prompt

self._reasoning_effort: ReasoningEffort = (
reasoning_effort if use_reasoning_summary else None
)
self._reasoning_summary: (
Literal["auto", "concise", "detailed"] | None
) = reasoning_summary if use_reasoning_summary else None

if extractor is None:
self._extractor = OpenAIExtractor(
openai_client=self._client,
Expand All @@ -86,6 +108,41 @@ def __init__(
else:
self._extractor = extractor

def _dispatch(
    self,
    messages: list[dict[str, str]],
    seed: int | None = None,
    config: dict[str, str] | None = None,
) -> Any:
    """Dispatch a single request to the OpenAI API.

    The Chat Completions API is used unless a reasoning summary was
    requested (`self._reasoning_summary` is not None), in which case the
    Responses API is used instead. The call is returned as-is and is not
    awaited here.

    Args:
        messages: The chat messages to send.
        seed: The seed forwarded to the Chat Completions API. It is not
            forwarded when the Responses API is used.
        config (Optional): Extra keyword arguments forwarded to the
            underlying `create` call. If not provided, no extra arguments
            are passed.

    Returns:
        Whatever the selected `create` call on `self._client` returns.
    """
    # `config` defaults to None, and unpacking None with `**` raises a
    # TypeError, so fall back to an empty dict.
    extra_args = config or {}

    if self._reasoning_summary is None:
        return self._client.chat.completions.create(
            messages=messages,  # type: ignore
            seed=seed,
            **extra_args,
        )

    # To use reasoning summary, we must use the Responses API
    # instead of Chat Completions API.
    # https://platform.openai.com/docs/guides/reasoning#reasoning-summaries
    include = []

    reasoning: Reasoning = {
        "effort": self._reasoning_effort,
        "summary": self._reasoning_summary,
    }

    # seed and logprobs are not supported in responses API.
    # `store=False` asks the API not to keep the response (and thus the
    # prompts) for later retrieval.
    return self._client.responses.create(
        input=messages,  # type: ignore
        include=include,
        store=False,
        reasoning=reasoning,
        truncation="auto",
        **extra_args,
    )

def _call_api(
self,
prompts: list[str],
Expand All @@ -100,7 +157,11 @@ def _call_api_with_exception_filter(model_input: dict[str, Any]) -> Any:
if model_input is None:
return None
try:
return self._client.chat.completions.create(**model_input)
return self._dispatch(
model_input["messages"],
model_input["seed"],
config=config,
)
Comment thread
taniokay marked this conversation as resolved.
except Exception as e:
return e

Expand All @@ -114,7 +175,6 @@ def _call_api_with_exception_filter(model_input: dict[str, Any]) -> Any:
"messages": system_message
+ [{"role": "user", "content": prompt}],
"seed": i,
**config,
}
for i, prompt in enumerate(prompts)
]
Expand All @@ -124,8 +184,10 @@ def _call_api_with_exception_filter(model_input: dict[str, Any]) -> Any:
async def _call_async_api() -> list[Any]:
responses = await asyncio.gather(
*map(
lambda model_input: self._client.chat.completions.create(
**model_input
lambda model_input: self._dispatch(
model_input["messages"],
model_input["seed"],
config=config,
),
Comment thread
taniokay marked this conversation as resolved.
model_inputs,
),
Expand All @@ -150,6 +212,7 @@ async def _call_async_api() -> list[Any]:
"OpenAI failed to return an assessment corresponding to "
f"{i}th prompt: {response}"
)
traceback.print_exception(response)
responses[i] = None
return responses

Expand Down Expand Up @@ -185,11 +248,39 @@ def get_text_responses(
tqdm_description=tqdm_description,
system_prompt=self._system_prompt,
)
response_texts = [
response.choices[0].message.content if response else None
for response in responses
]

response_texts = []
for response in responses:
if not response:
response_texts.append(None)
continue
# Use the Responses API only when a reasoning summary is required.
# Otherwise, use the Chat Completions API.
if self._reasoning_summary is None:
content = response.choices[0].message.content
else:
content = None
summaries = []

for output in response.output:
if hasattr(output, "summary"):
if output.summary == []:
print(
"Reasoning summary is empty. "
"This may happen even if model supports reasoning summary."
)
continue

# Summary can be a list of summaries
summaries.extend([s.text for s in output.summary])
elif hasattr(output, "content"):
content = output.content[0].text

if content is not None and summaries:
summaries_str = "\n\n".join(summaries)
content += f"\n\n**Reasoning Summary:**\n\n{summaries_str}"

response_texts.append(content)
Comment thread
kennysong marked this conversation as resolved.
Comment thread
taniokay marked this conversation as resolved.
# Token usage is not supported in OpenAIEvalClient
# If you need token usage, please use LiteLLMEvalClient instead.
return ResponsesWithMetadata(response_texts, None)
Expand Down Expand Up @@ -425,6 +516,7 @@ def _call_api_with_exception_filter(
"OpenAI failed to return an assessment corresponding to "
f"{i}th prompt: {response}"
)
traceback.print_exception(response)
responses[i] = None

assessments = [
Expand Down Expand Up @@ -454,6 +546,10 @@ def __init__(
openai_args: dict[str, str] | None = None,
*,
use_async: bool = False,
use_reasoning_summary: bool = False,
reasoning_effort: ReasoningEffort = "medium",
reasoning_summary: Literal["auto", "concise", "detailed"]
Comment thread
taniokay marked this conversation as resolved.
| None = "auto",
system_prompt: str | None = None,
extractor: Extractor | None = None,
):
Expand All @@ -473,6 +569,15 @@ def __init__(
openai_args (Optional): dict of additional args to pass in to the
`client.chat.completions.create` function.
use_async (Optional): If True, the async client will be used.
use_reasoning_summary: Whether to use reasoning summary.
NOTE: Please make sure that the model and API version support
reasoning summary.
https://platform.openai.com/docs/models
https://learn.microsoft.com/en-us/azure/ai-foundry/openai/how-to/reasoning#api--feature-support
reasoning_effort: How many reasoning tokens to generate.
This is only used when `use_reasoning_summary` is True.
reasoning_summary: The level of detail of the summarizer.
This is only used when `use_reasoning_summary` is True.
system_prompt (Optional): The system prompt to use. If not provided,
no system prompt will be used.
extractor (Optional): The extractor to use. If not provided, the
Expand Down Expand Up @@ -541,6 +646,13 @@ def __init__(
self._openai_args = openai_args or {}
self._system_prompt = system_prompt

self._reasoning_effort: ReasoningEffort = (
reasoning_effort if use_reasoning_summary else None
)
self._reasoning_summary: (
Literal["auto", "concise", "detailed"] | None
) = reasoning_summary if use_reasoning_summary else None

if self._text_model_name is not None:
self._openai_args["model"] = self._text_model_name

Expand Down