2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "langcheck"
version = "0.10.0.dev11"
version = "0.10.0.dev12"
description = "Simple, Pythonic building blocks to evaluate LLM-based applications"
readme = "README.md"
authors = [{ name = "Citadel AI", email = "[email protected]" }]
2 changes: 1 addition & 1 deletion src/langcheck/__init__.py
@@ -1,4 +1,4 @@
from langcheck import augment, metrics, plot, utils

__all__ = ["augment", "metrics", "plot", "utils"]
__version__ = "0.10.0.dev11"
__version__ = "0.10.0.dev12"
3 changes: 3 additions & 0 deletions src/langcheck/metrics/eval_clients/_litellm.py
@@ -1,6 +1,7 @@
from __future__ import annotations

import asyncio
import traceback
from typing import Any, Literal

import instructor
@@ -223,6 +224,7 @@ async def _gather():
f"Failed to return an assessment corresponding to {i}th prompt: "
f"{response}"
)
traceback.print_exception(response)
responses[i] = None
return responses

@@ -530,6 +532,7 @@ def _call_api_with_exception_filter(
f"Failed to return an assessment for the {i}th prompt: "
f"{response}"
)
traceback.print_exception(response)
responses[i] = None

assessments = [
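The `traceback.print_exception(response)` calls added in both hunks rely on Python 3.10+, where `traceback.print_exception` accepts a bare exception instance. A minimal runnable sketch of the same pattern, with a hypothetical `_query` standing in for the real API call:

import asyncio
import traceback


async def _query(i: int) -> str:
    # Hypothetical stand-in for the real eval API call.
    if i == 1:
        raise RuntimeError("rate limited")
    return f"response {i}"


async def _gather() -> list:
    # return_exceptions=True keeps one failed call from cancelling the rest.
    responses = await asyncio.gather(
        *(_query(i) for i in range(3)), return_exceptions=True
    )
    for i, response in enumerate(responses):
        if isinstance(response, Exception):
            print(f"Failed to return an assessment for prompt {i}: {response}")
            # Python 3.10+: prints the exception's full traceback to stderr.
            traceback.print_exception(response)
            responses[i] = None
    return responses


print(asyncio.run(_gather()))  # ['response 0', None, 'response 2']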
135 changes: 126 additions & 9 deletions src/langcheck/metrics/eval_clients/_openai.py
@@ -2,12 +2,14 @@


import asyncio
import os
import traceback
import warnings
from typing import Any, Literal

import torch
from openai import AsyncAzureOpenAI, AsyncOpenAI, AzureOpenAI, OpenAI
from openai.types.create_embedding_response import CreateEmbeddingResponse
from openai.types.shared_params import Reasoning, ReasoningEffort
from pydantic import BaseModel

from langcheck.metrics.eval_clients.eval_response import (
@@ -30,6 +32,10 @@ def __init__(
openai_args: dict[str, str] | None = None,
*,
use_async: bool = False,
use_reasoning_summary: bool = False,
reasoning_effort: ReasoningEffort = "medium",
reasoning_summary: Literal["auto", "concise", "detailed"]
| None = "auto",
system_prompt: str | None = None,
extractor: Extractor | None = None,
):
@@ -44,6 +50,15 @@ def __init__(
`client.chat.completions.create` function.
use_async: If True, the async client will be used. Defaults to
False.
use_reasoning_summary: Whether to request a reasoning summary with
each response.
NOTE: Please make sure that the model and API version support
reasoning summaries.
https://platform.openai.com/docs/models
https://learn.microsoft.com/en-us/azure/ai-foundry/openai/how-to/reasoning#api--feature-support
reasoning_effort: How much reasoning the model should do, i.e. how
many reasoning tokens to generate. Only used when
`use_reasoning_summary` is True.
reasoning_summary: The level of detail of the generated reasoning
summary. Only used when `use_reasoning_summary` is True.
system_prompt (Optional): The system prompt to use. If not provided,
no system prompt will be used.
extractor (Optional): The extractor to use. If not provided, the
@@ -77,6 +92,13 @@ def __init__(
self._openai_args = openai_args
self._system_prompt = system_prompt

self._reasoning_effort: ReasoningEffort = (
reasoning_effort if use_reasoning_summary else None
)
self._reasoning_summary: (
Literal["auto", "concise", "detailed"] | None
) = reasoning_summary if use_reasoning_summary else None

if extractor is None:
self._extractor = OpenAIExtractor(
openai_client=self._client,
@@ -86,6 +108,38 @@
else:
self._extractor = extractor

def _dispatch(
self,
messages: list[dict[str, str]],
seed: int | None = None,
config: dict[str, Any] | None = None,
) -> Any:
"""Dispatch the API call to the Chat Completions or Responses API."""
config = config or {}  # guard against the None default before **config
if self._reasoning_summary is None:
return self._client.chat.completions.create(
messages=messages, # type: ignore
seed=seed,
**config,
)
else:
# To use reasoning summaries, we must use the Responses API
# instead of the Chat Completions API.
# https://platform.openai.com/docs/guides/reasoning#reasoning-summaries

reasoning: Reasoning = {
"effort": self._reasoning_effort,
"summary": self._reasoning_summary,
}

# seed and logprobs are not supported in the Responses API.
return self._client.responses.create(
# Contributor: "Ah nice, I was wondering if we set this properly to avoid logging prompts"
input=messages, # type: ignore
store=False,
reasoning=reasoning,
truncation="auto",
**config,
)

def _call_api(
self,
prompts: list[str],
@@ -100,7 +154,11 @@ def _call_api_with_exception_filter(model_input: dict[str, Any]) -> Any:
if model_input is None:
return None
try:
return self._client.chat.completions.create(**model_input)
return self._dispatch(
model_input["messages"],
model_input["seed"],
config=config,
)
except Exception as e:
return e

@@ -114,7 +172,6 @@ def _call_api_with_exception_filter(model_input: dict[str, Any]) -> Any:
"messages": system_message
+ [{"role": "user", "content": prompt}],
"seed": i,
**config,
}
for i, prompt in enumerate(prompts)
]
@@ -124,8 +181,10 @@ def _call_api_with_exception_filter(model_input: dict[str, Any]) -> Any:
async def _call_async_api() -> list[Any]:
responses = await asyncio.gather(
*map(
lambda model_input: self._client.chat.completions.create(
**model_input
lambda model_input: self._dispatch(
model_input["messages"],
model_input["seed"],
config=config,
),
model_inputs,
),
@@ -150,6 +209,7 @@ async def _call_async_api() -> list[Any]:
"OpenAI failed to return an assessment corresponding to "
f"{i}th prompt: {response}"
)
traceback.print_exception(response)
responses[i] = None
return responses

@@ -185,11 +245,39 @@ def get_text_responses(
tqdm_description=tqdm_description,
system_prompt=self._system_prompt,
)
response_texts = [
response.choices[0].message.content if response else None
for response in responses
]

response_texts = []
for response in responses:
if not response:
response_texts.append(None)
continue
# The Responses API (used when a reasoning summary is requested)
# returns a different shape than the Chat Completions API.
if self._reasoning_summary is None:
content = response.choices[0].message.content
else:
content = None
summaries = []

for output in response.output:
if hasattr(output, "summary"):
if output.summary == []:
print(
"Reasoning summary is empty. This may happen "
"even if the model supports reasoning summaries."
)
continue

# `summary` can contain multiple summary parts
summaries.extend([s.text for s in output.summary])
elif hasattr(output, "content"):
content = output.content[0].text

if content is not None and summaries:
summaries_str = "\n\n".join(summaries)
content += f"\n\n**Reasoning Summary:**\n\n{summaries_str}"

response_texts.append(content)
# Token usage is not supported in OpenAIEvalClient
# If you need token usage, please use LiteLLMEvalClient instead.
return ResponsesWithMetadata(response_texts, None)
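For reference, the loop above distinguishes Responses API output items by attribute: reasoning items carry a `summary` list (of parts with a `.text` field), while message items carry a `content` list. A self-contained sketch of that duck-typing, with stand-in dataclasses mirroring (not importing) the openai-python types:

from dataclasses import dataclass, field


@dataclass
class _SummaryPart:
    text: str


@dataclass
class _ReasoningItem:  # plays the role of ResponseReasoningItem
    summary: list = field(default_factory=list)


@dataclass
class _TextPart:
    text: str


@dataclass
class _MessageItem:  # plays the role of ResponseOutputMessage
    content: list = field(default_factory=list)


output = [
    _ReasoningItem(summary=[_SummaryPart("Compared both answers step by step.")]),
    _MessageItem(content=[_TextPart("The answers agree.")]),
]

summaries, content = [], None
for item in output:
    if hasattr(item, "summary"):
        summaries.extend(part.text for part in item.summary)
    elif hasattr(item, "content"):
        content = item.content[0].text

print(content)    # The answers agree.
print(summaries)  # ['Compared both answers step by step.']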
@@ -204,6 +292,7 @@ def get_text_responses_with_log_likelihood(
"""The function that gets responses with log likelihood to the given
prompt texts. Each concrete subclass needs to define the concrete
implementation of this function to enable text scoring.
This is not available for reasoning models.

NOTE: Please make sure that the model you use supports logprobs. In
Azure OpenAI, the API version 2024-06-01 is the earliest GA version that
@@ -218,8 +307,15 @@ def get_text_responses_with_log_likelihood(
output text and the list of tuples of the output tokens and the log
probabilities. The responses can be None if the evaluation fails.
"""
config = {"model": "gpt-4o-mini", "logprobs": True}
if self._reasoning_summary is not None:
raise ValueError(
"Log likelihood is not supported with reasoning models."
)

config: dict[str, Any] = {"model": "gpt-4o-mini"}

if top_logprobs:
config["logprobs"] = True
config["top_logprobs"] = top_logprobs
config.update(self._openai_args or {})
tqdm_description = tqdm_description or "Getting log likelihoods"
@@ -425,6 +521,7 @@ def _call_api_with_exception_filter(
"OpenAI failed to return an assessment corresponding to "
f"{i}th prompt: {response}"
)
traceback.print_exception(response)
responses[i] = None

assessments = [
@@ -454,6 +551,10 @@ def __init__(
openai_args: dict[str, str] | None = None,
*,
use_async: bool = False,
use_reasoning_summary: bool = False,
reasoning_effort: ReasoningEffort = "medium",
reasoning_summary: Literal["auto", "concise", "detailed"]
| None = "auto",
system_prompt: str | None = None,
extractor: Extractor | None = None,
):
@@ -473,6 +574,15 @@ def __init__(
openai_args (Optional): dict of additional args to pass in to the
`client.chat.completions.create` function.
use_async (Optional): If True, the async client will be used.
use_reasoning_summary: Whether to request a reasoning summary with
each response.
NOTE: Please make sure that the model and API version support
reasoning summaries.
https://platform.openai.com/docs/models
https://learn.microsoft.com/en-us/azure/ai-foundry/openai/how-to/reasoning#api--feature-support
reasoning_effort: How much reasoning the model should do, i.e. how
many reasoning tokens to generate. Only used when
`use_reasoning_summary` is True.
reasoning_summary: The level of detail of the generated reasoning
summary. Only used when `use_reasoning_summary` is True.
system_prompt (Optional): The system prompt to use. If not provided,
no system prompt will be used.
extractor (Optional): The extractor to use. If not provided, the
@@ -541,6 +651,13 @@ def __init__(
self._openai_args = openai_args or {}
self._system_prompt = system_prompt

self._reasoning_effort: ReasoningEffort = (
reasoning_effort if use_reasoning_summary else None
)
self._reasoning_summary: (
Literal["auto", "concise", "detailed"] | None
) = reasoning_summary if use_reasoning_summary else None

if self._text_model_name is not None:
self._openai_args["model"] = self._text_model_name

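Taken together, a hedged usage sketch of the new constructor flags (the class name and parameters come from this diff; the import path and model choice are assumptions, not verified against the released package):

from langcheck.metrics.eval_clients import OpenAIEvalClient

# use_reasoning_summary=True routes calls through the Responses API
# (via _dispatch) instead of the Chat Completions API.
client = OpenAIEvalClient(
    openai_args={"model": "o4-mini"},  # illustrative reasoning-capable model
    use_reasoning_summary=True,
    reasoning_effort="medium",
    reasoning_summary="auto",
)

result = client.get_text_responses(["Is the sky blue? Answer briefly."])
# Returns ResponsesWithMetadata; each response text ends with a
# "**Reasoning Summary:**" section when a summary was produced.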