Add OpenAI Chat Completions API #75

Merged · 7 commits · Jun 17, 2025
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -48,6 +48,7 @@ extra-dependencies = [
"pytest-asyncio",
"python-dotenv",
"tiktoken",
"openai",
]
[tool.hatch.envs.types.scripts]
check = "mypy --strict --install-types --non-interactive {args:src/cleanlab_tlm tests}"
@@ -59,6 +60,7 @@ extra-dependencies = [
"python-dotenv",
"pytest-asyncio",
"tiktoken",
"openai",
]

[tool.hatch.envs.hatch-test.env-vars]
1 change: 1 addition & 0 deletions src/cleanlab_tlm/internal/constants.py
@@ -4,6 +4,7 @@
# prepend constants with _ so that they don't show up in help.cleanlab.ai docs
_VALID_TLM_QUALITY_PRESETS: list[str] = ["best", "high", "medium", "low", "base"]
_VALID_TLM_QUALITY_PRESETS_RAG: list[str] = ["medium", "low", "base"]
_VALID_TLM_QUALITY_PRESETS_CHAT_COMPLETIONS: list[str] = ["medium", "low", "base"]
_DEFAULT_TLM_QUALITY_PRESET: TLMQualityPreset = "medium"
_DEFAULT_TLM_MAX_TOKENS: int = 512
_VALID_TLM_MODELS: list[str] = [
107 changes: 107 additions & 0 deletions src/cleanlab_tlm/utils/chat_completions.py
@@ -0,0 +1,107 @@
"""
Real-time evaluation of responses from OpenAI Chat Completions API.

If you are using OpenAI's Chat Completions API, this module allows you to incorporate TLM trust scoring without any change to your existing code.
It works for any OpenAI LLM model, as well as the many other non-OpenAI LLMs that are also usable via Chat Completions API (Gemini, DeepSeek, Llama, etc).
"""

from typing import TYPE_CHECKING, Any, Optional, cast

from cleanlab_tlm.internal.base import BaseTLM
from cleanlab_tlm.internal.constants import (
_DEFAULT_TLM_QUALITY_PRESET,
_VALID_TLM_QUALITY_PRESETS_CHAT_COMPLETIONS,
)
from cleanlab_tlm.internal.types import TLMQualityPreset
from cleanlab_tlm.tlm import TLM, TLMOptions, TLMScore
from cleanlab_tlm.utils.chat import form_prompt_string

if TYPE_CHECKING:
from openai.types.chat import ChatCompletion


class TLMChatCompletion(BaseTLM):
"""
Represents a Trustworthy Language Model (TLM) instance specifically designed for evaluating OpenAI Chat Completions responses.

This class provides a TLM wrapper that can be used to evaluate the quality and trustworthiness of responses from any OpenAI model
by passing in the inputs to OpenAI's Chat Completions API and the ChatCompletion response object.

Args:
quality_preset ({"base", "low", "medium"}, default = "medium"): an optional preset configuration to control
the quality of TLM trustworthiness scores vs. latency/costs.

api_key (str, optional): Cleanlab TLM API key. If not provided, will attempt to read from CLEANLAB_API_KEY environment variable.

options ([TLMOptions](#class-tlmoptions), optional): a typed dict of configurations you can optionally specify.
See detailed documentation under [TLMOptions](#class-tlmoptions).

timeout (float, optional): timeout (in seconds) to apply to each TLM evaluation.
"""

def __init__(
self,
quality_preset: TLMQualityPreset = _DEFAULT_TLM_QUALITY_PRESET,
*,
api_key: Optional[str] = None,
options: Optional[TLMOptions] = None,
timeout: Optional[float] = None,
):
"""
lazydocs: ignore
"""
super().__init__(
quality_preset=quality_preset,
valid_quality_presets=_VALID_TLM_QUALITY_PRESETS_CHAT_COMPLETIONS,
support_custom_eval_criteria=True,
api_key=api_key,
options=options,
timeout=timeout,
verbose=False,
)

self._tlm = TLM(
quality_preset=quality_preset,
api_key=api_key,
options=options,
timeout=timeout,
)

def score(
self,
*,
response: "ChatCompletion",
**openai_kwargs: Any,
) -> TLMScore:
"""Score the trustworthiness of an OpenAI ChatCompletion response.

Args:
response (ChatCompletion): The OpenAI ChatCompletion response object to evaluate
**openai_kwargs (Any): The original kwargs passed to OpenAI's create() method, must include 'messages'

Returns:
TLMScore: A dict containing the trustworthiness score and optional logs
"""
if (messages := openai_kwargs.get("messages")) is None:
raise ValueError("messages is a required OpenAI input argument.")
tools = openai_kwargs.get("tools", None)

prompt_text = form_prompt_string(messages, tools)

Review comment on lines +87 to +89 (Collaborator): [nit] cleaner to just pass `openai_kwargs.get("tools", None)` directly instead of creating an additional variable.

        response_text = _get_string_response(response)

Review comment (Member): can you save a ticket for a future PR: we should try to extract token-probs out of the response object if they are in there and include them in the score, like in TLMLite.

Review comment (Member): when the user does TLMChatCompletions.create(), the resulting scores should also include token-probs (can save for future PR too).

        return cast(TLMScore, self._tlm.get_trustworthiness_score(prompt_text, response_text))
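
To make the token-probs suggestion above concrete, a rough sketch of such a helper might look like the following (hypothetical, not part of this PR; it assumes the original request was made with logprobs=True so the response actually carries per-token log-probs):

# Hypothetical helper (not in this PR): extract per-token log-probs from the
# ChatCompletion response, when the original request set logprobs=True.
def _get_token_logprobs(response: "ChatCompletion") -> Optional[list[float]]:
    logprobs = response.choices[0].logprobs
    if logprobs is None or logprobs.content is None:
        return None  # request was not made with logprobs=True
    return [token_logprob.logprob for token_logprob in logprobs.content]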


def _get_string_response(response: "ChatCompletion") -> str:
    try:
        from openai.types.chat import ChatCompletion
    except ImportError:
        raise ImportError(
            "OpenAI is required to use the TLMChatCompletion class. Please install it with `pip install openai`."
        )

    if not isinstance(response, ChatCompletion):
        raise TypeError("The response is not an OpenAI ChatCompletion object.")
    if response.choices[0].message.content is None:

Review comment (Member): will tool call output / structured output be here too if OpenAI chose to return that? If not, can you save a ticket for a future PR that we should get those outputs and use the appropriate TLM scoring for them.

Review comment (Collaborator, Author): No, this code does not do that yet; I believe I can add rudimentary support for that pretty easily tho.

Review comment (Collaborator): @huiwengoh do you still plan to add the support in this PR or in a followup ticket?

Review comment (Collaborator, Author): Created ticket!

        raise ValueError("The OpenAI ChatCompletion object does not contain a message content.")
    return str(response.choices[0].message.content)
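
Regarding the tool call / structured output question in the thread above, a rudimentary fallback (left for the follow-up ticket, so this is only an illustrative sketch, not what the PR implements) could look roughly like this:

# Illustrative only (not in this PR): fall back to serializing tool calls when
# OpenAI returns them instead of plain message content, so TLM can still score text.
def _get_response_text_with_tool_calls(response: "ChatCompletion") -> str:
    message = response.choices[0].message
    if message.content is not None:
        return str(message.content)
    if message.tool_calls:
        return str(
            [{"name": tc.function.name, "arguments": tc.function.arguments} for tc in message.tool_calls]
        )
    raise ValueError("The OpenAI ChatCompletion object does not contain a message content.")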
155 changes: 155 additions & 0 deletions tests/test_chat_completions.py
@@ -0,0 +1,155 @@
import pytest
from openai.types.chat import (
    ChatCompletion,
    ChatCompletionMessage,
)
from openai.types.chat.chat_completion import Choice

from cleanlab_tlm.internal.types import TLMQualityPreset
from cleanlab_tlm.utils.chat_completions import TLMChatCompletion
from tests.conftest import make_text_unique
from tests.constants import TEST_PROMPT, TEST_RESPONSE
from tests.test_get_trustworthiness_score import is_trustworthiness_score_json_format

test_prompt = make_text_unique(TEST_PROMPT)
test_response = make_text_unique(TEST_RESPONSE)


@pytest.mark.parametrize(
    "quality_preset",
    ["base", "low", "medium"],
)
def test_tlm_chat_completion_score(quality_preset: TLMQualityPreset) -> None:
    tlm_chat = TLMChatCompletion(quality_preset=quality_preset)
    openai_kwargs = {
        "model": "gpt-4.1-mini",
        "messages": [{"role": "user", "content": test_prompt}],
    }
    response = ChatCompletion(
        id="test",
        choices=[
            Choice(
                index=0,
                message=ChatCompletionMessage(role="assistant", content=test_response),
                finish_reason="stop",
            )
        ],
        created=1234567890,
        model="test-model",
        object="chat.completion",
    )

    score = tlm_chat.score(response=response, **openai_kwargs)

    assert score is not None
    assert is_trustworthiness_score_json_format(score)


def test_tlm_chat_completion_score_with_options() -> None:
    tlm_chat = TLMChatCompletion(options={"log": ["explanation", "perplexity"]})
    openai_kwargs = {
        "model": "gpt-4.1-mini",
        "messages": [{"role": "user", "content": test_prompt}],
    }
    response = ChatCompletion(
        id="test",
        choices=[
            Choice(
                index=0,
                message=ChatCompletionMessage(role="assistant", content=test_response),
                finish_reason="stop",
            )
        ],
        created=1234567890,
        model="test-model",
        object="chat.completion",
    )

    score = tlm_chat.score(response=response, **openai_kwargs)

    assert score is not None
    assert is_trustworthiness_score_json_format(score)


def test_tlm_chat_completion_score_with_tools() -> None:
    tlm_chat = TLMChatCompletion()
    openai_kwargs = {
        "model": "gpt-4.1-mini",
        "messages": [{"role": "user", "content": test_prompt}],
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "search",
                    "description": "Search the web for information",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "query": {
                                "type": "string",
                                "description": "The search query",
                            }
                        },
                        "required": ["query"],
                    },
                },
            }
        ],
    }
    response = ChatCompletion(
        id="test",
        choices=[
            Choice(
                index=0,
                message=ChatCompletionMessage(role="assistant", content=test_response),
                finish_reason="stop",
            )
        ],
        created=1234567890,
        model="test-model",
        object="chat.completion",
    )

    score = tlm_chat.score(response=response, **openai_kwargs)

    assert score is not None
    assert is_trustworthiness_score_json_format(score)


def test_tlm_chat_completion_score_invalid_response() -> None:
    tlm_chat = TLMChatCompletion()
    openai_kwargs = {
        "model": "gpt-4.1-mini",
        "messages": [{"role": "user", "content": test_prompt}],
    }
    invalid_response = {"invalid": "response"}

    with pytest.raises(TypeError, match="The response is not an OpenAI ChatCompletion object."):
        tlm_chat.score(response=invalid_response, **openai_kwargs)  # type: ignore


def test_tlm_chat_completion_score_missing_messages() -> None:
    tlm_chat = TLMChatCompletion()
    openai_kwargs = {
        "model": "gpt-4.1-mini",
        "messages": [{"role": "user", "content": test_prompt}],
    }
    response = ChatCompletion(
        id="test",
        choices=[
            Choice(
                index=0,
                message=ChatCompletionMessage(role="assistant", content=None),
                finish_reason="stop",
            )
        ],
        created=1234567890,
        model="test-model",
        object="chat.completion",
    )

    with pytest.raises(
        ValueError,
        match="The OpenAI ChatCompletion object does not contain a message content.",
    ):
        tlm_chat.score(response=response, **openai_kwargs)
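
For reference, end-to-end usage of the new module might look roughly like the sketch below (the model name and prompt are illustrative, and it assumes OPENAI_API_KEY and CLEANLAB_API_KEY are set in the environment):

from openai import OpenAI

from cleanlab_tlm.utils.chat_completions import TLMChatCompletion

client = OpenAI()  # reads OPENAI_API_KEY
tlm_chat = TLMChatCompletion(quality_preset="medium")  # reads CLEANLAB_API_KEY

# The same kwargs are passed to both OpenAI's create() and TLMChatCompletion.score()
openai_kwargs = {
    "model": "gpt-4.1-mini",  # illustrative model name
    "messages": [{"role": "user", "content": "What is the capital of France?"}],
}
response = client.chat.completions.create(**openai_kwargs)

# Score trustworthiness without changing the OpenAI call itself;
# the returned dict contains the trustworthiness score and optional logs
score = tlm_chat.score(response=response, **openai_kwargs)
print(score["trustworthiness_score"])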