Add OpenAI Chat Completions API #75

Merged · 7 commits · Jun 17, 2025
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -48,6 +48,7 @@ extra-dependencies = [
"pytest-asyncio",
"python-dotenv",
"tiktoken",
"openai",
]
[tool.hatch.envs.types.scripts]
check = "mypy --strict --install-types --non-interactive {args:src/cleanlab_tlm tests}"
@@ -59,6 +60,7 @@ extra-dependencies = [
"python-dotenv",
"pytest-asyncio",
"tiktoken",
"openai",
]

[tool.hatch.envs.hatch-test.env-vars]
1 change: 1 addition & 0 deletions src/cleanlab_tlm/internal/constants.py
@@ -4,6 +4,7 @@
# prepend constants with _ so that they don't show up in help.cleanlab.ai docs
_VALID_TLM_QUALITY_PRESETS: list[str] = ["best", "high", "medium", "low", "base"]
_VALID_TLM_QUALITY_PRESETS_RAG: list[str] = ["medium", "low", "base"]
_VALID_TLM_QUALITY_PRESETS_CHAT_COMPLETIONS: list[str] = ["medium", "low", "base"]
_DEFAULT_TLM_QUALITY_PRESET: TLMQualityPreset = "medium"
_DEFAULT_TLM_MAX_TOKENS: int = 512
_VALID_TLM_MODELS: list[str] = [
107 changes: 107 additions & 0 deletions src/cleanlab_tlm/utils/chat_completions.py
@@ -0,0 +1,107 @@
"""
Real-time evaluation of responses from OpenAI Chat Completions API.

If you are using OpenAI's Chat Completions API, this module allows you to incorporate TLM trust scoring without any change to your existing code.
It works for any OpenAI LLM model, as well as the many other non-OpenAI LLMs that are also usable via Chat Completions API (Gemini, DeepSeek, Llama, etc).
"""

from typing import TYPE_CHECKING, Any, Optional, cast

from cleanlab_tlm.internal.base import BaseTLM
from cleanlab_tlm.internal.constants import (
_DEFAULT_TLM_QUALITY_PRESET,
_VALID_TLM_QUALITY_PRESETS_CHAT_COMPLETIONS,
)
from cleanlab_tlm.internal.types import TLMQualityPreset
from cleanlab_tlm.tlm import TLM, TLMOptions, TLMScore
from cleanlab_tlm.utils.chat import form_prompt_string

if TYPE_CHECKING:
from openai.types.chat import ChatCompletion


class TLMChatCompletion(BaseTLM):
"""
Represents a Trustworthy Language Model (TLM) instance specifically designed for evaluating OpenAI Chat Completions responses.

This class provides a TLM wrapper that can be used to evaluate the quality and trustworthiness of responses from any OpenAI model
by passing in the inputs to OpenAI's Chat Completions API and the ChatCompletion response object.

Args:
quality_preset ({"base", "low", "medium"}, default = "medium"): an optional preset configuration to control
the quality of TLM trustworthiness scores vs. latency/costs.

api_key (str, optional): Cleanlab TLM API key. If not provided, will attempt to read from CLEANLAB_API_KEY environment variable.

options ([TLMOptions](#class-tlmoptions), optional): a typed dict of configurations you can optionally specify.
See detailed documentation under [TLMOptions](#class-tlmoptions).

timeout (float, optional): timeout (in seconds) to apply to each TLM evaluation.
"""

def __init__(
self,
quality_preset: TLMQualityPreset = _DEFAULT_TLM_QUALITY_PRESET,
*,
api_key: Optional[str] = None,
options: Optional[TLMOptions] = None,
timeout: Optional[float] = None,
):
"""
lazydocs: ignore
"""
super().__init__(
quality_preset=quality_preset,
valid_quality_presets=_VALID_TLM_QUALITY_PRESETS_CHAT_COMPLETIONS,
support_custom_eval_criteria=True,
api_key=api_key,
options=options,
timeout=timeout,
verbose=False,
)

self._tlm = TLM(
quality_preset=quality_preset,
api_key=api_key,
options=options,
timeout=timeout,
)

def score(
self,
*,
response: "ChatCompletion",
**openai_kwargs: Any,
) -> TLMScore:
"""Score the trustworthiness of an OpenAI ChatCompletion response.

Args:
response (ChatCompletion): The OpenAI ChatCompletion response object to evaluate
**openai_kwargs (Any): The original kwargs passed to OpenAI's create() method, must include 'messages'

Returns:
TLMScore: A dict containing the trustworthiness score and optional logs
"""
if (messages := openai_kwargs.get("messages")) is None:
raise ValueError("messages is a required OpenAI input argument.")
tools = openai_kwargs.get("tools", None)

prompt_text = form_prompt_string(messages, tools)

Review comment on lines +87 to +89 (Collaborator): [nit] cleaner to just pass `openai_kwargs.get("tools", None)` directly instead of creating an additional variable.

        response_text = _get_string_response(response)

Review comment (Member): can you save a ticket for a future PR: we should try to extract token-probs out of the response object if they are in there and include them in the score, like in TLMLite.

Review comment (Member): when the user does TLMChatCompletions.create(), the resulting scores should also include token-probs (can save for future PR too).

        return cast(TLMScore, self._tlm.get_trustworthiness_score(prompt_text, response_text))
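
To make the token-probs suggestion above concrete, a rough sketch of such a helper might look like the following (hypothetical, not part of this PR; it assumes the original request was made with logprobs=True so the response actually carries per-token log-probs):

# Hypothetical helper (not in this PR): extract per-token log-probs from the
# ChatCompletion response, when the original request set logprobs=True.
def _get_token_logprobs(response: "ChatCompletion") -> Optional[list[float]]:
    logprobs = response.choices[0].logprobs
    if logprobs is None or logprobs.content is None:
        return None  # request was not made with logprobs=True
    return [token_logprob.logprob for token_logprob in logprobs.content]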


def _get_string_response(response: "ChatCompletion") -> str:
    try:
        from openai.types.chat import ChatCompletion
    except ImportError:
        raise ImportError(
            "OpenAI is required to use the TLMChatCompletion class. Please install it with `pip install openai`."
        )

    if not isinstance(response, ChatCompletion):
        raise TypeError("The response is not an OpenAI ChatCompletion object.")
    if response.choices[0].message.content is None:

Review comment (Member): will tool call output / structured output be here too if OpenAI chose to return that? If not, can you save a ticket for a future PR that we should get those outputs and use the appropriate TLM scoring for them.

Review comment (Collaborator, Author): No, this code does not do that yet; I believe I can add rudimentary support for that pretty easily tho.

Review comment (Collaborator): @huiwengoh do you still plan to add the support in this PR or in a followup ticket?

Review comment (Collaborator, Author): Created ticket!

        raise ValueError("The OpenAI ChatCompletion object does not contain a message content.")
    return str(response.choices[0].message.content)
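
Regarding the tool call / structured output question in the thread above, a rudimentary fallback (left for the follow-up ticket, so this is only an illustrative sketch, not what the PR implements) could look roughly like this:

# Illustrative only (not in this PR): fall back to serializing tool calls when
# OpenAI returns them instead of plain message content, so TLM can still score text.
def _get_response_text_with_tool_calls(response: "ChatCompletion") -> str:
    message = response.choices[0].message
    if message.content is not None:
        return str(message.content)
    if message.tool_calls:
        return str(
            [{"name": tc.function.name, "arguments": tc.function.arguments} for tc in message.tool_calls]
        )
    raise ValueError("The OpenAI ChatCompletion object does not contain a message content.")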
155 changes: 155 additions & 0 deletions tests/test_chat_completions.py
@@ -0,0 +1,155 @@
import pytest
from openai.types.chat import (
    ChatCompletion,
    ChatCompletionMessage,
)
from openai.types.chat.chat_completion import Choice

from cleanlab_tlm.internal.types import TLMQualityPreset
from cleanlab_tlm.utils.chat_completions import TLMChatCompletion
from tests.conftest import make_text_unique
from tests.constants import TEST_PROMPT, TEST_RESPONSE
from tests.test_get_trustworthiness_score import is_trustworthiness_score_json_format

test_prompt = make_text_unique(TEST_PROMPT)
test_response = make_text_unique(TEST_RESPONSE)


@pytest.mark.parametrize(
    "quality_preset",
    ["base", "low", "medium"],
)
def test_tlm_chat_completion_score(quality_preset: TLMQualityPreset) -> None:
    tlm_chat = TLMChatCompletion(quality_preset=quality_preset)
    openai_kwargs = {
        "model": "gpt-4.1-mini",
        "messages": [{"role": "user", "content": test_prompt}],
    }
    response = ChatCompletion(
        id="test",
        choices=[
            Choice(
                index=0,
                message=ChatCompletionMessage(role="assistant", content=test_response),
                finish_reason="stop",
            )
        ],
        created=1234567890,
        model="test-model",
        object="chat.completion",
    )

    score = tlm_chat.score(response=response, **openai_kwargs)

    assert score is not None
    assert is_trustworthiness_score_json_format(score)


def test_tlm_chat_completion_score_with_options() -> None:
    tlm_chat = TLMChatCompletion(options={"log": ["explanation", "perplexity"]})
    openai_kwargs = {
        "model": "gpt-4.1-mini",
        "messages": [{"role": "user", "content": test_prompt}],
    }
    response = ChatCompletion(
        id="test",
        choices=[
            Choice(
                index=0,
                message=ChatCompletionMessage(role="assistant", content=test_response),
                finish_reason="stop",
            )
        ],
        created=1234567890,
        model="test-model",
        object="chat.completion",
    )

    score = tlm_chat.score(response=response, **openai_kwargs)

    assert score is not None
    assert is_trustworthiness_score_json_format(score)


def test_tlm_chat_completion_score_with_tools() -> None:
    tlm_chat = TLMChatCompletion()
    openai_kwargs = {
        "model": "gpt-4.1-mini",
        "messages": [{"role": "user", "content": test_prompt}],
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "search",
                    "description": "Search the web for information",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "query": {
                                "type": "string",
                                "description": "The search query",
                            }
                        },
                        "required": ["query"],
                    },
                },
            }
        ],
    }
    response = ChatCompletion(
        id="test",
        choices=[
            Choice(
                index=0,
                message=ChatCompletionMessage(role="assistant", content=test_response),
                finish_reason="stop",
            )
        ],
        created=1234567890,
        model="test-model",
        object="chat.completion",
    )

    score = tlm_chat.score(response=response, **openai_kwargs)

    assert score is not None
    assert is_trustworthiness_score_json_format(score)


def test_tlm_chat_completion_score_invalid_response() -> None:
    tlm_chat = TLMChatCompletion()
    openai_kwargs = {
        "model": "gpt-4.1-mini",
        "messages": [{"role": "user", "content": test_prompt}],
    }
    invalid_response = {"invalid": "response"}

    with pytest.raises(TypeError, match="The response is not an OpenAI ChatCompletion object."):
        tlm_chat.score(response=invalid_response, **openai_kwargs)  # type: ignore


def test_tlm_chat_completion_score_missing_messages() -> None:
    tlm_chat = TLMChatCompletion()
    openai_kwargs = {
        "model": "gpt-4.1-mini",
        "messages": [{"role": "user", "content": test_prompt}],
    }
    response = ChatCompletion(
        id="test",
        choices=[
            Choice(
                index=0,
                message=ChatCompletionMessage(role="assistant", content=None),
                finish_reason="stop",
            )
        ],
        created=1234567890,
        model="test-model",
        object="chat.completion",
    )

    with pytest.raises(
        ValueError,
        match="The OpenAI ChatCompletion object does not contain a message content.",
    ):
        tlm_chat.score(response=response, **openai_kwargs)
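
For reference, end-to-end usage of the new module might look roughly like the sketch below (the model name and prompt are illustrative, and it assumes OPENAI_API_KEY and CLEANLAB_API_KEY are set in the environment):

from openai import OpenAI

from cleanlab_tlm.utils.chat_completions import TLMChatCompletion

client = OpenAI()  # reads OPENAI_API_KEY
tlm_chat = TLMChatCompletion(quality_preset="medium")  # reads CLEANLAB_API_KEY

# The same kwargs are passed to both OpenAI's create() and TLMChatCompletion.score()
openai_kwargs = {
    "model": "gpt-4.1-mini",  # illustrative model name
    "messages": [{"role": "user", "content": "What is the capital of France?"}],
}
response = client.chat.completions.create(**openai_kwargs)

# Score trustworthiness without changing the OpenAI call itself;
# the returned dict contains the trustworthiness score and optional logs
score = tlm_chat.score(response=response, **openai_kwargs)
print(score["trustworthiness_score"])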