Add OpenAI Chat Completions API #75
@@ -0,0 +1,107 @@
"""
Real-time evaluation of responses from OpenAI Chat Completions API.

If you are using OpenAI's Chat Completions API, this module allows you to incorporate TLM trust scoring without any change to your existing code.
It works for any OpenAI LLM model, as well as the many other non-OpenAI LLMs that are also usable via the Chat Completions API (Gemini, DeepSeek, Llama, etc.).
"""

from typing import TYPE_CHECKING, Any, Optional, cast

from cleanlab_tlm.internal.base import BaseTLM
from cleanlab_tlm.internal.constants import (
    _DEFAULT_TLM_QUALITY_PRESET,
    _VALID_TLM_QUALITY_PRESETS_CHAT_COMPLETIONS,
)
from cleanlab_tlm.internal.types import TLMQualityPreset
from cleanlab_tlm.tlm import TLM, TLMOptions, TLMScore
from cleanlab_tlm.utils.chat import form_prompt_string

if TYPE_CHECKING:
    from openai.types.chat import ChatCompletion

class TLMChatCompletion(BaseTLM):
    """
    Represents a Trustworthy Language Model (TLM) instance specifically designed for evaluating OpenAI Chat Completions responses.

    This class provides a TLM wrapper that can be used to evaluate the quality and trustworthiness of responses from any OpenAI model
    by passing in the inputs to OpenAI's Chat Completions API and the ChatCompletion response object.

    Args:
        quality_preset ({"base", "low", "medium"}, default = "medium"): an optional preset configuration to control
            the quality of TLM trustworthiness scores vs. latency/costs.

        api_key (str, optional): Cleanlab TLM API key. If not provided, will attempt to read from CLEANLAB_API_KEY environment variable.

        options ([TLMOptions](#class-tlmoptions), optional): a typed dict of configurations you can optionally specify.
            See detailed documentation under [TLMOptions](#class-tlmoptions).

        timeout (float, optional): timeout (in seconds) to apply to each TLM evaluation.
    """

    def __init__(
        self,
        quality_preset: TLMQualityPreset = _DEFAULT_TLM_QUALITY_PRESET,
        *,
        api_key: Optional[str] = None,
        options: Optional[TLMOptions] = None,
        timeout: Optional[float] = None,
    ):
        """
        lazydocs: ignore
        """
        super().__init__(
            quality_preset=quality_preset,
            valid_quality_presets=_VALID_TLM_QUALITY_PRESETS_CHAT_COMPLETIONS,
            support_custom_eval_criteria=True,
            api_key=api_key,
            options=options,
            timeout=timeout,
            verbose=False,
        )

        self._tlm = TLM(
            quality_preset=quality_preset,
            api_key=api_key,
            options=options,
            timeout=timeout,
        )

    def score(
        self,
        *,
        response: "ChatCompletion",
        **openai_kwargs: Any,
    ) -> TLMScore:
        """Score the trustworthiness of an OpenAI ChatCompletion response.

        Args:
            response (ChatCompletion): The OpenAI ChatCompletion response object to evaluate
            **openai_kwargs (Any): The original kwargs passed to OpenAI's create() method, must include 'messages'

        Returns:
            TLMScore: A dict containing the trustworthiness score and optional logs
        """
        if (messages := openai_kwargs.get("messages")) is None:
            raise ValueError("messages is a required OpenAI input argument.")
        tools = openai_kwargs.get("tools", None)

        prompt_text = form_prompt_string(messages, tools)
        response_text = _get_string_response(response)

        return cast(TLMScore, self._tlm.get_trustworthiness_score(prompt_text, response_text))

Review comment on lines +87 to +89: [nit] cleaner to just pass

Review comment (on the response_text line): can you save a ticket for future PR: we should try to extract token-probs out of

Review comment: when the user does: TLMChatCompletions.create() the resulting scores should also be including token-probs (can save for future PR too)
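Not part of this diff: a rough sketch of what the reviewer's token-prob suggestion might look like, assuming the original request was made with logprobs=True; the helper name is hypothetical.

import math
from openai.types.chat import ChatCompletion

def _get_token_probs(response: ChatCompletion) -> list[float]:
    # Hypothetical helper (not in this PR): per-token probabilities from a
    # ChatCompletion, available only when the request was created with logprobs=True.
    logprobs = response.choices[0].logprobs
    if logprobs is None or logprobs.content is None:
        return []
    return [math.exp(token.logprob) for token in logprobs.content]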

def _get_string_response(response: "ChatCompletion") -> str:
    try:
        from openai.types.chat import ChatCompletion
    except ImportError:
        raise ImportError(
            "OpenAI is required to use the TLMChatCompletion class. Please install it with `pip install openai`."
        )

    if not isinstance(response, ChatCompletion):
        raise TypeError("The response is not an OpenAI ChatCompletion object.")
    if response.choices[0].message.content is None:
        raise ValueError("The OpenAI ChatCompletion object does not contain a message content.")
    return str(response.choices[0].message.content)

Review comment (on the message content check): will tool call output / structured output be here too if OpenAI chose to return that? if not, can you save a ticket for future PR that we should get those outputs and use the appropriate TLM scoring for them
Reply: No this code does not do that yet, I believe I can add rudimentary support for that pretty easily tho
Reply: @huiwengoh do you still plan to add the support in this PR or in a followup ticket?
Reply: Created ticket!
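For orientation, here is a minimal usage sketch of the new class, based on the module and score() docstrings above. The model name, prompt, and API-key setup are illustrative assumptions, not part of this PR.

from openai import OpenAI
from cleanlab_tlm.utils.chat_completions import TLMChatCompletion

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment
tlm = TLMChatCompletion(quality_preset="medium")  # assumes CLEANLAB_API_KEY is set

openai_kwargs = {
    "model": "gpt-4.1-mini",  # any model served via the Chat Completions API
    "messages": [{"role": "user", "content": "What is the capital of France?"}],
}
response = client.chat.completions.create(**openai_kwargs)

# Per the score() docstring, forward the same kwargs used for create() ('messages' is
# required) along with the ChatCompletion object; the result is a TLMScore dict with
# the trustworthiness score and optional logs.
score = tlm.score(response=response, **openai_kwargs)
print(score)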
@@ -0,0 +1,155 @@
import pytest
from openai.types.chat import (
    ChatCompletion,
    ChatCompletionMessage,
)
from openai.types.chat.chat_completion import Choice

from cleanlab_tlm.internal.types import TLMQualityPreset
from cleanlab_tlm.utils.chat_completions import TLMChatCompletion
from tests.conftest import make_text_unique
from tests.constants import TEST_PROMPT, TEST_RESPONSE
from tests.test_get_trustworthiness_score import is_trustworthiness_score_json_format

test_prompt = make_text_unique(TEST_PROMPT)
test_response = make_text_unique(TEST_RESPONSE)

@pytest.mark.parametrize(
    "quality_preset",
    ["base", "low", "medium"],
)
def test_tlm_chat_completion_score(quality_preset: TLMQualityPreset) -> None:
    tlm_chat = TLMChatCompletion(quality_preset=quality_preset)
    openai_kwargs = {
        "model": "gpt-4.1-mini",
        "messages": [{"role": "user", "content": test_prompt}],
    }
    response = ChatCompletion(
        id="test",
        choices=[
            Choice(
                index=0,
                message=ChatCompletionMessage(role="assistant", content=test_response),
                finish_reason="stop",
            )
        ],
        created=1234567890,
        model="test-model",
        object="chat.completion",
    )

    score = tlm_chat.score(response=response, **openai_kwargs)

    assert score is not None
    assert is_trustworthiness_score_json_format(score)

def test_tlm_chat_completion_score_with_options() -> None:
    tlm_chat = TLMChatCompletion(options={"log": ["explanation", "perplexity"]})
    openai_kwargs = {
        "model": "gpt-4.1-mini",
        "messages": [{"role": "user", "content": test_prompt}],
    }
    response = ChatCompletion(
        id="test",
        choices=[
            Choice(
                index=0,
                message=ChatCompletionMessage(role="assistant", content=test_response),
                finish_reason="stop",
            )
        ],
        created=1234567890,
        model="test-model",
        object="chat.completion",
    )

    score = tlm_chat.score(response=response, **openai_kwargs)

    assert score is not None
    assert is_trustworthiness_score_json_format(score)

def test_tlm_chat_completion_score_with_tools() -> None:
    tlm_chat = TLMChatCompletion()
    openai_kwargs = {
        "model": "gpt-4.1-mini",
        "messages": [{"role": "user", "content": test_prompt}],
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "search",
                    "description": "Search the web for information",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "query": {
                                "type": "string",
                                "description": "The search query",
                            }
                        },
                        "required": ["query"],
                    },
                },
            }
        ],
    }
    response = ChatCompletion(
        id="test",
        choices=[
            Choice(
                index=0,
                message=ChatCompletionMessage(role="assistant", content=test_response),
                finish_reason="stop",
            )
        ],
        created=1234567890,
        model="test-model",
        object="chat.completion",
    )

    score = tlm_chat.score(response=response, **openai_kwargs)

    assert score is not None
    assert is_trustworthiness_score_json_format(score)

def test_tlm_chat_completion_score_invalid_response() -> None:
    tlm_chat = TLMChatCompletion()
    openai_kwargs = {
        "model": "gpt-4.1-mini",
        "messages": [{"role": "user", "content": test_prompt}],
    }
    invalid_response = {"invalid": "response"}

    with pytest.raises(TypeError, match="The response is not an OpenAI ChatCompletion object."):
        tlm_chat.score(response=invalid_response, **openai_kwargs)  # type: ignore

def test_tlm_chat_completion_score_missing_messages() -> None:
    tlm_chat = TLMChatCompletion()
    openai_kwargs = {
        "model": "gpt-4.1-mini",
        "messages": [{"role": "user", "content": test_prompt}],
    }
    response = ChatCompletion(
        id="test",
        choices=[
            Choice(
                index=0,
                message=ChatCompletionMessage(role="assistant", content=None),
                finish_reason="stop",
            )
        ],
        created=1234567890,
        model="test-model",
        object="chat.completion",
    )

    with pytest.raises(
        ValueError,
        match="The OpenAI ChatCompletion object does not contain a message content.",
    ):
        tlm_chat.score(response=response, **openai_kwargs)
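Following up on the tool-call review thread in the first file (and the with_tools test above): a rough sketch, not part of this PR, of how function tool-call output might be pulled out of a ChatCompletion so a future version of score() could evaluate it as well. The helper name and the rendered text format are assumptions.

from openai.types.chat import ChatCompletion

def _get_tool_call_text(response: ChatCompletion) -> str:
    # Hypothetical helper (not in this diff): render any function tool calls in the
    # response as text, so they could be passed to TLM scoring alongside or instead
    # of message.content.
    message = response.choices[0].message
    if not message.tool_calls:
        return ""
    return "\n".join(
        f"{call.function.name}({call.function.arguments})" for call in message.tool_calls
    )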