diff --git a/pyproject.toml b/pyproject.toml
index 176c1203..bda7e943 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "langcheck"
-version = "0.10.0.dev12"
+version = "0.10.0.dev13"
 description = "Simple, Pythonic building blocks to evaluate LLM-based applications"
 readme = "README.md"
 authors = [{ name = "Citadel AI", email = "info@citadel.co.jp" }]
diff --git a/src/langcheck/__init__.py b/src/langcheck/__init__.py
index e628b7dc..7fe04c19 100644
--- a/src/langcheck/__init__.py
+++ b/src/langcheck/__init__.py
@@ -1,4 +1,4 @@
 from langcheck import augment, metrics, plot, utils
 
 __all__ = ["augment", "metrics", "plot", "utils"]
-__version__ = "0.10.0.dev12"
+__version__ = "0.10.0.dev13"
diff --git a/src/langcheck/metrics/eval_clients/_litellm.py b/src/langcheck/metrics/eval_clients/_litellm.py
index b9793de3..5ffc44ab 100644
--- a/src/langcheck/metrics/eval_clients/_litellm.py
+++ b/src/langcheck/metrics/eval_clients/_litellm.py
@@ -639,12 +639,29 @@ def _embed(self, inputs: list[str]) -> torch.Tensor:
 
 def _get_token_usage(responses: list[Any], model: str) -> MetricTokenUsage:
     """Get the token usage from the response."""
+
+    # For Responses API, the token usage is stored in the usage field
+    # with type ResponseAPIUsage, which has input_tokens and output_tokens.
+    # For Chat Completions API, the token usage is stored in the usage field
+    # with type Usage, which has prompt_tokens and completion_tokens.
     input_token_count = sum(
-        response.usage.prompt_tokens if response and response.usage else 0
+        getattr(
+            response.usage,
+            "prompt_tokens",
+            getattr(response.usage, "input_tokens", 0),
+        )
+        if response and response.usage
+        else 0
         for response in responses
     )
     output_token_count = sum(
-        response.usage.completion_tokens if response and response.usage else 0
+        getattr(
+            response.usage,
+            "completion_tokens",
+            getattr(response.usage, "output_tokens", 0),
+        )
+        if response and response.usage
+        else 0
         for response in responses
     )
     input_token_cost, output_token_cost = cost_per_token(
diff --git a/tests/metrics/eval_clients/test_litellm.py b/tests/metrics/eval_clients/test_litellm.py
index eff2285a..2d711f6d 100644
--- a/tests/metrics/eval_clients/test_litellm.py
+++ b/tests/metrics/eval_clients/test_litellm.py
@@ -4,7 +4,7 @@
 from unittest.mock import Mock, patch
 
 import pytest
-from litellm.types.llms.openai import ResponsesAPIResponse
+from litellm.types.llms.openai import ResponseAPIUsage, ResponsesAPIResponse
 from litellm.types.utils import (
     Choices,
     EmbeddingResponse,
@@ -95,7 +95,7 @@ def test_get_text_response_with_reasoning_summary(system_prompt):
     )
 
     mock_response.usage = Mock(
-        spec=Usage, prompt_tokens=20, completion_tokens=30
+        spec=ResponseAPIUsage, input_tokens=20, output_tokens=30
     )
 
     # Calling litellm.responses requires a credentials, so we mock the return
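
The core of this change is a `getattr` fallback that lets `_get_token_usage` read token counts from either usage shape: LiteLLM's Chat Completions `Usage` (with `prompt_tokens`/`completion_tokens`) or the Responses API's `ResponseAPIUsage` (with `input_tokens`/`output_tokens`). Below is a minimal, self-contained sketch of that fallback pattern; `FakeChatUsage` and `FakeResponsesUsage` are hypothetical stand-ins for the LiteLLM types, used only to show the two attribute shapes.

```python
from dataclasses import dataclass


@dataclass
class FakeChatUsage:
    # Hypothetical stand-in for litellm's Chat Completions Usage shape.
    prompt_tokens: int
    completion_tokens: int


@dataclass
class FakeResponsesUsage:
    # Hypothetical stand-in for litellm's ResponseAPIUsage shape.
    input_tokens: int
    output_tokens: int


def count_input_tokens(usage: object) -> int:
    # Prefer the Chat Completions field; fall back to the Responses API
    # field; default to 0 if neither attribute exists. Note the inner
    # getattr is evaluated eagerly even when prompt_tokens exists, which
    # is harmless here since it is only an attribute lookup.
    return getattr(usage, "prompt_tokens", getattr(usage, "input_tokens", 0))


assert count_input_tokens(FakeChatUsage(20, 30)) == 20
assert count_input_tokens(FakeResponsesUsage(20, 30)) == 20
assert count_input_tokens(object()) == 0
```

This is also why the test switches its mock to `spec=ResponseAPIUsage`: an unspecced `Mock` reports every attribute as present, so `getattr` would always find `prompt_tokens` and the Responses API fallback branch would never be exercised.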