
Commit 950d6e1

Merge branch 'main' into set-token-limit

2 parents 7eff0c0 + 200e26c

File tree

6 files changed: +97 -14 lines changed


CHANGELOG.md

Lines changed: 2 additions & 1 deletion

@@ -11,7 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### New features
 
-* The `Chat` class gains a `.token_count()` method to help estimate input tokens before sending it to the LLM. (#23)
+* `Chat`'s `.tokens()` method gains a `values` argument. Set it to `"discrete"` to get a result that can be summed to determine the token cost of submitting the current turns. The default (`"cumulative"`) remains the same (the result can be summed to determine the overall token cost of the conversation).
+* `Chat` gains a `.token_count()` method to help estimate the token cost of new input. (#23)
 
 ### Bug fixes
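To make the distinction concrete, here is a short sketch of the two modes. The turns and token pairs are illustrative, mirroring the new tests in this commit:

```python
from chatlas import ChatOpenAI, Turn

# Assistant turns record (input, output) token counts as reported by the
# provider; the (2, 10) pair here is illustrative.
chat = ChatOpenAI(
    turns=[
        Turn(role="user", contents="Hi"),
        Turn(role="assistant", contents="Hello", tokens=(2, 10)),
    ]
)

# "cumulative" (the default): raw per-turn counts; summing gives the
# conversation's overall token cost.
print(chat.tokens(values="cumulative"))  # [None, (2, 10)]

# "discrete": per-turn costs; summing estimates what submitting the
# current turns will cost on the next request.
print(chat.tokens(values="discrete"))  # [2, 10]
```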

chatlas/_chat.py

Lines changed: 52 additions & 10 deletions

@@ -16,6 +16,7 @@
     Optional,
     Sequence,
     TypeVar,
+    overload,
 )
 
 from pydantic import BaseModel
@@ -177,14 +178,42 @@ def system_prompt(self, value: str | None):
         if value is not None:
             self._turns.insert(0, Turn("system", value))
 
-    def tokens(self) -> list[int]:
+    @overload
+    def tokens(self) -> list[tuple[int, int] | None]: ...
+
+    @overload
+    def tokens(
+        self,
+        values: Literal["cumulative"],
+    ) -> list[tuple[int, int] | None]: ...
+
+    @overload
+    def tokens(
+        self,
+        values: Literal["discrete"],
+    ) -> list[int]: ...
+
+    def tokens(
+        self,
+        values: Literal["cumulative", "discrete"] = "cumulative",
+    ) -> list[int] | list[tuple[int, int] | None]:
         """
         Get the tokens for each turn in the chat.
 
+        Parameters
+        ----------
+        values
+            If "cumulative" (the default), the result can be summed to get the
+            chat's overall token usage (helpful for computing the overall cost
+            of the chat). If "discrete", the result can be summed to get the
+            number of tokens the turns will cost to generate the next response
+            (helpful for estimating the cost of the next response, or for
+            determining if you are about to exceed the token limit).
+
         Returns
         -------
         list[int]
-            A list of token counts for each turn in the chat. Note that the
+            A list of token counts for each (non-system) turn in the chat. The
             1st turn includes the token count for the system prompt (if any).
 
         Raises
@@ -199,6 +228,9 @@ def tokens(self) -> list[int]:
 
         turns = self.get_turns(include_system_prompt=False)
 
+        if values == "cumulative":
+            return [turn.tokens for turn in turns]
+
         if len(turns) == 0:
             return []
 
@@ -220,21 +252,25 @@
         )
 
         if turns[0].role != "user":
-            raise ValueError("Expected the first turn to have role='user'. " + err_info)
+            raise ValueError(
+                "Expected the 1st non-system turn to have role='user'. " + err_info
+            )
 
         if turns[1].role != "assistant":
             raise ValueError(
-                "Expected the 2nd turn to have role='assistant'. " + err_info
+                "Expected the 2nd non-system turn to have role='assistant'. " + err_info
             )
 
         if turns[1].tokens is None:
             raise ValueError(
                 "Expected the 1st assistant turn to contain token counts. " + err_info
             )
 
-        tokens: list[int] = [
+        res: list[int] = [
+            # Implied token count for the 1st user input
             turns[1].tokens[0],
-            sum(turns[1].tokens),
+            # The token count for the 1st assistant response
+            turns[1].tokens[1],
         ]
         for i in range(1, len(turns) - 1, 2):
             ti = turns[i]
@@ -248,7 +284,7 @@ def tokens(self) -> list[int]:
                     "Expected role='assistant' turns to contain token counts."
                     + err_info
                 )
-            tokens.extend(
+            res.extend(
                 [
                     # Implied token count for the user input
                     tj.tokens[0] - sum(ti.tokens),
@@ -257,7 +293,7 @@
                 ]
             )
 
-        return tokens
+        return res
 
     def token_count(
         self,
@@ -285,12 +321,18 @@ def token_count(
         int
             The token count for the input.
 
+        Note
+        ----
+        Remember that the token count is an estimate. Also, models based on
+        `ChatOpenAI()` currently do not take tools into account when
+        estimating token counts.
+
        Examples
         --------
         ```python
-        from chatlas import ChatOpenAI
+        from chatlas import ChatAnthropic
 
-        chat = ChatOpenAI()
+        chat = ChatAnthropic()
         # Estimate the token count before sending the input
         print(chat.token_count("What is 2 + 2?"))
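For a standalone view of the arithmetic behind `values="discrete"`, here is a sketch using the same numbers as the new test below (provider-reported pairs `(2, 10)` and `(14, 10)`); this is plain Python mirroring the logic in the diff, not library code:

```python
# Provider-reported (input, output) token pairs on the assistant turns.
assistant_tokens = [(2, 10), (14, 10)]

# The 1st user input's cost is implied by the 1st assistant turn's input count.
discrete = [
    assistant_tokens[0][0],  # 1st user input: 2
    assistant_tokens[0][1],  # 1st assistant response: 10
]

for prev, cur in zip(assistant_tokens, assistant_tokens[1:]):
    # Each later input count covers the whole history, so the new user
    # input's cost is the current input count minus the previous exchange.
    discrete.append(cur[0] - sum(prev))  # 14 - (2 + 10) = 2
    discrete.append(cur[1])              # assistant response: 10

print(discrete)  # [2, 10, 2, 10]
```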

chatlas/_openai.py

Lines changed: 3 additions & 1 deletion

@@ -295,10 +295,12 @@ def _chat_perform_args(
             "stream": stream,
             "messages": self._as_message_param(turns),
             "model": self._model,
-            "seed": self._seed,
             **(kwargs or {}),
         }
 
+        if self._seed is not None:
+            kwargs_full["seed"] = self._seed
+
         if tool_schemas:
             kwargs_full["tools"] = tool_schemas
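The `seed` change makes the parameter conditional, presumably so `"seed": None` is never serialized into requests to endpoints that don't accept it. A minimal sketch of the pattern (the function name here is hypothetical, not the library's API):

```python
def build_request_kwargs(model: str, stream: bool, seed: int | None = None) -> dict:
    kwargs_full = {
        "stream": stream,
        "model": model,
    }
    # Attach optional parameters only when actually set, so the request
    # body never carries a null seed.
    if seed is not None:
        kwargs_full["seed"] = seed
    return kwargs_full


print(build_request_kwargs("gpt-4o-mini", stream=False))
# {'stream': False, 'model': 'gpt-4o-mini'}
print(build_request_kwargs("gpt-4o-mini", stream=False, seed=42))
# {'stream': False, 'model': 'gpt-4o-mini', 'seed': 42}
```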

docs/get-started.qmd

Lines changed: 1 addition & 1 deletion

@@ -82,7 +82,7 @@ Learn more in the article on [structured data extraction](structured-data.qmd).
 
 LLMs can also be useful to solve general programming problems. For example:
 
-* You can use LLMs to explain code, or even ask them to [generate a diagram](https://bsky.app/profile/daviddiviny.bsky.social/post/3lb6kjaen4c2u).
+* You can use LLMs to explain code, or even ask them to [generate a diagram](https://bsky.app/profile/daviddiviny.com/post/3lb6kjaen4c2u).
 
 * You can ask an LLM to analyse your code for potential code smells or security issues. You can do this a function at a time, or explore including the entire source code for your package or script in the prompt.

tests/test_provider_openai.py

Lines changed: 1 addition & 1 deletion

@@ -21,7 +21,7 @@ def test_openai_simple_request():
     chat.chat("What is 1 + 1?")
     turn = chat.get_last_turn()
     assert turn is not None
-    assert turn.tokens == (27, 1)
+    assert turn.tokens == (27, 2)
     assert turn.finish_reason == "stop"

tests/test_tokens.py

Lines changed: 38 additions & 0 deletions

@@ -1,7 +1,45 @@
+from chatlas import ChatAnthropic, ChatGoogle, ChatOpenAI, Turn
 from chatlas._openai import OpenAIAzureProvider, OpenAIProvider
 from chatlas._tokens import token_usage, tokens_log, tokens_reset
 
 
+def test_tokens_method():
+    chat = ChatOpenAI()
+    assert chat.tokens(values="discrete") == []
+
+    chat = ChatOpenAI(
+        turns=[
+            Turn(role="user", contents="Hi"),
+            Turn(role="assistant", contents="Hello", tokens=(2, 10)),
+        ]
+    )
+
+    assert chat.tokens(values="discrete") == [2, 10]
+
+    chat = ChatOpenAI(
+        turns=[
+            Turn(role="user", contents="Hi"),
+            Turn(role="assistant", contents="Hello", tokens=(2, 10)),
+            Turn(role="user", contents="Hi"),
+            Turn(role="assistant", contents="Hello", tokens=(14, 10)),
+        ]
+    )
+
+    assert chat.tokens(values="discrete") == [2, 10, 2, 10]
+    assert chat.tokens(values="cumulative") == [None, (2, 10), None, (14, 10)]
+
+
+def test_token_count_method():
+    chat = ChatOpenAI(model="gpt-4o-mini")
+    assert chat.token_count("What is 1 + 1?") == 31
+
+    chat = ChatAnthropic(model="claude-3-5-sonnet-20241022")
+    assert chat.token_count("What is 1 + 1?") == 16
+
+    chat = ChatGoogle(model="gemini-1.5-flash")
+    assert chat.token_count("What is 1 + 1?") == 9
+
+
 def test_usage_is_none():
     tokens_reset()
     assert token_usage() is None
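The `token_count` assertions above depend on each provider's tokenizer, so exact values vary by model. Combining the two new methods gives the kind of pre-flight budget check the `set-token-limit` branch name suggests; a sketch, with an illustrative limit:

```python
from chatlas import ChatOpenAI

chat = ChatOpenAI(model="gpt-4o-mini")
TOKEN_LIMIT = 128_000  # illustrative context-window budget

prompt = "What is 1 + 1?"
# Projected input cost: resubmitting the current turns (discrete sum)
# plus the estimated cost of the new prompt.
projected = sum(chat.tokens(values="discrete")) + chat.token_count(prompt)
if projected < TOKEN_LIMIT:
    chat.chat(prompt)
```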
