Add generation caching in TextEnvironment and fix bugs in TextEnvironment #2556

Open
wants to merge 32 commits into main from text_environment_caching
Changes from 14 commits
Commits (32)
ab86162
feat: add caching for TextEnvironment and fix bugs
Jan 10, 2025
d09ec63
feat: make TextEnvironment caching optional and add documentation
Jan 10, 2025
b7885cc
fix: failing TextEnvironment tests
Jan 10, 2025
034c5f7
test: add tests for TextEnvironment caching and fix cache combining bug
Jan 10, 2025
18eb106
test: remove unnecessary parametrized class decorator
Jan 10, 2025
44fd184
docs: update TextEnvironmentDocs with caching
Jan 10, 2025
28601c2
fix: run linter on TextEnvironment and TextEnvironment tests
Jan 10, 2025
2a7ec4e
fix: comment
Jan 10, 2025
af06d63
fix: Args comment
Jan 10, 2025
f6f12b5
fix: TextEnvironment cache combination and batching issue
Jan 10, 2025
ede7e81
tests: make caching test more complex
Jan 10, 2025
acddaa7
fix: combine caches of different sequence lengths
Jan 11, 2025
e38940e
docs: update caching warning
Jan 12, 2025
66d0ce4
fix: prevent bos tokens in tool response
Jan 12, 2025
a051e46
docs: Update docs/source/text_environments.md
konrad-gerlach Jan 12, 2025
9ea9287
Update trl/environment/base_environment.py
konrad-gerlach Jan 12, 2025
ae1233a
Merge branch 'main' into text_environment_caching
konrad-gerlach Jan 12, 2025
a2860bc
fix: code cleanup
Jan 12, 2025
23014fb
fix: attended to invalid last generated token and off-by-one in Strin…
Jan 14, 2025
bdaa922
Merge branch 'main' into text_environment_caching
konrad-gerlach Jan 15, 2025
a097c5b
Merge branch 'main' into text_environment_caching
konrad-gerlach Jan 17, 2025
7324ee1
fix: off by one error in StringStoppingCriteria
Jan 21, 2025
9b6a6ec
Merge branch 'main' into text_environment_caching
konrad-gerlach Jan 21, 2025
39763b1
feat: test logits are same with and without caching
Jan 22, 2025
b70f51c
fix: model and tokenizer were called gpt2 but were another model
Jan 22, 2025
7b2169d
docs: add warning for torch.compile with TextEnvironment use_cache
Jan 22, 2025
c4b5400
Merge branch 'text_environment_caching' of https://github.com/konrad-…
Jan 22, 2025
5725b18
fix: StringStoppingCriteria and add test
Jan 23, 2025
589dcb7
refactor: move StoppingCriteria test
Jan 23, 2025
5e1a7dd
feat: add support for models without cache class support
Jan 23, 2025
cc99580
refactor: make caching code optional in TextEnvironment
Jan 23, 2025
50119a8
docs: TextEnvironment use_cache note untested Encoder-Decoder archite…
Jan 23, 2025
1 change: 1 addition & 0 deletions docs/source/text_environments.md
@@ -114,6 +114,7 @@ Let's decompose the settings:
| `max_tool_response`| The tool response is truncated to this number to avoid running out of model context.|
| `max_length` | The maximum number of tokens to allow in an episode. |
| `generation_kwargs`| Generation settings used by the language model. |
| `use_cache` | Cache keys and values between segment generations. Warning: this feature is experimental! With caching enabled, TextEnvironment is not suited for training in the sense of backpropagating through the generated graph, although use with trl trainers is still possible. Caching also requires that there be no computational dependencies between examples at inference time; when using BatchNorm, the model should therefore be in eval mode. Caching is not guaranteed to produce results identical to generation without caching, so test for yourself whether it suits your needs, model, and generation_kwargs. Cache use has been tested for GPT-2 with greedy search. |
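
As an aside (not part of this patch), here is a minimal sketch of enabling the flag. It assumes `use_cache` is passed as a `TextEnvironment` keyword argument alongside the other settings above, and it uses a placeholder tool and reward function purely for illustration:

```python
import torch
from transformers import AutoTokenizer

from trl import AutoModelForCausalLMWithValueHead, TextEnvironment

model = AutoModelForCausalLMWithValueHead.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model.eval()  # keep batch-dependent layers (e.g. BatchNorm) from coupling examples at inference time

env = TextEnvironment(
    model,
    tokenizer,
    tools={"DummyTool": lambda text: text},  # placeholder tool for illustration
    reward_fn=lambda texts: [torch.tensor(0.0) for _ in texts],  # placeholder reward
    prompt="I am a prompt!\n",
    generation_kwargs={"do_sample": False, "max_new_tokens": 16, "pad_token_id": tokenizer.eos_token_id},
    use_cache=True,  # experimental: reuse keys and values across segment generations
)

queries, responses, masks, rewards, histories = env.run(["What is 2 + 2?"])
```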

You can customize the environment to your needs and add custom tools and settings. Let's see how you can use the environment to have the model interact with the available tools!

217 changes: 212 additions & 5 deletions tests/test_environments.py
@@ -16,7 +16,7 @@
from unittest.mock import patch

import torch
from transformers import AutoTokenizer
from transformers import AutoTokenizer, DynamicCache

from trl import AutoModelForCausalLMWithValueHead, TextEnvironment, TextHistory

@@ -26,10 +26,22 @@ def __call__(self, text):
return text


def dummy_generate(histories):
def dummy_generate(
histories, past_key_values=None, past_attention_masks=None, past_input_ids=None, last_active_histories=None
):
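# Stands in for TextEnvironment generation with caching: it accepts the cached state and returns
# (histories, past_key_values, past_attention_masks, past_input_ids, last_active_histories),
# using None placeholders since the mock produces no real cache.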
for i in range(len(histories)):
histories[i].append_segment("<request><DummyTool>test<call>", torch.tensor([1, 2, 3]), system=False)
return histories
return histories, None, None, None, None


def reshape_cache(cache):
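# Reshape flat key/value entries into the legacy cache layout expected by
# DynamicCache.from_legacy_cache: one (keys, values) pair per layer, each of
# shape (batch_size, num_heads, seq_len, head_dim).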
new_cache = []
for layer in cache:
keys, values = layer
keys = keys.reshape((-1, 1, 1, 1))
values = values.reshape((-1, 1, 1, 1))
new_cache.append((keys, values))
return tuple(new_cache)


class TextHistoryTest(unittest.TestCase):
@@ -79,6 +91,7 @@ def test_text_history_last_segment(self):
history.append_segment("General Kenobi!", torch.tensor([4, 5, 6]))
history.append_segment("You are a bold one!", torch.tensor([7, 8, 9]))
self.assertEqual(history.last_text_segment, "You are a bold one!")
self.assertTrue(torch.all(history.last_token_segment == torch.tensor([7, 8, 9])).item())

def test_text_history_split_query_response(self):
text = "Hello there!"
@@ -131,10 +144,10 @@ def test_text_environment_generate(self):

model_inputs = [self.gpt2_tokenizer(txt, return_tensors="pt").input_ids.squeeze() for txt in input_texts]

generations_batched = env._generate_batched(model_inputs, batch_size=2)
generations_batched, _, _, _, _ = env._generate_batched(model_inputs, batch_size=2)
generations_batched = self.gpt2_tokenizer.batch_decode(generations_batched)

generations_single = [env._generate_batched([inputs], batch_size=1)[0] for inputs in model_inputs]
generations_single = [env._generate_batched([inputs], batch_size=1)[0][0] for inputs in model_inputs]
generations_single = self.gpt2_tokenizer.batch_decode(generations_single)

self.assertEqual(generations_single, generations_batched)
@@ -276,3 +289,197 @@ def test_text_environment_run(self, mock_generate):
("I am a prompt!\n" + "Hello there! General Kenobi!")
+ (2 * "<request><DummyTool>test<call>test<response>"),
)

def test_combine_cache(self):
env = TextEnvironment(
self.gpt2_model,
self.gpt2_tokenizer,
tools={"DummyTool": DummyTool()},
reward_fn=lambda x: [torch.tensor(i) for i, _ in enumerate(x)],
prompt="I am a prompt!\n",
max_turns=2,
)

caches = [
(
(torch.tensor([[1], [2]]), torch.tensor([[3], [4]])),
(torch.tensor([[7], [8]]), torch.tensor([[9], [10]])),
),
(
(torch.tensor([[5]]), torch.tensor([[6]])),
(torch.tensor([[11]]), torch.tensor([[12]])),
),
]
caches = [DynamicCache().from_legacy_cache(reshape_cache(cache)) for cache in caches]
attention_masks = [torch.tensor([[0, 1], [1, 0]]), torch.tensor([[2, 4]])]
input_ids = [torch.tensor([[1, 4], [2, 5]]), torch.tensor([[3, 6]])]
example_mask = [True, False, True]

expected_cache = reshape_cache(
(
(torch.tensor([[1], [5]]), torch.tensor([[3], [6]])),
(torch.tensor([[7], [11]]), torch.tensor([[9], [12]])),
)
)
expected_attention_mask = torch.tensor([[0, 1], [2, 4]])
expected_input_ids = torch.tensor([[1, 4], [3, 6]])

combined_cache, combined_attention_masks, combined_input_ids = env._combine_cache(
example_mask, caches, attention_masks, input_ids
)

self.assertEqual(len(combined_cache), len(expected_cache))
self.assertEqual(len(combined_cache[0]), len(expected_cache[0]))
self.assertTrue(torch.all(combined_cache[0][0] == expected_cache[0][0]))
self.assertTrue(torch.all(combined_cache[0][1] == expected_cache[0][1]))
self.assertEqual(len(combined_cache[1]), len(expected_cache[1]))
self.assertTrue(torch.all(combined_cache[1][0] == expected_cache[1][0]))
self.assertTrue(torch.all(combined_cache[1][1] == expected_cache[1][1]))
self.assertTrue(torch.all(combined_attention_masks == expected_attention_mask))
self.assertTrue(torch.all(combined_input_ids == expected_input_ids))

def test_get_batched_cache(self):
env = TextEnvironment(
self.gpt2_model,
self.gpt2_tokenizer,
tools={"DummyTool": DummyTool()},
reward_fn=lambda x: [torch.tensor(i) for i, _ in enumerate(x)],
prompt="I am a prompt!\n",
max_turns=2,
)

cache = reshape_cache(
(
(torch.tensor([[1], [2], [3]]), torch.tensor([[4], [5], [6]])),
(torch.tensor([[7], [8], [9]]), torch.tensor([[10], [11], [12]])),
)
)
attention_masks = torch.tensor([[1], [2], [3]])
input_ids = torch.tensor([[4], [5], [6]])
batched_cache, batched_attention_masks, batched_input_ids = env._get_batched_cache(
1, 3, cache, attention_masks, input_ids
)
batched_cache = batched_cache.to_legacy_cache()
expected_cache = reshape_cache(
(
(torch.tensor([[2], [3]]), torch.tensor([[5], [6]])),
(torch.tensor([[8], [9]]), torch.tensor([[11], [12]])),
)
)

self.assertEqual(len(batched_cache), len(expected_cache))
self.assertEqual(len(batched_cache[0]), len(expected_cache[0]))
self.assertTrue(torch.all(batched_cache[0][0] == expected_cache[0][0]))
self.assertTrue(torch.all(batched_cache[0][1] == expected_cache[0][1]))
self.assertEqual(len(batched_cache[1]), len(expected_cache[1]))
self.assertTrue(torch.all(batched_cache[1][0] == expected_cache[1][0]))
self.assertTrue(torch.all(batched_cache[1][1] == expected_cache[1][1]))

expected_attention_mask = torch.tensor([[2], [3]])
self.assertTrue(torch.all(batched_attention_masks == expected_attention_mask))

expected_input_ids = torch.tensor([[5], [6]])
self.assertTrue(torch.all(batched_input_ids == expected_input_ids))

def test_cached_generate_batched(self):
generation_kwargs = {"do_sample": False, "max_new_tokens": 4, "pad_token_id": self.gpt2_tokenizer.eos_token_id}
env = TextEnvironment(
self.gpt2_model,
self.gpt2_tokenizer,
tools=[DummyTool()],
reward_fn=lambda x: torch.tensor(1),
prompt="I am a prompt!\n",
generation_kwargs=generation_kwargs,
)

input_texts = ["this is a test", "this is another, longer test", "some other batch", "something unnecessary"]
model_inputs = [self.gpt2_tokenizer(txt, return_tensors="pt").input_ids.squeeze() for txt in input_texts]
outputs, past_key_values, past_attention_masks, past_input_ids, _ = env._generate_batched(
model_inputs, batch_size=2
)

past_key_values, past_attention_masks, past_input_ids = env._combine_cache(
[True, True, True, False], past_key_values, past_attention_masks, past_input_ids
)

input_texts2 = [" short interim", " a somewhat longer section in between"]
model_inputs2 = [self.gpt2_tokenizer(txt, return_tensors="pt").input_ids.squeeze() for txt in input_texts2]
# for single token query
model_inputs2.append(
torch.tensor([self.gpt2_tokenizer(" a", return_tensors="pt").input_ids], dtype=model_inputs2[0].dtype)
)

outputs_cached, _, _, _, _ = env._generate_batched(
model_inputs2,
batch_size=2,
combined_past_key_values=past_key_values,
combined_past_attention_masks=past_attention_masks,
combined_past_input_ids=past_input_ids,
)

model_inputs2_full = [
torch.concat([in1, out1, in2], dim=0) for in1, out1, in2 in zip(model_inputs[:-1], outputs, model_inputs2)
]
outputs_uncached, _, _, _, _ = env._generate_batched(model_inputs2_full, batch_size=2)
for cached, uncached in zip(outputs_cached, outputs_uncached):
self.assertTrue(torch.all(cached == uncached))

def test_different_sequence_lengths(self):
generation_kwargs = {"do_sample": False, "max_new_tokens": 4, "pad_token_id": self.gpt2_tokenizer.eos_token_id}
env = TextEnvironment(
self.gpt2_model,
self.gpt2_tokenizer,
tools=[DummyTool()],
reward_fn=lambda x: torch.tensor(1),
prompt="I am a prompt!\n",
generation_kwargs=generation_kwargs,
)

input_texts = ["this is a test", "this is another, longer test", "some other batch"]
model_inputs = [self.gpt2_tokenizer(txt, return_tensors="pt").input_ids.squeeze() for txt in input_texts]
outputs, past_key_values, past_attention_masks, past_input_ids, _ = env._generate_batched(
model_inputs, batch_size=2
)
# remove the last two tokens from the second batch to pretend they were never generated
second_cache = past_key_values[1].to_legacy_cache()
edited_cache = []
for layer in second_cache:
keys, values = layer
new_keys = keys[:, :, :-2, :]
new_values = values[:, :, :-2, :]
edited_cache.append((new_keys, new_values))

past_key_values[1] = DynamicCache().from_legacy_cache(tuple(edited_cache))
past_attention_masks[1] = past_attention_masks[1][:, :-2]
past_input_ids[1] = past_input_ids[1][:, :-2]

# ensure this actually removes generated tokens and not skipped tokens / padding
self.assertEqual(len(outputs[2]), 4)

past_key_values, past_attention_masks, past_input_ids = env._combine_cache(
[True, True, True], past_key_values, past_attention_masks, past_input_ids
)

self.assertEqual(past_attention_masks.shape, past_input_ids.shape)
self.assertEqual(past_key_values[0][0].shape[2], past_attention_masks.shape[1] - 1)
self.assertEqual(past_key_values[0][0].shape[0], past_attention_masks.shape[0])
input_texts2 = [" short interim", " a somewhat longer section in between"]
model_inputs2 = [self.gpt2_tokenizer(txt, return_tensors="pt").input_ids.squeeze() for txt in input_texts2]
# for single token query
model_inputs2.append(
torch.tensor([self.gpt2_tokenizer(" a", return_tensors="pt").input_ids], dtype=model_inputs2[0].dtype)
)
outputs_cached, _, _, _, _ = env._generate_batched(
model_inputs2,
batch_size=2,
combined_past_key_values=past_key_values,
combined_past_attention_masks=past_attention_masks,
combined_past_input_ids=past_input_ids,
)
outputs[2] = outputs[2][:-2] # remove last two generated tokens from input
model_inputs2_full = [
torch.concat([in1, out1, in2], dim=0) for in1, out1, in2 in zip(model_inputs, outputs, model_inputs2)
]
outputs_uncached, _, _, _, _ = env._generate_batched(model_inputs2_full, batch_size=2)
for cached, uncached in zip(outputs_cached, outputs_uncached):
self.assertTrue(torch.all(cached == uncached))