[llm unify 2/n] Implement llm_map(_elements) and move extract_entity to it. #1126

Merged · 49 commits · Jan 30, 2025
Changes from 1 commit (d7ff1eb)

Commits:
c2a8cfa  add prompt base classes and ElementListPrompt (HenryL27, Jan 17, 2025)
21a115a  override .instead in ElementListPrompt to store net-new keys in self.… (HenryL27, Jan 17, 2025)
f94da80  add ElementPrompt and StaticPrompt (HenryL27, Jan 17, 2025)
b73c162  add unit tests for prompts (HenryL27, Jan 21, 2025)
17b2163  forgot to commit this (HenryL27, Jan 21, 2025)
5d145d5  address pr comments; flatten properties with flatten_data (HenryL27, Jan 21, 2025)
7fa2ff1  support multiple user prompts (HenryL27, Jan 21, 2025)
abf9b0b  rename instead to set (HenryL27, Jan 22, 2025)
9909c7e  Merge branch 'main' of github.com:aryn-ai/sycamore into hml-llm-unify (HenryL27, Jan 22, 2025)
2d1315b  add LLMMap and LLMMapElements transforms (HenryL27, Jan 22, 2025)
1853d51  Merge branch 'main' of github.com:aryn-ai/sycamore into hml-llm-unify (HenryL27, Jan 22, 2025)
5e86e56  move llm implementations to use RenderedPrompts (HenryL27, Jan 22, 2025)
27581ef  also this guy (HenryL27, Jan 22, 2025)
739b672  add docset methods (HenryL27, Jan 23, 2025)
73d9bdd  docstrings (HenryL27, Jan 23, 2025)
ed8785e  add llm_map unit tests (HenryL27, Jan 23, 2025)
523d6e3  fix bedrock tests and caching (HenryL27, Jan 23, 2025)
e1b3206  fix anthropic and bedrock ITs (HenryL27, Jan 23, 2025)
6500e1c  adjust caching to handle pydantic class response format properly (HenryL27, Jan 23, 2025)
f50032d  fix base llm unit tests (HenryL27, Jan 23, 2025)
c3c7ea8  adjust all testing mock llms to updated llm interface (HenryL27, Jan 23, 2025)
ffaaf0f  deprecate extract entity and implement it with llm_map (HenryL27, Jan 24, 2025)
d71cf1a  add context_params decorator to llm_map (HenryL27, Jan 24, 2025)
4225e11  revert extract_entity docset method re-implementation (HenryL27, Jan 24, 2025)
0d39b27  add initial support for prompts that generate a sequence of rendered … (HenryL27, Jan 25, 2025)
0b5ded4  add stuff to EntityExtractor/OpenAIEntityExtractor to convert to LLMMap (HenryL27, Jan 25, 2025)
a52f7c2  make docset.extract_entity construct an LLMMap from its entity_extractor (HenryL27, Jan 25, 2025)
3a9ac3c  get extract entity working with tokenizer and token limit (HenryL27, Jan 28, 2025)
befc3d0  get all extract_entity unit tests passing (HenryL27, Jan 28, 2025)
8bf42d5  fix llm_map_elements to deal with postprocess index (HenryL27, Jan 28, 2025)
d7ff1eb  add postprocess_fn unit tests for llm_map (HenryL27, Jan 28, 2025)
a7a2cc0  ruff complaint (HenryL27, Jan 28, 2025)
ebf721e  fix docset unittests (HenryL27, Jan 28, 2025)
0bd2a45  move a bunch of stuff back to llm.generate_old. This includes the act… (HenryL27, Jan 28, 2025)
95cbaaf  move more stuff back to llm.generate_old (HenryL27, Jan 28, 2025)
ea7f0e6  fix the last few mocks (HenryL27, Jan 28, 2025)
2e51ee1  Merge branch 'main' of github.com:aryn-ai/sycamore into hml-llm-unify (HenryL27, Jan 28, 2025)
57a4e4b  ruff linelength (HenryL27, Jan 28, 2025)
a312ba3  mypy!!! (HenryL27, Jan 28, 2025)
ebde879  type: ignore + line length is tricky (HenryL27, Jan 28, 2025)
ff5efdc  fix generate_old with SimplePrompts (HenryL27, Jan 28, 2025)
370e2b7  set openai system role name to system instead of developer like their… (HenryL27, Jan 28, 2025)
98ce6a0  address simple pr comments (HenryL27, Jan 29, 2025)
1789409  pickle stuff in llm caching path bc not everything is jsonifiable (HenryL27, Jan 30, 2025)
8b6f085  rewrite llm_map to deal with iterative prompting better (HenryL27, Jan 30, 2025)
763acc5  add a b64encode-to-str to cache bc you can't put bytes in json either (HenryL27, Jan 30, 2025)
0331866  fix llm its to mimic the _llm_cache_set/get pickle/unpickle operations (HenryL27, Jan 30, 2025)
dfb7540  fix docstrings (HenryL27, Jan 30, 2025)
f7c06e7  oops bad type signature (HenryL27, Jan 30, 2025)
add postprocess_fn unit tests for llm_map
Signed-off-by: Henry Lindeman <hmlindeman@yahoo.com>
HenryL27 committed Jan 28, 2025
commit d7ff1ebf46e22da762c94054c99855da71be9c35
48 changes: 46 additions & 2 deletions lib/sycamore/sycamore/tests/unit/transforms/test_base_llm.py
@@ -54,6 +54,23 @@ def test_happy_path(self):
         assert outdocs[1].text_representation == "booga"
         assert outdocs[1].properties["out"] == "booga"
 
+    def test_postprocess(self):
+        prompt = FakeDocPrompt()
+        llm = FakeLLM()
+        doc1 = Document({"text_representation": "ooga"})
+        doc2 = Document({"text_representation": "booga"})
+        count = 0
+
+        def ppfn(d: Document, i: int) -> Document:
+            nonlocal count
+            count += 1
+            return d
+
+        map = LLMMap(None, prompt, "out", llm, postprocess_fn=ppfn)
+        _ = map.llm_map([doc1, doc2])
+
+        assert count == 2
+
 
 class TestLLMMapElements:
     def test_wrong_prompt_fails_fast(self):
@@ -67,13 +84,40 @@ def test_happy_path(self):
         prompt = FakeEltPrompt()
         llm = FakeLLM()
         doc1 = Document(
-            {"text_representation": "ooga", "elements": [{"text_representation": "yo"}, {"text_representation": "ho"}]}
+            {
+                "doc_id": "1",
+                "text_representation": "ooga",
+                "elements": [{"text_representation": "yo"}, {"text_representation": "ho"}],
+            }
         )
-        doc2 = Document({"elements": [{"text_representation": "booga"}, {}]})
+        doc2 = Document({"doc_id": "2", "elements": [{"text_representation": "booga"}, {}]})
         map = LLMMapElements(None, prompt, "out", llm)
         outdocs = map.llm_map_elements([doc1, doc2])
 
         assert outdocs[0].elements[0].properties["out"] == "oogayo"
         assert outdocs[0].elements[1].properties["out"] == "oogaho"
         assert outdocs[1].elements[0].properties["out"] == "Nonebooga"
         assert outdocs[1].elements[1].properties["out"] == "NoneNone"
+
+    def test_postprocess(self):
+        prompt = FakeEltPrompt()
+        llm = FakeLLM()
+        doc1 = Document(
+            {
+                "doc_id": "1",
+                "text_representation": "ooga",
+                "elements": [{"text_representation": "yo"}, {"text_representation": "ho"}],
+            }
+        )
+        doc2 = Document({"doc_id": "2", "elements": [{"text_representation": "booga"}, {}]})
+        count = 0
+
+        def ppfn(e: Element, i: int) -> Element:
+            nonlocal count
+            count += 1
+            return e
+
+        map = LLMMapElements(None, prompt, "out", llm, postprocess_fn=ppfn)
+        _ = map.llm_map_elements([doc1, doc2])
+
+        assert count == 4
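
The tests above pin down the postprocess_fn contract: it is invoked once per document (for LLMMap) or once per element (for LLMMapElements), receiving the object plus the index of the rendered prompt that produced the result. A minimal sketch of a non-trivial hook, assuming only the constructor signature exercised in these tests; "prompt_index" and clean_output are illustrative names, not part of the diff:

# Illustrative only: a postprocess_fn that strips whitespace from the
# generated field and records which rendered prompt succeeded.
from sycamore.data import Document

def clean_output(doc: Document, i: int) -> Document:
    raw = doc.properties.get("out")
    if isinstance(raw, str):
        doc.properties["out"] = raw.strip()
    # i is -1 when the prompt rendered to an empty list, else the index
    # of the RenderedPrompt that succeeded (0 for single-prompt renders).
    doc.properties["prompt_index"] = i
    return doc

llm_map = LLMMap(None, prompt, "out", llm, postprocess_fn=clean_output)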
22 changes: 19 additions & 3 deletions lib/sycamore/sycamore/transforms/base_llm.py
@@ -119,6 +119,11 @@ class LLMMapElements(MapBatch):
         llm: The llm to use for inference.
         llm_mode: How to call the llm - sync/async/batch. All LLMs do not
             necessarily implement all options.
+        postprocess_fn: Function to call on each element after the llm
+            inference. If the prompt rendered into multiple RenderedPrompts,
+            ``i`` is the index of the RenderedPrompt that succeeded; if the
+            prompt rendered into an empty list, ``i`` is -1; otherwise
+            ``i`` is 0.
 
     Example:
         .. code-block:: python
@@ -138,22 +143,33 @@ def __init__(
         output_field: str,
         llm: LLM,
         llm_mode: LLMMode = LLMMode.SYNC,
+        postprocess_fn: Callable[[Element, int], Element] = lambda e, i: e,
         **kwargs,
     ):
         self._prompt = prompt
         self._validate_prompt()
         self._output_field = output_field
         self._llm = llm
         self._llm_mode = llm_mode
+        self._postprocess_fn = postprocess_fn
         super().__init__(child, f=self.llm_map_elements, **kwargs)
 
     def llm_map_elements(self, documents: list[Document]) -> list[Document]:
-        rendered = [(e, self._prompt.render_element(e, d)) for d in documents for e in d.elements]
+        rendered = [(d, e, self._prompt.render_element(e, d)) for d in documents for e in d.elements]
         results = _infer_prompts(
-            _as_sequences([p for _, p in rendered]), self._llm, self._llm_mode, self._prompt.is_done
+            _as_sequences([p for _, _, p in rendered]), self._llm, self._llm_mode, self._prompt.is_done
         )
-        for (r, i), (e, _) in zip(results, rendered):
+        new_elts = []
+        last_doc = None
+        for (r, i), (d, e, _) in zip(results, rendered):
+            if last_doc is not None and last_doc.doc_id != d.doc_id:
+                last_doc.elements = new_elts
+                new_elts = []
             e.properties[self._output_field] = r
+            new_elts.append(self._postprocess_fn(e, i))
+            last_doc = d
+        if last_doc is not None:
+            last_doc.elements = new_elts
         return documents
 
     def _validate_prompt(self):
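
The rewritten loop above regroups the flat (document, element, result) triples into per-document element lists, relying on the flattening comprehension emitting each document's elements contiguously. The same pattern in isolation, as a self-contained sketch with plain tuples standing in for the Sycamore types (regroup is a hypothetical helper, not code from this PR):

# Minimal sketch of the contiguous-grouping pattern used in
# llm_map_elements above. Inputs are (doc_id, element) pairs that are
# already contiguous per doc_id, mirroring the flattened comprehension.
def regroup(pairs: list[tuple[str, str]]) -> dict[str, list[str]]:
    grouped: dict[str, list[str]] = {}
    current: list[str] = []
    last_id = None
    for doc_id, elt in pairs:
        if last_id is not None and last_id != doc_id:
            # doc boundary reached: flush the accumulated elements
            grouped[last_id] = current
            current = []
        current.append(elt)
        last_id = doc_id
    if last_id is not None:
        grouped[last_id] = current  # flush the final document
    return grouped

# e.g. regroup([("1", "yo"), ("1", "ho"), ("2", "booga")])
# -> {"1": ["yo", "ho"], "2": ["booga"]}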