microsoft · keenranger · Dec 26, 2024 · Dec 27, 2024 · Dec 27, 2024
diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
@@ -768,6 +768,17 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
                     except Exception:
                         pass
 
+                    # Try describing the image with the caller-supplied LLM client/model
+                    llm_client = kwargs.get("llm_client")
+                    llm_model = kwargs.get("llm_model")
+                    if llm_client is not None and llm_model is not None:
+                        alt_text += self._get_llm_description(
+                            shape.image,
+                            llm_client,
+                            llm_model,
+                            prompt=kwargs.get("llm_prompt"),
+                        ).strip()
+
                     # A placeholder name
                     filename = re.sub(r"\W", "", shape.name) + ".jpg"
                     md_content += (
@@ -857,6 +868,37 @@ def _convert_chart_to_markdown(self, chart):
         separator = "|" + "|".join(["---"] * len(data[0])) + "|"
         return md + "\n".join([header, separator] + markdown_table[1:])
 
+    def _get_llm_description(self, image, client, model, prompt=None):
+        """Return an LLM-written description of `image`, or "" if unavailable.
+
+        `image` must expose `content_type` and `blob`; `client` must expose an
+        OpenAI-style `chat.completions.create(model=..., messages=...)`.
+        A missing or blank `prompt` falls back to a generic caption request.
+        """
+        # Image formats accepted for vision input:
+        # https://platform.openai.com/docs/guides/vision#what-type-of-files-can-i-upload
+        supported = ("image/jpeg", "image/png", "image/webp", "image/gif")
+        if image.content_type not in supported:
+            return ""
+        if prompt is None or prompt.strip() == "":
+            prompt = "Write a caption for this image."
+        image_base64 = base64.b64encode(image.blob).decode("utf-8")
+        data_uri = f"data:{image.content_type};base64,{image_base64}"
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt},
+                    {"type": "image_url", "image_url": {"url": data_uri}},
+                ],
+            }
+        ]
+
+        response = client.chat.completions.create(model=model, messages=messages)
+        # content can be None; the caller calls .strip() on the result.
+        return response.choices[0].message.content or ""
+
 
 class MediaConverter(DocumentConverter):
     """

diff --git a/tests/test_files/test.pptx b/tests/test_files/test.pptx
diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
@@ -300,10 +300,26 @@ def test_markitdown_llm() -> None:
         assert test_string in result.text_content.lower()
 
 
+@pytest.mark.skipif(skip_llm, reason="do not run llm tests without a key")
+def test_markitdown_pptx_llm() -> None:
+    """Convert test.pptx with an LLM client and check the output text."""
+    converter = MarkItDown(llm_client=openai.OpenAI(), llm_model="gpt-4o-mini")
+    pptx_path = os.path.join(TEST_FILES_DIR, "test.pptx")
+
+    converted = converter.convert(pptx_path)
+    lowered = converted.text_content.lower()
+
+    # TODO: like test_markitdown_llm, these keyword checks are weak and
+    # should be improved.
+    for expected in ("red", "blue"):
+        assert expected in lowered
+
+
 if __name__ == "__main__":
     """Runs this file's tests from the command line."""
     test_markitdown_remote()
     test_markitdown_local()
     test_markitdown_exiftool()
     test_markitdown_deprecation()
     test_markitdown_llm()
+    test_markitdown_pptx_llm()  # NOTE(review): skipif likely ignored on direct call