diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 33806e1..ed1e847 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -714,6 +714,29 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: result = mammoth.convert_to_html(docx_file, style_map=style_map) html_content = result.value result = self._convert(html_content) + + # Extract any base64 encoded images from the HTML + descriptions = [] + if kwargs.get("llm_client") and kwargs.get("llm_model"): + for match in re.finditer(r'data:image/[^;]+;base64,([^"\']+)', html_content): + img_converter = ImageConverter() + descriptions.append(img_converter.convert_from_base64(match.group(1),'.png',**kwargs)) + + # Replace each base64 image with its description + if descriptions and result: + text_content = result.text_content + + # Find all base64 image markdown patterns + base64_pattern = r'!\[[\s\S]*?\]\(data:image/[a-z]+;base64.*?\)' + + # Find all base64 image markdown patterns + matches = list(re.finditer(base64_pattern, text_content)) + + # Replace each match with corresponding description + for i, match in enumerate(matches): + if i < len(descriptions): + text_content = text_content.replace(match.group(), f'[Image description {i}] \n{descriptions[i]}\n[End Image description {i}]') + result.text_content = text_content return result @@ -1114,6 +1137,59 @@ def _get_llm_description(self, local_path, extension, client, model, prompt=None response = client.chat.completions.create(model=model, messages=messages) return response.choices[0].message.content + + def _get_llm_description_from_base64( + self, + base64_str: str, + extension: str, + client: Any, + model: str, + prompt: Optional[str] = None + ) -> str: + """Get LLM description for a base64-encoded image string.""" + if prompt is None or prompt.strip() == "": + prompt = "Write a detailed caption for this image." + + # Remove data URI prefix if present + if ',' in base64_str: + base64_str = base64_str.split(',')[1] + + # Create data URI + content_type, encoding = mimetypes.guess_type("_dummy" + extension) + if content_type is None: + content_type = "image/jpeg" + + data_uri = f"data:{content_type};base64,{base64_str}" + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + { + "type": "image_url", + "image_url": { + "url": data_uri, + }, + }, + ], + } + ] + + response = client.chat.completions.create(model=model, messages=messages) + return response.choices[0].message.content + + def convert_from_base64( + self, + base64_str: str, + extension: str, + **kwargs: Any + ) -> Union[None, DocumentConverterResult]: + """Convert a base64-encoded image string to markdown.""" + client = kwargs.get("llm_client") + model = kwargs.get("llm_model") + prompt = kwargs.get("llm_prompt") + result = self._get_llm_description_from_base64(base64_str, extension, client, model, prompt) + return result class OutlookMsgConverter(DocumentConverter):