class DocxConverter(HtmlConverter):
    """
    Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.

    Embedded images are written to disk (see ``convert_image``) and referenced
    from the generated Markdown by file path.
    """

    # Longest file name (extension included) this converter will create.
    # NOTE(review): presumably chosen to stay below the 255-character file name
    # limit common to most file systems -- confirm before changing.
    _MAX_FILENAME_LENGTH = 251

    def sanitize_filename(self, name: str) -> str:
        """Sanitizes a string to make it a valid file name across different operating systems.

        Args:
            name: Arbitrary text, e.g. an image description.

        Returns:
            The sanitized name, at most ``_MAX_FILENAME_LENGTH`` characters long.
            The result is empty when ``name`` consists only of whitespace and
            dots; callers must supply their own fallback in that case.
        """
        # Collapse every run of whitespace (spaces, tabs, newlines) into one underscore.
        name = re.sub(r"\s+", "_", name.strip())

        # Replace path separators, characters that are reserved on Windows, and
        # ASCII control characters, none of which are safe in a file name.
        name = re.sub(r'[\\/*?:"<>|\x00-\x1f]', "_", name)

        # Remove leading and trailing dots and spaces: Windows drops trailing
        # ones silently and a leading dot hides the file on POSIX systems.
        name = name.strip(" .")

        return name[: self._MAX_FILENAME_LENGTH]

    def truncate_filename(self, name: str, max_length: int, extension: str = "") -> str:
        """Truncates the filename to ensure the final length is within the limit.

        Args:
            name: File name without its extension.
            max_length: Maximum length of ``name`` plus ``extension``.
            extension: Text that will be appended to ``name`` afterwards.

        Returns:
            ``name`` cut down so that ``name + extension`` fits in ``max_length``
            characters; an empty string if the extension alone uses up the limit.
        """
        # Clamp at zero: a negative slice bound would drop characters from the
        # END of the name instead of producing an empty result.
        max_base_length = max(max_length - len(extension), 0)
        return name[:max_base_length]

    def unique_filename(
        self, base_path: str, max_length: int = _MAX_FILENAME_LENGTH
    ) -> str:
        """Generates a unique filename while ensuring it stays within the length limit.

        Args:
            base_path: Desired path of the file (directory plus file name).
            max_length: Maximum length of the file name, extension included.
                The directory part of ``base_path`` does not count.

        Returns:
            ``base_path`` itself if nothing exists there yet, otherwise the same
            path with ``_1``, ``_2``, ... inserted before the extension.
        """
        # Apply the limit to the final path component only; truncating the whole
        # path would cut into the directory and point at the wrong location.
        directory, filename = os.path.split(base_path)
        stem, ext = os.path.splitext(filename)

        candidate = self.truncate_filename(stem, max_length, ext) + ext
        counter = 1
        while os.path.exists(os.path.join(directory, candidate)):
            tail = f"_{counter}{ext}"
            # Shorten the stem further so there is room for the counter suffix.
            candidate = self.truncate_filename(stem, max_length, tail) + tail
            counter += 1

        return os.path.join(directory, candidate)

    def convert_image(self, image, output_dir: str) -> dict:
        """Handles image extraction and saving with collision avoidance and length limits.

        Args:
            image: Image object supplied by mammoth; ``alt_text`` and ``open()``
                are the only members used here.
            output_dir: Directory the image is written to (created on demand).

        Returns:
            Attributes for the resulting ``<img>`` element: ``src`` holds the
            path of the saved file and ``alt`` the cleaned description. If the
            image cannot be saved, ``{"src": ""}`` is returned instead.
        """
        import logging

        # A missing description must not abort the whole conversion.
        alt_text = (image.alt_text or "").replace("\n", " ")
        if image.alt_text:
            # Written back because mammoth may read ``image.alt_text`` again when
            # it builds the <img> element -- TODO confirm against mammoth.
            image.alt_text = alt_text

        # "image" is the fallback for images without a usable description;
        # unique_filename() appends a counter when several of them collide.
        file_stem = self.sanitize_filename(alt_text) or "image"

        # NOTE(review): the ".png" extension is used regardless of the actual
        # image format; kept as-is because existing output depends on it.
        try:
            os.makedirs(output_dir, exist_ok=True)
            image_path = self.unique_filename(
                os.path.join(output_dir, file_stem + ".png"),
                self._MAX_FILENAME_LENGTH,
            )
            with image.open() as image_bytes, open(image_path, "wb") as img_file:
                img_file.write(image_bytes.read())
        except Exception:
            # Deliberately best-effort: one unreadable or unwritable image should
            # not prevent the rest of the document from being converted.
            logging.getLogger(__name__).warning(
                "Could not save DOCX image to %r", output_dir, exc_info=True
            )
            return {"src": ""}

        return {"src": image_path, "alt": alt_text}

    def convert(
        self, local_path: str, **kwargs
    ) -> Union[None, DocumentConverterResult]:
        """Converts the DOCX file at ``local_path`` to Markdown.

        Keyword Args:
            file_extension: Extension of the input; anything but ``.docx``
                (case-insensitive) makes this converter decline the file.
            style_map: Optional style map passed through to mammoth.
            image_output_dir: Directory for extracted images. Defaults to
                ``"images"``, relative to the current working directory.

        Returns:
            The conversion result, or ``None`` if the file is not a DOCX or the
            conversion failed.
        """
        import logging

        # Bail if not a DOCX
        extension = kwargs.get("file_extension") or ""
        if extension.lower() != ".docx":
            return None

        style_map = kwargs.get("style_map")
        image_output_dir = kwargs.get("image_output_dir", "images")

        try:
            with open(local_path, "rb") as docx_file:
                mammoth_result = mammoth.convert_to_html(
                    docx_file,
                    style_map=style_map,
                    convert_image=mammoth.images.inline(
                        lambda img: self.convert_image(img, image_output_dir)
                    ),
                )

            return self._convert(mammoth_result.value)
        except Exception:
            # Returning None on failure is the established contract of this
            # method; log the cause so the failure is not completely silent.
            logging.getLogger(__name__).warning(
                "Failed to convert DOCX file %r", local_path, exc_info=True
            )
            return None
0000000..e8c79f5 Binary files /dev/null and b/tests/test_files/test_with_images.docx differ diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 3333bcb..261376e 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -89,6 +89,16 @@ "Yet another comment in the doc. 55yiyi-asd09", ] +DOCX_IMAGES_TEST_STRINGS = [ + "314b0a30-5b04-470b-b9f7-eed2c2bec74a", + "49e168b7-d2ae-407f-a055-2167576f39a1", + "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f", + "# Abstract", + "# Introduction", + "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", + "GitHub_-_microsoft_markitdown__Python_tool_for_converting_files_and_office_documents_to_Markdown.png", +] + PPTX_TEST_STRINGS = [ "2cdda5c8-e50e-4db4-b5f0-9722a649f455", "04191ea8-5c73-4215-a1d3-1cfb43aaaf12", @@ -206,6 +216,10 @@ def test_markitdown_local() -> None: result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx")) validate_strings(result, DOCX_TEST_STRINGS) + # Test DOCX processing, with images + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_with_images.docx")) + validate_strings(result, DOCX_IMAGES_TEST_STRINGS) + # Test DOCX processing, with comments result = markitdown.convert( os.path.join(TEST_FILES_DIR, "test_with_comment.docx"),