microsoft · gingerninja85 · Jul 4, 2026
diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py
@@ -115,6 +115,24 @@ def _pre_process_math(content: bytes) -> bytes:
     return str(soup).encode()
 
 
+def _pre_process_styles(content: bytes) -> bytes:
+    """
+    Removes malformed DOCX style definitions that are missing required attributes.
+
+    Mammoth raises ``KeyError`` on any ``w:style`` element lacking ``w:type``
+    or ``w:styleId``, aborting conversion. Only those entries are dropped; when
+    none are found the input bytes are returned unchanged.
+    """
+    soup = BeautifulSoup(content.decode(), features="xml")
+    removed = False
+    for tag in soup.find_all("w:style"):
+        if not (tag.has_attr("w:type") and tag.has_attr("w:styleId")):
+            tag.decompose()
+            removed = True
+    # Untouched style sheets pass through as-is instead of being re-serialized.
+    return str(soup).encode() if removed else content
+
+
 def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
     """
     Pre-processes a DOCX file with provided steps.
@@ -131,11 +149,12 @@ def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
     """
     output_docx = BytesIO()
     # The files that need to be pre-processed from .docx
-    pre_process_enable_files = [
-        "word/document.xml",
-        "word/footnotes.xml",
-        "word/endnotes.xml",
-    ]
+    pre_process_enable_files = {
+        "word/document.xml": _pre_process_math,
+        "word/footnotes.xml": _pre_process_math,
+        "word/endnotes.xml": _pre_process_math,
+        "word/styles.xml": _pre_process_styles,
+    }
     with zipfile.ZipFile(input_docx, mode="r") as zip_input:
         files = {name: zip_input.read(name) for name in zip_input.namelist()}
         with zipfile.ZipFile(output_docx, mode="w") as zip_output:
@@ -144,7 +163,7 @@ def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
                 if name in pre_process_enable_files:
                     try:
                         # Pre-process the content
-                        updated_content = _pre_process_math(content)
+                        updated_content = pre_process_enable_files[name](content)
                         # In the future, if there are more pre-processing steps, they can be added here
                         zip_output.writestr(name, updated_content)
                     except Exception:

diff --git a/packages/markitdown/tests/test_module_vectors.py b/packages/markitdown/tests/test_module_vectors.py
@@ -3,6 +3,8 @@
 import time
 import pytest
 import base64
+import re
+import zipfile
 
 from pathlib import Path
 
@@ -199,6 +201,34 @@ def test_convert_stream_keep_data_uris(test_vector):
             assert string not in result.markdown
 
 
+def test_convert_docx_with_style_missing_type(tmp_path):
+    """DOCX conversion should not fail when a style entry is missing w:type."""
+    source_path = os.path.join(TEST_FILES_DIR, "test.docx")
+    malformed_path = tmp_path / "missing_style_type.docx"
+
+    with zipfile.ZipFile(source_path, mode="r") as zip_input:
+        with zipfile.ZipFile(malformed_path, mode="w") as zip_output:
+            for item in zip_input.infolist():
+                content = zip_input.read(item.filename)
+                if item.filename == "word/styles.xml":
+                    styles_xml, stripped = re.subn(
+                        r'<w:style\s+w:type="[^"]+"',
+                        "<w:style",
+                        content.decode("utf-8"),
+                        count=1,
+                    )
+                    assert stripped == 1, "fixture has no w:type to strip"
+                    content = styles_xml.encode("utf-8")
+                zip_output.writestr(item, content)
+
+    result = MarkItDown().convert(str(malformed_path))
+
+    assert (
+        "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation"
+        in result.markdown
+    )
+
+
 if __name__ == "__main__":
     """Runs this file's tests from the command line."""