Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,24 @@ def _pre_process_math(content: bytes) -> bytes:
return str(soup).encode()


def _pre_process_styles(content: bytes) -> bytes:
    """
    Strips ``w:style`` definitions that lack a required attribute.

    Mammoth expects each ``w:style`` element to carry both ``w:type`` and
    ``w:styleId``. Some DOCX producers write style entries without them, and
    conversion then fails with a ``KeyError`` before any document text is
    extracted. Only the malformed style elements are removed here, so Mammoth
    can continue and the document body content is preserved.

    Args:
        content: Raw bytes of the styles XML part.

    Returns:
        The re-serialized XML, as bytes, without the malformed style entries.
    """
    required_attrs = ("w:type", "w:styleId")
    document = BeautifulSoup(content.decode(), features="xml")
    for style in document.find_all("w:style"):
        # Well-formed styles are left untouched.
        if all(style.has_attr(attr) for attr in required_attrs):
            continue
        style.decompose()
    return str(document).encode()


def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
"""
Pre-processes a DOCX file with provided steps.
Expand All @@ -131,11 +149,12 @@ def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
"""
output_docx = BytesIO()
# The files that need to be pre-processed from .docx
pre_process_enable_files = [
"word/document.xml",
"word/footnotes.xml",
"word/endnotes.xml",
]
pre_process_enable_files = {
"word/document.xml": _pre_process_math,
"word/footnotes.xml": _pre_process_math,
"word/endnotes.xml": _pre_process_math,
"word/styles.xml": _pre_process_styles,
}
with zipfile.ZipFile(input_docx, mode="r") as zip_input:
files = {name: zip_input.read(name) for name in zip_input.namelist()}
with zipfile.ZipFile(output_docx, mode="w") as zip_output:
Expand All @@ -144,7 +163,7 @@ def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
if name in pre_process_enable_files:
try:
# Pre-process the content
updated_content = _pre_process_math(content)
updated_content = pre_process_enable_files[name](content)
# In the future, if there are more pre-processing steps, they can be added here
zip_output.writestr(name, updated_content)
except Exception:
Expand Down
30 changes: 30 additions & 0 deletions packages/markitdown/tests/test_module_vectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import time
import pytest
import base64
import re
import zipfile

from pathlib import Path

Expand Down Expand Up @@ -199,6 +201,34 @@ def test_convert_stream_keep_data_uris(test_vector):
assert string not in result.markdown


def test_convert_docx_with_style_missing_type(tmp_path):
    """DOCX conversion should not fail when a style entry is missing w:type.

    Copies the ``test.docx`` fixture into ``tmp_path`` while stripping the
    ``w:type`` attribute from one ``w:style`` element in ``word/styles.xml``,
    then checks that the converted markdown still contains the document title.
    """
    source_path = os.path.join(TEST_FILES_DIR, "test.docx")
    malformed_path = tmp_path / "missing_style_type.docx"
    stripped_styles = 0

    with zipfile.ZipFile(source_path, mode="r") as zip_input:
        with zipfile.ZipFile(malformed_path, mode="w") as zip_output:
            for item in zip_input.infolist():
                content = zip_input.read(item.filename)
                if item.filename == "word/styles.xml":
                    # subn reports how many substitutions happened, so the
                    # test can verify the fixture was actually corrupted.
                    styles_xml, replaced = re.subn(
                        r'<w:style\s+w:type="[^"]+"',
                        "<w:style",
                        content.decode("utf-8"),
                        count=1,
                    )
                    stripped_styles += replaced
                    content = styles_xml.encode("utf-8")
                # Passing the ZipInfo keeps the entry's original metadata.
                zip_output.writestr(item, content)

    # Guard against a vacuous pass: without this, a fixture whose styles.xml
    # does not match the pattern would be converted unmodified.
    assert (
        stripped_styles == 1
    ), "fixture was not modified: no w:style with a leading w:type attribute found"

    result = MarkItDown().convert(str(malformed_path))

    assert (
        "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation"
        in result.markdown
    )


if __name__ == "__main__":
"""Runs this file's tests from the command line."""

Expand Down