From c8f23087fa67f503859938652a3a1037c74eedb6 Mon Sep 17 00:00:00 2001 From: S1MS4 Date: Fri, 3 Jul 2026 13:30:11 +0300 Subject: [PATCH] fix: handle math run with no text child in OMML->LaTeX conversion do_r() called elm.findtext("./m:t") and iterated over the result directly. When a math run (<m:r>) has no text child (e.g. a run that only carries formatting properties, produced by some Word equation editors), findtext() returns None and iterating over it raises TypeError: 'NoneType' object is not iterable. Because equation pre-processing is applied at the whole-document.xml level with a blanket try/except, this single malformed run aborts LaTeX conversion for every equation in the document, silently dropping all native Word equations from the output. --- .../converter_utils/docx/math/omml.py | 2 +- packages/markitdown/tests/test_docx_omml.py | 48 +++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 packages/markitdown/tests/test_docx_omml.py diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py b/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py index dfa734cdc..97ccdf0c8 100644 --- a/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py +++ b/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py @@ -373,7 +373,7 @@ def do_r(self, elm): @todo \text (latex pure text support) """ _str = [] - for s in elm.findtext("./{0}t".format(OMML_NS)): + for s in elm.findtext("./{0}t".format(OMML_NS)) or "": # s = s if isinstance(s,unicode) else unicode(s,'utf-8') _str.append(self._t_dict.get(s, s)) return escape_latex(BLANK.join(_str)) diff --git a/packages/markitdown/tests/test_docx_omml.py b/packages/markitdown/tests/test_docx_omml.py new file mode 100644 index 000000000..98574108c --- /dev/null +++ b/packages/markitdown/tests/test_docx_omml.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +""" +Regression test for a crash in the OMML -> LaTeX converter when a math run +(<m:r>) 
has no text child (e.g. a run that only carries formatting +properties). Previously `do_r()` called `elm.findtext(...)` directly and +iterated over the result, which raised `TypeError: 'NoneType' object is not +iterable` when the run had no text, aborting equation conversion for the +entire document. +""" + +from xml.etree import ElementTree as ET + +from markitdown.converter_utils.docx.math.omml import OMML_NS, oMath2Latex + +MATH_NS_DECL = f'xmlns:m="{OMML_NS[1:-1]}"' + + +def _parse_omath(xml_fragment: str): + wrapped = f"{xml_fragment}" + return ET.fromstring(wrapped) + + +def test_run_without_text_child_does_not_crash(): + # <m:r> with only <m:rPr>, no <m:t> child. + element = _parse_omath("") + # Should not raise TypeError: 'NoneType' object is not iterable + result = oMath2Latex(element) + assert result.latex == "" + + +def test_run_with_text_still_converts(): + element = _parse_omath("x") + result = oMath2Latex(element) + assert result.latex == "x" + + +def test_subscript_with_missing_text_run_does_not_crash(): + # Mirrors a real-world document: a subscript expression where one of the + # runs involved has no text (e.g. produced by some Word equation editors). + element = _parse_omath( + "" + "l" + "1" + "" + ) + result = oMath2Latex(element) + assert "l" in result.latex + assert "1" in result.latex