From 573c9d616da3f32dc0d20c18712d9a7626092dc3 Mon Sep 17 00:00:00 2001 From: cancelself <332509+cancelself@users.noreply.github.com> Date: Fri, 5 Sep 2025 19:52:06 -0700 Subject: [PATCH] Handle invalid Office Open XML file error Adds a check for the specific error string indicating an invalid Office Open XML file. If detected, a FailedConversionAttempt is recorded and the next converter is tried, improving error handling for unsupported file formats. See https://github.com/microsoft/markitdown/issues/1408 --- .../markitdown/src/markitdown/_markitdown.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index 702b10c6..d8509b44 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -607,6 +607,25 @@ def _convert( file_stream.seek(cur_pos) if res is not None: + # Check for the Office Open XML error string; if found, record a failed attempt instead of accepting the result + if ( + res.text_content.strip() + == "This is not a valid Office Open XML file." + ): + failed_attempts.append( + FailedConversionAttempt( + converter=converter, + exc_info=( + FileConversionException, + FileConversionException( + "Invalid Office Open XML file detected." + ), + None, + ), + ) + ) + continue # Try next converter + # Normalize the content res.text_content = "\n".join( [line.rstrip() for line in re.split(r"\r?\n", res.text_content)]