diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index 702b10c6..d8509b44 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -607,6 +607,25 @@ def _convert( file_stream.seek(cur_pos) if res is not None: + # Check for Office Open XML error string and raise if found + if ( + res.text_content.strip() + == "This is not a valid Office Open XML file." + ): + failed_attempts.append( + FailedConversionAttempt( + converter=converter, + exc_info=( + FileConversionException, + FileConversionException( + "Invalid Office Open XML file detected." + ), + None, + ), + ) + ) + continue # Try next converter + # Normalize the content res.text_content = "\n".join( [line.rstrip() for line in re.split(r"\r?\n", res.text_content)]