diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index b6acfe8..33806e1 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -892,14 +892,25 @@ class MediaConverter(DocumentConverter): Abstract class for multi-modal media (e.g., images and audio) """ - def _get_metadata(self, local_path): - exiftool = shutil.which("exiftool") - if not exiftool: + def _get_metadata(self, local_path, exiftool_path=None): + if not exiftool_path: + which_exiftool = shutil.which("exiftool") + if which_exiftool: + warn( + f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown constructor. E.g., + + md = MarkItDown(exiftool_path="{which_exiftool}") + +This warning will be removed in future releases. +""", + DeprecationWarning, + ) + return None else: try: result = subprocess.run( - [exiftool, "-json", local_path], capture_output=True, text=True + [exiftool_path, "-json", local_path], capture_output=True, text=True ).stdout return json.loads(result)[0] except Exception: @@ -920,7 +931,7 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: md_content = "" # Add metadata - metadata = self._get_metadata(local_path) + metadata = self._get_metadata(local_path, kwargs.get("exiftool_path")) if metadata: for f in [ "Title", @@ -975,7 +986,7 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: md_content = "" # Add metadata - metadata = self._get_metadata(local_path) + metadata = self._get_metadata(local_path, kwargs.get("exiftool_path")) if metadata: for f in [ "Title", @@ -1036,7 +1047,7 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: md_content = "" # Add metadata - metadata = self._get_metadata(local_path) + metadata = self._get_metadata(local_path, kwargs.get("exiftool_path")) if metadata: for f in [ "ImageSize", @@ -1325,6 +1336,7 
@@ def __init__( llm_client: Optional[Any] = None, llm_model: Optional[str] = None, style_map: Optional[str] = None, + exiftool_path: Optional[str] = None, # Deprecated mlm_client: Optional[Any] = None, mlm_model: Optional[str] = None, @@ -1334,6 +1346,9 @@ def __init__( else: self._requests_session = requests_session + if exiftool_path is None: + exiftool_path = os.environ.get("EXIFTOOL_PATH") + # Handle deprecation notices ############################# if mlm_client is not None: @@ -1366,6 +1381,7 @@ def __init__( self._llm_client = llm_client self._llm_model = llm_model self._style_map = style_map + self._exiftool_path = exiftool_path self._page_converters: List[DocumentConverter] = [] @@ -1549,12 +1565,15 @@ def _convert( if "llm_model" not in _kwargs and self._llm_model is not None: _kwargs["llm_model"] = self._llm_model - # Add the list of converters for nested processing - _kwargs["_parent_converters"] = self._page_converters - if "style_map" not in _kwargs and self._style_map is not None: _kwargs["style_map"] = self._style_map + if "exiftool_path" not in _kwargs and self._exiftool_path is not None: + _kwargs["exiftool_path"] = self._exiftool_path + + # Add the list of converters for nested processing + _kwargs["_parent_converters"] = self._page_converters + # If we hit an error log it and keep trying try: res = converter.convert(local_path, **_kwargs) diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 3333bcb..689d6f3 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -277,9 +277,29 @@ def test_markitdown_local() -> None: reason="do not run if exiftool is not installed", ) def test_markitdown_exiftool() -> None: - markitdown = MarkItDown() + # Test the automatic discovery of exiftool throws a warning + # and is disabled + try: + with catch_warnings(record=True) as w: + markitdown = MarkItDown() + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg")) + assert len(w) == 1 + assert w[0].category is 
DeprecationWarning + assert result.text_content.strip() == "" + finally: + resetwarnings() - # Test JPG metadata processing + # Test explicitly setting the location of exiftool + which_exiftool = shutil.which("exiftool") + markitdown = MarkItDown(exiftool_path=which_exiftool) + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg")) + for key in JPG_TEST_EXIFTOOL: + target = f"{key}: {JPG_TEST_EXIFTOOL[key]}" + assert target in result.text_content + + # Test setting the exiftool path through an environment variable + os.environ["EXIFTOOL_PATH"] = which_exiftool + markitdown = MarkItDown() result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg")) for key in JPG_TEST_EXIFTOOL: target = f"{key}: {JPG_TEST_EXIFTOOL[key]}" @@ -341,8 +361,8 @@ def test_markitdown_llm() -> None: if __name__ == "__main__": """Runs this file's tests from the command line.""" - test_markitdown_remote() - test_markitdown_local() + # test_markitdown_remote() + # test_markitdown_local() test_markitdown_exiftool() - test_markitdown_deprecation() - test_markitdown_llm() + # test_markitdown_deprecation() + # test_markitdown_llm()