Skip to content

Commit

Permalink
Set exiftool path explicitly. (#267)
Browse files Browse the repository at this point in the history
  • Loading branch information
afourney authored Jan 6, 2025
1 parent 265aea2 commit f58a864
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 16 deletions.
39 changes: 29 additions & 10 deletions src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -892,14 +892,25 @@ class MediaConverter(DocumentConverter):
Abstract class for multi-modal media (e.g., images and audio)
"""

def _get_metadata(self, local_path):
exiftool = shutil.which("exiftool")
if not exiftool:
def _get_metadata(self, local_path, exiftool_path=None):
if not exiftool_path:
which_exiftool = shutil.which("exiftool")
if which_exiftool:
warn(
f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown constructor. E.g.,
md = MarkItDown(exiftool_path="{which_exiftool}")
This warning will be removed in future releases.
""",
DeprecationWarning,
)

return None
else:
try:
result = subprocess.run(
[exiftool, "-json", local_path], capture_output=True, text=True
[exiftool_path, "-json", local_path], capture_output=True, text=True
).stdout
return json.loads(result)[0]
except Exception:
Expand All @@ -920,7 +931,7 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
md_content = ""

# Add metadata
metadata = self._get_metadata(local_path)
metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
if metadata:
for f in [
"Title",
Expand Down Expand Up @@ -975,7 +986,7 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
md_content = ""

# Add metadata
metadata = self._get_metadata(local_path)
metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
if metadata:
for f in [
"Title",
Expand Down Expand Up @@ -1036,7 +1047,7 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
md_content = ""

# Add metadata
metadata = self._get_metadata(local_path)
metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
if metadata:
for f in [
"ImageSize",
Expand Down Expand Up @@ -1325,6 +1336,7 @@ def __init__(
llm_client: Optional[Any] = None,
llm_model: Optional[str] = None,
style_map: Optional[str] = None,
exiftool_path: Optional[str] = None,
# Deprecated
mlm_client: Optional[Any] = None,
mlm_model: Optional[str] = None,
Expand All @@ -1334,6 +1346,9 @@ def __init__(
else:
self._requests_session = requests_session

if exiftool_path is None:
exiftool_path = os.environ.get("EXIFTOOL_PATH")

# Handle deprecation notices
#############################
if mlm_client is not None:
Expand Down Expand Up @@ -1366,6 +1381,7 @@ def __init__(
self._llm_client = llm_client
self._llm_model = llm_model
self._style_map = style_map
self._exiftool_path = exiftool_path

self._page_converters: List[DocumentConverter] = []

Expand Down Expand Up @@ -1549,12 +1565,15 @@ def _convert(
if "llm_model" not in _kwargs and self._llm_model is not None:
_kwargs["llm_model"] = self._llm_model

# Add the list of converters for nested processing
_kwargs["_parent_converters"] = self._page_converters

if "style_map" not in _kwargs and self._style_map is not None:
_kwargs["style_map"] = self._style_map

if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
_kwargs["exiftool_path"] = self._exiftool_path

# Add the list of converters for nested processing
_kwargs["_parent_converters"] = self._page_converters

# If we hit an error log it and keep trying
try:
res = converter.convert(local_path, **_kwargs)
Expand Down
32 changes: 26 additions & 6 deletions tests/test_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,9 +277,29 @@ def test_markitdown_local() -> None:
reason="do not run if exiftool is not installed",
)
def test_markitdown_exiftool() -> None:
markitdown = MarkItDown()
# Test that automatic discovery of exiftool is disabled
# and emits a DeprecationWarning
try:
with catch_warnings(record=True) as w:
markitdown = MarkItDown()
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert result.text_content.strip() == ""
finally:
resetwarnings()

# Test JPG metadata processing
# Test explicitly setting the location of exiftool
which_exiftool = shutil.which("exiftool")
markitdown = MarkItDown(exiftool_path=which_exiftool)
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
for key in JPG_TEST_EXIFTOOL:
target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
assert target in result.text_content

# Test setting the exiftool path through an environment variable
os.environ["EXIFTOOL_PATH"] = which_exiftool
markitdown = MarkItDown()
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
for key in JPG_TEST_EXIFTOOL:
target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
Expand Down Expand Up @@ -341,8 +361,8 @@ def test_markitdown_llm() -> None:

if __name__ == "__main__":
"""Runs this file's tests from the command line."""
test_markitdown_remote()
test_markitdown_local()
# test_markitdown_remote()
# test_markitdown_local()
test_markitdown_exiftool()
test_markitdown_deprecation()
test_markitdown_llm()
# test_markitdown_deprecation()
# test_markitdown_llm()

0 comments on commit f58a864

Please sign in to comment.