From f398f3d4434ef1664250e1032b7f2733944dd6d5 Mon Sep 17 00:00:00 2001 From: "Petr@AP Consulting" Date: Tue, 17 Dec 2024 10:26:09 +0100 Subject: [PATCH 01/40] Update README.md I added description and script for batch of files processing --- README.md | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/README.md b/README.md index 7079dbfa..01ceb716 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,51 @@ You can also use the project as Docker Image: docker build -t markitdown:latest . docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md ``` +Batch Processing Multiple Files +This extension allows you to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files. + +Features + +- Converts multiple files in one operation +- Supports various file formats (.pptx, .docx, .pdf, .jpg, .jpeg, .png etc. you can change it) +- Maintains original filenames (changes extension to .md) +- Includes GPT-4o-latest image descriptions when available +- Continues processing if individual files fail + +Usage +1. Create a Python script (e.g., convert.py): +```python +from markitdown import MarkItDown +from openai import OpenAI +import os +client = OpenAI(api_key="your-api-key-here") +md = MarkItDown(mlm_client=client, mlm_model="gpt-4o-2024-11-20") +supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png') +files_to_convert = [f for f in os.listdir('.') if f.lower().endswith(supported_extensions)] +for file in files_to_convert: + print(f"\nConverting {file}...") + try: + md_file = os.path.splitext(file)[0] + '.md' + result = md.convert(file) + with open(md_file, 'w') as f: + f.write(result.text_content) + + print(f"Successfully converted {file} to {md_file}") + except Exception as e: + print(f"Error converting {file}: {str(e)}") + +print("\nAll conversions completed!") +``` +2. 
Place the script in the same directory as your files +3. Install required packages: like openai +4. Run script ```bash python3 convert.py ``` + +- The script processes all supported files in the current directory +- Original files remain unchanged +- New markdown files are created with the same base name +- Progress and any errors are displayed during conversion + ## Contributing This project welcomes contributions and suggestions. Most contributions require you to agree to a From 224f1df0fc33e83d49825e3d8b947d945787ad7d Mon Sep 17 00:00:00 2001 From: "Petr@AP Consulting" Date: Wed, 18 Dec 2024 09:28:18 +0100 Subject: [PATCH 02/40] Update README.md I collapsed section about batch processing as was suggested --- README.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 01ceb716..669caa2e 100644 --- a/README.md +++ b/README.md @@ -78,11 +78,13 @@ You can also use the project as Docker Image: docker build -t markitdown:latest . docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md ``` -Batch Processing Multiple Files +
+ +Batch Processing Multiple Files This extension allows you to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files. -Features +### Features - Converts multiple files in one operation - Supports various file formats (.pptx, .docx, .pdf, .jpg, .jpeg, .png etc. you can change it) @@ -90,7 +92,7 @@ Features - Includes GPT-4o-latest image descriptions when available - Continues processing if individual files fail -Usage +### Usage 1. Create a Python script (e.g., convert.py): ```python from markitdown import MarkItDown @@ -122,6 +124,8 @@ print("\nAll conversions completed!") - Original files remain unchanged - New markdown files are created with the same base name - Progress and any errors are displayed during conversion + +
## Contributing From 6e4caac70d63c87a532be773b2dc3f330f9fdbda Mon Sep 17 00:00:00 2001 From: Joel Esler Date: Wed, 18 Dec 2024 13:12:55 -0500 Subject: [PATCH 03/40] Safeguard against path traversal for ZipConverter fix: prevent path traversal vulnerabilities in ZipConverter Added a secure check for path traversal vulnerabilities in the ZipConverter class. Now validates extracted file paths using `os.path.commonprefix` to ensure all files remain within the intended extraction directory. Raises a `ValueError` if a path traversal attempt is detected. - Normalized file paths using `os.path.normpath`. - Added specific exception handling for `zipfile.BadZipFile` and traversal errors. - Ensured cleanup of extracted files after processing when `cleanup_extracted` is enabled. --- .gitignore | 2 ++ src/markitdown/_markitdown.py | 30 ++++++++++++++++++------------ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index 82f92755..b6139eb7 100644 --- a/.gitignore +++ b/.gitignore @@ -160,3 +160,5 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
#.idea/ +src/.DS_Store +.DS_Store diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 2e7e5ffd..28770f40 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -1133,27 +1133,28 @@ def convert( extracted_zip_folder_name = ( f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}" ) - new_folder = os.path.normpath( + extraction_dir = os.path.normpath( os.path.join(os.path.dirname(local_path), extracted_zip_folder_name) ) md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n" - # Safety check for path traversal - if not new_folder.startswith(os.path.dirname(local_path)): - return DocumentConverterResult( - title=None, text_content=f"[ERROR] Invalid zip file path: {local_path}" - ) - try: - # Extract the zip file + # Extract the zip file safely with zipfile.ZipFile(local_path, "r") as zipObj: - zipObj.extractall(path=new_folder) + # Safeguard against path traversal + for member in zipObj.namelist(): + member_path = os.path.normpath(os.path.join(extraction_dir, member)) + if not os.path.commonprefix([extraction_dir, member_path]) == extraction_dir: + raise ValueError(f"Path traversal detected in zip file: {member}") + + # Extract all files safely + zipObj.extractall(path=extraction_dir) # Process each extracted file - for root, dirs, files in os.walk(new_folder): + for root, dirs, files in os.walk(extraction_dir): for name in files: file_path = os.path.join(root, name) - relative_path = os.path.relpath(file_path, new_folder) + relative_path = os.path.relpath(file_path, extraction_dir) # Get file extension _, file_extension = os.path.splitext(name) @@ -1177,7 +1178,7 @@ def convert( # Clean up extracted files if specified if kwargs.get("cleanup_extracted", True): - shutil.rmtree(new_folder) + shutil.rmtree(extraction_dir) return DocumentConverterResult(title=None, text_content=md_content.strip()) @@ -1186,6 +1187,11 @@ def convert( title=None, text_content=f"[ERROR] Invalid or 
corrupted zip file: {local_path}", ) + except ValueError as ve: + return DocumentConverterResult( + title=None, + text_content=f"[ERROR] Security error in zip file {local_path}: {str(ve)}", + ) except Exception as e: return DocumentConverterResult( title=None, From 39410d01df6ecb42a81c4219bdbd3ff6e21b8bfd Mon Sep 17 00:00:00 2001 From: Sugato Ray Date: Wed, 18 Dec 2024 14:22:58 -0500 Subject: [PATCH 04/40] Update CLI helpdoc formatting to allow indentation in code Use `textwrap.dedent()` to allow indented cli-helpdoc in `__main__.py` file. The indentation increases readability, while `textwrap.dedent` helps maintain the same functionality without breaking code. --- src/markitdown/__main__.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py index 2d531735..9c48cd4e 100644 --- a/src/markitdown/__main__.py +++ b/src/markitdown/__main__.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: MIT import sys import argparse +from textwrap import dedent from ._markitdown import MarkItDown @@ -10,24 +11,24 @@ def main(): parser = argparse.ArgumentParser( description="Convert various file formats to markdown.", formatter_class=argparse.RawDescriptionHelpFormatter, - usage=""" -SYNTAX: - - markitdown - If FILENAME is empty, markitdown reads from stdin. - -EXAMPLE: - - markitdown example.pdf - - OR - - cat example.pdf | markitdown - - OR - - markitdown < example.pdf -""".strip(), + usage=dedent(""" + SYNTAX: + + markitdown + If FILENAME is empty, markitdown reads from stdin. 
+ + EXAMPLE: + + markitdown example.pdf + + OR + + cat example.pdf | markitdown + + OR + + markitdown < example.pdf + """).strip(), ) parser.add_argument("filename", nargs="?") From 5fc70864f23c75ea315bfb1c011a4ed82a76ccf0 Mon Sep 17 00:00:00 2001 From: gagb Date: Wed, 18 Dec 2024 11:46:39 -0800 Subject: [PATCH 05/40] Run pre-commit --- src/markitdown/_markitdown.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 28770f40..040a586e 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -1144,9 +1144,14 @@ def convert( # Safeguard against path traversal for member in zipObj.namelist(): member_path = os.path.normpath(os.path.join(extraction_dir, member)) - if not os.path.commonprefix([extraction_dir, member_path]) == extraction_dir: - raise ValueError(f"Path traversal detected in zip file: {member}") - + if ( + not os.path.commonprefix([extraction_dir, member_path]) + == extraction_dir + ): + raise ValueError( + f"Path traversal detected in zip file: {member}" + ) + # Extract all files safely zipObj.extractall(path=extraction_dir) From 233ba679b88389fb53aded8a15f7b967f93f5af3 Mon Sep 17 00:00:00 2001 From: "Petr@AP Consulting" <173082609+PetrAPConsulting@users.noreply.github.com> Date: Wed, 18 Dec 2024 21:05:04 +0100 Subject: [PATCH 06/40] Update README.md Co-authored-by: gagb --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ebeca7f3..f160d865 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md Batch Processing Multiple Files -This extension allows you to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files. +This example shows how to convert multiple files to markdown format in a single run. 
The script processes all supported files in a directory and creates corresponding markdown files. ### Features From bb929629f3c573adef8fcdf15ae7112f052299d5 Mon Sep 17 00:00:00 2001 From: "Petr@AP Consulting" <173082609+PetrAPConsulting@users.noreply.github.com> Date: Wed, 18 Dec 2024 21:05:36 +0100 Subject: [PATCH 07/40] Update README.md Co-authored-by: gagb --- README.md | 7 ------- 1 file changed, 7 deletions(-) diff --git a/README.md b/README.md index f160d865..627243de 100644 --- a/README.md +++ b/README.md @@ -66,13 +66,6 @@ docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files. -### Features - -- Converts multiple files in one operation -- Supports various file formats (.pptx, .docx, .pdf, .jpg, .jpeg, .png etc. you can change it) -- Maintains original filenames (changes extension to .md) -- Includes GPT-4o-latest image descriptions when available -- Continues processing if individual files fail ### Usage 1. Create a Python script (e.g., convert.py): From 088007338d1567299a6654cf95fb4616413f7131 Mon Sep 17 00:00:00 2001 From: "Petr@AP Consulting" <173082609+PetrAPConsulting@users.noreply.github.com> Date: Wed, 18 Dec 2024 21:07:55 +0100 Subject: [PATCH 08/40] Update README.md Co-authored-by: gagb --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 627243de..62de2e6b 100644 --- a/README.md +++ b/README.md @@ -67,8 +67,6 @@ docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files. -### Usage -1. 
Create a Python script (e.g., convert.py): ```python from markitdown import MarkItDown from openai import OpenAI From f4471d96e2b61de672e3e1f4bf95222191844274 Mon Sep 17 00:00:00 2001 From: "Petr@AP Consulting" <173082609+PetrAPConsulting@users.noreply.github.com> Date: Wed, 18 Dec 2024 21:08:10 +0100 Subject: [PATCH 09/40] Update README.md Co-authored-by: gagb --- README.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/README.md b/README.md index 62de2e6b..70a1b5cf 100644 --- a/README.md +++ b/README.md @@ -93,10 +93,7 @@ print("\nAll conversions completed!") 3. Install required packages: like openai 4. Run script ```bash python3 convert.py ``` -- The script processes all supported files in the current directory -- Original files remain unchanged -- New markdown files are created with the same base name -- Progress and any errors are displayed during conversion +Note that original files will remain unchanged and new markdown files are created with the same base name. From f6e75c46d4f08a073f5fc07dd0bc122138f52436 Mon Sep 17 00:00:00 2001 From: "Petr@AP Consulting" <173082609+PetrAPConsulting@users.noreply.github.com> Date: Wed, 18 Dec 2024 21:17:47 +0100 Subject: [PATCH 10/40] Update README.md I changed command for running script from Mac version (python3) to Windows version (python) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 70a1b5cf..b82e5fc5 100644 --- a/README.md +++ b/README.md @@ -91,7 +91,7 @@ print("\nAll conversions completed!") ``` 2. Place the script in the same directory as your files 3. Install required packages: like openai -4. Run script ```bash python3 convert.py ``` +4. Run script ```bash python convert.py ``` Note that original files will remain unchanged and new markdown files are created with the same base name. 
From 356e895306baf01633ebacd5888487321c940f6a Mon Sep 17 00:00:00 2001 From: Sugato Ray Date: Wed, 18 Dec 2024 21:25:23 +0000 Subject: [PATCH 11/40] update formatting with pre-commit --- src/markitdown/__main__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py index 9c48cd4e..be2a0f2f 100644 --- a/src/markitdown/__main__.py +++ b/src/markitdown/__main__.py @@ -11,7 +11,8 @@ def main(): parser = argparse.ArgumentParser( description="Convert various file formats to markdown.", formatter_class=argparse.RawDescriptionHelpFormatter, - usage=dedent(""" + usage=dedent( + """ SYNTAX: markitdown @@ -28,7 +29,8 @@ def main(): OR markitdown < example.pdf - """).strip(), + """ + ).strip(), ) parser.add_argument("filename", nargs="?") From 1384e8072578278977ec6d67c852f9c2f79d799e Mon Sep 17 00:00:00 2001 From: Sugato Ray Date: Wed, 18 Dec 2024 21:24:57 +0000 Subject: [PATCH 12/40] update .gitignore to exclude .vscode folder --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 82f92755..e6c8f2ee 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +.vscode + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] From 6e1b9a7402f425b3a740051a35db0fcd336ce549 Mon Sep 17 00:00:00 2001 From: gagb Date: Wed, 18 Dec 2024 13:46:10 -0800 Subject: [PATCH 13/40] Run precommit --- src/markitdown/__main__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py index 9c48cd4e..be2a0f2f 100644 --- a/src/markitdown/__main__.py +++ b/src/markitdown/__main__.py @@ -11,7 +11,8 @@ def main(): parser = argparse.ArgumentParser( description="Convert various file formats to markdown.", formatter_class=argparse.RawDescriptionHelpFormatter, - usage=dedent(""" + usage=dedent( + """ SYNTAX: markitdown @@ -28,7 +29,8 @@ def main(): OR markitdown < example.pdf - """).strip(), + """ + ).strip(), ) 
parser.add_argument("filename", nargs="?") From a2743a5314936fdfb83e17978323a463e2111bda Mon Sep 17 00:00:00 2001 From: gagb Date: Wed, 18 Dec 2024 14:26:36 -0800 Subject: [PATCH 14/40] Add downloads badge --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 75c2ba05..978327ca 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,9 @@ # MarkItDown [![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/) +![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown) + + MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc). It supports: From c86287b7e3c6b41138c9b8e5e9097c359ea32fbc Mon Sep 17 00:00:00 2001 From: lumin Date: Wed, 18 Dec 2024 18:22:41 +0900 Subject: [PATCH 15/40] feat: add project description in pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c5bd58ba..3e14cec8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "hatchling.build" [project] name = "markitdown" dynamic = ["version"] -description = '' +description = 'Utility tool for converting various files to Markdown' readme = "README.md" requires-python = ">=3.10" license = "MIT" From b28f380a4768bfb88f9bd209cadc97ae73b7a5b8 Mon Sep 17 00:00:00 2001 From: "Petr@AP Consulting" <173082609+PetrAPConsulting@users.noreply.github.com> Date: Thu, 19 Dec 2024 09:23:15 +0100 Subject: [PATCH 16/40] Update README.md Co-authored-by: gagb --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b82e5fc5..d0201d45 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files. 
-```python +```python convert.py from markitdown import MarkItDown from openai import OpenAI import os From 5c776bda70619e1a59ec8178fae1e3bdb12ff17b Mon Sep 17 00:00:00 2001 From: gagb Date: Thu, 19 Dec 2024 10:30:53 -0800 Subject: [PATCH 17/40] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a91768ae..6ffe8fff 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ from markitdown import MarkItDown from openai import OpenAI import os client = OpenAI(api_key="your-api-key-here") -md = MarkItDown(mlm_client=client, mlm_model="gpt-4o-2024-11-20") +md = MarkItDown(llm_client=client, llm_model="gpt-4o-2024-11-20") supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png') files_to_convert = [f for f in os.listdir('.') if f.lower().endswith(supported_extensions)] for file in files_to_convert: From 535147b2e8f99d47868c214261d62aef8117ae12 Mon Sep 17 00:00:00 2001 From: afourney Date: Thu, 19 Dec 2024 11:11:54 -0800 Subject: [PATCH 18/40] Added holiday notice. Added holiday notice. --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 978327ca..6dc096c5 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,12 @@ +> [!IMPORTANT] +> (12/19/24) Hello! MarkItDown team members will be resting and recharging with family and friends over the holiday period. Activity/responses on the project may be delayed during the period of Dec 21-Jan 06. We will be excited to engage with you in the new year! + # MarkItDown [![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/) ![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown) - MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc). 
It supports: - PDF From 613825d5b3e07353083be66196159bfeb2778ad8 Mon Sep 17 00:00:00 2001 From: Sugato Ray Date: Fri, 20 Dec 2024 02:12:24 +0000 Subject: [PATCH 19/40] [feat]: add support for type-hinting for PEP-561 --- src/markitdown/py.typed | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/markitdown/py.typed diff --git a/src/markitdown/py.typed b/src/markitdown/py.typed new file mode 100644 index 00000000..e69de29b From 8921fe7304b5ac3117e6445935d8a8eff9104ebb Mon Sep 17 00:00:00 2001 From: Sugato Ray Date: Fri, 20 Dec 2024 02:18:14 +0000 Subject: [PATCH 20/40] ignore .vscode folder - avoid local developer vscode editor settings --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 82f92755..e6c8f2ee 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +.vscode + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] From 52d73080c7ae3d7662d04f5cc520a09de1908878 Mon Sep 17 00:00:00 2001 From: lumin <71011125+l-lumin@users.noreply.github.com> Date: Sat, 21 Dec 2024 04:42:32 +0900 Subject: [PATCH 21/40] refactor(tests): add helper function for tests (#87) * refactor(tests): simplify string validation in tests Introduce a helper function `validate_strings` to streamline the validation of expected and excluded strings in test cases. Replace repetitive string assertions in the `test_markitdown_local` function with calls to this new helper, improving code readability and maintainability. 
* run pre-commit --------- Co-authored-by: lumin <71011125+l-melon@users.noreply.github.com> Co-authored-by: gagb --- tests/test_markitdown.py | 53 ++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 32 deletions(-) diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 316e670e..4a981bdc 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -131,6 +131,17 @@ ] +# --- Helper Functions --- +def validate_strings(result, expected_strings, exclude_strings=None): + """Validate presence or absence of specific strings.""" + text_content = result.text_content.replace("\\", "") + for string in expected_strings: + assert string in text_content + if exclude_strings: + for string in exclude_strings: + assert string not in text_content + + @pytest.mark.skipif( skip_remote, reason="do not run tests that query external urls", @@ -163,73 +174,53 @@ def test_markitdown_local() -> None: # Test XLSX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) - for test_string in XLSX_TEST_STRINGS: - text_content = result.text_content.replace("\\", "") - assert test_string in text_content + validate_strings(result, XLSX_TEST_STRINGS) # Test DOCX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx")) - for test_string in DOCX_TEST_STRINGS: - text_content = result.text_content.replace("\\", "") - assert test_string in text_content + validate_strings(result, DOCX_TEST_STRINGS) # Test DOCX processing, with comments result = markitdown.convert( os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), style_map="comment-reference => ", ) - for test_string in DOCX_COMMENT_TEST_STRINGS: - text_content = result.text_content.replace("\\", "") - assert test_string in text_content + validate_strings(result, DOCX_COMMENT_TEST_STRINGS) # Test DOCX processing, with comments and setting style_map on init markitdown_with_style_map = MarkItDown(style_map="comment-reference => ") result = 
markitdown_with_style_map.convert( os.path.join(TEST_FILES_DIR, "test_with_comment.docx") ) - for test_string in DOCX_COMMENT_TEST_STRINGS: - text_content = result.text_content.replace("\\", "") - assert test_string in text_content + validate_strings(result, DOCX_COMMENT_TEST_STRINGS) # Test PPTX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx")) - for test_string in PPTX_TEST_STRINGS: - text_content = result.text_content.replace("\\", "") - assert test_string in text_content + validate_strings(result, PPTX_TEST_STRINGS) # Test HTML processing result = markitdown.convert( os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL ) - for test_string in BLOG_TEST_STRINGS: - text_content = result.text_content.replace("\\", "") - assert test_string in text_content + validate_strings(result, BLOG_TEST_STRINGS) # Test ZIP file processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip")) - for test_string in DOCX_TEST_STRINGS: - text_content = result.text_content.replace("\\", "") - assert test_string in text_content + validate_strings(result, XLSX_TEST_STRINGS) # Test Wikipedia processing result = markitdown.convert( os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL ) text_content = result.text_content.replace("\\", "") - for test_string in WIKIPEDIA_TEST_EXCLUDES: - assert test_string not in text_content - for test_string in WIKIPEDIA_TEST_STRINGS: - assert test_string in text_content + validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES) # Test Bing processing result = markitdown.convert( os.path.join(TEST_FILES_DIR, "test_serp.html"), url=SERP_TEST_URL ) text_content = result.text_content.replace("\\", "") - for test_string in SERP_TEST_EXCLUDES: - assert test_string not in text_content - for test_string in SERP_TEST_STRINGS: - assert test_string in text_content + validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES) # Test RSS processing result = 
markitdown.convert(os.path.join(TEST_FILES_DIR, "test_rss.xml")) @@ -239,9 +230,7 @@ def test_markitdown_local() -> None: ## Test non-UTF-8 encoding result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv")) - text_content = result.text_content.replace("\\", "") - for test_string in CSV_CP932_TEST_STRINGS: - assert test_string in text_content + validate_strings(result, CSV_CP932_TEST_STRINGS) @pytest.mark.skipif( From 7e6c36c5d4af0eb51b3b404b837391a8c6bb549e Mon Sep 17 00:00:00 2001 From: gagb Date: Fri, 20 Dec 2024 14:08:58 -0800 Subject: [PATCH 22/40] docs: add contribution guidelines to README (#176) --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index 70403114..be265604 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,20 @@ This project has adopted the [Microsoft Open Source Code of Conduct](https://ope For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. +### How to Contribute + +You can help by looking at issues or helping review PRs. Any issue or PR is welcome, but we have also marked some as 'open for contribution' and 'open for reviewing' to help faciliate community contributions. These are ofcourse just suggestions and you are welcome to contribute in any way you like. + + +
+ +| | All | Especially Needs Help from Community | +|-----------------------|------------------------------------------|------------------------------------------------------------------------------------------| +| **Issues** | [All Issues](https://github.com/microsoft/markitdown/issues) | [Issues open for contribution](https://github.com/microsoft/markitdown/issues?q=is%3Aissue+is%3Aopen+label%3A%22open+for+contribution%22) | +| **PRs** | [All PRs](https://github.com/microsoft/markitdown/pulls) | [PRs open for reviewing](https://github.com/microsoft/markitdown/pulls?q=is%3Apr+is%3Aopen+label%3A%22open+for+reviewing%22) | + +
+ ### Running Tests and Checks - Install `hatch` in your environment and run tests: From 5276616ba1476f663c42c00a0136c9b6257219b6 Mon Sep 17 00:00:00 2001 From: SigireddyBalasai Date: Sat, 21 Dec 2024 03:42:48 +0530 Subject: [PATCH 23/40] Added support to use Pathlib (#93) * Add support for Path objects in MarkItDown conversion methods * Remove unnecessary blank line in test_markitdown_exiftool function * Remove unnecessary blank line in test_markitdown_exiftool function * remove pathlib path in test file --------- Co-authored-by: afourney Co-authored-by: gagb --- src/markitdown/_markitdown.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 040a586e..789c1e55 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -15,6 +15,7 @@ import zipfile from xml.dom import minidom from typing import Any, Dict, List, Optional, Union +from pathlib import Path from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse from warnings import warn, resetwarnings, catch_warnings @@ -1286,11 +1287,11 @@ def __init__( self.register_page_converter(ZipConverter()) def convert( - self, source: Union[str, requests.Response], **kwargs: Any + self, source: Union[str, requests.Response, Path], **kwargs: Any ) -> DocumentConverterResult: # TODO: deal with kwargs """ Args: - - source: can be a string representing a path or url, or a requests.response object + - source: can be a string representing a path either as string pathlib path object or url, or a requests.response object - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.) 
""" @@ -1307,10 +1308,14 @@ def convert( # Request response elif isinstance(source, requests.Response): return self.convert_response(source, **kwargs) + elif isinstance(source, Path): + return self.convert_local(source, **kwargs) def convert_local( - self, path: str, **kwargs: Any + self, path: Union[str, Path], **kwargs: Any ) -> DocumentConverterResult: # TODO: deal with kwargs + if isinstance(path, Path): + path = str(path) # Prepare a list of extensions to try (in order of priority) ext = kwargs.get("file_extension") extensions = [ext] if ext is not None else [] From c1a0d3deaf3d3794a1f670ad68dd00f0b9cc1ddf Mon Sep 17 00:00:00 2001 From: lumin <71011125+l-lumin@users.noreply.github.com> Date: Sat, 21 Dec 2024 07:28:55 +0900 Subject: [PATCH 24/40] chore: configure Dependabot for GitHub Actions updates (#112) Sets up Dependabot to automatically check for updates to GitHub Actions on a weekly basis, ensuring that the project remains up-to-date with the latest dependencies and security fixes. Co-authored-by: gagb --- .github/dependabot.yml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..5ace4600 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,6 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" From 377a7eaa7d6ed3eb0bb125db0e87c15d43699dcc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 20 Dec 2024 14:36:48 -0800 Subject: [PATCH 25/40] Bump actions/checkout from 2 to 4 (#177) Bumps [actions/checkout](https://github.com/actions/checkout) from 2 to 4. 
- [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v2...v4) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/pre-commit.yml | 2 +- .github/workflows/tests.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index d3f2789c..6128f9ba 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -5,7 +5,7 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v2 with: diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 8aa6d189..fe35e4f5 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -5,7 +5,7 @@ jobs: tests: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: python-version: | From 1123392306bf14c40610bef2773d7a7b01ffbc05 Mon Sep 17 00:00:00 2001 From: Soulter <37870767+Soulter@users.noreply.github.com> Date: Sat, 21 Dec 2024 06:43:00 +0800 Subject: [PATCH 26/40] fix: support -o param to avoid encoding issues (#116) * perf: cli supports -o param * doc: update README --------- Co-authored-by: gagb --- README.md | 6 ++++++ src/markitdown/__main__.py | 26 ++++++++++++++++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index be265604..0fae1e6d 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,12 @@ To install MarkItDown, use pip: `pip install markitdown`. 
Alternatively, you can markitdown path-to-file.pdf > document.md ``` +Or use `-o` to specify the output file: + +```bash +markitdown path-to-file.pdf -o document.md +``` + You can also pipe content: ```bash diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py index be2a0f2f..3193ae7e 100644 --- a/src/markitdown/__main__.py +++ b/src/markitdown/__main__.py @@ -4,7 +4,7 @@ import sys import argparse from textwrap import dedent -from ._markitdown import MarkItDown +from ._markitdown import MarkItDown, DocumentConverterResult def main(): @@ -29,20 +29,42 @@ def main(): OR markitdown < example.pdf + + OR to save to a file use + + markitdown example.pdf -o example.md + + OR + + markitdown example.pdf > example.md """ ).strip(), ) parser.add_argument("filename", nargs="?") + parser.add_argument( + "-o", + "--output", + help="Output file name. If not provided, output is written to stdout.", + ) args = parser.parse_args() if args.filename is None: markitdown = MarkItDown() result = markitdown.convert_stream(sys.stdin.buffer) - print(result.text_content) + _handle_output(args, result) else: markitdown = MarkItDown() result = markitdown.convert(args.filename) + _handle_output(args, result) + + +def _handle_output(args, result: DocumentConverterResult): + """Handle output to stdout or file""" + if args.output: + with open(args.output, "w", encoding="utf-8") as f: + f.write(result.text_content) + else: print(result.text_content) From 857a2d160d0c44310a1804c03a2ef8856f7f673d Mon Sep 17 00:00:00 2001 From: gagb Date: Fri, 20 Dec 2024 14:49:20 -0800 Subject: [PATCH 27/40] Update README.md (#180) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 0fae1e6d..6b514159 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ [![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/) ![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown) +[![Built by AutoGen 
Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen) MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc). From 9b6946777253c15739111e05a906d304c67fd267 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 20 Dec 2024 16:17:43 -0800 Subject: [PATCH 28/40] Bump actions/cache from 3 to 4 (#178) Bumps [actions/cache](https://github.com/actions/cache) from 3 to 4. - [Release notes](https://github.com/actions/cache/releases) - [Changelog](https://github.com/actions/cache/blob/main/RELEASES.md) - [Commits](https://github.com/actions/cache/compare/v3...v4) --- updated-dependencies: - dependency-name: actions/cache dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: gagb Co-authored-by: afourney --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index fe35e4f5..678995a5 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -14,7 +14,7 @@ jobs: 3.12 - name: Set up pip cache if: runner.os == 'Linux' - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/pip key: ${{ runner.os }}-pip-${{ hashFiles('pyproject.toml') }} From 73161982fff4bf6755e706496e4d090f4cafe77c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 20 Dec 2024 16:20:22 -0800 Subject: [PATCH 29/40] Bump actions/setup-python from 2 to 5 (#179) Bumps [actions/setup-python](https://github.com/actions/setup-python) from 2 to 5. 
- [Release notes](https://github.com/actions/setup-python/releases) - [Commits](https://github.com/actions/setup-python/compare/v2...v5) --- updated-dependencies: - dependency-name: actions/setup-python dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: afourney --- .github/workflows/pre-commit.yml | 2 +- .github/workflows/tests.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 6128f9ba..321f8233 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -7,7 +7,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: "3.x" diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 678995a5..c4dbdcfd 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -6,7 +6,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: python-version: | 3.10 From cfd2319c14a1ed0be6f5a89e145f18878803fa7e Mon Sep 17 00:00:00 2001 From: lumin <71011125+l-lumin@users.noreply.github.com> Date: Sat, 21 Dec 2024 09:24:45 +0900 Subject: [PATCH 30/40] feat: add version option to markitdown CLI (#172) Add a `--version` option to the markitdown command-line interface that displays the current version number. 
--- src/markitdown/__main__.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py index 3193ae7e..b6cf963b 100644 --- a/src/markitdown/__main__.py +++ b/src/markitdown/__main__.py @@ -1,33 +1,35 @@ # SPDX-FileCopyrightText: 2024-present Adam Fourney # # SPDX-License-Identifier: MIT -import sys import argparse +import sys from textwrap import dedent +from .__about__ import __version__ from ._markitdown import MarkItDown, DocumentConverterResult def main(): parser = argparse.ArgumentParser( description="Convert various file formats to markdown.", + prog="markitdown", formatter_class=argparse.RawDescriptionHelpFormatter, usage=dedent( """ - SYNTAX: - + SYNTAX: + markitdown If FILENAME is empty, markitdown reads from stdin. - + EXAMPLE: - + markitdown example.pdf - + OR - + cat example.pdf | markitdown - - OR - + + OR + markitdown < example.pdf OR to save to a file use @@ -41,6 +43,14 @@ def main(): ).strip(), ) + parser.add_argument( + "-v", + "--version", + action="version", + version=f"%(prog)s {__version__}", + help="show the version number and exit", + ) + parser.add_argument("filename", nargs="?") parser.add_argument( "-o", From f94d09990ef6ccb9d7d94800dbec4b5068504055 Mon Sep 17 00:00:00 2001 From: numekudi <51479021+numekudi@users.noreply.github.com> Date: Sat, 21 Dec 2024 11:09:17 +0900 Subject: [PATCH 31/40] feat: enable Git support in devcontainer (#136) Co-authored-by: gagb --- .devcontainer/devcontainer.json | 5 ++++- Dockerfile | 7 ++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index f12fbcb3..e13e299d 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -6,7 +6,10 @@ // Sets the run context to one level up instead of the .devcontainer folder. 
"context": "..", // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename. - "dockerfile": "../Dockerfile" + "dockerfile": "../Dockerfile", + "args": { + "INSTALL_GIT": "true" + } }, // Features to add to the dev container. More info: https://containers.dev/features. diff --git a/Dockerfile b/Dockerfile index f9c0bef0..0072d9e3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,10 +2,15 @@ FROM python:3.13-slim-bullseye USER root +ARG INSTALL_GIT=false +RUN if [ "$INSTALL_GIT" = "true" ]; then \ + apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \ + fi + # Runtime dependency RUN apt-get update && apt-get install -y --no-install-recommends \ ffmpeg \ - && rm -rf /var/lib/apt/lists/* + && rm -rf /var/lib/apt/lists/* RUN pip install markitdown From 125e206047dd4635a71c036d52c56d1655636c50 Mon Sep 17 00:00:00 2001 From: Ikko Eltociear Ashimine Date: Sat, 21 Dec 2024 18:51:30 +0900 Subject: [PATCH 32/40] docs: update README.md (#182) faciliate -> facilitate --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6b514159..d2314c3b 100644 --- a/README.md +++ b/README.md @@ -125,7 +125,7 @@ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additio ### How to Contribute -You can help by looking at issues or helping review PRs. Any issue or PR is welcome, but we have also marked some as 'open for contribution' and 'open for reviewing' to help faciliate community contributions. These are ofcourse just suggestions and you are welcome to contribute in any way you like. +You can help by looking at issues or helping review PRs. Any issue or PR is welcome, but we have also marked some as 'open for contribution' and 'open for reviewing' to help facilitate community contributions. These are ofcourse just suggestions and you are welcome to contribute in any way you like.
From 4678c8a2a4c5f2984b2a8b3051b0376cf0c2bec4 Mon Sep 17 00:00:00 2001 From: AbSadiki Date: Fri, 3 Jan 2025 16:29:26 -0500 Subject: [PATCH 33/40] fix(transcription): IS_AUDIO_TRANSCRIPTION_CAPABLE should be initialized (#194) --- src/markitdown/_markitdown.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 789c1e55..6df13e31 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -33,6 +33,7 @@ from charset_normalizer import from_path # Optional Transcription support +IS_AUDIO_TRANSCRIPTION_CAPABLE = False try: # Using warnings' catch_warnings to catch # pydub's warning of ffmpeg or avconv missing From d248621ba4e7f4f91dba22c000a17c62b394d0c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20Can=20Kurtulu=C5=9F?= Date: Sat, 4 Jan 2025 00:34:39 +0300 Subject: [PATCH 34/40] feat: outlook ".msg" file converter (#196) * feat: outlook .msg converter * add test, adjust docstring --- pyproject.toml | 1 + src/markitdown/_markitdown.py | 75 ++++++++++++++++++++++++++ tests/test_files/test_outlook_msg.msg | Bin 0 -> 13312 bytes tests/test_markitdown.py | 13 +++++ 4 files changed, 89 insertions(+) create mode 100644 tests/test_files/test_outlook_msg.msg diff --git a/pyproject.toml b/pyproject.toml index 3e14cec8..67f68252 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ dependencies = [ "pdfminer.six", "puremagic", "pydub", + "olefile", "youtube-transcript-api", "SpeechRecognition", "pathvalidate", diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 6df13e31..d209b5e0 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -21,6 +21,7 @@ import mammoth import markdownify +import olefile import pandas as pd import pdfminer import pdfminer.high_level @@ -1077,6 +1078,79 @@ def _get_llm_description(self, local_path, extension, client, model, prompt=None return response.choices[0].message.content +class
OutlookMsgConverter(DocumentConverter): + """Converts Outlook .msg files to markdown by extracting email metadata and content. + + Uses the olefile package to parse the .msg file structure and extract: + - Email headers (From, To, Subject) + - Email body content + """ + + def convert( + self, local_path: str, **kwargs: Any + ) -> Union[None, DocumentConverterResult]: + # Bail if not a MSG file + extension = kwargs.get("file_extension", "") + if extension.lower() != ".msg": + return None + + try: + msg = olefile.OleFileIO(local_path) + # Extract email metadata + md_content = "# Email Message\n\n" + + # Get headers + headers = { + "From": self._get_stream_data(msg, "__substg1.0_0C1F001F"), + "To": self._get_stream_data(msg, "__substg1.0_0E04001F"), + "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"), + } + + # Add headers to markdown + for key, value in headers.items(): + if value: + md_content += f"**{key}:** {value}\n" + + md_content += "\n## Content\n\n" + + # Get email body + body = self._get_stream_data(msg, "__substg1.0_1000001F") + if body: + md_content += body + + msg.close() + + return DocumentConverterResult( + title=headers.get("Subject"), text_content=md_content.strip() + ) + + except Exception as e: + raise FileConversionException( + f"Could not convert MSG file '{local_path}': {str(e)}" + ) + + def _get_stream_data( + self, msg: olefile.OleFileIO, stream_path: str + ) -> Union[str, None]: + """Helper to safely extract and decode stream data from the MSG file.""" + try: + if msg.exists(stream_path): + data = msg.openstream(stream_path).read() + # Try UTF-16 first (common for .msg files) + try: + return data.decode("utf-16-le").strip() + except UnicodeDecodeError: + # Fall back to UTF-8 + try: + return data.decode("utf-8").strip() + except UnicodeDecodeError: + # Last resort - ignore errors + return data.decode("utf-8", errors="ignore").strip() + except Exception: + pass + return None + + class ZipConverter(DocumentConverter): """Converts ZIP 
files to markdown by extracting and converting all contained files. @@ -1286,6 +1360,7 @@ def __init__( self.register_page_converter(IpynbConverter()) self.register_page_converter(PdfConverter()) self.register_page_converter(ZipConverter()) + self.register_page_converter(OutlookMsgConverter()) def convert( self, source: Union[str, requests.Response, Path], **kwargs: Any diff --git a/tests/test_files/test_outlook_msg.msg b/tests/test_files/test_outlook_msg.msg new file mode 100644 index 0000000000000000000000000000000000000000..05b087b77c785c8b57a479485b9715fb7dfecbb4 GIT binary patch literal 13312 zcmeHN-BVk~6+c3bjg7ILwn=KIj;=}Trgp#*2rveM2nZVkG6mtd9nCNjlCWAKQAxyM zJj|q-$xPdqzW6l{{rCs^(npW~gZ8mAopjpA=DE{8CG_{(s}+Li<6J|AWX6m4-g|b> z?(dwl=j@mK*T1~{&)@y&(!b?y~s1$oPDb zzCjk~blees#sL(WabNo9xsPR^kLX*voQG{cD~qxqeG-$RR3zgSUgBs|MoUMcvLQ*y zNgm$|rnC%ty-lCX3-QHU3oA>L@u|sJ-`vVld}V%RIdXepa(2FN>fS;-fhfBpQ|37* zTT+57TaaN(R(+0)K_-?ZQM!g_0enB-$5oaHWVDj^fvX7Wop!Lb`seGv)*P0aMUG0Z z+=rz~uw@Ps6yz4P3PIYSbYq^FHX3A1=`!Rm$lIvz$Df0`45TI%L=KyFA#nD~0G>ho zIdUD(0rSn?_K!|4B$zfmkHLq_e5c+?J??|1!8`+*RQp=S5%;;7z z(rG-6EoWdwjv*~LE)&3wqpTM?OhD=hP|J{f9a3$Z_cAPVAKz(6ejggAVa@j>3(j%$ z@cbS`S>lky%CQ9>%pQ{*Q^uzl-vvm~3%Rz<2vIhAa2};Olq603+`yY9^v8nnyd1nC zBctF>p(pxZ+VF0}Mm@%_=w}E2Mo^N1b=OSI5Ik}O?S}CjLfIf%(nidOOTWAx_~X+* zT)6m2K|b}Jk9S~I{gBdgUh2b>TBnUR5j;PDDkPm@#u2AZ#f#9jxI%_4<;8D=cAgrll|}MtwH-T&ta{#*S>`DSTkOj!x#Ou`DpFQ z5eM&K)}J_Lq#Sy1s?Q4OOx4fRh!O|>p2gFaTX`LD*;RuvL@Dfg=f$Hx_K7erMCT`| z)#F5|k_{!2g>ue3);8Be{e%69Y^k{Mu(FX4If-m4m(G=)L^_kr1|KJ~#X_Z$%Adhk zFp(`k%9YE`WGR=-kM8WCaIKgxmQbH4WzxaKdMT4B1QV(JRwg*Q(|snfwH8WjCQIqW zWmx(XSUUHL}xnBI1ZS@T`Vlq*hVBbm!P8<}!Bxt^)(Yq-DG7iF#yL8tr7 zJ~2bVurqc8g4T&w$6c=x6h&9g&d#1odHVXgteKvKu47&=e)4-m-hKMFWY4Fabyc1| z{Z(%JwvGMrn%UpN#c?7M3CJZwf2+nnRNMc$^}~j~TCI;@wJ)^Tye4@Tqj_2R{i4~uY6VlUX_Jv4rG{xL_0rRdb` z;%qb)_j-z;&$|9=*R5vr&lAYZ`NGaWd-IJu|JZf0+5BVc?fmm*wLjY8I1=w}8)Z(2 zww!wf-us?K?nXX`d>)x==F7<33w;~;JIJpfv)yZW^FG-2(dg||>wEB~9lnqJ1LUj7 
zKSXAmAK`ti`c1kQ-+jn8khy0GA@?H>Acv6$ksD!+qI|4L-Mc8Cs;is9_w72mzCT{* zaW!QbUf=k%g!3V9PI>>yGiwQFMlb1AdC3P;n8euW>UJaIncE3X*)Tn$!yv8Vk9RaY zf#z__v1)E=_U;kgJ4Bba(eo|w8GOv+wLNqrzGh{_FPpnMp2F9wcb2QwyCB|4hLGuD zjng&kRH!`W1M}NQW3DfkGIp3+Pha6&%hb&N z#2vs{RBM0kqqX?-l`8~CYK=dw)}L#X7GL9k)IV(ePdENtAGP?Ls~#XTwbFkGtC@EE zxi)L@=___0(h7eVzk_J;xi8S-(^pP12#}iNPa*Dj1FxI^xF^x#zY5-u2#(YmUmyS6 z4{7l)g7h%EKrR03;B7zua{Oo1_}qhP@o9VR zA(@)%Pu;ox=;EK-87;nU{vp2mo2O>|;oke`@wxu%^yj``i%;LRACU+>D0i@zARKZgXX`+v@9T72GNF1p7g zqyOFwQDf9qoQ|z8+Bdo|=~ngp^Cl>}rc|p`uV(fSV(y*6ENDB_>GGr7ICe)7kX`uWd${PWh!O*Z~Hr`a{l)@k0& z#@}oF^WILY|2TM$I{vSEz~}#tJL>r7eWh0auOj}rrfTsgz_knM-~6Pm)F{V8oWmx|2)rW@uT2B_4^;s;ye$t&F2*A JMf%!W;Qy?8Su_9u literal 0 HcmV?d00001 diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 4a981bdc..a0626d19 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -63,6 +63,15 @@ "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", ] +MSG_TEST_STRINGS = [ + "# Email Message", + "**From:** test.sender@example.com", + "**To:** test.recipient@example.com", + "**Subject:** Test Email Message", + "## Content", + "This is the body of the test email message", +] + DOCX_COMMENT_TEST_STRINGS = [ "314b0a30-5b04-470b-b9f7-eed2c2bec74a", "49e168b7-d2ae-407f-a055-2167576f39a1", @@ -232,6 +241,10 @@ def test_markitdown_local() -> None: result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv")) validate_strings(result, CSV_CP932_TEST_STRINGS) + # Test MSG (Outlook email) processing + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg")) + validate_strings(result, MSG_TEST_STRINGS) + @pytest.mark.skipif( skip_exiftool, From 08ed32869eae01d0b7c39944a092b90221f81ae6 Mon Sep 17 00:00:00 2001 From: yeungadrian <47532646+yeungadrian@users.noreply.github.com> Date: Fri, 3 Jan 2025 21:58:17 +0000 Subject: [PATCH 35/40] Feature/ Add xls support (#169) * add xlrd * add xls converter with tests --- pyproject.toml | 1 + 
src/markitdown/_markitdown.py | 27 ++++++++++++++++++++++++++- tests/test_files/test.xls | Bin 0 -> 27648 bytes tests/test_markitdown.py | 12 ++++++++++++ 4 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 tests/test_files/test.xls diff --git a/pyproject.toml b/pyproject.toml index 67f68252..9c113ade 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ dependencies = [ "python-pptx", "pandas", "openpyxl", + "xlrd", "pdfminer.six", "puremagic", "pydub", diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index d209b5e0..50c83b46 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -726,7 +726,31 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: if extension.lower() != ".xlsx": return None - sheets = pd.read_excel(local_path, sheet_name=None) + sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl") + md_content = "" + for s in sheets: + md_content += f"## {s}\n" + html_content = sheets[s].to_html(index=False) + md_content += self._convert(html_content).text_content.strip() + "\n\n" + + return DocumentConverterResult( + title=None, + text_content=md_content.strip(), + ) + + +class XlsConverter(HtmlConverter): + """ + Converts XLS files to Markdown, with each sheet presented as a separate Markdown table. 
+ """ + + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + # Bail if not a XLS + extension = kwargs.get("file_extension", "") + if extension.lower() != ".xls": + return None + + sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd") md_content = "" for s in sheets: md_content += f"## {s}\n" @@ -1353,6 +1377,7 @@ def __init__( self.register_page_converter(BingSerpConverter()) self.register_page_converter(DocxConverter()) self.register_page_converter(XlsxConverter()) + self.register_page_converter(XlsConverter()) self.register_page_converter(PptxConverter()) self.register_page_converter(WavConverter()) self.register_page_converter(Mp3Converter()) diff --git a/tests/test_files/test.xls b/tests/test_files/test.xls new file mode 100644 index 0000000000000000000000000000000000000000..de4f368c24d489ff7aca786acc29d80652189eb9 GIT binary patch literal 27648 zcmeHQ2V7J~x1U|wf+$5qipWY)L_m5`aVesxfY=K*$|6JrL{R)9fvCh9MKo5TF^C#t z?_JbbP)t1ti zc8Api!a2GVE%I5SPjp#y9$b6N^2P-2OC*?;eR{z)kTl2tA`9pXpdod2$cTOy^o|og z(G?K#06w>CV~&uAkVcYpVn|4ELPm0Is&KeCD|MXkzxoKFI>ho&hf;97K%O??0j&s< z39+QQnbLDhdfrAk57ToXF(r~*L#?wT{C)kQJb{wK_(7EKTY9#m=N540k$tqxO-k_~ zZOLum7mxyUb`Y+|3nj56g(QUkPaT|0Kki%#I6?!zJsWVA}TKLx0r8%cA zLJ|Qr8b^{yCasqXj|bf9Bx*Ovfm*A_K7>fOW({(5YmlRR0~|aeQd`^CP#5n8 zI4~lr_Z1$|ZxF+v0SrEA-2g{JzJgZQSBcS9yA9C+8N{yG&fdk|-hE8Rt|OgfnUNja zYPW-tppD%9m;vI1QSwZ7M330iGr76T<3JvvI)Gc-ak)5XQdU3WOgsr0gQ_?SWtvH| zBb_K=WXG;jf}N`!=sm_MXbebkBOT!nin7Z&3)zL7g>ofC#D#Q`7sk{;ULIAAB~m1+ z7wStoK~Jg|Dl1|BSOZ+c;++s81Sk6 zcS~<+ucG!>q;FF}pQD1lQU!gD3i>7$^vx>hs`Oc^g8yd~bXE9G<^Ol-vqOcR2PoZK zp}iP>Mfx5U{QFeUcd4KsRY7O;ff*9Vf3{G5m|rpUB_<}znk{Qaw3&7>t)nL<9dVmyn4Qtnpq?&-OGP?IZVMTIoqZVpF_F=)kg;%SCBttnqu1%l_$RJZXuXO| zOw5~^gPoavXC$KoK4WLb{<<(8Dzm4ajGv=tD;fO?ObE%2IyIxRq&!?>UYml%U^@PuKdNd>S`np6P0MUx6(e`!(y?5IsDfRU+51+a@XsQ~uj zCKbR=*`xy9n_8e#Qwz8>wSa3=3%E740L(_4W~Huj1+c@iqXsWW5X=cPFj<8FBMSoD z0(^lq{$NjLCe}isA_&aT8wQbx)PvyWQG=RzddY)uqoy~2)Vq~Jbre;1uak=55CXe| 
zZ7{wz)E{r0e)X$Fb^V>>K#GX2auCTGbs&E7AVXDXBB?H7X|jmZoIU`eN1GG17KtXS z(4>4t9f+Q^in+PD5~1|KW=8-vO`k=BTl6qcLdy|wl?nh}wo)SHO1*viRzWF|NTi?? znDtD|sJKtc5?YF;0WEP?-P7 zI;KQYu274eJ9jEL4Vxyb1=0YU9x|t4(-h}eoq@N$7U&0Zrp*xy=JdyIP(ZsbyeiZJ z?T*!B2Q{?2j3(=kigC*hO4Ub8$S5|l3`BL@!j(xqh=O)#V=W?1WNW9W2`;x91SyYD z4+1+&4TErlKezYT%Yi^fxlM4AsoEyE>S(Y!;zaczu#wgM5ZFj-7{t>{J&1!G z2xOGo1n1GJZGtP+M%Y9>2&|VH1}TqF57M?VAf8_8K^)~kAfwzSxKvPW6I}H*!Y1lL zEE@w-9-$t@wlN@{Ug|;G%Yi^fxlM32q}nECav&<>iFy!N=QnKF@(A@H!p4AjdZ`EL zAO`{&Kcg-UZZ!w zEWi;rGf<2PSB$5k7&|p$m^BC$qa#lI`_Wb9VobSWycET>S0jd5rBE@t;=~s(UMLsS zk}JkrQH--1F}RMA8ck1}D0y41T#Ol4jE|xiH#K5#xgZrI5GUSv@K(8)R$MW@iefy~ zh+(F+)M)zR#D}kLDi>ppV!YLRnk|U&ZiJ_mM;y8KQn{GcTro;K&0dWdwx^Xxyg$E1 zxflzs7$u(8L5&!;rLJ;HI2v76$T6N-yaxpesF-knm zL5&!;r+Iq4+dfmdn6_LoN<7U;jTpA4d3u$Vl_?iv%N3)<(_GYuVSAdVm*hsGaxp@# z7$u(Ou0{;o(>%Qv{G_7awBw3V;%VM$#IQZh)9a^;3zXHy5T0|xysK{94CaiRHWEFR zVl^H52SR#e{GuGRLp>;X0vFVS1IliF zLejVNQU)p@PB6NXkl2JmL4bVA^QO9Nf^n7H}$aX zQ)&CBfg)4E=7T_;Z14x%yppv(Hn!Bg^r3FBJ<}^K4IH(rJTA{lqD^{%(b71siaX*m zED$EcY5}jmzzQN<9Gj2~ZwbO$fupFFF<3Sn6hhxm21Un#g3MD0gwul#(__#V3>>w@77i67`sqQqI58L<#vIDwhh1_E;w0Ao!vz^5>%FqrlTMXn;T{sr7kq3DLQ(zxZQ+iAu{97YgF!G57}1Ul1b`lp;_+S>(ly zYFS9bP3HPQ%j9GsjmgPE8k1YhZ$<(LY-;t(%A_qmg(I{TK=-3IP;7H{$j1mN*XBs5 z*M!N*G+}ZwO_&@8%r1F4N`jvb1ixsga|X1qZ0V0iLuc)Z&N?VLGX?DBA+NfY5j4;s z@Q+MrI<%Ogzw;2H8-+;AL?7VnK4@7|2sAm<{@T>7^|5PHba z*o;&RT1O=J$>^F6N$yoZ%az51R6h0#c-x!!A7H4|olMbXJ7%8^I zJ;2@9-NVz{HO@!u=j!d};pZBV;O^(@74PotD~?+!upld0>M=gfJZdS1m%@u#_YiPC z_&g8ZkUY*4KqAEFj~3)_vIxh2nxIF7rlkZkA0f;~DD%++nm;%tJt-ElA>zzf$o7hz zFabY%h*L6S9pJ%hVq&~sLTrMok7q)>t9M+0ziU984`e;WzCH>5vHr0M?!e}onCR`{ z=N0Gb?HTI>b@KNHHje<;MDKujfbdQT@Jf)t+us7x8y>bpo06IE*$Y171?G_8LVB1> zKlx;-U~#jhtUGuky04bD8CVs`?Z~scM#rF8NgH@})e=5qLeo++VJB?NpsDHNNu%7d zC!}m%bUx~&yKz`e_}?=gyY?yC86*QT>VMm!dd?#MsE9H8d27#@!_k+DyIt_GE(zc3{++4O>>K?K57-)# zaH-7ra-XnU-f<4O8x~C*bIo@7YRmh+5fy)#54h$xcI(28Ws+Sd-OE?^JD$7E<#CvH zs^5}`o3vI;))!{s%%u05`r;rHu3 zv&vO{n-6y>(rZ3fe8Fw@zSq-SHdt@G8Q$iUOUi>Yt)4u~wkX^EZOePT&ra%}V_#X2 
zzh%!I;nh8zD%-tsxik8J!xw?`dJTPAQPc5ZSaqlN27G_z$`@C==Qe{uWimX4z}~=8 zo2h~as{MTUg?ZL~X2)885E)KNOL_Ide)PlUhd!U(dm}RB`MO6=zn%4(>r)(DIHfA4 zchnIxgXCu=7h{Y{ye|fYcK#G^ZhG_G!l#Q9yDd!#_;`H4$0qSElo?>{pfPaCU-%8?xdQkg{eQ- zSXOC&Id;uln=5bPJMQ|vPnfp*{9k;QJ}`S3Y0X5Y4hLl#7priFBhsr+JBp|J47 z=((opUrzCC$NOi`b0x5MzH47;ZTbA+4;?e@6KW9Eq zn7JnWUU7K%{!4Fu{_xwj=XW>$@coI|=bG>R=1!2L)#UtO|2vl_y-gY!HDa6J9~X{X zKBgOeU_tf14;Q}L^=<>tNien~pmoK#_+v-zjW-*nwIny|{mVW7xaHFMQrBA!e?JbsxV!ZFO{X+JvlX^U*UhrcG*oZ_~@a{%{2@565Sj%|}h} z-gjtT<D(K;1Hu{?Z?Z@svIM`;;z6ZxXT=~n-#$@-5MX_75c`bMNu6v(tci&;L zjl}EqAJzrewvGP2d(PAk@Ag%U`z>nC=K!p0?P30$+&}+ZmiCpTYShcvQ|F@(fBxsGRm(b$&aJ6jBMO|7eXlAh<|ML8G{@oS!f1G#`-*u)@z{dH_vd91P`ok{s zYa>k0l<%whh2PerXN~0)-TQl2o}c|XX=%@wWwqtM-0Ai7y3dr>B}1lv+-3OayZs|4 z$ERF6+^fHH|Cs|_0<#uIdQ9H@(xlyxkqg61SB!1>EYR+5ce`H;=AXUpysqRDueY%C zq}IXVds?k3o^`i-@z%jLuXlVq^9;YgUaiUcAr@(CSM->3{;|OEisAcqg*)awO8-@> z<+awPo;I=P4u3T}KmEq-$J$|ZOyFN9xZRgSESA_EYG(o z^!DAcW^TtNPH$g&JezLo@ln73wF12*sS!1y)3;=>M~ZD#HIru@={t8Wd59K4odpEG2% zarEBS?P4Pm7hj)TkmRa6xZqt`o9R!4YllvDbh+8W{MvUP!!}(VM)r7i+CK5^*h7_X zth3yC_WlFf%?f3;Dc@x6<)K8eoXz|9(4GT$WhyGPcd??D#HKH6}}j6%QWBGn|umO|+-m%64DZtSBjPtxhZPXti_v z(da8aOBUL%Jhkzu)B5GVr@KCOw8gsf8V~N zdi8{LcH?G0**R{+rXLTkk2$s3(tN^L$EC~0zt#OYY<6%(d5blDYc@UF+j_*lo&B5+ zX=@*zzoxi)b#Xh3D@zv-zhYLEWjSQ!sQEXK#P2Be+W7s3-WHqgjod$KPRtGaqKf19 zOS*`zZyDlh7-{~ok4^HYle21XXSlT!BZ!3_A6!Wsdk6M#sA6Ygd+Yo<>Tq@`6HG+xyH&g_d@ zu4v7CMHlA|s?FQ#U%YV0BfH}D!DkO$ z46n(``Qgnf-n3H=N8*gXtR$98qgP6Hee$=OGp@6(`KZtRgD2g{*KzvsboqVzRhuHT z&&3=(_{%-07*ll~-xXemQ8gn3HrAnJei+?X9)LSVsxR#h_GC?%xCOaWj{1U@Fc)Jt z0@wUpB<}cMk91WT5}gfjF}O}pkI@P?YS`MSaf#@@I{3l{tlD(QD5$+LE-A<=Sc9x$ zP0Qg1nGdx2EE@75GxH@@W4SM}HshP&ZF=ClC%53;*nnUH+y3~aPeV8-5F>hR0%wp4 zll{McdUH84E?P8((w!;26FpC-lsvj1U@#qFDNKk9?`ybo92N_Nw1PB)a?hma_Q0J; zk0&Dh&YSGosSf3-s3YroEj%sI0z@9@$HN^4IK1dlA(DL>9V>!;$0CjmD690;4Y#D| zF&MNkAUSjyM0k|~eCHYt*y+LpZNj{HCg9_XV0i6ZMfri?1 z`Cz%HK*NUT^1*(O0u3w7prJQ#`EUusrQy<$OM_>E3N*A8mk;g8rJ)76G<=ne zOEXnM!zCq`5B-TtL;vE^aN^FT;oE0y+9%pa2?J)j7^oL= 
zv_Wc6e|pb>o;{=(s4LtnrKP1(7^J>Z4%8np{Af92JTrVGK+1u20%{(E&y?r&_OQWS6=#PjIO*uU2 znL!Gda-er21|!?7o`Yh11^Z#>tH==q%cz887*8Zx~-TPRc%cFjoG1#!Ivs z##Q)ni2sK1l8M9^v+-bZ7_+hNOb%l<#$G0eF&pD7D`pegccBNNe+lT>SnAnW4yjCJ zVWI$yy+kPGki~2}T8@du?ddfW_bsR;5W|^LUFg{@h)?y=23=qd0e7##7B$W)qKT}R zQ2&QJpcp+j;QywyT097AaHZ8K|ChpQJ{n_z&#wlNpt+@Vw3JEuwas3|+3AccTLBh>a+@!J|x^9I0=cpAAoxacrfdAV~T}&#KeR9*%pLgdEAd+-hU)QxNP9X zM`9=~hFc8?IW#R}{3LvRauSt{h?+xV0gVMT7SLEgV*!l?G#1cUKw|-o1vD1WSU_We z|63Nobv&+kam|hEZ+y)K-%i3cH9phFwLZSmh6`m}v*Y?*`YIV+d@ zg8#w+eq?~T?*gB$klY}3hU5+jzdG&-3Cn+igrAzD@fH{W-!sLp&I;kz1Mug%@D~G` za+nhx`tlh{L_ZP=nEjJd2BfDlEZP5y!h1tOdIo$qJdWxyUKzVu8vGBSI{B!BT4vVG ziXKcxhYY0Oy2YP_z`ElX=M$+;NU8q~4OUUPODB+zeF*zcW98c;0&bAOp+kCrH2lqu W3Gf|YY=Nff None: result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) validate_strings(result, XLSX_TEST_STRINGS) + # Test XLS processing + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xls")) + for test_string in XLS_TEST_STRINGS: + text_content = result.text_content.replace("\\", "") + assert test_string in text_content + # Test DOCX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx")) validate_strings(result, DOCX_TEST_STRINGS) From 731b39e7f5d36469b2912ed1608fd86c04a1ddcc Mon Sep 17 00:00:00 2001 From: afourney Date: Fri, 3 Jan 2025 14:34:33 -0800 Subject: [PATCH 36/40] Added a test for leading spaces. (#258) --- tests/test_markitdown.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 1ac9041e..9dc7374a 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -257,6 +257,11 @@ def test_markitdown_local() -> None: result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg")) validate_strings(result, MSG_TEST_STRINGS) + # Test input with leading blank characters + input_data = b" \n\n\n

Test

" + result = markitdown.convert_stream(io.BytesIO(input_data), file_extension=".html") + assert "# Test" in result.text_content + @pytest.mark.skipif( skip_exiftool, From 436407288f01b5a2c31111062b0c2ac959dad443 Mon Sep 17 00:00:00 2001 From: afourney Date: Fri, 3 Jan 2025 16:03:11 -0800 Subject: [PATCH 37/40] If puremagic has no guesses, try again after ltrim. (#260) --- src/markitdown/_markitdown.py | 19 +++++++++++++++++++ tests/test_markitdown.py | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 50c83b46..aceaa86d 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -1594,6 +1594,25 @@ def _guess_ext_magic(self, path): # Use puremagic to guess try: guesses = puremagic.magic_file(path) + + # Fix for: https://github.com/microsoft/markitdown/issues/222 + # If there are no guesses, then try again after trimming leading ASCII whitespaces. + # ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f' + # (space, tab, newline, carriage return, vertical tab, form feed). + if len(guesses) == 0: + with open(path, "rb") as file: + while True: + char = file.read(1) + if not char: # End of file + break + if not char.isspace(): + file.seek(file.tell() - 1) + break + try: + guesses = puremagic.magic_stream(file) + except puremagic.main.PureError: + pass + extensions = list() for g in guesses: ext = g.extension.strip() diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 9dc7374a..e2d2e75f 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -259,7 +259,7 @@ def test_markitdown_local() -> None: # Test input with leading blank characters input_data = b" \n\n\n

Test

" - result = markitdown.convert_stream(io.BytesIO(input_data), file_extension=".html") + result = markitdown.convert_stream(io.BytesIO(input_data)) assert "# Test" in result.text_content From 05b78e7ce18cf2f8d8d75058a1f2c98f9930318b Mon Sep 17 00:00:00 2001 From: afourney Date: Fri, 3 Jan 2025 16:40:43 -0800 Subject: [PATCH 38/40] Recognize json as plain text (if no other handlers are present). (#261) * Recognize json as plain text (if no other handlers are present). --- src/markitdown/_markitdown.py | 5 ++++- tests/test_files/test.json | 10 ++++++++++ tests/test_markitdown.py | 9 +++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 tests/test_files/test.json diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index aceaa86d..b6acfe80 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -173,7 +173,10 @@ def convert( # Only accept text files if content_type is None: return None - elif "text/" not in content_type.lower(): + elif all( + not content_type.lower().startswith(type_prefix) + for type_prefix in ["text/", "application/json"] + ): return None text_content = str(from_path(local_path).best()) diff --git a/tests/test_files/test.json b/tests/test_files/test.json new file mode 100644 index 00000000..eba30594 --- /dev/null +++ b/tests/test_files/test.json @@ -0,0 +1,10 @@ +{ + "key1": "string_value", + "key2": 1234, + "key3": [ + "list_value1", + "list_value2" + ], + "5b64c88c-b3c3-4510-bcb8-da0b200602d8": "uuid_key", + "uuid_value": "9700dc99-6685-40b4-9a3a-5e406dcb37f3" +} diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index e2d2e75f..3333bcbc 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -145,6 +145,11 @@ "5bda1dd6", ] +JSON_TEST_STRINGS = [ + "5b64c88c-b3c3-4510-bcb8-da0b200602d8", + "9700dc99-6685-40b4-9a3a-5e406dcb37f3", +] + # --- Helper Functions --- def validate_strings(result, expected_strings, exclude_strings=None): @@ -257,6 +262,10 
@@ def test_markitdown_local() -> None: result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg")) validate_strings(result, MSG_TEST_STRINGS) + # Test JSON processing + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.json")) + validate_strings(result, JSON_TEST_STRINGS) + # Test input with leading blank characters input_data = b" \n\n\n

Test

" result = markitdown.convert_stream(io.BytesIO(input_data)) From 265aea2edf31bf1b022992e59f0ade1e54903aee Mon Sep 17 00:00:00 2001 From: afourney Date: Mon, 6 Jan 2025 09:06:21 -0800 Subject: [PATCH 39/40] Removed the holiday away message from README.md (#266) --- README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/README.md b/README.md index d2314c3b..6bc91e6c 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,3 @@ -> [!IMPORTANT] -> (12/19/24) Hello! MarkItDown team members will be resting and recharging with family and friends over the holiday period. Activity/responses on the project may be delayed during the period of Dec 21-Jan 06. We will be excited to engage with you in the new year! - # MarkItDown [![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/) From f58a864951da6c720d3e10987371133c67db296a Mon Sep 17 00:00:00 2001 From: afourney Date: Mon, 6 Jan 2025 12:43:47 -0800 Subject: [PATCH 40/40] Set exiftool path explicitly. (#267) --- src/markitdown/_markitdown.py | 39 ++++++++++++++++++++++++++--------- tests/test_markitdown.py | 32 ++++++++++++++++++++++------ 2 files changed, 55 insertions(+), 16 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index b6acfe80..33806e13 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -892,14 +892,25 @@ class MediaConverter(DocumentConverter): Abstract class for multi-modal media (e.g., images and audio) """ - def _get_metadata(self, local_path): - exiftool = shutil.which("exiftool") - if not exiftool: + def _get_metadata(self, local_path, exiftool_path=None): + if not exiftool_path: + which_exiftool = shutil.which("exiftool") + if which_exiftool: + warn( + f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. 
E.g., + + md = MarkItDown(exiftool_path="{which_exiftool}") + +This warning will be removed in future releases. +""", + DeprecationWarning, + ) + return None else: try: result = subprocess.run( - [exiftool, "-json", local_path], capture_output=True, text=True + [exiftool_path, "-json", local_path], capture_output=True, text=True ).stdout return json.loads(result)[0] except Exception: @@ -920,7 +931,7 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: md_content = "" # Add metadata - metadata = self._get_metadata(local_path) + metadata = self._get_metadata(local_path, kwargs.get("exiftool_path")) if metadata: for f in [ "Title", @@ -975,7 +986,7 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: md_content = "" # Add metadata - metadata = self._get_metadata(local_path) + metadata = self._get_metadata(local_path, kwargs.get("exiftool_path")) if metadata: for f in [ "Title", @@ -1036,7 +1047,7 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: md_content = "" # Add metadata - metadata = self._get_metadata(local_path) + metadata = self._get_metadata(local_path, kwargs.get("exiftool_path")) if metadata: for f in [ "ImageSize", @@ -1325,6 +1336,7 @@ def __init__( llm_client: Optional[Any] = None, llm_model: Optional[str] = None, style_map: Optional[str] = None, + exiftool_path: Optional[str] = None, # Deprecated mlm_client: Optional[Any] = None, mlm_model: Optional[str] = None, @@ -1334,6 +1346,9 @@ def __init__( else: self._requests_session = requests_session + if exiftool_path is None: + exiftool_path = os.environ.get("EXIFTOOL_PATH") + # Handle deprecation notices ############################# if mlm_client is not None: @@ -1366,6 +1381,7 @@ def __init__( self._llm_client = llm_client self._llm_model = llm_model self._style_map = style_map + self._exiftool_path = exiftool_path self._page_converters: List[DocumentConverter] = [] @@ -1549,12 +1565,15 @@ def _convert( if 
"llm_model" not in _kwargs and self._llm_model is not None: _kwargs["llm_model"] = self._llm_model - # Add the list of converters for nested processing - _kwargs["_parent_converters"] = self._page_converters - if "style_map" not in _kwargs and self._style_map is not None: _kwargs["style_map"] = self._style_map + if "exiftool_path" not in _kwargs and self._exiftool_path is not None: + _kwargs["exiftool_path"] = self._exiftool_path + + # Add the list of converters for nested processing + _kwargs["_parent_converters"] = self._page_converters + # If we hit an error log it and keep trying try: res = converter.convert(local_path, **_kwargs) diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 3333bcbc..689d6f31 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -277,9 +277,29 @@ def test_markitdown_local() -> None: reason="do not run if exiftool is not installed", ) def test_markitdown_exiftool() -> None: - markitdown = MarkItDown() + # Test the automatic discovery of exiftool throws a warning + # and is disabled + try: + with catch_warnings(record=True) as w: + markitdown = MarkItDown() + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg")) + assert len(w) == 1 + assert w[0].category is DeprecationWarning + assert result.text_content.strip() == "" + finally: + resetwarnings() - # Test JPG metadata processing + # Test explicitly setting the location of exiftool + which_exiftool = shutil.which("exiftool") + markitdown = MarkItDown(exiftool_path=which_exiftool) + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg")) + for key in JPG_TEST_EXIFTOOL: + target = f"{key}: {JPG_TEST_EXIFTOOL[key]}" + assert target in result.text_content + + # Test setting the exiftool path through an environment variable + os.environ["EXIFTOOL_PATH"] = which_exiftool + markitdown = MarkItDown() result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg")) for key in JPG_TEST_EXIFTOOL: target = f"{key}: 
{JPG_TEST_EXIFTOOL[key]}" @@ -341,8 +361,8 @@ def test_markitdown_llm() -> None: if __name__ == "__main__": """Runs this file's tests from the command line.""" - test_markitdown_remote() - test_markitdown_local() + # test_markitdown_remote() + # test_markitdown_local() test_markitdown_exiftool() - test_markitdown_deprecation() - test_markitdown_llm() + # test_markitdown_deprecation() + # test_markitdown_llm()