From f398f3d4434ef1664250e1032b7f2733944dd6d5 Mon Sep 17 00:00:00 2001
From: "Petr@AP Consulting" <petr.adamek@apconsulting.cz>
Date: Tue, 17 Dec 2024 10:26:09 +0100
Subject: [PATCH 01/40] Update README.md

I added a description and a script for batch processing of files
---
 README.md | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
diff --git a/README.md b/README.md
index 7079dbfa..01ceb716 100644
--- a/README.md
+++ b/README.md
@@ -78,7 +78,51 @@ You can also use the project as Docker Image:
 docker build -t markitdown:latest .
 docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
 ```
+Batch Processing Multiple Files
 
+This extension allows you to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
+
+Features
+
+- Converts multiple files in one operation
+- Supports various file formats (.pptx, .docx, .pdf, .jpg, .jpeg, .png etc. you can change it)
+- Maintains original filenames (changes extension to .md)
+- Includes GPT-4o-latest image descriptions when available
+- Continues processing if individual files fail
+
+Usage
+1. Create a Python script (e.g., convert.py):
+```python
+from markitdown import MarkItDown
+from openai import OpenAI
+import os
+client = OpenAI(api_key="your-api-key-here")
+md = MarkItDown(mlm_client=client, mlm_model="gpt-4o-2024-11-20")
+supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png')
+files_to_convert = [f for f in os.listdir('.') if f.lower().endswith(supported_extensions)]
+for file in files_to_convert:
+    print(f"\nConverting {file}...")
+    try:
+        md_file = os.path.splitext(file)[0] + '.md'
+        result = md.convert(file)
+        with open(md_file, 'w') as f:
+            f.write(result.text_content)
+        
+        print(f"Successfully converted {file} to {md_file}")
+    except Exception as e:
+        print(f"Error converting {file}: {str(e)}")
+
+print("\nAll conversions completed!")
+```
+2. Place the script in the same directory as your files
+3. Install required packages: like openai
+4. Run script ```bash python3 convert.py ```
+
+- The script processes all supported files in the current directory
+- Original files remain unchanged
+- New markdown files are created with the same base name
+- Progress and any errors are displayed during conversion
+   
 ## Contributing
 
 This project welcomes contributions and suggestions.  Most contributions require you to agree to a

From 224f1df0fc33e83d49825e3d8b947d945787ad7d Mon Sep 17 00:00:00 2001
From: "Petr@AP Consulting" <petr.adamek@apconsulting.cz>
Date: Wed, 18 Dec 2024 09:28:18 +0100
Subject: [PATCH 02/40] Update README.md

I collapsed the section about batch processing, as was suggested
---
 README.md | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 01ceb716..669caa2e 100644
--- a/README.md
+++ b/README.md
@@ -78,11 +78,13 @@ You can also use the project as Docker Image:
 docker build -t markitdown:latest .
 docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
 ```
-Batch Processing Multiple Files
+<details>
+    
+<summary>Batch Processing Multiple Files</summary>
 
 This extension allows you to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
 
-Features
+### Features
 
 - Converts multiple files in one operation
 - Supports various file formats (.pptx, .docx, .pdf, .jpg, .jpeg, .png etc. you can change it)
@@ -90,7 +92,7 @@ Features
 - Includes GPT-4o-latest image descriptions when available
 - Continues processing if individual files fail
 
-Usage
+### Usage
 1. Create a Python script (e.g., convert.py):
 ```python
 from markitdown import MarkItDown
@@ -122,6 +124,8 @@ print("\nAll conversions completed!")
 - Original files remain unchanged
 - New markdown files are created with the same base name
 - Progress and any errors are displayed during conversion
+
+</details>
    
 ## Contributing
 

From 6e4caac70d63c87a532be773b2dc3f330f9fdbda Mon Sep 17 00:00:00 2001
From: Joel Esler <jesler@threatstop.com>
Date: Wed, 18 Dec 2024 13:12:55 -0500
Subject: [PATCH 03/40] Safeguard against path traversal for ZipConverter

fix: prevent path traversal vulnerabilities in ZipConverter

Added a secure check for path traversal vulnerabilities in the ZipConverter class.
Now validates extracted file paths using `os.path.commonprefix` to ensure all files
remain within the intended extraction directory. Raises a `ValueError` if a
path traversal attempt is detected.

- Normalized file paths using `os.path.normpath`.
- Added specific exception handling for `zipfile.BadZipFile` and traversal errors.
- Ensured cleanup of extracted files after processing when `cleanup_extracted` is enabled.
---
 .gitignore                    |  2 ++
 src/markitdown/_markitdown.py | 30 ++++++++++++++++++------------
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/.gitignore b/.gitignore
index 82f92755..b6139eb7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -160,3 +160,5 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+src/.DS_Store
+.DS_Store
diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 2e7e5ffd..28770f40 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -1133,27 +1133,28 @@ def convert(
         extracted_zip_folder_name = (
             f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}"
         )
-        new_folder = os.path.normpath(
+        extraction_dir = os.path.normpath(
             os.path.join(os.path.dirname(local_path), extracted_zip_folder_name)
         )
         md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n"
 
-        # Safety check for path traversal
-        if not new_folder.startswith(os.path.dirname(local_path)):
-            return DocumentConverterResult(
-                title=None, text_content=f"[ERROR] Invalid zip file path: {local_path}"
-            )
-
         try:
-            # Extract the zip file
+            # Extract the zip file safely
             with zipfile.ZipFile(local_path, "r") as zipObj:
-                zipObj.extractall(path=new_folder)
+                # Safeguard against path traversal
+                for member in zipObj.namelist():
+                    member_path = os.path.normpath(os.path.join(extraction_dir, member))
+                    if not os.path.commonprefix([extraction_dir, member_path]) == extraction_dir:
+                        raise ValueError(f"Path traversal detected in zip file: {member}")
+                
+                # Extract all files safely
+                zipObj.extractall(path=extraction_dir)
 
             # Process each extracted file
-            for root, dirs, files in os.walk(new_folder):
+            for root, dirs, files in os.walk(extraction_dir):
                 for name in files:
                     file_path = os.path.join(root, name)
-                    relative_path = os.path.relpath(file_path, new_folder)
+                    relative_path = os.path.relpath(file_path, extraction_dir)
 
                     # Get file extension
                     _, file_extension = os.path.splitext(name)
@@ -1177,7 +1178,7 @@ def convert(
 
             # Clean up extracted files if specified
             if kwargs.get("cleanup_extracted", True):
-                shutil.rmtree(new_folder)
+                shutil.rmtree(extraction_dir)
 
             return DocumentConverterResult(title=None, text_content=md_content.strip())
 
@@ -1186,6 +1187,11 @@ def convert(
                 title=None,
                 text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
             )
+        except ValueError as ve:
+            return DocumentConverterResult(
+                title=None,
+                text_content=f"[ERROR] Security error in zip file {local_path}: {str(ve)}",
+            )
         except Exception as e:
             return DocumentConverterResult(
                 title=None,

From 39410d01df6ecb42a81c4219bdbd3ff6e21b8bfd Mon Sep 17 00:00:00 2001
From: Sugato Ray <sugatoray@users.noreply.github.com>
Date: Wed, 18 Dec 2024 14:22:58 -0500
Subject: [PATCH 04/40] Update CLI helpdoc formatting to allow indentation in
 code

Use `textwrap.dedent()` to allow indented cli-helpdoc in `__main__.py` file. The indentation increases readability, while `textwrap.dedent` helps maintain the same functionality without breaking code.
---
 src/markitdown/__main__.py | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py
index 2d531735..9c48cd4e 100644
--- a/src/markitdown/__main__.py
+++ b/src/markitdown/__main__.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: MIT
 import sys
 import argparse
+from textwrap import dedent
 from ._markitdown import MarkItDown
 
 
@@ -10,24 +11,24 @@ def main():
     parser = argparse.ArgumentParser(
         description="Convert various file formats to markdown.",
         formatter_class=argparse.RawDescriptionHelpFormatter,
-        usage="""
-SYNTAX: 
-    
-    markitdown <OPTIONAL: FILENAME>
-    If FILENAME is empty, markitdown reads from stdin.
-
-EXAMPLE:
-    
-    markitdown example.pdf
-    
-    OR
-
-    cat example.pdf | markitdown
-
-    OR 
-
-    markitdown < example.pdf
-""".strip(),
+        usage=dedent("""
+            SYNTAX: 
+                
+                markitdown <OPTIONAL: FILENAME>
+                If FILENAME is empty, markitdown reads from stdin.
+            
+            EXAMPLE:
+                
+                markitdown example.pdf
+                
+                OR
+            
+                cat example.pdf | markitdown
+            
+                OR 
+            
+                markitdown < example.pdf
+            """).strip(),
     )
 
     parser.add_argument("filename", nargs="?")

From 5fc70864f23c75ea315bfb1c011a4ed82a76ccf0 Mon Sep 17 00:00:00 2001
From: gagb <gagb@users.noreply.github.com>
Date: Wed, 18 Dec 2024 11:46:39 -0800
Subject: [PATCH 05/40] Run pre-commit

---
 src/markitdown/_markitdown.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 28770f40..040a586e 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -1144,9 +1144,14 @@ def convert(
                 # Safeguard against path traversal
                 for member in zipObj.namelist():
                     member_path = os.path.normpath(os.path.join(extraction_dir, member))
-                    if not os.path.commonprefix([extraction_dir, member_path]) == extraction_dir:
-                        raise ValueError(f"Path traversal detected in zip file: {member}")
-                
+                    if (
+                        not os.path.commonprefix([extraction_dir, member_path])
+                        == extraction_dir
+                    ):
+                        raise ValueError(
+                            f"Path traversal detected in zip file: {member}"
+                        )
+
                 # Extract all files safely
                 zipObj.extractall(path=extraction_dir)
 

From 233ba679b88389fb53aded8a15f7b967f93f5af3 Mon Sep 17 00:00:00 2001
From: "Petr@AP Consulting"
 <173082609+PetrAPConsulting@users.noreply.github.com>
Date: Wed, 18 Dec 2024 21:05:04 +0100
Subject: [PATCH 06/40] Update README.md

Co-authored-by: gagb <gagb@users.noreply.github.com>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ebeca7f3..f160d865 100644
--- a/README.md
+++ b/README.md
@@ -64,7 +64,7 @@ docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
     
 <summary>Batch Processing Multiple Files</summary>
 
-This extension allows you to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
+This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
 
 ### Features
 

From bb929629f3c573adef8fcdf15ae7112f052299d5 Mon Sep 17 00:00:00 2001
From: "Petr@AP Consulting"
 <173082609+PetrAPConsulting@users.noreply.github.com>
Date: Wed, 18 Dec 2024 21:05:36 +0100
Subject: [PATCH 07/40] Update README.md

Co-authored-by: gagb <gagb@users.noreply.github.com>
---
 README.md | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/README.md b/README.md
index f160d865..627243de 100644
--- a/README.md
+++ b/README.md
@@ -66,13 +66,6 @@ docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
 
 This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
 
-### Features
-
-- Converts multiple files in one operation
-- Supports various file formats (.pptx, .docx, .pdf, .jpg, .jpeg, .png etc. you can change it)
-- Maintains original filenames (changes extension to .md)
-- Includes GPT-4o-latest image descriptions when available
-- Continues processing if individual files fail
 
 ### Usage
 1. Create a Python script (e.g., convert.py):

From 088007338d1567299a6654cf95fb4616413f7131 Mon Sep 17 00:00:00 2001
From: "Petr@AP Consulting"
 <173082609+PetrAPConsulting@users.noreply.github.com>
Date: Wed, 18 Dec 2024 21:07:55 +0100
Subject: [PATCH 08/40] Update README.md

Co-authored-by: gagb <gagb@users.noreply.github.com>
---
 README.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/README.md b/README.md
index 627243de..62de2e6b 100644
--- a/README.md
+++ b/README.md
@@ -67,8 +67,6 @@ docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
 This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
 
 
-### Usage
-1. Create a Python script (e.g., convert.py):
 ```python
 from markitdown import MarkItDown
 from openai import OpenAI

From f4471d96e2b61de672e3e1f4bf95222191844274 Mon Sep 17 00:00:00 2001
From: "Petr@AP Consulting"
 <173082609+PetrAPConsulting@users.noreply.github.com>
Date: Wed, 18 Dec 2024 21:08:10 +0100
Subject: [PATCH 09/40] Update README.md

Co-authored-by: gagb <gagb@users.noreply.github.com>
---
 README.md | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 62de2e6b..70a1b5cf 100644
--- a/README.md
+++ b/README.md
@@ -93,10 +93,7 @@ print("\nAll conversions completed!")
 3. Install required packages: like openai
 4. Run script ```bash python3 convert.py ```
 
-- The script processes all supported files in the current directory
-- Original files remain unchanged
-- New markdown files are created with the same base name
-- Progress and any errors are displayed during conversion
+Note that original files will remain unchanged and new markdown files are created with the same base name.
 
 </details>
    

From f6e75c46d4f08a073f5fc07dd0bc122138f52436 Mon Sep 17 00:00:00 2001
From: "Petr@AP Consulting"
 <173082609+PetrAPConsulting@users.noreply.github.com>
Date: Wed, 18 Dec 2024 21:17:47 +0100
Subject: [PATCH 10/40] Update README.md

I changed the command for running the script from the Mac version (python3) to the Windows version (python)
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 70a1b5cf..b82e5fc5 100644
--- a/README.md
+++ b/README.md
@@ -91,7 +91,7 @@ print("\nAll conversions completed!")
 ```
 2. Place the script in the same directory as your files
 3. Install required packages: like openai
-4. Run script ```bash python3 convert.py ```
+4. Run script ```bash python convert.py ```
 
 Note that original files will remain unchanged and new markdown files are created with the same base name.
 

From 356e895306baf01633ebacd5888487321c940f6a Mon Sep 17 00:00:00 2001
From: Sugato Ray <sugatoray@users.noreply.github.com>
Date: Wed, 18 Dec 2024 21:25:23 +0000
Subject: [PATCH 11/40] update formatting with pre-commit

---
 src/markitdown/__main__.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py
index 9c48cd4e..be2a0f2f 100644
--- a/src/markitdown/__main__.py
+++ b/src/markitdown/__main__.py
@@ -11,7 +11,8 @@ def main():
     parser = argparse.ArgumentParser(
         description="Convert various file formats to markdown.",
         formatter_class=argparse.RawDescriptionHelpFormatter,
-        usage=dedent("""
+        usage=dedent(
+            """
             SYNTAX: 
                 
                 markitdown <OPTIONAL: FILENAME>
@@ -28,7 +29,8 @@ def main():
                 OR 
             
                 markitdown < example.pdf
-            """).strip(),
+            """
+        ).strip(),
     )
 
     parser.add_argument("filename", nargs="?")

From 1384e8072578278977ec6d67c852f9c2f79d799e Mon Sep 17 00:00:00 2001
From: Sugato Ray <sugatoray@users.noreply.github.com>
Date: Wed, 18 Dec 2024 21:24:57 +0000
Subject: [PATCH 12/40] update .gitignore to exclude .vscode folder

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index 82f92755..e6c8f2ee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+.vscode
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

From 6e1b9a7402f425b3a740051a35db0fcd336ce549 Mon Sep 17 00:00:00 2001
From: gagb <gagb@users.noreply.github.com>
Date: Wed, 18 Dec 2024 13:46:10 -0800
Subject: [PATCH 13/40] Run precommit

---
 src/markitdown/__main__.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py
index 9c48cd4e..be2a0f2f 100644
--- a/src/markitdown/__main__.py
+++ b/src/markitdown/__main__.py
@@ -11,7 +11,8 @@ def main():
     parser = argparse.ArgumentParser(
         description="Convert various file formats to markdown.",
         formatter_class=argparse.RawDescriptionHelpFormatter,
-        usage=dedent("""
+        usage=dedent(
+            """
             SYNTAX: 
                 
                 markitdown <OPTIONAL: FILENAME>
@@ -28,7 +29,8 @@ def main():
                 OR 
             
                 markitdown < example.pdf
-            """).strip(),
+            """
+        ).strip(),
     )
 
     parser.add_argument("filename", nargs="?")

From a2743a5314936fdfb83e17978323a463e2111bda Mon Sep 17 00:00:00 2001
From: gagb <gagb@users.noreply.github.com>
Date: Wed, 18 Dec 2024 14:26:36 -0800
Subject: [PATCH 14/40] Add downloads badge

---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index 75c2ba05..978327ca 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,9 @@
 # MarkItDown
 
 [![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/)
+![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown)
+
+
 
 MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc).
 It supports:

From c86287b7e3c6b41138c9b8e5e9097c359ea32fbc Mon Sep 17 00:00:00 2001
From: lumin <baolong1027@icloud.com>
Date: Wed, 18 Dec 2024 18:22:41 +0900
Subject: [PATCH 15/40] feat: add project description in pyproject.toml

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index c5bd58ba..3e14cec8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
 [project]
 name = "markitdown"
 dynamic = ["version"]
-description = ''
+description = 'Utility tool for converting various files to Markdown'
 readme = "README.md"
 requires-python = ">=3.10"
 license = "MIT"

From b28f380a4768bfb88f9bd209cadc97ae73b7a5b8 Mon Sep 17 00:00:00 2001
From: "Petr@AP Consulting"
 <173082609+PetrAPConsulting@users.noreply.github.com>
Date: Thu, 19 Dec 2024 09:23:15 +0100
Subject: [PATCH 16/40] Update README.md

Co-authored-by: gagb <gagb@users.noreply.github.com>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b82e5fc5..d0201d45 100644
--- a/README.md
+++ b/README.md
@@ -67,7 +67,7 @@ docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
 This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
 
 
-```python
+```python convert.py
 from markitdown import MarkItDown
 from openai import OpenAI
 import os

From 5c776bda70619e1a59ec8178fae1e3bdb12ff17b Mon Sep 17 00:00:00 2001
From: gagb <gagb@users.noreply.github.com>
Date: Thu, 19 Dec 2024 10:30:53 -0800
Subject: [PATCH 17/40] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a91768ae..6ffe8fff 100644
--- a/README.md
+++ b/README.md
@@ -75,7 +75,7 @@ from markitdown import MarkItDown
 from openai import OpenAI
 import os
 client = OpenAI(api_key="your-api-key-here")
-md = MarkItDown(mlm_client=client, mlm_model="gpt-4o-2024-11-20")
+md = MarkItDown(llm_client=client, llm_model="gpt-4o-2024-11-20")
 supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png')
 files_to_convert = [f for f in os.listdir('.') if f.lower().endswith(supported_extensions)]
 for file in files_to_convert:

From 535147b2e8f99d47868c214261d62aef8117ae12 Mon Sep 17 00:00:00 2001
From: afourney <adamfo@microsoft.com>
Date: Thu, 19 Dec 2024 11:11:54 -0800
Subject: [PATCH 18/40] Added holiday notice.

Added holiday notice.
---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 978327ca..6dc096c5 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,12 @@
+> [!IMPORTANT]
+> (12/19/24) Hello! MarkItDown team members will be resting and recharging with family and friends over the holiday period. Activity/responses on the project may be delayed during the period of Dec 21-Jan 06. We will be excited to engage with you in the new year!
+
 # MarkItDown
 
 [![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/)
 ![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown)
 
 
-
 MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc).
 It supports:
 - PDF

From 613825d5b3e07353083be66196159bfeb2778ad8 Mon Sep 17 00:00:00 2001
From: Sugato Ray <sugatoray@users.noreply.github.com>
Date: Fri, 20 Dec 2024 02:12:24 +0000
Subject: [PATCH 19/40] [feat]: add support for type-hinting for PEP-561

---
 src/markitdown/py.typed | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 src/markitdown/py.typed

diff --git a/src/markitdown/py.typed b/src/markitdown/py.typed
new file mode 100644
index 00000000..e69de29b

From 8921fe7304b5ac3117e6445935d8a8eff9104ebb Mon Sep 17 00:00:00 2001
From: Sugato Ray <sugatoray@users.noreply.github.com>
Date: Fri, 20 Dec 2024 02:18:14 +0000
Subject: [PATCH 20/40] ignore .vscode folder

- avoid local developer vscode editor settings
---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index 82f92755..e6c8f2ee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+.vscode
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

From 52d73080c7ae3d7662d04f5cc520a09de1908878 Mon Sep 17 00:00:00 2001
From: lumin <71011125+l-lumin@users.noreply.github.com>
Date: Sat, 21 Dec 2024 04:42:32 +0900
Subject: [PATCH 21/40] refactor(tests): add helper function for tests (#87)

* refactor(tests): simplify string validation in tests

Introduce a helper function `validate_strings` to streamline the
validation of expected and excluded strings in test cases. Replace
repetitive string assertions in the `test_markitdown_local` function
with calls to this new helper, improving code readability and
maintainability.

* run pre-commit

---------

Co-authored-by: lumin <71011125+l-melon@users.noreply.github.com>
Co-authored-by: gagb <gagb@users.noreply.github.com>
---
 tests/test_markitdown.py | 53 ++++++++++++++++------------------------
 1 file changed, 21 insertions(+), 32 deletions(-)

diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
index 316e670e..4a981bdc 100644
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -131,6 +131,17 @@
 ]
 
 
+# --- Helper Functions ---
+def validate_strings(result, expected_strings, exclude_strings=None):
+    """Validate presence or absence of specific strings."""
+    text_content = result.text_content.replace("\\", "")
+    for string in expected_strings:
+        assert string in text_content
+    if exclude_strings:
+        for string in exclude_strings:
+            assert string not in text_content
+
+
 @pytest.mark.skipif(
     skip_remote,
     reason="do not run tests that query external urls",
@@ -163,73 +174,53 @@ def test_markitdown_local() -> None:
 
     # Test XLSX processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
-    for test_string in XLSX_TEST_STRINGS:
-        text_content = result.text_content.replace("\\", "")
-        assert test_string in text_content
+    validate_strings(result, XLSX_TEST_STRINGS)
 
     # Test DOCX processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
-    for test_string in DOCX_TEST_STRINGS:
-        text_content = result.text_content.replace("\\", "")
-        assert test_string in text_content
+    validate_strings(result, DOCX_TEST_STRINGS)
 
     # Test DOCX processing, with comments
     result = markitdown.convert(
         os.path.join(TEST_FILES_DIR, "test_with_comment.docx"),
         style_map="comment-reference => ",
     )
-    for test_string in DOCX_COMMENT_TEST_STRINGS:
-        text_content = result.text_content.replace("\\", "")
-        assert test_string in text_content
+    validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
 
     # Test DOCX processing, with comments and setting style_map on init
     markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
     result = markitdown_with_style_map.convert(
         os.path.join(TEST_FILES_DIR, "test_with_comment.docx")
     )
-    for test_string in DOCX_COMMENT_TEST_STRINGS:
-        text_content = result.text_content.replace("\\", "")
-        assert test_string in text_content
+    validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
 
     # Test PPTX processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
-    for test_string in PPTX_TEST_STRINGS:
-        text_content = result.text_content.replace("\\", "")
-        assert test_string in text_content
+    validate_strings(result, PPTX_TEST_STRINGS)
 
     # Test HTML processing
     result = markitdown.convert(
         os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL
     )
-    for test_string in BLOG_TEST_STRINGS:
-        text_content = result.text_content.replace("\\", "")
-        assert test_string in text_content
+    validate_strings(result, BLOG_TEST_STRINGS)
 
     # Test ZIP file processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
-    for test_string in DOCX_TEST_STRINGS:
-        text_content = result.text_content.replace("\\", "")
-        assert test_string in text_content
+    validate_strings(result, XLSX_TEST_STRINGS)
 
     # Test Wikipedia processing
     result = markitdown.convert(
         os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL
     )
     text_content = result.text_content.replace("\\", "")
-    for test_string in WIKIPEDIA_TEST_EXCLUDES:
-        assert test_string not in text_content
-    for test_string in WIKIPEDIA_TEST_STRINGS:
-        assert test_string in text_content
+    validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES)
 
     # Test Bing processing
     result = markitdown.convert(
         os.path.join(TEST_FILES_DIR, "test_serp.html"), url=SERP_TEST_URL
     )
     text_content = result.text_content.replace("\\", "")
-    for test_string in SERP_TEST_EXCLUDES:
-        assert test_string not in text_content
-    for test_string in SERP_TEST_STRINGS:
-        assert test_string in text_content
+    validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES)
 
     # Test RSS processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_rss.xml"))
@@ -239,9 +230,7 @@ def test_markitdown_local() -> None:
 
     ## Test non-UTF-8 encoding
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
-    text_content = result.text_content.replace("\\", "")
-    for test_string in CSV_CP932_TEST_STRINGS:
-        assert test_string in text_content
+    validate_strings(result, CSV_CP932_TEST_STRINGS)
 
 
 @pytest.mark.skipif(

From 7e6c36c5d4af0eb51b3b404b837391a8c6bb549e Mon Sep 17 00:00:00 2001
From: gagb <gagb@users.noreply.github.com>
Date: Fri, 20 Dec 2024 14:08:58 -0800
Subject: [PATCH 22/40] docs: add contribution guidelines to README (#176)

---
 README.md | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/README.md b/README.md
index 70403114..be265604 100644
--- a/README.md
+++ b/README.md
@@ -116,6 +116,20 @@ This project has adopted the [Microsoft Open Source Code of Conduct](https://ope
 For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
 contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
 
+### How to Contribute
+
+You can help by looking at issues or helping review PRs. Any issue or PR is welcome, but we have also marked some as 'open for contribution' and 'open for reviewing' to help facilitate community contributions. These are of course just suggestions and you are welcome to contribute in any way you like.
+
+
+<div align="center">
+
+|                       | All                                      | Especially Needs Help from Community                                                                 |
+|-----------------------|------------------------------------------|------------------------------------------------------------------------------------------|
+| **Issues**            | [All Issues](https://github.com/microsoft/markitdown/issues) | [Issues open for contribution](https://github.com/microsoft/markitdown/issues?q=is%3Aissue+is%3Aopen+label%3A%22open+for+contribution%22) |
+| **PRs**               | [All PRs](https://github.com/microsoft/markitdown/pulls)     | [PRs open for reviewing](https://github.com/microsoft/markitdown/pulls?q=is%3Apr+is%3Aopen+label%3A%22open+for+reviewing%22)               |
+
+</div>
+
 ### Running Tests and Checks
 
 - Install `hatch` in your environment and run tests:

From 5276616ba1476f663c42c00a0136c9b6257219b6 Mon Sep 17 00:00:00 2001
From: SigireddyBalasai <sigireddybalasai@gmail.com>
Date: Sat, 21 Dec 2024 03:42:48 +0530
Subject: [PATCH 23/40] Added support to use Pathlib (#93)

* Add support for Path objects in MarkItDown conversion methods

* Remove unnecessary blank line in test_markitdown_exiftool function

* Remove unnecessary blank line in test_markitdown_exiftool function

* remove pathlib path in test file

---------

Co-authored-by: afourney <adamfo@microsoft.com>
Co-authored-by: gagb <gagb@users.noreply.github.com>
---
 src/markitdown/_markitdown.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 040a586e..789c1e55 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -15,6 +15,7 @@
 import zipfile
 from xml.dom import minidom
 from typing import Any, Dict, List, Optional, Union
+from pathlib import Path
 from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
 from warnings import warn, resetwarnings, catch_warnings
 
@@ -1286,11 +1287,11 @@ def __init__(
         self.register_page_converter(ZipConverter())
 
     def convert(
-        self, source: Union[str, requests.Response], **kwargs: Any
+        self, source: Union[str, requests.Response, Path], **kwargs: Any
     ) -> DocumentConverterResult:  # TODO: deal with kwargs
         """
         Args:
-            - source: can be a string representing a path or url, or a requests.response object
+            - source: can be a local path (given as a string or a pathlib.Path object), a url string, or a requests.Response object
             - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
         """
 
@@ -1307,10 +1308,14 @@ def convert(
         # Request response
         elif isinstance(source, requests.Response):
             return self.convert_response(source, **kwargs)
+        elif isinstance(source, Path):
+            return self.convert_local(source, **kwargs)
 
     def convert_local(
-        self, path: str, **kwargs: Any
+        self, path: Union[str, Path], **kwargs: Any
     ) -> DocumentConverterResult:  # TODO: deal with kwargs
+        if isinstance(path, Path):
+            path = str(path)
         # Prepare a list of extensions to try (in order of priority)
         ext = kwargs.get("file_extension")
         extensions = [ext] if ext is not None else []

From c1a0d3deaf3d3794a1f670ad68dd00f0b9cc1ddf Mon Sep 17 00:00:00 2001
From: lumin <71011125+l-lumin@users.noreply.github.com>
Date: Sat, 21 Dec 2024 07:28:55 +0900
Subject: [PATCH 24/40] chore: configure Dependabot for GitHub Actions updates
 (#112)

Sets up Dependabot to automatically check for updates to
GitHub Actions on a weekly basis, ensuring that the project
remains up-to-date with the latest dependencies and security
fixes.

Co-authored-by: gagb <gagb@users.noreply.github.com>
---
 .github/dependabot.yml | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 .github/dependabot.yml

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 00000000..5ace4600
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,6 @@
+version: 2
+updates:
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "weekly"

From 377a7eaa7d6ed3eb0bb125db0e87c15d43699dcc Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 20 Dec 2024 14:36:48 -0800
Subject: [PATCH 25/40] Bump actions/checkout from 2 to 4 (#177)

Bumps [actions/checkout](https://github.com/actions/checkout) from 2 to 4.
- [Release notes](https://github.com/actions/checkout/releases)
- [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md)
- [Commits](https://github.com/actions/checkout/compare/v2...v4)

---
updated-dependencies:
- dependency-name: actions/checkout
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/pre-commit.yml | 2 +-
 .github/workflows/tests.yml      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index d3f2789c..6128f9ba 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -5,7 +5,7 @@ jobs:
   pre-commit:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Set up Python
         uses: actions/setup-python@v2
         with:
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 8aa6d189..fe35e4f5 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -5,7 +5,7 @@ jobs:
   tests:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: actions/setup-python@v4
         with:
           python-version: |

From 1123392306bf14c40610bef2773d7a7b01ffbc05 Mon Sep 17 00:00:00 2001
From: Soulter <37870767+Soulter@users.noreply.github.com>
Date: Sat, 21 Dec 2024 06:43:00 +0800
Subject: [PATCH 26/40] fix: support -o param to avoid encoding issues (#116)

* perf: cli supports -o param

* doc: update README

---------

Co-authored-by: gagb <gagb@users.noreply.github.com>
---
 README.md                  |  6 ++++++
 src/markitdown/__main__.py | 26 ++++++++++++++++++++++++--
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index be265604..0fae1e6d 100644
--- a/README.md
+++ b/README.md
@@ -29,6 +29,12 @@ To install MarkItDown, use pip: `pip install markitdown`. Alternatively, you can
 markitdown path-to-file.pdf > document.md
 ```
 
+Or use `-o` to specify the output file:
+
+```bash
+markitdown path-to-file.pdf -o document.md
+```
+
 You can also pipe content:
 
 ```bash
diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py
index be2a0f2f..3193ae7e 100644
--- a/src/markitdown/__main__.py
+++ b/src/markitdown/__main__.py
@@ -4,7 +4,7 @@
 import sys
 import argparse
 from textwrap import dedent
-from ._markitdown import MarkItDown
+from ._markitdown import MarkItDown, DocumentConverterResult
 
 
 def main():
@@ -29,20 +29,42 @@ def main():
                 OR 
             
                 markitdown < example.pdf
+                
+                OR to save to a file use
+    
+                markitdown example.pdf -o example.md
+                
+                OR
+                
+                markitdown example.pdf > example.md
             """
         ).strip(),
     )
 
     parser.add_argument("filename", nargs="?")
+    parser.add_argument(
+        "-o",
+        "--output",
+        help="Output file name. If not provided, output is written to stdout.",
+    )
     args = parser.parse_args()
 
     if args.filename is None:
         markitdown = MarkItDown()
         result = markitdown.convert_stream(sys.stdin.buffer)
-        print(result.text_content)
+        _handle_output(args, result)
     else:
         markitdown = MarkItDown()
         result = markitdown.convert(args.filename)
+        _handle_output(args, result)
+
+
+def _handle_output(args, result: DocumentConverterResult):
+    """Handle output to stdout or file"""
+    if args.output:
+        with open(args.output, "w", encoding="utf-8") as f:
+            f.write(result.text_content)
+    else:
         print(result.text_content)
 
 

From 857a2d160d0c44310a1804c03a2ef8856f7f673d Mon Sep 17 00:00:00 2001
From: gagb <gagb@users.noreply.github.com>
Date: Fri, 20 Dec 2024 14:49:20 -0800
Subject: [PATCH 27/40] Update README.md (#180)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 0fae1e6d..6b514159 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,7 @@
 
 [![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/)
 ![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown)
+[![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen)
 
 
 MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc).

From 9b6946777253c15739111e05a906d304c67fd267 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 20 Dec 2024 16:17:43 -0800
Subject: [PATCH 28/40] Bump actions/cache from 3 to 4 (#178)

Bumps [actions/cache](https://github.com/actions/cache) from 3 to 4.
- [Release notes](https://github.com/actions/cache/releases)
- [Changelog](https://github.com/actions/cache/blob/main/RELEASES.md)
- [Commits](https://github.com/actions/cache/compare/v3...v4)

---
updated-dependencies:
- dependency-name: actions/cache
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: gagb <gagb@users.noreply.github.com>
Co-authored-by: afourney <adamfo@microsoft.com>
---
 .github/workflows/tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index fe35e4f5..678995a5 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -14,7 +14,7 @@ jobs:
             3.12
       - name: Set up pip cache
         if: runner.os == 'Linux'
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ~/.cache/pip
           key: ${{ runner.os }}-pip-${{ hashFiles('pyproject.toml') }}

From 73161982fff4bf6755e706496e4d090f4cafe77c Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 20 Dec 2024 16:20:22 -0800
Subject: [PATCH 29/40] Bump actions/setup-python from 2 to 5 (#179)

Bumps [actions/setup-python](https://github.com/actions/setup-python) from 2 to 5.
- [Release notes](https://github.com/actions/setup-python/releases)
- [Commits](https://github.com/actions/setup-python/compare/v2...v5)

---
updated-dependencies:
- dependency-name: actions/setup-python
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: afourney <adamfo@microsoft.com>
---
 .github/workflows/pre-commit.yml | 2 +-
 .github/workflows/tests.yml      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index 6128f9ba..321f8233 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -7,7 +7,7 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - name: Set up Python
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v5
         with:
           python-version: "3.x"
 
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 678995a5..c4dbdcfd 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -6,7 +6,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
         with:
           python-version: |
             3.10

From cfd2319c14a1ed0be6f5a89e145f18878803fa7e Mon Sep 17 00:00:00 2001
From: lumin <71011125+l-lumin@users.noreply.github.com>
Date: Sat, 21 Dec 2024 09:24:45 +0900
Subject: [PATCH 30/40] feat: add version option to markitdown CLI (#172)

Add a `--version` option to the markitdown command-line interface
that displays the current version number.
---
 src/markitdown/__main__.py | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py
index 3193ae7e..b6cf963b 100644
--- a/src/markitdown/__main__.py
+++ b/src/markitdown/__main__.py
@@ -1,33 +1,35 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-import sys
 import argparse
+import sys
 from textwrap import dedent
+from .__about__ import __version__
 from ._markitdown import MarkItDown, DocumentConverterResult
 
 
 def main():
     parser = argparse.ArgumentParser(
         description="Convert various file formats to markdown.",
+        prog="markitdown",
         formatter_class=argparse.RawDescriptionHelpFormatter,
         usage=dedent(
             """
-            SYNTAX: 
-                
+            SYNTAX:
+
                 markitdown <OPTIONAL: FILENAME>
                 If FILENAME is empty, markitdown reads from stdin.
-            
+
             EXAMPLE:
-                
+
                 markitdown example.pdf
-                
+
                 OR
-            
+
                 cat example.pdf | markitdown
-            
-                OR 
-            
+
+                OR
+
                 markitdown < example.pdf
                 
                 OR to save to a file use
@@ -41,6 +43,14 @@ def main():
         ).strip(),
     )
 
+    parser.add_argument(
+        "-v",
+        "--version",
+        action="version",
+        version=f"%(prog)s {__version__}",
+        help="show the version number and exit",
+    )
+
     parser.add_argument("filename", nargs="?")
     parser.add_argument(
         "-o",

From f94d09990ef6ccb9d7d94800dbec4b5068504055 Mon Sep 17 00:00:00 2001
From: numekudi <51479021+numekudi@users.noreply.github.com>
Date: Sat, 21 Dec 2024 11:09:17 +0900
Subject: [PATCH 31/40] feat: enable Git support in devcontainer (#136)

Co-authored-by: gagb <gagb@users.noreply.github.com>
---
 .devcontainer/devcontainer.json | 5 ++++-
 Dockerfile                      | 7 ++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index f12fbcb3..e13e299d 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -6,7 +6,10 @@
 		// Sets the run context to one level up instead of the .devcontainer folder.
 		"context": "..",
 		// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
-		"dockerfile": "../Dockerfile"
+		"dockerfile": "../Dockerfile",
+		"args": {
+			"INSTALL_GIT": "true"
+		}
 	},
 
 	// Features to add to the dev container. More info: https://containers.dev/features.
diff --git a/Dockerfile b/Dockerfile
index f9c0bef0..0072d9e3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,10 +2,15 @@ FROM python:3.13-slim-bullseye
 
 USER root
 
+ARG INSTALL_GIT=false
+RUN if [ "$INSTALL_GIT" = "true" ]; then \
+    apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \
+    fi
+
 # Runtime dependency
 RUN apt-get update && apt-get install -y --no-install-recommends \
     ffmpeg \
- && rm -rf /var/lib/apt/lists/*
+    && rm -rf /var/lib/apt/lists/*
 
 RUN pip install markitdown
 

From 125e206047dd4635a71c036d52c56d1655636c50 Mon Sep 17 00:00:00 2001
From: Ikko Eltociear Ashimine <eltociear@gmail.com>
Date: Sat, 21 Dec 2024 18:51:30 +0900
Subject: [PATCH 32/40] docs: update README.md (#182)

faciliate -> facilitate
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6b514159..d2314c3b 100644
--- a/README.md
+++ b/README.md
@@ -125,7 +125,7 @@ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additio
 
 ### How to Contribute
 
-You can help by looking at issues or helping review PRs. Any issue or PR is welcome, but we have also marked some as 'open for contribution' and 'open for reviewing' to help faciliate community contributions. These are ofcourse just suggestions and you are welcome to contribute in any way you like.
+You can help by looking at issues or helping review PRs. Any issue or PR is welcome, but we have also marked some as 'open for contribution' and 'open for reviewing' to help facilitate community contributions. These are of course just suggestions and you are welcome to contribute in any way you like.
 
 
 <div align="center">

From 4678c8a2a4c5f2984b2a8b3051b0376cf0c2bec4 Mon Sep 17 00:00:00 2001
From: AbSadiki <sadiki.abdeladim@gmail.com>
Date: Fri, 3 Jan 2025 16:29:26 -0500
Subject: [PATCH 33/40] fix(transcription): IS_AUDIO_TRANSCRIPTION_CAPABLE
 should be initialized (#194)

---
 src/markitdown/_markitdown.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 789c1e55..6df13e31 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -33,6 +33,7 @@
 from charset_normalizer import from_path
 
 # Optional Transcription support
+IS_AUDIO_TRANSCRIPTION_CAPABLE = False
 try:
     # Using warnings' catch_warnings to catch
     # pydub's warning of ffmpeg or avconv missing

From d248621ba4e7f4f91dba22c000a17c62b394d0c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Murat=20Can=20Kurtulu=C5=9F?= <kmt1bu@bosch.com>
Date: Sat, 4 Jan 2025 00:34:39 +0300
Subject: [PATCH 34/40] feat: outlook ".msg" file converter (#196)

* feat: outlook .msg converter
* add test, adjust docstring
---
 pyproject.toml                        |   1 +
 src/markitdown/_markitdown.py         |  75 ++++++++++++++++++++++++++
 tests/test_files/test_outlook_msg.msg | Bin 0 -> 13312 bytes
 tests/test_markitdown.py              |  13 +++++
 4 files changed, 89 insertions(+)
 create mode 100644 tests/test_files/test_outlook_msg.msg

diff --git a/pyproject.toml b/pyproject.toml
index 3e14cec8..67f68252 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,6 +35,7 @@ dependencies = [
   "pdfminer.six",
   "puremagic",
   "pydub",
+  "olefile",
   "youtube-transcript-api",
   "SpeechRecognition",
   "pathvalidate",
diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 6df13e31..d209b5e0 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -21,6 +21,7 @@
 
 import mammoth
 import markdownify
+import olefile
 import pandas as pd
 import pdfminer
 import pdfminer.high_level
@@ -1077,6 +1078,79 @@ def _get_llm_description(self, local_path, extension, client, model, prompt=None
         return response.choices[0].message.content
 
 
+class OutlookMsgConverter(DocumentConverter):
+    """Converts Outlook .msg files to markdown by extracting email metadata and content.
+
+    Uses the olefile package to parse the .msg file structure and extract:
+    - Email headers (From, To, Subject)
+    - Email body content
+    """
+
+    def convert(
+        self, local_path: str, **kwargs: Any
+    ) -> Union[None, DocumentConverterResult]:
+        # Bail if not a MSG file
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".msg":
+            return None
+
+        try:
+            msg = olefile.OleFileIO(local_path)
+            # Extract email metadata
+            md_content = "# Email Message\n\n"
+
+            # Get headers
+            headers = {
+                "From": self._get_stream_data(msg, "__substg1.0_0C1F001F"),
+                "To": self._get_stream_data(msg, "__substg1.0_0E04001F"),
+                "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"),
+            }
+
+            # Add headers to markdown
+            for key, value in headers.items():
+                if value:
+                    md_content += f"**{key}:** {value}\n"
+
+            md_content += "\n## Content\n\n"
+
+            # Get email body
+            body = self._get_stream_data(msg, "__substg1.0_1000001F")
+            if body:
+                md_content += body
+
+            msg.close()
+
+            return DocumentConverterResult(
+                title=headers.get("Subject"), text_content=md_content.strip()
+            )
+
+        except Exception as e:
+            raise FileConversionException(
+                f"Could not convert MSG file '{local_path}': {str(e)}"
+            )
+
+    def _get_stream_data(
+        self, msg: olefile.OleFileIO, stream_path: str
+    ) -> Union[str, None]:
+        """Helper to safely extract and decode stream data from the MSG file."""
+        try:
+            if msg.exists(stream_path):
+                data = msg.openstream(stream_path).read()
+                # Try UTF-16 first (common for .msg files)
+                try:
+                    return data.decode("utf-16-le").strip()
+                except UnicodeDecodeError:
+                    # Fall back to UTF-8
+                    try:
+                        return data.decode("utf-8").strip()
+                    except UnicodeDecodeError:
+                        # Last resort - ignore errors
+                        return data.decode("utf-8", errors="ignore").strip()
+        except Exception:
+            pass
+        return None
+
+
 class ZipConverter(DocumentConverter):
     """Converts ZIP files to markdown by extracting and converting all contained files.
 
@@ -1286,6 +1360,7 @@ def __init__(
         self.register_page_converter(IpynbConverter())
         self.register_page_converter(PdfConverter())
         self.register_page_converter(ZipConverter())
+        self.register_page_converter(OutlookMsgConverter())
 
     def convert(
         self, source: Union[str, requests.Response, Path], **kwargs: Any
diff --git a/tests/test_files/test_outlook_msg.msg b/tests/test_files/test_outlook_msg.msg
new file mode 100644
index 0000000000000000000000000000000000000000..05b087b77c785c8b57a479485b9715fb7dfecbb4
GIT binary patch
literal 13312
zcmeHN-BVk~6+c3bjg7ILwn=KIj;=}Trgp#*2rveM2nZVkG6mtd9nCNjlCWAKQAxyM
zJj|q-$xPdqzW6l{{rCs^(npW~gZ8mAopjpA=DE{8CG_{(s}+Li<6J|AWX6m4-g|b>
z?(dwl=j@mK*T1~{&)@y&(!b<Dx*~r0V)wM1tSk57yUWrSM9$!wWnb*>?y~s1$oPDb
zzCjk~blees#sL(WabNo9xsPR^kLX*voQG{cD~qxqeG-$RR3zgSUgBs|MoUMcvLQ*y
zNgm$|rnC%ty-lCX3-QHU3oA>L@u|sJ-`vVld}V%RIdXepa(2FN>fS;-fhfBpQ|37*
zTT+57TaaN(R(+0)K_-?ZQM!g_0enB-$5oaHWVDj^fvX7Wop!Lb`seGv)*P0aMUG0Z
z+=rz~uw@Ps6yz4P3PIYSbYq^FHX3A1=`!Rm$lIvz$Df0`45TI%L=KyFA#nD~0G>ho
zIdUD(0rSn?_K!|4B$zfmkHL<Ua$5#v6Y{(-KY=u1^gbsyA=R)vka76QAl}RJF8T~h
zO76e|*1^3bKbBv}BiQbajG(_wwA%n*ucT4)2>q_e5c+?J??|1!8`+*RQp=S5%;;7z
z(rG-6EoWdwjv*~LE)&3wqpTM?OhD=hP|J{f9a3$Z_cAPVAKz(6ejggAVa@j>3(j%$
z@cbS`S>lky%CQ9>%pQ{*Q^uzl-vvm~3%Rz<2vIhAa2};Olq603+`yY9^v8nnyd1nC
zBctF>p(pxZ+VF0}Mm@%_=w}E2Mo^N1b=OSI5Ik}O?S}CjLfIf%(nidOOTWAx_~X+*
zT)6m2K|b}Jk9S~I{gBdgUh2b>TBnUR5j;PD<Qvd6k8d0A0-sobRTd$L*OrHqz194s
z>DkPm@#u2AZ#f#9jxI%_4<;8D=cAgrll|}MtwH-T&ta{#*S>`DSTkOj!x#Ou`DpFQ
z5eM&K)}J_Lq#Sy1s?Q4OOx4fRh!O|>p2gFaTX`LD*;RuvL@Dfg=f$Hx_K7erMCT`|
z)#F5|k_{!2g>ue3);8Be{e%69Y^k{Mu(FX4If-m4m(G=)L^_kr1|KJ~#X_Z$%Adhk
zFp(`k%9YE`WGR=-kM8WCaIKgxmQbH4WzxaKdMT4B1QV(JRwg*Q(|snfwH8WjCQIqW
zW<HrKu+K^|H82ze414PjccYJM2|`zr>mx(XSUUH<lg^dKd-9c1&pSJ&3Nky7?N<<=
zwH8C+h8*#S&;b7`4>L}xnBI1ZS@T`Vlq*hVBbm!P8<}!Bxt^)(Yq-DG7iF#yL8tr7
zJ~2bVurqc8g4T&w$6c=x6h&9g&d#1odHVXgteKvKu47&=e)4-m-hKMFWY4Fabyc1|
z{Z(%JwvGMrn%UpN#c?7M3CJZwf2+nnRNMc$^}~j~TCI;@wJ<yEySM**H~x2Dhi}x-
zS4@AOoeE$DVm&5(lLA=dN&mD)XAFI_M(-H9vnptZ@8VhVe}wE8_IJt9Z&dj^e3uP<
zxmN#%q3xO#kbt3sRsMi<oZByt{{Vf$&_h-Jh@3RERQ2=ADMMR(b;;{)y}_sbE*tu4
zwSAWa_VN4WKj$qzRrJgM3{7^|zH))});=A+(}w;v0A2kD4gDK7K#ItzAo)`tJb6m{
zN2GHf-QnvpG>)^Ty<fV+<ljDb!r=Mk9}}ei{ke<pmsFNC^<%<$_(MB(Zi(mTZ#}m-
z(4f)&L9lO(Vdoo@d)N)e4z$=KbzFP=2ftQgbFq8x#cmxe_e%Ta72}7rCC9nL_g^#a
zCk*}BsSe++q2I33;4$L{(xia+3~k2(eq0+@G6clGuilRPY1GkgOefFrfE+Kz&)Co%
zKaQQ3@w0JsT;qpW{z}Hr?z8%E2GQ>e4@Tqj_2R{i4~uY6VlUX_Jv4rG{xL_0rRdb`
z;%qb)_j-z;&$|9=*R5vr&lAYZ`NGaWd-IJu|JZf0+5BVc?fmm*wLjY8I1=w}8)Z(2
zww!wf-us?K?nXX`d>)x==F7<33w;~;JIJpfv)yZW^FG-2(dg||>wEB~9lnqJ1LUj7
zKSXAmAK`ti`c1kQ-+jn8khy0GA@?H>Acv6$ksD!+qI|4L-Mc8Cs;is9_w72mzCT{*
zaW!QbUf=k%g!3V9PI>>yGiwQFMlb1AdC3P;n8euW>UJaIncE3X*)Tn$!yv8Vk9RaY
zf#z__v1)E=_U;kgJ4Bba(eo|w8GOv+wLNqrzGh{_FPpnMp2F9wcb2QwyCB|4hLGuD
zjng&kRH!`W1M}NQW3DfkGIp3+<MYl_htGWy<uIMLd2pQUL8&!<SdIUZ2mJcjX@*a&
zL$JG6f9lTEMs-t757p=&!mTgw1Vb%;7O~^c`BIBdU(t;J0PY&K_|!*>Pha6&%hb&N
z#2vs{RBM0kqqX?-l`8~CYK=dw)}L#X7GL9k)IV(ePdENtAGP?Ls~#XTwbFkGtC@EE
zxi)L@=___0(h7eVzk_J;xi8S-(^pP12#}iNPa*Dj1FxI^xF^x#zY5-u2#(YmUmyS6
z4{7l)g7<p_M{12v`|I@Qe4@p_1YX_;F}22zs>h%EKrR03;B7zua{Oo1_}qhP@o9VR
zA(@)%Pu;ox=;EK-87;nU{vp2mo2O>|;oke`@wxu%^yj``i%;L<UY)6x{X-t+AI7v6
zpLe6});~s_{nOZ==N&D60KDzye-~dr|M9G)#pl|~`%<Q6_9y;;hxLbZnil^p@cxS6
zNX_tF&)4riIHzgxuY<RK|FwxzPtp9I%>RACU+>D0i@zARKZgXX`+v@9T72GNF1p7g
zqyOFwQDf9qoQ|z8+Bdo|=~ngp^Cl>}rc|p`uV(fSV(y*6E<wBgFs8NobML`DC{t^E
z&i}ggk1?&q9|kYSfT<b2oq1_*P5d*awfLI(-^JIjKaL0do8aYsiK(^z)(TCJ@wl6D
z&%c~gj$VI#{P%dk9|14-piI>NDB_>GGr7ICe)7kX`uWd${PWh!O*Z~Hr`a{l)@k0&
z#@}oF^WILY|2TM$I{vSEz~}#tJL>r7eWh0auOj}rrfTsgz<bp3&wE-ezFvQQ{PT`j
ziyr~+amN27@DCUNd-stx{;9k5X>_knM-~6Pm)F{V8oWmx|2)rW@uT2<g3Q!Rf4lN&
z*MI&Vp~b%g-gf67uj_x%1O6;{k2?OjB5C!%tJYs1|BMJN{v3F@UuHUv^?x3?HvWwe
zhkMk1f6?k+565F5k0QK|V@&dHmiLpq&+bI#JjwZ*`7FM9&qv>B_4^;s;ye$t&F2*A
JMf%!W;Qy?8Su_9u

literal 0
HcmV?d00001

diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
index 4a981bdc..a0626d19 100644
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -63,6 +63,15 @@
     "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
 ]
 
+MSG_TEST_STRINGS = [
+    "# Email Message",
+    "**From:** test.sender@example.com",
+    "**To:** test.recipient@example.com",
+    "**Subject:** Test Email Message",
+    "## Content",
+    "This is the body of the test email message",
+]
+
 DOCX_COMMENT_TEST_STRINGS = [
     "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
     "49e168b7-d2ae-407f-a055-2167576f39a1",
@@ -232,6 +241,10 @@ def test_markitdown_local() -> None:
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
     validate_strings(result, CSV_CP932_TEST_STRINGS)
 
+    # Test MSG (Outlook email) processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg"))
+    validate_strings(result, MSG_TEST_STRINGS)
+
 
 @pytest.mark.skipif(
     skip_exiftool,

From 08ed32869eae01d0b7c39944a092b90221f81ae6 Mon Sep 17 00:00:00 2001
From: yeungadrian <47532646+yeungadrian@users.noreply.github.com>
Date: Fri, 3 Jan 2025 21:58:17 +0000
Subject: [PATCH 35/40] Feature/ Add xls support (#169)

* add xlrd
* add xls converter with tests
---
 pyproject.toml                |   1 +
 src/markitdown/_markitdown.py |  27 ++++++++++++++++++++++++++-
 tests/test_files/test.xls     | Bin 0 -> 27648 bytes
 tests/test_markitdown.py      |  12 ++++++++++++
 4 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_files/test.xls

diff --git a/pyproject.toml b/pyproject.toml
index 67f68252..9c113ade 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,6 +32,7 @@ dependencies = [
   "python-pptx",
   "pandas",
   "openpyxl",
+  "xlrd",
   "pdfminer.six",
   "puremagic",
   "pydub",
diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index d209b5e0..50c83b46 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -726,7 +726,31 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
         if extension.lower() != ".xlsx":
             return None
 
-        sheets = pd.read_excel(local_path, sheet_name=None)
+        sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
+        md_content = ""
+        for s in sheets:
+            md_content += f"## {s}\n"
+            html_content = sheets[s].to_html(index=False)
+            md_content += self._convert(html_content).text_content.strip() + "\n\n"
+
+        return DocumentConverterResult(
+            title=None,
+            text_content=md_content.strip(),
+        )
+
+
+class XlsConverter(HtmlConverter):
+    """
+    Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
+    """
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not a XLS
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".xls":
+            return None
+
+        sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
         md_content = ""
         for s in sheets:
             md_content += f"## {s}\n"
@@ -1353,6 +1377,7 @@ def __init__(
         self.register_page_converter(BingSerpConverter())
         self.register_page_converter(DocxConverter())
         self.register_page_converter(XlsxConverter())
+        self.register_page_converter(XlsConverter())
         self.register_page_converter(PptxConverter())
         self.register_page_converter(WavConverter())
         self.register_page_converter(Mp3Converter())
diff --git a/tests/test_files/test.xls b/tests/test_files/test.xls
new file mode 100644
index 0000000000000000000000000000000000000000..de4f368c24d489ff7aca786acc29d80652189eb9
GIT binary patch
literal 27648
zcmeHQ2V7J~x1U|wf+$5qipWY)L_m5`aVesxfY=K*$|6JrL{R)9fvCh9MKo5TF^C#t
z?_JbbP)t<B-ch5$9`hj@-FMF1UATL<0Pp+W`@OQ4-`u@(?wS8NbLPyMa_=q|e>1ti
zc8Api!a2GVE%I5SPjp#y9$b6N^2P-2OC*?;eR{z)kTl2tA`9pXpdod2$cTOy^o|og
z(G?K#06w>CV~&uAkVcYpVn|4ELPm0Is&KeCD|MXkzxoKFI>ho&hf;97K%O??0j&s<
z39+QQnbLDhdfrAk57ToXF(r~*L#?wT{C)kQJb{wK_(7EKTY9#m=N540k$tqxO-k_~
zZOLum7mxyUb`Y+|3nj56g(Q<Wk^yJrc>UkPaT|0Kki%#I6?!zJsWVA}TKLx0r8%cA
zLJ|Qr8b^{yCasqXj|bf9Bx*Ovfm*A_K<k;*>7>fOW({(5YmlRR0~|aeQd`^CP#5n8
zI4~lr_Z1$|ZxF+v0SrEA-2g{JzJgZQSBcS9yA9C+8N{yG&fdk|-hE8Rt|OgfnUNja
zYPW-tppD%9m;vI1QSwZ7M330iGr76T<3JvvI)Gc-ak)5XQdU3WOgsr0gQ_?SWtvH|
zBb_K=WXG;jf}N`!=sm_MXbebkBOT!nin7Z&3)zL7g>ofC#D#Q`7sk{;ULIAAB~m1+
z7wStoK~Jg|Dl1<nS5}yct}l6?m_hpx@|Q$NKbcdhP(w#c{QK>|BSOZ+c;++s81Sk6
zcS~<+ucG!>q;FF}pQD1lQU!gD3i>7$^vx>hs`Oc^g8yd~bXE9G<^Ol-vqOcR2PoZK
zp}iP>Mfx5U{QFeUcd4KsRY7O;ff*9Vf3{G5m|rpUB_<}znk{Qaw3&7>t)nL<9<x1W
zOSBn%bRmXNKYR}i^kDo`7s8oxx{C_Bvr;-n8jABy3NP(n&1CdC|7ZLVCWH+C6%tKI
zo`tMG>dVmyn4Qtnpq?&-OGP?IZVMTIoqZVpF_F=)kg;%SCBttnqu1%l_$RJZXuXO|
zOw5~^gPoavXC$KoK4WLb{<<(8Dzm4ajGv=tD;fO?ObE%2Iy<zK(K-A&K&NpA4TuNC
znP~zuGEJ7JL=mwIkE_iQxRM+>IxRq&!?>UYml%U^@PuK<xdn{We8mFD!f#3eW@6a{
z38+YuBxpCa0Gg~x60psiQ~(>dNd>S`np6P0MUx6(e`!(y?5IsDfRU+51+a@XsQ~uj
zCKbR=*`xy9n_8e#Qwz8>wSa3=3%E740L(_4W~Huj1+c@iqXsWW5X=cPFj<8FBMSoD
z0(^lq{$NjLCe}isA_&aT8wQbx)PvyWQG=RzddY)uqoy~2)Vq~Jbre;1uak=55CXe|
zZ7{wz)E{r0e)X$Fb^V>>K#GX2auCTGbs&E7AVXDXBB?H7X|jmZoIU`eN1GG17KtXS
z(4>4t9f+Q^in+PD5~1|KW=8-vO`k=BTl6qcLdy|wl?nh}wo)SHO1*viRzWF|NTi??
zn<i5VX;PhtOer=^Q7Kp&vXuIeqr%+Elww=uU$YdqH~NB7Odmk&D<~xsCduQ~SxT{H
z&UH0I|HnhAOP7?C)Pcx+fNfftFgC66=H_<HKv09}m{>DtD|sJKtc5?YF;0WEP?-P7
zI;KQYu274eJ9jEL4Vxyb1=0YU9x|t4(-h}eoq@N$7U&0Zrp*xy=JdyIP(ZsbyeiZJ
z?T*!B2Q{?2j3(=kigC*hO4Ub8$S5|l3`BL@!j(xqh=O)#V=W?1WNW9W2`;x91SyYD
z4+1+&4TErlKezYT%Yi^fxlM4AsoEyE>S(Y!;zaczu#wg<NO^>M5ZFj-7{t>{J&1!G
z2xOGo1n1GJZGtP+M%Y9>2&|VH1}TqF57M?VAf8_8K^)~kAfwzSxKvPW6I}H*!Y1lL
zEE@w-9-$t@wlN@{Ug|;G%Yi^fxlM32q}nECav&<>iFy!N=QnKF@(A@H!p4AjdZ`EL
zAO`{&<u<_upK6=nu0bRGL_LUgV?fFy)PuBZ42Y)}8)Pxxlo-SCMLgXdg|5Qj_;_(@
zrbnS5k`$upqz6yCt)w|@)4LN#A|$Smf=N95iAgGbu!IKS!2?7+ZE@nA>Kcg-UZZ!w
zEWi;rGf<2PSB$5k7&|p$m^BC$qa#lI`_Wb9VobSWycET>S0jd5rBE@t;=~s(UMLsS
zk}JkrQH--1F}RMA8ck1}D0y41T#Ol4jE|xiH#K5#xgZrI5GUSv@K(8)R$MW@iefy~
zh+(F+)M)zR#D}kLDi>ppV!YLRnk|U&ZiJ_mM;y8KQn{GcTro;K&0dWdwx^Xxyg$E1
zxflzs7$u(8L5&!;r<F(Cx^+vr7)!1gC7#wvjTpA4l}AXPtWYk-iYrEmr*&2%hV5zP
z5xY<Su3U^YSBw%*^HL**?P=u^r|<7pE~X8N@m23>LJ;HI2v76$T6N-yaxpesF-knm
zL5&!;r+Iq4+dfmdn6_LoN<7U;jTpA4d3u$Vl_?iv%N3)<(_GYuVSAdVm*hsGaxp@#
z7$u(Ou0{;o(>%Qv{G_7awBw3V;%VM$#IQZh)9a^;3zXHy5T0|xysK{94CaiRHWEFR
zV<jOmt11~#94O(Cj+KVQz8(}jiVNC~1IqRyLSE!}C<k?@2L=D(f;w_Q**-%E?j_Js
zovW#1Jt%k!7qlY>l<gr<<x-*?w0%7&_y!l$l>^H52SR#e{GuGRLp>;X0vFVS1IliF
zLejVNQU)p@PB6NXkl2JmL4bVA^Q<LWBm}k_6Y54_9o!Uw9YVAtUgI{Rw8f3#$scU`
zhsLJFXQjkuCa0wq3ci*H<eN*H5#~*$c#;M0E5*XpE19Ih6b$!KaO_1;+|H5-!0ja}
zK$pNHCAf+E89ZQS=LB*&eYxCXz7f2h0go?2lf?1kg`wh<ltMu+N3Yfr1JEl8CWG;0
zJWLHk;VK3G*jg!dADLFT|D;=|6_b-`#pGmKF}cOCY1=7JOHwEh%B!NI1yv6PlT3iW
zSWtiv<aNo@1zsl(uP(g$SkEgYF)$L0sw?A~rwwJ=bINE-1W+amUK7ipbKiI}wXTF)
zo*tBN<CM^o7%G&Im(_}Nh8Z{s%T7;;O^wY=%a|%0B+kw(6!emZ;`>O9Nf^n7H}$aX
zQ)&CBfg)4E=7T_;Z14x%yppv(Hn!Bg^r3FBJ<}^K4IH(rJTA{lqD^{%(b71siaX*m
zED$EcY5}jmzzQN<9Gj2~ZwbO$fupFFF<3Sn6hhxm21Un#g3MD0gwul#(__#V3>><m
z9%#k{!o~a+u*hJ6L+s;%n<xQi;-3m|iD+~AA5akl_hN%X4CI2FDgkHWpbBtiYQU%9
z-fVD)g`9w()lvyK#zUsX<jk^|Z$NyYAxZDl^ek|NEcqtmn?ZeggJsjf7qQXnyT1hX
z{bh}Y4+s9m?r#heB#289$*JST3DW-Gm!pgY#3g*oFco^g7@C|N0krD8u`U+Lq7QCy
z>w@77i67`sqQqI58L<#vIDwhh1_E;w0Ao!vz^5><Fe@4wPvK#StiDC?L`CZG;8J-6
zx}%r8t~>%FqrlTMXn;T{sr7kq3DLQ(zxZQ+iAu{97YgF!G57}1Ul1b`lp;_+S>(ly
zYFS9bP3HPQ%j9GsjmgPE8k1YhZ$<(LY-;t(%A_qmg(I{TK=-3IP;7H{$j1mN*XBs5
z*M!N*G+}ZwO_&@8%r1F4N`jvb1ixsga|X1qZ0V0iLuc)Z&N?VLGX?DBA+NfY5j4;s
z@Q+MrI<%Ogzw;2H8-+;AL?7VnK4@7|2sAm<{@T>7<aQX)^0G+6)K`{6Co>^|5PHba
z*o;&RT1<cy$t%il2C;f5NXP(Jmc=b)ffa{wYU2fhVPpYoo1_40X99nN03?pqb|9!c
zm0m?c-7y?yf#-RG9h1pq>O=J$>^F6N$yoZ%az51R6h0#c-x!!A7H4|olMbXJ7%8^I
zJ;2@9-NVz{HO@!u=j!d};pZBV;O^(@74PotD~?+!upld0>M=gfJZdS1m%@u#_YiPC
z_&g8ZkUY*4KqAEFj~3)_vIxh2nxIF7rlkZkA0f;~DD%++nm;%tJt-ElA>zzf$o7hz
zFabY%h*L6S9pJ%hVq&~sLTrMok7q)>t9M+0ziU984`e;WzCH>5vHr0M?!e}onCR`{
z=N0Gb?HTI>b@KNHHje<;MDKujfbdQT@Jf)t+us7x8y>bpo06IE*$Y171?G_8LVB1>
zKlx;-U~#jhtUGuky04bD8CVs`?Z~scM#rF8NgH@})e=5qLeo++VJB?NpsDHNNu%7d
zC!}m%bUx~&yKz`e_}?=gyY?yC86<FgHvP<kGe1r%J-foeyzKhU4TC?{L|h&evQcQB
z;&Q{QW_8!9j3c&sdrmGd+R$g)+mKEU)z-a6mE7~|cVo+tIbT~@bc-!9TK?|f(Zf!D
zv+{b+SvjEO%O~mMB6bEYD9EraSa<n@KmWYX<o0==@>*QT>VMm!dd?#Ms<O~)kG1x0
zw;2}r@^S5Qhld5b4~l%FHunh?%-B_0c;)%V+MZ?RA#Udmbok3{U8gx~FAP61sMd1(
z)6rs|OFkR#3g0%HZ+ddO;hZtQjT2hBJ~`HP^_=Iot`1K*d}qPV1=D;-mq#6!n2r5Q
z^t5F~d2UQDSVRI-S$TVvT;1?{DcH#u+yq}%8IvJSnbg@0&odYP_Qgqe!&$HLkIcA~
z)Bo~G3&+WwOA10e7w3=%PPs{T*0-#yZ}mH)Yj8*~bKR?&p8`K`omcx{$?rn*m78==
zJ`5F|PkJyp@L2Bjvix-$90f7i>E9H8d27#@!_k+DyIt_GE(zc3{++4O>>K?K57-)#
zaH-7ra-XnU-f<4O8x~C*bIo@7YRmh+5fy)#54h$xcI(28Ws+Sd-OE?^JD$7E<#CvH
zs^5}`<VSI9H+HTXc%)#Xc+-`(7x(P1z4{5NqFB-KQ<s|0fC|oqgpaT?lf)CmomuBd
zn6!-(XP}1|mPZ<#6dDIT$XPq3;D%rPA*Zmoo#(e6`X>o3vI;))!{s%%u05`r;rHu3
zv&vO{n-6y>(rZ3fe8Fw@zSq-SHdt@G8Q$iUOUi>Yt)4u~wkX^EZOePT&ra%}V_#X2
zzh%!I;nh8zD%-tsxik8J!xw?`dJTPAQPc5ZSaqlN27G_z$`@C==Qe{uWimX4z}~=8
zo2h~as{MTUg?ZL~X2)885E)KNOL_Ide)PlUhd!U(dm}RB`MO6=zn%4(>r)(DIHfA4
zchnIxgXCu=7h{Y{ye|fYcK#G^ZhG_G!l#Q9yDd!#_;`H4<w75qwijLnetLIz)xE{1
z{ZhO4-Z*9c0inx__j|?<zIQUYG^DVnjpTUJ>$0qSElo?>{pfPaCU-%8?xdQkg{eQ-
zSXOC&Id;uln=5bPJMQ|vPnfp*{9k;QJ}`S3Y0<A>X5Y4hLl#7priFBhsr+JBp|J47
z=((opUrzCC$NOi`b0x<PrQh9<zM{`sUH-XQ=Y00Mc>5MzH47;ZTbA+4;?e@6KW9Eq
zn7JnWUU7K%{!4Fu{_xwj=XW>$@coI|=bG>R=1!2L)#UtO|2vl_y-gY!HDa6J9~X{X
zKBgOeU_tf14;Q}L^=<>tNien~pmoK#_+v-zjW-*nwIny|{mVW7xaHF<?ft#cRfn%d
z+cw)4d_eNS>MQrBA!e?JbsxV!ZFO{X+JvlX^U*UhrcG*oZ_~@a{%{2@565Sj%|}h}
z-gjtT<<bKEV!f#BL;Y=c?=|?nui2o`;iJBotKawW`VoCE#-G+B(Lr+^dItvl_F7vf
z`qZc%Wy@Q>D(K;1Hu{?Z?Z@svIM`;;z6ZxXT=~n-#$@-5MX_75c`bMNu6v(tci&;L
zjl}EqAJzrewvGP2d(PAk@Ag%U`z>nC=K<wgMh%~==k#sFp?#vSKGoEW$<F=s;pDB~
z?>!p0?P30$+&}+ZmiCpTYShcvQ|F@(fBxsGRm(b$&aJ6jBMO|7eXlAh<wP%s<0lTy
z=y&*|%hx(x-dA}XTYk!^=Px^hJl;><|ML8G{@oS!f1G#`-*u)@z{dH_vd91P`ok{s
zYa>k0l<%whh2PerXN~0)-TQl2o}c|XX=%@wWwqtM-0Ai7y3dr>B}1lv+-3OayZs|4
z$ERF6+^fHH|Cs|_0<#uIdQ9H@(xlyxkqg61SB!1>EYR+5ce`H;=AXUpysqRDueY%C
zq}IXVds?k3o^`i-@z%jLuXlVq^9;YgUaiUcAr@(CSM->3{;|OEisAcqg*)awO8-@>
z<+awPo;I=P4u3T}KmEq-$J<BSKid}^dplY5<Db9u%;>$|ZOyFN9xZRgSESA_EYG(o
z^!DAcW^TtNPH$g&JezLo@ln73wF12*sS!1y)3;=<an&{o8X4K=Jiq^x#M_I`$1ETI
zSJ$En{Sza^K_6Xv_Y42W`lH*NzO1^}fAX1wTYFaAS+HgE$mk0-{u8?o)azt*RKLb{
z(95<fOKRM^xm15&JtIBWI8~C{`Sz;SkAGfPT@o@Q`{%&%M~7bPT9Z(9Iw!=*=3Kz3
zSs}0P=)5aQCo|4P*^Zbvwcvc*`%SA8mY1FgYm;of(5p6J(Hr-kmMO2tAI&hGb0W*(
z@O=I2HPx{j((*U#dNez~cIuuX1IJa2?516K{>>M~ZD#HIru@={t8Wd59K4odpEG2%
zarEBS?P4Pm7hj)TkmRa6xZqt`o9R!4YllvDbh+8W{MvUP!!}(VM)r7i+CK5^*h7_X
zth3yC_WlFf%?<r-L$}I#&;LjmEne@ZQ?ZF$4|8!IyW-j2hX*H4{UU4XPv6^DnClnc
zPBHD`d2rSoi~Ri?qkR6+)~fY|tS?s#n77&>f3;Dc@x6<)K8<p5TQz=rOr`G1m3@K-
zzFJr1x6eA-lXtGmXYF4<z4j^lVe-tIukEakpWhYq_V2Hc&EJ_Y@@!kviEYzeyBx4`
z=n?m_e_`gdJ%4*P_pY@1@j&I?Amh6+850V=tNFH#!<{J4)e-5!3%jlDoiEoqI!)W!
z>eoXz|9(4GT$WhyGPcd??D#HKH6}}j6%QWBGn|umO|+-m%64DZtSBjPtxhZPXti_v
z(da8aOBUL%Jhkzu)B5GVr@KCOw8<HKx^%zuRYRUn$N=vX6W3X7wjA{Hs->gsf8V~N
zdi8{LcH?G0**R{+rXLTkk2$s3(tN^L$EC~0zt#OYY<6%(d5blDYc@UF+j_*lo&B5+
zX=@*zzoxi)b#Xh3D@zv-zhYLEWjSQ!sQEXK#P2Be+W7s3-WHqgjod$KPRtGaqKf19
zOS*`zZyDlh7-{~ok4^HYle21XX<LhQ-#k4YbKmm2kPW6^O}#Jh?GbvULq~Dws+XT(
zO|CfhHNRA_W-kmI)-W>SlT!BZ!3_A6!Wsdk6M#sA6Ygd+Yo<>Tq@`6HG+xyH&g_d@
zu4v7C<kQ?&<k0{5i{@Kh1a7lpkFH2QwCww`mhql<J(4zA3A#M{ZsN-06ZSt}5H!F%
zYR-+W3ERg`Uy$baqSd%XeVroPI}C7g^H`qIqC=m>MHlA|s?FQ#U%YV0BfH}D!DkO$
z46n(``Qgnf-n3H=N8*gXtR$98qgP6Hee$=OGp@6(`KZtRgD2g{*KzvsboqVzRhuHT
z&&3=(_{%-07*ll~-xXemQ8gn3HrAnJei+?X9)LSVsxR#h_GC?%xCOaWj{1U@Fc)Jt
z0@wUpB<}cMk91WT5}gfjF}O}pkI@P?YS`MSaf#@@I{3l{tlD(QD5$+LE-A<=Sc9x$
zP0Qg1nGdx2EE@75GxH@@W4SM}HshP&ZF=ClC%53;*nnUH+y3~aPeV8-5F>hR0%wp4
zll{McdUH84E?P8((w!;26FpC-lsvj1U@#qFDNKk9?`ybo92N_Nw1PB)a?hma_Q0J;
zk0&Dh&YSGosSf3-s3YroEj%sI0z@9@$HN^4IK1dlA(DL>9V>!;$0CjmD690;4Y#D|
zF&MNkAUSjyM0k|~eCHYt*y+LpZNj{HCg9_XV0i6<MusJz5kVSexHPmBmj>ZMfri?1
z`Cz%HK*NUT^1*(O0u3w7<pU2>prJQ#`EUusrQy<$OM_>E3N*A8mk;g8rJ)76G<=ne
zOEXnM!zCq`5B-TtL;vE^aN^FT;oE0y+9%pa2<Z+~LYN#6a@ZbBjt@C(2_~lnIc$R?
z@KtB@0jO`#kRUkc4Z<@SPR~w3T9gAl0;vO$D&-5J9AQ!p^bh2SqSV3k>?J)j7^oL=
zv_Wc6e|pb>o;{=(s4LtnrKP1(7^J>Z4%8np{Af92JTrVGK+1u20%{(E&y?<me0h;l
z4)ij}m(qIXh0uFpcqTql4)i(XU@+Y1J!%^gB;`O4#1h{09>r&_OQWS6=#PjIO*uU2
znL!Gda-er21|!?7o`Yh11^Z#>tH==<gl*Ow+Q|qvn4v}R4HrJW=S$1kNiorrp+sId
zQiF!ldlvK@D&;^t<X~F48@=~hgcKPEdO74vY3aPal!NLIWtf&kpGTbD^j<GK=MAIh
z!BP(F0mva@aWH%`4D1ic(E~Yx7%MUUB9dZY?|^%Z9#W1#iq}udfqexrXglRa(0jr3
zEb|xaImp3iN2TO3t>q%cz<z{M7@wncW+W2S^cuW};tOtB&hv-3t^ChZ3B5btl<cMB
zT@c{ofm+Oik!_w_R5yTcQX3I11KLe_(8zG5vjUp7QZP=?nv7#H!+?g*hqZb%ZZyQc
z)GQQc8NmNPcQlNI=%XdX_=70{(l7`$m}E2@FL`i>887*8Zx~-TPRc%cFjoG1#!Ivs
z##Q)ni2sK1l8M9^v+-bZ7_+hNOb%l<#$G0eF&pD7D`pegccBNNe+lT>SnAnW4yjCJ
zVWI$yy+kPGki~2}T8@du?ddfW_bsR;5W|^LUFg{@h)?y=23=qd0e7##7B$W)qKT}R
zQ2&QJpcp+j;QywyT097AaHZ8K|ChpQJ{n_z&#<v=#7DQ1Kag@if6d$4eo4kZFA|VW
z9d_Mt2ijpsIFRD~AV}CD!XaT}kA;NcJsT1ZVMUN|pxOutgZger=z7K-2y||oa^sK6
z#7+?QOv}nh71O8X_4N85n*lb;k6Wyf$?+L!lhP71g^{uG$*GxXlahpC+415OxEBP8
zWKX(%Xw5|)UguptzW4TkTRe1T&7rY?#sV4(Xe^+yfW`tE3ur8$v4F+`8VhJFps~RJ
zXA7uX|MM#^R2I2e8!ui4>wlNpt+@Vw3JEuwas3|+3AccTLBh>a+@!<JZCs1v+CCE!
zuKTA!!tK_XkZ?a>J|x^9I0=cpAAoxacrfdAV~T}&#KeR9*%pLgdEAd+-hU)QxNP9X
zM`9=~hFc8?IW#R}{3LvRauSt{h?+xV0gVMT7SLEgV*!l?G#1cUKw|-o1vD1WSU_We
z|63Nobv&+kam|hEZ+y)K-%i3cH9phFwLZSmh6`m}v*Y?*`YIV+<Kqio_{<;I`S@BE
zzIcUeecTVgS4`o{MflU*1b@N`pWoxJ+Tp_67LpKBJ4kkr@P7z7z^5am_K@He0r>d@
zg8#w+eq?~T?*gB$klY}3hU5+jzdG&-3Cn+igrAzD@fH{W-!sLp&I;kz1Mug%@D~G`
za+nhx`tlh{L_ZP=nEjJd2BfDlEZP5y!h1tOdIo$qJdWxyUKzVu8vGBSI{B!BT4vVG
ziXKcxhYY0Oy2YP_z`ElX=M$+;NU8q~4OUUPODB+zeF*zcW98c;0&bAOp+kCrH2lqu
W3Gf|YY=Nff<Nu$ie<u_ymH!`&m{tJ*

literal 0
HcmV?d00001

diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
index a0626d19..1ac9041e 100644
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -54,6 +54,12 @@
     "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
 ]
 
+XLS_TEST_STRINGS = [
+    "## 09060124-b5e7-4717-9d07-3c046eb",
+    "6ff4173b-42a5-4784-9b19-f49caff4d93d",
+    "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
+]
+
 DOCX_TEST_STRINGS = [
     "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
     "49e168b7-d2ae-407f-a055-2167576f39a1",
@@ -185,6 +191,12 @@ def test_markitdown_local() -> None:
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
     validate_strings(result, XLSX_TEST_STRINGS)
 
+    # Test XLS processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xls"))
+    for test_string in XLS_TEST_STRINGS:
+        text_content = result.text_content.replace("\\", "")
+        assert test_string in text_content
+
     # Test DOCX processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
     validate_strings(result, DOCX_TEST_STRINGS)

From 731b39e7f5d36469b2912ed1608fd86c04a1ddcc Mon Sep 17 00:00:00 2001
From: afourney <adamfo@microsoft.com>
Date: Fri, 3 Jan 2025 14:34:33 -0800
Subject: [PATCH 36/40] Added a test for leading spaces. (#258)

---
 tests/test_markitdown.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
index 1ac9041e..9dc7374a 100644
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -257,6 +257,11 @@ def test_markitdown_local() -> None:
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg"))
     validate_strings(result, MSG_TEST_STRINGS)
 
+    # Test input with leading blank characters
+    input_data = b"   \n\n\n<html><body><h1>Test</h1></body></html>"
+    result = markitdown.convert_stream(io.BytesIO(input_data), file_extension=".html")
+    assert "# Test" in result.text_content
+
 
 @pytest.mark.skipif(
     skip_exiftool,

From 436407288f01b5a2c31111062b0c2ac959dad443 Mon Sep 17 00:00:00 2001
From: afourney <adamfo@microsoft.com>
Date: Fri, 3 Jan 2025 16:03:11 -0800
Subject: [PATCH 37/40] If puremagic has no guesses, try again after ltrim.
 (#260)

---
 src/markitdown/_markitdown.py | 19 +++++++++++++++++++
 tests/test_markitdown.py      |  2 +-
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 50c83b46..aceaa86d 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -1594,6 +1594,25 @@ def _guess_ext_magic(self, path):
         # Use puremagic to guess
         try:
             guesses = puremagic.magic_file(path)
+
+            # Fix for: https://github.com/microsoft/markitdown/issues/222
+            # If there are no guesses, then try again after trimming leading ASCII whitespace.
+            # ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
+            # (space, tab, newline, carriage return, vertical tab, form feed).
+            if len(guesses) == 0:
+                with open(path, "rb") as file:
+                    while True:
+                        char = file.read(1)
+                        if not char:  # End of file
+                            break
+                        if not char.isspace():
+                            file.seek(file.tell() - 1)
+                            break
+                    try:
+                        guesses = puremagic.magic_stream(file)
+                    except puremagic.main.PureError:
+                        pass
+
             extensions = list()
             for g in guesses:
                 ext = g.extension.strip()
diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
index 9dc7374a..e2d2e75f 100644
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -259,7 +259,7 @@ def test_markitdown_local() -> None:
 
     # Test input with leading blank characters
     input_data = b"   \n\n\n<html><body><h1>Test</h1></body></html>"
-    result = markitdown.convert_stream(io.BytesIO(input_data), file_extension=".html")
+    result = markitdown.convert_stream(io.BytesIO(input_data))
     assert "# Test" in result.text_content
 
 

From 05b78e7ce18cf2f8d8d75058a1f2c98f9930318b Mon Sep 17 00:00:00 2001
From: afourney <adamfo@microsoft.com>
Date: Fri, 3 Jan 2025 16:40:43 -0800
Subject: [PATCH 38/40] Recognize json as plain text (if no other handlers are
 present). (#261)

* Recognize json as plain text (if no other handlers are present).
---
 src/markitdown/_markitdown.py |  5 ++++-
 tests/test_files/test.json    | 10 ++++++++++
 tests/test_markitdown.py      |  9 +++++++++
 3 files changed, 23 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_files/test.json

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index aceaa86d..b6acfe80 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -173,7 +173,10 @@ def convert(
         # Only accept text files
         if content_type is None:
             return None
-        elif "text/" not in content_type.lower():
+        elif all(
+            not content_type.lower().startswith(type_prefix)
+            for type_prefix in ["text/", "application/json"]
+        ):
             return None
 
         text_content = str(from_path(local_path).best())
diff --git a/tests/test_files/test.json b/tests/test_files/test.json
new file mode 100644
index 00000000..eba30594
--- /dev/null
+++ b/tests/test_files/test.json
@@ -0,0 +1,10 @@
+{
+    "key1": "string_value",
+    "key2": 1234,
+    "key3": [
+        "list_value1",
+        "list_value2"
+    ],
+    "5b64c88c-b3c3-4510-bcb8-da0b200602d8": "uuid_key",
+    "uuid_value": "9700dc99-6685-40b4-9a3a-5e406dcb37f3"
+}
diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
index e2d2e75f..3333bcbc 100644
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -145,6 +145,11 @@
     "5bda1dd6",
 ]
 
+JSON_TEST_STRINGS = [
+    "5b64c88c-b3c3-4510-bcb8-da0b200602d8",
+    "9700dc99-6685-40b4-9a3a-5e406dcb37f3",
+]
+
 
 # --- Helper Functions ---
 def validate_strings(result, expected_strings, exclude_strings=None):
@@ -257,6 +262,10 @@ def test_markitdown_local() -> None:
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg"))
     validate_strings(result, MSG_TEST_STRINGS)
 
+    # Test JSON processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.json"))
+    validate_strings(result, JSON_TEST_STRINGS)
+
     # Test input with leading blank characters
     input_data = b"   \n\n\n<html><body><h1>Test</h1></body></html>"
     result = markitdown.convert_stream(io.BytesIO(input_data))

From 265aea2edf31bf1b022992e59f0ade1e54903aee Mon Sep 17 00:00:00 2001
From: afourney <adamfo@microsoft.com>
Date: Mon, 6 Jan 2025 09:06:21 -0800
Subject: [PATCH 39/40] Removed the holiday away message from README.md (#266)

---
 README.md | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/README.md b/README.md
index d2314c3b..6bc91e6c 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,3 @@
-> [!IMPORTANT]
-> (12/19/24) Hello! MarkItDown team members will be resting and recharging with family and friends over the holiday period. Activity/responses on the project may be delayed during the period of Dec 21-Jan 06. We will be excited to engage with you in the new year!
-
 # MarkItDown
 
 [![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/)

From f58a864951da6c720d3e10987371133c67db296a Mon Sep 17 00:00:00 2001
From: afourney <adamfo@microsoft.com>
Date: Mon, 6 Jan 2025 12:43:47 -0800
Subject: [PATCH 40/40] Set exiftool path explicitly. (#267)

---
 src/markitdown/_markitdown.py | 39 ++++++++++++++++++++++++++---------
 tests/test_markitdown.py      | 32 ++++++++++++++++++++++------
 2 files changed, 55 insertions(+), 16 deletions(-)

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index b6acfe80..33806e13 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -892,14 +892,25 @@ class MediaConverter(DocumentConverter):
     Abstract class for multi-modal media (e.g., images and audio)
     """
 
-    def _get_metadata(self, local_path):
-        exiftool = shutil.which("exiftool")
-        if not exiftool:
+    def _get_metadata(self, local_path, exiftool_path=None):
+        if not exiftool_path:
+            which_exiftool = shutil.which("exiftool")
+            if which_exiftool:
+                warn(
+                    f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown constructor. E.g., 
+
+    md = MarkItDown(exiftool_path="{which_exiftool}")
+
+This warning will be removed in future releases.
+""",
+                    DeprecationWarning,
+                )
+
             return None
         else:
             try:
                 result = subprocess.run(
-                    [exiftool, "-json", local_path], capture_output=True, text=True
+                    [exiftool_path, "-json", local_path], capture_output=True, text=True
                 ).stdout
                 return json.loads(result)[0]
             except Exception:
@@ -920,7 +931,7 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
         md_content = ""
 
         # Add metadata
-        metadata = self._get_metadata(local_path)
+        metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
         if metadata:
             for f in [
                 "Title",
@@ -975,7 +986,7 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
         md_content = ""
 
         # Add metadata
-        metadata = self._get_metadata(local_path)
+        metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
         if metadata:
             for f in [
                 "Title",
@@ -1036,7 +1047,7 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
         md_content = ""
 
         # Add metadata
-        metadata = self._get_metadata(local_path)
+        metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
         if metadata:
             for f in [
                 "ImageSize",
@@ -1325,6 +1336,7 @@ def __init__(
         llm_client: Optional[Any] = None,
         llm_model: Optional[str] = None,
         style_map: Optional[str] = None,
+        exiftool_path: Optional[str] = None,
         # Deprecated
         mlm_client: Optional[Any] = None,
         mlm_model: Optional[str] = None,
@@ -1334,6 +1346,9 @@ def __init__(
         else:
             self._requests_session = requests_session
 
+        if exiftool_path is None:
+            exiftool_path = os.environ.get("EXIFTOOL_PATH")
+
         # Handle deprecation notices
         #############################
         if mlm_client is not None:
@@ -1366,6 +1381,7 @@ def __init__(
         self._llm_client = llm_client
         self._llm_model = llm_model
         self._style_map = style_map
+        self._exiftool_path = exiftool_path
 
         self._page_converters: List[DocumentConverter] = []
 
@@ -1549,12 +1565,15 @@ def _convert(
                 if "llm_model" not in _kwargs and self._llm_model is not None:
                     _kwargs["llm_model"] = self._llm_model
 
-                # Add the list of converters for nested processing
-                _kwargs["_parent_converters"] = self._page_converters
-
                 if "style_map" not in _kwargs and self._style_map is not None:
                     _kwargs["style_map"] = self._style_map
 
+                if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
+                    _kwargs["exiftool_path"] = self._exiftool_path
+
+                # Add the list of converters for nested processing
+                _kwargs["_parent_converters"] = self._page_converters
+
                 # If we hit an error log it and keep trying
                 try:
                     res = converter.convert(local_path, **_kwargs)
diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
index 3333bcbc..689d6f31 100644
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -277,9 +277,29 @@ def test_markitdown_local() -> None:
     reason="do not run if exiftool is not installed",
 )
 def test_markitdown_exiftool() -> None:
-    markitdown = MarkItDown()
+    # Test that the automatic discovery of exiftool throws a warning
+    # and is disabled
+    try:
+        with catch_warnings(record=True) as w:
+            markitdown = MarkItDown()
+            result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
+            assert len(w) == 1
+            assert w[0].category is DeprecationWarning
+            assert result.text_content.strip() == ""
+    finally:
+        resetwarnings()
 
-    # Test JPG metadata processing
+    # Test explicitly setting the location of exiftool
+    which_exiftool = shutil.which("exiftool")
+    markitdown = MarkItDown(exiftool_path=which_exiftool)
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
+    for key in JPG_TEST_EXIFTOOL:
+        target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
+        assert target in result.text_content
+
+    # Test setting the exiftool path through an environment variable
+    os.environ["EXIFTOOL_PATH"] = which_exiftool
+    markitdown = MarkItDown()
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
     for key in JPG_TEST_EXIFTOOL:
         target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
@@ -341,8 +361,8 @@ def test_markitdown_llm() -> None:
 
 if __name__ == "__main__":
     """Runs this file's tests from the command line."""
-    test_markitdown_remote()
-    test_markitdown_local()
+    # test_markitdown_remote()
+    # test_markitdown_local()
     test_markitdown_exiftool()
-    test_markitdown_deprecation()
-    test_markitdown_llm()
+    # test_markitdown_deprecation()
+    # test_markitdown_llm()