From 6da4b08f8c3c3159d411db05b6227b53deeaff25 Mon Sep 17 00:00:00 2001
From: lumin <baolong1027@icloud.com>
Date: Thu, 26 Dec 2024 19:15:07 +0900
Subject: [PATCH 1/3] feat(tests): add comprehensive tests for MarkItDown
 functionality

Add new test cases for MarkItDown to cover LLM, remote, and
local file conversions. Implement tests for handling
deprecation warnings, external URL queries, and EXIF data
processing. Ensure tests are skipped when necessary
environment conditions are not met, improving test reliability
and maintainability.
---
 tests/core/__init__.py            |   0
 tests/core/test_external_tools.py |  34 +++++++
 tests/core/test_llm.py            |  73 +++++++++++++++
 tests/core/test_local_file.py     | 151 ++++++++++++++++++++++++++++++
 tests/core/test_remote.py         |  46 +++++++++
 tests/helpers/__init__.py         |   0
 tests/helpers/utils.py            |  11 +++
 7 files changed, 315 insertions(+)
 create mode 100644 tests/core/__init__.py
 create mode 100644 tests/core/test_external_tools.py
 create mode 100644 tests/core/test_llm.py
 create mode 100644 tests/core/test_local_file.py
 create mode 100644 tests/core/test_remote.py
 create mode 100644 tests/helpers/__init__.py
 create mode 100644 tests/helpers/utils.py

diff --git a/tests/core/__init__.py b/tests/core/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/core/test_external_tools.py b/tests/core/test_external_tools.py
new file mode 100644
index 0000000..a7787b8
--- /dev/null
+++ b/tests/core/test_external_tools.py
@@ -0,0 +1,34 @@
+import os
+import shutil
+from pathlib import Path
+
+import pytest
+
+from markitdown import MarkItDown
+
+TEST_FILES_DIR = Path(__file__).parent.parent / "test_files"
+
+# Skip exiftool tests if not installed
+skip_exiftool = shutil.which("exiftool") is None
+
+JPG_TEST_EXIFTOOL = {
+    "Author": "AutoGen Authors",
+    "Title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+    "Description": "AutoGen enables diverse LLM-based applications",
+    "ImageSize": "1615x1967",
+    "DateTimeOriginal": "2024:03:14 22:10:00",
+}
+
+
+@pytest.mark.skipif(
+    skip_exiftool,
+    reason="do not run if exiftool is not installed",
+)
+def test_markitdown_exiftool() -> None:
+    """Check that the expected exiftool fields of test.jpg appear in the output."""
+    converter = MarkItDown()
+    jpg_path = os.path.join(TEST_FILES_DIR, "test.jpg")
+    converted = converter.convert(jpg_path)
+    # Every expected field must be rendered as a "Name: value" entry.
+    for tag, value in JPG_TEST_EXIFTOOL.items():
+        assert f"{tag}: {value}" in converted.text_content
diff --git a/tests/core/test_llm.py b/tests/core/test_llm.py
new file mode 100644
index 0000000..2958fe6
--- /dev/null
+++ b/tests/core/test_llm.py
@@ -0,0 +1,73 @@
+import os
+from pathlib import Path
+from warnings import catch_warnings, resetwarnings
+
+import pytest
+
+from markitdown import MarkItDown
+
+TEST_FILES_DIR = Path(__file__).parent.parent / "test_files"
+
+# Skip the llm tests unless both an API key and the client library are present
+skip_llm = not os.environ.get("OPENAI_API_KEY")
+try:
+    import openai
+except ModuleNotFoundError:
+    skip_llm = True
+
+LLM_TEST_STRINGS = [
+    "5bda1dd6",
+]
+
+
+def test_markitdown_deprecation() -> None:
+    """Check the deprecated ``mlm_*`` keyword arguments of ``MarkItDown``.
+
+    Each deprecated argument must emit exactly one ``DeprecationWarning`` and
+    still be stored on the instance; combining it with its ``llm_*``
+    replacement must raise ``ValueError``.
+    """
+    test_client = object()
+
+    # pytest.warns records warnings only inside its block and restores the
+    # warning filters on exit, so global warning state is left untouched.
+    with pytest.warns(DeprecationWarning) as record:
+        markitdown = MarkItDown(mlm_client=test_client)
+    assert len(record) == 1
+    assert record[0].category is DeprecationWarning
+    assert markitdown._llm_client == test_client
+
+    # Same contract for the deprecated model-name argument.
+    with pytest.warns(DeprecationWarning) as record:
+        markitdown = MarkItDown(mlm_model="gpt-4o")
+    assert len(record) == 1
+    assert record[0].category is DeprecationWarning
+    assert markitdown._llm_model == "gpt-4o"
+
+    # Passing a deprecated argument together with its replacement is rejected;
+    # pytest.raises fails the test if no ValueError is raised.
+    with pytest.raises(ValueError):
+        MarkItDown(mlm_client=test_client, llm_client=test_client)
+
+    # Same check for the model-name pair.
+    with pytest.raises(ValueError):
+        MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o")
+
+
+@pytest.mark.skipif(
+    skip_llm,
+    reason="do not run llm tests without a key",
+)
+def test_markitdown_llm() -> None:
+    """Convert test_llm.jpg with an OpenAI-backed MarkItDown and check the text."""
+    converter = MarkItDown(llm_client=openai.OpenAI(), llm_model="gpt-4o")
+    image_path = os.path.join(TEST_FILES_DIR, "test_llm.jpg")
+    text = converter.convert(image_path).text_content
+
+    for expected in LLM_TEST_STRINGS:
+        assert expected in text
+
+    # Loose keyword check: it would also accept "red square", "blue circle",
+    # "the square is not blue", etc., but that is sufficient for this test.
+    for keyword in ("red", "circle", "blue", "square"):
+        assert keyword in text.lower()
diff --git a/tests/core/test_local_file.py b/tests/core/test_local_file.py
new file mode 100644
index 0000000..380783c
--- /dev/null
+++ b/tests/core/test_local_file.py
@@ -0,0 +1,151 @@
+import os
+from pathlib import Path
+
+import pytest
+
+from markitdown import MarkItDown
+from tests.helpers.utils import validate_strings
+
+TEST_FILES_DIR = Path(__file__).parent.parent / "test_files"
+
+XLSX_TEST_STRINGS = [
+    "## 09060124-b5e7-4717-9d07-3c046eb",
+    "6ff4173b-42a5-4784-9b19-f49caff4d93d",
+    "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
+]
+
+DOCX_TEST_STRINGS = [
+    "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
+    "49e168b7-d2ae-407f-a055-2167576f39a1",
+    "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
+    "# Abstract",
+    "# Introduction",
+    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+]
+
+DOCX_COMMENT_TEST_STRINGS = [
+    "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
+    "49e168b7-d2ae-407f-a055-2167576f39a1",
+    "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
+    "# Abstract",
+    "# Introduction",
+    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+    "This is a test comment. 12df-321a",
+    "Yet another comment in the doc. 55yiyi-asd09",
+]
+
+PPTX_TEST_STRINGS = [
+    "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
+    "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
+    "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
+    "1b92870d-e3b5-4e65-8153-919f4ff45592",
+    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+    "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
+    "2003",  # chart value
+]
+
+BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
+BLOG_TEST_STRINGS = [
+    "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?",
+    "an example where high cost can easily prevent a generic complex",
+]
+
+
+RSS_TEST_STRINGS = [
+    "The Official Microsoft Blog",
+    "In the case of AI, it is absolutely true that the industry is moving incredibly fast",
+]
+
+
+WIKIPEDIA_TEST_URL = "https://en.wikipedia.org/wiki/Microsoft"
+WIKIPEDIA_TEST_STRINGS = [
+    "Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]",
+    'Microsoft was founded by [Bill Gates](/wiki/Bill_Gates "Bill Gates")',
+]
+WIKIPEDIA_TEST_EXCLUDES = [
+    "You are encouraged to create an account and log in",
+    "154 languages",
+    "move to sidebar",
+]
+
+SERP_TEST_URL = "https://www.bing.com/search?q=microsoft+wikipedia"
+SERP_TEST_STRINGS = [
+    "](https://en.wikipedia.org/wiki/Microsoft",
+    "Microsoft Corporation is **an American multinational corporation and technology company headquartered** in Redmond",
+    "1995–2007: Foray into the Web, Windows 95, Windows XP, and Xbox",
+]
+SERP_TEST_EXCLUDES = [
+    "https://www.bing.com/ck/a?!&&p=",
+    "data:image/svg+xml,%3Csvg%20width%3D",
+]
+
+CSV_CP932_TEST_STRINGS = [
+    "名前,年齢,住所",
+    "佐藤太郎,30,東京",
+    "三木英子,25,大阪",
+    "髙橋淳,35,名古屋",
+]
+
+common_case = {
+    "xlsx": ("test.xlsx", XLSX_TEST_STRINGS, None, {}),
+    "pptx": ("test.pptx", PPTX_TEST_STRINGS, None, {}),
+    "blog": ("test_blog.html", BLOG_TEST_STRINGS, None, {"url": BLOG_TEST_URL}),
+    "zip": ("test_files.zip", XLSX_TEST_STRINGS, None, {}),
+    "wikipedia": (
+        "test_wikipedia.html",
+        WIKIPEDIA_TEST_STRINGS,
+        WIKIPEDIA_TEST_EXCLUDES,
+        {"url": WIKIPEDIA_TEST_URL},
+    ),
+    "serp": (
+        "test_serp.html",
+        SERP_TEST_STRINGS,
+        SERP_TEST_EXCLUDES,
+        {"url": SERP_TEST_URL},
+    ),
+    "rss": ("test_rss.xml", RSS_TEST_STRINGS, None, {}),
+    "mskanji": ("test_mskanji.csv", CSV_CP932_TEST_STRINGS, None, {}),
+}
+
+
+@pytest.fixture
+def markitdown() -> MarkItDown:
+    return MarkItDown()  # default-configured converter, rebuilt for each test
+
+
+@pytest.mark.parametrize(
+    "filename, expected_strings, exclude_strings, kwargs",
+    common_case.values(),
+    ids=common_case.keys(),
+)
+def test_common(
+    markitdown: MarkItDown,
+    filename: str,
+    expected_strings: list,
+    exclude_strings: list,
+    kwargs,
+) -> None:
+    """Convert one bundled sample file and check its required/forbidden text."""
+    converted = markitdown.convert(TEST_FILES_DIR / filename, **kwargs)
+    validate_strings(converted, expected_strings, exclude_strings)
+
+
+def test_docx() -> None:
+    """Convert DOCX samples, with and without a comment-reference style map."""
+    style_map = "comment-reference => "
+    plain_docx = os.path.join(TEST_FILES_DIR, "test.docx")
+    commented_docx = os.path.join(TEST_FILES_DIR, "test_with_comment.docx")
+    converter = MarkItDown()
+
+    # Plain document, default settings.
+    validate_strings(converter.convert(plain_docx), DOCX_TEST_STRINGS)
+
+    # Document with comments; style_map supplied per convert() call.
+    per_call = converter.convert(commented_docx, style_map=style_map)
+    validate_strings(per_call, DOCX_COMMENT_TEST_STRINGS)
+
+    # Document with comments; style_map supplied once at construction.
+    preconfigured = MarkItDown(style_map=style_map)
+    at_init = preconfigured.convert(commented_docx)
+    validate_strings(at_init, DOCX_COMMENT_TEST_STRINGS)
diff --git a/tests/core/test_remote.py b/tests/core/test_remote.py
new file mode 100644
index 0000000..52e18b3
--- /dev/null
+++ b/tests/core/test_remote.py
@@ -0,0 +1,46 @@
+import io
+import os
+
+import pytest
+import requests
+
+from markitdown import MarkItDown
+
+# tests/test_files sits next to tests/core/, one directory above this module.
+TEST_FILES_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "test_files")
+
+# Don't run these tests in CI
+skip_remote = bool(os.environ.get("GITHUB_ACTIONS"))
+
+
+PDF_TEST_URL = "https://arxiv.org/pdf/2308.08155v2.pdf"
+PDF_TEST_STRINGS = [
+    "While there is contemporaneous exploration of multi-agent approaches"
+]
+
+
+@pytest.mark.skipif(
+    skip_remote,
+    reason="do not run tests that query external urls",
+)
+def test_markitdown_remote() -> None:
+    """Convert a remote PDF, first by URL and then from a downloaded stream."""
+    markitdown = MarkItDown()
+
+    # By URL
+    result = markitdown.convert(PDF_TEST_URL)
+    for test_string in PDF_TEST_STRINGS:
+        assert test_string in result.text_content
+
+    # By stream: bound the wait on the server and fail on an HTTP error status
+    # instead of handing an error page to the PDF converter.
+    response = requests.get(PDF_TEST_URL, timeout=30)
+    response.raise_for_status()
+    result = markitdown.convert_stream(
+        io.BytesIO(response.content), file_extension=".pdf", url=PDF_TEST_URL
+    )
+    for test_string in PDF_TEST_STRINGS:
+        assert test_string in result.text_content
+
+    # TODO: add a YouTube URL check; it was disabled because it failed randomly
+    # and the failure has not been reproduced yet.
diff --git a/tests/helpers/__init__.py b/tests/helpers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/helpers/utils.py b/tests/helpers/utils.py
new file mode 100644
index 0000000..9b82159
--- /dev/null
+++ b/tests/helpers/utils.py
@@ -0,0 +1,11 @@
+# --- Helper Functions ---
+
+
+def validate_strings(result, expected_strings, exclude_strings=None):
+    """Assert required strings occur in the result text and excluded ones do not."""
+    # Backslashes are removed first, presumably so Markdown escapes cannot hide a match.
+    unescaped = result.text_content.replace("\\", "")
+    for wanted in expected_strings:
+        assert wanted in unescaped
+    for unwanted in exclude_strings or ():
+        assert unwanted not in unescaped

From 75c33b6713c5f6234280e318ff3315f48507e9f5 Mon Sep 17 00:00:00 2001
From: lumin <baolong1027@icloud.com>
Date: Thu, 26 Dec 2024 23:09:55 +0900
Subject: [PATCH 2/3] fix(coverage): simplify coverage paths configuration

Removes redundant paths for markitdown and tests in the
coverage configuration. This change streamlines the
coverage report by focusing on the primary source and
test directories, improving clarity and maintainability.
---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3e14cec..56a24b8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -69,8 +69,8 @@ omit = [
 ]
 
 [tool.coverage.paths]
-markitdown = ["src/markitdown", "*/markitdown/src/markitdown"]
-tests = ["tests", "*/markitdown/tests"]
+markitdown = ["src/markitdown"]
+tests = ["tests"]
 
 [tool.coverage.report]
 exclude_lines = [

From 2b4317dc9e474876c4b09ffa00f08d8a0859171f Mon Sep 17 00:00:00 2001
From: lumin <baolong1027@icloud.com>
Date: Thu, 26 Dec 2024 23:10:02 +0900
Subject: [PATCH 3/3] chore: update .gitignore and add pytest settings

Stop ignoring the .vscode directory in .gitignore so that a shared
.vscode/settings.json can be tracked, and add pytest settings to that
file to enable testing with pytest in the project. This improves the
development workflow by ensuring that pytest is the default testing
framework.
---
 .gitignore            | 2 --
 .vscode/settings.json | 7 +++++++
 2 files changed, 7 insertions(+), 2 deletions(-)
 create mode 100644 .vscode/settings.json

diff --git a/.gitignore b/.gitignore
index 7f0de2b..b6139eb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,3 @@
-.vscode
-
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..7a230e1
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,7 @@
+{
+    "python.testing.pytestArgs": [
+        "tests"
+    ],
+    "python.testing.unittestEnabled": false,
+    "python.testing.pytestEnabled": true
+}
\ No newline at end of file