Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(tests): reorganize tests #215

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
.vscode

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
7 changes: 7 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"python.testing.pytestArgs": [
"src"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
}
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@ omit = [
]

[tool.coverage.paths]
markitdown = ["src/markitdown", "*/markitdown/src/markitdown"]
tests = ["tests", "*/markitdown/tests"]
markitdown = ["src/markitdown"]
tests = ["tests"]

[tool.coverage.report]
exclude_lines = [
Expand Down
Empty file added tests/core/__init__.py
Empty file.
34 changes: 34 additions & 0 deletions tests/core/test_external_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import os
import shutil
from pathlib import Path

import pytest

from markitdown import MarkItDown

TEST_FILES_DIR = Path(__file__).parent.parent / "test_files"

# Skip exiftool tests if not installed
skip_exiftool = shutil.which("exiftool") is None

JPG_TEST_EXIFTOOL = {
"Author": "AutoGen Authors",
"Title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"Description": "AutoGen enables diverse LLM-based applications",
"ImageSize": "1615x1967",
"DateTimeOriginal": "2024:03:14 22:10:00",
}


@pytest.mark.skipif(
skip_exiftool,
reason="do not run if exiftool is not installed",
)
def test_markitdown_exiftool() -> None:
markitdown = MarkItDown()

# Test JPG metadata processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
for key in JPG_TEST_EXIFTOOL:
target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
assert target in result.text_content
73 changes: 73 additions & 0 deletions tests/core/test_llm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import os
from pathlib import Path
from warnings import catch_warnings, resetwarnings

import pytest

from markitdown import MarkItDown

TEST_FILES_DIR = Path(__file__).parent.parent / "test_files"

# Don't run the llm tests without a key and the client library
skip_llm = False if os.environ.get("OPENAI_API_KEY") else True
try:
import openai
except ModuleNotFoundError:
skip_llm = True

LLM_TEST_STRINGS = [
"5bda1dd6",
]


def test_markitdown_deprecation() -> None:
try:
with catch_warnings(record=True) as w:
test_client = object()
markitdown = MarkItDown(mlm_client=test_client)
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert markitdown._llm_client == test_client
finally:
resetwarnings()

try:
with catch_warnings(record=True) as w:
markitdown = MarkItDown(mlm_model="gpt-4o")
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert markitdown._llm_model == "gpt-4o"
finally:
resetwarnings()

try:
test_client = object()
markitdown = MarkItDown(mlm_client=test_client, llm_client=test_client)
assert False
except ValueError:
pass

try:
markitdown = MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o")
assert False
except ValueError:
pass


@pytest.mark.skipif(
skip_llm,
reason="do not run llm tests without a key",
)
def test_markitdown_llm() -> None:
client = openai.OpenAI()
markitdown = MarkItDown(llm_client=client, llm_model="gpt-4o")

result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg"))

for test_string in LLM_TEST_STRINGS:
assert test_string in result.text_content

# This is not super precise. It would also accept "red square", "blue circle",
# "the square is not blue", etc. But it's sufficient for this test.
for test_string in ["red", "circle", "blue", "square"]:
assert test_string in result.text_content.lower()
151 changes: 151 additions & 0 deletions tests/core/test_local_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
import os
from pathlib import Path

import pytest

from markitdown import MarkItDown
from tests.helpers.utils import validate_strings

TEST_FILES_DIR = Path(__file__).parent.parent / "test_files"

XLSX_TEST_STRINGS = [
"## 09060124-b5e7-4717-9d07-3c046eb",
"6ff4173b-42a5-4784-9b19-f49caff4d93d",
"affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
]

DOCX_TEST_STRINGS = [
"314b0a30-5b04-470b-b9f7-eed2c2bec74a",
"49e168b7-d2ae-407f-a055-2167576f39a1",
"## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
"# Abstract",
"# Introduction",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
]

DOCX_COMMENT_TEST_STRINGS = [
"314b0a30-5b04-470b-b9f7-eed2c2bec74a",
"49e168b7-d2ae-407f-a055-2167576f39a1",
"## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
"# Abstract",
"# Introduction",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"This is a test comment. 12df-321a",
"Yet another comment in the doc. 55yiyi-asd09",
]

PPTX_TEST_STRINGS = [
"2cdda5c8-e50e-4db4-b5f0-9722a649f455",
"04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
"44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
"1b92870d-e3b5-4e65-8153-919f4ff45592",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title
"2003", # chart value
]

BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
BLOG_TEST_STRINGS = [
"Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?",
"an example where high cost can easily prevent a generic complex",
]


RSS_TEST_STRINGS = [
"The Official Microsoft Blog",
"In the case of AI, it is absolutely true that the industry is moving incredibly fast",
]


WIKIPEDIA_TEST_URL = "https://en.wikipedia.org/wiki/Microsoft"
WIKIPEDIA_TEST_STRINGS = [
"Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]",
'Microsoft was founded by [Bill Gates](/wiki/Bill_Gates "Bill Gates")',
]
WIKIPEDIA_TEST_EXCLUDES = [
"You are encouraged to create an account and log in",
"154 languages",
"move to sidebar",
]

SERP_TEST_URL = "https://www.bing.com/search?q=microsoft+wikipedia"
SERP_TEST_STRINGS = [
"](https://en.wikipedia.org/wiki/Microsoft",
"Microsoft Corporation is **an American multinational corporation and technology company headquartered** in Redmond",
"1995–2007: Foray into the Web, Windows 95, Windows XP, and Xbox",
]
SERP_TEST_EXCLUDES = [
"https://www.bing.com/ck/a?!&&p=",
"data:image/svg+xml,%3Csvg%20width%3D",
]

CSV_CP932_TEST_STRINGS = [
"名前,年齢,住所",
"佐藤太郎,30,東京",
"三木英子,25,大阪",
"髙橋淳,35,名古屋",
]

common_case = {
"xlsx": ("test.xlsx", XLSX_TEST_STRINGS, None, {}),
"pptx": ("test.pptx", PPTX_TEST_STRINGS, None, {}),
"blog": ("test_blog.html", BLOG_TEST_STRINGS, None, {"url": BLOG_TEST_URL}),
"zip": ("test_files.zip", XLSX_TEST_STRINGS, None, {}),
"wikipedia": (
"test_wikipedia.html",
WIKIPEDIA_TEST_STRINGS,
WIKIPEDIA_TEST_EXCLUDES,
{"url": WIKIPEDIA_TEST_URL},
),
"serp": (
"test_serp.html",
SERP_TEST_STRINGS,
SERP_TEST_EXCLUDES,
{"url": SERP_TEST_URL},
),
"rss": ("test_rss.xml", RSS_TEST_STRINGS, None, {}),
"mskanji": ("test_mskanji.csv", CSV_CP932_TEST_STRINGS, None, {}),
}


@pytest.fixture
def markitdown() -> MarkItDown:
return MarkItDown()


@pytest.mark.parametrize(
"filename, expected_strings, exclude_strings, kwargs",
common_case.values(),
ids=common_case.keys(),
)
def test_common(
markitdown: MarkItDown,
filename: str,
expected_strings: list,
exclude_strings: list,
kwargs,
) -> None:
source = TEST_FILES_DIR / filename
result = markitdown.convert(source, **kwargs)
validate_strings(result, expected_strings, exclude_strings)


def test_docx() -> None:
markitdown = MarkItDown()
# Test DOCX processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
validate_strings(result, DOCX_TEST_STRINGS)

# Test DOCX processing, with comments
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test_with_comment.docx"),
style_map="comment-reference => ",
)
validate_strings(result, DOCX_COMMENT_TEST_STRINGS)

# Test DOCX processing, with comments and setting style_map on init
markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
result = markitdown_with_style_map.convert(
os.path.join(TEST_FILES_DIR, "test_with_comment.docx")
)
validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
46 changes: 46 additions & 0 deletions tests/core/test_remote.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import io
import os

import pytest
import requests

from markitdown import MarkItDown

TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")

skip_remote = (
True if os.environ.get("GITHUB_ACTIONS") else False
) # Don't run these tests in CI


PDF_TEST_URL = "https://arxiv.org/pdf/2308.08155v2.pdf"
PDF_TEST_STRINGS = [
"While there is contemporaneous exploration of multi-agent approaches"
]


@pytest.mark.skipif(
skip_remote,
reason="do not run tests that query external urls",
)
def test_markitdown_remote() -> None:
markitdown = MarkItDown()

# By URL
result = markitdown.convert(PDF_TEST_URL)
for test_string in PDF_TEST_STRINGS:
assert test_string in result.text_content

# By stream
response = requests.get(PDF_TEST_URL)
result = markitdown.convert_stream(
io.BytesIO(response.content), file_extension=".pdf", url=PDF_TEST_URL
)
for test_string in PDF_TEST_STRINGS:
assert test_string in result.text_content

# Youtube
# TODO: This test randomly fails for some reason. Haven't been able to repro it yet. Disabling until I can debug the issue
# result = markitdown.convert(YOUTUBE_TEST_URL)
# for test_string in YOUTUBE_TEST_STRINGS:
# assert test_string in result.text_content
Empty file added tests/helpers/__init__.py
Empty file.
11 changes: 11 additions & 0 deletions tests/helpers/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# --- Helper Functions ---


def validate_strings(result, expected_strings, exclude_strings=None):
"""Validate presence or absence of specific strings."""
text_content = result.text_content.replace("\\", "")
for string in expected_strings:
assert string in text_content
if exclude_strings:
for string in exclude_strings:
assert string not in text_content