Skip to content

Commit

Permalink
feat(tests): add comprehensive tests for MarkItDown
Browse files Browse the repository at this point in the history
Introduce new test cases for various file formats including XLSX, 
DOCX, PPTX, RSS, and CSV. Add a validation utility to ensure that
the output strings match expected results. These changes enhance 
test coverage and ensure the reliability of the MarkItDown 
conversion functionality.
  • Loading branch information
l-lumin authored and gitbutler-client committed Dec 25, 2024
1 parent 125e206 commit c9bd809
Show file tree
Hide file tree
Showing 4 changed files with 167 additions and 0 deletions.
Empty file added tests/core/__init__.py
Empty file.
156 changes: 156 additions & 0 deletions tests/core/test_local_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
import os
from pathlib import Path

import pytest

from markitdown import MarkItDown
from tests.helpers.utils import validate_strings

# Directory holding the sample documents; it sits one level above this
# module's own directory.
TEST_FILES_DIR = Path(__file__).parent.parent / "test_files"

# --- Expected substrings per sample document -------------------------------

# Substrings expected in the converted output of test.xlsx (also reused
# below for test_files.zip).
XLSX_TEST_STRINGS = [
    "## 09060124-b5e7-4717-9d07-3c046eb",
    "6ff4173b-42a5-4784-9b19-f49caff4d93d",
    "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
]

# Substrings expected in the converted output of test.docx.
DOCX_TEST_STRINGS = [
    "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
    "49e168b7-d2ae-407f-a055-2167576f39a1",
    "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
    "# Abstract",
    "# Introduction",
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
]

# Same expectations as the plain DOCX, followed by the two comment texts
# that test_with_comment.docx is expected to yield.
DOCX_COMMENT_TEST_STRINGS = [
    *DOCX_TEST_STRINGS,
    "This is a test comment. 12df-321a",
    "Yet another comment in the doc. 55yiyi-asd09",
]

# Substrings expected in the converted output of test.pptx.
PPTX_TEST_STRINGS = [
    "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
    "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
    "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
    "1b92870d-e3b5-4e65-8153-919f4ff45592",
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
    "2003",  # chart value
]

# URL handed to convert() as the ``url`` keyword for test_blog.html.
BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
BLOG_TEST_STRINGS = [
    "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?",
    "an example where high cost can easily prevent a generic complex",
]

# Substrings expected in the converted output of test_rss.xml.
RSS_TEST_STRINGS = [
    "The Official Microsoft Blog",
    "In the case of AI, it is absolutely true that the industry is moving incredibly fast",
]

# URL handed to convert() as the ``url`` keyword for test_wikipedia.html.
WIKIPEDIA_TEST_URL = "https://en.wikipedia.org/wiki/Microsoft"
WIKIPEDIA_TEST_STRINGS = [
    "Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]",
    'Microsoft was founded by [Bill Gates](/wiki/Bill_Gates "Bill Gates")',
]
# Substrings that must NOT appear in the converted Wikipedia output.
WIKIPEDIA_TEST_EXCLUDES = [
    "You are encouraged to create an account and log in",
    "154 languages",
    "move to sidebar",
]

# URL handed to convert() as the ``url`` keyword for test_serp.html.
SERP_TEST_URL = "https://www.bing.com/search?q=microsoft+wikipedia"
SERP_TEST_STRINGS = [
    "](https://en.wikipedia.org/wiki/Microsoft",
    "Microsoft Corporation is **an American multinational corporation and technology company headquartered** in Redmond",
    "1995–2007: Foray into the Web, Windows 95, Windows XP, and Xbox",
]
# Substrings that must NOT appear in the converted search-results output.
SERP_TEST_EXCLUDES = [
    "https://www.bing.com/ck/a?!&&p=",
    "data:image/svg+xml,%3Csvg%20width%3D",
]

# Rows expected in the converted output of test_mskanji.csv (the constant
# name indicates the sample is CP932-encoded).
CSV_CP932_TEST_STRINGS = [
    "名前,年齢,住所",
    "佐藤太郎,30,東京",
    "三木英子,25,大阪",
    "髙橋淳,35,名古屋",
]

# --- Parametrization table for test_common ---------------------------------
# Each row is (filename, expected_strings, exclude_strings, kwargs), where
# kwargs are forwarded to MarkItDown.convert().
common_data = [
    ("test.xlsx", XLSX_TEST_STRINGS, None, {}),
    ("test.docx", DOCX_TEST_STRINGS, None, {}),
    (
        "test_with_comment.docx",
        DOCX_COMMENT_TEST_STRINGS,
        None,
        {"style_map": "comment-reference => "},
    ),
    ("test.pptx", PPTX_TEST_STRINGS, None, {}),
    ("test_blog.html", BLOG_TEST_STRINGS, None, {"url": BLOG_TEST_URL}),
    ("test_files.zip", XLSX_TEST_STRINGS, None, {}),
    (
        "test_wikipedia.html",
        WIKIPEDIA_TEST_STRINGS,
        WIKIPEDIA_TEST_EXCLUDES,
        {"url": WIKIPEDIA_TEST_URL},
    ),
    (
        "test_serp.html",
        SERP_TEST_STRINGS,
        SERP_TEST_EXCLUDES,
        {"url": SERP_TEST_URL},
    ),
    ("test_rss.xml", RSS_TEST_STRINGS, None, {}),
    ("test_mskanji.csv", CSV_CP932_TEST_STRINGS, None, {}),
]


@pytest.fixture
def markitdown() -> MarkItDown:
    """Provide a MarkItDown instance built with default arguments."""
    converter = MarkItDown()
    return converter


@pytest.mark.parametrize(
    "filename, expected_strings, exclude_strings, kwargs", common_data
)
def test_common(
    markitdown: MarkItDown,
    filename: str,
    expected_strings: list,
    exclude_strings: list,
    kwargs,
) -> None:
    """Convert one sample file and check its text against the expectations.

    Runs once per row of ``common_data``. ``kwargs`` is forwarded verbatim
    to ``MarkItDown.convert``; ``exclude_strings`` may be ``None`` when the
    row has no forbidden substrings.
    """
    # NOTE(review): the sample is passed as a pathlib.Path, not a str --
    # confirm that convert() accepts path-like sources.
    sample_path = TEST_FILES_DIR.joinpath(filename)
    conversion = markitdown.convert(sample_path, **kwargs)
    validate_strings(conversion, expected_strings, exclude_strings)


def test_docx() -> None:
    """Check DOCX conversion, with and without a comment style map.

    Covers three cases: the plain document, the commented document with
    ``style_map`` passed to ``convert``, and the commented document with
    ``style_map`` passed to the ``MarkItDown`` constructor instead.
    """
    comment_style_map = "comment-reference => "
    plain_docx = os.path.join(TEST_FILES_DIR, "test.docx")
    commented_docx = os.path.join(TEST_FILES_DIR, "test_with_comment.docx")

    converter = MarkItDown()

    # Plain DOCX processing.
    plain_output = converter.convert(plain_docx)
    validate_strings(plain_output, DOCX_TEST_STRINGS)

    # DOCX with comments, style map supplied per call.
    per_call_output = converter.convert(commented_docx, style_map=comment_style_map)
    validate_strings(per_call_output, DOCX_COMMENT_TEST_STRINGS)

    # DOCX with comments, style map supplied once at construction time.
    preconfigured = MarkItDown(style_map=comment_style_map)
    init_output = preconfigured.convert(commented_docx)
    validate_strings(init_output, DOCX_COMMENT_TEST_STRINGS)
Empty file added tests/helpers/__init__.py
Empty file.
11 changes: 11 additions & 0 deletions tests/helpers/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# --- Helper Functions ---


def validate_strings(result, expected_strings, exclude_strings=None):
    """Validate presence or absence of specific strings.

    Args:
        result: Object exposing a ``text_content`` string attribute.
        expected_strings: Substrings that must each occur in the text.
        exclude_strings: Optional substrings that must not occur in the
            text. ``None`` or an empty collection skips this check.

    Raises:
        AssertionError: If an expected substring is missing or an excluded
            substring is present. The message names the offending string.
    """
    # Backslashes are dropped before matching, so the expectations can be
    # written without them.
    text_content = result.text_content.replace("\\", "")

    # Explicit messages: asserts in a plain helper module are not covered by
    # pytest's assertion rewriting unless the module is registered for it,
    # so a bare assert here would fail without saying which string was at
    # fault.
    for string in expected_strings:
        assert string in text_content, (
            f"expected string not found in converted text: {string!r}"
        )
    for string in exclude_strings or ():
        assert string not in text_content, (
            f"excluded string unexpectedly present in converted text: {string!r}"
        )

0 comments on commit c9bd809

Please sign in to comment.