Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions packages/markitdown/src/markitdown/converters/_ipynb_converter.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,24 @@
from typing import BinaryIO, Any
import json
import re

from .._base_converter import DocumentConverter, DocumentConverterResult
from .._exceptions import FileConversionException
from .._stream_info import StreamInfo


def _fenced_code_block(content: str, info_string: str = "") -> str:
"""Wrap content in a Markdown code fence that is guaranteed not to be
closed prematurely by backticks inside the content. Per CommonMark, the
fence must be longer than the longest run of backticks it contains, so a
cell that itself prints ``` (common in notebooks demoing Markdown) does not
leak out as prose."""
longest_backtick_run = max(
(len(m) for m in re.findall(r"`+", content)), default=0
)
fence = "`" * max(3, longest_backtick_run + 1)
return f"{fence}{info_string}\n{content}\n{fence}"

# MIME type prefixes associated with this converter's input.
# NOTE(review): presumably matched against the incoming stream's MIME type
# when deciding whether a file could be a notebook — the consumer is not
# visible here, confirm against the converter's accept logic.
CANDIDATE_MIME_TYPE_PREFIXES = [
"application/json",
]
Expand Down Expand Up @@ -76,9 +90,11 @@ def _convert(self, notebook_content: dict) -> DocumentConverterResult:

elif cell_type == "code":
# Code cells are wrapped in Markdown code blocks
md_output.append(f"```python\n{''.join(source_lines)}\n```")
md_output.append(
_fenced_code_block("".join(source_lines), "python")
)
elif cell_type == "raw":
md_output.append(f"```\n{''.join(source_lines)}\n```")
md_output.append(_fenced_code_block("".join(source_lines)))

md_text = "\n\n".join(md_output)

Expand Down
37 changes: 37 additions & 0 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -532,9 +532,46 @@ def test_markitdown_llm() -> None:
validate_strings(result, PPTX_TEST_STRINGS)


def test_ipynb_code_cell_with_backtick_fence() -> None:
    """Check that a code cell containing a ``` fence is emitted as one block.

    If the wrapping fence were only three backticks long, the backticks in
    the cell source would close the code block early and the rest of the cell
    would leak out as prose. The converter must therefore pick a longer
    fence, and the cell source must appear verbatim between the opening and
    closing fence.
    """
    import json

    inner = 'print("""\n```\nnot python\n```\n""")'

    # Minimal notebook holding a single code cell with the tricky source.
    code_cell = {
        "cell_type": "code",
        "source": [inner],
        "metadata": {},
        "outputs": [],
        "execution_count": None,
    }
    notebook = {
        "nbformat": 4,
        "nbformat_minor": 5,
        "metadata": {},
        "cells": [code_cell],
    }

    payload = io.BytesIO(json.dumps(notebook).encode("utf-8"))
    result = MarkItDown().convert_stream(payload, file_extension=".ipynb")
    markdown = result.markdown

    # The opening fence must be longer than the 3-backtick run inside the cell.
    opening = re.match(r"(`{4,})python\n", markdown)
    assert (
        opening is not None
    ), f"code cell was not wrapped in a long-enough fence: {markdown!r}"

    # The cell's source must appear intact, enclosed by the longer fence.
    fence = opening.group(1)
    expected_block = f"{fence}python\n{inner}\n{fence}"
    assert expected_block in markdown


if __name__ == "__main__":
"""Runs this file's tests from the command line."""
for test in [
test_ipynb_code_cell_with_backtick_fence,
test_stream_info_operations,
test_data_uris,
test_file_uris,
Expand Down