From f6aa6aead28613d08d84a175606c529f4c880ee7 Mon Sep 17 00:00:00 2001
From: Hieu Lam
Date: Wed, 15 Jan 2025 12:37:16 +0700
Subject: [PATCH] feat: support images in table and auto detect code languages (optional)

---
 src/markitdown/_markitdown.py | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 33806e1..e0b64e3 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -60,6 +60,24 @@
     pass
 
 
+# Optional dependency: guesslang is only used to auto-detect the language of
+# code snippets. When it is missing, the stub below makes detection a no-op.
+try:
+    from guesslang import Guess
+except ImportError:
+    warn("The 'guesslang' package is not installed. Please install it via 'pip install guesslang'.")
+
+    class Guess:
+        """Fallback used when guesslang is unavailable; never detects a language."""
+
+        def language_name(self, code: str) -> str:
+            """Return an empty string, meaning no language was detected."""
+            return ""
+
+
+guess = Guess()
+
+
 class _CustomMarkdownify(markdownify.MarkdownConverter):
     """
     A custom version of markdownify's MarkdownConverter. Changes include:
@@ -72,5 +90,26 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
 
     def __init__(self, **options: Any):
         options["heading_style"] = options.get("heading_style", markdownify.ATX)
+
+        # Keep images whose parent is a table cell or one of the wrappers
+        # commonly nested inside cells. "th" is listed alongside "td" so that
+        # images in header cells are kept as well. Caller-supplied values win.
+        options.setdefault(
+            "keep_inline_images_in", ["td", "th", "tr", "div", "p", "span"]
+        )
+
+        def code_language_callback(el):
+            """Guess the language of a code element; return "" when unknown."""
+            snippet = el.get_text()
+            # Skip empty or whitespace-only snippets: there is nothing to detect.
+            if not snippet.strip():
+                return ""
+            # language_name() may return a falsy value when nothing is detected.
+            language = guess.language_name(snippet)
+            return language.lower() if language else ""
+
+        # Callers may override detection by passing their own callback.
+        options.setdefault("code_language_callback", code_language_callback)
+
         # Explicitly cast options to the expected type if necessary
         super().__init__(**options)