feat: Added support for converting images (JPG/PNG/BMP) to Markdown #6

jorben · jorben · commit 1eab0db854ba · 2025-03-22T20:27:25.000+08:00
diff --git a/README.md b/README.md
@@ -20,6 +20,7 @@ MarkPDFDown is designed to simplify the process of converting PDF documents into
 ## Features
 
 - **PDF to Markdown Conversion**: Transform any PDF document into well-formatted Markdown
+- **Image to Markdown Conversion**: Transform images (JPG/PNG/BMP) into well-formatted Markdown
 - **Multimodal Understanding**: Leverages AI to comprehend document structure and content
 - **Format Preservation**: Maintains headings, lists, tables, and other formatting elements
 - **Customizable Model**: Configure the model to suit your needs
@@ -56,6 +57,9 @@ python main.py < tests/input.pdf > output.md
 ## Advanced Usage
 ```bash
 python main.py page_start page_end < tests/input.pdf > output.md
+
+# Convert an image (JPG/PNG/BMP) to Markdown
+python main.py < tests/input_image.png > output.md
 ```
 
 ## Docker Usage
diff --git a/README_zh.md b/README_zh.md
@@ -20,6 +20,7 @@ MarkPDFDown 是一款智能PDF转换Markdown工具，通过先进的多模态AI
 ## 功能特性
 
 - **PDF转Markdown**：支持任意PDF文档的格式转换
+- **图片转Markdown**：支持JPG/PNG/BMP图片内容转Markdown
 - **多模态理解**：利用AI理解文档结构和内容
 - **格式保留**：完整保留标题、列表、表格等排版元素
 - **模型定制**：支持自定义AI模型参数配置
@@ -52,6 +53,9 @@ export OPENAI_DEFAULT_MODEL=<你的模型>
 
 # 运行转换程序
 python main.py < tests/input.pdf > output.md
+
+# 图片转换
+python main.py < tests/input_image.png > output.md
 ```
 
 ## 高级用法
diff --git a/core/FileWorker.py b/core/FileWorker.py
@@ -49,10 +49,9 @@ def create_worker(input_path: str, start_page: int = 1, end_page: int = 0):
     if ext == '.pdf':
         from .PDFWorker import PDFWorker
         worker = PDFWorker(input_path, start_page, end_page)
-    # can add other types of Worker here
-    # elif ext == '.docx':
-    #    from .DocxWorker import DocxWorker
-    #    worker = DocxWorker(input_path, start_page, end_page)
+    elif ext == '.jpg' or ext == '.jpeg' or ext == '.png' or ext == '.bmp':
+        from .ImageWorker import ImageWorker
+        worker = ImageWorker(input_path)
     else:
         raise ValueError(f"Unsupported file type: {ext}")
         
diff --git a/core/ImageWorker.py b/core/ImageWorker.py
@@ -0,0 +1,26 @@
+import os
+import logging
+from typing import List
+from .FileWorker import FileWorker
+
+logger = logging.getLogger(__name__)
+
+class ImageWorker(FileWorker):
+    """
+    Worker class for processing image files
+    """
+    def __init__(self, input_path: str):
+        super().__init__(input_path)
+        self.output_dir = os.path.dirname(input_path)    
+        logger.info("Processing image file %s", input_path)
+        
+
+
+    def convert_to_images(self) -> List[str]:
+        """
+        No-op conversion: the input is already an image, so its path is returned as-is
+
+        Returns:
+            List[str]: Single-element list containing the input image path
+        """
+        return [self.input_path]
diff --git a/main.py b/main.py
@@ -98,7 +98,6 @@ def convert_image_to_markdown(image_path):
 
     # Try to get extension from file name
     input_filename = os.path.basename(sys.stdin.buffer.name)
-    logger.info("Input file: %s", input_filename)
     input_ext = os.path.splitext(input_filename)[1]
     
     # If there is no extension or the file comes from standard input, try to determine the type by file content
@@ -107,10 +106,18 @@ def convert_image_to_markdown(image_path):
         if input_data.startswith(b'%PDF-'):
             input_ext = '.pdf'
             logger.info("Recognized as PDF file by file content")
-        # You can add more file type detection
-        # elif input_data.startswith(b'PK\x03\x04'):  # DOCX, XLSX, etc. ZIP format files
-        #     input_ext = '.docx'  # Default set to docx
-        #     logger.info("Recognized as Office document file by file content")
+        # Matches only the FF D8 FF DB JPEG variant; JFIF (FF D8 FF E0) and Exif (FF D8 FF E1) files are not recognized by this check
+        elif input_data.startswith(b'\xFF\xD8\xFF\xDB'):
+            input_ext = '.jpg'
+            logger.info("Recognized as JPEG file by file content")
+        # PNG file magic number/signature is 89 50 4E 47
+        elif input_data.startswith(b'\x89\x50\x4E\x47'):
+            input_ext = '.png'
+            logger.info("Recognized as PNG file by file content")
+        # BMP file magic number/signature is 42 4D
+        elif input_data.startswith(b'\x42\x4D'):
+            input_ext = '.bmp'
+            logger.info("Recognized as BMP file by file content")
         else:
             logger.error("Unsupported file type")
             exit(1)
diff --git a/tests/input_image.png b/tests/input_image.png