From e2470fc413ee793fc07c60657f9ab168a9dfafa8 Mon Sep 17 00:00:00 2001 From: Tom <64286918+JoyRushMedia@users.noreply.github.com> Date: Fri, 3 Jan 2025 13:48:19 -0700 Subject: [PATCH] Add Ollama integration for image descriptions --- For more details, open the [Copilot Workspace session](https://copilot-workspace.githubnext.com/microsoft/markitdown?shareId=XXXX-XXXX-XXXX-XXXX). --- .devcontainer/devcontainer.json | 27 ++++-------- Dockerfile | 2 +- README.md | 12 ++++++ src/markitdown/_markitdown.py | 54 ++++++++++++++++++++++++ tests/test_markitdown.py | 27 ++++++++++++ web-ui/package.json | 37 ++++++++++++++++ web-ui/src/App.js | 42 ++++++++++++++++++ web-ui/src/components/DownloadButton.js | 21 +++++++++ web-ui/src/components/FileUpload.js | 18 ++++++++ web-ui/src/components/MarkdownPreview.js | 12 ++++++ 10 files changed, 232 insertions(+), 20 deletions(-) create mode 100644 web-ui/package.json create mode 100644 web-ui/src/App.js create mode 100644 web-ui/src/components/DownloadButton.js create mode 100644 web-ui/src/components/FileUpload.js create mode 100644 web-ui/src/components/MarkdownPreview.js diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index e13e299..4411733 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,32 +1,21 @@ -// For format details, see https://aka.ms/devcontainer.json. For config options, see the -// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile { "name": "Existing Dockerfile", "build": { - // Sets the run context to one level up instead of the .devcontainer folder. "context": "..", - // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename. "dockerfile": "../Dockerfile", "args": { "INSTALL_GIT": "true" } }, - - // Features to add to the dev container. More info: https://containers.dev/features. - // "features": {}, "features": { - "ghcr.io/devcontainers-extra/features/hatch:2": {} + "ghcr.io/devcontainers-extra/features/hatch:2": {}, + "ghcr.io/devcontainers/features/python:1": { + "version": "3.10" + }, + "ghcr.io/devcontainers/features/node:1": { + "version": "16" + }, + "ghcr.io/devcontainers/features/ollama:1": {} }, - - // Use 'forwardPorts' to make a list of ports inside the container available locally. - // "forwardPorts": [], - - // Uncomment the next line to run commands after the container is created. - // "postCreateCommand": "cat /etc/os-release", - - // Configure tool-specific properties. - // "customizations": {}, - - // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root. "remoteUser": "root" } diff --git a/Dockerfile b/Dockerfile index 0072d9e..b88b755 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ ffmpeg \ && rm -rf /var/lib/apt/lists/* -RUN pip install markitdown +RUN pip install markitdown ollama # Default USERID and GROUPID ARG USERID=10000 diff --git a/README.md b/README.md index d2314c3..048a812 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,18 @@ result = md.convert("example.jpg") print(result.text_content) ``` +To use Ollama for image descriptions, provide `ollama_client`: + +```python +from markitdown import MarkItDown +from ollama import Ollama + +client = Ollama(api_key="your-api-key") +md = MarkItDown(ollama_client=client) +result = md.convert("example.jpg") +print(result.text_content) +``` + ### Docker ```sh diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 789c1e5..7c2cf91 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -1076,6 +1076,54 @@ def _get_llm_description(self, local_path, extension, client, model, prompt=None return response.choices[0].message.content +class OllamaConverter(DocumentConverter): + """ + Converts images to markdown via description using Ollama API. + """ + + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + # Bail if not an image + extension = kwargs.get("file_extension", "") + if extension.lower() not in [".jpg", ".jpeg", ".png"]: + return None + + md_content = "" + + # Try describing the image with Ollama + ollama_client = kwargs.get("ollama_client") + if ollama_client is not None: + md_content += ( + "\n# Description:\n" + + self._get_ollama_description( + local_path, + extension, + ollama_client, + prompt=kwargs.get("ollama_prompt"), + ).strip() + + "\n" + ) + + return DocumentConverterResult( + title=None, + text_content=md_content, + ) + + def _get_ollama_description(self, local_path, extension, client, prompt=None): + if prompt is None or prompt.strip() == "": + prompt = "Write a detailed caption for this image." + + data_uri = "" + with open(local_path, "rb") as image_file: + content_type, encoding = mimetypes.guess_type("_dummy" + extension) + if content_type is None: + content_type = "image/jpeg" + image_base64 = base64.b64encode(image_file.read()).decode("utf-8") + data_uri = f"data:{content_type};base64,{image_base64}" + + response = client.describe_image(data_uri, prompt) + return response["description"] + + class ZipConverter(DocumentConverter): """Converts ZIP files to markdown by extracting and converting all contained files. @@ -1223,6 +1271,7 @@ def __init__( llm_client: Optional[Any] = None, llm_model: Optional[str] = None, style_map: Optional[str] = None, + ollama_client: Optional[Any] = None, # Deprecated mlm_client: Optional[Any] = None, mlm_model: Optional[str] = None, @@ -1264,6 +1313,7 @@ def __init__( self._llm_client = llm_client self._llm_model = llm_model self._style_map = style_map + self._ollama_client = ollama_client self._page_converters: List[DocumentConverter] = [] @@ -1285,6 +1335,7 @@ def __init__( self.register_page_converter(IpynbConverter()) self.register_page_converter(PdfConverter()) self.register_page_converter(ZipConverter()) + self.register_page_converter(OllamaConverter()) def convert( self, source: Union[str, requests.Response, Path], **kwargs: Any @@ -1445,6 +1496,9 @@ def _convert( if "llm_model" not in _kwargs and self._llm_model is not None: _kwargs["llm_model"] = self._llm_model + if "ollama_client" not in _kwargs and self._ollama_client is not None: + _kwargs["ollama_client"] = self._ollama_client + # Add the list of converters for nested processing _kwargs["_parent_converters"] = self._page_converters diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 4a981bd..fad32ae 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -25,6 +25,13 @@ # Skip exiftool tests if not installed skip_exiftool = shutil.which("exiftool") is None +# Skip Ollama tests if not installed +skip_ollama = False if os.environ.get("OLLAMA_API_KEY") else True +try: + import ollama +except ModuleNotFoundError: + skip_ollama = True + TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files") JPG_TEST_EXIFTOOL = { @@ -130,6 +137,11 @@ "5bda1dd6", ] +OLLAMA_TEST_STRINGS = [ + "detailed caption", + "image", +] + # --- Helper Functions --- def validate_strings(result, expected_strings, exclude_strings=None): @@ -300,6 +312,20 @@ def test_markitdown_llm() -> None: assert test_string in result.text_content.lower() +@pytest.mark.skipif( + skip_ollama, + reason="do not run ollama tests without a key", +) +def test_markitdown_ollama() -> None: + client = ollama.Ollama(api_key=os.environ.get("OLLAMA_API_KEY")) + markitdown = MarkItDown(ollama_client=client) + + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_ollama.jpg")) + + for test_string in OLLAMA_TEST_STRINGS: + assert test_string in result.text_content + + if __name__ == "__main__": """Runs this file's tests from the command line.""" test_markitdown_remote() @@ -307,3 +333,4 @@ def test_markitdown_llm() -> None: test_markitdown_exiftool() test_markitdown_deprecation() test_markitdown_llm() + test_markitdown_ollama() diff --git a/web-ui/package.json b/web-ui/package.json new file mode 100644 index 0000000..f9f8bbc --- /dev/null +++ b/web-ui/package.json @@ -0,0 +1,37 @@ +{ + "name": "markitdown-web-ui", + "version": "1.0.0", + "description": "Web-based UI for MarkItDown", + "main": "src/App.js", + "scripts": { + "start": "react-scripts start", + "build": "react-scripts build", + "test": "react-scripts test", + "eject": "react-scripts eject" + }, + "dependencies": { + "react": "^17.0.2", + "react-dom": "^17.0.2", + "react-scripts": "4.0.3", + "axios": "^0.21.1", + "react-markdown": "^7.0.0" + }, + "eslintConfig": { + "extends": [ + "react-app", + "react-app/jest" + ] + }, + "browserslist": { + "production": [ + ">0.2%", + "not dead", + "not op_mini all" + ], + "development": [ + "last 1 chrome version", + "last 1 firefox version", + "last 1 safari version" + ] + } +} diff --git a/web-ui/src/App.js b/web-ui/src/App.js new file mode 100644 index 0000000..107059c --- /dev/null +++ b/web-ui/src/App.js @@ -0,0 +1,42 @@ +import React, { useState } from 'react'; +import FileUpload from './components/FileUpload'; +import MarkdownPreview from './components/MarkdownPreview'; +import DownloadButton from './components/DownloadButton'; +import axios from 'axios'; + +function App() { + const [markdownContent, setMarkdownContent] = useState(''); + const [fileName, setFileName] = useState(''); + + const handleFileUpload = async (file) => { + const formData = new FormData(); + formData.append('file', file); + + try { + const response = await axios.post('/api/convert', formData, { + headers: { + 'Content-Type': 'multipart/form-data', + }, + }); + setMarkdownContent(response.data.markdown); + setFileName(file.name); + } catch (error) { + console.error('Error uploading file:', error); + } + }; + + return ( +
+
+

MarkItDown Web UI

+
+
+ + + +
+
+ ); +} + +export default App; diff --git a/web-ui/src/components/DownloadButton.js b/web-ui/src/components/DownloadButton.js new file mode 100644 index 0000000..f1e3a2a --- /dev/null +++ b/web-ui/src/components/DownloadButton.js @@ -0,0 +1,21 @@ +import React from 'react'; + +function DownloadButton({ content, fileName }) { + const handleDownload = () => { + const element = document.createElement('a'); + const file = new Blob([content], { type: 'text/markdown' }); + element.href = URL.createObjectURL(file); + element.download = fileName.replace(/\.[^/.]+$/, "") + ".md"; + document.body.appendChild(element); + element.click(); + document.body.removeChild(element); + }; + + return ( + + ); +} + +export default DownloadButton; diff --git a/web-ui/src/components/FileUpload.js b/web-ui/src/components/FileUpload.js new file mode 100644 index 0000000..7d69922 --- /dev/null +++ b/web-ui/src/components/FileUpload.js @@ -0,0 +1,18 @@ +import React from 'react'; + +function FileUpload({ onFileUpload }) { + const handleFileChange = (event) => { + const file = event.target.files[0]; + if (file) { + onFileUpload(file); + } + }; + + return ( +
+ +
+ ); +} + +export default FileUpload; diff --git a/web-ui/src/components/MarkdownPreview.js b/web-ui/src/components/MarkdownPreview.js new file mode 100644 index 0000000..757bf7b --- /dev/null +++ b/web-ui/src/components/MarkdownPreview.js @@ -0,0 +1,12 @@ +import React from 'react'; +import ReactMarkdown from 'react-markdown'; + +function MarkdownPreview({ content }) { + return ( +
+ {content} +
+ ); +} + +export default MarkdownPreview;