Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: adding support for images inside docx #277

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -714,6 +714,29 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
result = mammoth.convert_to_html(docx_file, style_map=style_map)
html_content = result.value
result = self._convert(html_content)

# Extract any base64 encoded images from the HTML
descriptions = []
if kwargs.get("llm_client") and kwargs.get("llm_model"):
for match in re.finditer(r'data:image/[^;]+;base64,([^"\']+)', html_content):
img_converter = ImageConverter()
descriptions.append(img_converter.convert_from_base64(match.group(1),'.png',**kwargs))

# Replace each base64 image with its description
if descriptions and result:
text_content = result.text_content

# Find all base64 image markdown patterns
base64_pattern = r'!\[[\s\S]*?\]\(data:image/[a-z]+;base64.*?\)'

# Collect every occurrence of that pattern in the converted markdown
matches = list(re.finditer(base64_pattern, text_content))

# Replace each match with corresponding description
for i, match in enumerate(matches):
if i < len(descriptions):
text_content = text_content.replace(match.group(), f'[Image description {i}] \n{descriptions[i]}\n[End Image description {i}]')
result.text_content = text_content

return result

Expand Down Expand Up @@ -1114,6 +1137,59 @@ def _get_llm_description(self, local_path, extension, client, model, prompt=None

response = client.chat.completions.create(model=model, messages=messages)
return response.choices[0].message.content

def _get_llm_description_from_base64(
    self,
    base64_str: str,
    extension: str,
    client: Any,
    model: str,
    prompt: Optional[str] = None,
) -> str:
    """Ask an LLM to describe a base64-encoded image.

    Args:
        base64_str: The image as base64 text, either the bare payload or a
            full ``data:<mime>;base64,<payload>`` URI.
        extension: File extension (for example ``".png"``) used to guess the
            MIME type when ``base64_str`` does not declare one itself.
        client: Object exposing
            ``chat.completions.create(model=..., messages=...)``.
        model: Model identifier forwarded unchanged to ``client``.
        prompt: Instruction sent along with the image. A default caption
            request is used when this is ``None`` or blank.

    Returns:
        ``response.choices[0].message.content`` taken from the client's reply.
    """
    if prompt is None or prompt.strip() == "":
        prompt = "Write a detailed caption for this image."

    # Strip a data-URI prefix if present, but remember the MIME type it
    # declares: it describes the actual payload, whereas ``extension`` is
    # only the caller's hint.
    declared_type = None
    header, separator, payload = base64_str.partition(",")
    if separator:
        base64_str = payload
        if header.lower().startswith("data:"):
            declared_type = header[len("data:"):].split(";", 1)[0].strip() or None

    # Fall back to guessing from the extension, then to JPEG.
    content_type = declared_type
    if content_type is None:
        content_type, _encoding = mimetypes.guess_type("_dummy" + (extension or ""))
    if content_type is None:
        content_type = "image/jpeg"

    data_uri = f"data:{content_type};base64,{base64_str}"
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": data_uri,
                    },
                },
            ],
        }
    ]

    response = client.chat.completions.create(model=model, messages=messages)
    return response.choices[0].message.content

def convert_from_base64(
    self,
    base64_str: str,
    extension: str,
    **kwargs: Any
) -> Optional[str]:
    """Describe a base64-encoded image using the configured LLM.

    Args:
        base64_str: The image as base64 text (bare payload or data URI).
        extension: File extension hint (for example ``".png"``) passed on
            to ``_get_llm_description_from_base64``.
        **kwargs: Reads ``llm_client``, ``llm_model`` and, optionally,
            ``llm_prompt``; other keys are ignored.

    Returns:
        Whatever ``_get_llm_description_from_base64`` returns (the model's
        description text), or ``None`` when ``llm_client`` or ``llm_model``
        is not supplied.
    """
    client = kwargs.get("llm_client")
    model = kwargs.get("llm_model")
    # Without both a client and a model no request can be made; report
    # "no result" instead of failing on an attribute lookup on None.
    if client is None or model is None:
        return None

    prompt = kwargs.get("llm_prompt")
    return self._get_llm_description_from_base64(base64_str, extension, client, model, prompt)


class OutlookMsgConverter(DocumentConverter):
Expand Down