From 38a3a2034a0c15e2db50102afc6c923dc1aefa27 Mon Sep 17 00:00:00 2001
From: Aaryan Verma
Date: Thu, 24 Jul 2025 18:20:17 +0530
Subject: [PATCH] Update _llm_caption.py

Update _llm_caption.py to support LangChain wrapper clients for
OpenAI/AzureOpenAI when generating captions.
---
 .../src/markitdown/converters/_llm_caption.py | 58 +++++++++++++------
 1 file changed, 40 insertions(+), 18 deletions(-)

diff --git a/packages/markitdown/src/markitdown/converters/_llm_caption.py b/packages/markitdown/src/markitdown/converters/_llm_caption.py
index 004a47ae..ec0c5f5a 100644
--- a/packages/markitdown/src/markitdown/converters/_llm_caption.py
+++ b/packages/markitdown/src/markitdown/converters/_llm_caption.py
@@ -29,22 +29,44 @@ def llm_caption(
     # Prepare the data-uri
     data_uri = f"data:{content_type};base64,{base64_image}"
 
-    # Prepare the OpenAI API request
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "text", "text": prompt},
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": data_uri,
-                    },
-                },
-            ],
-        }
-    ]
+    # Check whether the client is a LangChain wrapper for OpenAI/AzureOpenAI or the original OpenAI client
+    client_type = type(client).__module__
 
-    # Call the OpenAI API
-    response = client.chat.completions.create(model=model, messages=messages)
-    return response.choices[0].message.content
+    # Prepare the LangChain OpenAI/AzureOpenAI wrapper request
+    if "langchain_openai" in client_type or hasattr(client, "invoke"):
+        content = [
+            {"type": "text", "text": prompt},
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": data_uri,
+                },
+            },
+        ]
+        messages = {"role": "user", "content": content}
+        try:
+            response = client.invoke([messages])
+            return response.content
+        except Exception:
+            return None
+
+    else:
+        # Prepare the OpenAI API request
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt},
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": data_uri,
+                        },
+                    },
+                ],
+            }
+        ]
+
+        # Call the OpenAI API
+        response = client.chat.completions.create(model=model, messages=messages)
+        return response.choices[0].message.content