From dd4bc66b12c400c4cf0c59ede0efef7c14908e5c Mon Sep 17 00:00:00 2001
From: sallyannarize
Date: Tue, 12 Aug 2025 00:28:42 -0600
Subject: [PATCH] parse messages

---
 .../instrumentation/langchain/_tracer.py | 96 ++++++++++++++++++-
 1 file changed, 95 insertions(+), 1 deletion(-)

diff --git a/python/instrumentation/openinference-instrumentation-langchain/src/openinference/instrumentation/langchain/_tracer.py b/python/instrumentation/openinference-instrumentation-langchain/src/openinference/instrumentation/langchain/_tracer.py
index 68ce915f01..c2cef908fa 100644
--- a/python/instrumentation/openinference-instrumentation-langchain/src/openinference/instrumentation/langchain/_tracer.py
+++ b/python/instrumentation/openinference-instrumentation-langchain/src/openinference/instrumentation/langchain/_tracer.py
@@ -360,6 +360,84 @@ def _as_output(values: Iterable[str]) -> Iterator[Tuple[str, str]]:
     return zip((OUTPUT_VALUE, OUTPUT_MIME_TYPE), values)
 
 
+def _extract_content_from_messages(messages: Any) -> Optional[str]:
+    """
+    Extract content strings from LangGraph/LangChain messages.
+
+    Args:
+        messages: List of message objects or similar structure
+
+    Returns:
+        str: Content of the last (most recent) message, or None if no content is found
+    """
+    if not messages:
+        return None
+
+    contents = []
+
+    # Handle list of messages
+    if isinstance(messages, list):
+        for message in messages:
+            content = _extract_single_message_content(message)
+            if content:
+                contents.append(content)
+    else:
+        # Handle single message
+        content = _extract_single_message_content(messages)
+        if content:
+            return content
+
+    # For multiple messages, return the content of the last (most recent) message
+    if contents:
+        # If there's only one content, return it directly
+        if len(contents) == 1:
+            return contents[0]
+        # For multiple contents, return the last one (most recent message)
+        return contents[-1]
+
+    return None
+
+
+def _extract_single_message_content(message: Any) -> Optional[str]:
+    """
+    Extract content from a single message object.
+
+    Args:
+        message: A single message object (BaseMessage, dict, or string representation)
+
+    Returns:
+        str: The content of the message, or None if no content found
+    """
+    if isinstance(message, str):
+        # Handle string representations like "content='...' additional_kwargs={} response_metadata={}"
+        content_match = re.search(r"content='([^']*)'", message)
+        if content_match:
+            return content_match.group(1)
+        # Handle string representations with double quotes
+        content_match = re.search(r'content="([^"]*)"', message)
+        if content_match:
+            return content_match.group(1)
+        return message
+
+    # Handle BaseMessage objects
+    if hasattr(message, "content"):
+        return str(message.content) if message.content else None
+
+    # Handle dict representations
+    if isinstance(message, dict):
+        # Direct content key
+        if "content" in message:
+            return str(message["content"]) if message["content"] else None
+
+        # Nested in kwargs (LangChain message format)
+        if "kwargs" in message and isinstance(message["kwargs"], dict):
+            kwargs = message["kwargs"]
+            if "content" in kwargs:
+                return str(kwargs["content"]) if kwargs["content"] else None
+
+    return None
+
+
 def _convert_io(obj: Optional[Mapping[str, Any]]) -> Iterator[str]:
     """
     Convert input/output data to appropriate string representation for OpenInference spans.
@@ -369,7 +447,8 @@ def _convert_io(obj: Optional[Mapping[str, Any]]) -> Iterator[str]:
     2. Single string values: return the string directly (performance optimization, no MIME type)
     3. Single input/output key with non-string: use custom JSON formatting via _json_dumps
        - Conditional MIME type: only for structured data (objects/arrays), not primitives
-    4. Multiple keys or other cases: use _json_dumps for consistent formatting
+    4. Special handling for LangGraph messages: extract content from message objects
+    5. Multiple keys or other cases: use _json_dumps for consistent formatting
        - Always includes JSON MIME type since these are always structured objects
 
     Args:
@@ -395,6 +474,13 @@ def _convert_io(obj: Optional[Mapping[str, Any]]) -> Iterator[str]:
 
     key = next(iter(obj.keys()))
 
+    # Special handling for LangGraph messages: extract content
+    if key == "messages" and value:
+        content = _extract_content_from_messages(value)
+        if content:
+            yield content
+            return
+
     # Special handling for input/output keys: use custom JSON formatting
     # that preserves readability and handles edge cases like NaN values
     if key in ("input", "output"):
@@ -412,6 +498,14 @@ def _convert_io(obj: Optional[Mapping[str, Any]]) -> Iterator[str]:
             yield OpenInferenceMimeTypeValues.JSON.value
         return
 
+    # Special handling for multi-key objects that contain messages
+    if "messages" in obj and obj["messages"]:
+        content = _extract_content_from_messages(obj["messages"])
+        if content:
+            # Return just the extracted content, regardless of other keys
+            yield content
+            return
+
     # Default case: multiple keys or non-input/output keys
     # These are always complex structured objects, so always include JSON MIME type
    # Use _json_dumps for consistent formatting across all paths
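
A minimal usage sketch of the new helpers, assuming this patch is applied and the package is importable from the branch; the helpers are private and the sample inputs are illustrative only:

    # Illustrative usage only; exercises the helpers added in this patch.
    from openinference.instrumentation.langchain._tracer import (
        _extract_content_from_messages,
        _extract_single_message_content,
    )

    # Dict-shaped message (serialized LangChain form): content is read from "kwargs".
    assert _extract_single_message_content({"kwargs": {"content": "hello"}}) == "hello"

    # String repr of a message: content is pulled out with a regex.
    repr_msg = "content='hi there' additional_kwargs={} response_metadata={}"
    assert _extract_single_message_content(repr_msg) == "hi there"

    # List of messages: the content of the last (most recent) message is returned.
    assert _extract_content_from_messages([{"content": "first"}, {"content": "second"}]) == "second"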