    InputSectionsDataType,
    get_output_format_for_investigation,
    is_response_an_incorrect_tool_call,
+    process_response_into_sections,
)
from holmes.core.performance_timing import PerformanceTiming
from holmes.utils.tags import format_tags_in_string, parse_messages_tags
@@ -37,6 +38,14 @@ class ToolCallResult(BaseModel):
    result: str
    size: Optional[int] = None

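+    # Format this result as an OpenAI-style "tool" message so it can be appended to the chat history.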
+    def as_dict(self):
+        return {
+            "tool_call_id": self.tool_call_id,
+            "role": "tool",
+            "name": self.tool_name,
+            "content": self.result,
+        }
+

class LLMResult(BaseModel):
    tool_calls: Optional[List[ToolCallResult]] = None
@@ -357,6 +366,120 @@ def truncate_messages_to_fit_context(
            message["content"] = message["content"][:tool_size]
        return messages

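+    # Stream the tool-calling loop as JSON-encoded events: "start_tool_calling",
+    # "tool_calling_result", "ai_answer", "instructions", and "error".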
+    def call_stream(
+        self,
+        system_prompt: str,
+        user_prompt: Optional[str] = None,
+        response_format: Optional[Union[dict, Type[BaseModel]]] = None,
+        runbooks: List[str] = None,
+    ):
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ]
+        perf_timing = PerformanceTiming("tool_calling_llm.call")
+        tool_calls: List[ToolCallResult] = []
+        tools = self.tool_executor.get_all_tools_openai_format()
+        perf_timing.measure("get_all_tools_openai_format")
+        i = 0
+
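+        # Alternate LLM completions and tool executions until the model answers
+        # without requesting tools, or the step budget runs out.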
+        while i < self.max_steps:
+            i += 1
+            perf_timing.measure(f"start iteration {i}")
+            logging.debug(f"running iteration {i}")
+
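+            # Once the step budget is nearly exhausted, stop offering tools so the model must produce a final answer.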
+            tools = [] if i == self.max_steps - 1 else tools
+            tool_choice = None if tools == [] else "auto"
+
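+            # Make sure the conversation plus the expected output still fits the model's context window.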
+            total_tokens = self.llm.count_tokens_for_message(messages)
+            max_context_size = self.llm.get_context_window_size()
+            maximum_output_token = self.llm.get_maximum_output_token()
+            perf_timing.measure("count tokens")
+
+            if (total_tokens + maximum_output_token) > max_context_size:
+                logging.warning("Token limit exceeded. Truncating tool responses.")
+                messages = self.truncate_messages_to_fit_context(
+                    messages, max_context_size, maximum_output_token
+                )
+                perf_timing.measure("truncate_messages_to_fit_context")
+
+            logging.debug(f"sending messages={messages}\n\ntools={tools}")
+            try:
+                full_response = self.llm.completion(
+                    messages=parse_messages_tags(messages),
+                    tools=tools,
+                    tool_choice=tool_choice,
+                    temperature=0.00000001,
+                    response_format=response_format,
+                    stream=False,
+                    drop_params=True,
+                )
+                perf_timing.measure("llm.completion")
+
+            # catch a known error that occurs with Azure and replace the error message with something more obvious to the user
+            except BadRequestError as e:
+                if "Unrecognized request arguments supplied: tool_choice, tools" in str(
+                    e
+                ):
+                    yield json.dumps(
+                        {
+                            "type": "error",
+                            "details": {
+                                "msg": "The Azure model you chose is not supported. Model version 1106 and higher required."
+                            },
+                        }
+                    )
+                    return
+                raise
+            except Exception:
+                raise
+
+            response_message = full_response.choices[0].message
+            tools_to_call = getattr(response_message, "tool_calls", None)
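+            # No tool calls requested: the model has produced its final answer.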
+            if not tools_to_call:
+                (text_response, _) = process_response_into_sections(
+                    response_message.content
+                )
+                yield json.dumps(
+                    {"type": "ai_answer", "details": {"answer": text_response}}
+                )
+                if runbooks:
+                    yield json.dumps(
+                        {
+                            "type": "instructions",
+                            "details": {"instructions": json.dumps(runbooks)},
+                        }
+                    )
+                return
+
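+            # Record the assistant message (including its tool call requests) in the conversation before running the tools.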
+            messages.append(
+                response_message.model_dump(
+                    exclude_defaults=True, exclude_unset=True, exclude_none=True
+                )
+            )
+
+            perf_timing.measure("pre-tool-calls")
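+            # Run the requested tools in parallel, emitting a "start_tool_calling" event as each one is submitted.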
+            with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
+                futures = []
+                for t in tools_to_call:
+                    futures.append(executor.submit(self._invoke_tool, t))
+                    yield json.dumps(
+                        {
+                            "type": "start_tool_calling",
+                            "details": {"tool_name": t.function.name, "id": t.id},
+                        }
+                    )
+
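+                # As each tool finishes, append its result to the conversation and emit a "tool_calling_result" event.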
+                for future in concurrent.futures.as_completed(futures):
+                    tool_call_result: ToolCallResult = future.result()
+                    tool_calls.append(tool_call_result)
+                    tool_call_dict = tool_call_result.as_dict()
+                    messages.append(tool_call_dict)
+                    perf_timing.measure(f"tool completed {tool_call_result.tool_name}")
+                    yield json.dumps(
+                        {"type": "tool_calling_result", "details": tool_call_dict}
+                    )
+

# TODO: consider getting rid of this entirely and moving templating into the cmds in holmes.py
class IssueInvestigator(ToolCallingLLM):
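
Below is a minimal consumer sketch (illustrative only, not part of the commit): call_stream yields JSON strings, so a caller decodes each one and dispatches on its "type" field. The llm argument and the print_investigation helper are hypothetical names.

    import json

    def print_investigation(llm, system_prompt: str, user_prompt: str) -> None:
        # Each yielded item is a JSON-encoded event; dispatch on its "type".
        for raw_event in llm.call_stream(system_prompt, user_prompt):
            event = json.loads(raw_event)
            details = event["details"]
            if event["type"] == "start_tool_calling":
                print(f"running tool {details['tool_name']} ({details['id']})")
            elif event["type"] == "tool_calling_result":
                print(f"tool {details['name']} finished")
            elif event["type"] == "ai_answer":
                print(details["answer"])
            elif event["type"] == "instructions":
                print(f"runbooks: {details['instructions']}")
            elif event["type"] == "error":
                print(f"error: {details['msg']}")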