    InputSectionsDataType,
    get_output_format_for_investigation,
    is_response_an_incorrect_tool_call,
+    process_response_into_sections,
)
from holmes.core.performance_timing import PerformanceTiming
from holmes.utils.tags import format_tags_in_string, parse_messages_tags
@@ -37,6 +38,14 @@ class ToolCallResult(BaseModel):
    result: str
    size: Optional[int] = None

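+    # Format this result as an OpenAI-style "tool" message so it can be appended to the chat history.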
+    def as_dict(self):
+        return {
+            "tool_call_id": self.tool_call_id,
+            "role": "tool",
+            "name": self.tool_name,
+            "content": self.result,
+        }
+

class LLMResult(BaseModel):
    tool_calls: Optional[List[ToolCallResult]] = None
@@ -357,6 +366,120 @@ def truncate_messages_to_fit_context(
            message["content"] = message["content"][:tool_size]
        return messages

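+    # Stream the tool-calling loop as JSON-encoded events: "start_tool_calling",
+    # "tool_calling_result", "ai_answer", "instructions", and "error".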
+    def call_stream(
+        self,
+        system_prompt: str,
+        user_prompt: Optional[str] = None,
+        response_format: Optional[Union[dict, Type[BaseModel]]] = None,
+        runbooks: List[str] = None,
+    ):
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ]
+        perf_timing = PerformanceTiming("tool_calling_llm.call")
+        tool_calls: List[ToolCallResult] = []
+        tools = self.tool_executor.get_all_tools_openai_format()
+        perf_timing.measure("get_all_tools_openai_format")
+        i = 0
+
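+        # Alternate LLM completions and tool executions until the model answers
+        # without requesting tools, or the step budget runs out.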
+        while i < self.max_steps:
+            i += 1
+            perf_timing.measure(f"start iteration {i}")
+            logging.debug(f"running iteration {i}")
+
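+            # Once the step budget is nearly exhausted, stop offering tools so the model must produce a final answer.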
+            tools = [] if i == self.max_steps - 1 else tools
+            tool_choice = None if tools == [] else "auto"
+
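+            # Make sure the conversation plus the expected output still fits the model's context window.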
+            total_tokens = self.llm.count_tokens_for_message(messages)
+            max_context_size = self.llm.get_context_window_size()
+            maximum_output_token = self.llm.get_maximum_output_token()
+            perf_timing.measure("count tokens")
+
+            if (total_tokens + maximum_output_token) > max_context_size:
+                logging.warning("Token limit exceeded. Truncating tool responses.")
+                messages = self.truncate_messages_to_fit_context(
+                    messages, max_context_size, maximum_output_token
+                )
+                perf_timing.measure("truncate_messages_to_fit_context")
+
+            logging.debug(f"sending messages={messages}\n\ntools={tools}")
+            try:
+                full_response = self.llm.completion(
+                    messages=parse_messages_tags(messages),
+                    tools=tools,
+                    tool_choice=tool_choice,
+                    temperature=0.00000001,
+                    response_format=response_format,
+                    stream=False,
+                    drop_params=True,
+                )
+                perf_timing.measure("llm.completion")
+
+            # catch a known error that occurs with Azure and replace the error message with something more obvious to the user
+            except BadRequestError as e:
+                if "Unrecognized request arguments supplied: tool_choice, tools" in str(
+                    e
+                ):
+                    yield json.dumps(
+                        {
+                            "type": "error",
+                            "details": {
+                                "msg": "The Azure model you chose is not supported. Model version 1106 and higher required."
+                            },
+                        }
+                    )
+                    return
+                raise
+            except Exception:
+                raise
+
+            response_message = full_response.choices[0].message
+            tools_to_call = getattr(response_message, "tool_calls", None)
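+            # No tool calls requested: the model has produced its final answer.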
+            if not tools_to_call:
+                (text_response, _) = process_response_into_sections(
+                    response_message.content
+                )
+                yield json.dumps(
+                    {"type": "ai_answer", "details": {"answer": text_response}}
+                )
+                if runbooks:
+                    yield json.dumps(
+                        {
+                            "type": "instructions",
+                            "details": {"instructions": json.dumps(runbooks)},
+                        }
+                    )
+                return
+
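+            # Record the assistant message (including its tool call requests) in the conversation before running the tools.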
+            messages.append(
+                response_message.model_dump(
+                    exclude_defaults=True, exclude_unset=True, exclude_none=True
+                )
+            )
+
+            perf_timing.measure("pre-tool-calls")
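+            # Run the requested tools in parallel, emitting a "start_tool_calling" event as each one is submitted.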
+            with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
+                futures = []
+                for t in tools_to_call:
+                    futures.append(executor.submit(self._invoke_tool, t))
+                    yield json.dumps(
+                        {
+                            "type": "start_tool_calling",
+                            "details": {"tool_name": t.function.name, "id": t.id},
+                        }
+                    )
+
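+                # As each tool finishes, append its result to the conversation and emit a "tool_calling_result" event.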
+                for future in concurrent.futures.as_completed(futures):
+                    tool_call_result: ToolCallResult = future.result()
+                    tool_calls.append(tool_call_result)
+                    tool_call_dict = tool_call_result.as_dict()
+                    messages.append(tool_call_dict)
+                    perf_timing.measure(f"tool completed {tool_call_result.tool_name}")
+                    yield json.dumps(
+                        {"type": "tool_calling_result", "details": tool_call_dict}
+                    )
+

# TODO: consider getting rid of this entirely and moving templating into the cmds in holmes.py
class IssueInvestigator(ToolCallingLLM):
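
Below is a minimal consumer sketch (illustrative only, not part of the commit): call_stream yields JSON strings, so a caller decodes each one and dispatches on its "type" field. The llm argument and the print_investigation helper are hypothetical names.

    import json

    def print_investigation(llm, system_prompt: str, user_prompt: str) -> None:
        # Each yielded item is a JSON-encoded event; dispatch on its "type".
        for raw_event in llm.call_stream(system_prompt, user_prompt):
            event = json.loads(raw_event)
            details = event["details"]
            if event["type"] == "start_tool_calling":
                print(f"running tool {details['tool_name']} ({details['id']})")
            elif event["type"] == "tool_calling_result":
                print(f"tool {details['name']} finished")
            elif event["type"] == "ai_answer":
                print(details["answer"])
            elif event["type"] == "instructions":
                print(f"runbooks: {details['instructions']}")
            elif event["type"] == "error":
                print(f"error: {details['msg']}")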