Commit d5566a1

Add max_concurrency config option for agent llm calls (#79)
1 parent aa2f476 commit d5566a1

File tree

7 files changed: +42, -20 lines changed

README.md (13 additions, 5 deletions)

````diff
@@ -70,8 +70,9 @@ limitations under the License.
   - [Resetting the entire cache](#resetting-the-entire-cache)
   - [Resetting just the LLM cache or the services cache](#resetting-just-the-llm-cache-or-the-services-cache)
 - [Vector databases](#vector-databases)
-- [Service outages](#service-outages)
+- [Service errors](#service-errors)
   - [National Vulnerability Database (NVD)](#national-vulnerability-database-nvd)
+  - [NVIDIA API Catalog / NVIDIA-hosted NIMs](#nvidia-api-catalog--nvidia-hosted-nims)
   - [Running out of credits](#running-out-of-credits)
 - [Testing and validation](#testing-and-validation)
 - [License](#license)
@@ -506,7 +507,7 @@ The configuration defines how the workflow operates, including model settings, i
 2. **LLM engine configuration (`engine`)**: The `engine` section configures various models for the LLM nodes.
    - LLM processing nodes: `agent`, `checklist_model`, `justification_model`, `summary_model`
      - `model_name`: The name of the LLM model used by the node.
-     - `prompt`: Manually set the prompt for the specific model in the configuration. The prompt can either be passed in as a string of text or as a path to a text file containing the desired prompting.
+     - `prompt`: Manually set the prompt for the specific model in the configuration. The prompt can either be passed in as a string of text or as a path to a text file containing the desired prompting.
      - `service`: Specifies the service for running the LLM inference. (Set to `nvfoundation` if using NIM.)
      - `max_tokens`: Defines the maximum number of tokens that can be generated in one output step.
      - `temperature`: Controls randomness in the output. A lower temperature produces more deterministic results.
@@ -516,6 +517,7 @@ The configuration defines how the workflow operates, including model settings, i
      - `return_intermediate_steps`: Controls whether to return intermediate steps taken by the agent, and include them in the output file. Helpful for troubleshooting agent responses.
      - `return_source_documents`: Controls whether to return source documents from the VDB tools, and include them in the intermediate steps output. Helpful for identifying the source files used in agent responses.
        - Note: enabling this will also include source documents in the agent's memory and increase the agent's prompt length.
+     - `max_concurrency`: Controls the maximum number of concurrent requests to the LLM. Default is `None`, which doesn't limit concurrency.
    - Embedding model for generating VDB for RAG: `rag_embedding`
      - `_type`: Defines the source of the model used for generating embeddings (e.g., `nim`, `huggingface`, `openai`).
      - Other model-dependent parameters, such as `model`/`model_name`, `api_key`, `truncate`, or `encode_kwargs`: see the [embedding model customization](#customizing-the-embedding-model) section below for more details.
@@ -725,8 +727,7 @@ To customize the output, modify the configuration file accordingly. In any confi
 }
 ```
 
-To post the output to an HTTP endpoint, update the JSON object in the config file as follows, replacing the domain, port, and endpoint with the desired
-destination (note the trailing slash in the "url" field). The output will be sent as JSON data.
+To post the output to an HTTP endpoint, update the JSON object in the config file as follows, replacing the domain, port, and endpoint with the desired destination (note the trailing slash in the "url" field). The output will be sent as JSON data.
 
 ```
 "output": {
@@ -853,7 +854,7 @@ We've integrated VDB and embedding creation directly into the pipeline with cach
 
 NVIDIA offers optimized models and tools like NIMs ([build.nvidia.com/explore/retrieval](https://build.nvidia.com/explore/retrieval)) and cuVS ([github.com/rapidsai/cuvs](https://github.com/rapidsai/cuvs)).
 
-### Service outages
+### Service errors
 
 #### National Vulnerability Database (NVD)
 These typically resolve on their own. Please wait and try running the pipeline again later. Example errors:
@@ -868,6 +869,13 @@ Error requesting [1/10]: (Retry 0.1 sec) https://services.nvd.nist.gov/rest/json
 Error requesting [1/10]: (Retry 0.1 sec) https://services.nvd.nist.gov/rest/json/cves/2.0: 503, message='Service Unavailable', url=URL('https://services.nvd.nist.gov/rest/json/cves/2.0?cveId=CVE-2023-50447')
 ```
 
+#### NVIDIA API Catalog / NVIDIA-hosted NIMs
+
+429 errors can occur when your requests exceed the rate limit for the model. Try setting the `engine.agent.max_concurrency` to a low value such as 5 to reduce the rate of requests.
+```
+Exception: [429] Too Many Requests
+```
+
 ### Running out of credits
 
 If you run out of credits for the NVIDIA API Catalog, you will need to obtain more credits to continue using the API. Please contact your NVIDIA representative to get more credits added.
````
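Following the troubleshooting advice added above, capping the agent at five in-flight requests is a one-line config change. An illustrative fragment of the `agent` block (field layout mirrors the `configs/*.json` files after this commit; surrounding keys omitted):

```json
"agent": {
    "verbose": false,
    "return_intermediate_steps": false,
    "return_source_documents": false,
    "max_concurrency": 5
}
```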

configs/from_file.json (4 additions, 1 deletion)

```diff
@@ -12,7 +12,10 @@
     "top_p": 0.01,
     "seed": 42
   },
-  "verbose": false
+  "verbose": false,
+  "return_intermediate_steps": false,
+  "return_source_documents": false,
+  "max_concurrency": null
 },
 "checklist_model": {
   "service": {
```

configs/from_http.json (4 additions, 1 deletion)

```diff
@@ -12,7 +12,10 @@
     "top_p": 0.01,
     "seed": 42
   },
-  "verbose": false
+  "verbose": false,
+  "return_intermediate_steps": false,
+  "return_source_documents": false,
+  "max_concurrency": null
 },
 "checklist_model": {
   "service": {
```

configs/from_manual.json (2 additions, 1 deletion)

```diff
@@ -14,7 +14,8 @@
   },
   "verbose": false,
   "return_intermediate_steps": false,
-  "return_source_documents": false
+  "return_source_documents": false,
+  "max_concurrency": null
 },
 "checklist_model": {
   "service": {
```
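All three config diffs add `"max_concurrency": null`, which deserializes to Python `None`, the "no limit" default. A quick standalone check (not repo code):

```python
import json

# JSON null maps to Python None, which the agent node treats as "no limit".
agent_cfg = json.loads('{"verbose": false, "max_concurrency": null}')
unlimited = agent_cfg["max_concurrency"] is None
```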

src/cve/data_models/config.py (1 addition, 0 deletions)

```diff
@@ -242,6 +242,7 @@ class EngineAgentConfig(BaseModel):
     verbose: bool = False
     return_intermediate_steps: bool = False
     return_source_documents: bool = False
+    max_concurrency: int | None = None
 
 
 class EngineConfig(BaseModel):
```

src/cve/nodes/cve_langchain_agent_node.py (16 additions, 10 deletions)

```diff
@@ -13,13 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 import asyncio
 import logging
 import typing
 
 from langchain_core.exceptions import OutputParserException
-
 from morpheus_llm.llm import LLMContext
 from morpheus_llm.llm import LLMNodeBase
 
@@ -40,21 +38,27 @@ class CVELangChainAgentNode(LLMNodeBase):
     ----------
     agent_executor : AgentExecutor
         The agent executor to use to execute.
-
-    vdb_names : tuple[str, str]
-        Name of the VDBs to load from the input.
+    replace_exceptions : bool, optional
+        Whether to replace exceptions with a default value, by default False
+    replace_exceptions_value : Optional[str], optional
+        The value to replace exceptions with, by default None
+    max_concurrency : Optional[int], optional
+        Maximum number of concurrent agent invocations. None means no limit. By default None.
     """
 
     def __init__(self,
                  *,
                  create_agent_executor_fn: "typing.Callable[[LLMContext], AgentExecutor]",
                  replace_exceptions: bool = False,
-                 replace_exceptions_value: typing.Optional[str] = None):
+                 replace_exceptions_value: typing.Optional[str] = None,
+                 max_concurrency: typing.Optional[int] = None):
         super().__init__()
 
         self._create_agent_executor_fn = create_agent_executor_fn
         self._replace_exceptions = replace_exceptions
         self._replace_exceptions_value = replace_exceptions_value
+        self._max_concurrency = max_concurrency
+        self._semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency is not None else None
 
         self._input_names = ["input"]
 
@@ -154,16 +158,18 @@ async def _run_single(self,
 
             results = await asyncio.gather(*results_async, return_exceptions=True)
 
-            # # Transform from list[dict[str, Any]] to dict[str, list[Any]]
-            # results = {k: [x[k] for x in results] for k in results[0]}
-
             return results
 
         # We are not dealing with a list, so run single
         try:
             input_single = {"input": kwargs.pop("input")}
             config = {"callbacks": agent.callbacks, "tags": agent.tags, "metadata": metadata}
-            return await agent.ainvoke(input=input_single, config=config, **kwargs)
+
+            if self._semaphore is not None:
+                async with self._semaphore:
+                    return await agent.ainvoke(input=input_single, config=config, **kwargs)
+            else:
+                return await agent.ainvoke(input=input_single, config=config, **kwargs)
         except Exception as e:
             logger.exception("Error running agent: %s", e)
             return e
```

src/cve/pipeline/engine.py (2 additions, 2 deletions)

```diff
@@ -22,7 +22,6 @@
 from langchain.chains.retrieval_qa.base import RetrievalQA
 from langchain.vectorstores.faiss import FAISS
 from langchain_core.embeddings import Embeddings
-
 from morpheus_llm.llm import LLMContext
 from morpheus_llm.llm import LLMEngine
 from morpheus_llm.llm.nodes.extracter_node import ManualExtracterNode
@@ -213,7 +212,8 @@ def build_engine(*, run_config: RunConfig, embeddings: Embeddings):
                     node=CVELangChainAgentNode(
                         create_agent_executor_fn=_build_dynamic_agent_fn(run_config, embeddings),
                         replace_exceptions=True,
-                        replace_exceptions_value="I do not have a definitive answer for this checklist item."))
+                        replace_exceptions_value="I do not have a definitive answer for this checklist item.",
+                        max_concurrency=run_config.engine.agent.max_concurrency))
 
     engine.add_node('summary',
                     inputs=[("/checklist", "checklist_inputs"), ("/agent/outputs", "checklist_outputs"),
```
