From 164ad6ab8d731213fba80496879a9ec5ee51ec06 Mon Sep 17 00:00:00 2001 From: Austin Swinney Date: Wed, 18 Mar 2026 14:06:06 -0500 Subject: [PATCH 1/2] Add vLLM provider and infrastructure support Introduce a new vLLM provider enabling self-hosted LLM inference as a first-class deployment option in archi. This includes: - vLLM provider implementation with OpenAI-compatible API interface - Docker service template for vllm-server with configurable engine args - CLI integration: service registry, template manager, and compose generation - Example deployment config (basic-vllm) and GPU config example - Documentation for vLLM setup and usage - Unit tests and smoke tests for the vLLM provider - Include source URL in retriever tool output Co-Authored-By: Claude Opus 4.6 (1M context) --- .gitignore | 1 + docs/docs/user_guide.md | 568 ++++++++++++++++++ docs/docs/vllm.md | 320 ++++++++++ docs/mkdocs.yml | 2 + examples/deployments/basic-gpu/config.yaml | 57 ++ .../deployments/basic-vllm/condense.prompt | 12 + examples/deployments/basic-vllm/config.yaml | 55 ++ .../deployments/basic-vllm/miscellanea.list | 39 ++ examples/deployments/basic-vllm/qa.prompt | 18 + openspec/changes/add-vllm-provider/design.md | 30 + .../changes/add-vllm-provider/proposal.md | 14 + .../specs/vllm-provider/spec.md | 56 ++ .../specs/vllm-server/spec.md | 55 ++ openspec/changes/add-vllm-provider/tasks.md | 24 + src/archi/pipelines/agents/tools/retriever.py | 3 + src/archi/providers/__init__.py | 6 +- src/archi/providers/base.py | 5 +- src/archi/providers/vllm_provider.py | 172 ++++++ src/cli/managers/templates_manager.py | 26 + src/cli/service_registry.py | 11 + src/cli/templates/base-compose.yaml | 86 ++- src/cli/utils/service_builder.py | 3 + src/interfaces/chat_app/app.py | 1 - tests/smoke/combined_smoke.sh | 10 + tests/smoke/preflight.py | 20 +- tests/smoke/vllm_smoke.py | 122 ++++ tests/unit/test_vllm_provider.py | 205 +++++++ 27 files changed, 1908 insertions(+), 13 deletions(-) create mode 
100644 docs/docs/vllm.md create mode 100644 examples/deployments/basic-gpu/config.yaml create mode 100644 examples/deployments/basic-vllm/condense.prompt create mode 100644 examples/deployments/basic-vllm/config.yaml create mode 100644 examples/deployments/basic-vllm/miscellanea.list create mode 100644 examples/deployments/basic-vllm/qa.prompt create mode 100644 openspec/changes/add-vllm-provider/design.md create mode 100644 openspec/changes/add-vllm-provider/proposal.md create mode 100644 openspec/changes/add-vllm-provider/specs/vllm-provider/spec.md create mode 100644 openspec/changes/add-vllm-provider/specs/vllm-server/spec.md create mode 100644 openspec/changes/add-vllm-provider/tasks.md create mode 100644 src/archi/providers/vllm_provider.py create mode 100644 tests/smoke/vllm_smoke.py create mode 100644 tests/unit/test_vllm_provider.py diff --git a/.gitignore b/.gitignore index 8386cf17a..50c17c208 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ docs/site setup.sh .env configs/ +config/ build/ src/cli/utils/_repository_info.py openspec/specs/ diff --git a/docs/docs/user_guide.md b/docs/docs/user_guide.md index c512520e7..fa9335d98 100644 --- a/docs/docs/user_guide.md +++ b/docs/docs/user_guide.md @@ -99,6 +99,7 @@ See the [Configuration Reference](configuration.md) for the full YAML schema and --- +<<<<<<< HEAD ## Secrets Secrets are stored in a `.env` file passed via `--env-file`. Required secrets depend on your deployment: @@ -116,6 +117,573 @@ Secrets are stored in a `.env` file passed via `--env-file`. Required secrets de | `REDMINE_USER` / `REDMINE_PW` | Redmine source | See [Data Sources](data_sources.md) and [Services](services.md) for service-specific secrets. +======= +## Interfaces/Services + +These are the different apps that Archi supports, which allow you to interact with the AI pipelines. + +### Piazza Interface + +Set up Archi to read posts from your Piazza forum and post draft responses to a specified Slack channel. 
To do this, a Piazza login (email and password) is required, plus the network ID of your Piazza channel, and lastly, a Webhook for the slack channel Archi will post to. See below for a step-by-step description of this. + +1. Go to [https://api.slack.com/apps](https://api.slack.com/apps) and sign in to workspace where you will eventually want Archi to post to (note doing this in a business workspace like the MIT one will require approval of the app/bot). +2. Click 'Create New App', and then 'From scratch'. Name your app and again select the correct workspace. Then hit 'Create App' +3. Now you have your app, and there are a few things to configure before you can launch Archi: +4. Go to Incoming Webhooks under Features, and toggle it on. +5. Click 'Add New Webhook', and select the channel you want Archi to post to. +6. Now, copy the 'Webhook URL' and paste it into the secrets file, and handle it like any other secret! + +#### Configuration + +Beyond standard required configuration fields, the network ID of the Piazza channel is required (see below for an example config). You can get the network ID by simply navigating to the class homepage, and grabbing the sequence that follows 'https://piazza.com/class/'. For example, the 8.01 Fall 2024 homepage is: 'https://piazza.com/class/m0g3v0ahsqm2lg'. The network ID is thus 'm0g3v0ahsqm2lg'. + +Example minimal config for the Piazza interface: + +```yaml +name: bare_minimum_configuration #REQUIRED + +data_manager: + sources: + links: + input_lists: + - class_info.list # class info links + +archi: + [... archi config ...] + +services: + piazza: + network_id: # REQUIRED + chat_app: + trained_on: "Your class materials" # REQUIRED +``` + +#### Secrets + +The necessary secrets for deploying the Piazza service are the following: + +```bash +PIAZZA_EMAIL=... +PIAZZA_PASSWORD=... +SLACK_WEBHOOK=... +``` + +The Slack webhook secret is described above. The Piazza email and password should be those of one of the class instructors. 
Remember to put this information in files named following what is written above. + +#### Running + +To run the Piazza service, simply add the piazza flag. For example: + +```bash +archi create [...] --services=chatbot,piazza +``` + +--- + +### Redmine/Mailbox Interface + +Archi will read all new tickets in a Redmine project, and draft a response as a comment to the ticket. +Once the ticket is updated to the "Resolved" status by an admin, Archi will send the response as an email to the user who opened the ticket. +The admin can modify Archi's response before sending it out. + +#### Configuration + +```yaml +services: + redmine_mailbox: + url: https://redmine.example.com + project: my-project + redmine_update_time: 10 + mailbox_update_time: 10 + answer_tag: "-- Archi -- Resolving email was sent" +``` + +#### Secrets + +Add the following secrets to your `.env` file: +```bash +IMAP_USER=... +IMAP_PW=... +REDMINE_USER=... +REDMINE_PW=... +SENDER_SERVER=... +SENDER_PORT=587 +SENDER_REPLYTO=... +SENDER_USER=... +SENDER_PW=... +``` + +#### Running + +```bash +archi create [...] --services=chatbot,redmine-mailer +``` + +--- + +### Mattermost Interface + +Set up Archi to read posts from your Mattermost forum and post draft responses to a specified Mattermost channel. + +#### Configuration + +```yaml +services: + mattermost: + update_time: 60 +``` + +#### Secrets + +You need to specify a webhook, access token, and channel identifiers: +```bash +MATTERMOST_WEBHOOK=... +MATTERMOST_PAK=... +MATTERMOST_CHANNEL_ID_READ=... +MATTERMOST_CHANNEL_ID_WRITE=... +``` + +#### Running + +To run the Mattermost service, include it when selecting services. For example: +```bash +archi create [...] --services=chatbot,mattermost +``` + +--- + +### Grafana Interface + +Monitor the performance of your Archi instance with the Grafana interface. This service provides a web-based dashboard to visualize various metrics related to system performance, LLM usage, and more. 
+ +> Note, if you are deploying a version of Archi you have already used (i.e., you haven't removed the images/volumes for a given `--name`), the postgres will have already been created without the Grafana user created, and it will not work, so make sure to deploy a fresh instance. + +#### Configuration + +```yaml +services: + grafana: + external_port: 3000 +``` + +#### Secrets + +Grafana shares the Postgres database with other services, so you need both the database password and a Grafana-specific password: +```bash +PG_PASSWORD= +GRAFANA_PG_PASSWORD= +``` + +#### Running + +Deploy Grafana alongside your other services: +```bash +archi create [...] --services=chatbot,grafana +``` +and you should see something like this +``` +CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES +87f1c7289d29 docker.io/library/postgres:17 postgres 9 minutes ago Up 9 minutes (healthy) 5432/tcp postgres-gtesting2 +40130e8e23de docker.io/library/grafana-gtesting2:2000 9 minutes ago Up 9 minutes 0.0.0.0:3000->3000/tcp, 3000/tcp grafana-gtesting2 +d6ce8a149439 localhost/chat-gtesting2:2000 python -u archi/... 9 minutes ago Up 9 minutes 0.0.0.0:7861->7861/tcp chat-gtesting2 +``` +where the grafana interface is accessible at `your-hostname:3000`. To change the external port from `3000`, you can do this in the config at `services.grafana.external_port`. The default login and password are both "admin", which you will be prompted to change should you want to after first logging in. Navigate to the Archi dashboard from the home page by going to the menu > Dashboards > Archi > Archi Usage. Note, `your-hostname` here is the just name of the machine. Grafana uses its default configuration which is `localhost` but unlike the chat interface, there are no APIs where we template with a selected hostname, so the container networking handles this nicely. 
+ +> Pro tip: once at the web interface, for the "Recent Conversation Messages (Clean Text + Link)" panel, click the three little dots in the top right hand corner of the panel, click "Edit", and on the right, go to e.g., "Override 4" (should have Fields with name: clean text, also Override 7 for context column) and override property "Cell options > Cell value inspect". This will allow you to expand the text boxes with messages longer than can fit. Make sure you click apply to keep the changes. + +> Pro tip 2: If you want to download all of the information from any panel as a CSV, go to the same three dots and click "Inspect", and you should see the option. + +--- + +### Grader Interface + +Interface to launch a website which for a provided solution and rubric (and a couple of other things detailed below), will grade scanned images of a handwritten solution for the specified problem(s). + +> Nota bene: this is not yet fully generalized and "service" ready, but instead for testing grading pipelines and a base off of which to build a potential grading app. + +#### Requirements + +To launch the service the following files are required: + +- `users.csv`. This file is .csv file that contains two columns: "MIT email" and "Unique code", e.g.: + +``` +MIT email,Unique code +username@mit.edu,222 +``` + +For now, the system requires the emails to be in the MIT domain, namely, contain "@mit.edu". TODO: make this an argument that is passed (e.g., school/email domain) + +- `solution_with_rubric_*.txt`. These are .txt files that contain the problem solution followed by the rubric. The naming of the files should follow exactly, where the `*` is the problem number. There should be one of these files for every problem you want the app to be able to grade. 
The top of the file should be the problem name with a line of dashes ("-") below, e.g.: + +``` +Anti-Helmholtz Coils +--------------------------------------------------- +``` + +These files should live in a directory which you will pass to the config, and Archi will handle the rest. + +- `admin_password.txt`. This file will be passed as a secret and be the admin code to login in to the page where you can reset attempts for students. + +#### Secrets + +The only grading specific secret is the admin password, which like shown above, should be put in the following file + +```bash +ADMIN_PASSWORD=your_password +``` + +Then it behaves like any other secret. + +#### Configuration + +The required fields in the configuration file are different from the rest of the Archi services. Below is an example: + +```yaml +name: grading_test # REQUIRED + +archi: + pipelines: + - GradingPipeline + pipeline_map: + GradingPipeline: + prompts: + required: + final_grade_prompt: final_grade.prompt + models: + required: + final_grade_model: OllamaInterface + ImageProcessingPipeline: + prompts: + required: + image_processing_prompt: image_processing.prompt + models: + required: + image_processing_model: OllamaInterface + +services: + chat_app: + trained_on: "rubrics, class info, etc." # REQUIRED + grader_app: + num_problems: 1 # REQUIRED + local_rubric_dir: ~/grading/my_rubrics # REQUIRED + local_users_csv_dir: ~/grading/logins # REQUIRED + +data_manager: + [...] +``` + +1. `name` -- The name of your configuration (required). +2. `archi.pipelines` -- List of pipelines to use (e.g., `GradingPipeline`, `ImageProcessingPipeline`). +3. `archi.pipeline_map` -- Mapping of pipelines to their required prompts and models. +4. `archi.pipeline_map.GradingPipeline.prompts.required.final_grade_prompt` -- Path to the grading prompt file for evaluating student solutions. +5. 
`archi.pipeline_map.GradingPipeline.models.required.final_grade_model` -- Model class for grading (e.g., `OllamaInterface`, `HuggingFaceOpenLLM`). +6. `archi.pipeline_map.ImageProcessingPipeline.prompts.required.image_processing_prompt` -- Path to the prompt file for image processing. +7. `archi.pipeline_map.ImageProcessingPipeline.models.required.image_processing_model` -- Model class for image processing (e.g., `OllamaInterface`, `HuggingFaceImageLLM`). +8. `services.chat_app.trained_on` -- A brief description of the data or materials Archi is trained on (required). +9. `services.grader_app.num_problems` -- Number of problems the grading service should expect (must match the number of rubric files). +10. `services.grader_app.local_rubric_dir` -- Directory containing the `solution_with_rubric_*.txt` files. +11. `services.grader_app.local_users_csv_dir` -- Directory containing the `users.csv` file. + +For ReAct-style agents (e.g., `CMSCompOpsAgent`), you may optionally set `archi.pipeline_map..recursion_limit` (default `100`) to control the LangGraph recursion cap; when the limit is hit, the agent returns a final wrap-up response using the collected context. + +#### Running + +```bash +archi create [...] --services=grader +``` + +--- + +## Models + +Models are either: + +1. Hosted locally, either via VLLM or HuggingFace transformers. +2. Accessed via an API, e.g., OpenAI, Anthropic, etc. +3. Accessed via an Ollama server instance. + +### Local Models + +To use a local model, specify one of the local model classes in `models.py`: + +- `HuggingFaceOpenLLM` +- `HuggingFaceImageLLM` +- `VLLM` + +### vLLM + +For high-throughput GPU inference with tool-calling support, Archi can deploy a [vLLM](https://docs.vllm.ai/) server as a sidecar container. 
Reference models with the `vllm/` prefix in your config: + +```yaml +archi: + pipeline_map: + CMSCompOpsAgent: + models: + required: + agent_model: vllm/Qwen/Qwen3-8B +``` + +Deploy with `--services chatbot,vllm-server --gpu-ids all`. See the [vLLM Provider](vllm.md) page for full configuration, architecture details, and troubleshooting. + +### Models via APIs + +We support the following model classes in `models.py` for models accessed via APIs: + +- `OpenAILLM` +- `OpenRouterLLM` +- `AnthropicLLM` + +#### OpenRouter + +OpenRouter uses the OpenAI-compatible API. Configure it by setting `OpenRouterLLM` in your config and providing +`OPENROUTER_API_KEY`. Optional attribution headers can be set via `OPENROUTER_SITE_URL` and `OPENROUTER_APP_NAME`. + +```yaml +archi: + model_class_map: + OpenRouterLLM: + class: OpenRouterLLM + kwargs: + model_name: openai/gpt-4o-mini + temperature: 0.7 +``` + +### Ollama + +In order to use an Ollama server instance for the chatbot, it is possible to specify `OllamaInterface` for the model name. To then correctly use models on the Ollama server, in the keyword args, specify both the url of the server and the name of a model hosted on the server. + +```yaml +archi: + model_class_map: + OllamaInterface: + kwargs: + base_model: "gemma3" # example + url: "url-for-server" + +``` + +In this case, the `gemma3` model is hosted on the Ollama server at `url-for-server`. You can check which models are hosted on your server by going to `url-for-server/models`. + +### Bring Your Own Key (BYOK) + +Archi supports Bring Your Own Key (BYOK), allowing users to provide their own API keys for LLM providers at runtime. This enables: + +- **Cost attribution**: Users pay for their own API usage +- **Provider flexibility**: Switch between providers without admin intervention +- **Privacy**: Use personal accounts for sensitive queries + +#### Key Hierarchy + +API keys are resolved in the following order (highest priority first): + +1. 
**Environment Variables**: Admin-configured keys (e.g., `OPENAI_API_KEY`) +2. **Docker Secrets**: Keys mounted at `/run/secrets/` +3. **Session Storage**: User-provided keys via the Settings UI + +!!! note + Environment variable keys always take precedence. If an admin configures a key via environment variable, users cannot override it with their own key. + +#### Using BYOK in the Chat Interface + +1. Open the **Settings** modal (gear icon) +2. Expand the **API Keys** section +3. For each provider you want to use: + - Enter your API key in the input field + - Click **Save** to store it in your session +4. Select your preferred **Provider** and **Model** from the dropdowns +5. Start chatting! + +**Status Indicators:** + +| Icon | Meaning | +|------|---------| +| ✓ Env | Key configured via environment variable (cannot be changed) | +| ✓ Session | Key configured via your session | +| ○ | No key configured | + +#### Supported Providers + +| Provider | Environment Variable | API Key Format | +|----------|---------------------|----------------| +| OpenAI | `OPENAI_API_KEY` | `sk-...` | +| Anthropic | `ANTHROPIC_API_KEY` | `sk-ant-...` | +| Google Gemini | `GOOGLE_API_KEY` | `AIza...` | +| OpenRouter | `OPENROUTER_API_KEY` | `sk-or-...` | + +#### Security Considerations + +- **Keys are never logged** - API keys are redacted from all log output +- **Keys are never echoed** - The UI only shows masked placeholders +- **Session-scoped** - Keys are cleared when you log out or your session expires +- **HTTPS recommended** - For production deployments, always use HTTPS to protect keys in transit + +#### API Endpoints + +For programmatic access, the following endpoints are available: + +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/api/providers/keys` | GET | Get status of all provider keys | +| `/api/providers/keys/set` | POST | Set a session API key (validates before storing) | +| `/api/providers/keys/clear` | POST | Clear a session API key | + 
+--- + +## Vector Store + +The vector store is a database that stores document embeddings, enabling semantic and/or lexical search over your knowledge base. Archi uses PostgreSQL with pgvector as the default vector store backend to index and retrieve relevant documents based on similarity to user queries. + +### Backend Selection + +Archi uses PostgreSQL with the pgvector extension as its vector store backend. This provides production-grade vector similarity search integrated with your existing PostgreSQL database. + +Configure vector store settings in your configuration file: + +```yaml +services: + vectorstore: + backend: postgres # PostgreSQL with pgvector (only supported backend) +``` + +### Configuration + +Vector store settings are configured under the `data_manager` section: + +```yaml +data_manager: + collection_name: default_collection + embedding_name: OpenAIEmbeddings + chunk_size: 1000 + chunk_overlap: 0 + reset_collection: true + num_documents_to_retrieve: 5 + distance_metric: cosine +``` + +#### Core Settings + +- **`collection_name`**: Name of the vector store collection. Default: `default_collection` +- **`chunk_size`**: Maximum size of text chunks (in characters) when splitting documents. Default: `1000` +- **`chunk_overlap`**: Number of overlapping characters between consecutive chunks. Default: `0` +- **`reset_collection`**: If `true`, deletes and recreates the collection on startup. Default: `true` +- **`num_documents_to_retrieve`**: Number of relevant document chunks to retrieve for each query. 
Default: `5` + +#### Distance Metrics + +The `distance_metric` determines how similarity is calculated between embeddings: + +- **`cosine`**: Cosine similarity (default) - measures the angle between vectors +- **`l2`**: Euclidean distance - measures straight-line distance +- **`ip`**: Inner product - measures dot product similarity + +```yaml +data_manager: + distance_metric: cosine # Options: cosine, l2, ip +``` + +### Embedding Models + +Embeddings convert text into numerical vectors. Archi supports multiple embedding providers: + +#### OpenAI Embeddings + +```yaml +data_manager: + embedding_name: OpenAIEmbeddings + embedding_class_map: + OpenAIEmbeddings: + class: OpenAIEmbeddings + kwargs: + model: text-embedding-3-small + similarity_score_reference: 10 +``` + +#### HuggingFace Embeddings + +```yaml +data_manager: + embedding_name: HuggingFaceEmbeddings + embedding_class_map: + HuggingFaceEmbeddings: + class: HuggingFaceEmbeddings + kwargs: + model_name: sentence-transformers/all-MiniLM-L6-v2 + model_kwargs: + device: cpu + encode_kwargs: + normalize_embeddings: true + similarity_score_reference: 10 + query_embedding_instructions: null +``` + +### Supported Document Formats + +The vector store can process the following file types: + +- **Text files**: `.txt`, `.C` +- **Markdown**: `.md` +- **Python**: `.py` +- **HTML**: `.html` +- **PDF**: `.pdf` + +Documents are automatically loaded with the appropriate parser based on file extension. + +### Document Synchronization + +Archi automatically synchronizes your data directory with the vector store: + +1. **Adding documents**: New files in the data directory are automatically chunked, embedded, and added to the collection +2. **Removing documents**: Files deleted from the data directory are removed from the collection +3. 
**Source tracking**: Each ingested artifact is recorded in the Postgres catalog (`resources` table) with its resource hash and relative file path + +### Hybrid Search + +Combine semantic search with keyword-based BM25 search for improved retrieval: + +```yaml +data_manager: + use_hybrid_search: true + bm25_weight: 0.6 + semantic_weight: 0.4 +``` + +- **`use_hybrid_search`**: Enable hybrid search combining BM25 and semantic similarity. Default: `true` +- **`bm25_weight`**: Weight for BM25 keyword scores (base config default: `0.6`). +- **`semantic_weight`**: Weight for semantic similarity scores (base config default: `0.4`). +- **BM25 tuning**: Parameters like `k1` and `b` are set when the PostgreSQL BM25 index is created and are no longer configurable via this file. + +### Stemming + +By specifying the stemming option within your configuration, stemming functionality for the documents in Archi will be enabled. By doing so, documents inserted into the retrieval pipeline, as well as the query that is matched with them, will be stemmed and simplified for faster and more accurate lookup. + +```yaml +data_manager: + stemming: + enabled: true +``` + +When enabled, both documents and queries are processed using the Porter Stemmer algorithm to reduce words to their root forms (e.g., "running" → "run"), improving matching accuracy. + +### PostgreSQL Backend (Default) + +Archi uses PostgreSQL with pgvector for vector storage by default. The PostgreSQL service is automatically started when you deploy with the chatbot service. 
+ +```yaml +services: + postgres: + host: postgres + port: 5432 + database: archi + vectorstore: + backend: postgres +``` + +Required secrets for PostgreSQL: +```bash +PG_PASSWORD=your_secure_password +``` +>>>>>>> 9b860a3f (docs: add vllm provider documentation) --- diff --git a/docs/docs/vllm.md b/docs/docs/vllm.md new file mode 100644 index 000000000..99ad5fe3e --- /dev/null +++ b/docs/docs/vllm.md @@ -0,0 +1,320 @@ +# vLLM Provider + +Run open-weight models on your own GPUs using [vLLM](https://docs.vllm.ai/) as an inference backend. Archi deploys vLLM as a **sidecar container** alongside the chatbot — no external server management required. + +## Why vLLM? + +| | vLLM | Ollama | API providers | +|---|---|---|---| +| **Throughput** | High (PagedAttention, continuous batching) | Moderate | N/A (cloud) | +| **Multi-GPU** | Tensor-parallel across GPUs | Single GPU | N/A | +| **Tool calling** | Supported (with parser flag) | Model-dependent | Supported | +| **Cost** | Hardware only | Hardware only | Per-token | +| **Privacy** | Data stays on-premises | Data stays on-premises | Data leaves your network | + +vLLM is the best fit when you need high-throughput local inference, multi-GPU support, or full data privacy with tool-calling capabilities. + +## Prerequisites + +- NVIDIA GPUs with sufficient VRAM for your chosen model +- NVIDIA drivers and the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) installed +- Container runtime configured for GPU access (see [Advanced Setup](advanced_setup_deploy.md#running-llms-locally-on-your-gpus)) + +## Quick Start + +### 1. 
Configure your deployment + +In your config YAML, reference models with the `vllm/` provider prefix: + +```yaml +archi: + pipeline_map: + CMSCompOpsAgent: + models: + required: + agent_model: vllm/Qwen/Qwen3-8B + +services: + vllm: + model: Qwen/Qwen3-8B # HuggingFace model ID + tool_parser: hermes # tool-call parser (optional) +``` + +> **Model naming**: vLLM uses HuggingFace model IDs (e.g. `Qwen/Qwen3-8B`), not Ollama-style names (e.g. `Qwen/Qwen3:8B`). Make sure the model ID matches what is available on HuggingFace Hub. + +### 2. Deploy + +```bash +archi create -n my-deployment \ + -c config.yaml \ + -e .env \ + --services chatbot,vllm-server \ + --gpu-ids all +``` + +The CLI will: + +1. Add the `vllm-server` sidecar to Docker Compose +2. Wire `VLLM_BASE_URL` into the chatbot container +3. Set the chatbot to wait for vLLM's health check before starting + +### 3. Verify + +Once the deployment is up, check the vLLM server: + +```bash +curl http://localhost:8000/v1/models +``` + +You should see your model listed in the response. + +## Architecture + +``` +┌─────────────────────────────────────────────────┐ +│ Docker Compose stack │ +│ │ +│ ┌──────────┐ ┌────────────┐ ┌──────────┐ │ +│ │ chatbot │───>│ vllm-server│ │ postgres │ │ +│ │ (Flask) │ │ (sidecar) │ │ │ │ +│ └──────────┘ └────────────┘ └──────────┘ │ +│ :7861 :8000 :5432 │ +│ GPU access │ +└─────────────────────────────────────────────────┘ +``` + +The vLLM server runs as a **sidecar** — a companion container in the same Compose stack. It: + +- Exposes an OpenAI-compatible `/v1` API on port 8000 +- Receives requests from the chatbot over the Docker network +- Loads the model onto GPU at startup and serves it continuously +- Reports health via `/v1/models` (chatbot waits for this before starting) + +The chatbot talks to vLLM using the same `ChatOpenAI` LangChain class it would use for the OpenAI API. From the pipeline's perspective, vLLM looks identical to a remote OpenAI endpoint. 
+ +## Configuration Reference + +### Config YAML + +#### Model references + +Anywhere a model is referenced in `pipeline_map`, use the `vllm/` prefix: + +```yaml +archi: + pipeline_map: + CMSCompOpsAgent: + models: + required: + agent_model: vllm/Qwen/Qwen3-8B +``` + +The part after `vllm/` must match the HuggingFace model ID that vLLM is serving. + +#### vLLM provider settings + +The vLLM provider is configured under `services.chat_app.providers.vllm` in your config YAML. At minimum you need `enabled` and `default_model`: + +```yaml +services: + chat_app: + providers: + vllm: + enabled: true + base_url: http://localhost:8000/v1 + default_model: "Qwen/Qwen3-8B" + tool_call_parser: hermes # optional, default: hermes + models: + - "Qwen/Qwen3-8B" +``` + +| Setting | Type | Default | Description | +|---|---|---|---| +| `enabled` | bool | `false` | Enable the vLLM provider | +| `base_url` | string | `http://localhost:8000/v1` | vLLM server OpenAI-compatible endpoint | +| `default_model` | string | `Qwen/Qwen2.5-7B-Instruct-1M` | HuggingFace model ID to serve | +| `tool_call_parser` | string | `hermes` | Parser for structured tool calls (`hermes`, `mistral`, `llama3_json`) | +| `models` | list | — | Available model IDs for the UI model selector | + +#### vLLM Server Tuning + +When archi manages the vLLM sidecar container (deployed via `--services vllm-server`), you can configure server launch arguments alongside the provider settings above. Each key is translated to a vLLM CLI flag at container startup. All keys are optional — when omitted, vLLM's own defaults apply. + +> **Note**: These keys only affect the managed vLLM sidecar container. If you are pointing `base_url` at an external vLLM server, configure that server directly instead. 
+ +| Key | Type | Default | When to change | +|---|---|---|---| +| `gpu_memory_utilization` | float | `0.9` | Model barely fits in VRAM, or you want to reserve GPU memory for other processes | +| `max_model_len` | int | model default | Reduce context window to lower memory usage, or increase it for long-document workloads | +| `tensor_parallel_size` | int | `1` | Shard a large model across multiple GPUs | +| `dtype` | string | `auto` | Force a specific weight precision (`float16`, `bfloat16`) instead of auto-detection | +| `quantization` | string | none | Run quantized model weights (`awq`, `gptq`, `fp8`) to reduce memory | +| `enforce_eager` | bool | `false` | Disable CUDA graph compilation to save memory at the cost of throughput | +| `max_num_seqs` | int | `256` | Limit concurrent sequences to reduce memory pressure under high load | +| `enable_prefix_caching` | bool | `true` | Disable KV cache prefix sharing if it causes issues with your model | + +##### Complete config example + +A single-GPU deployment with memory tuning: + +```yaml +services: + chat_app: + providers: + vllm: + enabled: true + base_url: http://localhost:8000/v1 + default_model: "Qwen/Qwen3-8B" + tool_call_parser: hermes + models: + - "Qwen/Qwen3-8B" + gpu_memory_utilization: 0.85 + max_model_len: 8192 +``` + +##### `engine_args` passthrough + +For any vLLM flag not covered by a named key above, use the `engine_args` map. Each entry is passed as `--<key> <value>` to the vLLM server. Keys use kebab-case matching vLLM's CLI flags. For boolean flags that take no argument (e.g. `--trust-remote-code`), use an empty string as the value. Do not duplicate flags that already have a named key above. 
+ +```yaml +services: + chat_app: + providers: + vllm: + engine_args: + swap-space: 8 # CPU swap space per GPU in GiB (default: 4) + seed: 42 + trust-remote-code: "" # bare flag (no value) — use "" for boolean flags +``` + +##### Multi-GPU example + +Sharding a 30B model across 4 GPUs: + +```yaml +services: + chat_app: + providers: + vllm: + enabled: true + base_url: http://localhost:8000/v1 + default_model: "Qwen/Qwen3-30B-A3B-Instruct" + tool_call_parser: hermes + models: + - "Qwen/Qwen3-30B-A3B-Instruct" + gpu_memory_utilization: 0.92 + tensor_parallel_size: 4 + max_model_len: 16384 + dtype: bfloat16 + engine_args: + swap-space: 8 +``` + +Deploy with all GPUs: + +```bash +archi create -n my-deployment \ + -c config.yaml \ + --services chatbot,vllm-server \ + --gpu-ids 0,1,2,3 +``` + +### Environment Variables + +| Variable | Default | Description | +|---|---|---| +| `VLLM_BASE_URL` | `http://vllm-server:8000/v1` | Override the vLLM server URL (auto-set by the CLI) | + +You generally don't need to set `VLLM_BASE_URL` manually — the CLI injects it into the chatbot container. It is useful if you are running vLLM on a separate host. + +### Host Networking + +When deploying with `--hostmode`, the vLLM server uses `network_mode: host` and all services communicate via `localhost`. Without host mode, services communicate via Docker DNS (`vllm-server:8000`). + +## Tool Calling + +vLLM supports function/tool calling for ReAct agents, but requires explicit server flags. Archi configures these automatically: + +- `--enable-auto-tool-choice` — enables the tool calling pathway +- `--tool-call-parser ` — selects the parser for the model family + +The `tool_parser` setting should match your model's chat template: + +| Model family | Parser | +|---|---| +| Qwen (Qwen2.5, Qwen3) | `hermes` | +| Mistral / Mixtral | `mistral` | +| Llama 3 | `llama3_json` | + +If tool calling is not needed for your use case, these flags are harmless and can be left at defaults. 
+ +## Smoke Testing + +To run smoke tests against a vLLM deployment: + +```bash +export SMOKE_PROVIDER=vllm +export SMOKE_VLLM_BASE_URL=http://localhost:8000/v1 +export SMOKE_VLLM_MODEL=Qwen/Qwen3-8B +scripts/dev/run_smoke_preview.sh my-deployment +``` + +This runs preflight checks (verifies vLLM is reachable) followed by a basic chat completion test through the chatbot endpoint. + +## Troubleshooting + +### vLLM server not starting + +**Symptom**: Container exits immediately or stays in a restart loop. + +**Check logs**: +```bash +docker logs vllm-server- +``` + +Common causes: + +- **Insufficient VRAM**: The model doesn't fit in GPU memory. Options: + - Lower `gpu_memory_utilization` (e.g. `0.7`) to leave headroom for other processes + - Set `max_model_len` to a smaller value (e.g. `4096`) to reduce KV cache memory + - Add `quantization: awq` or `quantization: gptq` if the model has quantized weights available + - Set `enforce_eager: true` to disable CUDA graphs (saves memory, reduces throughput) + - Increase `tensor_parallel_size` and add more GPUs via `--gpu-ids` + - Try a smaller model +- **Missing NVIDIA runtime**: Ensure the NVIDIA Container Toolkit is installed and configured. +- **/dev/shm too small**: vLLM warns at startup if shared memory is below 1 GB. The container uses `ipc: host` by default, but if that is restricted, increase `shm_size`. +- **Invalid engine argument**: If the vLLM log shows `unrecognized arguments`, check for typos in `engine_args` keys (must be kebab-case, e.g. `swap-space` not `swap_space`) or boolean flags that need an empty-string value (`""`). + +### Chatbot can't reach vLLM + +**Symptom**: `ConnectionError: Name or service not known` or `Connection refused`. + +- Verify both containers are on the same Docker network (default when not using `--hostmode`). 
+
+- Check that `VLLM_BASE_URL` in the chatbot container resolves correctly:
+  ```bash
+  docker exec <chatbot-container> curl http://vllm-server:8000/v1/models
+  ```
+- If using `--hostmode`, ensure `VLLM_BASE_URL` uses `localhost` instead of `vllm-server`.
+
+### Model not found (404)
+
+**Symptom**: `Error: model 'Qwen/Qwen3:8B' does not exist`.
+
+vLLM uses HuggingFace model IDs, not Ollama-style names. Check:
+
+- Config uses dashes, not colons: `vllm/Qwen/Qwen3-8B` (not `Qwen/Qwen3:8B`)
+- The model ID matches exactly what vLLM is serving (`curl localhost:8000/v1/models`)
+
+### Tool calling returns 400
+
+**Symptom**: `400 Bad Request: "auto" tool choice requires --enable-auto-tool-choice`.
+
+This means the vLLM server wasn't started with tool calling flags. If you are deploying through the CLI, this is handled automatically. If running vLLM manually, add:
+
+```bash
+--enable-auto-tool-choice --tool-call-parser hermes
+```
+
+### Slow first response
+
+The first request after startup may be slow (30-60s) while vLLM compiles CUDA kernels and warms up. Subsequent requests will be significantly faster. The chatbot's `depends_on` health check ensures it doesn't send requests before vLLM is ready, but the health check only confirms the server is listening — not that the first compilation is complete. If startup compilation time is a problem, set `enforce_eager: true` to skip CUDA graph compilation (at the cost of lower throughput).
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index feeeee508..4ae6bc6a4 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -11,6 +11,8 @@ nav: - Agents & Tools: agents_tools.md - Configuration: configuration.md - CLI Reference: cli_reference.md + - Advanced Setup and Deployment: advanced_setup_deploy.md + - vLLM Provider: vllm.md - API Reference: api_reference.md - Benchmarking: benchmarking.md - Advanced Setup: advanced_setup_deploy.md diff --git a/examples/deployments/basic-gpu/config.yaml b/examples/deployments/basic-gpu/config.yaml new file mode 100644 index 000000000..e76cf572c --- /dev/null +++ b/examples/deployments/basic-gpu/config.yaml @@ -0,0 +1,57 @@ +# Basic configuration file for an Archi deployment +# with a chat app interface, CMSCompOpsAgent pipeline, and +# PostgreSQL with pgvector for document storage. +# The LLM is a locally hosted VLLM instance on GPUs. +# +# This example demonstrates using local GPU resources with VLLM +# in OpenAI-compatible mode with the CMSCompOpsAgent pipeline. 
+#
+# Prerequisites:
+#   - VLLM server running on localhost:8000 with a model loaded
+#   - GPU(s) available on the host machine
+#
+# Run with:
+#   archi create --name my-archi-gpu-agent --config examples/deployments/basic-gpu/config.yaml --services chatbot --gpu-ids=all
+
+name: main-gpu-agent
+
+services:
+  chat_app:
+    pipeline: CMSCompOpsAgent
+    trained_on: "Documentation and guides"
+    port: 7861
+    external_port: 7861
+    providers:
+      vllm:
+        enabled: true
+        base_url: http://localhost:8000/v1
+        default_model: "Qwen/Qwen3-8B"
+  vectorstore:
+    backend: postgres  # PostgreSQL with pgvector (only supported backend)
+  data_manager:
+    port: 7872
+    external_port: 7872
+  postgres:
+    port: 5432
+    user: archi
+    database: archi-db
+    host: postgres
+
+data_manager:
+  sources:
+    links:
+      input_lists:
+        - config/source.list
+  embedding_name: HuggingFaceEmbeddings
+
+archi:
+  pipelines:
+    - CMSCompOpsAgent
+  pipeline_map:
+    CMSCompOpsAgent:
+      prompts:
+        required:
+          agent_prompt: config/main.prompt
+      models:
+        required:
+          agent_model: vllm/Qwen/Qwen3-8B
diff --git a/examples/deployments/basic-vllm/condense.prompt b/examples/deployments/basic-vllm/condense.prompt
new file mode 100644
index 000000000..cca6c4581
--- /dev/null
+++ b/examples/deployments/basic-vllm/condense.prompt
@@ -0,0 +1,12 @@
+# Prompt used to condense a chat history and a follow up question into a stand alone question.
+# This is a very general prompt for condensing histories, so for base installs it will not need to be modified
+#
+# All condensing prompts must have the following tags in them, which will be filled with the appropriate information:
+# {history}
+# {question}
+#
+Given the following conversation between you (the AI named archi), a human user who needs help, and an expert, and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
+ +Chat History: {history} +Follow Up Input: {question} +Standalone question: \ No newline at end of file diff --git a/examples/deployments/basic-vllm/config.yaml b/examples/deployments/basic-vllm/config.yaml new file mode 100644 index 000000000..0c497bb01 --- /dev/null +++ b/examples/deployments/basic-vllm/config.yaml @@ -0,0 +1,55 @@ +# Basic configuration file for a Archi deployment +# with a chat app interface and PostgreSQL with pgvector for document storage. +# The LLM is used through an existing Ollama server. +# +# run with: +# archi create --name my-archi-vllm --config examples/deployments/basic-vllm/config.yaml --services chatbot --hostmode + +name: my_archi + +services: + chat_app: + agent_class: CMSCompOpsAgent + agents_dir: examples/agents + default_provider: local + default_model: qwen3:32b + providers: + vllm: + enabled: true + base_url: http://localhost:8000/v1 # make sure this matches your vllm server URL! + mode: ollama + default_model: "vllm:qwen3:8b" # make sure this matches a model you have downloaded locally with ollama + models: + - "vllm:Gemma 3:4B" + # - "vllm:Gemma 3:12B" + - "vllm:qwen3:8B-Instruct" + # - "vllm:Qwen3-30B-Instruct" + # --- vLLM server engine args (all optional) --- + # gpu_memory_utilization: 0.9 # fraction of GPU VRAM (0.0-1.0, default: 0.9) + # max_model_len: 8192 # cap context window to reduce memory + # tensor_parallel_size: 2 # shard model across N GPUs + # dtype: auto # weight precision (auto, float16, bfloat16) + # quantization: awq # quantization method (awq, gptq, fp8) + # enforce_eager: false # disable CUDA graphs to save memory + # max_num_seqs: 256 # max concurrent sequences + # enable_prefix_caching: true # KV cache prefix sharing + # engine_args: # passthrough for any other vLLM flag + # swap-space: 4 # CPU swap space per GPU in GiB + # seed: 42 + trained_on: "FASRC DOCS" + port: 7861 + external_port: 7861 + vectorstore: + backend: postgres # PostgreSQL with pgvector (only supported backend) + data_manager: + 
port: 7889 + external_port: 7889 + auth: + enabled: false # set to true and provide DM_API_TOKEN in .env for production + +data_manager: + sources: + links: + input_lists: + - config/sources.list + embedding_name: HuggingFaceEmbeddings diff --git a/examples/deployments/basic-vllm/miscellanea.list b/examples/deployments/basic-vllm/miscellanea.list new file mode 100644 index 000000000..7e973aba6 --- /dev/null +++ b/examples/deployments/basic-vllm/miscellanea.list @@ -0,0 +1,39 @@ +# PPC +https://ppc.mit.edu/blog/2016/05/08/hello-world/ +https://ppc.mit.edu/ +https://ppc.mit.edu/news/ +https://ppc.mit.edu/publications/ +https://ppc.mit.edu/blog/2025/02/08/detailed-schedule-for-the-european-strategy/ +https://ppc.mit.edu/blog/2025/02/14/first-cms-week-in-2025/ +https://ppc.mit.edu/blog/2025/02/18/exploring-the-higgs-boson-in-our-latest-result/ +https://ppc.mit.edu/blog/2025/02/04/news-from-the-chamonix-meeting/ +https://ppc.mit.edu/blog/2025/02/11/cms-data-archival-at-mit/ +https://ppc.mit.edu/blog/2025/03/28/cern-gets-support-from-canada/ +https://ppc.mit.edu/blog/2025/04/08/breakthrough-prize-in-physics-2025/ +https://ppc.mit.edu/blog/2025/04/04/the-fcc-at-cern-a-feasibly-circular-collider/ +https://ppc.mit.edu/blog/2025/04/08/cleo-reached-magic-issue-number-5000/ +https://ppc.mit.edu/blog/2025/04/14/maximizing-cms-competitive-advantage/ +https://ppc.mit.edu/blog/2025/04/25/sueps-at-aps-march-april-meeting/ +https://ppc.mit.edu/blog/2025/04/18/round-three/ +https://ppc.mit.edu/blog/2025/04/14/first-beams-with-a-splash-in-2025/ +https://ppc.mit.edu/blog/2025/05/27/fcc-weak-in-vienna-building-our-future/ +https://ppc.mit.edu/blog/2025/06/04/new-paper-on-arxiv-submit-a-physics-analysis-facility-at-mit/ +https://ppc.mit.edu/blog/2025/06/16/summer-cms-week-2025/ +https://ppc.mit.edu/blog/2025/05/05/cms-records-first-2025-high-energy-collisions/ +https://ppc.mit.edu/blog/2025/06/17/long-term-vision-for-particle-physics-from-the-national-academies/ 
+https://ppc.mit.edu/blog/2025/06/20/conclusion-of-junes-cern-council-session-has-major-consequences-for-cms/ +https://ppc.mit.edu/blog/2025/06/20/highest-pileup-recorded-at-cms-last-night/ +https://ppc.mit.edu/blog/2025/06/25/selfie-station-at-wilson-hall/ +https://ppc.mit.edu/mariarosaria-dalfonso/ +https://ppc.mit.edu/kenneth-long-2/ +https://ppc.mit.edu/blog/2025/06/27/open-symposium-on-the-european-strategy-for-particle-physics/ +https://ppc.mit.edu/blog/2025/07/03/bridging-physics-and-computing-throughput-computing-2025/ +https://ppc.mit.edu/pietro-lugato-2/ +https://ppc.mit.edu/luca-lavezzo/ +https://ppc.mit.edu/zhangqier-wang-2/ +https://ppc.mit.edu/blog/2025/07/14/welcome-our-first-ever-in-house-masters-student/ +# A2 +https://ppc.mit.edu/a2/ +# Personnel +https://people.csail.mit.edu/kraska +https://physics.mit.edu/faculty/christoph-paus diff --git a/examples/deployments/basic-vllm/qa.prompt b/examples/deployments/basic-vllm/qa.prompt new file mode 100644 index 000000000..8ed5c6420 --- /dev/null +++ b/examples/deployments/basic-vllm/qa.prompt @@ -0,0 +1,18 @@ +# Prompt used to query LLM with appropriate context and question. +# This prompt is specific to subMIT and likely will not perform well for other applications, where it is recommended to write your own prompt and change it in the config +# +# All final prompts must have the following tags in them, which will be filled with the appropriate information: +# Question: {question} +# Context: {retriever_output} +# +You are a conversational chatbot named archi who helps people navigate a computing cluster named SubMIT. +You will be provided context in the form of relevant documents, such as previous communication between sys admins and Guides, a summary of the problem that the user is trying to solve and the important elements of the conversation, and the most recent chat history between you and the user to help you answer their questions. 
+Using your Linux and computing knowledge, answer the question at the end. +Unless otherwise indicated, assume the users are not well versed in computing. +Please do not assume that SubMIT machines have anything installed on top of native Linux except if the context mentions it. +If you don't know, say "I don't know", if you need to ask a follow up question, please do. + +Context: {retriever_output} +Question: {question} +Chat History: {history} +Helpful Answer: diff --git a/openspec/changes/add-vllm-provider/design.md b/openspec/changes/add-vllm-provider/design.md new file mode 100644 index 000000000..d4bdf79e6 --- /dev/null +++ b/openspec/changes/add-vllm-provider/design.md @@ -0,0 +1,30 @@ +# Design: vLLM Provider Integration + +## Architecture +- **Client:** `chatbot-main-gpu-agent` container. +- **Server:** `vllm-server` container (host-based or separate container). +- **Protocol:** HTTP/REST (OpenAI Schema). + +## Technical Decisions + +### Decision: Inherit from OpenAIProvider +Since vLLM is OpenAI-compatible, the `VLLMProvider` should inherit from Archi's `OpenAIProvider` (or `BaseProvider`) to reuse JSON mapping logic, but override the endpoint resolution to handle the Docker internal network. + +### Decision: V100 Stability +Inject `NCCL_P2P_DISABLE=1` into the provider's connection logic if not already handled by the environment to ensure stable communication with older NVLink/PCIe topologies on V100s. + +### Decision: Critical Docker Performance Tuning +To achieve bare-metal parity when running vLLM inside a container, the following runtime configurations MUST be enforced. These prevent the common "Docker Tax" on LLM inference. + +#### 1. Shared Memory Access (`--ipc=host`) +vLLM utilizes NCCL for multi-GPU communication and PagedAttention for memory management. +- **Requirement:** Containers must be started with `--ipc=host`. +- **Reason:** Docker’s default 64MB shm-size causes immediate crashes during Tensor Parallelism initialization. 
Using the host's IPC namespace provides the necessary memory bandwidth for inter-GPU coordination. + +#### 2. Network Latency Optimization (`--network=host`) +- **Requirement:** Use `--network=host` for the `vllm-server` container where feasible. +- **Reason:** Bypasses the Docker bridge (docker0) and user-land proxy (docker-proxy), reducing request/response overhead by 0.5–2ms per call—critical for high-concurrency streaming applications. + +#### 3. GPU Passthrough and Memory Locking +- **Requirement:** Ensure `--gpus all` and `--ulimit memlock=-1 --ulimit stack=67108864` are set. +- **Reason:** vLLM pre-allocates up to 90% of VRAM (default). Memlocking prevents the OS from swapping out these critical buffers, ensuring consistent P99 latencies. diff --git a/openspec/changes/add-vllm-provider/proposal.md b/openspec/changes/add-vllm-provider/proposal.md new file mode 100644 index 000000000..f2f2d9c74 --- /dev/null +++ b/openspec/changes/add-vllm-provider/proposal.md @@ -0,0 +1,14 @@ +# Proposal: Add vLLM Provider +## Intent +Enable high-throughput local inference on NVIDIA V100 GPUs using the vLLM engine. This provides an OpenAI-compatible alternative to the current Ollama and external API providers. + +## Scope +- New provider class `VLLMProvider` in `src/archi/providers/`. +- Integration into the provider factory. +- Support for streaming and non-streaming chat completions. +- V100-specific configuration (NCCL flags). + +## Constraints +- MUST use OpenAI-compatible API format. +- MUST support the `base_url` parameter for remote container access. +- MUST handle V100-specific environment variables for stability. 
\ No newline at end of file
diff --git a/openspec/changes/add-vllm-provider/specs/vllm-provider/spec.md b/openspec/changes/add-vllm-provider/specs/vllm-provider/spec.md
new file mode 100644
index 000000000..854ac5aee
--- /dev/null
+++ b/openspec/changes/add-vllm-provider/specs/vllm-provider/spec.md
@@ -0,0 +1,56 @@
+## ADDED Requirements
+
+### Requirement: VLLMProvider registered as a first-class provider type
+The system SHALL register a `VLLM` provider type in the `ProviderType` enum and provider registry, distinct from the `LOCAL` provider. Pipeline configs SHALL reference vLLM models as `vllm/<model_id>`.
+
+#### Scenario: Pipeline resolves vLLM model reference
+- **WHEN** a pipeline config specifies `vllm/Qwen/Qwen2.5-7B-Instruct-1M` as a model
+- **THEN** `BasePipeline._parse_provider_model()` splits it into provider `"vllm"` and model `"Qwen/Qwen2.5-7B-Instruct-1M"`, and `get_model()` returns a `ChatOpenAI` instance from `VLLMProvider`
+
+#### Scenario: Provider name alias resolves to VLLM type
+- **WHEN** `get_provider_by_name("vllm")` is called
+- **THEN** it SHALL return a `VLLMProvider` instance (not `LocalProvider`)
+
+### Requirement: VLLMProvider returns ChatOpenAI with correct defaults
+The `VLLMProvider.get_chat_model()` SHALL return a `ChatOpenAI` instance configured with `base_url` defaulting to `http://localhost:8000/v1` and `api_key` defaulting to `"not-needed"`.
+ +#### Scenario: Default base URL used when none configured +- **WHEN** `VLLMProvider` is instantiated with no `base_url` in config +- **THEN** `get_chat_model("my-model")` returns a `ChatOpenAI` with `base_url="http://localhost:8000/v1"` + +#### Scenario: Custom base URL from config +- **WHEN** `VLLMProvider` is instantiated with `base_url="http://vllm-host:9000/v1"` in config +- **THEN** `get_chat_model("my-model")` returns a `ChatOpenAI` with that base URL + +#### Scenario: Environment variable overrides config +- **WHEN** `VLLM_BASE_URL` environment variable is set +- **THEN** `VLLMProvider` SHALL use that value as base URL, overriding the config default + +### Requirement: VLLMProvider discovers models dynamically +The `VLLMProvider.list_models()` SHALL query the vLLM server's `/v1/models` endpoint and return discovered models as `ModelInfo` objects. + +#### Scenario: Server is reachable with loaded models +- **WHEN** `list_models()` is called and the vLLM server responds with a model list +- **THEN** each model is returned as a `ModelInfo` with `id`, `name`, and `display_name` populated from the response + +#### Scenario: Server is unreachable +- **WHEN** `list_models()` is called and the vLLM server does not respond +- **THEN** it SHALL return the statically configured model list from `ProviderConfig.models`, or an empty list if none configured + +### Requirement: VLLMProvider validates server connection +The `VLLMProvider.validate_connection()` SHALL check the vLLM server's health by hitting the `/v1/models` endpoint. 
+ +#### Scenario: Server is healthy +- **WHEN** `validate_connection()` is called and `/v1/models` returns HTTP 200 +- **THEN** it SHALL return `True` + +#### Scenario: Server is down +- **WHEN** `validate_connection()` is called and the request fails or times out +- **THEN** it SHALL return `False` + +### Requirement: YAML config section for vLLM provider +The system SHALL support an `archi.providers.vllm` section in deployment YAML configs with fields: `enabled`, `base_url`, `default_model`, `models`. + +#### Scenario: Config loaded from YAML +- **WHEN** a deployment config contains `archi.providers.vllm` with `enabled: true` and `base_url: http://gpu-node:8000/v1` +- **THEN** `_build_provider_config_from_payload()` SHALL construct a `ProviderConfig` with `provider_type=ProviderType.VLLM` and the specified fields diff --git a/openspec/changes/add-vllm-provider/specs/vllm-server/spec.md b/openspec/changes/add-vllm-provider/specs/vllm-server/spec.md new file mode 100644 index 000000000..321b96e42 --- /dev/null +++ b/openspec/changes/add-vllm-provider/specs/vllm-server/spec.md @@ -0,0 +1,55 @@ +## ADDED Requirements + +### Requirement: vllm-server registered as a deployable service +The system SHALL register a `vllm-server` service in the `ServiceRegistry` that can be enabled via `archi create --services`. + +#### Scenario: User deploys with vllm-server +- **WHEN** `archi create --name my-bot --services chatbot,vllm-server --gpu-ids all` is run +- **THEN** the deployment directory SHALL contain a docker-compose service block for `vllm-server` with GPU passthrough + +#### Scenario: vllm-server not requested +- **WHEN** `archi create` is run without `vllm-server` in --services +- **THEN** no vllm-server service block SHALL be generated + +### Requirement: vllm-server container runs with required runtime config +The generated docker-compose service for `vllm-server` SHALL include `ipc: host`, `ulimits` (memlock unlimited, stack 67108864), and GPU device reservations. 
+ +#### Scenario: Compose file generated with runtime config +- **WHEN** the deployment includes `vllm-server` +- **THEN** the docker-compose YAML for vllm-server SHALL contain `ipc: host`, `ulimits.memlock.soft: -1`, `ulimits.memlock.hard: -1`, `ulimits.stack: 67108864`, and `deploy.resources.reservations.devices` with GPU capabilities + +### Requirement: V100 stability via NCCL environment variable +The vllm-server container SHALL set `NCCL_P2P_DISABLE=1` in its environment to ensure stability on V100 GPU topologies. + +#### Scenario: NCCL flag present in container environment +- **WHEN** vllm-server is deployed +- **THEN** the container environment SHALL include `NCCL_P2P_DISABLE=1` + +### Requirement: vllm-server supports host networking mode +The vllm-server compose service SHALL use `network_mode: host` by default to minimize inference latency. + +#### Scenario: Host networking enabled +- **WHEN** vllm-server is deployed with default settings +- **THEN** the compose service SHALL include `network_mode: host` + +#### Scenario: Chatbot resolves vllm-server via host +- **WHEN** vllm-server uses host networking and chatbot uses bridge networking +- **THEN** the chatbot container SHALL receive a `VLLM_BASE_URL` environment variable pointing to the host IP and vLLM port + +### Requirement: vllm-server startup health check +The vllm-server compose service SHALL include a healthcheck that verifies the `/v1/models` endpoint is responding before dependent services start. 
+ +#### Scenario: Healthy startup +- **WHEN** vllm-server finishes loading the model and `/v1/models` returns HTTP 200 +- **THEN** the healthcheck SHALL pass and dependent services (chatbot) SHALL start + +#### Scenario: Slow model load +- **WHEN** vllm-server takes longer than the healthcheck interval to load +- **THEN** the healthcheck SHALL retry until the model is loaded or the timeout is reached + +### Requirement: Shared memory size warning +The vllm-server startup SHALL log a warning if `/dev/shm` is smaller than 1GB. + +#### Scenario: Insufficient shared memory +- **WHEN** vllm-server starts and `/dev/shm` is less than 1GB +- **THEN** a warning SHALL be logged indicating that `ipc: host` or a larger `shm_size` is required for stable multi-GPU inference diff --git a/openspec/changes/add-vllm-provider/tasks.md b/openspec/changes/add-vllm-provider/tasks.md new file mode 100644 index 000000000..036de421d --- /dev/null +++ b/openspec/changes/add-vllm-provider/tasks.md @@ -0,0 +1,24 @@ +Phase 1: Provider (thin client layer) +[x] 1. Add ProviderType.VLLM to enum: Add VLLM = "vllm" to ProviderType in src/archi/providers/base.py. + +[x] 2. Create VLLMProvider class: New file src/archi/providers/vllm_provider.py. Inherit from BaseProvider (not OpenAIProvider — avoids coupling to OpenAI's default model list and API key logic). Default base_url http://localhost:8000/v1, api_key defaults to "not-needed". get_chat_model() returns ChatOpenAI with correct base_url. list_models() hits /v1/models for dynamic discovery. validate_connection() health-checks /v1/models. + +[x] 3. Register provider: Update src/archi/providers/__init__.py — add to _ensure_providers_registered(), repoint "vllm" alias from ProviderType.LOCAL to ProviderType.VLLM in name_map. + +[x] 4. Config schema support: Support archi.providers.vllm section in YAML (fields: enabled, base_url, default_model, models). Wire into _build_provider_config_from_payload() in src/interfaces/chat_app/app.py. 
+
+Phase 2: Infrastructure (server-side)
+[x] 5. Register vllm-server in ServiceRegistry: New ServiceDefinition in src/cli/service_registry.py. GPU-dependent, port 8000 default, no volume required (model weights bind-mounted or cached).
+
+[x] 6. Docker Compose template for vllm-server: Base image vllm/vllm-openai or custom from base-pytorch-image. Server command: python -m vllm.entrypoints.openai.api_server --model <model>. Environment: NCCL_P2P_DISABLE=1 (V100 stability). Runtime: ipc: host, ulimits (memlock: -1, stack: 67108864), GPU passthrough via deploy.resources.reservations.devices.
+
+[x] 7. Inter-container networking: If vllm-server uses network_mode: host, chatbot must reach it via host IP not Docker DNS. Expose VLLM_BASE_URL env var to chatbot container. VLLMProvider reads base_url from config or VLLM_BASE_URL env fallback.
+
+[x] 8. CLI integration: Wire vllm-server into archi create --services. Leverage existing --gpu-ids flag for GPU passthrough. Support model name configuration (which model the server loads).
+
+Phase 3: Validation
+[x] 9. Unit tests for VLLMProvider: Mock /v1/models response for list_models(). Verify ChatOpenAI instantiation with correct base_url and api_key. Verify validate_connection() success/failure paths.
+
+[x] 10. Startup health check: Compose healthcheck or entrypoint script for vllm-server. Log warning if /dev/shm < 1GB. Chatbot depends_on vllm-server with condition: service_healthy.
+
+[x] 11. Smoke test: Extend existing smoke test infrastructure. Verify end-to-end: deploy → ingest → query via vLLM provider.
diff --git a/src/archi/pipelines/agents/tools/retriever.py b/src/archi/pipelines/agents/tools/retriever.py index 149aaf902..823f47661 100644 --- a/src/archi/pipelines/agents/tools/retriever.py +++ b/src/archi/pipelines/agents/tools/retriever.py @@ -45,6 +45,7 @@ def _format_documents_for_llm( doc.metadata.get("filename") or "unknown source" ) + url = doc.metadata.get("url") or "" hash = ( doc.metadata.get("resource_hash") or "n/a" @@ -53,6 +54,8 @@ def _format_documents_for_llm( if len(text) > max_chars: text = f"{text[:max_chars].rstrip()}..." header = f"[{idx}] {source} (hash={hash})" + if url: + header += f"\nURL: {url}" footer = f"Score: {score:.4f}" if isinstance(score, (float, int)) else "Score: n/a" snippets.append(f"{header}\n{footer}\n{text}") diff --git a/src/archi/providers/__init__.py b/src/archi/providers/__init__.py index cc968f5b7..e041505cd 100644 --- a/src/archi/providers/__init__.py +++ b/src/archi/providers/__init__.py @@ -76,13 +76,15 @@ def _ensure_providers_registered() -> None: from src.archi.providers.gemini_provider import GeminiProvider from src.archi.providers.openrouter_provider import OpenRouterProvider from src.archi.providers.local_provider import LocalProvider + from src.archi.providers.vllm_provider import VLLMProvider from src.archi.providers.cern_litellm_provider import CERNLiteLLMProvider - + register_provider(ProviderType.OPENAI, OpenAIProvider) register_provider(ProviderType.ANTHROPIC, AnthropicProvider) register_provider(ProviderType.GEMINI, GeminiProvider) register_provider(ProviderType.OPENROUTER, OpenRouterProvider) register_provider(ProviderType.LOCAL, LocalProvider) + register_provider(ProviderType.VLLM, VLLMProvider) register_provider(ProviderType.CERN_LITELLM, CERNLiteLLMProvider) @@ -168,7 +170,7 @@ def get_provider_by_name(name: str, **kwargs) -> BaseProvider: "openrouter": ProviderType.OPENROUTER, "local": ProviderType.LOCAL, "ollama": ProviderType.LOCAL, - "vllm": ProviderType.LOCAL, + "vllm": ProviderType.VLLM, 
"cern_litellm": ProviderType.CERN_LITELLM, } diff --git a/src/archi/providers/base.py b/src/archi/providers/base.py index 8157c70b3..bec087b45 100644 --- a/src/archi/providers/base.py +++ b/src/archi/providers/base.py @@ -25,6 +25,7 @@ class ProviderType(str, Enum): GEMINI = "gemini" OPENROUTER = "openrouter" LOCAL = "local" + VLLM = "vllm" CERN_LITELLM = "cern_litellm" @@ -117,8 +118,8 @@ def set_api_key(self, api_key: str) -> None: @property def is_configured(self) -> bool: """Check if the provider has necessary credentials configured.""" - # Local providers may not need an API key - if self.provider_type == ProviderType.LOCAL: + # Local/vLLM providers may not need an API key + if self.provider_type in (ProviderType.LOCAL, ProviderType.VLLM): return bool(self.config.base_url) return bool(self._api_key) diff --git a/src/archi/providers/vllm_provider.py b/src/archi/providers/vllm_provider.py new file mode 100644 index 000000000..c6e710857 --- /dev/null +++ b/src/archi/providers/vllm_provider.py @@ -0,0 +1,172 @@ +"""vLLM provider -- thin client for OpenAI-compatible vLLM servers. + +Wraps a locally hosted vLLM instance whose ``/v1`` API is wire-compatible +with OpenAI. No real API key is required; the placeholder ``"not-needed"`` +is sent instead. +""" + +import json +import os +import urllib.error +import urllib.request +from typing import Any, Dict, List, Optional + +from langchain_core.language_models.chat_models import BaseChatModel + +from src.archi.providers.base import ( + BaseProvider, + ModelInfo, + ProviderConfig, + ProviderType, +) +from src.utils.logging import get_logger + +logger = get_logger(__name__) + + +DEFAULT_VLLM_BASE_URL = "http://localhost:8000/v1" + + +class VLLMProvider(BaseProvider): + """ + Provider for vLLM inference servers. + + Communicates with a vLLM server via its OpenAI-compatible API. + The base URL can be configured via: + 1. VLLM_BASE_URL environment variable (highest priority) + 2. ProviderConfig.base_url + 3. 
Default: http://localhost:8000/v1 + """ + + provider_type = ProviderType.VLLM + display_name = "vLLM" + + @staticmethod + def _normalize_base_url(url: Optional[str]) -> Optional[str]: + """Ensure the base URL has a scheme so urllib requests succeed.""" + if not url: + return url + if url.startswith(("http://", "https://")): + return url + return f"http://{url}" + + def __init__(self, config: Optional[ProviderConfig] = None): + """Initialize the vLLM provider. + + Resolves the server URL in priority order: ``VLLM_BASE_URL`` env + var > ``config.base_url`` > ``DEFAULT_VLLM_BASE_URL``. Bare + ``host:port`` URLs are normalised with an ``http://`` scheme. + + Args: + config: Optional provider configuration. When *None*, a + default config targeting ``localhost:8000`` is created. + """ + env_base_url = self._normalize_base_url(os.environ.get("VLLM_BASE_URL")) + + if config is None: + config = ProviderConfig( + provider_type=ProviderType.VLLM, + base_url=env_base_url or DEFAULT_VLLM_BASE_URL, + api_key="not-needed", + enabled=True, + ) + else: + if env_base_url: + config.base_url = env_base_url + elif not config.base_url: + config.base_url = DEFAULT_VLLM_BASE_URL + config.base_url = self._normalize_base_url(config.base_url) + + super().__init__(config) + + def get_chat_model(self, model_name: str, **kwargs) -> BaseChatModel: + """Create a ChatOpenAI instance pointed at the vLLM server. + + Args: + model_name: HuggingFace model ID served by vLLM + (e.g. ``"Qwen/Qwen3-8B"``). + **kwargs: Extra arguments forwarded to ChatOpenAI. + + Returns: + A ChatOpenAI instance configured for the vLLM endpoint. + """ + from langchain_openai import ChatOpenAI + + model_kwargs = { + "model": model_name, + "base_url": self.config.base_url, + "api_key": self._api_key or "not-needed", + "streaming": True, + **self.config.extra_kwargs, + **kwargs, + } + + return ChatOpenAI(**model_kwargs) + + def list_models(self) -> List[ModelInfo]: + """Return available models, querying the server first. 
+ + Falls back to statically configured models if the server is + unreachable. + + Returns: + A list of :class:`ModelInfo` discovered from the server or + from config, or an empty list if neither yields results. + """ + fetched = self._fetch_vllm_models() + if fetched: + return fetched + if self.config.models: + return self.config.models + return [] + + def _fetch_vllm_models(self) -> List[ModelInfo]: + """Fetch models from the vLLM ``/v1/models`` endpoint. + + Returns: + A list of :class:`ModelInfo`, or an empty list if the + server is unreachable or returns an unexpected payload. + """ + try: + url = f"{self.config.base_url}/models" + req = urllib.request.Request(url, method="GET") + with urllib.request.urlopen(req, timeout=10) as response: + if response.status == 200: + data = json.loads(response.read().decode()) + models = [] + for model_data in data.get("data", []): + model_id = model_data.get("id", "") + models.append(ModelInfo( + id=model_id, + name=model_id, + display_name=model_id, + supports_tools=True, + supports_streaming=True, + )) + logger.debug( + "[VLLMProvider] Discovered %d models: %s", + len(models), + [m.id for m in models], + ) + return models + except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError, json.JSONDecodeError) as e: + logger.warning("[VLLMProvider] Failed to fetch models from %s: %s", self.config.base_url, e) + + return [] + + def validate_connection(self) -> bool: + """Check whether the vLLM server is reachable. + + Sends a GET to ``/v1/models`` with a short timeout. + + Returns: + True if the server responds with HTTP 200, False otherwise. 
+ """ + try: + url = f"{self.config.base_url}/models" + req = urllib.request.Request(url, method="GET") + with urllib.request.urlopen(req, timeout=5) as response: + return response.status == 200 + except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError) as e: + logger.warning("[VLLMProvider] Connection failed: %s", e) + return False diff --git a/src/cli/managers/templates_manager.py b/src/cli/managers/templates_manager.py index cf40c4293..99f738854 100644 --- a/src/cli/managers/templates_manager.py +++ b/src/cli/managers/templates_manager.py @@ -481,6 +481,32 @@ def _render_compose_file(self, context: TemplateContext) -> None: if context.plan.get_service("grader").enabled: template_vars["rubrics"] = self._get_grader_rubrics(context.config_manager) + # Pass vLLM model name from provider config to compose template + vllm_cfg = context.config_manager.config.get("services", {}).get("chat_app", {}).get("providers", {}).get("vllm", {}) + if vllm_cfg.get("default_model"): + template_vars["vllm_model"] = vllm_cfg["default_model"] + if vllm_cfg.get("tool_call_parser"): + template_vars["vllm_tool_parser"] = vllm_cfg["tool_call_parser"] + + # Pass vLLM server configuration keys to compose template + if vllm_cfg.get("gpu_memory_utilization"): + template_vars["vllm_gpu_memory_utilization"] = vllm_cfg["gpu_memory_utilization"] + if vllm_cfg.get("max_model_len"): + template_vars["vllm_max_model_len"] = vllm_cfg["max_model_len"] + if vllm_cfg.get("tensor_parallel_size"): + template_vars["vllm_tensor_parallel_size"] = vllm_cfg["tensor_parallel_size"] + if vllm_cfg.get("dtype"): + template_vars["vllm_dtype"] = vllm_cfg["dtype"] + if vllm_cfg.get("quantization"): + template_vars["vllm_quantization"] = vllm_cfg["quantization"] + if "enforce_eager" in vllm_cfg: + template_vars["vllm_enforce_eager"] = vllm_cfg["enforce_eager"] + if vllm_cfg.get("max_num_seqs"): + template_vars["vllm_max_num_seqs"] = vllm_cfg["max_num_seqs"] + if "enable_prefix_caching" in vllm_cfg: + 
template_vars["vllm_enable_prefix_caching"] = vllm_cfg["enable_prefix_caching"] + template_vars["vllm_engine_args"] = vllm_cfg.get("engine_args", {}) + compose_template = self.env.get_template(BASE_COMPOSE_TEMPLATE) compose_rendered = compose_template.render(**template_vars) diff --git a/src/cli/service_registry.py b/src/cli/service_registry.py index 2f2e4718a..61554cd14 100644 --- a/src/cli/service_registry.py +++ b/src/cli/service_registry.py @@ -151,6 +151,17 @@ def _register_default_services(self): 'services.redmine_mailbox.project'] )) + # Compute services + self.register(ServiceDefinition( + name='vllm-server', + description='vLLM inference server for local GPU-accelerated LLM serving', + category='compute', + requires_image=False, + requires_volume=False, + default_host_port=8000, + default_container_port=8000, + )) + self.register(ServiceDefinition( name='benchmarking', depends_on=['postgres'], diff --git a/src/cli/templates/base-compose.yaml b/src/cli/templates/base-compose.yaml index 218ff7f86..21cf01fce 100644 --- a/src/cli/templates/base-compose.yaml +++ b/src/cli/templates/base-compose.yaml @@ -118,12 +118,18 @@ services: args: APP_VERSION: {{ app_version }} container_name: {{ chatbot_container_name }} - {% if postgres_enabled -%} + {% if postgres_enabled or vllm_server_enabled -%} depends_on: + {% if postgres_enabled -%} postgres: condition: service_healthy config-seed: condition: service_completed_successfully + {% endif -%} + {% if vllm_server_enabled -%} + vllm-server: + condition: service_healthy + {% endif %} {% endif -%} environment: PGHOST: {{ 'localhost' if host_mode else 'postgres' }} @@ -134,6 +140,9 @@ services: VERBOSITY: {{ verbosity }} # Allow overriding Ollama host via env so containers can reach host daemon OLLAMA_HOST: ${OLLAMA_HOST:-} + {% if vllm_server_enabled -%} + VLLM_BASE_URL: ${VLLM_BASE_URL:-http://{{ 'localhost' if host_mode else 'vllm-server' }}:8000/v1} + {% endif %} {% for secret in required_secrets | default([]) -%} {{ 
secret.upper() }}_FILE: /run/secrets/{{ secret.lower() }} {% endfor %} @@ -555,6 +564,81 @@ services: {%- endif %} {%- endif %} + {% if vllm_server_enabled -%} + vllm-server: + image: vllm/vllm-openai:latest + container_name: vllm-server-{{ name }} + {% if not host_mode -%} + ports: + - "8000:8000" + {% endif -%} + environment: + NCCL_P2P_DISABLE: "1" + NVIDIA_VISIBLE_DEVICES: all + NVIDIA_DRIVER_CAPABILITIES: compute,utility,graphics + entrypoint: ["/bin/sh", "-c"] + command: + - | + SHM_SIZE=$$(df /dev/shm | awk 'NR==2 {print $$2}') + if [ "$$SHM_SIZE" -lt 1048576 ] 2>/dev/null; then + echo "WARNING: /dev/shm is $$(( $$SHM_SIZE / 1024 ))MB — less than 1GB. Use ipc: host or increase shm_size for stable multi-GPU inference." + fi + exec python3 -m vllm.entrypoints.openai.api_server \ + --model "{{ vllm_model | default('Qwen/Qwen2.5-7B-Instruct-1M') }}" \ + --enable-auto-tool-choice \ + --tool-call-parser "{{ vllm_tool_parser | default('hermes') }}" \ + {% if vllm_gpu_memory_utilization is defined %}--gpu-memory-utilization {{ vllm_gpu_memory_utilization }} {% endif %}\ + {% if vllm_max_model_len is defined %}--max-model-len {{ vllm_max_model_len }} {% endif %}\ + {% if vllm_tensor_parallel_size is defined %}--tensor-parallel-size {{ vllm_tensor_parallel_size }} {% endif %}\ + {% if vllm_dtype is defined %}--dtype {{ vllm_dtype }} {% endif %}\ + {% if vllm_quantization is defined %}--quantization {{ vllm_quantization }} {% endif %}\ + {% if vllm_enforce_eager is defined and vllm_enforce_eager %}--enforce-eager {% endif %}\ + {% if vllm_max_num_seqs is defined %}--max-num-seqs {{ vllm_max_num_seqs }} {% endif %}\ + {% if vllm_enable_prefix_caching is defined and not vllm_enable_prefix_caching %}--no-enable-prefix-caching {% endif %}\ + {% for key, val in vllm_engine_args.items() %}{% if val != '' %}--{{ key }} {{ val }} {% else %}--{{ key }} {% endif %}{% endfor %} + + ipc: host + {% if host_mode -%} + network_mode: host + {% endif -%} + ulimits: + memlock: + soft: 
-1 + hard: -1 + stack: 67108864 + {% if not use_podman -%} + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + {% endif %} + {% if use_podman -%} + security_opt: + - label:disable + devices: + {%- if gpu_ids == "all" %} + - "nvidia.com/gpu=all" + {%- else %} + {%- for gpu_id in gpu_ids %} + - "nvidia.com/gpu={{ gpu_id }}" + {%- endfor %} + {%- endif %} + {%- endif %} + healthcheck: + test: ["CMD-SHELL", "curl -sf http://localhost:8000/v1/models || exit 1"] + interval: 30s + timeout: 10s + retries: 20 + start_period: 120s + logging: + options: + max-size: 10m + restart: always + {%- endif %} + {% if benchmarking_enabled -%} benchmark: image: {{ benchmarking_image }}:{{ benchmarking_tag }} diff --git a/src/cli/utils/service_builder.py b/src/cli/utils/service_builder.py index a6ff1334b..a4f2f64c3 100644 --- a/src/cli/utils/service_builder.py +++ b/src/cli/utils/service_builder.py @@ -71,6 +71,7 @@ def __init__( "mattermost": ServiceState(), "redmine-mailer": ServiceState(), "benchmarking": ServiceState(), + "vllm-server": ServiceState(), } self.use_redmine: bool = False @@ -144,9 +145,11 @@ class ServiceBuilder: def get_available_services() -> Dict[str, str]: available_services = service_registry.get_application_services() integration_services = service_registry.get_integration_services() + compute_services = service_registry.get_services_by_category('compute') return { **{name: svc.description for name, svc in available_services.items()}, **{name: svc.description for name, svc in integration_services.items()}, + **{name: svc.description for name, svc in compute_services.items()}, } @staticmethod diff --git a/src/interfaces/chat_app/app.py b/src/interfaces/chat_app/app.py index 34559b39a..c9680e6d5 100755 --- a/src/interfaces/chat_app/app.py +++ b/src/interfaces/chat_app/app.py @@ -98,7 +98,6 @@ def _build_provider_config_from_payload(config_payload: Dict[str, Any], provider extra = {} if provider_type == 
ProviderType.LOCAL and cfg.get("mode"): extra["local_mode"] = cfg.get("mode") - return ProviderConfig( provider_type=provider_type, enabled=cfg.get("enabled", True), diff --git a/tests/smoke/combined_smoke.sh b/tests/smoke/combined_smoke.sh index aef9cec7c..ebec8f74b 100755 --- a/tests/smoke/combined_smoke.sh +++ b/tests/smoke/combined_smoke.sh @@ -5,6 +5,9 @@ set -euo pipefail # BASE_URL, DM_BASE_URL, OLLAMA_URL, OLLAMA_MODEL, # PGHOST, PGPORT, PGUSER, PGPASSWORD, PGDATABASE, # ARCHI_CONFIG_PATH, ARCHI_CONFIG_NAME, ARCHI_PIPELINE_NAME, USE_PODMAN +# SMOKE_PROVIDER – set to "vllm" to run vLLM smoke checks instead of Ollama +# VLLM_BASE_URL – vLLM API base URL (default: http://localhost:8000/v1) +# VLLM_MODEL – expected model on vLLM server (optional) NAME="${1:-}" if [[ -z "${NAME}" ]]; then @@ -23,9 +26,16 @@ export BASE_URL export DM_BASE_URL export OLLAMA_URL +SMOKE_PROVIDER="${SMOKE_PROVIDER:-ollama}" + info "Running preflight checks..." python3 tests/smoke/preflight.py +if [[ "${SMOKE_PROVIDER,,}" == "vllm" ]]; then + info "Running vLLM smoke checks..." + python3 tests/smoke/vllm_smoke.py +fi + info "Running direct tool probes (chatbot container)..." 
tool="docker" use_podman="${USE_PODMAN:-false}" diff --git a/tests/smoke/preflight.py b/tests/smoke/preflight.py index 7334e26a7..e4a205d8d 100644 --- a/tests/smoke/preflight.py +++ b/tests/smoke/preflight.py @@ -204,19 +204,25 @@ def _check_config_ollama(config_path: str, pipeline_name: str, ollama_model: str def main() -> None: + smoke_provider = os.getenv("SMOKE_PROVIDER", "ollama").lower() + _wait_for_ingestion() _check_postgres() # ChromaDB removed - PostgreSQL with pgvector is the only supported backend _check_data_manager_catalog() - _check_ollama_model() - config_path = os.getenv("ARCHI_CONFIG_PATH") - pipeline_name = os.getenv("ARCHI_PIPELINE_NAME", "CMSCompOpsAgent") - ollama_model = os.getenv("OLLAMA_MODEL", "") - if config_path: - _check_config_ollama(config_path, pipeline_name, ollama_model) + if smoke_provider != "vllm": + _check_ollama_model() + + config_path = os.getenv("ARCHI_CONFIG_PATH") + pipeline_name = os.getenv("ARCHI_PIPELINE_NAME", "CMSCompOpsAgent") + ollama_model = os.getenv("OLLAMA_MODEL", "") + if config_path: + _check_config_ollama(config_path, pipeline_name, ollama_model) + else: + _info("ARCHI_CONFIG_PATH not set; skipping config Ollama validation") else: - _info("ARCHI_CONFIG_PATH not set; skipping config Ollama validation") + _info("Provider is vLLM — skipping Ollama preflight checks") _info("Preflight checks passed") diff --git a/tests/smoke/vllm_smoke.py b/tests/smoke/vllm_smoke.py new file mode 100644 index 000000000..805690dd9 --- /dev/null +++ b/tests/smoke/vllm_smoke.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 +"""vLLM provider smoke checks. + +Validates that a vLLM server is reachable and serving at least one model, +then sends a minimal completion request to verify inference works end-to-end. 
+ +Expected env vars: + VLLM_BASE_URL – vLLM OpenAI-compatible API base (default: http://localhost:8000/v1) + VLLM_MODEL – (optional) specific model id to validate is loaded +""" +import json +import os +import sys +import time +import urllib.error +import urllib.request + + +def _fail(message: str) -> None: + print(f"[vllm-smoke] ERROR: {message}", file=sys.stderr) + sys.exit(1) + + +def _info(message: str) -> None: + print(f"[vllm-smoke] {message}") + + +def _check_vllm_health(base_url: str, timeout: int = 120) -> None: + """Wait for /v1/models to return at least one model.""" + models_url = f"{base_url}/models" + _info(f"Waiting for vLLM at {models_url} (timeout {timeout}s) ...") + deadline = time.time() + timeout + last_err = None + while True: + try: + req = urllib.request.Request(models_url) + with urllib.request.urlopen(req, timeout=5) as resp: + if resp.status == 200: + data = json.loads(resp.read().decode()) + models = data.get("data", []) + if models: + model_ids = [m.get("id") for m in models] + _info(f"vLLM serving {len(models)} model(s): {model_ids}") + return + last_err = "No models loaded yet" + except Exception as exc: + last_err = str(exc) + + if time.time() >= deadline: + _fail(f"vLLM not ready: {last_err}") + time.sleep(3) + + +def _check_model_loaded(base_url: str, expected_model: str) -> None: + """Verify a specific model is loaded on the server.""" + models_url = f"{base_url}/models" + _info(f"Checking model '{expected_model}' is loaded ...") + try: + req = urllib.request.Request(models_url) + with urllib.request.urlopen(req, timeout=10) as resp: + data = json.loads(resp.read().decode()) + except Exception as exc: + _fail(f"Failed to query models: {exc}") + + model_ids = [m.get("id") for m in data.get("data", [])] + if expected_model not in model_ids: + _fail(f"Model '{expected_model}' not found in {model_ids}") + _info(f"Model '{expected_model}' OK") + + +def _check_inference(base_url: str, model: str) -> None: + """Send a minimal chat 
completion to verify inference works.""" + completions_url = f"{base_url}/chat/completions" + _info(f"Testing inference on '{model}' ...") + payload = json.dumps({ + "model": model, + "messages": [{"role": "user", "content": "Say OK."}], + "max_tokens": 8, + }).encode() + req = urllib.request.Request( + completions_url, + data=payload, + headers={"Content-Type": "application/json"}, + ) + try: + with urllib.request.urlopen(req, timeout=120) as resp: + if resp.status != 200: + _fail(f"Inference request failed: HTTP {resp.status}") + data = json.loads(resp.read().decode()) + except Exception as exc: + _fail(f"Inference request failed: {exc}") + + choices = data.get("choices", []) + if not choices: + _fail("Inference returned no choices") + content = choices[0].get("message", {}).get("content", "") + _info(f"Inference OK — response: {content!r}") + + +def main() -> None: + base_url = os.getenv("VLLM_BASE_URL", "http://localhost:8000/v1").rstrip("/") + expected_model = os.getenv("VLLM_MODEL", "") + timeout = int(os.getenv("VLLM_HEALTH_TIMEOUT", "120")) + + _check_vllm_health(base_url, timeout=timeout) + + if expected_model: + _check_model_loaded(base_url, expected_model) + else: + # Use the first available model for the inference check + models_url = f"{base_url}/models" + req = urllib.request.Request(models_url) + with urllib.request.urlopen(req, timeout=10) as resp: + data = json.loads(resp.read().decode()) + expected_model = data["data"][0]["id"] + + _check_inference(base_url, expected_model) + _info("vLLM smoke checks passed") + + +if __name__ == "__main__": + main() diff --git a/tests/unit/test_vllm_provider.py b/tests/unit/test_vllm_provider.py new file mode 100644 index 000000000..ae2ea9d1a --- /dev/null +++ b/tests/unit/test_vllm_provider.py @@ -0,0 +1,205 @@ +"""Unit tests for VLLMProvider.""" + +import json +import unittest +import urllib.error +from unittest.mock import MagicMock, patch + +from src.archi.providers.base import ModelInfo, ProviderConfig, 
ProviderType +from src.archi.providers.vllm_provider import VLLMProvider, DEFAULT_VLLM_BASE_URL + + +class TestVLLMProviderInit(unittest.TestCase): + """Test VLLMProvider initialization.""" + + def test_default_config(self): + provider = VLLMProvider() + assert provider.config.base_url == DEFAULT_VLLM_BASE_URL + assert provider.config.provider_type == ProviderType.VLLM + assert provider._api_key == "not-needed" + + def test_custom_base_url(self): + config = ProviderConfig( + provider_type=ProviderType.VLLM, + base_url="http://gpu-node:9000/v1", + ) + provider = VLLMProvider(config) + assert provider.config.base_url == "http://gpu-node:9000/v1" + + @patch.dict("os.environ", {"VLLM_BASE_URL": "http://env-host:8000/v1"}) + def test_env_overrides_default(self): + provider = VLLMProvider() + assert provider.config.base_url == "http://env-host:8000/v1" + + @patch.dict("os.environ", {"VLLM_BASE_URL": "http://env-host:8000/v1"}) + def test_env_overrides_config(self): + config = ProviderConfig( + provider_type=ProviderType.VLLM, + base_url="http://config-host:8000/v1", + ) + provider = VLLMProvider(config) + assert provider.config.base_url == "http://env-host:8000/v1" + + def test_api_key_defaults_to_not_needed(self): + # When no config provided, api_key is set to "not-needed" + provider = VLLMProvider() + assert provider._api_key == "not-needed" + + def test_api_key_not_mutated_on_passed_config(self): + # When config is provided without api_key, __init__ should not mutate it + config = ProviderConfig(provider_type=ProviderType.VLLM) + VLLMProvider(config) + assert config.api_key is None + + def test_normalizes_base_url_without_scheme(self): + config = ProviderConfig( + provider_type=ProviderType.VLLM, + base_url="gpu-node:8000/v1", + ) + provider = VLLMProvider(config) + assert provider.config.base_url == "http://gpu-node:8000/v1" + + def test_base_url_defaults_when_config_has_none(self): + config = ProviderConfig(provider_type=ProviderType.VLLM, base_url=None) + provider 
= VLLMProvider(config) + assert provider.config.base_url == DEFAULT_VLLM_BASE_URL + + +class TestVLLMProviderGetChatModel(unittest.TestCase): + """Test get_chat_model returns ChatOpenAI with correct params.""" + + @patch("langchain_openai.ChatOpenAI", autospec=True) + def test_returns_chat_openai_with_defaults(self, mock_chat_openai): + provider = VLLMProvider() + provider.get_chat_model("my-model") + + mock_chat_openai.assert_called_once() + call_kwargs = mock_chat_openai.call_args[1] + assert call_kwargs["model"] == "my-model" + assert call_kwargs["base_url"] == DEFAULT_VLLM_BASE_URL + assert call_kwargs["api_key"] == "not-needed" + assert call_kwargs["streaming"] is True + + @patch("langchain_openai.ChatOpenAI", autospec=True) + def test_custom_base_url_passed_through(self, mock_chat_openai): + config = ProviderConfig( + provider_type=ProviderType.VLLM, + base_url="http://custom:8000/v1", + ) + provider = VLLMProvider(config) + provider.get_chat_model("Qwen/Qwen2.5-7B") + + call_kwargs = mock_chat_openai.call_args[1] + assert call_kwargs["base_url"] == "http://custom:8000/v1" + + +class TestVLLMProviderListModels(unittest.TestCase): + """Test list_models with mocked /v1/models endpoint.""" + + def _mock_response(self, data, status=200): + mock_resp = MagicMock() + mock_resp.status = status + mock_resp.read.return_value = json.dumps(data).encode() + mock_resp.__enter__ = MagicMock(return_value=mock_resp) + mock_resp.__exit__ = MagicMock(return_value=False) + return mock_resp + + @patch("src.archi.providers.vllm_provider.urllib.request.urlopen") + def test_fetches_models_from_server(self, mock_urlopen): + mock_urlopen.return_value = self._mock_response({ + "data": [ + {"id": "Qwen/Qwen2.5-7B-Instruct-1M"}, + {"id": "meta-llama/Llama-3-8B"}, + ] + }) + + provider = VLLMProvider() + models = provider.list_models() + + assert len(models) == 2 + assert models[0].id == "Qwen/Qwen2.5-7B-Instruct-1M" + assert models[1].id == "meta-llama/Llama-3-8B" + assert 
all(isinstance(m, ModelInfo) for m in models) + + @patch("src.archi.providers.vllm_provider.urllib.request.urlopen", side_effect=urllib.error.URLError("Connection refused")) + def test_falls_back_to_config_models(self, mock_urlopen): + config = ProviderConfig( + provider_type=ProviderType.VLLM, + base_url=DEFAULT_VLLM_BASE_URL, + models=[ModelInfo(id="fallback-model", name="fallback-model", display_name="Fallback")], + ) + provider = VLLMProvider(config) + models = provider.list_models() + + assert len(models) == 1 + assert models[0].id == "fallback-model" + + @patch("src.archi.providers.vllm_provider.urllib.request.urlopen", side_effect=urllib.error.URLError("Connection refused")) + def test_returns_empty_when_no_config_models(self, mock_urlopen): + provider = VLLMProvider() + models = provider.list_models() + assert models == [] + + +class TestVLLMProviderValidateConnection(unittest.TestCase): + """Test validate_connection.""" + + def _mock_response(self, status=200): + mock_resp = MagicMock() + mock_resp.status = status + mock_resp.__enter__ = MagicMock(return_value=mock_resp) + mock_resp.__exit__ = MagicMock(return_value=False) + return mock_resp + + @patch("src.archi.providers.vllm_provider.urllib.request.urlopen") + def test_returns_true_on_200(self, mock_urlopen): + mock_urlopen.return_value = self._mock_response(200) + provider = VLLMProvider() + assert provider.validate_connection() is True + + @patch("src.archi.providers.vllm_provider.urllib.request.urlopen", side_effect=urllib.error.URLError("Connection refused")) + def test_returns_false_on_failure(self, mock_urlopen): + provider = VLLMProvider() + assert provider.validate_connection() is False + + +class TestVLLMProviderRegistration(unittest.TestCase): + """Test that vLLM is properly registered in the provider system.""" + + def test_provider_type_enum_exists(self): + assert ProviderType.VLLM == "vllm" + + def test_get_provider_returns_vllm(self): + from src.archi.providers import ( + 
_PROVIDER_REGISTRY, _PROVIDER_INSTANCES, + register_provider, get_provider, + ) + # Manually register only VLLMProvider to avoid importing all providers + _PROVIDER_REGISTRY.clear() + _PROVIDER_INSTANCES.clear() + register_provider(ProviderType.VLLM, VLLMProvider) + + provider = get_provider("vllm") + assert isinstance(provider, VLLMProvider) + + _PROVIDER_REGISTRY.clear() + _PROVIDER_INSTANCES.clear() + + def test_get_provider_by_name_returns_vllm(self): + from src.archi.providers import ( + _PROVIDER_REGISTRY, _PROVIDER_INSTANCES, + register_provider, get_provider_by_name, + ) + _PROVIDER_REGISTRY.clear() + _PROVIDER_INSTANCES.clear() + register_provider(ProviderType.VLLM, VLLMProvider) + + provider = get_provider_by_name("vllm") + assert isinstance(provider, VLLMProvider) + + _PROVIDER_REGISTRY.clear() + _PROVIDER_INSTANCES.clear() + + +if __name__ == "__main__": + unittest.main() From f1d0c3ac74320bdff923625f1ff4ee257f7a159d Mon Sep 17 00:00:00 2001 From: Austin Swinney Date: Thu, 26 Mar 2026 19:01:55 -0400 Subject: [PATCH 2/2] remove vLLM server management, keep provider client only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vLLM infrastructure (Docker compose service, GPU config, engine args, smoke tests) is now the operator's responsibility. archi connects to any vLLM instance via base_url — Docker, bare metal, Slurm, or k8s. removes: vllm-server from compose template, service registry, templates manager, service builder. reverts unrelated changes (retriever URL fix, .gitignore, chat_app whitespace). rewrites vllm docs as "how to connect" instead of "how to deploy". 
--- .gitignore | 1 - docs/docs/user_guide.md | 18 +- docs/docs/vllm.md | 345 +++++++----------- examples/deployments/basic-gpu/config.yaml | 57 --- examples/deployments/basic-vllm/config.yaml | 40 +- openspec/changes/add-vllm-provider/design.md | 30 -- .../changes/add-vllm-provider/proposal.md | 14 - .../specs/vllm-provider/spec.md | 56 --- .../specs/vllm-server/spec.md | 55 --- openspec/changes/add-vllm-provider/tasks.md | 24 -- src/archi/pipelines/agents/tools/retriever.py | 3 - src/cli/managers/templates_manager.py | 26 -- src/cli/service_registry.py | 11 - src/cli/templates/base-compose.yaml | 83 +---- src/cli/utils/service_builder.py | 1 - src/interfaces/chat_app/app.py | 1 + tests/smoke/combined_smoke.sh | 10 - tests/smoke/preflight.py | 19 +- tests/smoke/vllm_smoke.py | 122 ------- 19 files changed, 173 insertions(+), 743 deletions(-) delete mode 100644 examples/deployments/basic-gpu/config.yaml delete mode 100644 openspec/changes/add-vllm-provider/design.md delete mode 100644 openspec/changes/add-vllm-provider/proposal.md delete mode 100644 openspec/changes/add-vllm-provider/specs/vllm-provider/spec.md delete mode 100644 openspec/changes/add-vllm-provider/specs/vllm-server/spec.md delete mode 100644 openspec/changes/add-vllm-provider/tasks.md delete mode 100644 tests/smoke/vllm_smoke.py diff --git a/.gitignore b/.gitignore index 50c17c208..8386cf17a 100644 --- a/.gitignore +++ b/.gitignore @@ -21,7 +21,6 @@ docs/site setup.sh .env configs/ -config/ build/ src/cli/utils/_repository_info.py openspec/specs/ diff --git a/docs/docs/user_guide.md b/docs/docs/user_guide.md index fa9335d98..99157bebf 100644 --- a/docs/docs/user_guide.md +++ b/docs/docs/user_guide.md @@ -413,18 +413,19 @@ To use a local model, specify one of the local model classes in `models.py`: ### vLLM -For high-throughput GPU inference with tool-calling support, Archi can deploy a [vLLM](https://docs.vllm.ai/) server as a sidecar container. 
Reference models with the `vllm/` prefix in your config: +For high-throughput GPU inference with tool-calling support, Archi can connect to an external [vLLM](https://docs.vllm.ai/) server. Reference models with the `vllm/` prefix in your config: ```yaml -archi: - pipeline_map: - CMSCompOpsAgent: - models: - required: - agent_model: vllm/Qwen/Qwen3-8B +services: + chat_app: + providers: + vllm: + enabled: true + base_url: http://your-vllm-host:8000/v1 + default_model: "Qwen/Qwen3-8B" ``` -Deploy with `--services chatbot,vllm-server --gpu-ids all`. See the [vLLM Provider](vllm.md) page for full configuration, architecture details, and troubleshooting. +You deploy and manage the vLLM server independently. See the [vLLM Provider](vllm.md) page for setup examples (Docker, bare metal, Slurm) and troubleshooting. ### Models via APIs @@ -683,7 +684,6 @@ Required secrets for PostgreSQL: ```bash PG_PASSWORD=your_secure_password ``` ->>>>>>> 9b860a3f (docs: add vllm provider documentation) --- diff --git a/docs/docs/vllm.md b/docs/docs/vllm.md index 99ad5fe3e..5d7a030b9 100644 --- a/docs/docs/vllm.md +++ b/docs/docs/vllm.md @@ -1,6 +1,6 @@ # vLLM Provider -Run open-weight models on your own GPUs using [vLLM](https://docs.vllm.ai/) as an inference backend. Archi deploys vLLM as a **sidecar container** alongside the chatbot — no external server management required. +Run open-weight models on your own GPUs using [vLLM](https://docs.vllm.ai/) as an inference backend. Archi connects to any vLLM server via its OpenAI-compatible API — you deploy and manage vLLM independently. ## Why vLLM? @@ -14,106 +14,72 @@ Run open-weight models on your own GPUs using [vLLM](https://docs.vllm.ai/) as a vLLM is the best fit when you need high-throughput local inference, multi-GPU support, or full data privacy with tool-calling capabilities. 
-## Prerequisites +## Architecture + +``` +┌──────────────────────┐ ┌──────────────────────┐ +│ archi deployment │ │ vLLM (external) │ +│ │ │ │ +│ ┌────────────────┐ │ HTTP │ Docker container │ +│ │ VLLMProvider │──│────────>│ OR bare metal │ +│ │ (Python client)│ │ :8000 │ OR Slurm job │ +│ └────────────────┘ │ /v1/* │ OR Kubernetes pod │ +│ │ │ │ +└──────────────────────┘ └──────────────────────┘ +``` -- NVIDIA GPUs with sufficient VRAM for your chosen model -- NVIDIA drivers and the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) installed -- Container runtime configured for GPU access (see [Advanced Setup](advanced_setup_deploy.md#running-llms-locally-on-your-gpus)) +Archi's `VLLMProvider` is a thin client that talks to vLLM's `/v1` API using the same `ChatOpenAI` LangChain class it would use for the OpenAI API. From the pipeline's perspective, vLLM looks identical to a remote OpenAI endpoint. + +**Archi does not manage the vLLM server.** You deploy, configure, and maintain vLLM independently — whether as a Docker container, a bare metal process, a Slurm job, or a Kubernetes pod. Archi only needs a `base_url` to connect. ## Quick Start -### 1. Configure your deployment +### 1. Start a vLLM server -In your config YAML, reference models with the `vllm/` provider prefix: +See [Running vLLM](#running-vllm) below for Docker, bare metal, and Slurm examples. -```yaml -archi: - pipeline_map: - CMSCompOpsAgent: - models: - required: - agent_model: vllm/Qwen/Qwen3-8B +### 2. 
Configure archi + +In your config YAML, set up the vLLM provider with the URL of your server: +```yaml services: - vllm: - model: Qwen/Qwen3-8B # HuggingFace model ID - tool_parser: hermes # tool-call parser (optional) + chat_app: + default_provider: vllm + default_model: "vllm:Qwen/Qwen3-8B" + providers: + vllm: + enabled: true + base_url: http://localhost:8000/v1 # URL of your vLLM server + default_model: "Qwen/Qwen3-8B" + models: + - "vllm:Qwen/Qwen3-8B" ``` -> **Model naming**: vLLM uses HuggingFace model IDs (e.g. `Qwen/Qwen3-8B`), not Ollama-style names (e.g. `Qwen/Qwen3:8B`). Make sure the model ID matches what is available on HuggingFace Hub. - -### 2. Deploy +### 3. Deploy archi ```bash archi create -n my-deployment \ -c config.yaml \ -e .env \ - --services chatbot,vllm-server \ - --gpu-ids all + --services chatbot ``` -The CLI will: - -1. Add the `vllm-server` sidecar to Docker Compose -2. Wire `VLLM_BASE_URL` into the chatbot container -3. Set the chatbot to wait for vLLM's health check before starting - -### 3. Verify - -Once the deployment is up, check the vLLM server: +### 4. Verify ```bash +# Check vLLM is serving curl http://localhost:8000/v1/models -``` - -You should see your model listed in the response. -## Architecture - -``` -┌─────────────────────────────────────────────────┐ -│ Docker Compose stack │ -│ │ -│ ┌──────────┐ ┌────────────┐ ┌──────────┐ │ -│ │ chatbot │───>│ vllm-server│ │ postgres │ │ -│ │ (Flask) │ │ (sidecar) │ │ │ │ -│ └──────────┘ └────────────┘ └──────────┘ │ -│ :7861 :8000 :5432 │ -│ GPU access │ -└─────────────────────────────────────────────────┘ +# Check archi can reach it +curl http://localhost:7861/api/health ``` -The vLLM server runs as a **sidecar** — a companion container in the same Compose stack. 
It: - -- Exposes an OpenAI-compatible `/v1` API on port 8000 -- Receives requests from the chatbot over the Docker network -- Loads the model onto GPU at startup and serves it continuously -- Reports health via `/v1/models` (chatbot waits for this before starting) - -The chatbot talks to vLLM using the same `ChatOpenAI` LangChain class it would use for the OpenAI API. From the pipeline's perspective, vLLM looks identical to a remote OpenAI endpoint. - ## Configuration Reference -### Config YAML - -#### Model references - -Anywhere a model is referenced in `pipeline_map`, use the `vllm/` prefix: - -```yaml -archi: - pipeline_map: - CMSCompOpsAgent: - models: - required: - agent_model: vllm/Qwen/Qwen3-8B -``` - -The part after `vllm/` must match the HuggingFace model ID that vLLM is serving. - -#### vLLM provider settings +### Provider settings -The vLLM provider is configured under `services.chat_app.providers.vllm` in your config YAML. At minimum you need `enabled` and `default_model`: +The vLLM provider is configured under `services.chat_app.providers.vllm`: ```yaml services: @@ -123,178 +89,140 @@ services: enabled: true base_url: http://localhost:8000/v1 default_model: "Qwen/Qwen3-8B" - tool_call_parser: hermes # optional, default: hermes models: - - "Qwen/Qwen3-8B" + - "vllm:Qwen/Qwen3-8B" ``` | Setting | Type | Default | Description | |---|---|---|---| | `enabled` | bool | `false` | Enable the vLLM provider | | `base_url` | string | `http://localhost:8000/v1` | vLLM server OpenAI-compatible endpoint | -| `default_model` | string | `Qwen/Qwen2.5-7B-Instruct-1M` | HuggingFace model ID to serve | -| `tool_call_parser` | string | `hermes` | Parser for structured tool calls (`hermes`, `mistral`, `llama3_json`) | +| `default_model` | string | — | HuggingFace model ID to use for inference | | `models` | list | — | Available model IDs for the UI model selector | -#### vLLM Server Tuning +### Model references -When archi manages the vLLM sidecar container (deployed via 
`--services vllm-server`), you can configure server launch arguments alongside the provider settings above. Each key is translated to a vLLM CLI flag at container startup. All keys are optional — when omitted, vLLM's own defaults apply. - -> **Note**: These keys only affect the managed vLLM sidecar container. If you are pointing `base_url` at an external vLLM server, configure that server directly instead. +Anywhere a model is referenced in `pipeline_map`, use the `vllm/` prefix: -| Key | Type | Default | When to change | -|---|---|---|---| -| `gpu_memory_utilization` | float | `0.9` | Model barely fits in VRAM, or you want to reserve GPU memory for other processes | -| `max_model_len` | int | model default | Reduce context window to lower memory usage, or increase it for long-document workloads | -| `tensor_parallel_size` | int | `1` | Shard a large model across multiple GPUs | -| `dtype` | string | `auto` | Force a specific weight precision (`float16`, `bfloat16`) instead of auto-detection | -| `quantization` | string | none | Run quantized model weights (`awq`, `gptq`, `fp8`) to reduce memory | -| `enforce_eager` | bool | `false` | Disable CUDA graph compilation to save memory at the cost of throughput | -| `max_num_seqs` | int | `256` | Limit concurrent sequences to reduce memory pressure under high load | -| `enable_prefix_caching` | bool | `true` | Disable KV cache prefix sharing if it causes issues with your model | +```yaml +archi: + pipeline_map: + CMSCompOpsAgent: + models: + required: + agent_model: vllm/Qwen/Qwen3-8B +``` -##### Complete config example +The part after `vllm/` must match the HuggingFace model ID that vLLM is serving. -A single-GPU deployment with memory tuning: +> **Model naming**: vLLM uses HuggingFace model IDs (e.g. `Qwen/Qwen3-8B`), not Ollama-style names (e.g. `Qwen/Qwen3:8B`). 
-```yaml -services: - chat_app: - providers: - vllm: - enabled: true - base_url: http://localhost:8000/v1 - default_model: "Qwen/Qwen3-8B" - tool_call_parser: hermes - models: - - "Qwen/Qwen3-8B" - gpu_memory_utilization: 0.85 - max_model_len: 8192 -``` +## Running vLLM -##### `engine_args` passthrough +Archi does not manage the vLLM server. Below are examples for common deployment scenarios. -For any vLLM flag not covered by a named key above, use the `engine_args` map. Each entry is passed as `-- ` to the vLLM server. Keys use kebab-case matching vLLM's CLI flags. For boolean flags that take no argument (e.g. `--trust-remote-code`), use an empty string as the value. Do not duplicate flags that already have a named key above. +### Docker -```yaml -services: - chat_app: - providers: - vllm: - engine_args: - swap-space: 8 # CPU swap space per GPU in GiB (default: 4) - seed: 42 - trust-remote-code: "" # bare flag (no value) — use "" for boolean flags +```bash +docker run -d \ + --name vllm-server \ + --gpus all \ + --ipc=host \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + -p 8000:8000 \ + -e NCCL_P2P_DISABLE=1 \ + vllm/vllm-openai:latest \ + --model Qwen/Qwen3-8B \ + --enable-auto-tool-choice \ + --tool-call-parser hermes ``` -##### Multi-GPU example +Key flags: +- `--gpus all` — GPU passthrough +- `--ipc=host` — required for NCCL multi-GPU communication (Docker's default 64MB shm causes crashes) +- `--ulimit memlock=-1` — prevents OS from swapping out VRAM-mapped buffers +- `NCCL_P2P_DISABLE=1` — required for V100s and older GPU topologies -Sharding a 30B model across 4 GPUs: +### Bare metal -```yaml -services: - chat_app: - providers: - vllm: - enabled: true - base_url: http://localhost:8000/v1 - default_model: "Qwen/Qwen3-30B-A3B-Instruct" - tool_call_parser: hermes - models: - - "Qwen/Qwen3-30B-A3B-Instruct" - gpu_memory_utilization: 0.92 - tensor_parallel_size: 4 - max_model_len: 16384 - dtype: bfloat16 - engine_args: - swap-space: 8 +```bash +pip 
install vllm + +python -m vllm.entrypoints.openai.api_server \ + --model Qwen/Qwen3-8B \ + --enable-auto-tool-choice \ + --tool-call-parser hermes \ + --host 0.0.0.0 \ + --port 8000 ``` -Deploy with all GPUs: +### Slurm ```bash -archi create -n my-deployment \ - -c config.yaml \ - --services chatbot,vllm-server \ - --gpu-ids 0,1,2,3 +#!/bin/bash +#SBATCH --gres=gpu:4 +#SBATCH --cpus-per-task=16 +#SBATCH --mem=128G +#SBATCH --time=7-00:00:00 + +module load cuda +source activate vllm + +python -m vllm.entrypoints.openai.api_server \ + --model Qwen/Qwen3-8B \ + --tensor-parallel-size 4 \ + --enable-auto-tool-choice \ + --tool-call-parser hermes \ + --host 0.0.0.0 \ + --port 8000 ``` -### Environment Variables +Then set `base_url` in your archi config to the Slurm node's address. -| Variable | Default | Description | -|---|---|---| -| `VLLM_BASE_URL` | `http://vllm-server:8000/v1` | Override the vLLM server URL (auto-set by the CLI) | +### Common vLLM server flags -You generally don't need to set `VLLM_BASE_URL` manually — the CLI injects it into the chatbot container. It is useful if you are running vLLM on a separate host. +These are configured on the vLLM server itself, not in archi: -### Host Networking - -When deploying with `--hostmode`, the vLLM server uses `network_mode: host` and all services communicate via `localhost`. Without host mode, services communicate via Docker DNS (`vllm-server:8000`). 
+| Flag | Description | +|---|---| +| `--gpu-memory-utilization 0.9` | Fraction of GPU VRAM to use (0.0-1.0) | +| `--max-model-len 8192` | Cap context window to reduce memory | +| `--tensor-parallel-size 4` | Shard model across N GPUs | +| `--dtype bfloat16` | Force weight precision | +| `--quantization awq` | Run quantized weights (awq, gptq, fp8) | +| `--enforce-eager` | Disable CUDA graphs to save memory | +| `--max-num-seqs 256` | Limit concurrent sequences | +| `--enable-auto-tool-choice` | Enable tool calling pathway | +| `--tool-call-parser hermes` | Parser for structured tool calls | + +See [vLLM documentation](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html) for the full reference. ## Tool Calling -vLLM supports function/tool calling for ReAct agents, but requires explicit server flags. Archi configures these automatically: +vLLM supports function/tool calling for ReAct agents, but requires explicit server flags: - `--enable-auto-tool-choice` — enables the tool calling pathway - `--tool-call-parser ` — selects the parser for the model family -The `tool_parser` setting should match your model's chat template: - | Model family | Parser | |---|---| | Qwen (Qwen2.5, Qwen3) | `hermes` | | Mistral / Mixtral | `mistral` | | Llama 3 | `llama3_json` | -If tool calling is not needed for your use case, these flags are harmless and can be left at defaults. - -## Smoke Testing - -To run smoke tests against a vLLM deployment: - -```bash -export SMOKE_PROVIDER=vllm -export SMOKE_VLLM_BASE_URL=http://localhost:8000/v1 -export SMOKE_VLLM_MODEL=Qwen/Qwen3-8B -scripts/dev/run_smoke_preview.sh my-deployment -``` - -This runs preflight checks (verifies vLLM is reachable) followed by a basic chat completion test through the chatbot endpoint. +These flags must be set when starting the vLLM server, not in archi's config. ## Troubleshooting -### vLLM server not starting - -**Symptom**: Container exits immediately or stays in a restart loop. 
- -**Check logs**: -```bash -docker logs vllm-server- -``` +### Archi can't reach vLLM -Common causes: +**Symptom**: `ConnectionError: Connection refused` or timeout. -- **Insufficient VRAM**: The model doesn't fit in GPU memory. Options: - - Lower `gpu_memory_utilization` (e.g. `0.7`) to leave headroom for other processes - - Set `max_model_len` to a smaller value (e.g. `4096`) to reduce KV cache memory - - Add `quantization: awq` or `quantization: gptq` if the model has quantized weights available - - Set `enforce_eager: true` to disable CUDA graphs (saves memory, reduces throughput) - - Increase `tensor_parallel_size` and add more GPUs via `--gpu-ids` - - Try a smaller model -- **Missing NVIDIA runtime**: Ensure the NVIDIA Container Toolkit is installed and configured. -- **/dev/shm too small**: vLLM warns at startup if shared memory is below 1 GB. The container uses `ipc: host` by default, but if that is restricted, increase `shm_size`. -- **Invalid engine argument**: If the vLLM log shows `unrecognized arguments`, check for typos in `engine_args` keys (must be kebab-case, e.g. `swap-space` not `swap_space`) or boolean flags that need an empty-string value (`""`). - -### Chatbot can't reach vLLM - -**Symptom**: `ConnectionError: Name or service not known` or `Connection refused`. - -- Verify both containers are on the same Docker network (default when not using `--hostmode`). -- Check that `VLLM_BASE_URL` in the chatbot container resolves correctly: - ```bash - docker exec curl http://vllm-server:8000/v1/models - ``` -- If using `--hostmode`, ensure `VLLM_BASE_URL` uses `localhost` instead of `vllm-server`. 
+- Verify vLLM is running: `curl http://:8000/v1/models` +- If vLLM is on a different host, ensure network connectivity and firewall rules allow port 8000 +- If running in Docker, ensure the archi container can reach the vLLM host (use `--network=host` or configure Docker networking) +- Check that `base_url` in your archi config matches the actual vLLM server address ### Model not found (404) @@ -302,14 +230,14 @@ Common causes: vLLM uses HuggingFace model IDs, not Ollama-style names. Check: -- Config uses dashes, not colons: `vllm/Qwen/Qwen3-8B` (not `Qwen/Qwen3:8B`) -- The model ID matches exactly what vLLM is serving (`curl localhost:8000/v1/models`) +- Config uses the exact model ID from `curl :8000/v1/models` +- Use dashes, not colons: `Qwen/Qwen3-8B` (not `Qwen/Qwen3:8B`) ### Tool calling returns 400 **Symptom**: `400 Bad Request: "auto" tool choice requires --enable-auto-tool-choice`. -This means the vLLM server wasn't started with tool calling flags. If you are deploying through the CLI, this is handled automatically. If running vLLM manually, add: +The vLLM server wasn't started with tool calling flags. Add to your vLLM launch command: ```bash --enable-auto-tool-choice --tool-call-parser hermes @@ -317,4 +245,15 @@ This means the vLLM server wasn't started with tool calling flags. If you are de ### Slow first response -The first request after startup may be slow (30-60s) while vLLM compiles CUDA kernels and warms up. Subsequent requests will be significantly faster. The chatbot's `depends_on` health check ensures it doesn't send requests before vLLM is ready, but the health check only confirms the server is listening — not that the first compilation is complete. If startup compilation time is a problem, set `enforce_eager: true` to skip CUDA graph compilation (at the cost of lower throughput). +The first request after startup may be slow (30-60s) while vLLM compiles CUDA kernels. Subsequent requests will be significantly faster. 
If this is a problem, start vLLM with `--enforce-eager` to skip CUDA graph compilation (at the cost of lower throughput). + +### Insufficient VRAM + +If vLLM crashes or the model doesn't fit in GPU memory: + +- Lower `--gpu-memory-utilization` (e.g. `0.7`) +- Set `--max-model-len` to a smaller value (e.g. `4096`) +- Add `--quantization awq` or `--quantization gptq` if quantized weights are available +- Set `--enforce-eager` to disable CUDA graphs +- Increase `--tensor-parallel-size` and use more GPUs +- Try a smaller model diff --git a/examples/deployments/basic-gpu/config.yaml b/examples/deployments/basic-gpu/config.yaml deleted file mode 100644 index e76cf572c..000000000 --- a/examples/deployments/basic-gpu/config.yaml +++ /dev/null @@ -1,57 +0,0 @@ -# Basic configuration file for an Archi deployment -# with a chat app interface, CMSCompOpsAgent pipeline, and -# PostgreSQL with pgvector for document storage. -# The LLM is a locally hosted VLLM instance on GPUs. -# -# This example demonstrates using local GPU resources with VLLM -# in OpenAI-compatible mode with the CMSCompOpsAgent pipeline. 
-# -# Prerequisites: -# - VLLM server running on localhost:8000 with a model loaded -# - GPU(s) available on the host machine -# -# Run with: -# archi create --name my-archi-gpu-agent --config examples/deployments/basic-gpu/agent-config.yaml --services chatbot --gpu-ids=all - -name: main-gpu-agent - -services: - chat_app: - pipeline: CMSCompOpsAgent - trained_on: "Documentation and guides" - port: 7861 - external_port: 7861 - providers: - vllm: - enabled: true - base_url: http://localhost:8000/v1 - default_model: "Qwen/Qwen3-8B" - vectorstore: - backend: postgres # PostgreSQL with pgvector (only supported backend) - data_manager: - port: 7872 - external_port: 7872 - postgres: - port: 5432 - user: archi - database: archi-db - host: postgres - -data_manager: - sources: - links: - input_lists: - - config/source.list - embedding_name: HuggingFaceEmbeddings - -archi: - pipelines: - - CMSCompOpsAgent - pipeline_map: - CMSCompOpsAgent: - prompts: - required: - agent_prompt: config/main.prompt - models: - required: - agent_model: local/Qwen/Qwen3:8B diff --git a/examples/deployments/basic-vllm/config.yaml b/examples/deployments/basic-vllm/config.yaml index 0c497bb01..26573052e 100644 --- a/examples/deployments/basic-vllm/config.yaml +++ b/examples/deployments/basic-vllm/config.yaml @@ -1,9 +1,11 @@ -# Basic configuration file for a Archi deployment -# with a chat app interface and PostgreSQL with pgvector for document storage. -# The LLM is used through an existing Ollama server. +# Basic configuration file for an Archi deployment +# using a vLLM server for LLM inference. +# +# The vLLM server must be running and accessible at the base_url below. +# Archi does not manage the vLLM server — see docs/docs/vllm.md for setup guidance. 
# # run with: -# archi create --name my-archi-vllm --config examples/deployments/basic-vllm/config.yaml --services chatbot --hostmode +# archi create --name my-archi-vllm --config examples/deployments/basic-vllm/config.yaml --services chatbot name: my_archi @@ -11,41 +13,25 @@ services: chat_app: agent_class: CMSCompOpsAgent agents_dir: examples/agents - default_provider: local - default_model: qwen3:32b + default_provider: vllm + default_model: "vllm:Qwen/Qwen3-8B" providers: vllm: enabled: true - base_url: http://localhost:8000/v1 # make sure this matches your vllm server URL! - mode: ollama - default_model: "vllm:qwen3:8b" # make sure this matches a model you have downloaded locally with ollama + base_url: http://localhost:8000/v1 # URL of your vLLM server + default_model: "Qwen/Qwen3-8B" models: - - "vllm:Gemma 3:4B" - # - "vllm:Gemma 3:12B" - - "vllm:qwen3:8B-Instruct" - # - "vllm:Qwen3-30B-Instruct" - # --- vLLM server engine args (all optional) --- - # gpu_memory_utilization: 0.9 # fraction of GPU VRAM (0.0-1.0, default: 0.9) - # max_model_len: 8192 # cap context window to reduce memory - # tensor_parallel_size: 2 # shard model across N GPUs - # dtype: auto # weight precision (auto, float16, bfloat16) - # quantization: awq # quantization method (awq, gptq, fp8) - # enforce_eager: false # disable CUDA graphs to save memory - # max_num_seqs: 256 # max concurrent sequences - # enable_prefix_caching: true # KV cache prefix sharing - # engine_args: # passthrough for any other vLLM flag - # swap-space: 4 # CPU swap space per GPU in GiB - # seed: 42 + - "vllm:Qwen/Qwen3-8B" trained_on: "FASRC DOCS" port: 7861 external_port: 7861 vectorstore: - backend: postgres # PostgreSQL with pgvector (only supported backend) + backend: postgres data_manager: port: 7889 external_port: 7889 auth: - enabled: false # set to true and provide DM_API_TOKEN in .env for production + enabled: false data_manager: sources: diff --git a/openspec/changes/add-vllm-provider/design.md 
b/openspec/changes/add-vllm-provider/design.md deleted file mode 100644 index d4bdf79e6..000000000 --- a/openspec/changes/add-vllm-provider/design.md +++ /dev/null @@ -1,30 +0,0 @@ -# Design: vLLM Provider Integration - -## Architecture -- **Client:** `chatbot-main-gpu-agent` container. -- **Server:** `vllm-server` container (host-based or separate container). -- **Protocol:** HTTP/REST (OpenAI Schema). - -## Technical Decisions - -### Decision: Inherit from OpenAIProvider -Since vLLM is OpenAI-compatible, the `VLLMProvider` should inherit from Archi's `OpenAIProvider` (or `BaseProvider`) to reuse JSON mapping logic, but override the endpoint resolution to handle the Docker internal network. - -### Decision: V100 Stability -Inject `NCCL_P2P_DISABLE=1` into the provider's connection logic if not already handled by the environment to ensure stable communication with older NVLink/PCIe topologies on V100s. - -### Decision: Critical Docker Performance Tuning -To achieve bare-metal parity when running vLLM inside a container, the following runtime configurations MUST be enforced. These prevent the common "Docker Tax" on LLM inference. - -#### 1. Shared Memory Access (`--ipc=host`) -vLLM utilizes NCCL for multi-GPU communication and PagedAttention for memory management. -- **Requirement:** Containers must be started with `--ipc=host`. -- **Reason:** Docker’s default 64MB shm-size causes immediate crashes during Tensor Parallelism initialization. Using the host's IPC namespace provides the necessary memory bandwidth for inter-GPU coordination. - -#### 2. Network Latency Optimization (`--network=host`) -- **Requirement:** Use `--network=host` for the `vllm-server` container where feasible. -- **Reason:** Bypasses the Docker bridge (docker0) and user-land proxy (docker-proxy), reducing request/response overhead by 0.5–2ms per call—critical for high-concurrency streaming applications. - -#### 3. 
GPU Passthrough and Memory Locking -- **Requirement:** Ensure `--gpus all` and `--ulimit memlock=-1 --ulimit stack=67108864` are set. -- **Reason:** vLLM pre-allocates up to 90% of VRAM (default). Memlocking prevents the OS from swapping out these critical buffers, ensuring consistent P99 latencies. diff --git a/openspec/changes/add-vllm-provider/proposal.md b/openspec/changes/add-vllm-provider/proposal.md deleted file mode 100644 index f2f2d9c74..000000000 --- a/openspec/changes/add-vllm-provider/proposal.md +++ /dev/null @@ -1,14 +0,0 @@ -# Proposal: Add vLLM Provider -## Intent -Enable high-throughput local inference on NVIDIA V100 GPUs using the vLLM engine. This provides an OpenAI-compatible alternative to the current Ollama and external API providers. - -## Scope -- New provider class `VLLMProvider` in `src/archi/providers/`. -- Integration into the provider factory. -- Support for streaming and non-streaming chat completions. -- V100-specific configuration (NCCL flags). - -## Constraints -- MUST use OpenAI-compatible API format. -- MUST support the `base_url` parameter for remote container access. -- MUST handle V100-specific environment variables for stability. \ No newline at end of file diff --git a/openspec/changes/add-vllm-provider/specs/vllm-provider/spec.md b/openspec/changes/add-vllm-provider/specs/vllm-provider/spec.md deleted file mode 100644 index 854ac5aee..000000000 --- a/openspec/changes/add-vllm-provider/specs/vllm-provider/spec.md +++ /dev/null @@ -1,56 +0,0 @@ -## ADDED Requirements - -### Requirement: VLLMProvider registered as a first-class provider type -The system SHALL register a `VLLM` provider type in the `ProviderType` enum and provider registry, distinct from the `LOCAL` provider. Pipeline configs SHALL reference vLLM models as `vllm/`. 
- -#### Scenario: Pipeline resolves vLLM model reference -- **WHEN** a pipeline config specifies `vllm/Qwen/Qwen2.5-7B-Instruct-1M` as a model -- **THEN** `BasePipeline._parse_provider_model()` splits it into provider `"vllm"` and model `"Qwen/Qwen2.5-7B-Instruct-1M"`, and `get_model()` returns a `ChatOpenAI` instance from `VLLMProvider` - -#### Scenario: Provider name alias resolves to VLLM type -- **WHEN** `get_provider_by_name("vllm")` is called -- **THEN** it SHALL return a `VLLMProvider` instance (not `LocalProvider`) - -### Requirement: VLLMProvider returns ChatOpenAI with correct defaults -The `VLLMProvider.get_chat_model()` SHALL return a `ChatOpenAI` instance configured with `base_url` defaulting to `http://localhost:8000/v1` and `api_key` defaulting to `"not-needed"`. - -#### Scenario: Default base URL used when none configured -- **WHEN** `VLLMProvider` is instantiated with no `base_url` in config -- **THEN** `get_chat_model("my-model")` returns a `ChatOpenAI` with `base_url="http://localhost:8000/v1"` - -#### Scenario: Custom base URL from config -- **WHEN** `VLLMProvider` is instantiated with `base_url="http://vllm-host:9000/v1"` in config -- **THEN** `get_chat_model("my-model")` returns a `ChatOpenAI` with that base URL - -#### Scenario: Environment variable overrides config -- **WHEN** `VLLM_BASE_URL` environment variable is set -- **THEN** `VLLMProvider` SHALL use that value as base URL, overriding the config default - -### Requirement: VLLMProvider discovers models dynamically -The `VLLMProvider.list_models()` SHALL query the vLLM server's `/v1/models` endpoint and return discovered models as `ModelInfo` objects. 
- -#### Scenario: Server is reachable with loaded models -- **WHEN** `list_models()` is called and the vLLM server responds with a model list -- **THEN** each model is returned as a `ModelInfo` with `id`, `name`, and `display_name` populated from the response - -#### Scenario: Server is unreachable -- **WHEN** `list_models()` is called and the vLLM server does not respond -- **THEN** it SHALL return the statically configured model list from `ProviderConfig.models`, or an empty list if none configured - -### Requirement: VLLMProvider validates server connection -The `VLLMProvider.validate_connection()` SHALL check the vLLM server's health by hitting the `/v1/models` endpoint. - -#### Scenario: Server is healthy -- **WHEN** `validate_connection()` is called and `/v1/models` returns HTTP 200 -- **THEN** it SHALL return `True` - -#### Scenario: Server is down -- **WHEN** `validate_connection()` is called and the request fails or times out -- **THEN** it SHALL return `False` - -### Requirement: YAML config section for vLLM provider -The system SHALL support an `archi.providers.vllm` section in deployment YAML configs with fields: `enabled`, `base_url`, `default_model`, `models`. 
- -#### Scenario: Config loaded from YAML -- **WHEN** a deployment config contains `archi.providers.vllm` with `enabled: true` and `base_url: http://gpu-node:8000/v1` -- **THEN** `_build_provider_config_from_payload()` SHALL construct a `ProviderConfig` with `provider_type=ProviderType.VLLM` and the specified fields diff --git a/openspec/changes/add-vllm-provider/specs/vllm-server/spec.md b/openspec/changes/add-vllm-provider/specs/vllm-server/spec.md deleted file mode 100644 index 321b96e42..000000000 --- a/openspec/changes/add-vllm-provider/specs/vllm-server/spec.md +++ /dev/null @@ -1,55 +0,0 @@ -## ADDED Requirements - -### Requirement: vllm-server registered as a deployable service -The system SHALL register a `vllm-server` service in the `ServiceRegistry` that can be enabled via `archi create --services`. - -#### Scenario: User deploys with vllm-server -- **WHEN** `archi create --name my-bot --services chatbot,vllm-server --gpu-ids all` is run -- **THEN** the deployment directory SHALL contain a docker-compose service block for `vllm-server` with GPU passthrough - -#### Scenario: vllm-server not requested -- **WHEN** `archi create` is run without `vllm-server` in --services -- **THEN** no vllm-server service block SHALL be generated - -### Requirement: vllm-server container runs with required runtime config -The generated docker-compose service for `vllm-server` SHALL include `ipc: host`, `ulimits` (memlock unlimited, stack 67108864), and GPU device reservations. 
- -#### Scenario: Compose file generated with runtime config -- **WHEN** the deployment includes `vllm-server` -- **THEN** the docker-compose YAML for vllm-server SHALL contain `ipc: host`, `ulimits.memlock.soft: -1`, `ulimits.memlock.hard: -1`, `ulimits.stack: 67108864`, and `deploy.resources.reservations.devices` with GPU capabilities - -### Requirement: V100 stability via NCCL environment variable -The vllm-server container SHALL set `NCCL_P2P_DISABLE=1` in its environment to ensure stability on V100 GPU topologies. - -#### Scenario: NCCL flag present in container environment -- **WHEN** vllm-server is deployed -- **THEN** the container environment SHALL include `NCCL_P2P_DISABLE=1` - -### Requirement: vllm-server supports host networking mode -The vllm-server compose service SHALL use `network_mode: host` by default to minimize inference latency. - -#### Scenario: Host networking enabled -- **WHEN** vllm-server is deployed with default settings -- **THEN** the compose service SHALL include `network_mode: host` - -#### Scenario: Chatbot resolves vllm-server via host -- **WHEN** vllm-server uses host networking and chatbot uses bridge networking -- **THEN** the chatbot container SHALL receive a `VLLM_BASE_URL` environment variable pointing to the host IP and vLLM port - -### Requirement: vllm-server startup health check -The vllm-server compose service SHALL include a healthcheck that verifies the `/v1/models` endpoint is responding before dependent services start. 
- -#### Scenario: Healthy startup -- **WHEN** vllm-server finishes loading the model and `/v1/models` returns HTTP 200 -- **THEN** the healthcheck SHALL pass and dependent services (chatbot) SHALL start - -#### Scenario: Slow model load -- **WHEN** vllm-server takes longer than the healthcheck interval to load -- **THEN** the healthcheck SHALL retry until the model is loaded or the timeout is reached - -### Requirement: Shared memory size warning -The vllm-server startup SHALL log a warning if `/dev/shm` is smaller than 1GB. - -#### Scenario: Insufficient shared memory -- **WHEN** vllm-server starts and `/dev/shm` is less than 1GB -- **THEN** a warning SHALL be logged indicating that `ipc: host` or a larger `shm_size` is required for stable multi-GPU inference diff --git a/openspec/changes/add-vllm-provider/tasks.md b/openspec/changes/add-vllm-provider/tasks.md deleted file mode 100644 index 036de421d..000000000 --- a/openspec/changes/add-vllm-provider/tasks.md +++ /dev/null @@ -1,24 +0,0 @@ -Phase 1: Provider (thin client layer) -[x] 1. Add ProviderType.VLLM to enum: Add VLLM = "vllm" to ProviderType in src/archi/providers/base.py. - -[x] 2. Create VLLMProvider class: New file src/archi/providers/vllm_provider.py. Inherit from BaseProvider (not OpenAIProvider — avoids coupling to OpenAI's default model list and API key logic). Default base_url http://localhost:8000/v1, api_key defaults to "not-needed". get_chat_model() returns ChatOpenAI with correct base_url. list_models() hits /v1/models for dynamic discovery. validate_connection() health-checks /v1/models. - -[x] 3. Register provider: Update src/archi/providers/__init__.py — add to _ensure_providers_registered(), repoint "vllm" alias from ProviderType.LOCAL to ProviderType.VLLM in name_map. - -[x] 4. Config schema support: Support archi.providers.vllm section in YAML (fields: enabled, base_url, default_model, models). Wire into _build_provider_config_from_payload() in src/interfaces/chat_app/app.py. 
- -Phase 2: Infrastructure (server-side) -[x] 5. Register vllm-server in ServiceRegistry: New ServiceDefinition in src/cli/service_registry.py. GPU-dependent, port 8000 default, no volume required (model weights bind-mounted or cached). - -[x] 6. Docker Compose template for vllm-server: Base image vllm/vllm-openai or custom from base-pytorch-image. Server command: python -m vllm.entrypoints.openai.api_server --model . Environment: NCCL_P2P_DISABLE=1 (V100 stability). Runtime: ipc: host, ulimits (memlock: -1, stack: 67108864), GPU passthrough via deploy.resources.reservations.devices. - -[x] 7. Inter-container networking: If vllm-server uses network_mode: host, chatbot must reach it via host IP not Docker DNS. Expose VLLM_BASE_URL env var to chatbot container. VLLMProvider reads base_url from config or VLLM_BASE_URL env fallback. - -[x] 8. CLI integration: Wire vllm-server into archi create --services. Leverage existing --gpu-ids flag for GPU passthrough. Support model name configuration (which model the server loads). - -Phase 3: Validation -[x] 9. Unit tests for VLLMProvider: Mock /v1/models response for list_models(). Verify ChatOpenAI instantiation with correct base_url and api_key. Verify validate_connection() success/failure paths. - -[x] 10. Startup health check: Compose healthcheck or entrypoint script for vllm-server. Log warning if /dev/shm < 1GB. Chatbot depends_on vllm-server with condition: service_healthy. - -[x] 11. Smoke test: Extend existing smoke test infrastructure. Verify end-to-end: deploy → ingest → query via vLLM provider. 
diff --git a/src/archi/pipelines/agents/tools/retriever.py b/src/archi/pipelines/agents/tools/retriever.py index 823f47661..149aaf902 100644 --- a/src/archi/pipelines/agents/tools/retriever.py +++ b/src/archi/pipelines/agents/tools/retriever.py @@ -45,7 +45,6 @@ def _format_documents_for_llm( doc.metadata.get("filename") or "unknown source" ) - url = doc.metadata.get("url") or "" hash = ( doc.metadata.get("resource_hash") or "n/a" @@ -54,8 +53,6 @@ def _format_documents_for_llm( if len(text) > max_chars: text = f"{text[:max_chars].rstrip()}..." header = f"[{idx}] {source} (hash={hash})" - if url: - header += f"\nURL: {url}" footer = f"Score: {score:.4f}" if isinstance(score, (float, int)) else "Score: n/a" snippets.append(f"{header}\n{footer}\n{text}") diff --git a/src/cli/managers/templates_manager.py b/src/cli/managers/templates_manager.py index 99f738854..cf40c4293 100644 --- a/src/cli/managers/templates_manager.py +++ b/src/cli/managers/templates_manager.py @@ -481,32 +481,6 @@ def _render_compose_file(self, context: TemplateContext) -> None: if context.plan.get_service("grader").enabled: template_vars["rubrics"] = self._get_grader_rubrics(context.config_manager) - # Pass vLLM model name from provider config to compose template - vllm_cfg = context.config_manager.config.get("services", {}).get("chat_app", {}).get("providers", {}).get("vllm", {}) - if vllm_cfg.get("default_model"): - template_vars["vllm_model"] = vllm_cfg["default_model"] - if vllm_cfg.get("tool_call_parser"): - template_vars["vllm_tool_parser"] = vllm_cfg["tool_call_parser"] - - # Pass vLLM server configuration keys to compose template - if vllm_cfg.get("gpu_memory_utilization"): - template_vars["vllm_gpu_memory_utilization"] = vllm_cfg["gpu_memory_utilization"] - if vllm_cfg.get("max_model_len"): - template_vars["vllm_max_model_len"] = vllm_cfg["max_model_len"] - if vllm_cfg.get("tensor_parallel_size"): - template_vars["vllm_tensor_parallel_size"] = vllm_cfg["tensor_parallel_size"] - if 
vllm_cfg.get("dtype"): - template_vars["vllm_dtype"] = vllm_cfg["dtype"] - if vllm_cfg.get("quantization"): - template_vars["vllm_quantization"] = vllm_cfg["quantization"] - if "enforce_eager" in vllm_cfg: - template_vars["vllm_enforce_eager"] = vllm_cfg["enforce_eager"] - if vllm_cfg.get("max_num_seqs"): - template_vars["vllm_max_num_seqs"] = vllm_cfg["max_num_seqs"] - if "enable_prefix_caching" in vllm_cfg: - template_vars["vllm_enable_prefix_caching"] = vllm_cfg["enable_prefix_caching"] - template_vars["vllm_engine_args"] = vllm_cfg.get("engine_args", {}) - compose_template = self.env.get_template(BASE_COMPOSE_TEMPLATE) compose_rendered = compose_template.render(**template_vars) diff --git a/src/cli/service_registry.py b/src/cli/service_registry.py index 61554cd14..2f2e4718a 100644 --- a/src/cli/service_registry.py +++ b/src/cli/service_registry.py @@ -151,17 +151,6 @@ def _register_default_services(self): 'services.redmine_mailbox.project'] )) - # Compute services - self.register(ServiceDefinition( - name='vllm-server', - description='vLLM inference server for local GPU-accelerated LLM serving', - category='compute', - requires_image=False, - requires_volume=False, - default_host_port=8000, - default_container_port=8000, - )) - self.register(ServiceDefinition( name='benchmarking', depends_on=['postgres'], diff --git a/src/cli/templates/base-compose.yaml b/src/cli/templates/base-compose.yaml index 21cf01fce..387447097 100644 --- a/src/cli/templates/base-compose.yaml +++ b/src/cli/templates/base-compose.yaml @@ -118,7 +118,7 @@ services: args: APP_VERSION: {{ app_version }} container_name: {{ chatbot_container_name }} - {% if postgres_enabled or vllm_server_enabled -%} + {% if postgres_enabled -%} depends_on: {% if postgres_enabled -%} postgres: @@ -126,10 +126,6 @@ services: config-seed: condition: service_completed_successfully {% endif -%} - {% if vllm_server_enabled -%} - vllm-server: - condition: service_healthy - {% endif %} {% endif -%} environment: 
PGHOST: {{ 'localhost' if host_mode else 'postgres' }} @@ -140,9 +136,6 @@ services: VERBOSITY: {{ verbosity }} # Allow overriding Ollama host via env so containers can reach host daemon OLLAMA_HOST: ${OLLAMA_HOST:-} - {% if vllm_server_enabled -%} - VLLM_BASE_URL: ${VLLM_BASE_URL:-http://{{ 'localhost' if host_mode else 'vllm-server' }}:8000/v1} - {% endif %} {% for secret in required_secrets | default([]) -%} {{ secret.upper() }}_FILE: /run/secrets/{{ secret.lower() }} {% endfor %} @@ -564,80 +557,6 @@ services: {%- endif %} {%- endif %} - {% if vllm_server_enabled -%} - vllm-server: - image: vllm/vllm-openai:latest - container_name: vllm-server-{{ name }} - {% if not host_mode -%} - ports: - - "8000:8000" - {% endif -%} - environment: - NCCL_P2P_DISABLE: "1" - NVIDIA_VISIBLE_DEVICES: all - NVIDIA_DRIVER_CAPABILITIES: compute,utility,graphics - entrypoint: ["/bin/sh", "-c"] - command: - - | - SHM_SIZE=$$(df /dev/shm | awk 'NR==2 {print $$2}') - if [ "$$SHM_SIZE" -lt 1048576 ] 2>/dev/null; then - echo "WARNING: /dev/shm is $$(( $$SHM_SIZE / 1024 ))MB — less than 1GB. Use ipc: host or increase shm_size for stable multi-GPU inference." 
- fi - exec python3 -m vllm.entrypoints.openai.api_server \ - --model "{{ vllm_model | default('Qwen/Qwen2.5-7B-Instruct-1M') }}" \ - --enable-auto-tool-choice \ - --tool-call-parser "{{ vllm_tool_parser | default('hermes') }}" \ - {% if vllm_gpu_memory_utilization is defined %}--gpu-memory-utilization {{ vllm_gpu_memory_utilization }} {% endif %}\ - {% if vllm_max_model_len is defined %}--max-model-len {{ vllm_max_model_len }} {% endif %}\ - {% if vllm_tensor_parallel_size is defined %}--tensor-parallel-size {{ vllm_tensor_parallel_size }} {% endif %}\ - {% if vllm_dtype is defined %}--dtype {{ vllm_dtype }} {% endif %}\ - {% if vllm_quantization is defined %}--quantization {{ vllm_quantization }} {% endif %}\ - {% if vllm_enforce_eager is defined and vllm_enforce_eager %}--enforce-eager {% endif %}\ - {% if vllm_max_num_seqs is defined %}--max-num-seqs {{ vllm_max_num_seqs }} {% endif %}\ - {% if vllm_enable_prefix_caching is defined and not vllm_enable_prefix_caching %}--no-enable-prefix-caching {% endif %}\ - {% for key, val in vllm_engine_args.items() %}{% if val != '' %}--{{ key }} {{ val }} {% else %}--{{ key }} {% endif %}{% endfor %} - - ipc: host - {% if host_mode -%} - network_mode: host - {% endif -%} - ulimits: - memlock: - soft: -1 - hard: -1 - stack: 67108864 - {% if not use_podman -%} - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: all - capabilities: [gpu] - {% endif %} - {% if use_podman -%} - security_opt: - - label:disable - devices: - {%- if gpu_ids == "all" %} - - "nvidia.com/gpu=all" - {%- else %} - {%- for gpu_id in gpu_ids %} - - "nvidia.com/gpu={{ gpu_id }}" - {%- endfor %} - {%- endif %} - {%- endif %} - healthcheck: - test: ["CMD-SHELL", "curl -sf http://localhost:8000/v1/models || exit 1"] - interval: 30s - timeout: 10s - retries: 20 - start_period: 120s - logging: - options: - max-size: 10m - restart: always - {%- endif %} {% if benchmarking_enabled -%} benchmark: diff --git 
a/src/cli/utils/service_builder.py b/src/cli/utils/service_builder.py index a4f2f64c3..f4e046c5f 100644 --- a/src/cli/utils/service_builder.py +++ b/src/cli/utils/service_builder.py @@ -71,7 +71,6 @@ def __init__( "mattermost": ServiceState(), "redmine-mailer": ServiceState(), "benchmarking": ServiceState(), - "vllm-server": ServiceState(), } self.use_redmine: bool = False diff --git a/src/interfaces/chat_app/app.py b/src/interfaces/chat_app/app.py index c9680e6d5..34559b39a 100755 --- a/src/interfaces/chat_app/app.py +++ b/src/interfaces/chat_app/app.py @@ -98,6 +98,7 @@ def _build_provider_config_from_payload(config_payload: Dict[str, Any], provider extra = {} if provider_type == ProviderType.LOCAL and cfg.get("mode"): extra["local_mode"] = cfg.get("mode") + return ProviderConfig( provider_type=provider_type, enabled=cfg.get("enabled", True), diff --git a/tests/smoke/combined_smoke.sh b/tests/smoke/combined_smoke.sh index ebec8f74b..aef9cec7c 100755 --- a/tests/smoke/combined_smoke.sh +++ b/tests/smoke/combined_smoke.sh @@ -5,9 +5,6 @@ set -euo pipefail # BASE_URL, DM_BASE_URL, OLLAMA_URL, OLLAMA_MODEL, # PGHOST, PGPORT, PGUSER, PGPASSWORD, PGDATABASE, # ARCHI_CONFIG_PATH, ARCHI_CONFIG_NAME, ARCHI_PIPELINE_NAME, USE_PODMAN -# SMOKE_PROVIDER – set to "vllm" to run vLLM smoke checks instead of Ollama -# VLLM_BASE_URL – vLLM API base URL (default: http://localhost:8000/v1) -# VLLM_MODEL – expected model on vLLM server (optional) NAME="${1:-}" if [[ -z "${NAME}" ]]; then @@ -26,16 +23,9 @@ export BASE_URL export DM_BASE_URL export OLLAMA_URL -SMOKE_PROVIDER="${SMOKE_PROVIDER:-ollama}" - info "Running preflight checks..." python3 tests/smoke/preflight.py -if [[ "${SMOKE_PROVIDER,,}" == "vllm" ]]; then - info "Running vLLM smoke checks..." - python3 tests/smoke/vllm_smoke.py -fi - info "Running direct tool probes (chatbot container)..." 
tool="docker" use_podman="${USE_PODMAN:-false}" diff --git a/tests/smoke/preflight.py b/tests/smoke/preflight.py index e4a205d8d..0814b79d2 100644 --- a/tests/smoke/preflight.py +++ b/tests/smoke/preflight.py @@ -204,25 +204,20 @@ def _check_config_ollama(config_path: str, pipeline_name: str, ollama_model: str def main() -> None: - smoke_provider = os.getenv("SMOKE_PROVIDER", "ollama").lower() - _wait_for_ingestion() _check_postgres() # ChromaDB removed - PostgreSQL with pgvector is the only supported backend _check_data_manager_catalog() - if smoke_provider != "vllm": - _check_ollama_model() + _check_ollama_model() - config_path = os.getenv("ARCHI_CONFIG_PATH") - pipeline_name = os.getenv("ARCHI_PIPELINE_NAME", "CMSCompOpsAgent") - ollama_model = os.getenv("OLLAMA_MODEL", "") - if config_path: - _check_config_ollama(config_path, pipeline_name, ollama_model) - else: - _info("ARCHI_CONFIG_PATH not set; skipping config Ollama validation") + config_path = os.getenv("ARCHI_CONFIG_PATH") + pipeline_name = os.getenv("ARCHI_PIPELINE_NAME", "CMSCompOpsAgent") + ollama_model = os.getenv("OLLAMA_MODEL", "") + if config_path: + _check_config_ollama(config_path, pipeline_name, ollama_model) else: - _info("Provider is vLLM — skipping Ollama preflight checks") + _info("ARCHI_CONFIG_PATH not set; skipping config Ollama validation") _info("Preflight checks passed") diff --git a/tests/smoke/vllm_smoke.py b/tests/smoke/vllm_smoke.py deleted file mode 100644 index 805690dd9..000000000 --- a/tests/smoke/vllm_smoke.py +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/env python3 -"""vLLM provider smoke checks. - -Validates that a vLLM server is reachable and serving at least one model, -then sends a minimal completion request to verify inference works end-to-end. 
- -Expected env vars: - VLLM_BASE_URL – vLLM OpenAI-compatible API base (default: http://localhost:8000/v1) - VLLM_MODEL – (optional) specific model id to validate is loaded -""" -import json -import os -import sys -import time -import urllib.error -import urllib.request - - -def _fail(message: str) -> None: - print(f"[vllm-smoke] ERROR: {message}", file=sys.stderr) - sys.exit(1) - - -def _info(message: str) -> None: - print(f"[vllm-smoke] {message}") - - -def _check_vllm_health(base_url: str, timeout: int = 120) -> None: - """Wait for /v1/models to return at least one model.""" - models_url = f"{base_url}/models" - _info(f"Waiting for vLLM at {models_url} (timeout {timeout}s) ...") - deadline = time.time() + timeout - last_err = None - while True: - try: - req = urllib.request.Request(models_url) - with urllib.request.urlopen(req, timeout=5) as resp: - if resp.status == 200: - data = json.loads(resp.read().decode()) - models = data.get("data", []) - if models: - model_ids = [m.get("id") for m in models] - _info(f"vLLM serving {len(models)} model(s): {model_ids}") - return - last_err = "No models loaded yet" - except Exception as exc: - last_err = str(exc) - - if time.time() >= deadline: - _fail(f"vLLM not ready: {last_err}") - time.sleep(3) - - -def _check_model_loaded(base_url: str, expected_model: str) -> None: - """Verify a specific model is loaded on the server.""" - models_url = f"{base_url}/models" - _info(f"Checking model '{expected_model}' is loaded ...") - try: - req = urllib.request.Request(models_url) - with urllib.request.urlopen(req, timeout=10) as resp: - data = json.loads(resp.read().decode()) - except Exception as exc: - _fail(f"Failed to query models: {exc}") - - model_ids = [m.get("id") for m in data.get("data", [])] - if expected_model not in model_ids: - _fail(f"Model '{expected_model}' not found in {model_ids}") - _info(f"Model '{expected_model}' OK") - - -def _check_inference(base_url: str, model: str) -> None: - """Send a minimal chat 
completion to verify inference works.""" - completions_url = f"{base_url}/chat/completions" - _info(f"Testing inference on '{model}' ...") - payload = json.dumps({ - "model": model, - "messages": [{"role": "user", "content": "Say OK."}], - "max_tokens": 8, - }).encode() - req = urllib.request.Request( - completions_url, - data=payload, - headers={"Content-Type": "application/json"}, - ) - try: - with urllib.request.urlopen(req, timeout=120) as resp: - if resp.status != 200: - _fail(f"Inference request failed: HTTP {resp.status}") - data = json.loads(resp.read().decode()) - except Exception as exc: - _fail(f"Inference request failed: {exc}") - - choices = data.get("choices", []) - if not choices: - _fail("Inference returned no choices") - content = choices[0].get("message", {}).get("content", "") - _info(f"Inference OK — response: {content!r}") - - -def main() -> None: - base_url = os.getenv("VLLM_BASE_URL", "http://localhost:8000/v1").rstrip("/") - expected_model = os.getenv("VLLM_MODEL", "") - timeout = int(os.getenv("VLLM_HEALTH_TIMEOUT", "120")) - - _check_vllm_health(base_url, timeout=timeout) - - if expected_model: - _check_model_loaded(base_url, expected_model) - else: - # Use the first available model for the inference check - models_url = f"{base_url}/models" - req = urllib.request.Request(models_url) - with urllib.request.urlopen(req, timeout=10) as resp: - data = json.loads(resp.read().decode()) - expected_model = data["data"][0]["id"] - - _check_inference(base_url, expected_model) - _info("vLLM smoke checks passed") - - -if __name__ == "__main__": - main()