Changes from 33 commits
Commits
36 commits
fc46481
add champion/challenger A/B testing pool
JasonMoho Feb 25, 2026
55fc892
fix AB UX: real-time streaming, normal message layout, tie voting, re…
JasonMoho Feb 25, 2026
e2c94c7
fix ab testing deployment bugs
JasonMoho Feb 26, 2026
c2678fd
fix streaming: emit thinking_start for empty AI chunks
JasonMoho Feb 26, 2026
323433c
fix A/B streaming: flatten tool/thinking events for JS client
JasonMoho Feb 26, 2026
7633ec0
extract shared PipelineEventFormatter for streaming events
JasonMoho Feb 26, 2026
b6694ca
eliminate code redundancies across app.py and chat.js (Phase 8)
JasonMoho Feb 26, 2026
d63cd79
fix A/B streaming: pre-warm models and add keep_alive for parallel ex…
JasonMoho Feb 26, 2026
48059e8
remove dead A/B code: stale ChatWrapper.create_ab_comparison (wrong 7…
JasonMoho Feb 26, 2026
d234402
gate A/B testing behind admin: backend 403 + frontend toggle hidden f…
JasonMoho Feb 27, 2026
050e7f9
add 'Create variant' button to clone agents with different tool configs
JasonMoho Feb 27, 2026
6420f70
add pool editor UI for A/B testing
JasonMoho Feb 27, 2026
9f5042a
add quick variant creation, AB badges, and duplicate prevention
JasonMoho Feb 27, 2026
8c756f0
cleanup: remove prewarm, deduplicate A/B methods via conv_service, cu…
JasonMoho Feb 27, 2026
59b11c2
Merge remote-tracking branch 'origin/main' into dev-ab-testing-pool
JasonMoho Feb 27, 2026
f4dd773
remove docs not belonging to this PR
JasonMoho Feb 27, 2026
1950644
add Playwright E2E tests for A/B testing (33 tests)
JasonMoho Feb 27, 2026
d1de658
fix ab testing UX: feedback buttons, timers, variant labels, collapse…
JasonMoho Mar 3, 2026
56e8053
make service_chat.py resilient to missing deploy-time config keys
JasonMoho Mar 3, 2026
8610164
Merge branch 'main' into dev-ab-testing-pool
pmlugato Mar 18, 2026
2490364
continued dev, catch up, bug fixes for ab, restart, UI/UX
pmlugato Mar 19, 2026
f95a3a1
adding admin ab testing configuration page
pmlugato Mar 19, 2026
ad2f270
updates, fixes
pmlugato Mar 20, 2026
555b818
remove temp basic admin role used for tests, update ui+unit tests
pmlugato Mar 20, 2026
578fee7
pass unit and playwright
pmlugato Mar 20, 2026
30af401
unit and playwright pt 2
pmlugato Mar 20, 2026
5b4c876
playWRONG >:(
pmlugato Mar 20, 2026
918cbe2
added RBAC, store ab specs in postgres, per user ab sample rate slider
pmlugato Apr 1, 2026
cb4ea9f
fixes and improvements
pmlugato Apr 1, 2026
f35b257
unit and playwright tests
pmlugato Apr 1, 2026
36a03af
remaining test failures fixed
pmlugato Apr 1, 2026
dabe308
playwright updates
pmlugato Apr 1, 2026
9fb333a
more playwright
pmlugato Apr 1, 2026
a5e3931
persist UI A/B changes across restarts, better naming, timing fixes, …
pmlugato Apr 3, 2026
f88a549
unit and playwright
pmlugato Apr 3, 2026
15c5c9e
last playwright
pmlugato Apr 3, 2026
62 changes: 62 additions & 0 deletions configs/submit76/config.yaml
@@ -0,0 +1,62 @@
# Submit76 deployment config
# Deploy with:
# ./configs/submit76/deploy-submit76.sh --live
#
# Target: mohoney@submit76.mit.edu
# Ollama must be running on submit76 at localhost:7870 with gpt-oss:120b pulled.

name: my_archi

services:
  chat_app:
    agent_class: CMSCompOpsAgent
    agents_dir: examples/agents
    default_provider: local
    default_model: "qwen3:32b"
    providers:
      local:
        enabled: true
        base_url: http://localhost:7870
        mode: ollama
        default_model: "qwen3:32b"
        models:
          - "gpt-oss:120b"
          - "qwen3:32b"
    port: 7865
    external_port: 7865
    ab_testing:
      enabled: true
      pool:
        champion: default
        variants:
          - name: default
            provider: local
            model: "qwen3:32b"
          - name: gpt-oss-120b
            provider: local
            model: "gpt-oss:120b"
  postgres:
    port: 5435
  data_manager:
    port: 7878
    external_port: 7878
  auth:
    enabled: true

data_manager:
  sources:
    jira:
      enabled: true
      max_tickets: 10
      url: https://its.cern.ch/jira/
      projects:
        - "CMSPROD"
    links:
      input_lists:
        - /home/submit/pmlugato/random_configs/lists/sso_git.list
    redmine:
      url: https://cleo.mit.edu
      project: emails-to-ticket
      projects:
        - emails-to-ticket
  embedding_name: HuggingFaceEmbeddings
93 changes: 90 additions & 3 deletions docs/docs/api_reference.md
@@ -46,7 +46,60 @@ Retrieve the full trace of a previous request.

### `POST /api/ab/create`

Create an A/B comparison between two model responses (legacy manual mode).

### `GET /api/ab/pool`

Get the server-side A/B testing pool configuration. The response shape depends on RBAC:

- `ab:view` or `ab:manage`: full read-only experiment configuration
- `ab:participate`: participant-focused payload including the effective per-user sample rate and participant eligibility diagnostics
- otherwise: `enabled: false`

**Response (pool active):**
```json
{
  "success": true,
  "enabled": true,
  "can_view": true,
  "can_manage": false,
  "champion": "default",
  "variants": ["default", "creative", "concise"],
  "sample_rate": 0.25,
  "default_sample_rate": 0.25,
  "participant_eligible": true,
  "participant_reason": "eligible"
}
```

Participant payloads can also report `participant_reason: "not_targeted"` when the deployment has an active experiment but the current user's roles or permissions are not included in that experiment's target filters.

### `POST /api/ab/compare`

Stream a pool-based champion vs. challenger A/B comparison. The server randomly pairs the champion against a challenger from the pool and streams interleaved NDJSON events tagged with `arm: "a"` or `arm: "b"`. A final `ab_meta` event carries the `comparison_id` and variant mapping.

**Request body:** Same as `/api/get_chat_response_stream`.
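A client can demultiplex the interleaved stream by arm. The sketch below assumes each NDJSON line decodes to an object with an `event` type and, for response chunks, an `arm` of `"a"` or `"b"`; the helper name is hypothetical:

```python
import json

def route_ab_events(ndjson_lines):
    """Collect streamed events per arm and capture the final ab_meta event."""
    arms = {"a": [], "b": []}
    meta = None
    for line in ndjson_lines:
        if not line.strip():
            continue  # tolerate blank keep-alive lines
        event = json.loads(line)
        if event.get("event") == "ab_meta":
            # carries the comparison_id and the arm-to-variant mapping
            meta = event
        elif event.get("arm") in arms:
            arms[event["arm"]].append(event)
    return arms, meta
```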

### `GET /api/ab/metrics`

Get per-variant aggregate metrics (wins, losses, ties, total comparisons).

**Response:**
```json
{
  "success": true,
  "metrics": [
    {
      "variant_name": "creative",
      "wins": 12,
      "losses": 5,
      "ties": 3,
      "total_comparisons": 20,
      "last_updated": "2025-01-15T10:30:00"
    }
  ]
}
```
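A consumer might derive a win rate from these counts. The sketch below excludes ties from the denominator, which is an assumption on our part since the endpoint only reports raw counts:

```python
def win_rate(metric):
    """Win rate over decided comparisons; None if nothing was decided."""
    decided = metric["wins"] + metric["losses"]
    return metric["wins"] / decided if decided else None
```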

---

@@ -92,14 +145,15 @@ Get or create the current user.

### `PATCH /api/users/me/preferences`

Update user preferences (model, temperature, prompts, theme, and A/B participation override).

**Request:**
```json
{
  "theme": "light",
  "preferred_model": "claude-3-opus",
  "preferred_temperature": 0.5,
  "ab_participation_rate": 0.75
}
```

@@ -232,6 +286,39 @@ Set the active agent for the current session.
}
```

### `GET /api/ab/agents/list`

List the Postgres-backed A/B agent catalog for the A/B admin page. Requires A/B page access.

### `GET /api/ab/agents/template`

Get the A/B admin template payload with structured tool metadata. Requires `ab:manage`.

**Response:**
```json
{
  "name": "New A/B Agent",
  "prompt": "Write your system prompt here.",
  "tools": [
    {"name": "search_vectorstore_hybrid", "description": "Search indexed documents."}
  ],
  "scope": "ab"
}
```

### `POST /api/ab/agents`

Create a new Postgres-backed A/B agent spec from structured fields. Requires `ab:manage`.

**Request:**
```json
{
  "name": "A/B Candidate",
  "tools": ["search_vectorstore_hybrid"],
  "prompt": "You are a helpful A/B experiment agent."
}
```

---

## Prompts
117 changes: 117 additions & 0 deletions docs/docs/configuration.md
@@ -97,6 +97,36 @@ services:
- alerts:manage
```

#### `services.chat_app.auth`

Authentication can be enabled with SSO or basic auth.

For RBAC-managed admin access, use SSO plus `auth_roles`. Basic auth supports identity-only login, but it does not assign RBAC roles.

```yaml
services:
  chat_app:
    auth:
      enabled: true
      basic:
        enabled: true
    auth_roles:
      default_role: base-user
      roles:
        base-user:
          permissions:
            - chat:query
            - ab:participate
        ab-reviewer:
          permissions:
            - documents:view
            - ab:view
            - ab:metrics
        ab-admin:
          permissions:
            - ab:manage
```
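Assuming permissions are evaluated as the union over a user's roles (a sketch of the intent, not the auth implementation), the roles above behave like:

```python
# Mirrors the auth_roles mapping in the YAML above.
ROLE_PERMISSIONS = {
    "base-user": {"chat:query", "ab:participate"},
    "ab-reviewer": {"documents:view", "ab:view", "ab:metrics"},
    "ab-admin": {"ab:manage"},
}

def has_permission(user_roles, needed):
    """True if any of the user's roles grants the requested permission."""
    granted = set().union(*(ROLE_PERMISSIONS.get(r, set()) for r in user_roles))
    return needed in granted
```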

#### Provider Configuration

```yaml
@@ -257,6 +287,93 @@ data_manager:

---

## A/B Testing Pool

Archi supports champion/challenger A/B testing via a server-side variant pool. When configured, the system automatically pairs the champion agent against a random challenger for each comparison. Users vote on which response is better, and aggregate metrics are tracked per variant.

Configure A/B testing under `services.chat_app.ab_testing`:

```yaml
services:
  chat_app:
    ab_testing:
      enabled: true
      sample_rate: 0.25
      disclosure_mode: post_vote_reveal
      default_trace_mode: minimal
      max_pending_per_conversation: 1
      target_roles: []
      target_permissions: []
      pool:
        champion: default
        variants:
          - label: default
            agent_spec: default.md
          - label: creative
            agent_spec: default.md
            provider: openai
            model: gpt-4o
            recursion_limit: 30
          - label: concise
            agent_spec: concise.md
            provider: anthropic
            model: claude-sonnet-4-20250514
            num_documents_to_retrieve: 3
```

`services.ab_testing` is deprecated and no longer loaded. Use `services.chat_app.ab_testing` only.

If `enabled: true` is set before the A/B pool is fully configured, Archi starts successfully but keeps A/B inactive until setup is completed in the admin UI. Missing champion/variants or unresolved A/B agent-spec records are surfaced as warnings instead of blocking startup.

The runtime source of truth for A/B agent specs is now PostgreSQL. The optional `ab_agents_dir` path is treated only as a legacy import source during reconciliation; runtime A/B loading never falls back to reading staged container markdown files directly.

New A/B specs created through the admin UI are stored in the same PostgreSQL-backed catalog. Existing A/B specs are not edited through the admin page; if you need a changed prompt or tool selection, create a new A/B spec and point the variant at that new catalog entry.

### Variant Fields

| Field | Type | Default | Description |
|-------|------|---------|-------------|
| `label` | string | *required* | Unique human-facing variant label used in the UI and metrics |
| `agent_spec` | string | *required* | A/B agent-spec filename resolved from the database-backed A/B catalog |
| `provider` | string | `null` | Override LLM provider |
| `model` | string | `null` | Override LLM model |
| `num_documents_to_retrieve` | int | `null` | Override retriever document count |
| `recursion_limit` | int | `null` | Override agent recursion limit |

### Pool Fields

| Field | Type | Default | Description |
|-------|------|---------|-------------|
| `enabled` | boolean | `false` | Enable the experiment pool |
| `ab_agents_dir` | string | `/root/archi/ab_agents` | Optional legacy import directory for migrating A/B markdown specs into the DB catalog |
| `sample_rate` | float | `1.0` | Fraction of eligible turns that should run A/B |
| `disclosure_mode` | string | `post_vote_reveal` | One of `blind`, `post_vote_reveal`, `named` |
Collaborator: Can we document the mapping between what's shown in the UI and the config? Also these names aren't very intuitive. Maybe consider renaming.

| `default_trace_mode` | string | `minimal` | One of `minimal`, `normal`, `verbose` |
Collaborator: As far as I understand, the following is the mapping between the config and what's shown in the UI:

- `minimal`: Hidden
- `normal`: Collapsed
- `verbose`: Expanded

What's in the config doesn't make sense to me. I would use in the config what's shown in the UI.

| `max_pending_per_conversation` | int | `1` | Maximum unresolved comparisons per conversation |
| `target_roles` | list[string] | `[]` | Restrict participation to matching RBAC roles |
| `target_permissions` | list[string] | `[]` | Restrict participation to matching permissions |

The `champion` field must reference an existing variant `label`. At least two variants are required before the experiment becomes active. `name`-only variant config is not supported. When a user enables A/B mode in the chat UI, the pool takes over — the champion always appears in one arm, and a random challenger is placed in the other. Arm positions (A vs B) are randomized per comparison.
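The pairing rules above can be sketched as follows (an illustration of the documented behavior, not the server code; the pool shape is assumed to be a champion label plus a list of variant labels):

```python
import random

def draw_comparison(pool, rng=random):
    """Pair the champion against a random challenger, shuffling arm order."""
    champion = pool["champion"]
    challengers = [label for label in pool["variants"] if label != champion]
    if not challengers:
        raise ValueError("at least two variants are required")
    arms = [champion, rng.choice(challengers)]
    rng.shuffle(arms)  # randomize which arm (A or B) holds the champion
    return {"a": arms[0], "b": arms[1]}
```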

### A/B RBAC and User Preference

Use RBAC to separate participation, read-only review, metrics access, and write access:

| Permission | Purpose |
|------------|---------|
| `ab:participate` | Makes a user eligible for A/B comparisons and shows the per-user sampling slider in chat settings |
| `ab:view` | Allows read-only access to the A/B admin page |
| `ab:metrics` | Allows access to aggregate A/B metrics |
| `ab:manage` | Allows editing variants, A/B agent specs, and experiment settings |

`services.chat_app.ab_testing.sample_rate` remains the deployment default. Users with `ab:participate` can override that default per account with a `0..1` slider in chat settings.
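The resolution of the effective rate can be sketched as (function and argument names are illustrative, not the server API):

```python
import random

def should_run_ab(deployment_rate, user_override=None, rng=random.random):
    """Per-user override wins when set; otherwise the deployment default applies."""
    rate = user_override if user_override is not None else deployment_rate
    rate = min(max(rate, 0.0), 1.0)  # clamp to the documented 0..1 range
    return rng() < rate
```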

Users with `ab:view`, `ab:metrics`, or `ab:manage` can open the dedicated A/B Testing page from the data viewer and from chat settings. Users with `ab:participate` but not A/B page access still get the personal sampling slider in chat settings.

Variant metrics (wins, losses, ties) are tracked in the `ab_variant_metrics` database table and available via `GET /api/ab/metrics`.

---

## Agent Configuration Model

Archi no longer uses a top-level `archi:` block in standard deployment YAML.
5 changes: 5 additions & 0 deletions src/archi/pipelines/agents/agent_spec.py
@@ -13,6 +13,7 @@ class AgentSpec:
    tools: List[str]
    prompt: str
    source_path: Path
    ab_only: bool = False


class AgentSpecError(ValueError):
@@ -31,22 +32,26 @@ def load_agent_spec(path: Path) -> AgentSpec:
    text = path.read_text()
    frontmatter, prompt = _parse_frontmatter(text, path)
    name, tools = _extract_metadata(frontmatter, path)
    ab_only = bool(frontmatter.get("ab_only", False))
    return AgentSpec(
        name=name,
        tools=tools,
        prompt=prompt,
        source_path=path,
        ab_only=ab_only,
    )


def load_agent_spec_from_text(text: str) -> AgentSpec:
    frontmatter, prompt = _parse_frontmatter(text, Path("<memory>"))
    name, tools = _extract_metadata(frontmatter, Path("<memory>"))
    ab_only = bool(frontmatter.get("ab_only", False))
    return AgentSpec(
        name=name,
        tools=tools,
        prompt=prompt,
        source_path=Path("<memory>"),
        ab_only=ab_only,
    )
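An `ab_only` spec would carry the flag in its frontmatter. The sketch below uses a hypothetical spec and a simplified stand-in parser, since `_parse_frontmatter` itself is not part of this diff:

```python
# Hypothetical A/B-only agent spec with YAML-style frontmatter.
SPEC_TEXT = """---
name: concise
tools:
  - search_vectorstore_hybrid
ab_only: true
---
You are a concise assistant.
"""

def parse_ab_only(text):
    """Minimal frontmatter read: top-level keys only, ab_only coerced to bool."""
    _, frontmatter, prompt = text.split("---", 2)
    fields = dict(
        line.split(":", 1)
        for line in frontmatter.strip().splitlines()
        if ":" in line and not line.startswith(" ")
    )
    return fields.get("ab_only", "").strip().lower() == "true", prompt.strip()
```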

