Commits (36)
fc46481
add champion/challenger A/B testing pool
JasonMoho Feb 25, 2026
55fc892
fix AB UX: real-time streaming, normal message layout, tie voting, re…
JasonMoho Feb 25, 2026
e2c94c7
fix ab testing deployment bugs
JasonMoho Feb 26, 2026
c2678fd
fix streaming: emit thinking_start for empty AI chunks
JasonMoho Feb 26, 2026
323433c
fix A/B streaming: flatten tool/thinking events for JS client
JasonMoho Feb 26, 2026
7633ec0
extract shared PipelineEventFormatter for streaming events
JasonMoho Feb 26, 2026
b6694ca
eliminate code redundancies across app.py and chat.js (Phase 8)
JasonMoho Feb 26, 2026
d63cd79
fix A/B streaming: pre-warm models and add keep_alive for parallel ex…
JasonMoho Feb 26, 2026
48059e8
remove dead A/B code: stale ChatWrapper.create_ab_comparison (wrong 7…
JasonMoho Feb 26, 2026
d234402
gate A/B testing behind admin: backend 403 + frontend toggle hidden f…
JasonMoho Feb 27, 2026
050e7f9
add 'Create variant' button to clone agents with different tool configs
JasonMoho Feb 27, 2026
6420f70
add pool editor UI for A/B testing
JasonMoho Feb 27, 2026
9f5042a
add quick variant creation, AB badges, and duplicate prevention
JasonMoho Feb 27, 2026
8c756f0
cleanup: remove prewarm, deduplicate A/B methods via conv_service, cu…
JasonMoho Feb 27, 2026
59b11c2
Merge remote-tracking branch 'origin/main' into dev-ab-testing-pool
JasonMoho Feb 27, 2026
f4dd773
remove docs not belonging to this PR
JasonMoho Feb 27, 2026
1950644
add Playwright E2E tests for A/B testing (33 tests)
JasonMoho Feb 27, 2026
d1de658
fix ab testing UX: feedback buttons, timers, variant labels, collapse…
JasonMoho Mar 3, 2026
56e8053
make service_chat.py resilient to missing deploy-time config keys
JasonMoho Mar 3, 2026
8610164
Merge branch 'main' into dev-ab-testing-pool
pmlugato Mar 18, 2026
2490364
continued dev, catch up, bug fixes for ab, restart, UI/UX
pmlugato Mar 19, 2026
f95a3a1
adding admin ab testing configuration page
pmlugato Mar 19, 2026
ad2f270
updates, fixes
pmlugato Mar 20, 2026
555b818
remove temp basic admin role used for tests, update ui+unit tests
pmlugato Mar 20, 2026
578fee7
pass unit and playwright
pmlugato Mar 20, 2026
30af401
unit and playwright pt 2
pmlugato Mar 20, 2026
5b4c876
playWRONG >:(
pmlugato Mar 20, 2026
918cbe2
added RBAC, store ab specs in postgres, per user ab sample rate slider
pmlugato Apr 1, 2026
cb4ea9f
fixes and improvements
pmlugato Apr 1, 2026
f35b257
unit and playwright tests
pmlugato Apr 1, 2026
36a03af
remaining test failures fixed
pmlugato Apr 1, 2026
dabe308
playwright updates
pmlugato Apr 1, 2026
9fb333a
more playwright
pmlugato Apr 1, 2026
a5e3931
persist UI A/B changes across restarts, better naming, timing fixes, …
pmlugato Apr 3, 2026
f88a549
unit and playwright
pmlugato Apr 3, 2026
15c5c9e
last playwright
pmlugato Apr 3, 2026
62 changes: 62 additions & 0 deletions configs/submit76/config.yaml
@@ -0,0 +1,62 @@
# Submit76 deployment config
# Deploy with:
# ./configs/submit76/deploy-submit76.sh --live
#
# Target: [email protected]
# Ollama must be running on submit76 at localhost:7870 with gpt-oss:120b pulled.

name: my_archi

services:
  chat_app:
    agent_class: CMSCompOpsAgent
    agents_dir: examples/agents
    default_provider: local
    default_model: "qwen3:32b"
    providers:
      local:
        enabled: true
        base_url: http://localhost:7870
        mode: ollama
        default_model: "qwen3:32b"
        models:
          - "gpt-oss:120b"
          - "qwen3:32b"
    port: 7865
    external_port: 7865
    ab_testing:
      enabled: true
      pool:
        champion: default
        variants:
          - name: default
            provider: local
            model: "qwen3:32b"
          - name: gpt-oss-120b
            provider: local
            model: "gpt-oss:120b"
  postgres:
    port: 5435
  data_manager:
    port: 7878
    external_port: 7878
  auth:
    enabled: true

data_manager:
  sources:
    jira:
      enabled: true
      max_tickets: 10
      url: https://its.cern.ch/jira/
      projects:
        - "CMSPROD"
    links:
      input_lists:
        - /home/submit/pmlugato/random_configs/lists/sso_git.list
    redmine:
      url: https://cleo.mit.edu
      project: emails-to-ticket
      projects:
        - emails-to-ticket
  embedding_name: HuggingFaceEmbeddings
43 changes: 42 additions & 1 deletion docs/docs/api_reference.md
@@ -46,7 +46,48 @@ Retrieve the full trace of a previous request.

### `POST /api/ab/create`

Create an A/B comparison between two model responses.
Create an A/B comparison between two model responses (legacy manual mode).

### `GET /api/ab/pool`

Get the server-side A/B testing pool configuration. Returns `enabled: true` with champion name and variant list when a pool is configured, or `enabled: false` otherwise.

**Response (pool active):**
```json
{
"success": true,
"enabled": true,
"champion": "default",
"variants": ["default", "creative", "concise"]
}
```

### `POST /api/ab/compare`

Stream a pool-based champion vs. challenger A/B comparison. The server randomly pairs the champion against a challenger from the pool and streams interleaved NDJSON events tagged with `arm: "a"` or `arm: "b"`. A final `ab_meta` event carries the `comparison_id` and variant mapping.

**Request body:** Same as `/api/get_chat_response_stream`.
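
A minimal Python client sketch for consuming this stream is shown below. The payload keys and the `event_type` field name are illustrative assumptions; `arm`, `ab_meta`, and `comparison_id` follow the description above.

```python
import json

import requests

# Illustrative payload; the real body mirrors /api/get_chat_response_stream.
payload = {"conversation_id": 42, "message": "Summarize yesterday's failed transfers"}

with requests.post("http://localhost:7865/api/ab/compare", json=payload, stream=True) as resp:
    resp.raise_for_status()
    for raw in resp.iter_lines():
        if not raw:
            continue
        event = json.loads(raw)  # one NDJSON event per line
        if event.get("event_type") == "ab_meta":
            # Final event: keep the comparison_id so the vote can be recorded later.
            print("comparison:", event.get("comparison_id"))
        elif event.get("arm") == "a":
            print("A>", event)
        elif event.get("arm") == "b":
            print("B>", event)
```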

### `GET /api/ab/metrics`

Get per-variant aggregate metrics (wins, losses, ties, total comparisons).

**Response:**
```json
{
"success": true,
"metrics": [
{
"variant_name": "creative",
"wins": 12,
"losses": 5,
"ties": 3,
"total_comparisons": 20,
"last_updated": "2025-01-15T10:30:00"
}
]
}
```

---

97 changes: 97 additions & 0 deletions docs/docs/configuration.md
@@ -97,6 +97,32 @@ services:
- alerts:manage
```

#### `services.chat_app.auth`

Authentication can be enabled with SSO or basic auth.

For RBAC-managed admin access, use SSO plus `auth_roles`. Basic auth supports identity-only login, but it does not assign RBAC roles.

```yaml
services:
  chat_app:
    auth:
      enabled: true
      basic:
        enabled: true
    auth_roles:
      default_role: base-user
      roles:
        base-user:
          permissions:
            - chat:query
        ab-admin:
          permissions:
            - documents:view
            - config:modify
            - view:metrics
```

#### Provider Configuration

@@ -257,6 +283,77 @@ data_manager:

---

## A/B Testing Pool

Archi supports champion/challenger A/B testing via a server-side variant pool. When configured, the system automatically pairs the champion agent against a random challenger for each comparison. Users vote on which response is better, and aggregate metrics are tracked per variant.

Configure A/B testing under `services.chat_app.ab_testing`:

```yaml
services:
  chat_app:
    ab_testing:
      enabled: true
      ab_agents_dir: /root/archi/ab_agents
      sample_rate: 0.25
      disclosure_mode: post_vote_reveal
      default_trace_mode: minimal
      max_pending_per_conversation: 1
      target_roles: []
      target_permissions: []
      pool:
        champion: default
        variants:
          - label: default
            agent_spec: default.md
          - label: creative
            agent_spec: default.md
            provider: openai
            model: gpt-4o
            recursion_limit: 30
          - label: concise
            agent_spec: concise.md
            provider: anthropic
            model: claude-sonnet-4-20250514
            num_documents_to_retrieve: 3
```

`services.ab_testing` is deprecated and no longer loaded. Use `services.chat_app.ab_testing` only.

If `enabled: true` is set before the A/B pool is fully configured, Archi starts successfully but keeps A/B inactive until setup is completed in the admin UI. Missing `ab_agents_dir`, missing champion/variants, or missing A/B agent-spec files are surfaced as warnings instead of blocking startup.

When `ab_agents_dir` is set in the source deployment config, Archi copies those markdown files into the generated deployment and rewrites the runtime config to the internal A/B pool path (`/root/archi/ab_agents`). Pool `agent_spec` values should therefore always be filenames, not host paths.

### Variant Fields

| Field | Type | Default | Description |
|-------|------|---------|-------------|
| `label` | string | *required* | Unique human-facing variant label used in the UI and metrics |
| `agent_spec` | string | *required* | Agent markdown filename to load from `ab_agents_dir` |
| `provider` | string | `null` | Override LLM provider |
| `model` | string | `null` | Override LLM model |
| `num_documents_to_retrieve` | int | `null` | Override retriever document count |
| `recursion_limit` | int | `null` | Override agent recursion limit |

### Pool Fields

| Field | Type | Default | Description |
|-------|------|---------|-------------|
| `enabled` | boolean | `false` | Enable the experiment pool |
| `ab_agents_dir` | string | `/root/archi/ab_agents` | Isolated directory for A/B-only agent markdown files |
| `sample_rate` | float | `1.0` | Fraction of eligible turns that should run A/B |
| `disclosure_mode` | string | `post_vote_reveal` | One of `blind`, `post_vote_reveal`, `named` |

> **Collaborator:** Can we document the mapping between what's shown in the UI and the config? Also these names aren't very intuitive. Maybe consider renaming.

| `default_trace_mode` | string | `minimal` | One of `minimal`, `normal`, `verbose` |

> **Collaborator:** As far as I understand, the following is the mapping between the config and what's shown in the UI:
>
> - `minimal`: Hidden
> - `normal`: Collapsed
> - `verbose`: Expanded
>
> What's in the config doesn't make sense to me. I would use in the config what's shown in the UI.

| `max_pending_per_conversation` | int | `1` | Maximum unresolved comparisons per conversation |
| `target_roles` | list[string] | `[]` | Restrict participation to matching RBAC roles |
| `target_permissions` | list[string] | `[]` | Restrict participation to matching permissions |

The `champion` field must reference an existing variant `label`. At least two variants are required before the experiment becomes active. `name`-only variant config is not supported. When a user enables A/B mode in the chat UI, the pool takes over — the champion always appears in one arm, and a random challenger is placed in the other. Arm positions (A vs B) are randomized per comparison.
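
A rough sketch of that pairing rule (illustrative only, not the actual Archi implementation; the variant names come from the example pool above):

```python
import random

def pick_ab_pair(champion: str, variants: list[str]) -> tuple[str, str]:
    """Pair the champion with a random challenger and randomize arm order."""
    challengers = [v for v in variants if v != champion]  # pool needs at least two variants
    challenger = random.choice(challengers)
    arms = [champion, challenger]
    random.shuffle(arms)  # decide which response is shown as A and which as B
    return arms[0], arms[1]

arm_a, arm_b = pick_ab_pair("default", ["default", "creative", "concise"])
```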

Variant metrics (wins, losses, ties) are tracked in the `ab_variant_metrics` database table and available via `GET /api/ab/metrics`.
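
For example, a small script can compute per-variant win rates from that endpoint (base URL illustrative; response fields follow the `/api/ab/metrics` example in the API reference):

```python
import requests

resp = requests.get("http://localhost:7865/api/ab/metrics", timeout=10)
resp.raise_for_status()
for m in resp.json().get("metrics", []):
    decided = m["wins"] + m["losses"]
    win_rate = m["wins"] / decided if decided else 0.0
    print(f'{m["variant_name"]}: {win_rate:.0%} win rate over {m["total_comparisons"]} comparisons')
```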

---

## Agent Configuration Model

Archi no longer uses a top-level `archi:` block in standard deployment YAML.
5 changes: 5 additions & 0 deletions src/archi/pipelines/agents/agent_spec.py
@@ -13,6 +13,7 @@ class AgentSpec:
    tools: List[str]
    prompt: str
    source_path: Path
    ab_only: bool = False


class AgentSpecError(ValueError):
@@ -31,22 +32,26 @@ def load_agent_spec(path: Path) -> AgentSpec:
    text = path.read_text()
    frontmatter, prompt = _parse_frontmatter(text, path)
    name, tools = _extract_metadata(frontmatter, path)
    ab_only = bool(frontmatter.get("ab_only", False))
    return AgentSpec(
        name=name,
        tools=tools,
        prompt=prompt,
        source_path=path,
        ab_only=ab_only,
    )


def load_agent_spec_from_text(text: str) -> AgentSpec:
    frontmatter, prompt = _parse_frontmatter(text, Path("<memory>"))
    name, tools = _extract_metadata(frontmatter, Path("<memory>"))
    ab_only = bool(frontmatter.get("ab_only", False))
    return AgentSpec(
        name=name,
        tools=tools,
        prompt=prompt,
        source_path=Path("<memory>"),
        ab_only=ab_only,
    )


40 changes: 40 additions & 0 deletions src/archi/pipelines/agents/base_react.py
@@ -390,6 +390,29 @@ def stream(self, **kwargs) -> Iterator[PipelineOutput]:
            content = self._message_content(message)
            additional_kwargs = getattr(message, "additional_kwargs", None) or {}
            reasoning_content = additional_kwargs.get("reasoning_content", "")

            # Detect empty AI chunks as implicit thinking activity.
            # Some LLM integrations (e.g. langchain-ollama <1.1) drop
            # the thinking/reasoning payload, producing chunks where
            # both content and reasoning_content are empty while the
            # model is still in its thinking phase. We treat these as
            # a signal to start (or continue) the thinking indicator
            # so the UI stays responsive.
            if not content and not reasoning_content:
                if thinking_step_id is None and "chunk" in msg_class:
                    thinking_step_id = str(uuid.uuid4())
                    thinking_start_time = time.time()
                    yield self.finalize_output(
                        answer="",
                        memory=self.active_memory,
                        messages=[],
                        metadata={
                            "event_type": "thinking_start",
                            "step_id": thinking_step_id,
                        },
                        final=False,
                    )

            if content or reasoning_content:
                # Start thinking phase if not already active
                if thinking_step_id is None:
@@ -665,6 +688,23 @@ async def astream(self, **kwargs) -> AsyncIterator[PipelineOutput]:
            content = self._message_content(message)
            additional_kwargs = getattr(message, "additional_kwargs", None) or {}
            reasoning_content = additional_kwargs.get("reasoning_content", "")

            # Detect empty AI chunks as implicit thinking activity.
            if not content and not reasoning_content:
                if thinking_step_id is None and "chunk" in msg_class:
                    thinking_step_id = str(uuid.uuid4())
                    thinking_start_time = time.time()
                    yield self.finalize_output(
                        answer="",
                        memory=self.active_memory,
                        messages=[],
                        metadata={
                            "event_type": "thinking_start",
                            "step_id": thinking_step_id,
                        },
                        final=False,
                    )

            if content or reasoning_content:
                # Start thinking phase if not already active
                if thinking_step_id is None:
1 change: 1 addition & 0 deletions src/archi/providers/local_provider.py
@@ -94,6 +94,7 @@ def _get_ollama_model(self, model_name: str, **kwargs) -> BaseChatModel:
        model_kwargs = {
            "model": model_name,
            "streaming": True,
            "keep_alive": "24h",
            **self.config.extra_kwargs,
            **kwargs,
        }
33 changes: 23 additions & 10 deletions src/bin/service_chat.py
@@ -27,30 +27,43 @@ def main():
     # Reload config from Postgres (runtime source of truth)
     config = get_full_config()
     chat_config = config["services"]["chat_app"]
-    print(f"Starting Chat Service with (host, port): ({chat_config['host']}, {chat_config['port']})")
-    print(f"Accessible externally at (host, port): ({chat_config['hostname']}, {chat_config['external_port']})")
 
-    generate_script(chat_config)
+    # Deployment-time fields may not be in Postgres; use sensible defaults
+    host = chat_config.get("host", "0.0.0.0")
+    port = chat_config.get("port", 7681)
+    hostname = chat_config.get("hostname", host)
+    external_port = chat_config.get("external_port", port)
+
+    # Resolve template/static folders from installed package location
+    _pkg_dir = os.path.dirname(os.path.abspath(__file__))
+    _chat_app_dir = os.path.join(os.path.dirname(_pkg_dir), "interfaces", "chat_app")
+    template_folder = chat_config.get("template_folder", os.path.join(_chat_app_dir, "templates"))
+    static_folder = chat_config.get("static_folder", os.path.join(_chat_app_dir, "static"))
+
+    print(f"Starting Chat Service with (host, port): ({host}, {port})")
+    print(f"Accessible externally at (host, port): ({hostname}, {external_port})")
+
+    generate_script(chat_config, static_folder)
     app = FlaskAppWrapper(Flask(
         __name__,
-        template_folder=chat_config["template_folder"],
-        static_folder=chat_config["static_folder"],
+        template_folder=template_folder,
+        static_folder=static_folder,
     ))
-    app.run(debug=True, use_reloader=False, port=chat_config["port"], host=chat_config["host"])
+    app.run(debug=True, use_reloader=False, port=port, host=host)
 
 
-def generate_script(chat_config):
+def generate_script(chat_config, static_folder):
     """
     This is not elegant but it creates the javascript file from the template using the config.yaml parameters
     """
-    script_template = os.path.join(chat_config["static_folder"], "script.js-template")
+    script_template = os.path.join(static_folder, "script.js-template")
     with open(script_template, "r") as f:
         template = f.read()
 
-    filled_template = template.replace('XX-NUM-RESPONSES-XX', str(chat_config["num_responses_until_feedback"]))
+    filled_template = template.replace('XX-NUM-RESPONSES-XX', str(chat_config.get("num_responses_until_feedback", 3)))
     filled_template = filled_template.replace('XX-TRAINED_ON-XX', str(chat_config.get("trained_on", "")))
 
-    script_file = os.path.join(chat_config["static_folder"], "script.js")
+    script_file = os.path.join(static_folder, "script.js")
     with open(script_file, "w") as f:
         f.write(filled_template)
 