Open
Changes from 19 commits
Commits
36 commits
fc46481
add champion/challenger A/B testing pool
JasonMoho Feb 25, 2026
55fc892
fix AB UX: real-time streaming, normal message layout, tie voting, re…
JasonMoho Feb 25, 2026
e2c94c7
fix ab testing deployment bugs
JasonMoho Feb 26, 2026
c2678fd
fix streaming: emit thinking_start for empty AI chunks
JasonMoho Feb 26, 2026
323433c
fix A/B streaming: flatten tool/thinking events for JS client
JasonMoho Feb 26, 2026
7633ec0
extract shared PipelineEventFormatter for streaming events
JasonMoho Feb 26, 2026
b6694ca
eliminate code redundancies across app.py and chat.js (Phase 8)
JasonMoho Feb 26, 2026
d63cd79
fix A/B streaming: pre-warm models and add keep_alive for parallel ex…
JasonMoho Feb 26, 2026
48059e8
remove dead A/B code: stale ChatWrapper.create_ab_comparison (wrong 7…
JasonMoho Feb 26, 2026
d234402
gate A/B testing behind admin: backend 403 + frontend toggle hidden f…
JasonMoho Feb 27, 2026
050e7f9
add 'Create variant' button to clone agents with different tool configs
JasonMoho Feb 27, 2026
6420f70
add pool editor UI for A/B testing
JasonMoho Feb 27, 2026
9f5042a
add quick variant creation, AB badges, and duplicate prevention
JasonMoho Feb 27, 2026
8c756f0
cleanup: remove prewarm, deduplicate A/B methods via conv_service, cu…
JasonMoho Feb 27, 2026
59b11c2
Merge remote-tracking branch 'origin/main' into dev-ab-testing-pool
JasonMoho Feb 27, 2026
f4dd773
remove docs not belonging to this PR
JasonMoho Feb 27, 2026
1950644
add Playwright E2E tests for A/B testing (33 tests)
JasonMoho Feb 27, 2026
d1de658
fix ab testing UX: feedback buttons, timers, variant labels, collapse…
JasonMoho Mar 3, 2026
56e8053
make service_chat.py resilient to missing deploy-time config keys
JasonMoho Mar 3, 2026
8610164
Merge branch 'main' into dev-ab-testing-pool
pmlugato Mar 18, 2026
2490364
continued dev, catch up, bug fixes for ab, restart, UI/UX
pmlugato Mar 19, 2026
f95a3a1
adding admin ab testing configuration page
pmlugato Mar 19, 2026
ad2f270
updates, fixes
pmlugato Mar 20, 2026
555b818
remove temp basic admin role used for tests, update ui+unit tests
pmlugato Mar 20, 2026
578fee7
pass unit and playwright
pmlugato Mar 20, 2026
30af401
unit and playwright pt 2
pmlugato Mar 20, 2026
5b4c876
playWRONG >:(
pmlugato Mar 20, 2026
918cbe2
added RBAC, store ab specs in postgres, per user ab sample rate slider
pmlugato Apr 1, 2026
cb4ea9f
fixes and improvements
pmlugato Apr 1, 2026
f35b257
unit and playwright tests
pmlugato Apr 1, 2026
36a03af
remaining test failures fixed
pmlugato Apr 1, 2026
dabe308
playwright updates
pmlugato Apr 1, 2026
9fb333a
more playwright
pmlugato Apr 1, 2026
a5e3931
persist UI A/B changes across restarts, better naming, timing fixes, …
pmlugato Apr 3, 2026
f88a549
unit and playwright
pmlugato Apr 3, 2026
15c5c9e
last playwright
pmlugato Apr 3, 2026
62 changes: 62 additions & 0 deletions configs/submit76/config.yaml
@@ -0,0 +1,62 @@
# Submit76 deployment config
# Deploy with:
# ./configs/submit76/deploy-submit76.sh --live
#
# Target: mohoney@submit76.mit.edu
# Ollama must be running on submit76 at localhost:7870 with gpt-oss:120b pulled.

name: my_archi

services:
chat_app:
agent_class: CMSCompOpsAgent
agents_dir: examples/agents
default_provider: local
default_model: "qwen3:32b"
providers:
local:
enabled: true
base_url: http://localhost:7870
mode: ollama
default_model: "qwen3:32b"
models:
- "gpt-oss:120b"
- "qwen3:32b"
port: 7865
external_port: 7865
ab_testing:
enabled: true
pool:
champion: default
variants:
- name: default
provider: local
model: "qwen3:32b"
- name: gpt-oss-120b
provider: local
model: "gpt-oss:120b"
postgres:
port: 5435
data_manager:
port: 7878
external_port: 7878
auth:
enabled: true

data_manager:
sources:
jira:
enabled: true
max_tickets: 10
url: https://its.cern.ch/jira/
projects:
- "CMSPROD"
links:
input_lists:
- /home/submit/pmlugato/random_configs/lists/sso_git.list
redmine:
url: https://cleo.mit.edu
project: emails-to-ticket
projects:
- emails-to-ticket
embedding_name: HuggingFaceEmbeddings
43 changes: 42 additions & 1 deletion docs/docs/api_reference.md
@@ -46,7 +46,48 @@ Retrieve the full trace of a previous request.

### `POST /api/ab/create`

Create an A/B comparison between two model responses.
Create an A/B comparison between two model responses (legacy manual mode).

### `GET /api/ab/pool`

Get the server-side A/B testing pool configuration. Returns `enabled: true` with champion name and variant list when a pool is configured, or `enabled: false` otherwise.

**Response (pool active):**
```json
{
"success": true,
"enabled": true,
"champion": "default",
"variants": ["default", "creative", "concise"]
}
```

### `POST /api/ab/compare`

Stream a pool-based champion vs. challenger A/B comparison. The server randomly pairs the champion against a challenger from the pool and streams interleaved NDJSON events tagged with `arm: "a"` or `arm: "b"`. A final `ab_meta` event carries the `comparison_id` and variant mapping.

**Request body:** Same as `/api/get_chat_response_stream`.
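A minimal client sketch for consuming this stream. It assumes the events are newline-delimited JSON objects carrying an `arm` field on per-arm events and `event_type: "ab_meta"` on the terminator — field names here are inferred from this description, so confirm them against the actual payloads:

```python
import json


def parse_ab_events(lines):
    """Split an NDJSON A/B stream into per-arm event lists plus the
    final metadata event.

    Events tagged arm "a"/"b" are bucketed; the trailing ab_meta event
    is returned separately so the caller can record the comparison_id
    for the subsequent vote.
    """
    arms = {"a": [], "b": []}
    meta = None
    for line in lines:
        if not line.strip():
            continue  # skip keep-alive blank lines
        event = json.loads(line)
        if event.get("event_type") == "ab_meta":
            meta = event
        elif event.get("arm") in arms:
            arms[event["arm"]].append(event)
    return arms, meta
```

With `requests`, feed `resp.iter_lines(decode_unicode=True)` from a `stream=True` POST into this function.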

### `GET /api/ab/metrics`

Get per-variant aggregate metrics (wins, losses, ties, total comparisons).

**Response:**
```json
{
"success": true,
"metrics": [
{
"variant_name": "creative",
"wins": 12,
"losses": 5,
"ties": 3,
"total_comparisons": 20,
"last_updated": "2025-01-15T10:30:00"
}
]
}
```
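A small helper for turning this payload into per-variant win rates — a client-side sketch, not part of the API. Counting ties as half a win is one common convention, not something the server prescribes:

```python
def win_rates(metrics):
    """Compute win rate per variant from the /api/ab/metrics payload.

    A tie contributes half a win; variants with no comparisons score 0.
    """
    rates = {}
    for m in metrics:
        total = m["total_comparisons"]
        score = m["wins"] + 0.5 * m["ties"]
        rates[m["variant_name"]] = score / total if total else 0.0
    return rates
```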

---

41 changes: 41 additions & 0 deletions docs/docs/configuration.md
@@ -220,6 +220,47 @@ data_manager:

---

## A/B Testing Pool

Archi supports champion/challenger A/B testing via a server-side variant pool. When configured, the system automatically pairs the champion agent against a random challenger for each comparison. Users vote on which response is better, and aggregate metrics are tracked per variant.

Add an `ab_testing` block to your deployment config:

```yaml
ab_testing:
champion: default # The variant that always appears in every matchup
variants:
- name: default
agent_spec: default # Name of the agent spec (from agents_dir)
- name: creative
agent_spec: default
provider: openai
model: gpt-4o
recursion_limit: 30
- name: concise
agent_spec: concise # A different agent spec with a shorter prompt
provider: anthropic
model: claude-sonnet-4-20250514
num_documents_to_retrieve: 3
```

### Variant Fields

| Field | Type | Default | Description |
|-------|------|---------|-------------|
| `name` | string | *required* | Unique variant identifier |
| `agent_spec` | string | `null` | Agent spec name to use (must exist in `agents_dir`) |
| `provider` | string | `null` | Override LLM provider |
| `model` | string | `null` | Override LLM model |
| `num_documents_to_retrieve` | int | `null` | Override retriever document count |
| `recursion_limit` | int | `null` | Override agent recursion limit |

The `champion` field must reference an existing variant name. At least two variants are required. When a user enables A/B mode in the chat UI, the pool takes over — the champion always appears in one arm, and a random challenger is placed in the other. Arm positions (A vs B) are randomized per comparison.

Variant metrics (wins, losses, ties) are tracked in the `ab_variant_metrics` database table and available via `GET /api/ab/metrics`.
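The pairing rule described above — champion always in one arm, a random challenger in the other, arm positions randomized — can be sketched as (a distilled illustration, not the server's actual code):

```python
import random


def draw_matchup(champion, variant_names, rng=random):
    """Pick (arm_a, arm_b): the champion vs. one random challenger.

    Positions are shuffled afterwards so the champion is not always
    arm A, keeping votes position-blind.
    """
    challengers = [v for v in variant_names if v != champion]
    if not challengers:
        raise ValueError("pool needs at least one non-champion variant")
    pair = [champion, rng.choice(challengers)]
    rng.shuffle(pair)
    return tuple(pair)
```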

---

## Agent Configuration Model

Archi no longer uses a top-level `archi:` block in standard deployment YAML.
5 changes: 5 additions & 0 deletions src/archi/pipelines/agents/agent_spec.py
@@ -13,6 +13,7 @@ class AgentSpec:
tools: List[str]
prompt: str
source_path: Path
ab_only: bool = False


class AgentSpecError(ValueError):
@@ -31,22 +32,26 @@ def load_agent_spec(path: Path) -> AgentSpec:
text = path.read_text()
frontmatter, prompt = _parse_frontmatter(text, path)
name, tools = _extract_metadata(frontmatter, path)
ab_only = bool(frontmatter.get("ab_only", False))
return AgentSpec(
name=name,
tools=tools,
prompt=prompt,
source_path=path,
ab_only=ab_only,
)


def load_agent_spec_from_text(text: str) -> AgentSpec:
frontmatter, prompt = _parse_frontmatter(text, Path("<memory>"))
name, tools = _extract_metadata(frontmatter, Path("<memory>"))
ab_only = bool(frontmatter.get("ab_only", False))
return AgentSpec(
name=name,
tools=tools,
prompt=prompt,
source_path=Path("<memory>"),
ab_only=ab_only,
)


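The new `ab_only` flag is read from the spec file's frontmatter. Assuming the usual `---`-delimited YAML frontmatter (the exact format is defined by `_parse_frontmatter`, which is not in this diff), a pool-only variant spec might look like:

```yaml
---
name: concise
tools: [retriever]   # illustrative tool name
ab_only: true        # defaults to false when absent
---
You are a concise assistant. Answer in at most three sentences.
```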
40 changes: 40 additions & 0 deletions src/archi/pipelines/agents/base_react.py
@@ -390,6 +390,29 @@ def stream(self, **kwargs) -> Iterator[PipelineOutput]:
content = self._message_content(message)
additional_kwargs = getattr(message, "additional_kwargs", None) or {}
reasoning_content = additional_kwargs.get("reasoning_content", "")

# Detect empty AI chunks as implicit thinking activity.
# Some LLM integrations (e.g. langchain-ollama <1.1) drop
# the thinking/reasoning payload, producing chunks where
# both content and reasoning_content are empty while the
# model is still in its thinking phase. We treat these as
# a signal to start (or continue) the thinking indicator
# so the UI stays responsive.
if not content and not reasoning_content:
if thinking_step_id is None and "chunk" in msg_class:
thinking_step_id = str(uuid.uuid4())
thinking_start_time = time.time()
yield self.finalize_output(
answer="",
memory=self.active_memory,
messages=[],
metadata={
"event_type": "thinking_start",
"step_id": thinking_step_id,
},
final=False,
)

if content or reasoning_content:
# Start thinking phase if not already active
if thinking_step_id is None:
@@ -636,6 +659,23 @@ async def astream(self, **kwargs) -> AsyncIterator[PipelineOutput]:
content = self._message_content(message)
additional_kwargs = getattr(message, "additional_kwargs", None) or {}
reasoning_content = additional_kwargs.get("reasoning_content", "")

# Detect empty AI chunks as implicit thinking activity.
if not content and not reasoning_content:
if thinking_step_id is None and "chunk" in msg_class:
thinking_step_id = str(uuid.uuid4())
thinking_start_time = time.time()
yield self.finalize_output(
answer="",
memory=self.active_memory,
messages=[],
metadata={
"event_type": "thinking_start",
"step_id": thinking_step_id,
},
final=False,
)

if content or reasoning_content:
# Start thinking phase if not already active
if thinking_step_id is None:
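Distilled from the comment in the hunk above, the empty-chunk heuristic can be isolated as a pure function (names and event shape are paraphrased from the diff, not the exact implementation):

```python
import time
import uuid


def classify_empty_chunk(content, reasoning_content, thinking_step_id, is_chunk):
    """A chunk with neither visible content nor reasoning_content,
    arriving before any thinking step has started, is treated as the
    start of an implicit thinking phase.

    Returns (event_or_None, thinking_step_id).
    """
    if not content and not reasoning_content:
        if thinking_step_id is None and is_chunk:
            thinking_step_id = str(uuid.uuid4())
            event = {
                "event_type": "thinking_start",
                "step_id": thinking_step_id,
                "ts": time.time(),
            }
            return event, thinking_step_id
        # Already thinking, or not a streamed chunk: nothing to emit.
        return None, thinking_step_id
    # Non-empty chunks are handled by the normal content path.
    return None, thinking_step_id
```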
1 change: 1 addition & 0 deletions src/archi/providers/local_provider.py
@@ -94,6 +94,7 @@ def _get_ollama_model(self, model_name: str, **kwargs) -> BaseChatModel:
model_kwargs = {
"model": model_name,
"streaming": True,
"keep_alive": "24h",
**self.config.extra_kwargs,
**kwargs,
}
33 changes: 23 additions & 10 deletions src/bin/service_chat.py
@@ -27,30 +27,43 @@ def main():
# Reload config from Postgres (runtime source of truth)
config = get_full_config()
chat_config = config["services"]["chat_app"]
print(f"Starting Chat Service with (host, port): ({chat_config['host']}, {chat_config['port']})")
print(f"Accessible externally at (host, port): ({chat_config['hostname']}, {chat_config['external_port']})")

generate_script(chat_config)
# Deployment-time fields may not be in Postgres; use sensible defaults
host = chat_config.get("host", "0.0.0.0")
port = chat_config.get("port", 7681)
hostname = chat_config.get("hostname", host)
external_port = chat_config.get("external_port", port)

# Resolve template/static folders from installed package location
_pkg_dir = os.path.dirname(os.path.abspath(__file__))
_chat_app_dir = os.path.join(os.path.dirname(_pkg_dir), "interfaces", "chat_app")
template_folder = chat_config.get("template_folder", os.path.join(_chat_app_dir, "templates"))
static_folder = chat_config.get("static_folder", os.path.join(_chat_app_dir, "static"))

print(f"Starting Chat Service with (host, port): ({host}, {port})")
print(f"Accessible externally at (host, port): ({hostname}, {external_port})")

generate_script(chat_config, static_folder)
app = FlaskAppWrapper(Flask(
__name__,
template_folder=chat_config["template_folder"],
static_folder=chat_config["static_folder"],
template_folder=template_folder,
static_folder=static_folder,
))
app.run(debug=True, use_reloader=False, port=chat_config["port"], host=chat_config["host"])
app.run(debug=True, use_reloader=False, port=port, host=host)


def generate_script(chat_config):
def generate_script(chat_config, static_folder):
"""
Not elegant, but it regenerates script.js from its template by substituting config parameters.
"""
script_template = os.path.join(chat_config["static_folder"], "script.js-template")
script_template = os.path.join(static_folder, "script.js-template")
with open(script_template, "r") as f:
template = f.read()

filled_template = template.replace('XX-NUM-RESPONSES-XX', str(chat_config["num_responses_until_feedback"]))
filled_template = template.replace('XX-NUM-RESPONSES-XX', str(chat_config.get("num_responses_until_feedback", 3)))
filled_template = filled_template.replace('XX-TRAINED_ON-XX', str(chat_config.get("trained_on", "")))

script_file = os.path.join(chat_config["static_folder"], "script.js")
script_file = os.path.join(static_folder, "script.js")
with open(script_file, "w") as f:
f.write(filled_template)

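The fallback chain in this hunk amounts to: `hostname` defaults to `host`, `external_port` to `port`, and both of those to hard-coded defaults. As a standalone sketch of that resolution order:

```python
def resolve_endpoints(chat_config):
    """Resolve bind and external endpoints, tolerating configs stored
    in Postgres that lack deploy-time keys (mirrors the fallback chain
    in service_chat.py).
    """
    host = chat_config.get("host", "0.0.0.0")
    port = chat_config.get("port", 7681)
    hostname = chat_config.get("hostname", host)          # falls back to bind host
    external_port = chat_config.get("external_port", port)  # falls back to bind port
    return (host, port), (hostname, external_port)
```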
25 changes: 25 additions & 0 deletions src/cli/templates/base-config.yaml
@@ -94,6 +94,31 @@ services:
{% if services.chat_app.auth.auth_roles is defined %}
auth_roles: {{ services.chat_app.auth.auth_roles | tojson }}
{% endif %}
{%- if services.ab_testing is defined and services.ab_testing.enabled | default(false) %}
ab_testing:
enabled: true
pool:
champion: {{ services.ab_testing.pool.champion }}
variants:
{%- for v in services.ab_testing.pool.variants %}
- name: {{ v.name }}
{%- if v.agent_spec is defined %}
agent_spec: "{{ v.agent_spec }}"
{%- endif %}
{%- if v.provider is defined %}
provider: {{ v.provider }}
{%- endif %}
{%- if v.model is defined %}
model: "{{ v.model }}"
{%- endif %}
{%- if v.num_documents_to_retrieve is defined %}
num_documents_to_retrieve: {{ v.num_documents_to_retrieve }}
{%- endif %}
{%- if v.recursion_limit is defined %}
recursion_limit: {{ v.recursion_limit }}
{%- endif %}
{%- endfor %}
{%- endif %}
data_manager:
auth:
enabled: {{ services.data_manager.auth.enabled | default(false) }}
19 changes: 19 additions & 0 deletions src/cli/templates/init.sql
@@ -502,6 +502,12 @@ CREATE TABLE IF NOT EXISTS ab_comparisons (
config_a_id INTEGER REFERENCES configs(config_id),
config_b_id INTEGER REFERENCES configs(config_id),

-- Pool-based variant info (populated when ab_testing pool is active)
variant_a_name VARCHAR(200),
variant_b_name VARCHAR(200),
variant_a_meta JSONB,
variant_b_meta JSONB,

is_config_a_first BOOLEAN NOT NULL,
preference VARCHAR(10),
preference_ts TIMESTAMP,
@@ -512,6 +518,18 @@ CREATE INDEX IF NOT EXISTS idx_ab_comparisons_conversation ON ab_comparisons(con
CREATE INDEX IF NOT EXISTS idx_ab_comparisons_models ON ab_comparisons(model_a, model_b);
CREATE INDEX IF NOT EXISTS idx_ab_comparisons_preference ON ab_comparisons(preference) WHERE preference IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_ab_comparisons_pending ON ab_comparisons(conversation_id) WHERE preference IS NULL;
CREATE INDEX IF NOT EXISTS idx_ab_comparisons_variant_a ON ab_comparisons(variant_a_name) WHERE variant_a_name IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_ab_comparisons_variant_b ON ab_comparisons(variant_b_name) WHERE variant_b_name IS NOT NULL;

-- Per-variant aggregate metrics (wins/losses/ties)
CREATE TABLE IF NOT EXISTS ab_variant_metrics (
variant_name VARCHAR(200) PRIMARY KEY,
wins INTEGER NOT NULL DEFAULT 0,
losses INTEGER NOT NULL DEFAULT 0,
ties INTEGER NOT NULL DEFAULT 0,
total_comparisons INTEGER NOT NULL DEFAULT 0,
last_updated TIMESTAMP NOT NULL DEFAULT NOW()
);

-- ============================================================================
-- 9. MIGRATION STATE (for resumable migrations)
@@ -555,6 +573,7 @@ GRANT SELECT ON
timing,
agent_tool_calls,
ab_comparisons,
ab_variant_metrics,
migration_state
TO grafana;
{% endif %}
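The application code that updates `ab_variant_metrics` is not part of this diff; a hypothetical vote-recording helper against this schema might use a parameterized `ON CONFLICT` upsert so concurrent votes don't race on the first insert (helper names are invented):

```python
# Parameterized upsert: insert a variant's first row, or accumulate
# counters atomically if the row already exists.
RECORD_VOTE_SQL = """
INSERT INTO ab_variant_metrics
    (variant_name, wins, losses, ties, total_comparisons, last_updated)
VALUES (%s, %s, %s, %s, 1, NOW())
ON CONFLICT (variant_name) DO UPDATE SET
    wins = ab_variant_metrics.wins + EXCLUDED.wins,
    losses = ab_variant_metrics.losses + EXCLUDED.losses,
    ties = ab_variant_metrics.ties + EXCLUDED.ties,
    total_comparisons = ab_variant_metrics.total_comparisons + 1,
    last_updated = NOW()
"""


def vote_rows(variant_a, variant_b, preference):
    """Map one vote ('a', 'b', or 'tie') to (name, wins, losses, ties)
    parameter rows, one per variant, for RECORD_VOTE_SQL."""
    if preference == "tie":
        return [(variant_a, 0, 0, 1), (variant_b, 0, 0, 1)]
    winner, loser = (variant_a, variant_b) if preference == "a" else (variant_b, variant_a)
    return [(winner, 1, 0, 0), (loser, 0, 1, 0)]
```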