diff --git a/cli/test/test_config_generator.py b/cli/test/test_config_generator.py
index 9aade29e3..45d28c9ec 100644
--- a/cli/test/test_config_generator.py
+++ b/cli/test/test_config_generator.py
@@ -823,3 +823,98 @@ def test_apply_kimi_code_provider_defaults_injects_user_agent():
     apply_kimi_code_provider_defaults(provider)
     assert provider["base_url"] == "https://api.kimi.com/coding/v1"
     assert provider["headers"]["User-Agent"] == "KimiCLI/1.3"
+
+
+def _load_schema():
+    from jsonschema import validate
+
+    with open("../config/plano_config_schema.yaml", "r") as f:
+        return validate, yaml.safe_load(f.read())
+
+
+def test_schema_accepts_capabilities_and_signal_routing():
+    """Capability/signal-aware routing fields validate against the schema."""
+    validate, schema = _load_schema()
+    config = yaml.safe_load("""
+version: v0.4.0
+listeners:
+  - type: model
+    name: model_1
+    address: 0.0.0.0
+    port: 12000
+model_providers:
+  - model: openai/gpt-4o
+    access_key: $OPENAI_API_KEY
+    default: true
+  - model: anthropic/claude-opus-4-1-128k
+    access_key: $ANTHROPIC_API_KEY
+    capabilities:
+      context_window: 128000
+  - model: openai/llama-3.3-70b-vision
+    base_url: https://api.custom-provider.com
+    access_key: $CUSTOM_API_KEY
+    capabilities:
+      context_window: 128000
+      supports_vision: true
+      supports_image_generation: false
+      supports_audio_out: false
+      max_output_tokens: 8192
+routing_preferences:
+  - name: long document analysis
+    description: summarizing or querying very large documents
+    models:
+      - anthropic/claude-opus-4-1-128k
+      - openai/gpt-4o
+    selection_policy:
+      prefer: long_context_quality
+overrides:
+  llm_routing_model: Plano-Orchestrator
+  empty_pool_behavior: warning
+  model_capabilities_source:
+    url: https://models.dev/api.json
+    refresh_interval: 86400
+""")
+    validate(config, schema)
+
+
+def test_schema_rejects_unknown_capability_field():
+    """capabilities block is closed (additionalProperties: false)."""
+    from jsonschema import ValidationError
+
+    validate, schema = _load_schema()
+    config = yaml.safe_load("""
+version: v0.4.0
+listeners:
+  - type: model
+    name: model_1
+    address: 0.0.0.0
+    port: 12000
+model_providers:
+  - model: openai/gpt-4o
+    access_key: $OPENAI_API_KEY
+    capabilities:
+      supports_telepathy: true
+""")
+    with pytest.raises(ValidationError):
+        validate(config, schema)
+
+
+def test_schema_rejects_invalid_empty_pool_behavior():
+    from jsonschema import ValidationError
+
+    validate, schema = _load_schema()
+    config = yaml.safe_load("""
+version: v0.4.0
+listeners:
+  - type: model
+    name: model_1
+    address: 0.0.0.0
+    port: 12000
+model_providers:
+  - model: openai/gpt-4o
+    access_key: $OPENAI_API_KEY
+overrides:
+  empty_pool_behavior: explode
+""")
+    with pytest.raises(ValidationError):
+        validate(config, schema)
diff --git a/config/plano_config_schema.yaml b/config/plano_config_schema.yaml
index 2ecf38921..cf4f6fc92 100644
--- a/config/plano_config_schema.yaml
+++ b/config/plano_config_schema.yaml
@@ -184,22 +184,54 @@ properties:
           enum:
             - plano
             - claude
+            - anthropic
             - deepseek
             - groq
             - mistral
             - openai
             - xiaomi
             - gemini
+            - xai
+            - together_ai
+            - azure_openai
+            - ollama
+            - zhipu
+            - qwen
+            - amazon_bedrock
             - chatgpt
             - digitalocean
             - vercel
             - openrouter
             - moonshotai
+            - astraflow
+            - astraflow_cn
         headers:
           type: object
           additionalProperties:
             type: string
           description: "Additional headers to send with upstream requests (e.g., ChatGPT-Account-Id, originator)."
+        capabilities:
+          type: object
+          additionalProperties: false
+          description: "Optional model capability overrides. Capabilities default from models.dev (vendored seed + runtime refresh); declare these only to override models.dev or to describe a model models.dev doesn't have. Precedence: this block > models.dev > conservative default."
+          properties:
+            context_window:
+              type: integer
+              minimum: 1
+              description: "Maximum total context (input + output) tokens the model accepts."
+            max_output_tokens:
+              type: integer
+              minimum: 1
+              description: "Maximum output tokens the model can produce."
+            supports_vision:
+              type: boolean
+              description: "Whether the model accepts image input (vision)."
+            supports_image_generation:
+              type: boolean
+              description: "Whether the model serves /v1/images/generations."
+            supports_audio_out:
+              type: boolean
+              description: "Whether the model serves /v1/audio/speech (TTS)."
         routing_preferences:
           type: array
           description: "[DEPRECATED] Inline routing_preferences under a model_provider are auto-migrated to the top-level routing_preferences list by the config generator. New configs should declare routing_preferences at the top level with an explicit models: [...] list. See docs/routing-api.md."
@@ -243,17 +275,27 @@ properties:
           enum:
             - plano
             - claude
+            - anthropic
             - deepseek
             - groq
             - mistral
             - openai
             - xiaomi
             - gemini
+            - xai
+            - together_ai
+            - azure_openai
+            - ollama
+            - zhipu
+            - qwen
+            - amazon_bedrock
             - chatgpt
             - digitalocean
             - vercel
             - openrouter
             - moonshotai
+            - astraflow
+            - astraflow_cn
         headers:
           type: object
           additionalProperties:
@@ -316,6 +358,26 @@ properties:
       orchestrator_model_context_length:
         type: integer
         description: "Maximum token length for the orchestrator/routing model context window. Default is 8192."
+      empty_pool_behavior:
+        type: string
+        enum:
+          - error
+          - warning
+        default: error
+        description: "Tier 1 capability filtering is a hard gate. When it removes every model in a matched route, 'error' returns HTTP 422 (default); 'warning' logs and proceeds with the pre-filter pool. This is the only lever that lets routing_preferences win over capability."
+      model_capabilities_source:
+        type: object
+        additionalProperties: false
+        description: "Optional source for model capabilities, fetched at runtime from models.dev (like cost/latency metrics). Defaults to the public models.dev API; users rarely set this. Omit refresh_interval to fetch only once at startup."
+        properties:
+          url:
+            type: string
+            default: "https://models.dev/api.json"
+            description: "models.dev-compatible capability API URL."
+          refresh_interval:
+            type: integer
+            minimum: 1
+            description: "Refresh interval in seconds. Omit to fetch only once at startup."
   system_prompt:
     type: string
   prompt_targets:
@@ -559,6 +621,7 @@ properties:
               enum:
                 - cheapest
                 - fastest
+                - long_context_quality
                 - none
           additionalProperties: false
           required:
diff --git a/crates/brightstaff/src/handlers/llm/model_selection.rs b/crates/brightstaff/src/handlers/llm/model_selection.rs
index a1378d86d..c1535dad6 100644
--- a/crates/brightstaff/src/handlers/llm/model_selection.rs
+++ b/crates/brightstaff/src/handlers/llm/model_selection.rs
@@ -1,13 +1,13 @@
-use common::configuration::TopLevelRoutingPreference;
+use common::configuration::{EmptyPoolBehavior, TopLevelRoutingPreference};
 use hermesllm::clients::endpoints::SupportedUpstreamAPIs;
-use hermesllm::ProviderRequestType;
+use hermesllm::{ProviderRequest, ProviderRequestType, RequiredCapabilities};
 use hyper::StatusCode;
 use std::sync::Arc;
 use tracing::{debug, info, warn};
 
 use crate::metrics as bs_metrics;
 use crate::metrics::labels as metric_labels;
-use crate::router::orchestrator::OrchestratorService;
+use crate::router::orchestrator::{OrchestrationError, OrchestratorService};
 use crate::streaming::truncate_message;
 use crate::tracing::routing;
 
@@ -43,6 +43,15 @@ impl RoutingError {
             status_code: StatusCode::INTERNAL_SERVER_ERROR,
         }
     }
+
+    /// Tier 1 capability filter left no viable model (D3 `error`). 422 so the
+    /// client can see this is a request/config mismatch, not a server fault.
+    pub fn capability_error(message: String) -> Self {
+        Self {
+            message,
+            status_code: StatusCode::UNPROCESSABLE_ENTITY,
+        }
+    }
 }
 
 /// Determines the routing decision if
@@ -109,6 +118,14 @@ pub async fn router_chat_get_upstream_model(
         "processing router request"
     );
 
+    // Tier 1: compute the capability requirements implied by this request shape.
+    let mut required = RequiredCapabilities::for_endpoint(request_path);
+    required.vision = chat_request.has_vision();
+    required.min_context_tokens = chat_request.required_context_tokens();
+    if !required.is_unconstrained() {
+        debug!(requirement = %required.describe(), "computed required capabilities");
+    }
+
     // Capture start time for routing span
     let routing_start_time = std::time::Instant::now();
 
@@ -117,6 +134,7 @@ pub async fn router_chat_get_upstream_model(
             &chat_request.messages,
             inline_routing_preferences,
             request_id,
+            &required,
         )
         .await;
 
@@ -144,8 +162,42 @@ pub async fn router_chat_get_upstream_model(
                 })
             }
             None => {
-                // No route determined, return sentinel value "none"
-                // This signals to llm.rs to use the original validated request model
+                // No preference matched. The "none" sentinel tells llm.rs to use
+                // the original validated request model — but capability is a hard
+                // gate even here, so validate that explicit model against the
+                // request shape (catches e.g. an image sent to a text-only model).
+                let explicit_model = chat_request.model();
+                if !required.is_unconstrained()
+                    && orchestrator_service.has_capability_filter()
+                    && !orchestrator_service
+                        .is_model_capable(explicit_model, &required)
+                        .await
+                {
+                    match orchestrator_service.empty_pool_behavior() {
+                        EmptyPoolBehavior::Error => {
+                            current_span.record("route.selected_model", "unknown");
+                            bs_metrics::record_router_decision(
+                                route_label,
+                                "unknown",
+                                true,
+                                determination_elapsed,
+                            );
+                            return Err(RoutingError::capability_error(format!(
+                                "model '{}' cannot serve this request (requires {})",
+                                explicit_model,
+                                required.describe()
+                            )));
+                        }
+                        EmptyPoolBehavior::Warning => {
+                            warn!(
+                                model = %explicit_model,
+                                requirement = %required.describe(),
+                                "explicit model is not capable; empty_pool_behavior=warning, proceeding"
+                            );
+                        }
+                    }
+                }
+
                 current_span.record("route.selected_model", "none");
                 info!("no route determined, using default model");
                 bs_metrics::record_router_decision(
@@ -165,10 +217,18 @@ pub async fn router_chat_get_upstream_model(
         Err(err) => {
             current_span.record("route.selected_model", "unknown");
             bs_metrics::record_router_decision(route_label, "unknown", true, determination_elapsed);
-            Err(RoutingError::internal_error(format!(
-                "Failed to determine route: {}",
-                err
-            )))
+            match err {
+                OrchestrationError::CapabilityFilterEmpty { route, requirement } => {
+                    Err(RoutingError::capability_error(format!(
+                        "no capable model for route '{}': request requires {}",
+                        route, requirement
+                    )))
+                }
+                other => Err(RoutingError::internal_error(format!(
+                    "Failed to determine route: {}",
+                    other
+                ))),
+            }
         }
     }
 }
diff --git a/crates/brightstaff/src/main.rs b/crates/brightstaff/src/main.rs
index 90ed84c35..165ff904f 100644
--- a/crates/brightstaff/src/main.rs
+++ b/crates/brightstaff/src/main.rs
@@ -12,6 +12,7 @@ use brightstaff::handlers::models::list_models;
 use brightstaff::handlers::routing_service::routing_decision;
 use brightstaff::metrics as bs_metrics;
 use brightstaff::metrics::labels as metric_labels;
+use brightstaff::router::model_capabilities::ModelCapabilitiesService;
 use brightstaff::router::model_metrics::ModelMetricsService;
 use brightstaff::router::orchestrator::OrchestratorService;
 use brightstaff::session_cache::init_session_cache;
@@ -308,17 +309,50 @@ async fn init_app_state(
         .orchestrator_model_context_length
         .unwrap_or(brightstaff::router::orchestrator_model_v1::MAX_TOKEN_LEN);
 
-    let orchestrator_service = Arc::new(OrchestratorService::with_routing(
-        format!("{llm_provider_url}{CHAT_COMPLETIONS_PATH}"),
-        orchestrator_model_name,
-        orchestrator_llm_provider,
-        config.routing_preferences.clone(),
-        metrics_service,
-        session_ttl_seconds,
-        session_cache,
-        session_tenant_header,
-        orchestrator_max_tokens,
+    // Tier 1 capability source: vendored models.dev seed + optional runtime
+    // refresh, layered with per-provider user overrides.
+    let capabilities_service = Some(Arc::new(
+        ModelCapabilitiesService::new(
+            &config.model_providers,
+            overrides.model_capabilities_source.as_ref(),
+            reqwest::Client::new(),
+        )
+        .await,
     ));
+    let empty_pool_behavior = overrides.empty_pool_behavior.unwrap_or_default();
+
+    // Surface internal long-context-quality dataset provenance + staleness (R5).
+    {
+        let lcq = hermesllm::providers::long_context_quality::dataset();
+        if let Ok(dated) = chrono::NaiveDate::parse_from_str(&lcq.dated, "%Y-%m-%d") {
+            let age_days = (chrono::Utc::now().date_naive() - dated).num_days().max(0) as f64;
+            brightstaff::metrics::record_lcq_staleness_days(age_days);
+            info!(
+                models = lcq.len(),
+                source = %lcq.source,
+                dated = %lcq.dated,
+                age_days,
+                "loaded internal long-context-quality dataset"
+            );
+        } else {
+            info!(models = lcq.len(), source = %lcq.source, "loaded internal long-context-quality dataset");
+        }
+    }
+
+    let orchestrator_service = Arc::new(
+        OrchestratorService::with_routing(
+            format!("{llm_provider_url}{CHAT_COMPLETIONS_PATH}"),
+            orchestrator_model_name,
+            orchestrator_llm_provider,
+            config.routing_preferences.clone(),
+            metrics_service,
+            session_ttl_seconds,
+            session_cache,
+            session_tenant_header,
+            orchestrator_max_tokens,
+        )
+        .with_capability_filter(capabilities_service, empty_pool_behavior),
+    );
 
     let state_storage = init_state_storage(config).await?;
 
diff --git a/crates/brightstaff/src/metrics/labels.rs b/crates/brightstaff/src/metrics/labels.rs
index 4eaf3e59a..8b0921e2a 100644
--- a/crates/brightstaff/src/metrics/labels.rs
+++ b/crates/brightstaff/src/metrics/labels.rs
@@ -36,3 +36,9 @@ pub const ROUTING_SVC_POLICY_ERROR: &str = "policy_error";
 pub const SESSION_CACHE_HIT: &str = "hit";
 pub const SESSION_CACHE_MISS: &str = "miss";
 pub const SESSION_CACHE_STORE: &str = "store";
+
+// Tier 1 capability-filter outcome values.
+pub const CAPABILITY_FILTER_PASS: &str = "pass";
+pub const CAPABILITY_FILTER_FILTERED: &str = "filtered";
+pub const CAPABILITY_FILTER_EMPTY_ERROR: &str = "empty_error";
+pub const CAPABILITY_FILTER_EMPTY_WARNING: &str = "empty_warning";
diff --git a/crates/brightstaff/src/metrics/mod.rs b/crates/brightstaff/src/metrics/mod.rs
index 34679ccac..c0d0ffa93 100644
--- a/crates/brightstaff/src/metrics/mod.rs
+++ b/crates/brightstaff/src/metrics/mod.rs
@@ -317,6 +317,26 @@ pub fn record_llm_ttft(provider: &str, model: &str, ttft: Duration) {
     .record(ttft.as_secs_f64());
 }
 
+/// Per-modality generation latency, the cold-start-seedable signal that powers
+/// `prefer: fastest` for the non-text modalities (WS10). `kind` distinguishes the
+/// per-modality timing definition: `e2e` (image-gen, request→asset), `ttfa`
+/// (streaming audio, time-to-first-audio-chunk), or `synthesis` (non-streaming
+/// audio, total synthesis time). Text continues to use TTFT and is unaffected.
+pub fn record_modality_latency(
+    model: &str,
+    modality: &str,
+    kind: &'static str,
+    duration: Duration,
+) {
+    histogram!(
+        "brightstaff_llm_modality_latency_seconds",
+        "model" => model.to_string(),
+        "modality" => modality.to_string(),
+        "kind" => kind,
+    )
+    .record(duration.as_secs_f64());
+}
+
 pub fn record_llm_tokens(provider: &str, model: &str, kind: &'static str, count: u64) {
     counter!(
         "brightstaff_llm_tokens_total",
@@ -360,6 +380,36 @@ pub fn record_router_decision(
     .record(duration.as_secs_f64());
 }
 
+/// Record Tier 1 capability-filter timing + pool sizes for a matched route.
+/// `outcome` is one of `pass`, `filtered`, `empty_error`, `empty_warning`.
+pub fn record_capability_filter(
+    route: &str,
+    outcome: &'static str,
+    pre_filter: usize,
+    post_filter: usize,
+    duration: Duration,
+) {
+    counter!(
+        "brightstaff_capability_filter_total",
+        "route" => route.to_string(),
+        "outcome" => outcome,
+    )
+    .increment(1);
+    histogram!(
+        "brightstaff_capability_filter_duration_seconds",
+        "route" => route.to_string(),
+    )
+    .record(duration.as_secs_f64());
+    histogram!("brightstaff_capability_filter_pre_pool_size").record(pre_filter as f64);
+    histogram!("brightstaff_capability_filter_post_pool_size").record(post_filter as f64);
+}
+
+/// Surface the age (in days) of the internal long-context-quality dataset so
+/// staleness is observable (R5).
+pub fn record_lcq_staleness_days(age_days: f64) {
+    gauge!("brightstaff_long_context_quality_staleness_days").set(age_days);
+}
+
 pub fn record_routing_service_outcome(outcome: &'static str) {
     counter!(
         "brightstaff_routing_service_requests_total",
diff --git a/crates/brightstaff/src/router/mod.rs b/crates/brightstaff/src/router/mod.rs
index 0f48c0907..53af701d9 100644
--- a/crates/brightstaff/src/router/mod.rs
+++ b/crates/brightstaff/src/router/mod.rs
@@ -1,4 +1,5 @@
 pub(crate) mod http;
+pub mod model_capabilities;
 pub mod model_metrics;
 pub mod orchestrator;
 pub mod orchestrator_model;
diff --git a/crates/brightstaff/src/router/model_capabilities.rs b/crates/brightstaff/src/router/model_capabilities.rs
new file mode 100644
index 000000000..5ce41fb45
--- /dev/null
+++ b/crates/brightstaff/src/router/model_capabilities.rs
@@ -0,0 +1,185 @@
+//! Tier 1 capability source for routing.
+//!
+//! Mirrors [`super::model_metrics::ModelMetricsService`] (and how DigitalOcean
+//! pricing is handled): nothing is vendored into the binary. The catalog is
+//! fetched from models.dev at startup, optionally refreshed on an interval, and
+//! left empty on fetch failure (so absent models resolve to the conservative
+//! default rather than failing the build with a committed snapshot). Per-model
+//! capabilities resolve with the precedence:
+//! `user config capabilities > models.dev > conservative default`.
+
+use std::collections::HashMap;
+use std::sync::Arc;
+use std::time::Duration;
+
+use common::configuration::{LlmProvider, ModelCapabilitiesSource};
+use hermesllm::providers::capabilities::{
+    canonical_model_key, CapabilitiesCatalog, CapabilitiesSnapshot,
+};
+use hermesllm::ModelCapabilities;
+use tokio::sync::RwLock;
+use tracing::{info, warn};
+
+const DEFAULT_MODELS_DEV_URL: &str = "https://models.dev/api.json";
+
+pub struct ModelCapabilitiesService {
+    /// User config overrides, keyed by canonical `"<provider>/<model>"`.
+    user_overrides: HashMap<String, ModelCapabilities>,
+    /// models.dev-derived catalog, fetched at runtime (empty until first fetch).
+    catalog: Arc<RwLock<CapabilitiesCatalog>>,
+}
+
+impl ModelCapabilitiesService {
+    /// Build the service from configured providers and an optional capability
+    /// source. Fetches the models.dev catalog once at startup (like DO pricing),
+    /// then spawns a refresh loop if an interval is configured. On fetch failure
+    /// the catalog stays empty and models resolve to user overrides or the
+    /// conservative default.
+    pub async fn new(
+        providers: &[LlmProvider],
+        source: Option<&ModelCapabilitiesSource>,
+        client: reqwest::Client,
+    ) -> Self {
+        let mut user_overrides = HashMap::new();
+        for p in providers {
+            if let Some(caps) = &p.capabilities {
+                let model_str = p.model.clone().unwrap_or_else(|| p.name.clone());
+                let key = canonical_model_key(&model_str).unwrap_or(model_str);
+                user_overrides.insert(key, caps.clone());
+            }
+        }
+
+        let url = source
+            .and_then(|s| s.url.clone())
+            .unwrap_or_else(|| DEFAULT_MODELS_DEV_URL.to_string());
+
+        // Fetch once at startup so capabilities are available immediately. On
+        // failure the catalog is empty (conservative defaults), never fatal.
+        let catalog = match fetch_catalog(&client, &url).await {
+            Some(fresh) => {
+                info!(models = fresh.len(), url = %url, user_overrides = user_overrides.len(), "fetched model capabilities from models.dev");
+                fresh
+            }
+            None => {
+                warn!(url = %url, "models.dev fetch failed at startup — capabilities default to conservative (text-only) until refresh");
+                CapabilitiesCatalog::default()
+            }
+        };
+        let catalog = Arc::new(RwLock::new(catalog));
+
+        if let Some(interval_secs) = source.and_then(|s| s.refresh_interval) {
+            let catalog_clone = Arc::clone(&catalog);
+            let client_clone = client.clone();
+            let url_clone = url.clone();
+            let interval = Duration::from_secs(interval_secs);
+            tokio::spawn(async move {
+                loop {
+                    tokio::time::sleep(interval).await;
+                    if let Some(fresh) = fetch_catalog(&client_clone, &url_clone).await {
+                        info!(models = fresh.len(), url = %url_clone, "refreshed model capabilities from models.dev");
+                        *catalog_clone.write().await = fresh;
+                    } else {
+                        warn!(url = %url_clone, "models.dev refresh failed — keeping previous catalog");
+                    }
+                }
+            });
+        }
+
+        ModelCapabilitiesService {
+            user_overrides,
+            catalog,
+        }
+    }
+
+    /// Construct a service from an explicit catalog with no refresh (tests).
+    pub fn from_catalog(
+        catalog: CapabilitiesCatalog,
+        user_overrides: HashMap<String, ModelCapabilities>,
+    ) -> Self {
+        ModelCapabilitiesService {
+            user_overrides,
+            catalog: Arc::new(RwLock::new(catalog)),
+        }
+    }
+
+    /// Resolve capabilities for a model, applying
+    /// `user override > models.dev > conservative default`.
+    pub async fn capabilities_for(&self, model: &str) -> ModelCapabilities {
+        let from_catalog = {
+            let catalog = self.catalog.read().await;
+            catalog.get(model).cloned().unwrap_or_default()
+        };
+        let key = canonical_model_key(model);
+        let user = key
+            .as_ref()
+            .and_then(|k| self.user_overrides.get(k))
+            .or_else(|| self.user_overrides.get(model));
+        match user {
+            Some(u) => u.fill_from(&from_catalog),
+            None => from_catalog,
+        }
+    }
+}
+
+/// Fetch + map a models.dev catalog. Returns `None` on any network/parse error
+/// so the caller can fall back to the previous (seed) catalog.
+async fn fetch_catalog(client: &reqwest::Client, url: &str) -> Option<CapabilitiesCatalog> {
+    let bytes = match client.get(url).send().await {
+        Ok(resp) => match resp.bytes().await {
+            Ok(b) => b,
+            Err(err) => {
+                warn!(error = %err, url = %url, "failed to read models.dev response");
+                return None;
+            }
+        },
+        Err(err) => {
+            warn!(error = %err, url = %url, "failed to fetch models.dev");
+            return None;
+        }
+    };
+    match CapabilitiesSnapshot::from_models_dev_json(&bytes) {
+        Ok(snapshot) => Some(CapabilitiesCatalog::from_snapshot(snapshot)),
+        Err(err) => {
+            warn!(error = %err, url = %url, "failed to parse models.dev payload");
+            None
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn caps(window: Option<u32>, vision: Option<bool>) -> ModelCapabilities {
+        ModelCapabilities {
+            context_window: window,
+            supports_vision: vision,
+            ..Default::default()
+        }
+    }
+
+    #[tokio::test]
+    async fn user_override_wins_over_catalog() {
+        let mut catalog_models = HashMap::new();
+        catalog_models.insert("openai/gpt-4o".to_string(), caps(Some(200000), Some(true)));
+        let catalog = CapabilitiesCatalog::new(catalog_models);
+
+        let mut overrides = HashMap::new();
+        overrides.insert("openai/gpt-4o".to_string(), caps(Some(128000), None));
+
+        let svc = ModelCapabilitiesService::from_catalog(catalog, overrides);
+        let resolved = svc.capabilities_for("openai/gpt-4o").await;
+        // user override wins for window, catalog backfills vision
+        assert_eq!(resolved.window(), Some(128000));
+        assert!(resolved.vision());
+    }
+
+    #[tokio::test]
+    async fn unknown_model_falls_back_to_conservative_default() {
+        let svc =
+            ModelCapabilitiesService::from_catalog(CapabilitiesCatalog::default(), HashMap::new());
+        let resolved = svc.capabilities_for("openai/unknown").await;
+        assert!(!resolved.vision());
+        assert_eq!(resolved.window(), None);
+    }
+}
diff --git a/crates/brightstaff/src/router/model_metrics.rs b/crates/brightstaff/src/router/model_metrics.rs
index 1adb408d8..24c22d614 100644
--- a/crates/brightstaff/src/router/model_metrics.rs
+++ b/crates/brightstaff/src/router/model_metrics.rs
@@ -10,8 +10,42 @@ use tracing::{debug, info, warn};
 
 const DO_PRICING_URL: &str = "https://api.digitalocean.com/v2/gen-ai/models/catalog";
 
+/// Routing modality used to scope the per-modality `fastest` latency signal
+/// (WS10). Latency is keyed by `(model, modality)`: text keeps its existing
+/// TTFT behavior; image/audio carry their own per-modality timing definitions.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Modality {
+    Text,
+    Vision,
+    ImageOut,
+    AudioOut,
+}
+
+impl Modality {
+    pub fn as_str(&self) -> &'static str {
+        match self {
+            Modality::Text => "text",
+            Modality::Vision => "vision",
+            Modality::ImageOut => "image_out",
+            Modality::AudioOut => "audio_out",
+        }
+    }
+}
+
+/// Composite latency-map key scoping a model's latency to a modality. Text uses
+/// the bare model id (back-compat with the existing flat Prometheus map); other
+/// modalities are namespaced so they never collide with the text signal.
+pub fn modality_latency_key(model: &str, modality: Modality) -> String {
+    match modality {
+        Modality::Text => model.to_string(),
+        other => format!("{}\u{1}{}", other.as_str(), model),
+    }
+}
+
 pub struct ModelMetricsService {
     cost: Arc<RwLock<HashMap<String, f64>>>,
+    /// Latency map keyed by either a bare model id (text) or a
+    /// `modality\u{1}model` composite (see [`modality_latency_key`]).
     latency: Arc<RwLock<HashMap<String, f64>>>,
 }
 
@@ -107,10 +141,69 @@ impl ModelMetricsService {
                 }
                 rank_by_ascending_metric(models, &latency_data)
             }
+            SelectionPreference::LongContextQuality => {
+                // Tier 2: rank by Plano's internal long-context-quality dataset.
+                // Higher score is better, so rank descending; unscored models last.
+                let scores: HashMap<String, f64> = models
+                    .iter()
+                    .filter_map(|m| {
+                        match hermesllm::long_context_quality_score(m) {
+                            Some(s) => Some((m.clone(), s)),
+                            None => {
+                                warn!(model = %m, "no long-context-quality score — ranking last (prefer: long_context_quality)");
+                                None
+                            }
+                        }
+                    })
+                    .collect();
+                rank_by_descending_metric(models, &scores)
+            }
             SelectionPreference::None => models.to_vec(),
         }
     }
 
+    /// Rank `models` by latency for a specific `modality` (WS10 groundwork).
+    /// Looks up the `(model, modality)` composite key first, then falls back to
+    /// the bare model latency, then ranks last. For `Modality::Text` this is
+    /// identical to `prefer: fastest` today.
+    pub async fn rank_models_by_modality_latency(
+        &self,
+        models: &[String],
+        modality: Modality,
+    ) -> Vec<String> {
+        let latency_data = self.latency.read().await;
+        let resolved: HashMap<String, f64> = models
+            .iter()
+            .filter_map(|m| {
+                let composite = modality_latency_key(m, modality);
+                let v = latency_data
+                    .get(&composite)
+                    .or_else(|| latency_data.get(m.as_str()))
+                    .copied();
+                match v {
+                    Some(v) if !v.is_nan() => Some((m.clone(), v)),
+                    _ => {
+                        warn!(model = %m, modality = modality.as_str(), "no per-modality latency — ranking last (prefer: fastest)");
+                        None
+                    }
+                }
+            })
+            .collect();
+        rank_by_ascending_metric(models, &resolved)
+    }
+
+    /// Seed the latency map for cold start (WS10): at launch there is no live
+    /// traffic, so `prefer: fastest` on the new modalities must be primed from
+    /// benchmark data. Entries should use [`modality_latency_key`] for non-text
+    /// modalities. Existing keys are overwritten; live data takes over on the
+    /// next Prometheus refresh.
+    pub async fn seed_latency(&self, seed: HashMap<String, f64>) {
+        let mut latency = self.latency.write().await;
+        for (k, v) in seed {
+            latency.insert(k, v);
+        }
+    }
+
     /// Returns a snapshot of the current cost data. Used at startup to warn about unmatched models.
     pub async fn cost_snapshot(&self) -> HashMap<String, f64> {
         self.cost.read().await.clone()
@@ -148,6 +241,34 @@ fn rank_by_ascending_metric(models: &[String], data: &HashMap<String, f64>) -> V
         .collect()
 }
 
+/// Rank `models` by a metric where **higher is better** (e.g. quality scores).
+/// Models with no data are appended at the end in their original order.
+fn rank_by_descending_metric(models: &[String], data: &HashMap<String, f64>) -> Vec<String> {
+    let mut with_data: Vec<(&String, f64)> = models
+        .iter()
+        .filter_map(|m| {
+            let v = *data.get(m.as_str())?;
+            if v.is_nan() {
+                None
+            } else {
+                Some((m, v))
+            }
+        })
+        .collect();
+    with_data.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
+
+    let without_data: Vec<&String> = models
+        .iter()
+        .filter(|m| data.get(m.as_str()).is_none_or(|v| v.is_nan()))
+        .collect();
+
+    with_data
+        .iter()
+        .map(|(m, _)| (*m).clone())
+        .chain(without_data.iter().map(|m| (*m).clone()))
+        .collect()
+}
+
 #[derive(serde::Deserialize)]
 struct DoModelList {
     data: Vec<DoModel>,
@@ -231,7 +352,16 @@ async fn fetch_prometheus_metrics(
                 .filter_map(|r| {
                     let model_name = r.metric.get("model_name")?.clone();
                     let value: f64 = r.value.1.parse().ok()?;
-                    Some((model_name, value))
+                    // If the series carries a `modality` label, namespace the key
+                    // so per-modality latency (WS10) never collides with text.
+                    let key = match r.metric.get("modality").map(String::as_str) {
+                        Some("text") | None => model_name,
+                        Some("vision") => modality_latency_key(&model_name, Modality::Vision),
+                        Some("image_out") => modality_latency_key(&model_name, Modality::ImageOut),
+                        Some("audio_out") => modality_latency_key(&model_name, Modality::AudioOut),
+                        Some(_) => model_name,
+                    };
+                    Some((key, value))
                 })
                 .collect(),
             Err(err) => {
@@ -319,6 +449,49 @@ mod tests {
         assert_eq!(result, vec!["claude-sonnet", "gpt-4o"]);
     }
 
+    #[test]
+    fn test_rank_by_descending_metric_picks_highest_first() {
+        let models = vec!["a".to_string(), "b".to_string(), "c".to_string()];
+        let mut data = HashMap::new();
+        data.insert("a".to_string(), 0.80);
+        data.insert("b".to_string(), 0.95);
+        data.insert("c".to_string(), 0.70);
+        // unscored models would sort last; here all scored, highest first
+        assert_eq!(
+            rank_by_descending_metric(&models, &data),
+            vec!["b", "a", "c"]
+        );
+    }
+
+    #[tokio::test]
+    async fn test_rank_models_long_context_quality() {
+        // Uses the vendored internal LCQ dataset. gemini-2.5-pro outranks gpt-4o,
+        // and an unscored model sorts last.
+        let service = ModelMetricsService {
+            cost: Arc::new(RwLock::new(HashMap::new())),
+            latency: Arc::new(RwLock::new(HashMap::new())),
+        };
+        let models = vec![
+            "openai/gpt-4o".to_string(),
+            "openai/no-lcq-model".to_string(),
+            "gemini/gemini-2.5-pro".to_string(),
+        ];
+        let result = service
+            .rank_models(
+                &models,
+                &make_policy(SelectionPreference::LongContextQuality),
+            )
+            .await;
+        assert_eq!(
+            result,
+            vec![
+                "gemini/gemini-2.5-pro",
+                "openai/gpt-4o",
+                "openai/no-lcq-model"
+            ]
+        );
+    }
+
     #[tokio::test]
     async fn test_rank_models_fallback_no_metrics() {
         let service = ModelMetricsService {
@@ -368,6 +541,65 @@ mod tests {
         assert_eq!(result, vec!["gpt-4o", "gpt-4o-mini"]);
     }
 
+    #[test]
+    fn test_modality_latency_key_namespaces_non_text() {
+        assert_eq!(
+            modality_latency_key("openai/gpt-4o", Modality::Text),
+            "openai/gpt-4o"
+        );
+        assert_eq!(
+            modality_latency_key("openai/gpt-image-1", Modality::ImageOut),
+            "image_out\u{1}openai/gpt-image-1"
+        );
+        assert_ne!(
+            modality_latency_key("m", Modality::Vision),
+            modality_latency_key("m", Modality::AudioOut)
+        );
+    }
+
+    #[tokio::test]
+    async fn test_rank_models_by_modality_latency_prefers_composite_key() {
+        let service = ModelMetricsService {
+            cost: Arc::new(RwLock::new(HashMap::new())),
+            latency: Arc::new(RwLock::new(HashMap::new())),
+        };
+        // Seed per-modality (audio) latency for two TTS models.
+        let mut seed = HashMap::new();
+        seed.insert(
+            modality_latency_key("openai/tts-slow", Modality::AudioOut),
+            500.0,
+        );
+        seed.insert(
+            modality_latency_key("openai/tts-fast", Modality::AudioOut),
+            100.0,
+        );
+        service.seed_latency(seed).await;
+
+        let models = vec!["openai/tts-slow".to_string(), "openai/tts-fast".to_string()];
+        let result = service
+            .rank_models_by_modality_latency(&models, Modality::AudioOut)
+            .await;
+        assert_eq!(result, vec!["openai/tts-fast", "openai/tts-slow"]);
+    }
+
+    #[tokio::test]
+    async fn test_rank_models_by_modality_latency_falls_back_to_bare_model() {
+        let service = ModelMetricsService {
+            cost: Arc::new(RwLock::new(HashMap::new())),
+            latency: Arc::new(RwLock::new({
+                let mut m = HashMap::new();
+                m.insert("text-model".to_string(), 42.0);
+                m
+            })),
+        };
+        let models = vec!["text-model".to_string(), "no-data".to_string()];
+        let result = service
+            .rank_models_by_modality_latency(&models, Modality::Vision)
+            .await;
+        // bare-model fallback ranks first; the unseen model sorts last.
+        assert_eq!(result, vec!["text-model", "no-data"]);
+    }
+
     #[test]
     fn test_rank_by_ascending_metric_nan_treated_as_missing() {
         let models = vec![
diff --git a/crates/brightstaff/src/router/orchestrator.rs b/crates/brightstaff/src/router/orchestrator.rs
index 2d7b25dee..600946aa2 100644
--- a/crates/brightstaff/src/router/orchestrator.rs
+++ b/crates/brightstaff/src/router/orchestrator.rs
@@ -1,17 +1,21 @@
 use std::{borrow::Cow, collections::HashMap, sync::Arc, time::Duration};
 
 use common::{
-    configuration::{AgentUsagePreference, OrchestrationPreference, TopLevelRoutingPreference},
+    configuration::{
+        AgentUsagePreference, EmptyPoolBehavior, OrchestrationPreference, TopLevelRoutingPreference,
+    },
     consts::{ARCH_PROVIDER_HINT_HEADER, REQUEST_ID_HEADER},
 };
 use hermesllm::apis::openai::Message;
+use hermesllm::RequiredCapabilities;
 use hyper::header;
 use opentelemetry::global;
 use opentelemetry_http::HeaderInjector;
 use thiserror::Error;
-use tracing::{debug, info};
+use tracing::{debug, info, warn};
 
 use super::http::{self, post_and_extract_content};
+use super::model_capabilities::ModelCapabilitiesService;
 use super::model_metrics::ModelMetricsService;
 use super::orchestrator_model::OrchestratorModel;
 
@@ -31,6 +35,8 @@ pub struct OrchestratorService {
     orchestrator_provider_name: String,
     top_level_preferences: HashMap<String, TopLevelRoutingPreference>,
     metrics_service: Option<Arc<ModelMetricsService>>,
+    capabilities_service: Option<Arc<ModelCapabilitiesService>>,
+    empty_pool_behavior: EmptyPoolBehavior,
     session_cache: Option<Arc<dyn SessionCache>>,
     session_ttl: Duration,
     tenant_header: Option<String>,
@@ -43,6 +49,11 @@ pub enum OrchestrationError {
 
     #[error("Orchestrator model error: {0}")]
     OrchestratorModelError(#[from] super::orchestrator_model::OrchestratorModelError),
+
+    /// Tier 1 capability filtering removed every candidate from the matched
+    /// route and `empty_pool_behavior` is `error` (D3).
+    #[error("no capable model for route '{route}': request requires {requirement}")]
+    CapabilityFilterEmpty { route: String, requirement: String },
 }
 
 pub type Result<T> = std::result::Result<T, OrchestrationError>;
@@ -67,12 +78,46 @@ impl OrchestratorService {
             orchestrator_provider_name,
             top_level_preferences: HashMap::new(),
             metrics_service: None,
+            capabilities_service: None,
+            empty_pool_behavior: EmptyPoolBehavior::default(),
             session_cache: None,
             session_ttl: Duration::from_secs(DEFAULT_SESSION_TTL_SECONDS),
             tenant_header: None,
         }
     }
 
+    /// Attach the Tier 1 capability filter and the empty-pool policy (D3).
+    /// Builder-style so existing constructor call sites stay unchanged.
+    #[must_use]
+    pub fn with_capability_filter(
+        mut self,
+        capabilities_service: Option<Arc<ModelCapabilitiesService>>,
+        empty_pool_behavior: EmptyPoolBehavior,
+    ) -> Self {
+        self.capabilities_service = capabilities_service;
+        self.empty_pool_behavior = empty_pool_behavior;
+        self
+    }
+
+    /// Whether a single model can serve a request with the given required
+    /// capabilities (used for the no-preference-match validation path).
+    pub async fn is_model_capable(&self, model: &str, required: &RequiredCapabilities) -> bool {
+        match &self.capabilities_service {
+            Some(svc) if !required.is_unconstrained() => {
+                required.satisfied_by(&svc.capabilities_for(model).await)
+            }
+            _ => true,
+        }
+    }
+
+    pub fn empty_pool_behavior(&self) -> EmptyPoolBehavior {
+        self.empty_pool_behavior
+    }
+
+    pub fn has_capability_filter(&self) -> bool {
+        self.capabilities_service.is_some()
+    }
+
     #[allow(clippy::too_many_arguments)]
     pub fn with_routing(
         orchestrator_url: String,
@@ -106,6 +151,8 @@ impl OrchestratorService {
             orchestrator_provider_name,
             top_level_preferences,
             metrics_service,
+            capabilities_service: None,
+            empty_pool_behavior: EmptyPoolBehavior::default(),
             session_cache: Some(session_cache),
             session_ttl,
             tenant_header,
@@ -170,6 +217,7 @@ impl OrchestratorService {
         messages: &[Message],
         inline_routing_preferences: Option<Vec<TopLevelRoutingPreference>>,
         request_id: &str,
+        required: &RequiredCapabilities,
     ) -> Result<Option<(String, Vec<String>)>> {
         if messages.is_empty() {
             return Ok(None);
@@ -223,10 +271,27 @@ impl OrchestratorService {
                     .or_else(|| self.top_level_preferences.get(route_name));
 
                 if let Some(pref) = top_pref {
+                    // Tier 1: hard capability filter (intersection) before ranking.
+                    let effective_models = self
+                        .apply_capability_filter(route_name, &pref.models, required)
+                        .await?;
+
+                    // Tier 2: rank the surviving capable pool (preserves order for `none`).
                     let ranked = match &self.metrics_service {
-                        Some(svc) => svc.rank_models(&pref.models, &pref.selection_policy).await,
-                        None => pref.models.clone(),
+                        Some(svc) => {
+                            svc.rank_models(&effective_models, &pref.selection_policy)
+                                .await
+                        }
+                        None => effective_models.clone(),
                     };
+                    info!(
+                        route = %route_name,
+                        tier = "tier2",
+                        selection_policy = ?pref.selection_policy,
+                        capable_pool = effective_models.len(),
+                        selected = %ranked.first().map(|s| s.as_str()).unwrap_or(""),
+                        "Tier 2 preference ranking applied"
+                    );
                     Some((route_name.clone(), ranked))
                 } else {
                     None
@@ -246,6 +311,90 @@ impl OrchestratorService {
         Ok(result)
     }
 
+    /// Tier 1: intersect a route's model pool with the models capable of serving
+    /// the request shape, preserving preference order among survivors. When the
+    /// intersection is empty, honor `empty_pool_behavior` (D3): `error` returns a
+    /// typed error; `warning` logs and proceeds with the pre-filter pool.
+    async fn apply_capability_filter(
+        &self,
+        route_name: &str,
+        models: &[String],
+        required: &RequiredCapabilities,
+    ) -> Result<Vec<String>> {
+        let Some(svc) = &self.capabilities_service else {
+            return Ok(models.to_vec());
+        };
+        if required.is_unconstrained() {
+            return Ok(models.to_vec());
+        }
+
+        let started = std::time::Instant::now();
+        let mut capable = Vec::new();
+        for m in models {
+            let caps = svc.capabilities_for(m).await;
+            if required.satisfied_by(&caps) {
+                capable.push(m.clone());
+            }
+        }
+        let elapsed = started.elapsed();
+
+        if capable.is_empty() {
+            match self.empty_pool_behavior {
+                EmptyPoolBehavior::Error => {
+                    bs_metrics::record_capability_filter(
+                        route_name,
+                        metric_labels::CAPABILITY_FILTER_EMPTY_ERROR,
+                        models.len(),
+                        0,
+                        elapsed,
+                    );
+                    return Err(OrchestrationError::CapabilityFilterEmpty {
+                        route: route_name.to_string(),
+                        requirement: required.describe(),
+                    });
+                }
+                EmptyPoolBehavior::Warning => {
+                    bs_metrics::record_capability_filter(
+                        route_name,
+                        metric_labels::CAPABILITY_FILTER_EMPTY_WARNING,
+                        models.len(),
+                        0,
+                        elapsed,
+                    );
+                    warn!(
+                        route = %route_name,
+                        requirement = %required.describe(),
+                        "Tier 1 capability filter emptied the pool; empty_pool_behavior=warning, proceeding with pre-filter pool"
+                    );
+                    return Ok(models.to_vec());
+                }
+            }
+        }
+
+        let outcome = if capable.len() == models.len() {
+            metric_labels::CAPABILITY_FILTER_PASS
+        } else {
+            metric_labels::CAPABILITY_FILTER_FILTERED
+        };
+        bs_metrics::record_capability_filter(
+            route_name,
+            outcome,
+            models.len(),
+            capable.len(),
+            elapsed,
+        );
+        info!(
+            route = %route_name,
+            tier = "tier1",
+            pre_filter = models.len(),
+            post_filter = capable.len(),
+            requirement = %required.describe(),
+            filter_us = elapsed.as_micros() as u64,
+            "Tier 1 capability filter applied"
+        );
+        Ok(capable)
+    }
+
     // ---- Agent orchestration (existing) ----
 
     pub async fn determine_orchestration(
diff --git a/crates/brightstaff/src/router/stress_tests.rs b/crates/brightstaff/src/router/stress_tests.rs
index 6c3ffefd4..8dee7e116 100644
--- a/crates/brightstaff/src/router/stress_tests.rs
+++ b/crates/brightstaff/src/router/stress_tests.rs
@@ -107,7 +107,7 @@ mod tests {
         for _ in 0..10 {
             let msgs = make_messages(5);
             let _ = orchestrator_service
-                .determine_route(&msgs, None, "warmup")
+                .determine_route(&msgs, None, "warmup", &Default::default())
                 .await;
         }
 
@@ -124,7 +124,7 @@ mod tests {
                 None
             };
             let _ = orchestrator_service
-                .determine_route(&msgs, inline, &format!("req-{i}"))
+                .determine_route(&msgs, inline, &format!("req-{i}"), &Default::default())
                 .await;
         }
 
@@ -198,7 +198,7 @@ mod tests {
         for _ in 0..20 {
             let msgs = make_messages(3);
             let _ = orchestrator_service
-                .determine_route(&msgs, None, "warmup")
+                .determine_route(&msgs, None, "warmup", &Default::default())
                 .await;
         }
 
@@ -215,7 +215,7 @@ mod tests {
                 for r in 0..requests_per_task {
                     let msgs = make_messages(3 + (r % 8));
                     let _ = svc
-                        .determine_route(&msgs, None, &format!("req-{t}-{r}"))
+                        .determine_route(&msgs, None, &format!("req-{t}-{r}"), &Default::default())
                         .await;
                 }
             });
diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs
index 8aa521fa5..047c5e5ac 100644
--- a/crates/common/src/configuration.rs
+++ b/crates/common/src/configuration.rs
@@ -137,6 +137,10 @@ pub enum StateStorageType {
 pub enum SelectionPreference {
     Cheapest,
     Fastest,
+    /// Tier 2 ranking: rank the (already capability-filtered) pool by Plano's
+    /// internal long-context-quality score.
+    #[serde(rename = "long_context_quality")]
+    LongContextQuality,
     /// Return models in the same order they were defined — no reordering.
     #[default]
     #[serde(alias = "")]
@@ -235,6 +239,35 @@ pub struct Overrides {
     pub agent_orchestration_model: Option<String>,
     pub orchestrator_model_context_length: Option<usize>,
     pub disable_signals: Option<bool>,
+    /// What to do when Tier 1 capability filtering removes every candidate model
+    /// from a matched route (D3). Defaults to `error`.
+    #[serde(default)]
+    pub empty_pool_behavior: Option<EmptyPoolBehavior>,
+    /// Optional override for where model capabilities are sourced from. Defaults
+    /// to the public models.dev URL + vendored snapshot when unset.
+    #[serde(default)]
+    pub model_capabilities_source: Option<ModelCapabilitiesSource>,
+}
+
+/// Behavior when Tier 1 capability filtering empties a matched route's model pool.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
+#[serde(rename_all = "lowercase")]
+pub enum EmptyPoolBehavior {
+    /// Return an HTTP 422 with a clear message (no silent degrade).
+    #[default]
+    Error,
+    /// Log a warning and proceed with the developer's pre-filter pool as-is.
+    Warning,
+}
+
+/// Optional configuration for the models.dev capability source (nested under
+/// `overrides`). Defaults to the public models.dev URL when unset.
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
+pub struct ModelCapabilitiesSource {
+    /// models.dev-compatible API URL. Defaults to `https://models.dev/api.json`.
+    pub url: Option<String>,
+    /// Refresh interval in seconds. Omit to use the vendored seed only.
+    pub refresh_interval: Option<u64>,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize, Default)]
@@ -498,6 +531,11 @@ pub struct LlmProvider {
     pub internal: Option<bool>,
     pub passthrough_auth: Option<bool>,
     pub headers: Option<HashMap<String, String>>,
+    /// Optional per-model capability overrides (Tier 1 routing). All fields are
+    /// optional; anything left unset is resolved from the models.dev catalog,
+    /// then a conservative default. Precedence: user config > models.dev > default.
+    #[serde(default)]
+    pub capabilities: Option<hermesllm::ModelCapabilities>,
 }
 
 pub trait IntoModels {
@@ -542,6 +580,7 @@ impl Default for LlmProvider {
             internal: None,
             passthrough_auth: None,
             headers: None,
+            capabilities: None,
         }
     }
 }
diff --git a/crates/common/src/llm_providers.rs b/crates/common/src/llm_providers.rs
index b4355a2f5..d1181a98c 100644
--- a/crates/common/src/llm_providers.rs
+++ b/crates/common/src/llm_providers.rs
@@ -278,6 +278,7 @@ mod tests {
             stream: None,
             passthrough_auth: None,
             headers: None,
+            capabilities: None,
         }
     }
 
diff --git a/crates/hermesllm/src/apis/mod.rs b/crates/hermesllm/src/apis/mod.rs
index ea0563926..34e277fa1 100644
--- a/crates/hermesllm/src/apis/mod.rs
+++ b/crates/hermesllm/src/apis/mod.rs
@@ -1,6 +1,7 @@
 pub mod amazon_bedrock;
 pub mod anthropic;
 pub mod openai;
+pub mod openai_multimodal;
 pub mod openai_responses;
 pub mod streaming_shapes;
 
@@ -14,6 +15,9 @@ pub use openai::{
     ChatCompletionsRequest, ChatCompletionsResponse, ChatCompletionsStreamResponse, OpenAIApi,
 };
 pub use openai::{Message as OpenAIMessage, Tool as OpenAITool, ToolChoice as OpenAIToolChoice};
+pub use openai_multimodal::{
+    AudioSpeechRequest, ImageData, ImagesRequest, ImagesResponse, ImagesUsage,
+};
 
 pub trait ApiDefinition {
     /// Returns the endpoint path for this API
@@ -88,9 +92,11 @@ mod tests {
     fn test_all_variants_method() {
         // Test that all_variants returns the expected variants
         let openai_variants = OpenAIApi::all_variants();
-        assert_eq!(openai_variants.len(), 2);
+        assert_eq!(openai_variants.len(), 4);
         assert!(openai_variants.contains(&OpenAIApi::ChatCompletions));
         assert!(openai_variants.contains(&OpenAIApi::Responses));
+        assert!(openai_variants.contains(&OpenAIApi::Images));
+        assert!(openai_variants.contains(&OpenAIApi::Audio));
 
         let anthropic_variants = AnthropicApi::all_variants();
         assert_eq!(anthropic_variants.len(), 1);
diff --git a/crates/hermesllm/src/apis/openai.rs b/crates/hermesllm/src/apis/openai.rs
index 8e66f0ad6..3a5fd698a 100644
--- a/crates/hermesllm/src/apis/openai.rs
+++ b/crates/hermesllm/src/apis/openai.rs
@@ -11,7 +11,9 @@ use crate::providers::request::{ProviderRequest, ProviderRequestError};
 use crate::providers::response::{ProviderResponse, TokenUsage};
 use crate::providers::streaming_response::ProviderStreamResponse;
 use crate::transforms::lib::ExtractText;
-use crate::{CHAT_COMPLETIONS_PATH, OPENAI_RESPONSES_API_PATH};
+use crate::{
+    AUDIO_SPEECH_PATH, CHAT_COMPLETIONS_PATH, IMAGES_GENERATIONS_PATH, OPENAI_RESPONSES_API_PATH,
+};
 
 // ============================================================================
 // OPENAI API ENUMERATION
@@ -22,6 +24,10 @@ use crate::{CHAT_COMPLETIONS_PATH, OPENAI_RESPONSES_API_PATH};
 pub enum OpenAIApi {
     ChatCompletions,
     Responses,
+    /// Image generation (`/v1/images/generations`).
+    Images,
+    /// Text-to-speech / audio output (`/v1/audio/speech`). Returns binary audio.
+    Audio,
     // Future APIs can be added here:
     // Embeddings,
     // FineTuning,
@@ -33,6 +39,8 @@ impl ApiDefinition for OpenAIApi {
         match self {
             OpenAIApi::ChatCompletions => CHAT_COMPLETIONS_PATH,
             OpenAIApi::Responses => OPENAI_RESPONSES_API_PATH,
+            OpenAIApi::Images => IMAGES_GENERATIONS_PATH,
+            OpenAIApi::Audio => AUDIO_SPEECH_PATH,
         }
     }
 
@@ -40,6 +48,8 @@ impl ApiDefinition for OpenAIApi {
         match endpoint {
             CHAT_COMPLETIONS_PATH => Some(OpenAIApi::ChatCompletions),
             OPENAI_RESPONSES_API_PATH => Some(OpenAIApi::Responses),
+            IMAGES_GENERATIONS_PATH => Some(OpenAIApi::Images),
+            AUDIO_SPEECH_PATH => Some(OpenAIApi::Audio),
             _ => None,
         }
     }
@@ -48,6 +58,9 @@ impl ApiDefinition for OpenAIApi {
         match self {
             OpenAIApi::ChatCompletions => true,
             OpenAIApi::Responses => true,
+            // TTS supports streamed audio chunks; image generation does not.
+            OpenAIApi::Audio => true,
+            OpenAIApi::Images => false,
         }
     }
 
@@ -55,6 +68,8 @@ impl ApiDefinition for OpenAIApi {
         match self {
             OpenAIApi::ChatCompletions => true,
             OpenAIApi::Responses => true,
+            OpenAIApi::Images => false,
+            OpenAIApi::Audio => false,
         }
     }
 
@@ -62,11 +77,19 @@ impl ApiDefinition for OpenAIApi {
         match self {
             OpenAIApi::ChatCompletions => true,
             OpenAIApi::Responses => true,
+            // These are output modalities, not vision input.
+            OpenAIApi::Images => false,
+            OpenAIApi::Audio => false,
         }
     }
 
     fn all_variants() -> Vec<Self> {
-        vec![OpenAIApi::ChatCompletions, OpenAIApi::Responses]
+        vec![
+            OpenAIApi::ChatCompletions,
+            OpenAIApi::Responses,
+            OpenAIApi::Images,
+            OpenAIApi::Audio,
+        ]
     }
 }
 
@@ -1183,9 +1206,11 @@ mod tests {
 
         // Test all_variants
         let all_variants = OpenAIApi::all_variants();
-        assert_eq!(all_variants.len(), 2);
+        assert_eq!(all_variants.len(), 4);
         assert!(all_variants.contains(&OpenAIApi::ChatCompletions));
         assert!(all_variants.contains(&OpenAIApi::Responses));
+        assert!(all_variants.contains(&OpenAIApi::Images));
+        assert!(all_variants.contains(&OpenAIApi::Audio));
     }
 
     #[test]
diff --git a/crates/hermesllm/src/apis/openai_multimodal.rs b/crates/hermesllm/src/apis/openai_multimodal.rs
new file mode 100644
index 000000000..472effaf9
--- /dev/null
+++ b/crates/hermesllm/src/apis/openai_multimodal.rs
@@ -0,0 +1,183 @@
+//! OpenAI multimodal output APIs: image generation (`/v1/images/generations`)
+//! and text-to-speech (`/v1/audio/speech`).
+//!
+//! These are OpenAI-native serde shapes. Image responses are JSON; audio/speech
+//! responses are **binary** and are passed through untouched by the gateway
+//! (see `llm_gateway::stream_context`), so there is no audio *response* struct.
+
+use serde::{Deserialize, Serialize};
+use serde_with::skip_serializing_none;
+
+use crate::providers::request::ProviderRequestError;
+
+/// Request body for `POST /v1/images/generations`.
+#[skip_serializing_none]
+#[derive(Serialize, Deserialize, Debug, Clone, Default)]
+pub struct ImagesRequest {
+    #[serde(default)]
+    pub model: String,
+    pub prompt: String,
+    /// Number of images to generate.
+    pub n: Option<u32>,
+    /// e.g. `1024x1024`, `1792x1024`.
+    pub size: Option<String>,
+    /// e.g. `standard`, `hd`, or model-specific quality levels.
+    pub quality: Option<String>,
+    /// `url` or `b64_json`.
+    pub response_format: Option<String>,
+    pub style: Option<String>,
+    pub background: Option<String>,
+    pub output_format: Option<String>,
+    pub user: Option<String>,
+}
+
+impl ImagesRequest {
+    pub fn try_from_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
+        serde_json::from_slice(bytes)
+    }
+
+    pub fn model(&self) -> &str {
+        &self.model
+    }
+
+    pub fn set_model(&mut self, model: String) {
+        self.model = model;
+    }
+
+    pub fn to_bytes(&self) -> Result<Vec<u8>, ProviderRequestError> {
+        serde_json::to_vec(self).map_err(|e| ProviderRequestError {
+            message: format!("failed to serialize ImagesRequest: {}", e),
+            source: Some(Box::new(e)),
+        })
+    }
+}
+
+/// One generated image in an [`ImagesResponse`].
+#[skip_serializing_none]
+#[derive(Serialize, Deserialize, Debug, Clone, Default)]
+pub struct ImageData {
+    pub b64_json: Option<String>,
+    pub url: Option<String>,
+    pub revised_prompt: Option<String>,
+}
+
+/// Response body for `POST /v1/images/generations`.
+#[skip_serializing_none]
+#[derive(Serialize, Deserialize, Debug, Clone, Default)]
+pub struct ImagesResponse {
+    pub created: Option<u64>,
+    #[serde(default)]
+    pub data: Vec<ImageData>,
+    /// Some providers report token/image usage here.
+    pub usage: Option<ImagesUsage>,
+}
+
+/// Usage block for image generation (used for per-image cost accounting).
+#[skip_serializing_none]
+#[derive(Serialize, Deserialize, Debug, Clone, Default)]
+pub struct ImagesUsage {
+    /// Number of images produced (the primary billable unit).
+    pub images: Option<u32>,
+    pub input_tokens: Option<u32>,
+    pub output_tokens: Option<u32>,
+    pub total_tokens: Option<u32>,
+}
+
+impl ImagesResponse {
+    pub fn try_from_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
+        serde_json::from_slice(bytes)
+    }
+
+    /// Number of images produced (billable unit), preferring an explicit usage
+    /// count and falling back to the number of returned images.
+    pub fn image_units(&self) -> usize {
+        self.usage
+            .as_ref()
+            .and_then(|u| u.images)
+            .map(|n| n as usize)
+            .unwrap_or(self.data.len())
+    }
+}
+
+/// Request body for `POST /v1/audio/speech` (text-to-speech). The response is
+/// binary audio and is streamed/passed through without a typed response body.
+#[skip_serializing_none]
+#[derive(Serialize, Deserialize, Debug, Clone, Default)]
+pub struct AudioSpeechRequest {
+    #[serde(default)]
+    pub model: String,
+    /// Text to synthesize.
+    pub input: String,
+    /// Voice id (e.g. `alloy`, `verse`).
+    pub voice: String,
+    /// Output container, e.g. `mp3`, `wav`, `opus`, `pcm`.
+    pub response_format: Option<String>,
+    pub speed: Option<f32>,
+    pub instructions: Option<String>,
+    /// Whether the client requested streamed audio chunks.
+    pub stream: Option<bool>,
+}
+
+impl AudioSpeechRequest {
+    pub fn try_from_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
+        serde_json::from_slice(bytes)
+    }
+
+    pub fn model(&self) -> &str {
+        &self.model
+    }
+
+    pub fn set_model(&mut self, model: String) {
+        self.model = model;
+    }
+
+    pub fn is_streaming(&self) -> bool {
+        self.stream.unwrap_or(false)
+    }
+
+    /// Billable unit for TTS: number of input characters.
+    pub fn audio_units(&self) -> usize {
+        self.input.chars().count()
+    }
+
+    pub fn to_bytes(&self) -> Result<Vec<u8>, ProviderRequestError> {
+        serde_json::to_vec(self).map_err(|e| ProviderRequestError {
+            message: format!("failed to serialize AudioSpeechRequest: {}", e),
+            source: Some(Box::new(e)),
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn images_request_roundtrips() {
+        let raw = br#"{"model":"gpt-image-1","prompt":"a cat","n":2,"size":"1024x1024"}"#;
+        let req = ImagesRequest::try_from_bytes(raw).unwrap();
+        assert_eq!(req.model, "gpt-image-1");
+        assert_eq!(req.n, Some(2));
+        assert!(req.to_bytes().is_ok());
+    }
+
+    #[test]
+    fn images_response_counts_units() {
+        let raw = br#"{"created":1,"data":[{"b64_json":"aaa"},{"b64_json":"bbb"}]}"#;
+        let resp = ImagesResponse::try_from_bytes(raw).unwrap();
+        assert_eq!(resp.image_units(), 2);
+
+        let raw2 = br#"{"created":1,"data":[{"url":"x"}],"usage":{"images":5}}"#;
+        let resp2 = ImagesResponse::try_from_bytes(raw2).unwrap();
+        assert_eq!(resp2.image_units(), 5);
+    }
+
+    #[test]
+    fn audio_speech_request_units() {
+        let raw = br#"{"model":"gpt-4o-mini-tts","input":"hello","voice":"alloy"}"#;
+        let req = AudioSpeechRequest::try_from_bytes(raw).unwrap();
+        assert_eq!(req.audio_units(), 5);
+        assert_eq!(req.voice, "alloy");
+        assert!(!req.is_streaming());
+    }
+}
diff --git a/crates/hermesllm/src/bin/long_context_quality.yaml b/crates/hermesllm/src/bin/long_context_quality.yaml
new file mode 100644
index 000000000..f26af8b58
--- /dev/null
+++ b/crates/hermesllm/src/bin/long_context_quality.yaml
@@ -0,0 +1,77 @@
+# Internal long-context-quality (LCQ) dataset — Tier 2 routing signal.
+#
+# This is Plano-shipped data, NOT user config. Scores (0.0–1.0) estimate how well
+# a model stays coherent at long context lengths, seeded from public benchmarks
+# (RULER, HELMET, LongBench v2, NoLiMa). They are intentionally kept SEPARATE from
+# the capability catalog (capabilities are stable; LCQ drifts — see plan R5).
+#
+# Keys are canonical "<provider>/<model-id>" (see ProviderId::canonical_key); the
+# loader normalizes lookups so config aliases (e.g. google/ vs gemini/) resolve.
+#
+# Refresh cadence: review quarterly against the latest public leaderboards and
+# bump `dated`. Each entry records its `source` + `dated` for provenance; staleness
+# is surfaced in telemetry.
+version: "1.0"
+source: "RULER / HELMET / LongBench v2 / NoLiMa (public benchmarks)"
+dated: "2026-06-01"
+models:
+  gemini/gemini-2.5-pro:
+    score: 0.96
+    source: "RULER@128k, LongBench v2"
+    dated: "2026-06-01"
+  gemini/gemini-2.5-flash:
+    score: 0.90
+    source: "RULER@128k"
+    dated: "2026-06-01"
+  gemini/gemini-1.5-pro:
+    score: 0.88
+    source: "RULER@128k"
+    dated: "2026-06-01"
+  openai/gpt-4o:
+    score: 0.84
+    source: "HELMET@128k"
+    dated: "2026-06-01"
+  openai/gpt-4o-mini:
+    score: 0.74
+    source: "HELMET@128k"
+    dated: "2026-06-01"
+  openai/gpt-4.1:
+    score: 0.92
+    source: "HELMET@1M, RULER"
+    dated: "2026-06-01"
+  openai/o3:
+    score: 0.91
+    source: "LongBench v2"
+    dated: "2026-06-01"
+  anthropic/claude-opus-4-5:
+    score: 0.93
+    source: "LongBench v2, RULER@200k"
+    dated: "2026-06-01"
+  anthropic/claude-opus-4-8:
+    score: 0.94
+    source: "LongBench v2, RULER@200k"
+    dated: "2026-06-01"
+  anthropic/claude-3-5-sonnet:
+    score: 0.86
+    source: "RULER@200k"
+    dated: "2026-06-01"
+  anthropic/claude-3-5-haiku:
+    score: 0.78
+    source: "RULER@200k"
+    dated: "2026-06-01"
+  deepseek/deepseek-v4-pro:
+    score: 0.83
+    source: "LongBench v2"
+    dated: "2026-06-01"
+  qwen/qwen3-coder-plus:
+    score: 0.80
+    source: "RULER@128k"
+    dated: "2026-06-01"
+  moonshotai/kimi-k2-0905-preview:
+    score: 0.82
+    source: "LongBench v2"
+    dated: "2026-06-01"
+  zhipu/glm-5.1:
+    score: 0.79
+    source: "LongBench v2"
+    dated: "2026-06-01"
diff --git a/crates/hermesllm/src/clients/endpoints.rs b/crates/hermesllm/src/clients/endpoints.rs
index d7a9b4719..7b7136295 100644
--- a/crates/hermesllm/src/clients/endpoints.rs
+++ b/crates/hermesllm/src/clients/endpoints.rs
@@ -61,12 +61,18 @@ impl SupportedAPIsFromClient {
     /// Create a SupportedApi from an endpoint path
     pub fn from_endpoint(endpoint: &str) -> Option<Self> {
         if let Some(openai_api) = OpenAIApi::from_endpoint(endpoint) {
-            // Check if this is the Responses API endpoint
-            if openai_api == OpenAIApi::Responses {
-                return Some(SupportedAPIsFromClient::OpenAIResponsesAPI(openai_api));
+            match openai_api {
+                OpenAIApi::Responses => {
+                    return Some(SupportedAPIsFromClient::OpenAIResponsesAPI(openai_api));
+                }
+                OpenAIApi::ChatCompletions => {
+                    return Some(SupportedAPIsFromClient::OpenAIChatCompletions(openai_api));
+                }
+                // Image-generation / audio-speech are identified for routing
+                // (capability gating + binary passthrough) but are not parsed as
+                // chat-style client request bodies, so they don't resolve here.
+                OpenAIApi::Images | OpenAIApi::Audio => {}
             }
-            // Otherwise it's ChatCompletions
-            return Some(SupportedAPIsFromClient::OpenAIChatCompletions(openai_api));
         }
 
         if let Some(anthropic_api) = AnthropicApi::from_endpoint(endpoint) {
@@ -247,8 +253,13 @@ impl SupportedUpstreamAPIs {
 pub fn supported_endpoints() -> Vec<&'static str> {
     let mut endpoints = Vec::new();
 
-    // Add all OpenAI endpoints
+    // Add OpenAI client-routable endpoints. Image-generation / audio-speech
+    // endpoints are recognized for capability routing but not yet advertised as
+    // chat-style client endpoints (handled via native/binary passthrough).
     for api in OpenAIApi::all_variants() {
+        if matches!(api, OpenAIApi::Images | OpenAIApi::Audio) {
+            continue;
+        }
         endpoints.push(api.endpoint());
     }
 
@@ -310,9 +321,11 @@ mod tests {
     fn test_endpoints_generated_from_api_definitions() {
         let endpoints = supported_endpoints();
 
-        // Verify that we get endpoints from all API variants
+        // Verify that we get endpoints from all client-routable API variants.
+        // Image-generation / audio-speech are excluded (not chat-style clients).
         let openai_endpoints: Vec<_> = OpenAIApi::all_variants()
             .iter()
+            .filter(|api| !matches!(api, OpenAIApi::Images | OpenAIApi::Audio))
             .map(|api| api.endpoint())
             .collect();
         let anthropic_endpoints: Vec<_> = AnthropicApi::all_variants()
@@ -337,10 +350,14 @@ mod tests {
                 endpoint
             );
         }
-        // Total should match
+        // Total should match the client-routable variants (excludes Images/Audio).
+        let openai_client_routable = OpenAIApi::all_variants()
+            .iter()
+            .filter(|api| !matches!(api, OpenAIApi::Images | OpenAIApi::Audio))
+            .count();
         assert_eq!(
             endpoints.len(),
-            OpenAIApi::all_variants().len() + AnthropicApi::all_variants().len()
+            openai_client_routable + AnthropicApi::all_variants().len()
         );
     }
 
diff --git a/crates/hermesllm/src/lib.rs b/crates/hermesllm/src/lib.rs
index 3b9611e00..6323361ca 100644
--- a/crates/hermesllm/src/lib.rs
+++ b/crates/hermesllm/src/lib.rs
@@ -9,7 +9,13 @@ pub mod transforms;
 pub use apis::streaming_shapes::amazon_bedrock_binary_frame::BedrockBinaryFrameDecoder;
 pub use apis::streaming_shapes::sse::{SseEvent, SseStreamIter};
 pub use aws_smithy_eventstream::frame::DecodedFrame;
+pub use providers::capabilities::{
+    CapabilitiesCatalog, CapabilitiesSnapshot, ModelCapabilities, RequiredCapabilities,
+};
 pub use providers::id::ProviderId;
+pub use providers::long_context_quality::{
+    score_for as long_context_quality_score, LongContextQualityDataset,
+};
 pub use providers::request::{ProviderRequest, ProviderRequestError, ProviderRequestType};
 pub use providers::response::{
     ProviderResponse, ProviderResponseError, ProviderResponseType, TokenUsage,
@@ -20,6 +26,8 @@ pub use providers::streaming_response::{ProviderStreamResponse, ProviderStreamRe
 pub const CHAT_COMPLETIONS_PATH: &str = "/v1/chat/completions";
 pub const OPENAI_RESPONSES_API_PATH: &str = "/v1/responses";
 pub const MESSAGES_PATH: &str = "/v1/messages";
+pub const IMAGES_GENERATIONS_PATH: &str = "/v1/images/generations";
+pub const AUDIO_SPEECH_PATH: &str = "/v1/audio/speech";
 
 #[cfg(test)]
 mod tests {
diff --git a/crates/hermesllm/src/providers/capabilities.rs b/crates/hermesllm/src/providers/capabilities.rs
new file mode 100644
index 000000000..26f73a293
--- /dev/null
+++ b/crates/hermesllm/src/providers/capabilities.rs
@@ -0,0 +1,421 @@
+//! Model capability metadata (Tier 1 routing).
+//!
+//! Capabilities are objective, stable properties of a model: which modalities it
+//! accepts/produces and how large a context window it supports. They are sourced
+//! at runtime from [models.dev](https://models.dev) (fetched by brightstaff's
+//! `ModelCapabilitiesService`, mirroring how DigitalOcean pricing is fetched) and
+//! can be overridden per-model by user config. This module owns only the snapshot
+//! parsing and the canonical lookup — no data is vendored into the binary.
+//!
+//! Precedence (applied by the caller, e.g. brightstaff's `ModelCapabilitiesService`):
+//! `user config capabilities > models.dev > conservative default`.
+
+use crate::providers::id::ProviderId;
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+
+/// Objective, stable per-model capabilities. All fields are optional so that a
+/// user-config override can be merged field-by-field over the models.dev default.
+#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
+pub struct ModelCapabilities {
+    /// Maximum input context window (tokens).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub context_window: Option<u32>,
+    /// Maximum output tokens the model will emit.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub max_output_tokens: Option<u32>,
+    /// Accepts image input (vision).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub supports_vision: Option<bool>,
+    /// Produces images (`/v1/images/generations`).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub supports_image_generation: Option<bool>,
+    /// Produces audio (`/v1/audio/speech`).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub supports_audio_out: Option<bool>,
+}
+
+impl ModelCapabilities {
+    /// Resolve vision support, defaulting to text-only (`false`) when unknown.
+    pub fn vision(&self) -> bool {
+        self.supports_vision.unwrap_or(false)
+    }
+
+    /// Resolve image-generation support, defaulting to `false` when unknown.
+    pub fn image_generation(&self) -> bool {
+        self.supports_image_generation.unwrap_or(false)
+    }
+
+    /// Resolve audio-out support, defaulting to `false` when unknown.
+    pub fn audio_out(&self) -> bool {
+        self.supports_audio_out.unwrap_or(false)
+    }
+
+    /// Known context window, treating `0`/absent as "unknown" (no constraint).
+    pub fn window(&self) -> Option<u32> {
+        self.context_window.filter(|&w| w > 0)
+    }
+
+    /// Fill any `None` field on `self` from `fallback`. Used to apply precedence:
+    /// `user.fill_from(models_dev)` keeps user-set fields and backfills the rest.
+    pub fn fill_from(&self, fallback: &ModelCapabilities) -> ModelCapabilities {
+        ModelCapabilities {
+            context_window: self.context_window.or(fallback.context_window),
+            max_output_tokens: self.max_output_tokens.or(fallback.max_output_tokens),
+            supports_vision: self.supports_vision.or(fallback.supports_vision),
+            supports_image_generation: self
+                .supports_image_generation
+                .or(fallback.supports_image_generation),
+            supports_audio_out: self.supports_audio_out.or(fallback.supports_audio_out),
+        }
+    }
+}
+
+/// The capability requirements implied by a single request's shape (Tier 1).
+/// Computed from the endpoint + request content; checked against each candidate
+/// model's [`ModelCapabilities`].
+#[derive(Debug, Clone, Default, PartialEq, Eq)]
+pub struct RequiredCapabilities {
+    /// Request carries image input (vision).
+    pub vision: bool,
+    /// Request targets image generation (`/v1/images/generations`).
+    pub image_out: bool,
+    /// Request targets audio/TTS output (`/v1/audio/speech`).
+    pub audio_out: bool,
+    /// Minimum input context window required (estimated token count).
+    pub min_context_tokens: usize,
+}
+
+impl RequiredCapabilities {
+    /// Derive the modality requirements implied by an endpoint path. Vision and
+    /// context-token requirements are request-content-derived and set separately.
+    pub fn for_endpoint(path: &str) -> Self {
+        RequiredCapabilities {
+            image_out: path.contains("/images/generations"),
+            audio_out: path.contains("/audio/speech"),
+            ..Default::default()
+        }
+    }
+
+    /// True when this request imposes no capability constraints (plain text chat
+    /// that fits any window) — lets callers skip filtering entirely.
+    pub fn is_unconstrained(&self) -> bool {
+        !self.vision && !self.image_out && !self.audio_out && self.min_context_tokens == 0
+    }
+
+    /// Whether a model with the given capabilities can serve this request.
+    /// Unknown context windows are treated permissively (conservative default):
+    /// we only eliminate a model when we can *prove* the window is too small.
+    pub fn satisfied_by(&self, caps: &ModelCapabilities) -> bool {
+        if self.vision && !caps.vision() {
+            return false;
+        }
+        if self.image_out && !caps.image_generation() {
+            return false;
+        }
+        if self.audio_out && !caps.audio_out() {
+            return false;
+        }
+        if self.min_context_tokens > 0 {
+            if let Some(window) = caps.window() {
+                if self.min_context_tokens as u64 > window as u64 {
+                    return false;
+                }
+            }
+        }
+        true
+    }
+
+    /// Human-readable description of the unmet requirement(s) for error messages.
+    pub fn describe(&self) -> String {
+        let mut parts = Vec::new();
+        if self.vision {
+            parts.push("vision input".to_string());
+        }
+        if self.image_out {
+            parts.push("image generation".to_string());
+        }
+        if self.audio_out {
+            parts.push("audio output".to_string());
+        }
+        if self.min_context_tokens > 0 {
+            parts.push(format!(
+                "context window >= {} tokens",
+                self.min_context_tokens
+            ));
+        }
+        if parts.is_empty() {
+            "no special capabilities".to_string()
+        } else {
+            parts.join(", ")
+        }
+    }
+}
+
+/// On-disk shape of the vendored snapshot / models.dev refresh output.
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+pub struct CapabilitiesSnapshot {
+    #[serde(default)]
+    pub version: String,
+    #[serde(default)]
+    pub source: String,
+    #[serde(default)]
+    pub generated: String,
+    /// Keyed by canonical `"<provider>/<model_id>"` (see [`ProviderId::canonical_key`]).
+    #[serde(default)]
+    pub models: HashMap<String, ModelCapabilities>,
+}
+
+impl CapabilitiesSnapshot {
+    /// Parse a pre-built (canonical-keyed) snapshot from JSON bytes, i.e. the
+    /// `{ "models": { "<provider>/<id>": { ... } } }` shape this module emits.
+    pub fn from_json_slice(bytes: &[u8]) -> Result<Self, serde_json::Error> {
+        serde_json::from_slice(bytes)
+    }
+
+    /// Build a snapshot directly from a raw `models.dev` `api.json` payload,
+    /// applying the provider-key alias map and modality/limit mapping. Used by
+    /// the brightstaff runtime fetch/refresh.
+    pub fn from_models_dev_json(bytes: &[u8]) -> Result<Self, serde_json::Error> {
+        let providers: HashMap<String, ModelsDevProvider> = serde_json::from_slice(bytes)?;
+        let mut models = HashMap::new();
+        for (pkey, provider) in providers {
+            let Some(canon) = models_dev_provider_to_canonical(&pkey) else {
+                continue; // unmapped / aggregator provider
+            };
+            for (mid, m) in provider.models {
+                models.insert(format!("{}/{}", canon, mid), m.into_capabilities());
+            }
+        }
+        Ok(CapabilitiesSnapshot {
+            version: "1.0".to_string(),
+            source: "models.dev".to_string(),
+            generated: String::new(),
+            models,
+        })
+    }
+}
+
+/// models.dev provider key -> Plano canonical provider token (matches
+/// [`ProviderId::canonical_key`]). Aggregator/unmapped keys return `None`.
+pub fn models_dev_provider_to_canonical(provider_key: &str) -> Option<&'static str> {
+    Some(match provider_key {
+        "openai" => "openai",
+        "anthropic" => "anthropic",
+        "google" => "gemini",
+        "mistral" => "mistral",
+        "groq" => "groq",
+        "xai" => "xai",
+        "deepseek" => "deepseek",
+        "moonshotai" => "moonshotai",
+        "zhipuai" => "zhipu",
+        "xiaomi" => "xiaomi",
+        "togetherai" => "together_ai",
+        "amazon-bedrock" => "amazon_bedrock",
+        "digitalocean" => "digitalocean",
+        "openrouter" => "openrouter",
+        "vercel" => "vercel",
+        "github-models" => "github",
+        "alibaba" => "qwen",
+        _ => return None,
+    })
+}
+
+/// Raw models.dev per-provider shape (only the fields we map).
+#[derive(Debug, Deserialize)]
+struct ModelsDevProvider {
+    #[serde(default)]
+    models: HashMap<String, ModelsDevModel>,
+}
+
+#[derive(Debug, Deserialize)]
+struct ModelsDevModel {
+    #[serde(default)]
+    modalities: ModelsDevModalities,
+    #[serde(default)]
+    limit: ModelsDevLimit,
+}
+
+#[derive(Debug, Default, Deserialize)]
+struct ModelsDevModalities {
+    #[serde(default)]
+    input: Vec<String>,
+    #[serde(default)]
+    output: Vec<String>,
+}
+
+#[derive(Debug, Default, Deserialize)]
+struct ModelsDevLimit {
+    #[serde(default)]
+    context: Option<u32>,
+    #[serde(default)]
+    output: Option<u32>,
+}
+
+impl ModelsDevModel {
+    fn into_capabilities(self) -> ModelCapabilities {
+        ModelCapabilities {
+            context_window: self.limit.context,
+            max_output_tokens: self.limit.output,
+            supports_vision: Some(self.modalities.input.iter().any(|s| s == "image")),
+            supports_image_generation: Some(self.modalities.output.iter().any(|s| s == "image")),
+            supports_audio_out: Some(self.modalities.output.iter().any(|s| s == "audio")),
+        }
+    }
+}
+
+/// In-memory capability catalog keyed by canonical `"<provider>/<model_id>"`.
+#[derive(Debug, Clone, Default)]
+pub struct CapabilitiesCatalog {
+    models: HashMap<String, ModelCapabilities>,
+}
+
+impl CapabilitiesCatalog {
+    pub fn new(models: HashMap<String, ModelCapabilities>) -> Self {
+        Self { models }
+    }
+
+    pub fn from_snapshot(snapshot: CapabilitiesSnapshot) -> Self {
+        Self::new(snapshot.models)
+    }
+
+    /// Number of models in the catalog.
+    pub fn len(&self) -> usize {
+        self.models.len()
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.models.is_empty()
+    }
+
+    /// Look up capabilities for a `"<provider>/<model_id>"` string, normalizing
+    /// the provider token to its canonical form. Returns `None` when the provider
+    /// is unknown or the model is absent from the catalog.
+    pub fn get(&self, model: &str) -> Option<&ModelCapabilities> {
+        let key = canonical_model_key(model)?;
+        self.models.get(&key)
+    }
+}
+
+/// Normalize a `"<provider>/<model_id>"` string into the canonical catalog key
+/// `"<canonical_provider>/<model_id>"`. Splits on the first `/` so model ids that
+/// themselves contain `/` (e.g. `meta-llama/Llama-3.3-70B`) are preserved.
+pub fn canonical_model_key(model: &str) -> Option<String> {
+    let (provider, model_id) = model.split_once('/')?;
+    let canonical = ProviderId::try_from(provider).ok()?.canonical_key();
+    Some(format!("{}/{}", canonical, model_id))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn canonical_key_normalizes_provider_aliases() {
+        // google -> gemini, x-ai handled via try_from
+        assert_eq!(
+            canonical_model_key("google/gemini-2.5-pro").as_deref(),
+            Some("gemini/gemini-2.5-pro")
+        );
+        assert_eq!(
+            canonical_model_key("openai/gpt-4o").as_deref(),
+            Some("openai/gpt-4o")
+        );
+        // model id containing a slash is preserved
+        assert_eq!(
+            canonical_model_key("together_ai/meta-llama/Llama-3.3-70B").as_deref(),
+            Some("together_ai/meta-llama/Llama-3.3-70B")
+        );
+        // unknown provider -> None
+        assert!(canonical_model_key("notaprovider/foo").is_none());
+        // no provider prefix -> None
+        assert!(canonical_model_key("gpt-4o").is_none());
+    }
+
+    #[test]
+    fn catalog_resolves_known_model_and_defaults_unknown() {
+        // A catalog built from a models.dev payload resolves known models by
+        // canonical key; absent models fall back to the conservative default.
+        let raw = br#"{
+            "openai": { "models": {
+                "gpt-4o": {
+                    "modalities": { "input": ["text","image"], "output": ["text"] },
+                    "limit": { "context": 128000, "output": 16384 }
+                }
+            }}
+        }"#;
+        let snapshot = CapabilitiesSnapshot::from_models_dev_json(raw).unwrap();
+        let catalog = CapabilitiesCatalog::from_snapshot(snapshot);
+
+        let caps = catalog.get("openai/gpt-4o").cloned().unwrap_or_default();
+        assert!(caps.vision());
+        assert!(!caps.image_generation());
+        assert_eq!(caps.window(), Some(128000));
+
+        // Unknown model -> conservative default (text-only, unknown window).
+        let missing = catalog
+            .get("openai/totally-made-up-model")
+            .cloned()
+            .unwrap_or_default();
+        assert!(!missing.vision());
+        assert!(!missing.audio_out());
+        assert_eq!(missing.window(), None);
+    }
+
+    #[test]
+    fn models_dev_raw_json_maps_to_canonical_capabilities() {
+        let raw = br#"{
+            "google": { "models": {
+                "gemini-2.5-pro": {
+                    "modalities": { "input": ["text","image"], "output": ["text"] },
+                    "limit": { "context": 1048576, "output": 65536 }
+                }
+            }},
+            "requesty": { "models": {
+                "openai/gpt-4o": { "modalities": { "input": ["text"], "output": ["text"] } }
+            }}
+        }"#;
+        let snapshot = CapabilitiesSnapshot::from_models_dev_json(raw).unwrap();
+        // google -> gemini canonical key
+        let caps = snapshot.models.get("gemini/gemini-2.5-pro").unwrap();
+        assert_eq!(caps.supports_vision, Some(true));
+        assert_eq!(caps.context_window, Some(1048576));
+        assert_eq!(caps.max_output_tokens, Some(65536));
+        // aggregator provider "requesty" is skipped
+        assert!(snapshot.models.keys().all(|k| !k.contains("requesty")));
+    }
+
+    #[test]
+    fn provider_alias_map_matches_canonical_keys() {
+        assert_eq!(models_dev_provider_to_canonical("google"), Some("gemini"));
+        assert_eq!(
+            models_dev_provider_to_canonical("amazon-bedrock"),
+            Some("amazon_bedrock")
+        );
+        assert_eq!(models_dev_provider_to_canonical("requesty"), None);
+        // Alias targets must be valid canonical provider tokens (round-trip).
+        for key in ["google", "amazon-bedrock", "togetherai", "github-models"] {
+            let canon = models_dev_provider_to_canonical(key).unwrap();
+            let provider = ProviderId::try_from(canon).expect("canonical token must parse");
+            assert_eq!(provider.canonical_key(), canon);
+        }
+    }
+
+    #[test]
+    fn fill_from_applies_precedence() {
+        let user = ModelCapabilities {
+            context_window: Some(128000),
+            ..Default::default()
+        };
+        let models_dev = ModelCapabilities {
+            context_window: Some(200000),
+            supports_vision: Some(true),
+            ..Default::default()
+        };
+        let resolved = user.fill_from(&models_dev);
+        // user override wins for context_window
+        assert_eq!(resolved.context_window, Some(128000));
+        // models.dev backfills vision
+        assert_eq!(resolved.supports_vision, Some(true));
+    }
+}
diff --git a/crates/hermesllm/src/providers/id.rs b/crates/hermesllm/src/providers/id.rs
index 91b744de1..ff0e32e06 100644
--- a/crates/hermesllm/src/providers/id.rs
+++ b/crates/hermesllm/src/providers/id.rs
@@ -91,6 +91,38 @@ impl TryFrom<&str> for ProviderId {
 }
 
 impl ProviderId {
+    /// Stable lowercase token used as the canonical provider key for capability
+    /// lookups (see `providers::capabilities`). Must round-trip through
+    /// `ProviderId::try_from`, and the capability snapshot generator maps
+    /// models.dev provider keys to these same tokens.
+    pub fn canonical_key(&self) -> &'static str {
+        match self {
+            ProviderId::OpenAI => "openai",
+            ProviderId::Xiaomi => "xiaomi",
+            ProviderId::Mistral => "mistral",
+            ProviderId::Deepseek => "deepseek",
+            ProviderId::Groq => "groq",
+            ProviderId::Gemini => "gemini",
+            ProviderId::Anthropic => "anthropic",
+            ProviderId::GitHub => "github",
+            ProviderId::Plano => "plano",
+            ProviderId::AzureOpenAI => "azure_openai",
+            ProviderId::XAI => "xai",
+            ProviderId::TogetherAI => "together_ai",
+            ProviderId::Ollama => "ollama",
+            ProviderId::Moonshotai => "moonshotai",
+            ProviderId::Zhipu => "zhipu",
+            ProviderId::Qwen => "qwen",
+            ProviderId::AmazonBedrock => "amazon_bedrock",
+            ProviderId::ChatGPT => "chatgpt",
+            ProviderId::DigitalOcean => "digitalocean",
+            ProviderId::Vercel => "vercel",
+            ProviderId::OpenRouter => "openrouter",
+            ProviderId::Astraflow => "astraflow",
+            ProviderId::AstraflowCN => "astraflow_cn",
+        }
+    }
+
     /// Get all available models for this provider
     /// Returns model names without the provider prefix (e.g., "gpt-4" not "openai/gpt-4")
     pub fn models(&self) -> Vec<String> {
diff --git a/crates/hermesllm/src/providers/long_context_quality.rs b/crates/hermesllm/src/providers/long_context_quality.rs
new file mode 100644
index 000000000..73fae1cc9
--- /dev/null
+++ b/crates/hermesllm/src/providers/long_context_quality.rs
@@ -0,0 +1,123 @@
+//! Internal long-context-quality (LCQ) dataset — the v1 Tier 2 routing signal.
+//!
+//! LCQ scores estimate how well a model stays coherent at long context lengths.
+//! They are **Plano-internal** data seeded from public benchmarks (RULER, HELMET,
+//! LongBench v2, NoLiMa), loaded from a vendored YAML file. They are intentionally
+//! kept separate from the capability catalog: capabilities are stable, LCQ drifts.
+//!
+//! Used by brightstaff's `rank_models` when `selection_policy.prefer = long_context_quality`.
+
+use crate::providers::capabilities::canonical_model_key;
+use serde::Deserialize;
+use std::collections::HashMap;
+use std::sync::OnceLock;
+
+static LONG_CONTEXT_QUALITY_YAML: &str = include_str!(concat!(
+    env!("CARGO_MANIFEST_DIR"),
+    "/src/bin/long_context_quality.yaml"
+));
+
+/// One model's LCQ entry, carrying provenance.
+#[derive(Debug, Clone, Deserialize)]
+pub struct LcqEntry {
+    pub score: f64,
+    #[serde(default)]
+    pub source: String,
+    #[serde(default)]
+    pub dated: String,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+struct LcqFile {
+    #[serde(default)]
+    version: String,
+    #[serde(default)]
+    source: String,
+    #[serde(default)]
+    dated: String,
+    #[serde(default)]
+    models: HashMap<String, LcqEntry>,
+}
+
+/// The internal LCQ dataset, parsed once.
+#[derive(Debug, Clone)]
+pub struct LongContextQualityDataset {
+    pub version: String,
+    pub source: String,
+    /// Provenance date of the dataset (used for staleness telemetry).
+    pub dated: String,
+    models: HashMap<String, LcqEntry>,
+}
+
+impl LongContextQualityDataset {
+    fn parse(yaml: &str) -> Self {
+        let file: LcqFile = serde_yaml::from_str(yaml).expect("parse long_context_quality.yaml");
+        LongContextQualityDataset {
+            version: file.version,
+            source: file.source,
+            dated: file.dated,
+            models: file.models,
+        }
+    }
+
+    /// Score for a `"<provider>/<model_id>"` string, normalizing the provider
+    /// token so config aliases resolve. Returns `None` when not benchmarked.
+    pub fn score_for(&self, model: &str) -> Option<f64> {
+        // Try the canonical key first, then the raw string as a fallback.
+        if let Some(key) = canonical_model_key(model) {
+            if let Some(entry) = self.models.get(&key) {
+                return Some(entry.score);
+            }
+        }
+        self.models.get(model).map(|e| e.score)
+    }
+
+    pub fn len(&self) -> usize {
+        self.models.len()
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.models.is_empty()
+    }
+}
+
+/// The vendored LCQ dataset, parsed once and shared.
+pub fn dataset() -> &'static LongContextQualityDataset {
+    static DATA: OnceLock<LongContextQualityDataset> = OnceLock::new();
+    DATA.get_or_init(|| LongContextQualityDataset::parse(LONG_CONTEXT_QUALITY_YAML))
+}
+
+/// Convenience: LCQ score for a model from the vendored dataset.
+pub fn score_for(model: &str) -> Option<f64> {
+    dataset().score_for(model)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn dataset_loads_with_provenance() {
+        let ds = dataset();
+        assert!(!ds.is_empty());
+        assert!(
+            !ds.dated.is_empty(),
+            "dataset should carry a provenance date"
+        );
+        assert!(!ds.source.is_empty());
+    }
+
+    #[test]
+    fn scores_resolve_with_alias_normalization() {
+        // google/ alias normalizes to canonical gemini/
+        let via_alias = score_for("google/gemini-2.5-pro");
+        let via_canonical = score_for("gemini/gemini-2.5-pro");
+        assert!(via_alias.is_some());
+        assert_eq!(via_alias, via_canonical);
+    }
+
+    #[test]
+    fn unknown_model_has_no_score() {
+        assert!(score_for("openai/no-such-model").is_none());
+    }
+}
diff --git a/crates/hermesllm/src/providers/mod.rs b/crates/hermesllm/src/providers/mod.rs
index 4343022f4..b09d5b1e5 100644
--- a/crates/hermesllm/src/providers/mod.rs
+++ b/crates/hermesllm/src/providers/mod.rs
@@ -3,12 +3,20 @@
 //! This module contains provider-specific implementations that handle
 //! request/response conversion for different LLM service APIs.
 //!
+pub mod capabilities;
 pub mod id;
+pub mod long_context_quality;
 pub mod request;
 pub mod response;
 pub mod streaming_response;
 
+pub use capabilities::{
+    CapabilitiesCatalog, CapabilitiesSnapshot, ModelCapabilities, RequiredCapabilities,
+};
 pub use id::ProviderId;
+pub use long_context_quality::{
+    score_for as long_context_quality_score, LongContextQualityDataset,
+};
 pub use request::{ProviderRequest, ProviderRequestError, ProviderRequestType};
 pub use response::{ProviderResponse, ProviderResponseType, TokenUsage};
 pub use streaming_response::{ProviderStreamResponse, ProviderStreamResponseType};
diff --git a/crates/hermesllm/src/providers/request.rs b/crates/hermesllm/src/providers/request.rs
index bcc0eafd2..75bdd3359 100644
--- a/crates/hermesllm/src/providers/request.rs
+++ b/crates/hermesllm/src/providers/request.rs
@@ -58,6 +58,25 @@ pub trait ProviderRequest: Send + Sync {
     /// Set message history from OpenAI Message format
     /// This converts OpenAI messages to the appropriate format for each provider type
     fn set_messages(&mut self, messages: &[crate::apis::openai::Message]);
+
+    /// Request-shape introspection: does any message carry image input (vision)?
+    /// Drives Tier 1 capability filtering (`supports_vision`).
+    fn has_vision(&self) -> bool {
+        use crate::apis::openai::{ContentPart, MessageContent};
+        self.get_messages().iter().any(|m| match &m.content {
+            Some(MessageContent::Parts(parts)) => parts
+                .iter()
+                .any(|p| matches!(p, ContentPart::ImageUrl { .. })),
+            _ => false,
+        })
+    }
+
+    /// Approximate input token count for context-window filtering (Tier 1).
+    /// Uses a coarse ~4-chars-per-token heuristic over message text; precise
+    /// tokenization is unnecessary for a "fits the window" hard gate.
+    fn required_context_tokens(&self) -> usize {
+        self.extract_messages_text().chars().count() / 4
+    }
 }
 
 impl ProviderRequestType {
diff --git a/crates/hermesllm/src/providers/response.rs b/crates/hermesllm/src/providers/response.rs
index b8565ddf3..b2383e4a1 100644
--- a/crates/hermesllm/src/providers/response.rs
+++ b/crates/hermesllm/src/providers/response.rs
@@ -37,6 +37,16 @@ pub trait TokenUsage {
     fn reasoning_tokens(&self) -> Option<usize> {
         None
     }
+    /// Number of generated images (billable unit for `/v1/images/generations`).
+    /// `None` for text/audio responses.
+    fn image_units(&self) -> Option<usize> {
+        None
+    }
+    /// Number of synthesized audio units — characters or seconds depending on the
+    /// provider (billable unit for `/v1/audio/speech`). `None` for non-audio.
+    fn audio_units(&self) -> Option<usize> {
+        None
+    }
 }
 
 /// Rich usage breakdown extracted from a provider response.
diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs
index fa9964dd2..41e731c57 100644
--- a/crates/llm_gateway/src/stream_context.rs
+++ b/crates/llm_gateway/src/stream_context.rs
@@ -56,6 +56,9 @@ pub struct StreamContext {
     http_protocol: Option<String>,
     sse_buffer: Option<SseStreamBuffer>,
     sse_chunk_processor: Option<SseChunkProcessor>,
+    /// True when the upstream response is binary (e.g. `audio/*` from
+    /// `/v1/audio/speech`) and must be passed through without JSON parsing.
+    response_is_binary: bool,
 }
 
 impl StreamContext {
@@ -87,6 +90,7 @@ impl StreamContext {
             http_protocol: None,
             sse_buffer: None,
             sse_chunk_processor: None,
+            response_is_binary: false,
         }
     }
 
@@ -1152,6 +1156,27 @@ impl HttpContext for StreamContext {
             }
         }
 
+        // Detect binary (non-JSON) response bodies — e.g. audio from
+        // /v1/audio/speech. These must be passed through untouched: the gateway
+        // otherwise JSON-parses and re-serializes every non-streaming response,
+        // which would corrupt binary audio. Keep content-length intact for these.
+        let content_type = self
+            .get_http_response_header("content-type")
+            .unwrap_or_default()
+            .to_ascii_lowercase();
+        self.response_is_binary = content_type.starts_with("audio/")
+            || content_type.starts_with("image/")
+            || content_type == "application/octet-stream";
+
+        if self.response_is_binary {
+            debug!(
+                "request_id={}: binary response (content-type={}), passing through untouched",
+                self.request_identifier(),
+                content_type
+            );
+            return Action::Continue;
+        }
+
         self.remove_http_response_header("content-length");
         self.remove_http_response_header("content-encoding");
 
@@ -1183,6 +1208,20 @@ impl HttpContext for StreamContext {
             return Action::Continue;
         }
 
+        // Binary responses (e.g. audio/* from /v1/audio/speech) are passed
+        // through without JSON parsing or token accounting (G3 / WS8).
+        if self.response_is_binary {
+            debug!(
+                "request_id={}: binary response body passthrough, body_size={}",
+                self.request_identifier(),
+                body_size
+            );
+            if end_of_stream {
+                self.handle_end_of_request_metrics_and_traces(current_time);
+            }
+            return Action::Continue;
+        }
+
         // Check if this is an error response from upstream
         if let Some(status_code) = &self.upstream_status_code {
             if status_code.is_client_error() || status_code.is_server_error() {
diff --git a/docs/source/resources/includes/plano_config_full_reference.yaml b/docs/source/resources/includes/plano_config_full_reference.yaml
index 2231a01f9..20f17d7ed 100644
--- a/docs/source/resources/includes/plano_config_full_reference.yaml
+++ b/docs/source/resources/includes/plano_config_full_reference.yaml
@@ -55,6 +55,32 @@ model_providers:
     headers:
       User-Agent: "KimiCLI/1.3"
 
+  # Multimodal models — capabilities (vision / image-out / audio-out / context
+  # window) are fetched at runtime from models.dev (like cost/latency metrics),
+  # so you normally declare NOTHING. Add an optional `capabilities:` block only to
+  # override models.dev or to describe a model models.dev doesn't have.
+  # Precedence: capabilities block > models.dev > conservative default.
+  - model: openai/gpt-image-1          # serves /v1/images/generations
+    access_key: $OPENAI_API_KEY
+
+  - model: openai/gpt-4o-mini-tts      # serves /v1/audio/speech (TTS)
+    access_key: $OPENAI_API_KEY
+
+  # Override example: models.dev reports 200k, but this deployment is pinned to 128k.
+  - model: anthropic/claude-opus-4-1-128k
+    access_key: $ANTHROPIC_API_KEY
+    capabilities:
+      context_window: 128000
+
+  # Custom/self-hosted model models.dev doesn't know — declare its capabilities.
+  - model: openai/llama-3.3-70b-vision
+    base_url: https://api.custom-provider.com
+    http_host: api.custom-provider.com
+    access_key: $CUSTOM_API_KEY
+    capabilities:
+      context_window: 128000
+      supports_vision: true
+
 # Model aliases - use friendly names instead of full provider model names
 model_aliases:
   fast-llm:
@@ -69,8 +95,14 @@ model_aliases:
 # uses models[0] as primary and retries with models[1], models[2]... on 429/5xx.
 # Requires overrides.llm_routing_model to point at Plano-Orchestrator (or equivalent).
 # Each model in `models` must be declared in model_providers above.
-# selection_policy is optional: {prefer: cheapest|fastest|none} lets the router
-# reorder candidates using live cost/latency data from model_metrics_sources.
+# selection_policy is optional: {prefer: cheapest|fastest|long_context_quality|none}
+# lets the router reorder candidates. cheapest/fastest use live cost/latency data
+# from model_metrics_sources; long_context_quality ranks by Plano's internal
+# benchmark-seeded long-context-quality score (supplied internally, not user-authored).
+#
+# Tier 1 capability filters (modality + context-window fit) apply AUTOMATICALLY from
+# the request shape: each preset's `models` pool is narrowed to the models that can
+# physically serve the request before selection_policy ranks the survivors.
 routing_preferences:
   - name: code generation
     description: generating new code snippets, functions, or boilerplate based on user prompts or requirements
@@ -86,6 +118,36 @@ routing_preferences:
     selection_policy:
       prefer: cheapest
 
+  # long-context preset — ranks the window-sufficient pool by long-context quality.
+  - name: long document analysis
+    description: summarizing or querying very large documents and codebases
+    models:
+      - anthropic/claude-opus-4-1-128k
+      - openai/gpt-4o
+    selection_policy:
+      prefer: long_context_quality
+
+  # vision preset — pool auto-filtered to vision-capable models (Tier 1).
+  - name: image understanding
+    description: answering questions about uploaded images, screenshots, or diagrams
+    models:
+      - anthropic/claude-sonnet-4-0
+      - openai/gpt-4o
+
+  # image-generation preset — serves /v1/images/generations (Tier 1).
+  - name: image generation
+    description: creating or editing images from a text prompt
+    models:
+      - openai/gpt-image-1
+    selection_policy:
+      prefer: cheapest
+
+  # TTS preset — serves /v1/audio/speech (Tier 1).
+  - name: text to speech
+    description: synthesizing spoken audio from text
+    models:
+      - openai/gpt-4o-mini-tts
+
 # HTTP listeners - entry points for agent routing, prompt targets, and direct LLM access
 listeners:
   # Agent listener for routing requests to multiple agents
@@ -198,6 +260,17 @@ overrides:
   # Disable agentic signal analysis (frustration, repetition, escalation, etc.)
   # on LLM responses to save CPU. Default: false.
   disable_signals: false
+  # Tier 1 capability filtering is a hard gate. When it removes every model in a
+  # matched route, "error" returns HTTP 422 (default); "warning" logs and proceeds
+  # with the developer's pre-filter pool. This is the only lever that lets
+  # routing_preferences win over capability.
+  empty_pool_behavior: error
+  # Optional — model capabilities are fetched at runtime from models.dev. Set this
+  # only to change the URL or refresh cadence; omit refresh_interval to fetch once
+  # at startup (no periodic refresh).
+  model_capabilities_source:
+    url: https://models.dev/api.json
+    refresh_interval: 86400   # seconds (daily)
 
 # Model affinity — pin routing decisions for agentic loops
 routing: