diff --git a/cli/test/test_config_generator.py b/cli/test/test_config_generator.py index 9aade29e3..45d28c9ec 100644 --- a/cli/test/test_config_generator.py +++ b/cli/test/test_config_generator.py @@ -823,3 +823,98 @@ def test_apply_kimi_code_provider_defaults_injects_user_agent(): apply_kimi_code_provider_defaults(provider) assert provider["base_url"] == "https://api.kimi.com/coding/v1" assert provider["headers"]["User-Agent"] == "KimiCLI/1.3" + + +def _load_schema(): + from jsonschema import validate + + with open("../config/plano_config_schema.yaml", "r") as f: + return validate, yaml.safe_load(f.read()) + + +def test_schema_accepts_capabilities_and_signal_routing(): + """Capability/signal-aware routing fields validate against the schema.""" + validate, schema = _load_schema() + config = yaml.safe_load(""" +version: v0.4.0 +listeners: + - type: model + name: model_1 + address: 0.0.0.0 + port: 12000 +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true + - model: anthropic/claude-opus-4-1-128k + access_key: $ANTHROPIC_API_KEY + capabilities: + context_window: 128000 + - model: openai/llama-3.3-70b-vision + base_url: https://api.custom-provider.com + access_key: $CUSTOM_API_KEY + capabilities: + context_window: 128000 + supports_vision: true + supports_image_generation: false + supports_audio_out: false + max_output_tokens: 8192 +routing_preferences: + - name: long document analysis + description: summarizing or querying very large documents + models: + - anthropic/claude-opus-4-1-128k + - openai/gpt-4o + selection_policy: + prefer: long_context_quality +overrides: + llm_routing_model: Plano-Orchestrator + empty_pool_behavior: warning + model_capabilities_source: + url: https://models.dev/api.json + refresh_interval: 86400 +""") + validate(config, schema) + + +def test_schema_rejects_unknown_capability_field(): + """capabilities block is closed (additionalProperties: false).""" + from jsonschema import ValidationError + + validate, schema = _load_schema() + config = yaml.safe_load(""" +version: v0.4.0 +listeners: + - type: model + name: model_1 + address: 0.0.0.0 + port: 12000 +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + capabilities: + supports_telepathy: true +""") + with pytest.raises(ValidationError): + validate(config, schema) + + +def test_schema_rejects_invalid_empty_pool_behavior(): + from jsonschema import ValidationError + + validate, schema = _load_schema() + config = yaml.safe_load(""" +version: v0.4.0 +listeners: + - type: model + name: model_1 + address: 0.0.0.0 + port: 12000 +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY +overrides: + empty_pool_behavior: explode +""") + with pytest.raises(ValidationError): + validate(config, schema) diff --git a/config/plano_config_schema.yaml b/config/plano_config_schema.yaml index 2ecf38921..cf4f6fc92 100644 --- a/config/plano_config_schema.yaml +++ b/config/plano_config_schema.yaml @@ -184,22 +184,54 @@ properties: enum: - plano - claude + - anthropic - deepseek - groq - mistral - openai - xiaomi - gemini + - xai + - together_ai + - azure_openai + - ollama + - zhipu + - qwen + - amazon_bedrock - chatgpt - digitalocean - vercel - openrouter - moonshotai + - astraflow + - astraflow_cn headers: type: object additionalProperties: type: string description: "Additional headers to send with upstream requests (e.g., ChatGPT-Account-Id, originator)." + capabilities: + type: object + additionalProperties: false + description: "Optional model capability overrides. Capabilities default from models.dev (vendored seed + runtime refresh); declare these only to override models.dev or to describe a model models.dev doesn't have. Precedence: this block > models.dev > conservative default." + properties: + context_window: + type: integer + minimum: 1 + description: "Maximum total context (input + output) tokens the model accepts." + max_output_tokens: + type: integer + minimum: 1 + description: "Maximum output tokens the model can produce." + supports_vision: + type: boolean + description: "Whether the model accepts image input (vision)." + supports_image_generation: + type: boolean + description: "Whether the model serves /v1/images/generations." + supports_audio_out: + type: boolean + description: "Whether the model serves /v1/audio/speech (TTS)." routing_preferences: type: array description: "[DEPRECATED] Inline routing_preferences under a model_provider are auto-migrated to the top-level routing_preferences list by the config generator. New configs should declare routing_preferences at the top level with an explicit models: [...] list. See docs/routing-api.md." @@ -243,17 +275,27 @@ properties: enum: - plano - claude + - anthropic - deepseek - groq - mistral - openai - xiaomi - gemini + - xai + - together_ai + - azure_openai + - ollama + - zhipu + - qwen + - amazon_bedrock - chatgpt - digitalocean - vercel - openrouter - moonshotai + - astraflow + - astraflow_cn headers: type: object additionalProperties: @@ -316,6 +358,26 @@ properties: orchestrator_model_context_length: type: integer description: "Maximum token length for the orchestrator/routing model context window. Default is 8192." + empty_pool_behavior: + type: string + enum: + - error + - warning + default: error + description: "Tier 1 capability filtering is a hard gate. When it removes every model in a matched route, 'error' returns HTTP 422 (default); 'warning' logs and proceeds with the pre-filter pool. This is the only lever that lets routing_preferences win over capability." + model_capabilities_source: + type: object + additionalProperties: false + description: "Optional source for model capabilities, fetched at runtime from models.dev (like cost/latency metrics). Defaults to the public models.dev API; users rarely set this. Omit refresh_interval to fetch only once at startup." + properties: + url: + type: string + default: "https://models.dev/api.json" + description: "models.dev-compatible capability API URL." + refresh_interval: + type: integer + minimum: 1 + description: "Refresh interval in seconds. Omit to fetch only once at startup." system_prompt: type: string prompt_targets: @@ -559,6 +621,7 @@ properties: enum: - cheapest - fastest + - long_context_quality - none additionalProperties: false required: diff --git a/crates/brightstaff/src/handlers/llm/model_selection.rs b/crates/brightstaff/src/handlers/llm/model_selection.rs index a1378d86d..c1535dad6 100644 --- a/crates/brightstaff/src/handlers/llm/model_selection.rs +++ b/crates/brightstaff/src/handlers/llm/model_selection.rs @@ -1,13 +1,13 @@ -use common::configuration::TopLevelRoutingPreference; +use common::configuration::{EmptyPoolBehavior, TopLevelRoutingPreference}; use hermesllm::clients::endpoints::SupportedUpstreamAPIs; -use hermesllm::ProviderRequestType; +use hermesllm::{ProviderRequest, ProviderRequestType, RequiredCapabilities}; use hyper::StatusCode; use std::sync::Arc; use tracing::{debug, info, warn}; use crate::metrics as bs_metrics; use crate::metrics::labels as metric_labels; -use crate::router::orchestrator::OrchestratorService; +use crate::router::orchestrator::{OrchestrationError, OrchestratorService}; use crate::streaming::truncate_message; use crate::tracing::routing; @@ -43,6 +43,15 @@ impl RoutingError { status_code: StatusCode::INTERNAL_SERVER_ERROR, } } + + /// Tier 1 capability filter left no viable model (D3 `error`). 422 so the + /// client can see this is a request/config mismatch, not a server fault. + pub fn capability_error(message: String) -> Self { + Self { + message, + status_code: StatusCode::UNPROCESSABLE_ENTITY, + } + } } /// Determines the routing decision if @@ -109,6 +118,14 @@ pub async fn router_chat_get_upstream_model( "processing router request" ); + // Tier 1: compute the capability requirements implied by this request shape. + let mut required = RequiredCapabilities::for_endpoint(request_path); + required.vision = chat_request.has_vision(); + required.min_context_tokens = chat_request.required_context_tokens(); + if !required.is_unconstrained() { + debug!(requirement = %required.describe(), "computed required capabilities"); + } + // Capture start time for routing span let routing_start_time = std::time::Instant::now(); @@ -117,6 +134,7 @@ pub async fn router_chat_get_upstream_model( &chat_request.messages, inline_routing_preferences, request_id, + &required, ) .await; @@ -144,8 +162,42 @@ pub async fn router_chat_get_upstream_model( }) } None => { - // No route determined, return sentinel value "none" - // This signals to llm.rs to use the original validated request model + // No preference matched. The "none" sentinel tells llm.rs to use + // the original validated request model — but capability is a hard + // gate even here, so validate that explicit model against the + // request shape (catches e.g. an image sent to a text-only model). + let explicit_model = chat_request.model(); + if !required.is_unconstrained() + && orchestrator_service.has_capability_filter() + && !orchestrator_service + .is_model_capable(explicit_model, &required) + .await + { + match orchestrator_service.empty_pool_behavior() { + EmptyPoolBehavior::Error => { + current_span.record("route.selected_model", "unknown"); + bs_metrics::record_router_decision( + route_label, + "unknown", + true, + determination_elapsed, + ); + return Err(RoutingError::capability_error(format!( + "model '{}' cannot serve this request (requires {})", + explicit_model, + required.describe() + ))); + } + EmptyPoolBehavior::Warning => { + warn!( + model = %explicit_model, + requirement = %required.describe(), + "explicit model is not capable; empty_pool_behavior=warning, proceeding" + ); + } + } + } + current_span.record("route.selected_model", "none"); info!("no route determined, using default model"); bs_metrics::record_router_decision( @@ -165,10 +217,18 @@ pub async fn router_chat_get_upstream_model( Err(err) => { current_span.record("route.selected_model", "unknown"); bs_metrics::record_router_decision(route_label, "unknown", true, determination_elapsed); - Err(RoutingError::internal_error(format!( - "Failed to determine route: {}", - err - ))) + match err { + OrchestrationError::CapabilityFilterEmpty { route, requirement } => { + Err(RoutingError::capability_error(format!( + "no capable model for route '{}': request requires {}", + route, requirement + ))) + } + other => Err(RoutingError::internal_error(format!( + "Failed to determine route: {}", + other + ))), + } } } } diff --git a/crates/brightstaff/src/main.rs b/crates/brightstaff/src/main.rs index 90ed84c35..165ff904f 100644 --- a/crates/brightstaff/src/main.rs +++ b/crates/brightstaff/src/main.rs @@ -12,6 +12,7 @@ use brightstaff::handlers::models::list_models; use brightstaff::handlers::routing_service::routing_decision; use brightstaff::metrics as bs_metrics; use brightstaff::metrics::labels as metric_labels; +use brightstaff::router::model_capabilities::ModelCapabilitiesService; use brightstaff::router::model_metrics::ModelMetricsService; use brightstaff::router::orchestrator::OrchestratorService; use brightstaff::session_cache::init_session_cache; @@ -308,17 +309,50 @@ async fn init_app_state( .orchestrator_model_context_length .unwrap_or(brightstaff::router::orchestrator_model_v1::MAX_TOKEN_LEN); - let orchestrator_service = Arc::new(OrchestratorService::with_routing( - format!("{llm_provider_url}{CHAT_COMPLETIONS_PATH}"), - orchestrator_model_name, - orchestrator_llm_provider, - config.routing_preferences.clone(), - metrics_service, - session_ttl_seconds, - session_cache, - session_tenant_header, - orchestrator_max_tokens, + // Tier 1 capability source: vendored models.dev seed + optional runtime + // refresh, layered with per-provider user overrides. + let capabilities_service = Some(Arc::new( + ModelCapabilitiesService::new( + &config.model_providers, + overrides.model_capabilities_source.as_ref(), + reqwest::Client::new(), + ) + .await, )); + let empty_pool_behavior = overrides.empty_pool_behavior.unwrap_or_default(); + + // Surface internal long-context-quality dataset provenance + staleness (R5). + { + let lcq = hermesllm::providers::long_context_quality::dataset(); + if let Ok(dated) = chrono::NaiveDate::parse_from_str(&lcq.dated, "%Y-%m-%d") { + let age_days = (chrono::Utc::now().date_naive() - dated).num_days().max(0) as f64; + brightstaff::metrics::record_lcq_staleness_days(age_days); + info!( + models = lcq.len(), + source = %lcq.source, + dated = %lcq.dated, + age_days, + "loaded internal long-context-quality dataset" + ); + } else { + info!(models = lcq.len(), source = %lcq.source, "loaded internal long-context-quality dataset"); + } + } + + let orchestrator_service = Arc::new( + OrchestratorService::with_routing( + format!("{llm_provider_url}{CHAT_COMPLETIONS_PATH}"), + orchestrator_model_name, + orchestrator_llm_provider, + config.routing_preferences.clone(), + metrics_service, + session_ttl_seconds, + session_cache, + session_tenant_header, + orchestrator_max_tokens, + ) + .with_capability_filter(capabilities_service, empty_pool_behavior), + ); let state_storage = init_state_storage(config).await?; diff --git a/crates/brightstaff/src/metrics/labels.rs b/crates/brightstaff/src/metrics/labels.rs index 4eaf3e59a..8b0921e2a 100644 --- a/crates/brightstaff/src/metrics/labels.rs +++ b/crates/brightstaff/src/metrics/labels.rs @@ -36,3 +36,9 @@ pub const ROUTING_SVC_POLICY_ERROR: &str = "policy_error"; pub const SESSION_CACHE_HIT: &str = "hit"; pub const SESSION_CACHE_MISS: &str = "miss"; pub const SESSION_CACHE_STORE: &str = "store"; + +// Tier 1 capability-filter outcome values. +pub const CAPABILITY_FILTER_PASS: &str = "pass"; +pub const CAPABILITY_FILTER_FILTERED: &str = "filtered"; +pub const CAPABILITY_FILTER_EMPTY_ERROR: &str = "empty_error"; +pub const CAPABILITY_FILTER_EMPTY_WARNING: &str = "empty_warning"; diff --git a/crates/brightstaff/src/metrics/mod.rs b/crates/brightstaff/src/metrics/mod.rs index 34679ccac..c0d0ffa93 100644 --- a/crates/brightstaff/src/metrics/mod.rs +++ b/crates/brightstaff/src/metrics/mod.rs @@ -317,6 +317,26 @@ pub fn record_llm_ttft(provider: &str, model: &str, ttft: Duration) { .record(ttft.as_secs_f64()); } +/// Per-modality generation latency, the cold-start-seedable signal that powers +/// `prefer: fastest` for the non-text modalities (WS10). `kind` distinguishes the +/// per-modality timing definition: `e2e` (image-gen, request→asset), `ttfa` +/// (streaming audio, time-to-first-audio-chunk), or `synthesis` (non-streaming +/// audio, total synthesis time). Text continues to use TTFT and is unaffected. +pub fn record_modality_latency( + model: &str, + modality: &str, + kind: &'static str, + duration: Duration, +) { + histogram!( + "brightstaff_llm_modality_latency_seconds", + "model" => model.to_string(), + "modality" => modality.to_string(), + "kind" => kind, + ) + .record(duration.as_secs_f64()); +} + pub fn record_llm_tokens(provider: &str, model: &str, kind: &'static str, count: u64) { counter!( "brightstaff_llm_tokens_total", @@ -360,6 +380,36 @@ pub fn record_router_decision( .record(duration.as_secs_f64()); } +/// Record Tier 1 capability-filter timing + pool sizes for a matched route. +/// `outcome` is one of `pass`, `filtered`, `empty_error`, `empty_warning`. +pub fn record_capability_filter( + route: &str, + outcome: &'static str, + pre_filter: usize, + post_filter: usize, + duration: Duration, +) { + counter!( + "brightstaff_capability_filter_total", + "route" => route.to_string(), + "outcome" => outcome, + ) + .increment(1); + histogram!( + "brightstaff_capability_filter_duration_seconds", + "route" => route.to_string(), + ) + .record(duration.as_secs_f64()); + histogram!("brightstaff_capability_filter_pre_pool_size").record(pre_filter as f64); + histogram!("brightstaff_capability_filter_post_pool_size").record(post_filter as f64); +} + +/// Surface the age (in days) of the internal long-context-quality dataset so +/// staleness is observable (R5). +pub fn record_lcq_staleness_days(age_days: f64) { + gauge!("brightstaff_long_context_quality_staleness_days").set(age_days); +} + pub fn record_routing_service_outcome(outcome: &'static str) { counter!( "brightstaff_routing_service_requests_total", diff --git a/crates/brightstaff/src/router/mod.rs b/crates/brightstaff/src/router/mod.rs index 0f48c0907..53af701d9 100644 --- a/crates/brightstaff/src/router/mod.rs +++ b/crates/brightstaff/src/router/mod.rs @@ -1,4 +1,5 @@ pub(crate) mod http; +pub mod model_capabilities; pub mod model_metrics; pub mod orchestrator; pub mod orchestrator_model; diff --git a/crates/brightstaff/src/router/model_capabilities.rs b/crates/brightstaff/src/router/model_capabilities.rs new file mode 100644 index 000000000..5ce41fb45 --- /dev/null +++ b/crates/brightstaff/src/router/model_capabilities.rs @@ -0,0 +1,185 @@ +//! Tier 1 capability source for routing. +//! +//! Mirrors [`super::model_metrics::ModelMetricsService`] (and how DigitalOcean +//! pricing is handled): nothing is vendored into the binary. The catalog is +//! fetched from models.dev at startup, optionally refreshed on an interval, and +//! left empty on fetch failure (so absent models resolve to the conservative +//! default rather than failing the build with a committed snapshot). Per-model +//! capabilities resolve with the precedence: +//! `user config capabilities > models.dev > conservative default`. + +use std::collections::HashMap; +use std::sync::Arc; +use std::time::Duration; + +use common::configuration::{LlmProvider, ModelCapabilitiesSource}; +use hermesllm::providers::capabilities::{ + canonical_model_key, CapabilitiesCatalog, CapabilitiesSnapshot, +}; +use hermesllm::ModelCapabilities; +use tokio::sync::RwLock; +use tracing::{info, warn}; + +const DEFAULT_MODELS_DEV_URL: &str = "https://models.dev/api.json"; + +pub struct ModelCapabilitiesService { + /// User config overrides, keyed by canonical `"/"`. + user_overrides: HashMap, + /// models.dev-derived catalog, fetched at runtime (empty until first fetch). + catalog: Arc>, +} + +impl ModelCapabilitiesService { + /// Build the service from configured providers and an optional capability + /// source. Fetches the models.dev catalog once at startup (like DO pricing), + /// then spawns a refresh loop if an interval is configured. On fetch failure + /// the catalog stays empty and models resolve to user overrides or the + /// conservative default. + pub async fn new( + providers: &[LlmProvider], + source: Option<&ModelCapabilitiesSource>, + client: reqwest::Client, + ) -> Self { + let mut user_overrides = HashMap::new(); + for p in providers { + if let Some(caps) = &p.capabilities { + let model_str = p.model.clone().unwrap_or_else(|| p.name.clone()); + let key = canonical_model_key(&model_str).unwrap_or(model_str); + user_overrides.insert(key, caps.clone()); + } + } + + let url = source + .and_then(|s| s.url.clone()) + .unwrap_or_else(|| DEFAULT_MODELS_DEV_URL.to_string()); + + // Fetch once at startup so capabilities are available immediately. On + // failure the catalog is empty (conservative defaults), never fatal. + let catalog = match fetch_catalog(&client, &url).await { + Some(fresh) => { + info!(models = fresh.len(), url = %url, user_overrides = user_overrides.len(), "fetched model capabilities from models.dev"); + fresh + } + None => { + warn!(url = %url, "models.dev fetch failed at startup — capabilities default to conservative (text-only) until refresh"); + CapabilitiesCatalog::default() + } + }; + let catalog = Arc::new(RwLock::new(catalog)); + + if let Some(interval_secs) = source.and_then(|s| s.refresh_interval) { + let catalog_clone = Arc::clone(&catalog); + let client_clone = client.clone(); + let url_clone = url.clone(); + let interval = Duration::from_secs(interval_secs); + tokio::spawn(async move { + loop { + tokio::time::sleep(interval).await; + if let Some(fresh) = fetch_catalog(&client_clone, &url_clone).await { + info!(models = fresh.len(), url = %url_clone, "refreshed model capabilities from models.dev"); + *catalog_clone.write().await = fresh; + } else { + warn!(url = %url_clone, "models.dev refresh failed — keeping previous catalog"); + } + } + }); + } + + ModelCapabilitiesService { + user_overrides, + catalog, + } + } + + /// Construct a service from an explicit catalog with no refresh (tests). + pub fn from_catalog( + catalog: CapabilitiesCatalog, + user_overrides: HashMap, + ) -> Self { + ModelCapabilitiesService { + user_overrides, + catalog: Arc::new(RwLock::new(catalog)), + } + } + + /// Resolve capabilities for a model, applying + /// `user override > models.dev > conservative default`. + pub async fn capabilities_for(&self, model: &str) -> ModelCapabilities { + let from_catalog = { + let catalog = self.catalog.read().await; + catalog.get(model).cloned().unwrap_or_default() + }; + let key = canonical_model_key(model); + let user = key + .as_ref() + .and_then(|k| self.user_overrides.get(k)) + .or_else(|| self.user_overrides.get(model)); + match user { + Some(u) => u.fill_from(&from_catalog), + None => from_catalog, + } + } +} + +/// Fetch + map a models.dev catalog. Returns `None` on any network/parse error +/// so the caller can fall back to the previous (seed) catalog. +async fn fetch_catalog(client: &reqwest::Client, url: &str) -> Option { + let bytes = match client.get(url).send().await { + Ok(resp) => match resp.bytes().await { + Ok(b) => b, + Err(err) => { + warn!(error = %err, url = %url, "failed to read models.dev response"); + return None; + } + }, + Err(err) => { + warn!(error = %err, url = %url, "failed to fetch models.dev"); + return None; + } + }; + match CapabilitiesSnapshot::from_models_dev_json(&bytes) { + Ok(snapshot) => Some(CapabilitiesCatalog::from_snapshot(snapshot)), + Err(err) => { + warn!(error = %err, url = %url, "failed to parse models.dev payload"); + None + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn caps(window: Option, vision: Option) -> ModelCapabilities { + ModelCapabilities { + context_window: window, + supports_vision: vision, + ..Default::default() + } + } + + #[tokio::test] + async fn user_override_wins_over_catalog() { + let mut catalog_models = HashMap::new(); + catalog_models.insert("openai/gpt-4o".to_string(), caps(Some(200000), Some(true))); + let catalog = CapabilitiesCatalog::new(catalog_models); + + let mut overrides = HashMap::new(); + overrides.insert("openai/gpt-4o".to_string(), caps(Some(128000), None)); + + let svc = ModelCapabilitiesService::from_catalog(catalog, overrides); + let resolved = svc.capabilities_for("openai/gpt-4o").await; + // user override wins for window, catalog backfills vision + assert_eq!(resolved.window(), Some(128000)); + assert!(resolved.vision()); + } + + #[tokio::test] + async fn unknown_model_falls_back_to_conservative_default() { + let svc = + ModelCapabilitiesService::from_catalog(CapabilitiesCatalog::default(), HashMap::new()); + let resolved = svc.capabilities_for("openai/unknown").await; + assert!(!resolved.vision()); + assert_eq!(resolved.window(), None); + } +} diff --git a/crates/brightstaff/src/router/model_metrics.rs b/crates/brightstaff/src/router/model_metrics.rs index 1adb408d8..24c22d614 100644 --- a/crates/brightstaff/src/router/model_metrics.rs +++ b/crates/brightstaff/src/router/model_metrics.rs @@ -10,8 +10,42 @@ use tracing::{debug, info, warn}; const DO_PRICING_URL: &str = "https://api.digitalocean.com/v2/gen-ai/models/catalog"; +/// Routing modality used to scope the per-modality `fastest` latency signal +/// (WS10). Latency is keyed by `(model, modality)`: text keeps its existing +/// TTFT behavior; image/audio carry their own per-modality timing definitions. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Modality { + Text, + Vision, + ImageOut, + AudioOut, +} + +impl Modality { + pub fn as_str(&self) -> &'static str { + match self { + Modality::Text => "text", + Modality::Vision => "vision", + Modality::ImageOut => "image_out", + Modality::AudioOut => "audio_out", + } + } +} + +/// Composite latency-map key scoping a model's latency to a modality. Text uses +/// the bare model id (back-compat with the existing flat Prometheus map); other +/// modalities are namespaced so they never collide with the text signal. +pub fn modality_latency_key(model: &str, modality: Modality) -> String { + match modality { + Modality::Text => model.to_string(), + other => format!("{}\u{1}{}", other.as_str(), model), + } +} + pub struct ModelMetricsService { cost: Arc>>, + /// Latency map keyed by either a bare model id (text) or a + /// `modality\u{1}model` composite (see [`modality_latency_key`]). latency: Arc>>, } @@ -107,10 +141,69 @@ impl ModelMetricsService { } rank_by_ascending_metric(models, &latency_data) } + SelectionPreference::LongContextQuality => { + // Tier 2: rank by Plano's internal long-context-quality dataset. + // Higher score is better, so rank descending; unscored models last. + let scores: HashMap = models + .iter() + .filter_map(|m| { + match hermesllm::long_context_quality_score(m) { + Some(s) => Some((m.clone(), s)), + None => { + warn!(model = %m, "no long-context-quality score — ranking last (prefer: long_context_quality)"); + None + } + } + }) + .collect(); + rank_by_descending_metric(models, &scores) + } SelectionPreference::None => models.to_vec(), } } + /// Rank `models` by latency for a specific `modality` (WS10 groundwork). + /// Looks up the `(model, modality)` composite key first, then falls back to + /// the bare model latency, then ranks last. For `Modality::Text` this is + /// identical to `prefer: fastest` today. + pub async fn rank_models_by_modality_latency( + &self, + models: &[String], + modality: Modality, + ) -> Vec { + let latency_data = self.latency.read().await; + let resolved: HashMap = models + .iter() + .filter_map(|m| { + let composite = modality_latency_key(m, modality); + let v = latency_data + .get(&composite) + .or_else(|| latency_data.get(m.as_str())) + .copied(); + match v { + Some(v) if !v.is_nan() => Some((m.clone(), v)), + _ => { + warn!(model = %m, modality = modality.as_str(), "no per-modality latency — ranking last (prefer: fastest)"); + None + } + } + }) + .collect(); + rank_by_ascending_metric(models, &resolved) + } + + /// Seed the latency map for cold start (WS10): at launch there is no live + /// traffic, so `prefer: fastest` on the new modalities must be primed from + /// benchmark data. Entries should use [`modality_latency_key`] for non-text + /// modalities. Existing keys are overwritten; live data takes over on the + /// next Prometheus refresh. + pub async fn seed_latency(&self, seed: HashMap) { + let mut latency = self.latency.write().await; + for (k, v) in seed { + latency.insert(k, v); + } + } + /// Returns a snapshot of the current cost data. Used at startup to warn about unmatched models. pub async fn cost_snapshot(&self) -> HashMap { self.cost.read().await.clone() @@ -148,6 +241,34 @@ fn rank_by_ascending_metric(models: &[String], data: &HashMap) -> V .collect() } +/// Rank `models` by a metric where **higher is better** (e.g. quality scores). +/// Models with no data are appended at the end in their original order. +fn rank_by_descending_metric(models: &[String], data: &HashMap) -> Vec { + let mut with_data: Vec<(&String, f64)> = models + .iter() + .filter_map(|m| { + let v = *data.get(m.as_str())?; + if v.is_nan() { + None + } else { + Some((m, v)) + } + }) + .collect(); + with_data.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)); + + let without_data: Vec<&String> = models + .iter() + .filter(|m| data.get(m.as_str()).is_none_or(|v| v.is_nan())) + .collect(); + + with_data + .iter() + .map(|(m, _)| (*m).clone()) + .chain(without_data.iter().map(|m| (*m).clone())) + .collect() +} + #[derive(serde::Deserialize)] struct DoModelList { data: Vec, @@ -231,7 +352,16 @@ async fn fetch_prometheus_metrics( .filter_map(|r| { let model_name = r.metric.get("model_name")?.clone(); let value: f64 = r.value.1.parse().ok()?; - Some((model_name, value)) + // If the series carries a `modality` label, namespace the key + // so per-modality latency (WS10) never collides with text. + let key = match r.metric.get("modality").map(String::as_str) { + Some("text") | None => model_name, + Some("vision") => modality_latency_key(&model_name, Modality::Vision), + Some("image_out") => modality_latency_key(&model_name, Modality::ImageOut), + Some("audio_out") => modality_latency_key(&model_name, Modality::AudioOut), + Some(_) => model_name, + }; + Some((key, value)) }) .collect(), Err(err) => { @@ -319,6 +449,49 @@ mod tests { assert_eq!(result, vec!["claude-sonnet", "gpt-4o"]); } + #[test] + fn test_rank_by_descending_metric_picks_highest_first() { + let models = vec!["a".to_string(), "b".to_string(), "c".to_string()]; + let mut data = HashMap::new(); + data.insert("a".to_string(), 0.80); + data.insert("b".to_string(), 0.95); + data.insert("c".to_string(), 0.70); + // unscored models would sort last; here all scored, highest first + assert_eq!( + rank_by_descending_metric(&models, &data), + vec!["b", "a", "c"] + ); + } + + #[tokio::test] + async fn test_rank_models_long_context_quality() { + // Uses the vendored internal LCQ dataset. gemini-2.5-pro outranks gpt-4o, + // and an unscored model sorts last. + let service = ModelMetricsService { + cost: Arc::new(RwLock::new(HashMap::new())), + latency: Arc::new(RwLock::new(HashMap::new())), + }; + let models = vec![ + "openai/gpt-4o".to_string(), + "openai/no-lcq-model".to_string(), + "gemini/gemini-2.5-pro".to_string(), + ]; + let result = service + .rank_models( + &models, + &make_policy(SelectionPreference::LongContextQuality), + ) + .await; + assert_eq!( + result, + vec![ + "gemini/gemini-2.5-pro", + "openai/gpt-4o", + "openai/no-lcq-model" + ] + ); + } + #[tokio::test] async fn test_rank_models_fallback_no_metrics() { let service = ModelMetricsService { @@ -368,6 +541,65 @@ mod tests { assert_eq!(result, vec!["gpt-4o", "gpt-4o-mini"]); } + #[test] + fn test_modality_latency_key_namespaces_non_text() { + assert_eq!( + modality_latency_key("openai/gpt-4o", Modality::Text), + "openai/gpt-4o" + ); + assert_eq!( + modality_latency_key("openai/gpt-image-1", Modality::ImageOut), + "image_out\u{1}openai/gpt-image-1" + ); + assert_ne!( + modality_latency_key("m", Modality::Vision), + modality_latency_key("m", Modality::AudioOut) + ); + } + + #[tokio::test] + async fn test_rank_models_by_modality_latency_prefers_composite_key() { + let service = ModelMetricsService { + cost: Arc::new(RwLock::new(HashMap::new())), + latency: Arc::new(RwLock::new(HashMap::new())), + }; + // Seed per-modality (audio) latency for two TTS models. + let mut seed = HashMap::new(); + seed.insert( + modality_latency_key("openai/tts-slow", Modality::AudioOut), + 500.0, + ); + seed.insert( + modality_latency_key("openai/tts-fast", Modality::AudioOut), + 100.0, + ); + service.seed_latency(seed).await; + + let models = vec!["openai/tts-slow".to_string(), "openai/tts-fast".to_string()]; + let result = service + .rank_models_by_modality_latency(&models, Modality::AudioOut) + .await; + assert_eq!(result, vec!["openai/tts-fast", "openai/tts-slow"]); + } + + #[tokio::test] + async fn test_rank_models_by_modality_latency_falls_back_to_bare_model() { + let service = ModelMetricsService { + cost: Arc::new(RwLock::new(HashMap::new())), + latency: Arc::new(RwLock::new({ + let mut m = HashMap::new(); + m.insert("text-model".to_string(), 42.0); + m + })), + }; + let models = vec!["text-model".to_string(), "no-data".to_string()]; + let result = service + .rank_models_by_modality_latency(&models, Modality::Vision) + .await; + // bare-model fallback ranks first; the unseen model sorts last. + assert_eq!(result, vec!["text-model", "no-data"]); + } + #[test] fn test_rank_by_ascending_metric_nan_treated_as_missing() { let models = vec![ diff --git a/crates/brightstaff/src/router/orchestrator.rs b/crates/brightstaff/src/router/orchestrator.rs index 2d7b25dee..600946aa2 100644 --- a/crates/brightstaff/src/router/orchestrator.rs +++ b/crates/brightstaff/src/router/orchestrator.rs @@ -1,17 +1,21 @@ use std::{borrow::Cow, collections::HashMap, sync::Arc, time::Duration}; use common::{ - configuration::{AgentUsagePreference, OrchestrationPreference, TopLevelRoutingPreference}, + configuration::{ + AgentUsagePreference, EmptyPoolBehavior, OrchestrationPreference, TopLevelRoutingPreference, + }, consts::{ARCH_PROVIDER_HINT_HEADER, REQUEST_ID_HEADER}, }; use hermesllm::apis::openai::Message; +use hermesllm::RequiredCapabilities; use hyper::header; use opentelemetry::global; use opentelemetry_http::HeaderInjector; use thiserror::Error; -use tracing::{debug, info}; +use tracing::{debug, info, warn}; use super::http::{self, post_and_extract_content}; +use super::model_capabilities::ModelCapabilitiesService; use super::model_metrics::ModelMetricsService; use super::orchestrator_model::OrchestratorModel; @@ -31,6 +35,8 @@ pub struct OrchestratorService { orchestrator_provider_name: String, top_level_preferences: HashMap, metrics_service: Option>, + capabilities_service: Option>, + empty_pool_behavior: EmptyPoolBehavior, session_cache: Option>, session_ttl: Duration, tenant_header: Option, @@ -43,6 +49,11 @@ pub enum OrchestrationError { #[error("Orchestrator model error: {0}")] OrchestratorModelError(#[from] super::orchestrator_model::OrchestratorModelError), + + /// Tier 1 capability filtering removed every candidate from the matched + /// route and `empty_pool_behavior` is `error` (D3). + #[error("no capable model for route '{route}': request requires {requirement}")] + CapabilityFilterEmpty { route: String, requirement: String }, } pub type Result = std::result::Result; @@ -67,12 +78,46 @@ impl OrchestratorService { orchestrator_provider_name, top_level_preferences: HashMap::new(), metrics_service: None, + capabilities_service: None, + empty_pool_behavior: EmptyPoolBehavior::default(), session_cache: None, session_ttl: Duration::from_secs(DEFAULT_SESSION_TTL_SECONDS), tenant_header: None, } } + /// Attach the Tier 1 capability filter and the empty-pool policy (D3). + /// Builder-style so existing constructor call sites stay unchanged. + #[must_use] + pub fn with_capability_filter( + mut self, + capabilities_service: Option>, + empty_pool_behavior: EmptyPoolBehavior, + ) -> Self { + self.capabilities_service = capabilities_service; + self.empty_pool_behavior = empty_pool_behavior; + self + } + + /// Whether a single model can serve a request with the given required + /// capabilities (used for the no-preference-match validation path). + pub async fn is_model_capable(&self, model: &str, required: &RequiredCapabilities) -> bool { + match &self.capabilities_service { + Some(svc) if !required.is_unconstrained() => { + required.satisfied_by(&svc.capabilities_for(model).await) + } + _ => true, + } + } + + pub fn empty_pool_behavior(&self) -> EmptyPoolBehavior { + self.empty_pool_behavior + } + + pub fn has_capability_filter(&self) -> bool { + self.capabilities_service.is_some() + } + #[allow(clippy::too_many_arguments)] pub fn with_routing( orchestrator_url: String, @@ -106,6 +151,8 @@ impl OrchestratorService { orchestrator_provider_name, top_level_preferences, metrics_service, + capabilities_service: None, + empty_pool_behavior: EmptyPoolBehavior::default(), session_cache: Some(session_cache), session_ttl, tenant_header, @@ -170,6 +217,7 @@ impl OrchestratorService { messages: &[Message], inline_routing_preferences: Option>, request_id: &str, + required: &RequiredCapabilities, ) -> Result)>> { if messages.is_empty() { return Ok(None); @@ -223,10 +271,27 @@ impl OrchestratorService { .or_else(|| self.top_level_preferences.get(route_name)); if let Some(pref) = top_pref { + // Tier 1: hard capability filter (intersection) before ranking. + let effective_models = self + .apply_capability_filter(route_name, &pref.models, required) + .await?; + + // Tier 2: rank the surviving capable pool (preserves order for `none`). let ranked = match &self.metrics_service { - Some(svc) => svc.rank_models(&pref.models, &pref.selection_policy).await, - None => pref.models.clone(), + Some(svc) => { + svc.rank_models(&effective_models, &pref.selection_policy) + .await + } + None => effective_models.clone(), }; + info!( + route = %route_name, + tier = "tier2", + selection_policy = ?pref.selection_policy, + capable_pool = effective_models.len(), + selected = %ranked.first().map(|s| s.as_str()).unwrap_or(""), + "Tier 2 preference ranking applied" + ); Some((route_name.clone(), ranked)) } else { None @@ -246,6 +311,90 @@ impl OrchestratorService { Ok(result) } + /// Tier 1: intersect a route's model pool with the models capable of serving + /// the request shape, preserving preference order among survivors. When the + /// intersection is empty, honor `empty_pool_behavior` (D3): `error` returns a + /// typed error; `warning` logs and proceeds with the pre-filter pool. + async fn apply_capability_filter( + &self, + route_name: &str, + models: &[String], + required: &RequiredCapabilities, + ) -> Result> { + let Some(svc) = &self.capabilities_service else { + return Ok(models.to_vec()); + }; + if required.is_unconstrained() { + return Ok(models.to_vec()); + } + + let started = std::time::Instant::now(); + let mut capable = Vec::new(); + for m in models { + let caps = svc.capabilities_for(m).await; + if required.satisfied_by(&caps) { + capable.push(m.clone()); + } + } + let elapsed = started.elapsed(); + + if capable.is_empty() { + match self.empty_pool_behavior { + EmptyPoolBehavior::Error => { + bs_metrics::record_capability_filter( + route_name, + metric_labels::CAPABILITY_FILTER_EMPTY_ERROR, + models.len(), + 0, + elapsed, + ); + return Err(OrchestrationError::CapabilityFilterEmpty { + route: route_name.to_string(), + requirement: required.describe(), + }); + } + EmptyPoolBehavior::Warning => { + bs_metrics::record_capability_filter( + route_name, + metric_labels::CAPABILITY_FILTER_EMPTY_WARNING, + models.len(), + 0, + elapsed, + ); + warn!( + route = %route_name, + requirement = %required.describe(), + "Tier 1 capability filter emptied the pool; empty_pool_behavior=warning, proceeding with pre-filter pool" + ); + return Ok(models.to_vec()); + } + } + } + + let outcome = if capable.len() == models.len() { + metric_labels::CAPABILITY_FILTER_PASS + } else { + metric_labels::CAPABILITY_FILTER_FILTERED + }; + bs_metrics::record_capability_filter( + route_name, + outcome, + models.len(), + capable.len(), + elapsed, + ); + info!( + route = %route_name, + tier = "tier1", + pre_filter = models.len(), + post_filter = capable.len(), + requirement = %required.describe(), + filter_us = elapsed.as_micros() as u64, + "Tier 1 capability filter applied" + ); + Ok(capable) + } + // ---- Agent orchestration (existing) ---- pub async fn determine_orchestration( diff --git a/crates/brightstaff/src/router/stress_tests.rs b/crates/brightstaff/src/router/stress_tests.rs index 6c3ffefd4..8dee7e116 100644 --- a/crates/brightstaff/src/router/stress_tests.rs +++ b/crates/brightstaff/src/router/stress_tests.rs @@ -107,7 +107,7 @@ mod tests { for _ in 0..10 { let msgs = make_messages(5); let _ = orchestrator_service - .determine_route(&msgs, None, "warmup") + .determine_route(&msgs, None, "warmup", &Default::default()) .await; } @@ -124,7 +124,7 @@ mod tests { None }; let _ = orchestrator_service - .determine_route(&msgs, inline, &format!("req-{i}")) + .determine_route(&msgs, inline, &format!("req-{i}"), &Default::default()) .await; } @@ -198,7 +198,7 @@ mod tests { for _ in 0..20 { let msgs = make_messages(3); let _ = orchestrator_service - .determine_route(&msgs, None, "warmup") + .determine_route(&msgs, None, "warmup", &Default::default()) .await; } @@ -215,7 +215,7 @@ mod tests { for r in 0..requests_per_task { let msgs = make_messages(3 + (r % 8)); let _ = svc - .determine_route(&msgs, None, &format!("req-{t}-{r}")) + .determine_route(&msgs, None, &format!("req-{t}-{r}"), &Default::default()) .await; } }); diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs index 8aa521fa5..047c5e5ac 100644 --- a/crates/common/src/configuration.rs +++ b/crates/common/src/configuration.rs @@ -137,6 +137,10 @@ pub enum StateStorageType { pub enum SelectionPreference { Cheapest, Fastest, + /// Tier 2 ranking: rank the (already capability-filtered) pool by Plano's + /// internal long-context-quality score. + #[serde(rename = "long_context_quality")] + LongContextQuality, /// Return models in the same order they were defined — no reordering. #[default] #[serde(alias = "")] @@ -235,6 +239,35 @@ pub struct Overrides { pub agent_orchestration_model: Option, pub orchestrator_model_context_length: Option, pub disable_signals: Option, + /// What to do when Tier 1 capability filtering removes every candidate model + /// from a matched route (D3). Defaults to `error`. + #[serde(default)] + pub empty_pool_behavior: Option, + /// Optional override for where model capabilities are sourced from. Defaults + /// to the public models.dev URL + vendored snapshot when unset. + #[serde(default)] + pub model_capabilities_source: Option, +} + +/// Behavior when Tier 1 capability filtering empties a matched route's model pool. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] +#[serde(rename_all = "lowercase")] +pub enum EmptyPoolBehavior { + /// Return an HTTP 422 with a clear message (no silent degrade). + #[default] + Error, + /// Log a warning and proceed with the developer's pre-filter pool as-is. + Warning, +} + +/// Optional configuration for the models.dev capability source (nested under +/// `overrides`). Defaults to the public models.dev URL when unset. +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub struct ModelCapabilitiesSource { + /// models.dev-compatible API URL. Defaults to `https://models.dev/api.json`. + pub url: Option, + /// Refresh interval in seconds. Omit to use the vendored seed only. + pub refresh_interval: Option, } #[derive(Debug, Clone, Serialize, Deserialize, Default)] @@ -498,6 +531,11 @@ pub struct LlmProvider { pub internal: Option, pub passthrough_auth: Option, pub headers: Option>, + /// Optional per-model capability overrides (Tier 1 routing). All fields are + /// optional; anything left unset is resolved from the models.dev catalog, + /// then a conservative default. Precedence: user config > models.dev > default. + #[serde(default)] + pub capabilities: Option, } pub trait IntoModels { @@ -542,6 +580,7 @@ impl Default for LlmProvider { internal: None, passthrough_auth: None, headers: None, + capabilities: None, } } } diff --git a/crates/common/src/llm_providers.rs b/crates/common/src/llm_providers.rs index b4355a2f5..d1181a98c 100644 --- a/crates/common/src/llm_providers.rs +++ b/crates/common/src/llm_providers.rs @@ -278,6 +278,7 @@ mod tests { stream: None, passthrough_auth: None, headers: None, + capabilities: None, } } diff --git a/crates/hermesllm/src/apis/mod.rs b/crates/hermesllm/src/apis/mod.rs index ea0563926..34e277fa1 100644 --- a/crates/hermesllm/src/apis/mod.rs +++ b/crates/hermesllm/src/apis/mod.rs @@ -1,6 +1,7 @@ pub mod amazon_bedrock; pub mod anthropic; pub mod openai; +pub mod openai_multimodal; pub mod openai_responses; pub mod streaming_shapes; @@ -14,6 +15,9 @@ pub use openai::{ ChatCompletionsRequest, ChatCompletionsResponse, ChatCompletionsStreamResponse, OpenAIApi, }; pub use openai::{Message as OpenAIMessage, Tool as OpenAITool, ToolChoice as OpenAIToolChoice}; +pub use openai_multimodal::{ + AudioSpeechRequest, ImageData, ImagesRequest, ImagesResponse, ImagesUsage, +}; pub trait ApiDefinition { /// Returns the endpoint path for this API @@ -88,9 +92,11 @@ mod tests { fn test_all_variants_method() { // Test that all_variants returns the expected variants let openai_variants = OpenAIApi::all_variants(); - assert_eq!(openai_variants.len(), 2); + assert_eq!(openai_variants.len(), 4); assert!(openai_variants.contains(&OpenAIApi::ChatCompletions)); assert!(openai_variants.contains(&OpenAIApi::Responses)); + assert!(openai_variants.contains(&OpenAIApi::Images)); + assert!(openai_variants.contains(&OpenAIApi::Audio)); let anthropic_variants = AnthropicApi::all_variants(); assert_eq!(anthropic_variants.len(), 1); diff --git a/crates/hermesllm/src/apis/openai.rs b/crates/hermesllm/src/apis/openai.rs index 8e66f0ad6..3a5fd698a 100644 --- a/crates/hermesllm/src/apis/openai.rs +++ b/crates/hermesllm/src/apis/openai.rs @@ -11,7 +11,9 @@ use crate::providers::request::{ProviderRequest, ProviderRequestError}; use crate::providers::response::{ProviderResponse, TokenUsage}; use crate::providers::streaming_response::ProviderStreamResponse; use crate::transforms::lib::ExtractText; -use crate::{CHAT_COMPLETIONS_PATH, OPENAI_RESPONSES_API_PATH}; +use crate::{ + AUDIO_SPEECH_PATH, CHAT_COMPLETIONS_PATH, IMAGES_GENERATIONS_PATH, OPENAI_RESPONSES_API_PATH, +}; // ============================================================================ // OPENAI API ENUMERATION @@ -22,6 +24,10 @@ use crate::{CHAT_COMPLETIONS_PATH, OPENAI_RESPONSES_API_PATH}; pub enum OpenAIApi { ChatCompletions, Responses, + /// Image generation (`/v1/images/generations`). + Images, + /// Text-to-speech / audio output (`/v1/audio/speech`). Returns binary audio. + Audio, // Future APIs can be added here: // Embeddings, // FineTuning, @@ -33,6 +39,8 @@ impl ApiDefinition for OpenAIApi { match self { OpenAIApi::ChatCompletions => CHAT_COMPLETIONS_PATH, OpenAIApi::Responses => OPENAI_RESPONSES_API_PATH, + OpenAIApi::Images => IMAGES_GENERATIONS_PATH, + OpenAIApi::Audio => AUDIO_SPEECH_PATH, } } @@ -40,6 +48,8 @@ impl ApiDefinition for OpenAIApi { match endpoint { CHAT_COMPLETIONS_PATH => Some(OpenAIApi::ChatCompletions), OPENAI_RESPONSES_API_PATH => Some(OpenAIApi::Responses), + IMAGES_GENERATIONS_PATH => Some(OpenAIApi::Images), + AUDIO_SPEECH_PATH => Some(OpenAIApi::Audio), _ => None, } } @@ -48,6 +58,9 @@ impl ApiDefinition for OpenAIApi { match self { OpenAIApi::ChatCompletions => true, OpenAIApi::Responses => true, + // TTS supports streamed audio chunks; image generation does not. + OpenAIApi::Audio => true, + OpenAIApi::Images => false, } } @@ -55,6 +68,8 @@ impl ApiDefinition for OpenAIApi { match self { OpenAIApi::ChatCompletions => true, OpenAIApi::Responses => true, + OpenAIApi::Images => false, + OpenAIApi::Audio => false, } } @@ -62,11 +77,19 @@ impl ApiDefinition for OpenAIApi { match self { OpenAIApi::ChatCompletions => true, OpenAIApi::Responses => true, + // These are output modalities, not vision input. + OpenAIApi::Images => false, + OpenAIApi::Audio => false, } } fn all_variants() -> Vec { - vec![OpenAIApi::ChatCompletions, OpenAIApi::Responses] + vec![ + OpenAIApi::ChatCompletions, + OpenAIApi::Responses, + OpenAIApi::Images, + OpenAIApi::Audio, + ] } } @@ -1183,9 +1206,11 @@ mod tests { // Test all_variants let all_variants = OpenAIApi::all_variants(); - assert_eq!(all_variants.len(), 2); + assert_eq!(all_variants.len(), 4); assert!(all_variants.contains(&OpenAIApi::ChatCompletions)); assert!(all_variants.contains(&OpenAIApi::Responses)); + assert!(all_variants.contains(&OpenAIApi::Images)); + assert!(all_variants.contains(&OpenAIApi::Audio)); } #[test] diff --git a/crates/hermesllm/src/apis/openai_multimodal.rs b/crates/hermesllm/src/apis/openai_multimodal.rs new file mode 100644 index 000000000..472effaf9 --- /dev/null +++ b/crates/hermesllm/src/apis/openai_multimodal.rs @@ -0,0 +1,183 @@ +//! OpenAI multimodal output APIs: image generation (`/v1/images/generations`) +//! and text-to-speech (`/v1/audio/speech`). +//! +//! These are OpenAI-native serde shapes. Image responses are JSON; audio/speech +//! responses are **binary** and are passed through untouched by the gateway +//! (see `llm_gateway::stream_context`), so there is no audio *response* struct. + +use serde::{Deserialize, Serialize}; +use serde_with::skip_serializing_none; + +use crate::providers::request::ProviderRequestError; + +/// Request body for `POST /v1/images/generations`. +#[skip_serializing_none] +#[derive(Serialize, Deserialize, Debug, Clone, Default)] +pub struct ImagesRequest { + #[serde(default)] + pub model: String, + pub prompt: String, + /// Number of images to generate. + pub n: Option, + /// e.g. `1024x1024`, `1792x1024`. + pub size: Option, + /// e.g. `standard`, `hd`, or model-specific quality levels. + pub quality: Option, + /// `url` or `b64_json`. + pub response_format: Option, + pub style: Option, + pub background: Option, + pub output_format: Option, + pub user: Option, +} + +impl ImagesRequest { + pub fn try_from_bytes(bytes: &[u8]) -> Result { + serde_json::from_slice(bytes) + } + + pub fn model(&self) -> &str { + &self.model + } + + pub fn set_model(&mut self, model: String) { + self.model = model; + } + + pub fn to_bytes(&self) -> Result, ProviderRequestError> { + serde_json::to_vec(self).map_err(|e| ProviderRequestError { + message: format!("failed to serialize ImagesRequest: {}", e), + source: Some(Box::new(e)), + }) + } +} + +/// One generated image in an [`ImagesResponse`]. +#[skip_serializing_none] +#[derive(Serialize, Deserialize, Debug, Clone, Default)] +pub struct ImageData { + pub b64_json: Option, + pub url: Option, + pub revised_prompt: Option, +} + +/// Response body for `POST /v1/images/generations`. +#[skip_serializing_none] +#[derive(Serialize, Deserialize, Debug, Clone, Default)] +pub struct ImagesResponse { + pub created: Option, + #[serde(default)] + pub data: Vec, + /// Some providers report token/image usage here. + pub usage: Option, +} + +/// Usage block for image generation (used for per-image cost accounting). +#[skip_serializing_none] +#[derive(Serialize, Deserialize, Debug, Clone, Default)] +pub struct ImagesUsage { + /// Number of images produced (the primary billable unit). + pub images: Option, + pub input_tokens: Option, + pub output_tokens: Option, + pub total_tokens: Option, +} + +impl ImagesResponse { + pub fn try_from_bytes(bytes: &[u8]) -> Result { + serde_json::from_slice(bytes) + } + + /// Number of images produced (billable unit), preferring an explicit usage + /// count and falling back to the number of returned images. + pub fn image_units(&self) -> usize { + self.usage + .as_ref() + .and_then(|u| u.images) + .map(|n| n as usize) + .unwrap_or(self.data.len()) + } +} + +/// Request body for `POST /v1/audio/speech` (text-to-speech). The response is +/// binary audio and is streamed/passed through without a typed response body. +#[skip_serializing_none] +#[derive(Serialize, Deserialize, Debug, Clone, Default)] +pub struct AudioSpeechRequest { + #[serde(default)] + pub model: String, + /// Text to synthesize. + pub input: String, + /// Voice id (e.g. `alloy`, `verse`). + pub voice: String, + /// Output container, e.g. `mp3`, `wav`, `opus`, `pcm`. + pub response_format: Option, + pub speed: Option, + pub instructions: Option, + /// Whether the client requested streamed audio chunks. + pub stream: Option, +} + +impl AudioSpeechRequest { + pub fn try_from_bytes(bytes: &[u8]) -> Result { + serde_json::from_slice(bytes) + } + + pub fn model(&self) -> &str { + &self.model + } + + pub fn set_model(&mut self, model: String) { + self.model = model; + } + + pub fn is_streaming(&self) -> bool { + self.stream.unwrap_or(false) + } + + /// Billable unit for TTS: number of input characters. + pub fn audio_units(&self) -> usize { + self.input.chars().count() + } + + pub fn to_bytes(&self) -> Result, ProviderRequestError> { + serde_json::to_vec(self).map_err(|e| ProviderRequestError { + message: format!("failed to serialize AudioSpeechRequest: {}", e), + source: Some(Box::new(e)), + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn images_request_roundtrips() { + let raw = br#"{"model":"gpt-image-1","prompt":"a cat","n":2,"size":"1024x1024"}"#; + let req = ImagesRequest::try_from_bytes(raw).unwrap(); + assert_eq!(req.model, "gpt-image-1"); + assert_eq!(req.n, Some(2)); + assert!(req.to_bytes().is_ok()); + } + + #[test] + fn images_response_counts_units() { + let raw = br#"{"created":1,"data":[{"b64_json":"aaa"},{"b64_json":"bbb"}]}"#; + let resp = ImagesResponse::try_from_bytes(raw).unwrap(); + assert_eq!(resp.image_units(), 2); + + let raw2 = br#"{"created":1,"data":[{"url":"x"}],"usage":{"images":5}}"#; + let resp2 = ImagesResponse::try_from_bytes(raw2).unwrap(); + assert_eq!(resp2.image_units(), 5); + } + + #[test] + fn audio_speech_request_units() { + let raw = br#"{"model":"gpt-4o-mini-tts","input":"hello","voice":"alloy"}"#; + let req = AudioSpeechRequest::try_from_bytes(raw).unwrap(); + assert_eq!(req.audio_units(), 5); + assert_eq!(req.voice, "alloy"); + assert!(!req.is_streaming()); + } +} diff --git a/crates/hermesllm/src/bin/long_context_quality.yaml b/crates/hermesllm/src/bin/long_context_quality.yaml new file mode 100644 index 000000000..f26af8b58 --- /dev/null +++ b/crates/hermesllm/src/bin/long_context_quality.yaml @@ -0,0 +1,77 @@ +# Internal long-context-quality (LCQ) dataset — Tier 2 routing signal. +# +# This is Plano-shipped data, NOT user config. Scores (0.0–1.0) estimate how well +# a model stays coherent at long context lengths, seeded from public benchmarks +# (RULER, HELMET, LongBench v2, NoLiMa). They are intentionally kept SEPARATE from +# the capability catalog (capabilities are stable; LCQ drifts — see plan R5). +# +# Keys are canonical "/" (see ProviderId::canonical_key); the +# loader normalizes lookups so config aliases (e.g. google/ vs gemini/) resolve. +# +# Refresh cadence: review quarterly against the latest public leaderboards and +# bump `dated`. Each entry records its `source` + `dated` for provenance; staleness +# is surfaced in telemetry. +version: "1.0" +source: "RULER / HELMET / LongBench v2 / NoLiMa (public benchmarks)" +dated: "2026-06-01" +models: + gemini/gemini-2.5-pro: + score: 0.96 + source: "RULER@128k, LongBench v2" + dated: "2026-06-01" + gemini/gemini-2.5-flash: + score: 0.90 + source: "RULER@128k" + dated: "2026-06-01" + gemini/gemini-1.5-pro: + score: 0.88 + source: "RULER@128k" + dated: "2026-06-01" + openai/gpt-4o: + score: 0.84 + source: "HELMET@128k" + dated: "2026-06-01" + openai/gpt-4o-mini: + score: 0.74 + source: "HELMET@128k" + dated: "2026-06-01" + openai/gpt-4.1: + score: 0.92 + source: "HELMET@1M, RULER" + dated: "2026-06-01" + openai/o3: + score: 0.91 + source: "LongBench v2" + dated: "2026-06-01" + anthropic/claude-opus-4-5: + score: 0.93 + source: "LongBench v2, RULER@200k" + dated: "2026-06-01" + anthropic/claude-opus-4-8: + score: 0.94 + source: "LongBench v2, RULER@200k" + dated: "2026-06-01" + anthropic/claude-3-5-sonnet: + score: 0.86 + source: "RULER@200k" + dated: "2026-06-01" + anthropic/claude-3-5-haiku: + score: 0.78 + source: "RULER@200k" + dated: "2026-06-01" + deepseek/deepseek-v4-pro: + score: 0.83 + source: "LongBench v2" + dated: "2026-06-01" + qwen/qwen3-coder-plus: + score: 0.80 + source: "RULER@128k" + dated: "2026-06-01" + moonshotai/kimi-k2-0905-preview: + score: 0.82 + source: "LongBench v2" + dated: "2026-06-01" + zhipu/glm-5.1: + score: 0.79 + source: "LongBench v2" + dated: "2026-06-01" diff --git a/crates/hermesllm/src/clients/endpoints.rs b/crates/hermesllm/src/clients/endpoints.rs index d7a9b4719..7b7136295 100644 --- a/crates/hermesllm/src/clients/endpoints.rs +++ b/crates/hermesllm/src/clients/endpoints.rs @@ -61,12 +61,18 @@ impl SupportedAPIsFromClient { /// Create a SupportedApi from an endpoint path pub fn from_endpoint(endpoint: &str) -> Option { if let Some(openai_api) = OpenAIApi::from_endpoint(endpoint) { - // Check if this is the Responses API endpoint - if openai_api == OpenAIApi::Responses { - return Some(SupportedAPIsFromClient::OpenAIResponsesAPI(openai_api)); + match openai_api { + OpenAIApi::Responses => { + return Some(SupportedAPIsFromClient::OpenAIResponsesAPI(openai_api)); + } + OpenAIApi::ChatCompletions => { + return Some(SupportedAPIsFromClient::OpenAIChatCompletions(openai_api)); + } + // Image-generation / audio-speech are identified for routing + // (capability gating + binary passthrough) but are not parsed as + // chat-style client request bodies, so they don't resolve here. + OpenAIApi::Images | OpenAIApi::Audio => {} } - // Otherwise it's ChatCompletions - return Some(SupportedAPIsFromClient::OpenAIChatCompletions(openai_api)); } if let Some(anthropic_api) = AnthropicApi::from_endpoint(endpoint) { @@ -247,8 +253,13 @@ impl SupportedUpstreamAPIs { pub fn supported_endpoints() -> Vec<&'static str> { let mut endpoints = Vec::new(); - // Add all OpenAI endpoints + // Add OpenAI client-routable endpoints. Image-generation / audio-speech + // endpoints are recognized for capability routing but not yet advertised as + // chat-style client endpoints (handled via native/binary passthrough). for api in OpenAIApi::all_variants() { + if matches!(api, OpenAIApi::Images | OpenAIApi::Audio) { + continue; + } endpoints.push(api.endpoint()); } @@ -310,9 +321,11 @@ mod tests { fn test_endpoints_generated_from_api_definitions() { let endpoints = supported_endpoints(); - // Verify that we get endpoints from all API variants + // Verify that we get endpoints from all client-routable API variants. + // Image-generation / audio-speech are excluded (not chat-style clients). let openai_endpoints: Vec<_> = OpenAIApi::all_variants() .iter() + .filter(|api| !matches!(api, OpenAIApi::Images | OpenAIApi::Audio)) .map(|api| api.endpoint()) .collect(); let anthropic_endpoints: Vec<_> = AnthropicApi::all_variants() @@ -337,10 +350,14 @@ mod tests { endpoint ); } - // Total should match + // Total should match the client-routable variants (excludes Images/Audio). + let openai_client_routable = OpenAIApi::all_variants() + .iter() + .filter(|api| !matches!(api, OpenAIApi::Images | OpenAIApi::Audio)) + .count(); assert_eq!( endpoints.len(), - OpenAIApi::all_variants().len() + AnthropicApi::all_variants().len() + openai_client_routable + AnthropicApi::all_variants().len() ); } diff --git a/crates/hermesllm/src/lib.rs b/crates/hermesllm/src/lib.rs index 3b9611e00..6323361ca 100644 --- a/crates/hermesllm/src/lib.rs +++ b/crates/hermesllm/src/lib.rs @@ -9,7 +9,13 @@ pub mod transforms; pub use apis::streaming_shapes::amazon_bedrock_binary_frame::BedrockBinaryFrameDecoder; pub use apis::streaming_shapes::sse::{SseEvent, SseStreamIter}; pub use aws_smithy_eventstream::frame::DecodedFrame; +pub use providers::capabilities::{ + CapabilitiesCatalog, CapabilitiesSnapshot, ModelCapabilities, RequiredCapabilities, +}; pub use providers::id::ProviderId; +pub use providers::long_context_quality::{ + score_for as long_context_quality_score, LongContextQualityDataset, +}; pub use providers::request::{ProviderRequest, ProviderRequestError, ProviderRequestType}; pub use providers::response::{ ProviderResponse, ProviderResponseError, ProviderResponseType, TokenUsage, @@ -20,6 +26,8 @@ pub use providers::streaming_response::{ProviderStreamResponse, ProviderStreamRe pub const CHAT_COMPLETIONS_PATH: &str = "/v1/chat/completions"; pub const OPENAI_RESPONSES_API_PATH: &str = "/v1/responses"; pub const MESSAGES_PATH: &str = "/v1/messages"; +pub const IMAGES_GENERATIONS_PATH: &str = "/v1/images/generations"; +pub const AUDIO_SPEECH_PATH: &str = "/v1/audio/speech"; #[cfg(test)] mod tests { diff --git a/crates/hermesllm/src/providers/capabilities.rs b/crates/hermesllm/src/providers/capabilities.rs new file mode 100644 index 000000000..26f73a293 --- /dev/null +++ b/crates/hermesllm/src/providers/capabilities.rs @@ -0,0 +1,421 @@ +//! Model capability metadata (Tier 1 routing). +//! +//! Capabilities are objective, stable properties of a model: which modalities it +//! accepts/produces and how large a context window it supports. They are sourced +//! at runtime from [models.dev](https://models.dev) (fetched by brightstaff's +//! `ModelCapabilitiesService`, mirroring how DigitalOcean pricing is fetched) and +//! can be overridden per-model by user config. This module owns only the snapshot +//! parsing and the canonical lookup — no data is vendored into the binary. +//! +//! Precedence (applied by the caller, e.g. brightstaff's `ModelCapabilitiesService`): +//! `user config capabilities > models.dev > conservative default`. + +use crate::providers::id::ProviderId; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + +/// Objective, stable per-model capabilities. All fields are optional so that a +/// user-config override can be merged field-by-field over the models.dev default. +#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct ModelCapabilities { + /// Maximum input context window (tokens). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub context_window: Option, + /// Maximum output tokens the model will emit. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub max_output_tokens: Option, + /// Accepts image input (vision). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub supports_vision: Option, + /// Produces images (`/v1/images/generations`). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub supports_image_generation: Option, + /// Produces audio (`/v1/audio/speech`). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub supports_audio_out: Option, +} + +impl ModelCapabilities { + /// Resolve vision support, defaulting to text-only (`false`) when unknown. + pub fn vision(&self) -> bool { + self.supports_vision.unwrap_or(false) + } + + /// Resolve image-generation support, defaulting to `false` when unknown. + pub fn image_generation(&self) -> bool { + self.supports_image_generation.unwrap_or(false) + } + + /// Resolve audio-out support, defaulting to `false` when unknown. + pub fn audio_out(&self) -> bool { + self.supports_audio_out.unwrap_or(false) + } + + /// Known context window, treating `0`/absent as "unknown" (no constraint). + pub fn window(&self) -> Option { + self.context_window.filter(|&w| w > 0) + } + + /// Fill any `None` field on `self` from `fallback`. Used to apply precedence: + /// `user.fill_from(models_dev)` keeps user-set fields and backfills the rest. + pub fn fill_from(&self, fallback: &ModelCapabilities) -> ModelCapabilities { + ModelCapabilities { + context_window: self.context_window.or(fallback.context_window), + max_output_tokens: self.max_output_tokens.or(fallback.max_output_tokens), + supports_vision: self.supports_vision.or(fallback.supports_vision), + supports_image_generation: self + .supports_image_generation + .or(fallback.supports_image_generation), + supports_audio_out: self.supports_audio_out.or(fallback.supports_audio_out), + } + } +} + +/// The capability requirements implied by a single request's shape (Tier 1). +/// Computed from the endpoint + request content; checked against each candidate +/// model's [`ModelCapabilities`]. +#[derive(Debug, Clone, Default, PartialEq, Eq)] +pub struct RequiredCapabilities { + /// Request carries image input (vision). + pub vision: bool, + /// Request targets image generation (`/v1/images/generations`). + pub image_out: bool, + /// Request targets audio/TTS output (`/v1/audio/speech`). + pub audio_out: bool, + /// Minimum input context window required (estimated token count). + pub min_context_tokens: usize, +} + +impl RequiredCapabilities { + /// Derive the modality requirements implied by an endpoint path. Vision and + /// context-token requirements are request-content-derived and set separately. + pub fn for_endpoint(path: &str) -> Self { + RequiredCapabilities { + image_out: path.contains("/images/generations"), + audio_out: path.contains("/audio/speech"), + ..Default::default() + } + } + + /// True when this request imposes no capability constraints (plain text chat + /// that fits any window) — lets callers skip filtering entirely. + pub fn is_unconstrained(&self) -> bool { + !self.vision && !self.image_out && !self.audio_out && self.min_context_tokens == 0 + } + + /// Whether a model with the given capabilities can serve this request. + /// Unknown context windows are treated permissively (conservative default): + /// we only eliminate a model when we can *prove* the window is too small. + pub fn satisfied_by(&self, caps: &ModelCapabilities) -> bool { + if self.vision && !caps.vision() { + return false; + } + if self.image_out && !caps.image_generation() { + return false; + } + if self.audio_out && !caps.audio_out() { + return false; + } + if self.min_context_tokens > 0 { + if let Some(window) = caps.window() { + if self.min_context_tokens as u64 > window as u64 { + return false; + } + } + } + true + } + + /// Human-readable description of the unmet requirement(s) for error messages. + pub fn describe(&self) -> String { + let mut parts = Vec::new(); + if self.vision { + parts.push("vision input".to_string()); + } + if self.image_out { + parts.push("image generation".to_string()); + } + if self.audio_out { + parts.push("audio output".to_string()); + } + if self.min_context_tokens > 0 { + parts.push(format!( + "context window >= {} tokens", + self.min_context_tokens + )); + } + if parts.is_empty() { + "no special capabilities".to_string() + } else { + parts.join(", ") + } + } +} + +/// On-disk shape of the vendored snapshot / models.dev refresh output. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct CapabilitiesSnapshot { + #[serde(default)] + pub version: String, + #[serde(default)] + pub source: String, + #[serde(default)] + pub generated: String, + /// Keyed by canonical `"/"` (see [`ProviderId::canonical_key`]). + #[serde(default)] + pub models: HashMap, +} + +impl CapabilitiesSnapshot { + /// Parse a pre-built (canonical-keyed) snapshot from JSON bytes, i.e. the + /// `{ "models": { "/": { ... } } }` shape this module emits. + pub fn from_json_slice(bytes: &[u8]) -> Result { + serde_json::from_slice(bytes) + } + + /// Build a snapshot directly from a raw `models.dev` `api.json` payload, + /// applying the provider-key alias map and modality/limit mapping. Used by + /// the brightstaff runtime fetch/refresh. + pub fn from_models_dev_json(bytes: &[u8]) -> Result { + let providers: HashMap = serde_json::from_slice(bytes)?; + let mut models = HashMap::new(); + for (pkey, provider) in providers { + let Some(canon) = models_dev_provider_to_canonical(&pkey) else { + continue; // unmapped / aggregator provider + }; + for (mid, m) in provider.models { + models.insert(format!("{}/{}", canon, mid), m.into_capabilities()); + } + } + Ok(CapabilitiesSnapshot { + version: "1.0".to_string(), + source: "models.dev".to_string(), + generated: String::new(), + models, + }) + } +} + +/// models.dev provider key -> Plano canonical provider token (matches +/// [`ProviderId::canonical_key`]). Aggregator/unmapped keys return `None`. +pub fn models_dev_provider_to_canonical(provider_key: &str) -> Option<&'static str> { + Some(match provider_key { + "openai" => "openai", + "anthropic" => "anthropic", + "google" => "gemini", + "mistral" => "mistral", + "groq" => "groq", + "xai" => "xai", + "deepseek" => "deepseek", + "moonshotai" => "moonshotai", + "zhipuai" => "zhipu", + "xiaomi" => "xiaomi", + "togetherai" => "together_ai", + "amazon-bedrock" => "amazon_bedrock", + "digitalocean" => "digitalocean", + "openrouter" => "openrouter", + "vercel" => "vercel", + "github-models" => "github", + "alibaba" => "qwen", + _ => return None, + }) +} + +/// Raw models.dev per-provider shape (only the fields we map). +#[derive(Debug, Deserialize)] +struct ModelsDevProvider { + #[serde(default)] + models: HashMap, +} + +#[derive(Debug, Deserialize)] +struct ModelsDevModel { + #[serde(default)] + modalities: ModelsDevModalities, + #[serde(default)] + limit: ModelsDevLimit, +} + +#[derive(Debug, Default, Deserialize)] +struct ModelsDevModalities { + #[serde(default)] + input: Vec, + #[serde(default)] + output: Vec, +} + +#[derive(Debug, Default, Deserialize)] +struct ModelsDevLimit { + #[serde(default)] + context: Option, + #[serde(default)] + output: Option, +} + +impl ModelsDevModel { + fn into_capabilities(self) -> ModelCapabilities { + ModelCapabilities { + context_window: self.limit.context, + max_output_tokens: self.limit.output, + supports_vision: Some(self.modalities.input.iter().any(|s| s == "image")), + supports_image_generation: Some(self.modalities.output.iter().any(|s| s == "image")), + supports_audio_out: Some(self.modalities.output.iter().any(|s| s == "audio")), + } + } +} + +/// In-memory capability catalog keyed by canonical `"/"`. +#[derive(Debug, Clone, Default)] +pub struct CapabilitiesCatalog { + models: HashMap, +} + +impl CapabilitiesCatalog { + pub fn new(models: HashMap) -> Self { + Self { models } + } + + pub fn from_snapshot(snapshot: CapabilitiesSnapshot) -> Self { + Self::new(snapshot.models) + } + + /// Number of models in the catalog. + pub fn len(&self) -> usize { + self.models.len() + } + + pub fn is_empty(&self) -> bool { + self.models.is_empty() + } + + /// Look up capabilities for a `"/"` string, normalizing + /// the provider token to its canonical form. Returns `None` when the provider + /// is unknown or the model is absent from the catalog. + pub fn get(&self, model: &str) -> Option<&ModelCapabilities> { + let key = canonical_model_key(model)?; + self.models.get(&key) + } +} + +/// Normalize a `"/"` string into the canonical catalog key +/// `"/"`. Splits on the first `/` so model ids that +/// themselves contain `/` (e.g. `meta-llama/Llama-3.3-70B`) are preserved. +pub fn canonical_model_key(model: &str) -> Option { + let (provider, model_id) = model.split_once('/')?; + let canonical = ProviderId::try_from(provider).ok()?.canonical_key(); + Some(format!("{}/{}", canonical, model_id)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn canonical_key_normalizes_provider_aliases() { + // google -> gemini, x-ai handled via try_from + assert_eq!( + canonical_model_key("google/gemini-2.5-pro").as_deref(), + Some("gemini/gemini-2.5-pro") + ); + assert_eq!( + canonical_model_key("openai/gpt-4o").as_deref(), + Some("openai/gpt-4o") + ); + // model id containing a slash is preserved + assert_eq!( + canonical_model_key("together_ai/meta-llama/Llama-3.3-70B").as_deref(), + Some("together_ai/meta-llama/Llama-3.3-70B") + ); + // unknown provider -> None + assert!(canonical_model_key("notaprovider/foo").is_none()); + // no provider prefix -> None + assert!(canonical_model_key("gpt-4o").is_none()); + } + + #[test] + fn catalog_resolves_known_model_and_defaults_unknown() { + // A catalog built from a models.dev payload resolves known models by + // canonical key; absent models fall back to the conservative default. + let raw = br#"{ + "openai": { "models": { + "gpt-4o": { + "modalities": { "input": ["text","image"], "output": ["text"] }, + "limit": { "context": 128000, "output": 16384 } + } + }} + }"#; + let snapshot = CapabilitiesSnapshot::from_models_dev_json(raw).unwrap(); + let catalog = CapabilitiesCatalog::from_snapshot(snapshot); + + let caps = catalog.get("openai/gpt-4o").cloned().unwrap_or_default(); + assert!(caps.vision()); + assert!(!caps.image_generation()); + assert_eq!(caps.window(), Some(128000)); + + // Unknown model -> conservative default (text-only, unknown window). + let missing = catalog + .get("openai/totally-made-up-model") + .cloned() + .unwrap_or_default(); + assert!(!missing.vision()); + assert!(!missing.audio_out()); + assert_eq!(missing.window(), None); + } + + #[test] + fn models_dev_raw_json_maps_to_canonical_capabilities() { + let raw = br#"{ + "google": { "models": { + "gemini-2.5-pro": { + "modalities": { "input": ["text","image"], "output": ["text"] }, + "limit": { "context": 1048576, "output": 65536 } + } + }}, + "requesty": { "models": { + "openai/gpt-4o": { "modalities": { "input": ["text"], "output": ["text"] } } + }} + }"#; + let snapshot = CapabilitiesSnapshot::from_models_dev_json(raw).unwrap(); + // google -> gemini canonical key + let caps = snapshot.models.get("gemini/gemini-2.5-pro").unwrap(); + assert_eq!(caps.supports_vision, Some(true)); + assert_eq!(caps.context_window, Some(1048576)); + assert_eq!(caps.max_output_tokens, Some(65536)); + // aggregator provider "requesty" is skipped + assert!(snapshot.models.keys().all(|k| !k.contains("requesty"))); + } + + #[test] + fn provider_alias_map_matches_canonical_keys() { + assert_eq!(models_dev_provider_to_canonical("google"), Some("gemini")); + assert_eq!( + models_dev_provider_to_canonical("amazon-bedrock"), + Some("amazon_bedrock") + ); + assert_eq!(models_dev_provider_to_canonical("requesty"), None); + // Alias targets must be valid canonical provider tokens (round-trip). + for key in ["google", "amazon-bedrock", "togetherai", "github-models"] { + let canon = models_dev_provider_to_canonical(key).unwrap(); + let provider = ProviderId::try_from(canon).expect("canonical token must parse"); + assert_eq!(provider.canonical_key(), canon); + } + } + + #[test] + fn fill_from_applies_precedence() { + let user = ModelCapabilities { + context_window: Some(128000), + ..Default::default() + }; + let models_dev = ModelCapabilities { + context_window: Some(200000), + supports_vision: Some(true), + ..Default::default() + }; + let resolved = user.fill_from(&models_dev); + // user override wins for context_window + assert_eq!(resolved.context_window, Some(128000)); + // models.dev backfills vision + assert_eq!(resolved.supports_vision, Some(true)); + } +} diff --git a/crates/hermesllm/src/providers/id.rs b/crates/hermesllm/src/providers/id.rs index 91b744de1..ff0e32e06 100644 --- a/crates/hermesllm/src/providers/id.rs +++ b/crates/hermesllm/src/providers/id.rs @@ -91,6 +91,38 @@ impl TryFrom<&str> for ProviderId { } impl ProviderId { + /// Stable lowercase token used as the canonical provider key for capability + /// lookups (see `providers::capabilities`). Must round-trip through + /// `ProviderId::try_from`, and the capability snapshot generator maps + /// models.dev provider keys to these same tokens. + pub fn canonical_key(&self) -> &'static str { + match self { + ProviderId::OpenAI => "openai", + ProviderId::Xiaomi => "xiaomi", + ProviderId::Mistral => "mistral", + ProviderId::Deepseek => "deepseek", + ProviderId::Groq => "groq", + ProviderId::Gemini => "gemini", + ProviderId::Anthropic => "anthropic", + ProviderId::GitHub => "github", + ProviderId::Plano => "plano", + ProviderId::AzureOpenAI => "azure_openai", + ProviderId::XAI => "xai", + ProviderId::TogetherAI => "together_ai", + ProviderId::Ollama => "ollama", + ProviderId::Moonshotai => "moonshotai", + ProviderId::Zhipu => "zhipu", + ProviderId::Qwen => "qwen", + ProviderId::AmazonBedrock => "amazon_bedrock", + ProviderId::ChatGPT => "chatgpt", + ProviderId::DigitalOcean => "digitalocean", + ProviderId::Vercel => "vercel", + ProviderId::OpenRouter => "openrouter", + ProviderId::Astraflow => "astraflow", + ProviderId::AstraflowCN => "astraflow_cn", + } + } + /// Get all available models for this provider /// Returns model names without the provider prefix (e.g., "gpt-4" not "openai/gpt-4") pub fn models(&self) -> Vec { diff --git a/crates/hermesllm/src/providers/long_context_quality.rs b/crates/hermesllm/src/providers/long_context_quality.rs new file mode 100644 index 000000000..73fae1cc9 --- /dev/null +++ b/crates/hermesllm/src/providers/long_context_quality.rs @@ -0,0 +1,123 @@ +//! Internal long-context-quality (LCQ) dataset — the v1 Tier 2 routing signal. +//! +//! LCQ scores estimate how well a model stays coherent at long context lengths. +//! They are **Plano-internal** data seeded from public benchmarks (RULER, HELMET, +//! LongBench v2, NoLiMa), loaded from a vendored YAML file. They are intentionally +//! kept separate from the capability catalog: capabilities are stable, LCQ drifts. +//! +//! Used by brightstaff's `rank_models` when `selection_policy.prefer = long_context_quality`. + +use crate::providers::capabilities::canonical_model_key; +use serde::Deserialize; +use std::collections::HashMap; +use std::sync::OnceLock; + +static LONG_CONTEXT_QUALITY_YAML: &str = include_str!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/src/bin/long_context_quality.yaml" +)); + +/// One model's LCQ entry, carrying provenance. +#[derive(Debug, Clone, Deserialize)] +pub struct LcqEntry { + pub score: f64, + #[serde(default)] + pub source: String, + #[serde(default)] + pub dated: String, +} + +#[derive(Debug, Clone, Deserialize)] +struct LcqFile { + #[serde(default)] + version: String, + #[serde(default)] + source: String, + #[serde(default)] + dated: String, + #[serde(default)] + models: HashMap, +} + +/// The internal LCQ dataset, parsed once. +#[derive(Debug, Clone)] +pub struct LongContextQualityDataset { + pub version: String, + pub source: String, + /// Provenance date of the dataset (used for staleness telemetry). + pub dated: String, + models: HashMap, +} + +impl LongContextQualityDataset { + fn parse(yaml: &str) -> Self { + let file: LcqFile = serde_yaml::from_str(yaml).expect("parse long_context_quality.yaml"); + LongContextQualityDataset { + version: file.version, + source: file.source, + dated: file.dated, + models: file.models, + } + } + + /// Score for a `"/"` string, normalizing the provider + /// token so config aliases resolve. Returns `None` when not benchmarked. + pub fn score_for(&self, model: &str) -> Option { + // Try the canonical key first, then the raw string as a fallback. + if let Some(key) = canonical_model_key(model) { + if let Some(entry) = self.models.get(&key) { + return Some(entry.score); + } + } + self.models.get(model).map(|e| e.score) + } + + pub fn len(&self) -> usize { + self.models.len() + } + + pub fn is_empty(&self) -> bool { + self.models.is_empty() + } +} + +/// The vendored LCQ dataset, parsed once and shared. +pub fn dataset() -> &'static LongContextQualityDataset { + static DATA: OnceLock = OnceLock::new(); + DATA.get_or_init(|| LongContextQualityDataset::parse(LONG_CONTEXT_QUALITY_YAML)) +} + +/// Convenience: LCQ score for a model from the vendored dataset. +pub fn score_for(model: &str) -> Option { + dataset().score_for(model) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn dataset_loads_with_provenance() { + let ds = dataset(); + assert!(!ds.is_empty()); + assert!( + !ds.dated.is_empty(), + "dataset should carry a provenance date" + ); + assert!(!ds.source.is_empty()); + } + + #[test] + fn scores_resolve_with_alias_normalization() { + // google/ alias normalizes to canonical gemini/ + let via_alias = score_for("google/gemini-2.5-pro"); + let via_canonical = score_for("gemini/gemini-2.5-pro"); + assert!(via_alias.is_some()); + assert_eq!(via_alias, via_canonical); + } + + #[test] + fn unknown_model_has_no_score() { + assert!(score_for("openai/no-such-model").is_none()); + } +} diff --git a/crates/hermesllm/src/providers/mod.rs b/crates/hermesllm/src/providers/mod.rs index 4343022f4..b09d5b1e5 100644 --- a/crates/hermesllm/src/providers/mod.rs +++ b/crates/hermesllm/src/providers/mod.rs @@ -3,12 +3,20 @@ //! This module contains provider-specific implementations that handle //! request/response conversion for different LLM service APIs. //! +pub mod capabilities; pub mod id; +pub mod long_context_quality; pub mod request; pub mod response; pub mod streaming_response; +pub use capabilities::{ + CapabilitiesCatalog, CapabilitiesSnapshot, ModelCapabilities, RequiredCapabilities, +}; pub use id::ProviderId; +pub use long_context_quality::{ + score_for as long_context_quality_score, LongContextQualityDataset, +}; pub use request::{ProviderRequest, ProviderRequestError, ProviderRequestType}; pub use response::{ProviderResponse, ProviderResponseType, TokenUsage}; pub use streaming_response::{ProviderStreamResponse, ProviderStreamResponseType}; diff --git a/crates/hermesllm/src/providers/request.rs b/crates/hermesllm/src/providers/request.rs index bcc0eafd2..75bdd3359 100644 --- a/crates/hermesllm/src/providers/request.rs +++ b/crates/hermesllm/src/providers/request.rs @@ -58,6 +58,25 @@ pub trait ProviderRequest: Send + Sync { /// Set message history from OpenAI Message format /// This converts OpenAI messages to the appropriate format for each provider type fn set_messages(&mut self, messages: &[crate::apis::openai::Message]); + + /// Request-shape introspection: does any message carry image input (vision)? + /// Drives Tier 1 capability filtering (`supports_vision`). + fn has_vision(&self) -> bool { + use crate::apis::openai::{ContentPart, MessageContent}; + self.get_messages().iter().any(|m| match &m.content { + Some(MessageContent::Parts(parts)) => parts + .iter() + .any(|p| matches!(p, ContentPart::ImageUrl { .. })), + _ => false, + }) + } + + /// Approximate input token count for context-window filtering (Tier 1). + /// Uses a coarse ~4-chars-per-token heuristic over message text; precise + /// tokenization is unnecessary for a "fits the window" hard gate. + fn required_context_tokens(&self) -> usize { + self.extract_messages_text().chars().count() / 4 + } } impl ProviderRequestType { diff --git a/crates/hermesllm/src/providers/response.rs b/crates/hermesllm/src/providers/response.rs index b8565ddf3..b2383e4a1 100644 --- a/crates/hermesllm/src/providers/response.rs +++ b/crates/hermesllm/src/providers/response.rs @@ -37,6 +37,16 @@ pub trait TokenUsage { fn reasoning_tokens(&self) -> Option { None } + /// Number of generated images (billable unit for `/v1/images/generations`). + /// `None` for text/audio responses. + fn image_units(&self) -> Option { + None + } + /// Number of synthesized audio units — characters or seconds depending on the + /// provider (billable unit for `/v1/audio/speech`). `None` for non-audio. + fn audio_units(&self) -> Option { + None + } } /// Rich usage breakdown extracted from a provider response. diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs index fa9964dd2..41e731c57 100644 --- a/crates/llm_gateway/src/stream_context.rs +++ b/crates/llm_gateway/src/stream_context.rs @@ -56,6 +56,9 @@ pub struct StreamContext { http_protocol: Option, sse_buffer: Option, sse_chunk_processor: Option, + /// True when the upstream response is binary (e.g. `audio/*` from + /// `/v1/audio/speech`) and must be passed through without JSON parsing. + response_is_binary: bool, } impl StreamContext { @@ -87,6 +90,7 @@ impl StreamContext { http_protocol: None, sse_buffer: None, sse_chunk_processor: None, + response_is_binary: false, } } @@ -1152,6 +1156,27 @@ impl HttpContext for StreamContext { } } + // Detect binary (non-JSON) response bodies — e.g. audio from + // /v1/audio/speech. These must be passed through untouched: the gateway + // otherwise JSON-parses and re-serializes every non-streaming response, + // which would corrupt binary audio. Keep content-length intact for these. + let content_type = self + .get_http_response_header("content-type") + .unwrap_or_default() + .to_ascii_lowercase(); + self.response_is_binary = content_type.starts_with("audio/") + || content_type.starts_with("image/") + || content_type == "application/octet-stream"; + + if self.response_is_binary { + debug!( + "request_id={}: binary response (content-type={}), passing through untouched", + self.request_identifier(), + content_type + ); + return Action::Continue; + } + self.remove_http_response_header("content-length"); self.remove_http_response_header("content-encoding"); @@ -1183,6 +1208,20 @@ impl HttpContext for StreamContext { return Action::Continue; } + // Binary responses (e.g. audio/* from /v1/audio/speech) are passed + // through without JSON parsing or token accounting (G3 / WS8). + if self.response_is_binary { + debug!( + "request_id={}: binary response body passthrough, body_size={}", + self.request_identifier(), + body_size + ); + if end_of_stream { + self.handle_end_of_request_metrics_and_traces(current_time); + } + return Action::Continue; + } + // Check if this is an error response from upstream if let Some(status_code) = &self.upstream_status_code { if status_code.is_client_error() || status_code.is_server_error() { diff --git a/docs/source/resources/includes/plano_config_full_reference.yaml b/docs/source/resources/includes/plano_config_full_reference.yaml index 2231a01f9..20f17d7ed 100644 --- a/docs/source/resources/includes/plano_config_full_reference.yaml +++ b/docs/source/resources/includes/plano_config_full_reference.yaml @@ -55,6 +55,32 @@ model_providers: headers: User-Agent: "KimiCLI/1.3" + # Multimodal models — capabilities (vision / image-out / audio-out / context + # window) are fetched at runtime from models.dev (like cost/latency metrics), + # so you normally declare NOTHING. Add an optional `capabilities:` block only to + # override models.dev or to describe a model models.dev doesn't have. + # Precedence: capabilities block > models.dev > conservative default. + - model: openai/gpt-image-1 # serves /v1/images/generations + access_key: $OPENAI_API_KEY + + - model: openai/gpt-4o-mini-tts # serves /v1/audio/speech (TTS) + access_key: $OPENAI_API_KEY + + # Override example: models.dev reports 200k, but this deployment is pinned to 128k. + - model: anthropic/claude-opus-4-1-128k + access_key: $ANTHROPIC_API_KEY + capabilities: + context_window: 128000 + + # Custom/self-hosted model models.dev doesn't know — declare its capabilities. + - model: openai/llama-3.3-70b-vision + base_url: https://api.custom-provider.com + http_host: api.custom-provider.com + access_key: $CUSTOM_API_KEY + capabilities: + context_window: 128000 + supports_vision: true + # Model aliases - use friendly names instead of full provider model names model_aliases: fast-llm: @@ -69,8 +95,14 @@ model_aliases: # uses models[0] as primary and retries with models[1], models[2]... on 429/5xx. # Requires overrides.llm_routing_model to point at Plano-Orchestrator (or equivalent). # Each model in `models` must be declared in model_providers above. -# selection_policy is optional: {prefer: cheapest|fastest|none} lets the router -# reorder candidates using live cost/latency data from model_metrics_sources. +# selection_policy is optional: {prefer: cheapest|fastest|long_context_quality|none} +# lets the router reorder candidates. cheapest/fastest use live cost/latency data +# from model_metrics_sources; long_context_quality ranks by Plano's internal +# benchmark-seeded long-context-quality score (supplied internally, not user-authored). +# +# Tier 1 capability filters (modality + context-window fit) apply AUTOMATICALLY from +# the request shape: each preset's `models` pool is narrowed to the models that can +# physically serve the request before selection_policy ranks the survivors. routing_preferences: - name: code generation description: generating new code snippets, functions, or boilerplate based on user prompts or requirements @@ -86,6 +118,36 @@ routing_preferences: selection_policy: prefer: cheapest + # long-context preset — ranks the window-sufficient pool by long-context quality. + - name: long document analysis + description: summarizing or querying very large documents and codebases + models: + - anthropic/claude-opus-4-1-128k + - openai/gpt-4o + selection_policy: + prefer: long_context_quality + + # vision preset — pool auto-filtered to vision-capable models (Tier 1). + - name: image understanding + description: answering questions about uploaded images, screenshots, or diagrams + models: + - anthropic/claude-sonnet-4-0 + - openai/gpt-4o + + # image-generation preset — serves /v1/images/generations (Tier 1). + - name: image generation + description: creating or editing images from a text prompt + models: + - openai/gpt-image-1 + selection_policy: + prefer: cheapest + + # TTS preset — serves /v1/audio/speech (Tier 1). + - name: text to speech + description: synthesizing spoken audio from text + models: + - openai/gpt-4o-mini-tts + # HTTP listeners - entry points for agent routing, prompt targets, and direct LLM access listeners: # Agent listener for routing requests to multiple agents @@ -198,6 +260,17 @@ overrides: # Disable agentic signal analysis (frustration, repetition, escalation, etc.) # on LLM responses to save CPU. Default: false. disable_signals: false + # Tier 1 capability filtering is a hard gate. When it removes every model in a + # matched route, "error" returns HTTP 422 (default); "warning" logs and proceeds + # with the developer's pre-filter pool. This is the only lever that lets + # routing_preferences win over capability. + empty_pool_behavior: error + # Optional — model capabilities are fetched at runtime from models.dev. Set this + # only to change the URL or refresh cadence; omit refresh_interval to fetch once + # at startup (no periodic refresh). + model_capabilities_source: + url: https://models.dev/api.json + refresh_interval: 86400 # seconds (daily) # Model affinity — pin routing decisions for agentic loops routing: