diff --git a/cli/planoai/obs/pricing.py b/cli/planoai/obs/pricing.py index 6f2ce5b4a..0a8b7321c 100644 --- a/cli/planoai/obs/pricing.py +++ b/cli/planoai/obs/pricing.py @@ -1,7 +1,8 @@ -"""DigitalOcean Gradient pricing catalog for the obs console. +"""Model pricing catalog for the obs console. -Ported loosely from ``crates/brightstaff/src/router/model_metrics.rs::fetch_do_pricing``. -Single-source: one fetch at startup, cached for the life of the process. +Mirrors ``crates/brightstaff/src/router/model_metrics.rs``. The source is +configurable: ``digitalocean`` (DO GenAI catalog) or ``models.dev``. A single +fetch at startup is cached for the life of the process. """ from __future__ import annotations @@ -14,7 +15,18 @@ import requests -DEFAULT_PRICING_URL = "https://api.digitalocean.com/v2/gen-ai/models/catalog" +DO_PRICING_URL = "https://api.digitalocean.com/v2/gen-ai/models/catalog" +MODELS_DEV_URL = "https://models.dev/api.json" + +# Backwards-compatible default (DigitalOcean) used when no provider is given. +DEFAULT_PRICING_URL = DO_PRICING_URL +DEFAULT_PRICING_PROVIDER = "digitalocean" + +_DEFAULT_URLS = { + "digitalocean": DO_PRICING_URL, + "models.dev": MODELS_DEV_URL, +} + FETCH_TIMEOUT_SECS = 5.0 @@ -51,36 +63,52 @@ def sample_models(self, n: int = 5) -> list[str]: return list(self._prices.keys())[:n] @classmethod - def fetch(cls, url: str = DEFAULT_PRICING_URL) -> "PricingCatalog": - """Fetch pricing from DO's catalog endpoint. On failure, returns an + def fetch( + cls, + provider: str = DEFAULT_PRICING_PROVIDER, + url: str | None = None, + ) -> "PricingCatalog": + """Fetch pricing from the configured catalog. On failure, returns an empty catalog (cost column will be blank). - The catalog endpoint is public — no auth required, no signup — so - ``planoai obs`` gets cost data on first run out of the box. + ``provider`` selects the parser/default URL: ``digitalocean`` or + ``models.dev``. 
def _parse_models_dev_pricing(data: Any) -> dict[str, ModelPrice]:
    """Parse a models.dev ``api.json`` response into a ModelPrice map.

    models.dev shape (top-level object keyed by provider id)::

        {
          "anthropic": {
            "models": {
              "claude-opus-4-5": {
                "cost": {"input": 5, "output": 25, "cache_read": 0.5}
              }
            }
          },
          ...
        }

    ``cost.*`` values are USD per *million* tokens, so they are divided by 1e6
    to get a per-token rate. First-party providers use bare model keys, so
    each priced model is registered under ``provider/model`` (matching Plano's
    routing names) and, as a fallback, under the bare model id. Lower-cased
    variants of both keys are registered as well.

    Args:
        data: Decoded JSON body of the catalog. Anything that is not a dict
            yields an empty result.

    Returns:
        Mapping of lookup key to ``ModelPrice``. An entry is skipped when its
        shape is unexpected, when the input or output rate is missing,
        non-numeric, non-finite or negative, or when both rates are zero.
        This function never raises on malformed catalog data, so the caller's
        best-effort fetch cannot be broken by a bad entry.
    """
    prices: dict[str, ModelPrice] = {}
    if not isinstance(data, dict):
        return prices

    for provider_id, provider in data.items():
        models = provider.get("models") if isinstance(provider, dict) else None
        if not isinstance(models, dict):
            continue
        for model_key, model in models.items():
            cost = model.get("cost") if isinstance(model, dict) else None
            if not isinstance(cost, dict):
                continue
            input_pm = _as_float(cost.get("input"))
            output_pm = _as_float(cost.get("output"))
            if input_pm is None or output_pm is None:
                continue
            # A negative rate is catalog corruption; pricing it would render
            # negative costs, so treat the model as unpriced instead.
            if input_pm < 0 or output_pm < 0:
                continue
            # Skip 0-rate entries so cost falls back to `—` rather than $0.0000.
            if input_pm == 0 and output_pm == 0:
                continue
            cached_pm = _as_float(cost.get("cache_read"))
            if cached_pm is not None and cached_pm < 0:
                # The cached rate is optional: drop a bad value, keep the model.
                cached_pm = None
            price = ModelPrice(
                input_per_token_usd=input_pm / 1_000_000,
                output_per_token_usd=output_pm / 1_000_000,
                cached_input_per_token_usd=(
                    cached_pm / 1_000_000 if cached_pm is not None else None
                ),
            )
            composite = f"{provider_id}/{model_key}"
            # The exact composite key always wins; the remaining keys are
            # fallbacks, so the first provider to claim one keeps it.
            prices[composite] = price
            prices.setdefault(composite.lower(), price)
            prices.setdefault(str(model_key), price)
            prices.setdefault(str(model_key).lower(), price)
    return prices


def _as_float(value: Any) -> float | None:
    """Coerce a catalog value to a finite float, or return ``None``.

    ``None``, booleans, values ``float()`` cannot convert, integers too large
    for a float, and NaN / infinity all map to ``None`` so the caller can
    treat them uniformly as "no usable rate".
    """
    import math  # function-scope import keeps this helper self-contained

    # bool is an int subclass; ``True`` must not silently become a 1.0 rate.
    if value is None or isinstance(value, bool):
        return None
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: JSON integers are unbounded, floats are not.
        return None
    # Python's JSON decoder accepts NaN/Infinity literals; reject them here so
    # they cannot slip past the zero-rate check and poison cost arithmetic.
    return number if math.isfinite(number) else None
+ """ + provider = DEFAULT_PRICING_PROVIDER + url: str | None = None + + config_path = find_config_file(file=config_file) + if config_path and os.path.exists(config_path): + try: + with open(config_path, "r") as f: + config = yaml.safe_load(f) or {} + sources = config.get("model_metrics_sources") or [] + for source in sources: + if isinstance(source, dict) and source.get("type") == "cost": + if source.get("provider"): + provider = str(source["provider"]) + if source.get("url"): + url = str(source["url"]) + break + except Exception as exc: # noqa: BLE001 — config is optional for obs + logger.warning( + "could not read pricing source from %s: %s", config_path, exc + ) + + if provider_override: + provider = provider_override + if url_override: + url = url_override + + return provider, url @click.command(name="obs", help="Live observability console for Plano LLM traffic.") @@ -48,13 +93,42 @@ show_default=True, help="TUI refresh interval.", ) -def obs(port: int, host: str, capacity: int, refresh_ms: int) -> None: +@click.option( + "--config", + "config_file", + type=str, + default=None, + help="Path to the Plano config to read the pricing source from " + "(defaults to ./config.yaml or ./plano_config.yaml).", +) +@click.option( + "--pricing-provider", + type=click.Choice(["digitalocean", "models.dev"]), + default=None, + help="Override the cost pricing provider (otherwise read from config).", +) +@click.option( + "--pricing-url", + type=str, + default=None, + help="Override the pricing catalog URL (otherwise read from config / provider default).", +) +def obs( + port: int, + host: str, + capacity: int, + refresh_ms: int, + config_file: str | None, + pricing_provider: str | None, + pricing_url: str | None, +) -> None: console = Console() + provider, url = _resolve_pricing_source(config_file, pricing_provider, pricing_url) console.print( - f"[bold {PLANO_COLOR}]planoai obs[/] — loading DO pricing catalog...", + f"[bold {PLANO_COLOR}]planoai obs[/] — loading {provider} 
pricing catalog...", end="", ) - pricing = PricingCatalog.fetch() + pricing = PricingCatalog.fetch(provider=provider, url=url) if len(pricing): sample = ", ".join(pricing.sample_models(3)) console.print( @@ -63,7 +137,7 @@ def obs(port: int, host: str, capacity: int, refresh_ms: int) -> None: else: console.print( " [yellow]no pricing loaded[/] — " - "[dim]cost column will be blank (DO catalog unreachable)[/]" + f"[dim]cost column will be blank ({provider} catalog unreachable)[/]" ) store = LLMCallStore(capacity=capacity) diff --git a/cli/test/test_obs_pricing.py b/cli/test/test_obs_pricing.py index 02247d3dd..322607a9e 100644 --- a/cli/test/test_obs_pricing.py +++ b/cli/test/test_obs_pricing.py @@ -144,3 +144,68 @@ def test_parse_do_catalog_divides_large_values_as_per_million(): prices = _parse_do_pricing(sample) assert prices["mystery-model"].input_per_token_usd == 5.0 / 1_000_000 assert prices["mystery-model"].output_per_token_usd == 15.0 / 1_000_000 + + +_MODELS_DEV_SAMPLE = { + "anthropic": { + "id": "anthropic", + "models": { + "claude-opus-4-5": { + "id": "claude-opus-4-5", + "cost": {"input": 5, "output": 25, "cache_read": 0.5}, + } + }, + }, + "groq": { + "id": "groq", + "models": { + "llama-3.3-70b-versatile": { + "id": "llama-3.3-70b-versatile", + "cost": {"input": 0.59, "output": 0.79}, + }, + # No cost block → skipped. + "whisper-large-v3-turbo": {"id": "whisper-large-v3-turbo"}, + }, + }, +} + + +def test_parse_models_dev_composes_provider_keys_and_per_token_rates(): + from planoai.obs.pricing import _parse_models_dev_pricing + + prices = _parse_models_dev_pricing(_MODELS_DEV_SAMPLE) + + # models.dev cost values are per-million → divided by 1e6. + opus = prices["anthropic/claude-opus-4-5"] + assert opus.input_per_token_usd == 5 / 1_000_000 + assert opus.output_per_token_usd == 25 / 1_000_000 + assert opus.cached_input_per_token_usd == 0.5 / 1_000_000 + + # Composite provider/model keys match Plano's routing names. 
+ assert "groq/llama-3.3-70b-versatile" in prices + # Bare model id registered as a fallback. + assert "llama-3.3-70b-versatile" in prices + # Models without a cost block are skipped. + assert "groq/whisper-large-v3-turbo" not in prices + + +def test_models_dev_catalog_cost_computation(): + from planoai.obs.pricing import PricingCatalog, _parse_models_dev_pricing + + catalog = PricingCatalog(_parse_models_dev_pricing(_MODELS_DEV_SAMPLE)) + # 1000 input @ 5e-6 = 0.005; 500 output @ 25e-6 = 0.0125 + cost = catalog.cost_for_call(_call("anthropic/claude-opus-4-5", 1000, 500)) + assert cost == round(0.005 + 0.0125, 6) + + +def test_models_dev_skips_zero_rate_entries(): + from planoai.obs.pricing import _parse_models_dev_pricing + + sample = { + "free": { + "models": { + "promo-model": {"cost": {"input": 0, "output": 0}}, + } + } + } + assert _parse_models_dev_pricing(sample) == {} diff --git a/config/plano_config_schema.yaml b/config/plano_config_schema.yaml index 2ecf38921..5e4afc77d 100644 --- a/config/plano_config_schema.yaml +++ b/config/plano_config_schema.yaml @@ -582,13 +582,17 @@ properties: type: string enum: - digitalocean + - models.dev + url: + type: string + description: "Optional override for the pricing catalog endpoint. Defaults per provider (digitalocean: DO GenAI catalog; models.dev: https://models.dev/api.json)." refresh_interval: type: integer minimum: 1 description: "Refresh interval in seconds" model_aliases: type: object - description: "Map DO catalog keys (lowercase(creator)/model_id) to Plano model names used in routing_preferences. Example: 'openai/openai-gpt-oss-120b: openai/gpt-4o'" + description: "Map catalog keys to Plano model names used in routing_preferences. DigitalOcean keys are 'lowercase(creator)/model_id'; models.dev keys are 'creator/model_id'. 
Example: 'openai/openai-gpt-oss-120b: openai/gpt-4o'" additionalProperties: type: string required: diff --git a/crates/brightstaff/src/router/model_metrics.rs b/crates/brightstaff/src/router/model_metrics.rs index 1adb408d8..a4b3df48b 100644 --- a/crates/brightstaff/src/router/model_metrics.rs +++ b/crates/brightstaff/src/router/model_metrics.rs @@ -9,6 +9,7 @@ use tokio::sync::RwLock; use tracing::{debug, info, warn}; const DO_PRICING_URL: &str = "https://api.digitalocean.com/v2/gen-ai/models/catalog"; +const MODELS_DEV_URL: &str = "https://models.dev/api.json"; pub struct ModelMetricsService { cost: Arc>>, @@ -22,28 +23,35 @@ impl ModelMetricsService { for source in sources { match source { - MetricsSource::Cost(cfg) => match cfg.provider { - CostProvider::Digitalocean => { - let aliases = cfg.model_aliases.clone().unwrap_or_default(); - let data = fetch_do_pricing(&client, &aliases).await; - info!(models = data.len(), "fetched digitalocean pricing"); - *cost_data.write().await = data; - - if let Some(interval_secs) = cfg.refresh_interval { - let cost_clone = Arc::clone(&cost_data); - let client_clone = client.clone(); - let interval = Duration::from_secs(interval_secs); - tokio::spawn(async move { - loop { - tokio::time::sleep(interval).await; - let data = fetch_do_pricing(&client_clone, &aliases).await; - info!(models = data.len(), "refreshed digitalocean pricing"); - *cost_clone.write().await = data; - } - }); - } + MetricsSource::Cost(cfg) => { + let provider = cfg.provider.clone(); + let url = cfg + .url + .clone() + .unwrap_or_else(|| default_cost_url(&provider).to_string()); + let aliases = cfg.model_aliases.clone().unwrap_or_default(); + let provider_name = cost_provider_name(&provider); + + let data = fetch_cost_pricing(&provider, &url, &client, &aliases).await; + info!(models = data.len(), provider = provider_name, url = %url, "fetched cost pricing"); + *cost_data.write().await = data; + + if let Some(interval_secs) = cfg.refresh_interval { + let 
cost_clone = Arc::clone(&cost_data); + let client_clone = client.clone(); + let interval = Duration::from_secs(interval_secs); + tokio::spawn(async move { + loop { + tokio::time::sleep(interval).await; + let data = + fetch_cost_pricing(&provider, &url, &client_clone, &aliases) + .await; + info!(models = data.len(), provider = provider_name, url = %url, "refreshed cost pricing"); + *cost_clone.write().await = data; + } + }); } - }, + } MetricsSource::Latency(cfg) => match cfg.provider { LatencyProvider::Prometheus => { let data = fetch_prometheus_metrics(&cfg.url, &cfg.query, &client).await; @@ -165,11 +173,55 @@ struct DoPricing { output_price_per_million: Option, } +#[derive(serde::Deserialize)] +struct ModelsDevProvider { + #[serde(default)] + models: HashMap, +} + +#[derive(serde::Deserialize)] +struct ModelsDevModel { + cost: Option, +} + +#[derive(serde::Deserialize)] +struct ModelsDevCost { + input: Option, + output: Option, +} + +fn default_cost_url(provider: &CostProvider) -> &'static str { + match provider { + CostProvider::Digitalocean => DO_PRICING_URL, + CostProvider::ModelsDev => MODELS_DEV_URL, + } +} + +fn cost_provider_name(provider: &CostProvider) -> &'static str { + match provider { + CostProvider::Digitalocean => "digitalocean", + CostProvider::ModelsDev => "models.dev", + } +} + +async fn fetch_cost_pricing( + provider: &CostProvider, + url: &str, + client: &reqwest::Client, + aliases: &HashMap, +) -> HashMap { + match provider { + CostProvider::Digitalocean => fetch_do_pricing(url, client, aliases).await, + CostProvider::ModelsDev => fetch_models_dev_pricing(url, client, aliases).await, + } +} + async fn fetch_do_pricing( + url: &str, client: &reqwest::Client, aliases: &HashMap, ) -> HashMap { - match client.get(DO_PRICING_URL).send().await { + match client.get(url).send().await { Ok(resp) => match resp.json::().await { Ok(list) => list .data @@ -184,17 +236,66 @@ async fn fetch_do_pricing( }) .collect(), Err(err) => { - warn!(error = %err, url 
= DO_PRICING_URL, "failed to parse digitalocean pricing response"); + warn!(error = %err, url = %url, "failed to parse digitalocean pricing response"); HashMap::new() } }, Err(err) => { - warn!(error = %err, url = DO_PRICING_URL, "failed to fetch digitalocean pricing"); + warn!(error = %err, url = %url, "failed to fetch digitalocean pricing"); HashMap::new() } } } +/// models.dev publishes a top-level object keyed by provider id; each provider +/// carries a `models` map whose keys are `creator/model` ids and whose `cost` +/// block holds per-million USD rates. We sum input + output (mirroring the DO +/// ranking metric) and key the result by `creator/model_id` so it lines up with +/// Plano's `provider/model` routing names. +async fn fetch_models_dev_pricing( + url: &str, + client: &reqwest::Client, + aliases: &HashMap, +) -> HashMap { + match client.get(url).send().await { + Ok(resp) => match resp.json::>().await { + Ok(providers) => parse_models_dev_pricing(providers, aliases), + Err(err) => { + warn!(error = %err, url = %url, "failed to parse models.dev pricing response"); + HashMap::new() + } + }, + Err(err) => { + warn!(error = %err, url = %url, "failed to fetch models.dev pricing"); + HashMap::new() + } + } +} + +fn parse_models_dev_pricing( + providers: HashMap, + aliases: &HashMap, +) -> HashMap { + let mut out = HashMap::new(); + for (provider_id, provider) in providers { + for (model_key, model) in provider.models { + let Some(cost) = model.cost else { continue }; + let (Some(input), Some(output)) = (cost.input, cost.output) else { + continue; + }; + // First-party providers use bare model keys (`claude-opus-4-5`), + // so compose `provider/model` to line up with Plano routing names. + let raw_key = format!("{provider_id}/{model_key}"); + let total = input + output; + let key = aliases.get(&raw_key).cloned().unwrap_or(raw_key); + out.insert(key, total); + // Also register the bare model id as a fallback lookup. 
+ out.entry(model_key).or_insert(total); + } + } + out +} + #[derive(serde::Deserialize)] struct PrometheusResponse { data: PrometheusData, @@ -368,6 +469,50 @@ mod tests { assert_eq!(result, vec!["gpt-4o", "gpt-4o-mini"]); } + #[test] + fn test_parse_models_dev_pricing_composes_provider_keys() { + let json = r#"{ + "anthropic": { + "models": { + "claude-opus-4-5": {"cost": {"input": 5.0, "output": 25.0}} + } + }, + "groq": { + "models": { + "llama-3.3-70b-versatile": {"cost": {"input": 0.59, "output": 0.79}}, + "whisper-large-v3-turbo": {"cost": null} + } + } + }"#; + let providers: HashMap = serde_json::from_str(json).unwrap(); + let aliases = HashMap::new(); + let prices = parse_models_dev_pricing(providers, &aliases); + + assert_eq!(prices.get("anthropic/claude-opus-4-5"), Some(&30.0)); + assert_eq!(prices.get("groq/llama-3.3-70b-versatile"), Some(&1.38)); + // bare fallback also registered + assert_eq!(prices.get("claude-opus-4-5"), Some(&30.0)); + // models with no cost block are skipped + assert!(!prices.contains_key("groq/whisper-large-v3-turbo")); + } + + #[test] + fn test_parse_models_dev_pricing_applies_aliases() { + let json = r#"{ + "openai": {"models": {"gpt-oss-120b": {"cost": {"input": 1.0, "output": 2.0}}}} + }"#; + let providers: HashMap = serde_json::from_str(json).unwrap(); + let mut aliases = HashMap::new(); + aliases.insert( + "openai/gpt-oss-120b".to_string(), + "openai/gpt-4o".to_string(), + ); + let prices = parse_models_dev_pricing(providers, &aliases); + + assert_eq!(prices.get("openai/gpt-4o"), Some(&3.0)); + assert!(!prices.contains_key("openai/gpt-oss-120b")); + } + #[test] fn test_rank_by_ascending_metric_nan_treated_as_missing() { let models = vec![ diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs index 8aa521fa5..924c9b03c 100644 --- a/crates/common/src/configuration.rs +++ b/crates/common/src/configuration.rs @@ -177,8 +177,13 @@ pub enum MetricsSource { #[derive(Debug, Clone, Serialize, 
Deserialize)] pub struct CostMetricsConfig { pub provider: CostProvider, + /// Optional override for the pricing catalog endpoint. When omitted, a + /// sensible default is used per provider. + pub url: Option, pub refresh_interval: Option, - /// Map DO catalog keys (`lowercase(creator)/model_id`) to Plano model names. + /// Map catalog keys to Plano model names used in `routing_preferences`. + /// DigitalOcean keys look like `lowercase(creator)/model_id`; models.dev + /// keys look like `creator/model_id`. /// Example: `openai/openai-gpt-oss-120b: openai/gpt-4o` pub model_aliases: Option>, } @@ -187,6 +192,8 @@ pub struct CostMetricsConfig { #[serde(rename_all = "snake_case")] pub enum CostProvider { Digitalocean, + #[serde(rename = "models.dev")] + ModelsDev, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -741,6 +748,51 @@ mod test { } } + #[test] + fn test_deserialize_models_dev_cost_source() { + let yaml = r#" +- type: cost + provider: models.dev + url: https://models.dev/api.json + refresh_interval: 3600 + model_aliases: + openai/gpt-oss-120b: openai/gpt-4o +"#; + let sources: Vec = serde_yaml::from_str(yaml).unwrap(); + assert_eq!(sources.len(), 1); + match &sources[0] { + super::MetricsSource::Cost(cfg) => { + assert!(matches!(cfg.provider, super::CostProvider::ModelsDev)); + assert_eq!(cfg.url.as_deref(), Some("https://models.dev/api.json")); + assert_eq!(cfg.refresh_interval, Some(3600)); + assert_eq!( + cfg.model_aliases + .as_ref() + .and_then(|m| m.get("openai/gpt-oss-120b")) + .map(String::as_str), + Some("openai/gpt-4o") + ); + } + other => panic!("expected cost source, got {other:?}"), + } + } + + #[test] + fn test_deserialize_digitalocean_cost_source_without_url() { + let yaml = r#" +- type: cost + provider: digitalocean +"#; + let sources: Vec = serde_yaml::from_str(yaml).unwrap(); + match &sources[0] { + super::MetricsSource::Cost(cfg) => { + assert!(matches!(cfg.provider, super::CostProvider::Digitalocean)); + assert_eq!(cfg.url, None); + } 
+ other => panic!("expected cost source, got {other:?}"), + } + } + #[test] fn test_into_models_filters_internal_providers() { let providers = vec![ diff --git a/docs/source/guides/llm_router.rst b/docs/source/guides/llm_router.rst index b66c01f25..422e3a494 100644 --- a/docs/source/guides/llm_router.rst +++ b/docs/source/guides/llm_router.rst @@ -209,6 +209,178 @@ Clients can let the router decide or still specify aliases: ) +.. _cost_latency_aware_selection: + +Cost- and latency-aware selection +--------------------------------- + +When a route lists more than one candidate model, you can let Plano reorder that +candidate pool using **live cost or latency data** instead of relying solely on the +order you wrote them in. This is controlled per route with ``selection_policy`` and +backed by one or more ``model_metrics_sources``. + +This is useful when several models are equally capable for a route and you want Plano +to always reach for the cheapest (or fastest) option first, with the others kept as +fallbacks. + +Selection policy +~~~~~~~~~~~~~~~~~ + +Attach an optional ``selection_policy`` to any entry in ``routing_preferences``: + +.. code-block:: yaml + :caption: Per-route selection policy + + routing_preferences: + - name: code review + description: reviewing, analyzing, and suggesting improvements to existing code + models: + - anthropic/claude-sonnet-4-5 + - groq/llama-3.3-70b-versatile + selection_policy: + prefer: cheapest # cheapest | fastest | none + +``prefer`` accepts: + +- ``cheapest`` — order candidates by total price (input + output rate) ascending, using a ``cost`` metrics source. +- ``fastest`` — order candidates by observed latency ascending, using a ``latency`` metrics source. +- ``none`` (default) — keep the order you declared; no reordering. + +Models that have no data in the selected source are ranked **last**, in their original +order, so routing always degrades gracefully rather than dropping a candidate. 
+ +Configuring the pricing source +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``cheapest`` routing needs a price catalog. Plano's **default pricing provider is +DigitalOcean** — its GenAI model catalog is public (no API key, no signup), so cost data +is available out of the box and is what ``planoai obs`` uses if you don't configure +anything. The pricing source is fully swappable: point Plano at `models.dev <https://models.dev>`_, +or at **any endpoint that exposes a supported pricing structure**. + +The ``provider`` field selects which response schema Plano expects (and therefore how it +parses the catalog); the optional ``url`` lets you override the endpoint — for example to +use a mirror, a cached copy, or an internal catalog service that returns the same shape. + +.. list-table:: + :header-rows: 1 + :widths: 18 34 28 20 + + * - ``provider`` + - Default catalog URL + - Key format + - Expected structure + * - ``digitalocean`` *(default)* + - DigitalOcean GenAI model catalog + - ``lowercase(creator)/model_id`` + - ``{ data: [ { model_id, pricing: { input_price_per_million, output_price_per_million } } ] }`` + * - ``models.dev`` + - ``https://models.dev/api.json`` + - ``creator/model`` (e.g. ``anthropic/claude-sonnet-4-5``) + - ``{ <provider>: { models: { <model>: { cost: { input, output } } } } }`` + +Because the source is selected per ``provider``, switching is a one-line change. To stay +on the default DigitalOcean catalog you can omit ``model_metrics_sources`` entirely for +``planoai obs``, or declare it explicitly for routing: + +.. code-block:: yaml + :caption: Default cost source (DigitalOcean) + + model_metrics_sources: + - type: cost + provider: digitalocean # default; uses the public DO GenAI catalog + +To switch to models.dev — an open, community-maintained catalog covering a broad range of +providers and models — change the ``provider`` (and optionally ``url``): + +.. 
code-block:: yaml + :caption: Cost source backed by models.dev + + model_metrics_sources: + - type: cost + provider: models.dev # models.dev | digitalocean + url: https://models.dev/api.json # optional; defaults per provider + refresh_interval: 3600 # optional, seconds; refetch on this interval + model_aliases: # optional; see below + openai/gpt-oss-120b: openai/gpt-4o + +To use your own endpoint, pick the ``provider`` whose structure your endpoint matches and +override ``url`` — Plano parses the response with that provider's schema: + +.. code-block:: yaml + :caption: Custom endpoint exposing the DigitalOcean catalog structure + + model_metrics_sources: + - type: cost + provider: digitalocean # selects the DO response schema + url: https://catalog.internal.example.com/pricing + +.. note:: + The cost metric used for ranking is the sum of the input and output per-million-token + rates — a relative signal for ordering candidates, not a per-request bill. For actual + per-request cost, see the observability console below. + +Matching catalog keys to your models +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The router looks up each candidate model by the exact name you use in +``routing_preferences`` (e.g. ``anthropic/claude-sonnet-4-5``). models.dev keys models as +``creator/model``, which lines up with Plano's ``provider/model`` naming, so most models +match automatically. + +When a catalog key does not match your model name — for example a version skew, or an +open-weight model you serve under a different provider — use ``model_aliases`` to map the +**catalog key** to the **Plano model name** used in your routing preferences: + +.. code-block:: yaml + + model_metrics_sources: + - type: cost + provider: models.dev + model_aliases: + # catalog key : plano model name + openai/gpt-oss-120b: openai/gpt-4o + +Latency source +~~~~~~~~~~~~~~~ + +``fastest`` routing reads observed latency from a Prometheus instance. 
Provide the query +that returns a per-model latency value (lower is faster), labelled by ``model_name``: + +.. code-block:: yaml + :caption: Latency source backed by Prometheus + + model_metrics_sources: + - type: latency + provider: prometheus + url: http://prometheus:9090 + query: avg by (model_name) (rate(plano_llm_latency_seconds_sum[5m])) + refresh_interval: 60 + +You can declare both a ``cost`` and a ``latency`` source at the same time; each route +picks whichever it needs based on its ``selection_policy``. + +Cost in the observability console +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``planoai obs`` displays a per-request USD cost column derived from the same pricing +catalog. By default it reads the ``cost`` source from your config (the first +``type: cost`` entry under ``model_metrics_sources``); you can also override it on the +command line: + +.. code-block:: bash + + # Use the cost source from ./config.yaml (default) + planoai obs + + # Or override the provider / endpoint explicitly + planoai obs --pricing-provider models.dev + planoai obs --pricing-url https://models.dev/api.json + +If no source is configured and no override is given, ``planoai obs`` falls back to the +DigitalOcean catalog so the cost column still populates out of the box. + + Plano-Orchestrator ------------------- Plano-Orchestrator is a **preference-based routing model** specifically designed to address the limitations of traditional LLM routing. It delivers production-ready performance with low latency and high accuracy while solving key routing challenges. 
diff --git a/docs/source/resources/includes/plano_config_full_reference.yaml b/docs/source/resources/includes/plano_config_full_reference.yaml index 2231a01f9..17c8161d9 100644 --- a/docs/source/resources/includes/plano_config_full_reference.yaml +++ b/docs/source/resources/includes/plano_config_full_reference.yaml @@ -86,6 +86,24 @@ routing_preferences: selection_policy: prefer: cheapest +# model_metrics_sources: external catalogs the router reads to reorder candidate +# models for selection_policy.prefer. A `cost` source ranks `prefer: cheapest`; +# a `latency` source ranks `prefer: fastest`. Both are optional. +model_metrics_sources: + # Cost catalog. provider: models.dev | digitalocean (default url per provider). + - type: cost + provider: models.dev + url: https://models.dev/api.json # optional; omit to use the provider default + refresh_interval: 3600 # optional, seconds + model_aliases: # optional: catalog key -> Plano model name + openai/gpt-oss-120b: openai/gpt-4o + # Latency catalog (Prometheus). Used for selection_policy.prefer: fastest. 
+ - type: latency + provider: prometheus + url: http://prometheus:9090 + query: avg by (model_name) (rate(plano_llm_latency_seconds_sum[5m])) + refresh_interval: 60 + # HTTP listeners - entry points for agent routing, prompt targets, and direct LLM access listeners: # Agent listener for routing requests to multiple agents diff --git a/docs/source/resources/includes/plano_config_full_reference_rendered.yaml b/docs/source/resources/includes/plano_config_full_reference_rendered.yaml index 3779dd739..3afa4404a 100644 --- a/docs/source/resources/includes/plano_config_full_reference_rendered.yaml +++ b/docs/source/resources/includes/plano_config_full_reference_rendered.yaml @@ -115,6 +115,18 @@ model_aliases: target: gpt-4o-mini smart-llm: target: gpt-4o +model_metrics_sources: +- model_aliases: + openai/gpt-oss-120b: openai/gpt-4o + provider: models.dev + refresh_interval: 3600 + type: cost + url: https://models.dev/api.json +- provider: prometheus + query: avg by (model_name) (rate(plano_llm_latency_seconds_sum[5m])) + refresh_interval: 60 + type: latency + url: http://prometheus:9090 model_providers: - access_key: $OPENAI_API_KEY default: true