Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions app/control/account/invalid_credentials.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,7 @@ async def mark_account_invalid_credentials(
"expired_reason": reason,
},
)
]
)
])
logger.info(
"account expired from {}: token={}... status={} upstream_status={}",
source,
Expand All @@ -72,6 +71,12 @@ def feedback_kind_for_error(exc: BaseException | None) -> FeedbackKind:
status = getattr(exc, "status", 0)
if status == 429:
return FeedbackKind.RATE_LIMITED
if status == 402:
# console.x.ai returns 402 when the account has exhausted its
# prepaid web_search/credit balance. Treat it like a rate limit
# so the account pool routes around this token until credits
# refresh or the operator tops up the balance.
return FeedbackKind.RATE_LIMITED
if status == 401:
return FeedbackKind.UNAUTHORIZED
if status == 403:
Expand Down
48 changes: 19 additions & 29 deletions app/control/account/state_machine.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ class StatePolicy:

_DEFAULT_POLICY = StatePolicy()


# ---------------------------------------------------------------------------
# Feedback
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -66,7 +65,9 @@ def from_status_code(
kind = FeedbackKind.UNAUTHORIZED
elif status_code == 403:
kind = FeedbackKind.FORBIDDEN
elif status_code == 429:
elif status_code == 429 or status_code == 402:
# 402 from console.x.ai = account credits exhausted; treat as a
# rate-limited token so the pool routes around it.
kind = FeedbackKind.RATE_LIMITED
elif status_code >= 500:
kind = FeedbackKind.SERVER_ERROR
Expand Down Expand Up @@ -111,9 +112,7 @@ def derive_status(record: AccountRecord, *, now: int | None = None) -> AccountSt
return AccountStatus.COOLING


def is_selectable(
record: AccountRecord, mode_id: int, *, now: int | None = None
) -> bool:
def is_selectable(record: AccountRecord, mode_id: int, *, now: int | None = None) -> bool:
"""Return True if the account can be selected for *mode_id*."""
if record.is_deleted():
return False
Expand Down Expand Up @@ -185,10 +184,7 @@ def apply_feedback(
win = qs.get(feedback.mode_id)
if win is not None:
reset_at = (
ts + feedback.retry_after_ms
if feedback.retry_after_ms
else (ts + win.window_seconds * 1000)
)
ts + feedback.retry_after_ms if feedback.retry_after_ms else (ts + win.window_seconds * 1000))
qs.set(
feedback.mode_id,
QuotaWindow(
Expand All @@ -206,10 +202,10 @@ def apply_feedback(
use_count += 1
last_use_at = ts
elif feedback.kind not in (
FeedbackKind.SUCCESS,
FeedbackKind.RESTORE,
FeedbackKind.DISABLE,
FeedbackKind.DELETE,
FeedbackKind.SUCCESS,
FeedbackKind.RESTORE,
FeedbackKind.DISABLE,
FeedbackKind.DELETE,
):
fail_count += 1
last_fail_at = ts
Expand All @@ -236,11 +232,7 @@ def apply_feedback(
ext[_DISABLED_REASON_KEY] = state_reason

elif feedback.kind == FeedbackKind.RATE_LIMITED:
cooldown_ms = (
feedback.retry_after_ms
if feedback.retry_after_ms
else policy.default_cooling_ms
)
cooldown_ms = (feedback.retry_after_ms if feedback.retry_after_ms else policy.default_cooling_ms)
status = AccountStatus.COOLING
state_reason = feedback.reason or "rate_limited"
ext[_COOLDOWN_UNTIL_KEY] = ts + cooldown_ms
Expand Down Expand Up @@ -292,21 +284,20 @@ def apply_feedback(
"state_reason": state_reason,
"ext": ext,
"updated_at": ts,
}
)
})


def clear_failures(record: AccountRecord) -> AccountRecord:
"""Reset failure counters and restore ACTIVE status."""
ext = dict(record.ext)
for k in (
_COOLDOWN_UNTIL_KEY,
_COOLDOWN_REASON_KEY,
_DISABLED_AT_KEY,
_DISABLED_REASON_KEY,
_EXPIRED_AT_KEY,
_EXPIRED_REASON_KEY,
_FORBIDDEN_STRIKE_KEY,
_COOLDOWN_UNTIL_KEY,
_COOLDOWN_REASON_KEY,
_DISABLED_AT_KEY,
_DISABLED_REASON_KEY,
_EXPIRED_AT_KEY,
_EXPIRED_REASON_KEY,
_FORBIDDEN_STRIKE_KEY,
):
ext.pop(k, None)
return record.model_copy(
Expand All @@ -318,8 +309,7 @@ def clear_failures(record: AccountRecord) -> AccountRecord:
"state_reason": None,
"ext": ext,
"updated_at": now_ms(),
}
)
})


__all__ = [
Expand Down
18 changes: 17 additions & 1 deletion app/control/model/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,23 @@
# Super+ (the basic pool does not support this mode)
ModelSpec("grok-4.3-beta", ModeId.GROK_4_3, Tier.SUPER, Capability.CHAT, True, "Grok 4.3 Beta"),

# === Console API (console.x.ai/v1/responses) ============================
# Calls console.x.ai directly with the SSO cookie, so basic accounts can use every model.
# Rate limiting is enforced by console.x.ai (free tier: 1 rps / 60 RPM).
# Hybrid reasoning models default to effort="high" so callers that omit
# reasoning_effort still get the "think hard" experience the model name
# implies. Pass an explicit value (e.g. "minimal") to override.
ModelSpec("grok-4.3", ModeId.FAST, Tier.BASIC, Capability.CHAT, True, "Grok 4.3 (Console)", console_model="grok-4.3", default_reasoning_effort="high"),
ModelSpec("grok-4", ModeId.FAST, Tier.BASIC, Capability.CHAT, True, "Grok 4 (Console)", console_model="grok-4", default_reasoning_effort="high"),
ModelSpec("grok-4.20", ModeId.FAST, Tier.BASIC, Capability.CHAT, True, "Grok 4.20 (Console)", console_model="grok-4.20", default_reasoning_effort="high"),
# Fixed-intensity reasoning model — upstream rejects reasoning.effort.
ModelSpec("grok-4.20-reasoning", ModeId.FAST, Tier.BASIC, Capability.CHAT, True, "Grok 4.20 Reasoning (Console)", console_model="grok-4.20-0309-reasoning"),
# Non-reasoning model — effort is not applicable.
ModelSpec("grok-4.20-non-reasoning", ModeId.FAST, Tier.BASIC, Capability.CHAT, True, "Grok 4.20 Non-Reasoning (Console)", console_model="grok-4.20-0309-non-reasoning"),
# Multi-agent — left default; effort behaviour with this variant has not
# been verified, so we don't auto-inject "high" to avoid surprising 400s.
ModelSpec("grok-4.20-multi-agent", ModeId.FAST, Tier.BASIC, Capability.CHAT, True, "Grok 4.20 Multi-Agent (Console)", console_model="grok-4.20-multi-agent-0309"),

# === Image ==============================================================

# Basic fast
Expand Down Expand Up @@ -66,7 +83,6 @@
for _m in MODELS:
_BY_CAP.setdefault(int(_m.capability), []).append(_m)


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
Expand Down
27 changes: 25 additions & 2 deletions app/control/model/spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,23 @@ class ModelSpec:
``public_name`` is the human-readable display name.
``prefer_best`` when True, reverses pool priority to try higher-tier
pools first (hard priority, not soft preference).
``console_model`` when non-empty, route this model through the
``console.x.ai/v1/responses`` endpoint instead of the
``grok.com`` web chat API. The string is the actual
model ID sent to console.x.ai (e.g. ``"grok-4.3"``).
SSO cookies from grok.com work for both endpoints,
so basic-tier accounts can access all models this way.
``default_reasoning_effort`` when non-empty, this value is forwarded as
``reasoning.effort`` to the console upstream when the
caller doesn't specify ``reasoning_effort`` themselves.
Use ``"high"`` for hybrid reasoning models the user
expects to "think hard by default" (grok-4, grok-4.3,
grok-4.20). Leave empty for models that either don't
support the effort field (grok-4.20-reasoning is fixed
intensity; the upstream rejects effort with HTTP 400)
or don't reason at all (grok-4.20-non-reasoning).
Only consulted when ``console_model`` is set; ignored
on the legacy grok.com path.
"""

model_name: str
Expand All @@ -29,6 +46,8 @@ class ModelSpec:
enabled: bool
public_name: str
prefer_best: bool = False
console_model: str = ""
default_reasoning_effort: str = ""

# --- convenience predicates ---

Expand All @@ -47,6 +66,10 @@ def is_video(self) -> bool:
def is_voice(self) -> bool:
    """Return True when the VOICE bit is set in this spec's capability mask."""
    voice_bits = self.capability & Capability.VOICE
    return bool(voice_bits)

def is_console(self) -> bool:
    """Tell whether this spec is served via console.x.ai.

    A spec counts as a console model exactly when ``console_model`` is
    truthy (i.e. a non-empty upstream model ID has been configured).
    """
    return True if self.console_model else False

def pool_name(self) -> str:
"""Return the canonical pool string for this tier."""
if self.tier == Tier.SUPER:
Expand Down Expand Up @@ -80,15 +103,15 @@ def pool_candidates(self) -> tuple[int, ...]:
"""
if self.prefer_best:
if self.tier == Tier.HEAVY:
return (2,) # heavy only
return (2, ) # heavy only
if self.tier == Tier.SUPER:
return (2, 1) # heavy, super
return (2, 1, 0) # heavy, super, basic
if self.tier == Tier.BASIC:
return (0, 1, 2) # basic, super, heavy
if self.tier == Tier.SUPER:
return (1, 2) # super, heavy
return (2,) # heavy only
return (2, ) # heavy only


__all__ = ["ModelSpec"]
49 changes: 36 additions & 13 deletions app/dataplane/reverse/planner.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,41 @@

from app.control.model.spec import ModelSpec
from app.dataplane.reverse.runtime.endpoint_table import (
CHAT, MEDIA_POST, WS_IMAGINE,
CHAT,
CONSOLE_RESPONSES,
MEDIA_POST,
WS_IMAGINE,
)
from .types import ReversePlan, TransportKind


# ---------------------------------------------------------------------------
# Profile defaults (timeout / content-type per transport)
# ---------------------------------------------------------------------------

_DEFAULTS: dict[TransportKind, dict[str, Any]] = {
TransportKind.HTTP_SSE: {"timeout_s": 120.0, "content_type": "application/json"},
TransportKind.HTTP_JSON: {"timeout_s": 30.0, "content_type": "application/json"},
TransportKind.WEBSOCKET: {"timeout_s": 300.0, "content_type": "application/json"},
TransportKind.GRPC_WEB: {"timeout_s": 15.0, "content_type": "application/grpc-web+proto"},
TransportKind.HTTP_SSE: {
"timeout_s": 120.0,
"content_type": "application/json"
},
TransportKind.HTTP_JSON: {
"timeout_s": 30.0,
"content_type": "application/json"
},
TransportKind.WEBSOCKET: {
"timeout_s": 300.0,
"content_type": "application/json"
},
TransportKind.GRPC_WEB: {
"timeout_s": 15.0,
"content_type": "application/grpc-web+proto"
},
}


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------


def build_plan(spec: ModelSpec, request: dict[str, Any] | None = None) -> ReversePlan:
"""Produce a ReversePlan for the given model spec.

Expand All @@ -39,25 +53,34 @@ def build_plan(spec: ModelSpec, request: dict[str, Any] | None = None) -> Revers
defaults = _DEFAULTS.get(tkind, _DEFAULTS[TransportKind.HTTP_JSON])

return ReversePlan(
endpoint = endpoint,
transport_kind = tkind,
pool_candidates = spec.pool_candidates(),
mode_id = int(spec.mode_id),
timeout_s = defaults["timeout_s"],
content_type = defaults["content_type"],
endpoint=endpoint,
transport_kind=tkind,
pool_candidates=spec.pool_candidates(),
mode_id=int(spec.mode_id),
timeout_s=defaults["timeout_s"],
content_type=defaults["content_type"],
)


# ---------------------------------------------------------------------------
# Internal routing logic
# ---------------------------------------------------------------------------


def _resolve_endpoint(
spec: ModelSpec,
request: dict[str, Any],
) -> tuple[str, TransportKind]:
"""Determine (endpoint_url, transport_kind) for the given capability."""

# Console models route through console.x.ai/v1/responses (OpenAI Responses API)
if spec.is_console() and spec.is_chat():
# When stream=true the response is SSE; otherwise it is plain JSON.
# Use the HTTP_SSE profile for both cases: the HTTP_SSE and HTTP_JSON
# profiles share the same content-type (application/json), and the SSE
# profile's longer timeout accommodates long-running responses.
return CONSOLE_RESPONSES, TransportKind.HTTP_SSE

if spec.is_chat():
return CHAT, TransportKind.HTTP_SSE

Expand Down
Loading