llm_backend: auto-escalate tokens on length truncation

mcgrof · mcgrof · commit 061a608acd4a · 2026-04-08T17:50:32.000-07:00
The recurring "OpenAI returned empty content" failures were
all gpt-5.4 hitting finish_reason=length. Reasoning models
(gpt-5, o1/o3/o4) budget BOTH internal reasoning tokens AND
output tokens against max_completion_tokens. The default
16000 was too small: complex prompts burned the entire budget
thinking and produced empty output.

Fix it in two places.

First, _call_openai now auto-escalates the budget when a
reasoning model returns empty content with
finish_reason=length. The budget doubles on each retry, up to
a 128K hard cap, so a prompt that needs more headroom recovers
without operator intervention. Non-reasoning models, refusals,
content-filter hits, and other empty-content cases are NOT
retried — those are not recoverable by giving more tokens.

Second, llm_call now bumps the starting budget for reasoning
models to at least 32K. The auto-escalation handles further
growth, but starting at 32K avoids the wasted first attempt
that almost always hit length on the old 16K default.

Tests cover:
- Auto-escalation succeeds after length-truncated attempts,
  doubling the budget on each retry (16K -> 32K -> 64K)
- Escalation caps at 128K and surfaces a clear error once the
  cap is reached
- Legacy chat models do NOT auto-escalate (different budget)
- Content filter / refusal does NOT trigger escalation
- llm_call bumps default 16K to 32K for reasoning models
- Caller-provided values above 32K are respected as-is

This fixes the four submissions stuck in generation_failed
with the same RuntimeError, and prevents new ones from
accumulating in the same way.

Generated-by: Claude AI
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
diff --git a/llm_backend.py b/llm_backend.py
@@ -76,15 +76,24 @@ def llm_call(backend, model, prompt, temperature=0.4,
         model: Model name (backend-specific).
         prompt: User prompt string.
         temperature: Sampling temperature (ignored by claude-cli).
-        max_tokens: Maximum output tokens.
+        max_tokens: Maximum output tokens.  For OpenAI reasoning
+            models (gpt-5, o1/o3/o4) this is also the budget for
+            internal reasoning tokens, so the effective starting
+            budget is bumped to give the model headroom.
         json_mode: If True, parse response as JSON with repair logic.
                    If False, return raw text string.
     """
     btype = backend["type"]
 
     if btype == "openai":
+        # Reasoning models budget reasoning + output against the same
+        # ceiling. Start with at least 32K headroom; auto-escalation
+        # in _call_openai handles further growth on length-truncation.
+        effective = max_tokens
+        if _is_reasoning_model(model) and effective < 32000:
+            effective = 32000
         raw = _call_openai(backend["client"], model, prompt,
-                           temperature, max_tokens)
+                           temperature, effective)
     elif btype == "anthropic":
         raw = _call_anthropic(backend["client"], model, prompt,
                               temperature, max_tokens)
@@ -105,39 +114,89 @@ def llm_call(backend, model, prompt, temperature=0.4,
 # Backend implementations
 # ---------------------------------------------------------------------------
 
-def _call_openai(client, model, prompt, temperature, max_tokens):
+# Hard cap on max_completion_tokens auto-escalation. Reasoning models
+# can burn enormous budgets thinking; this prevents an unbounded retry
+# loop. 128K is the practical ceiling for current GPT-5 / o-family
+# models.
+_OPENAI_MAX_TOKENS_CAP = 128000
+
+# Reasoning model prefixes. These models budget BOTH internal reasoning
+# tokens AND output tokens against max_completion_tokens, so they need
+# substantially more headroom than legacy chat models.
+_REASONING_MODEL_PREFIXES = ("gpt-5", "o1", "o3", "o4")
+
+
+def _is_reasoning_model(model):
+    return (model or "").startswith(_REASONING_MODEL_PREFIXES)
+
+
+def _openai_create_once(client, model, prompt, temperature, max_tokens):
+    """Single OpenAI chat completion call. Returns the choice object."""
     kwargs = {
         "model": model,
         "messages": [{"role": "user", "content": prompt}],
     }
-
-    # Newer reasoning models are stricter about accepted request fields.
-    # Avoid sending temperature unless we have to, and prefer the newer
-    # completion-token field for GPT-5 / o* families.
-    if not (model or "").startswith(("gpt-5", "o1", "o3", "o4")):
+    if not _is_reasoning_model(model):
         kwargs["temperature"] = temperature
-
-    if (model or "").startswith(("gpt-5", "o1", "o3", "o4")):
-        kwargs["max_completion_tokens"] = max_tokens
-    else:
         kwargs["max_tokens"] = max_tokens
-
+    else:
+        kwargs["max_completion_tokens"] = max_tokens
     resp = client.chat.completions.create(**kwargs)
-    choice = resp.choices[0]
-    content = choice.message.content
-    finish = getattr(choice, "finish_reason", None)
-    refusal = getattr(choice.message, "refusal", None)
+    return resp.choices[0]
+
+
+def _call_openai(client, model, prompt, temperature, max_tokens):
+    """Call OpenAI with auto-escalation on length-truncation.
+
+    Reasoning models (gpt-5, o1/o3/o4) budget BOTH internal reasoning
+    tokens AND output tokens against max_completion_tokens. A complex
+    prompt can burn the entire budget thinking, leaving zero output
+    and finish_reason=length. When that happens we retry with double
+    the budget up to _OPENAI_MAX_TOKENS_CAP. This is the difference
+    between "the API failed" and "the model needs more headroom".
+    """
+    attempts = []
+    budget = max_tokens
+    is_reasoning = _is_reasoning_model(model)
+
+    while True:
+        choice = _openai_create_once(
+            client, model, prompt, temperature, budget)
+        content = choice.message.content
+        finish = getattr(choice, "finish_reason", None)
+        refusal = getattr(choice.message, "refusal", None)
+        attempts.append({
+            "budget": budget, "finish": finish,
+            "had_content": bool(content and content.strip()),
+        })
+
+        if content is not None and content.strip():
+            return content.strip()
+
+        # Empty content. Decide whether to escalate or give up.
+        # Only auto-escalate when the cause is length truncation on a
+        # reasoning model — that's the recoverable case. Refusals,
+        # content-filter hits, and stop-with-empty are not recoverable
+        # by retrying with more tokens.
+        if (is_reasoning and finish == "length"
+                and budget < _OPENAI_MAX_TOKENS_CAP):
+            new_budget = min(budget * 2, _OPENAI_MAX_TOKENS_CAP)
+            print(f"[LLM] OpenAI hit length on {model} with "
+                  f"max_completion_tokens={budget}; "
+                  f"retrying with {new_budget}", file=sys.stderr)
+            budget = new_budget
+            continue
 
-    if content is None or not content.strip():
         parts = [f"OpenAI returned empty content (model={model}"]
         if finish:
             parts.append(f"finish_reason={finish}")
         if refusal:
             parts.append(f"refusal={refusal}")
+        if len(attempts) > 1:
+            parts.append(f"attempts={len(attempts)}")
+            parts.append(f"final_budget={budget}")
         raise RuntimeError(", ".join(parts) + ")")
 
-    return content.strip()
-
 
 def _call_anthropic(client, model, prompt, temperature, max_tokens):
     resp = client.messages.create(
diff --git a/tests/test_llm_backend.py b/tests/test_llm_backend.py
@@ -42,15 +42,27 @@ def fake_import(name, *args, **kwargs):
     assert "requires the 'openai' Python package" in str(exc.value)
 
 
+def _ok_choice(content='{"ok": true}', finish="stop"):
+    return SimpleNamespace(
+        message=SimpleNamespace(content=content, refusal=None),
+        finish_reason=finish,
+    )
+
+
+def _empty_choice(finish="length", refusal=None):
+    return SimpleNamespace(
+        message=SimpleNamespace(content=None, refusal=refusal),
+        finish_reason=finish,
+    )
+
+
 def test_call_openai_uses_max_completion_tokens_for_gpt5_models():
     calls = []
 
     class _Create:
         def create(self, **kwargs):
             calls.append(kwargs)
-            return SimpleNamespace(
-                choices=[SimpleNamespace(message=SimpleNamespace(content='{"ok": true}'))]
-            )
+            return SimpleNamespace(choices=[_ok_choice()])
 
     client = SimpleNamespace(chat=SimpleNamespace(completions=_Create()))
 
@@ -69,9 +81,7 @@ def test_call_openai_uses_max_tokens_for_legacy_models():
     class _Create:
         def create(self, **kwargs):
             calls.append(kwargs)
-            return SimpleNamespace(
-                choices=[SimpleNamespace(message=SimpleNamespace(content='{"ok": true}'))]
-            )
+            return SimpleNamespace(choices=[_ok_choice()])
 
     client = SimpleNamespace(chat=SimpleNamespace(completions=_Create()))
 
@@ -83,6 +93,130 @@ def create(self, **kwargs):
     assert calls[0]['temperature'] == 0.4
 
 
+def test_call_openai_escalates_budget_on_length_truncation():
+    """Reasoning model hits length on first try, succeeds on retry with bigger budget."""
+    calls = []
+    responses = [
+        SimpleNamespace(choices=[_empty_choice(finish="length")]),
+        SimpleNamespace(choices=[_empty_choice(finish="length")]),
+        SimpleNamespace(choices=[_ok_choice(content='{"result": "success"}')]),
+    ]
+
+    class _Create:
+        def create(self, **kwargs):
+            calls.append(kwargs)
+            return responses[len(calls) - 1]
+
+    client = SimpleNamespace(chat=SimpleNamespace(completions=_Create()))
+
+    raw = _call_openai(client, "gpt-5.4", "complex prompt", 0.4, 16000)
+
+    assert raw == '{"result": "success"}'
+    assert len(calls) == 3
+    # Each retry doubles the budget
+    assert calls[0]["max_completion_tokens"] == 16000
+    assert calls[1]["max_completion_tokens"] == 32000
+    assert calls[2]["max_completion_tokens"] == 64000
+
+
+def test_call_openai_escalation_caps_at_128k():
+    """Auto-escalation must not grow unbounded."""
+    calls = []
+
+    class _Create:
+        def create(self, **kwargs):
+            calls.append(kwargs)
+            return SimpleNamespace(choices=[_empty_choice(finish="length")])
+
+    client = SimpleNamespace(chat=SimpleNamespace(completions=_Create()))
+
+    with pytest.raises(RuntimeError) as exc:
+        _call_openai(client, "gpt-5.4", "prompt", 0.4, 64000)
+
+    msg = str(exc.value)
+    assert "finish_reason=length" in msg
+    assert "attempts=" in msg
+    # Should escalate 64K -> 128K (cap), then give up
+    budgets = [c["max_completion_tokens"] for c in calls]
+    assert budgets[0] == 64000
+    assert budgets[-1] == 128000
+    # No budget should exceed the cap
+    assert all(b <= 128000 for b in budgets)
+
+
+def test_call_openai_does_not_escalate_legacy_models():
+    """Legacy chat models with finish=length should NOT auto-escalate.
+
+    They have a separate input/output budget so length-truncation
+    on the output side is not recoverable by giving more budget —
+    it just means the response was cut off mid-stream.
+    """
+    calls = []
+
+    class _Create:
+        def create(self, **kwargs):
+            calls.append(kwargs)
+            return SimpleNamespace(choices=[_empty_choice(finish="length")])
+
+    client = SimpleNamespace(chat=SimpleNamespace(completions=_Create()))
+
+    with pytest.raises(RuntimeError):
+        _call_openai(client, "gpt-4o-mini", "prompt", 0.4, 16000)
+
+    # No retry — single attempt
+    assert len(calls) == 1
+
+
+def test_call_openai_does_not_escalate_on_content_filter():
+    """Content filter / refusal must not trigger budget escalation."""
+    calls = []
+
+    class _Create:
+        def create(self, **kwargs):
+            calls.append(kwargs)
+            return SimpleNamespace(choices=[
+                _empty_choice(finish="content_filter", refusal="policy")
+            ])
+
+    client = SimpleNamespace(chat=SimpleNamespace(completions=_Create()))
+
+    with pytest.raises(RuntimeError) as exc:
+        _call_openai(client, "gpt-5.4", "prompt", 0.4, 16000)
+
+    assert len(calls) == 1
+    assert "finish_reason=content_filter" in str(exc.value)
+    assert "refusal=policy" in str(exc.value)
+
+
+def test_llm_call_bumps_starting_budget_for_reasoning_models():
+    """llm_call should give reasoning models at least 32K headroom."""
+    from llm_backend import llm_call
+    calls = []
+
+    class _Create:
+        def create(self, **kwargs):
+            calls.append(kwargs)
+            return SimpleNamespace(choices=[_ok_choice()])
+
+    client = SimpleNamespace(chat=SimpleNamespace(completions=_Create()))
+    backend = {"type": "openai", "client": client}
+
+    # Caller passes default 16000, but reasoning models should be
+    # bumped to 32000 starting budget.
+    llm_call(backend, "gpt-5.4", "prompt", max_tokens=16000)
+    assert calls[0]["max_completion_tokens"] == 32000
+
+    # Caller-provided value above 32000 is respected as-is.
+    calls.clear()
+    llm_call(backend, "gpt-5.4", "prompt", max_tokens=48000)
+    assert calls[0]["max_completion_tokens"] == 48000
+
+    # Legacy models are NOT bumped.
+    calls.clear()
+    llm_call(backend, "gpt-4o-mini", "prompt", max_tokens=16000)
+    assert calls[0]["max_tokens"] == 16000
+
+
 
 def test_call_codex_surfaces_useful_stderr_tail(monkeypatch):
     failure = SimpleNamespace(