Skip to content

Commit 061a608

Browse files
committed
llm_backend: auto-escalate tokens on length truncation
The recurring "OpenAI returned empty content" failures were all gpt-5.4 hitting finish_reason=length. Reasoning models (gpt-5, o1/o3/o4) budget BOTH internal reasoning tokens AND output tokens against max_completion_tokens. The default of 16000 was too small: complex prompts burned the entire budget thinking and produced empty output.

Fix it in two places.

First, _call_openai now auto-escalates the budget when a reasoning model returns empty content with finish_reason=length. The budget doubles on each retry up to a 128K hard cap, so a prompt that needs more headroom recovers without operator intervention. Non-reasoning models, refusals, content-filter hits, and other empty-content cases are NOT retried — those are not recoverable by giving more tokens.

Second, llm_call now bumps the starting budget for reasoning models to at least 32K. The auto-escalation handles further growth, but starting at 32K avoids the wasted first attempt that almost always hit length on the old 16K default.

Tests cover:
- Auto-escalation succeeds after one length-truncated retry
- Escalation caps at 128K and surfaces a clear error after
- Legacy chat models do NOT auto-escalate (different budget)
- Content filter / refusal does NOT trigger escalation
- llm_call bumps default 16K to 32K for reasoning models
- Caller-provided values above 32K are respected as-is

This fixes the four submissions stuck in generation_failed with the same RuntimeError, and prevents new ones from accumulating in the same way.

Generated-by: Claude AI
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
1 parent f2be60a commit 061a608

2 files changed

Lines changed: 219 additions & 26 deletions

File tree

llm_backend.py

Lines changed: 79 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -76,15 +76,24 @@ def llm_call(backend, model, prompt, temperature=0.4,
7676
model: Model name (backend-specific).
7777
prompt: User prompt string.
7878
temperature: Sampling temperature (ignored by claude-cli).
79-
max_tokens: Maximum output tokens.
79+
max_tokens: Maximum output tokens. For OpenAI reasoning
80+
models (gpt-5, o1/o3/o4) this is also the budget for
81+
internal reasoning tokens, so the effective starting
82+
budget is bumped to give the model headroom.
8083
json_mode: If True, parse response as JSON with repair logic.
8184
If False, return raw text string.
8285
"""
8386
btype = backend["type"]
8487

8588
if btype == "openai":
89+
# Reasoning models budget reasoning + output against the same
90+
# ceiling. Start with at least 32K headroom; auto-escalation
91+
# in _call_openai handles further growth on length-truncation.
92+
effective = max_tokens
93+
if _is_reasoning_model(model) and effective < 32000:
94+
effective = 32000
8695
raw = _call_openai(backend["client"], model, prompt,
87-
temperature, max_tokens)
96+
temperature, effective)
8897
elif btype == "anthropic":
8998
raw = _call_anthropic(backend["client"], model, prompt,
9099
temperature, max_tokens)
@@ -105,39 +114,89 @@ def llm_call(backend, model, prompt, temperature=0.4,
105114
# Backend implementations
106115
# ---------------------------------------------------------------------------
107116

108-
def _call_openai(client, model, prompt, temperature, max_tokens):
117+
# Hard cap on max_completion_tokens auto-escalation. Reasoning models
118+
# can burn enormous budgets thinking; this prevents an unbounded retry
119+
# loop. 128K is the practical ceiling for current GPT-5 / o-family
120+
# models.
121+
_OPENAI_MAX_TOKENS_CAP = 128000
122+
123+
# Reasoning model prefixes. These models budget BOTH internal reasoning
124+
# tokens AND output tokens against max_completion_tokens, so they need
125+
# substantially more headroom than legacy chat models.
126+
_REASONING_MODEL_PREFIXES = ("gpt-5", "o1", "o3", "o4")
127+
128+
129+
def _is_reasoning_model(model):
130+
return (model or "").startswith(_REASONING_MODEL_PREFIXES)
131+
132+
133+
def _openai_create_once(client, model, prompt, temperature, max_tokens):
134+
"""Single OpenAI chat completion call. Returns the choice object."""
109135
kwargs = {
110136
"model": model,
111137
"messages": [{"role": "user", "content": prompt}],
112138
}
113-
114-
# Newer reasoning models are stricter about accepted request fields.
115-
# Avoid sending temperature unless we have to, and prefer the newer
116-
# completion-token field for GPT-5 / o* families.
117-
if not (model or "").startswith(("gpt-5", "o1", "o3", "o4")):
139+
if not _is_reasoning_model(model):
118140
kwargs["temperature"] = temperature
119-
120-
if (model or "").startswith(("gpt-5", "o1", "o3", "o4")):
121-
kwargs["max_completion_tokens"] = max_tokens
122-
else:
123141
kwargs["max_tokens"] = max_tokens
124-
142+
else:
143+
kwargs["max_completion_tokens"] = max_tokens
125144
resp = client.chat.completions.create(**kwargs)
126-
choice = resp.choices[0]
127-
content = choice.message.content
128-
finish = getattr(choice, "finish_reason", None)
129-
refusal = getattr(choice.message, "refusal", None)
145+
return resp.choices[0]
146+
147+
148+
def _call_openai(client, model, prompt, temperature, max_tokens):
149+
"""Call OpenAI with auto-escalation on length-truncation.
150+
151+
Reasoning models (gpt-5, o1/o3/o4) budget BOTH internal reasoning
152+
tokens AND output tokens against max_completion_tokens. A complex
153+
prompt can burn the entire budget thinking, leaving zero output
154+
and finish_reason=length. When that happens we retry with double
155+
the budget up to _OPENAI_MAX_TOKENS_CAP. This is the difference
156+
between "the API failed" and "the model needs more headroom".
157+
"""
158+
attempts = []
159+
budget = max_tokens
160+
is_reasoning = _is_reasoning_model(model)
161+
162+
while True:
163+
choice = _openai_create_once(
164+
client, model, prompt, temperature, budget)
165+
content = choice.message.content
166+
finish = getattr(choice, "finish_reason", None)
167+
refusal = getattr(choice.message, "refusal", None)
168+
attempts.append({
169+
"budget": budget, "finish": finish,
170+
"had_content": bool(content and content.strip()),
171+
})
172+
173+
if content is not None and content.strip():
174+
return content.strip()
175+
176+
# Empty content. Decide whether to escalate or give up.
177+
# Only auto-escalate when the cause is length truncation on a
178+
# reasoning model — that's the recoverable case. Refusals,
179+
# content-filter hits, and stop-with-empty are not recoverable
180+
# by retrying with more tokens.
181+
if (is_reasoning and finish == "length"
182+
and budget < _OPENAI_MAX_TOKENS_CAP):
183+
new_budget = min(budget * 2, _OPENAI_MAX_TOKENS_CAP)
184+
print(f"[LLM] OpenAI hit length on {model} with "
185+
f"max_completion_tokens={budget}; "
186+
f"retrying with {new_budget}", file=sys.stderr)
187+
budget = new_budget
188+
continue
130189

131-
if content is None or not content.strip():
132190
parts = [f"OpenAI returned empty content (model={model}"]
133191
if finish:
134192
parts.append(f"finish_reason={finish}")
135193
if refusal:
136194
parts.append(f"refusal={refusal}")
195+
if len(attempts) > 1:
196+
parts.append(f"attempts={len(attempts)}")
197+
parts.append(f"final_budget={budget}")
137198
raise RuntimeError(", ".join(parts) + ")")
138199

139-
return content.strip()
140-
141200

142201
def _call_anthropic(client, model, prompt, temperature, max_tokens):
143202
resp = client.messages.create(

tests/test_llm_backend.py

Lines changed: 140 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -42,15 +42,27 @@ def fake_import(name, *args, **kwargs):
4242
assert "requires the 'openai' Python package" in str(exc.value)
4343

4444

45+
def _ok_choice(content='{"ok": true}', finish="stop"):
46+
return SimpleNamespace(
47+
message=SimpleNamespace(content=content, refusal=None),
48+
finish_reason=finish,
49+
)
50+
51+
52+
def _empty_choice(finish="length", refusal=None):
53+
return SimpleNamespace(
54+
message=SimpleNamespace(content=None, refusal=refusal),
55+
finish_reason=finish,
56+
)
57+
58+
4559
def test_call_openai_uses_max_completion_tokens_for_gpt5_models():
4660
calls = []
4761

4862
class _Create:
4963
def create(self, **kwargs):
5064
calls.append(kwargs)
51-
return SimpleNamespace(
52-
choices=[SimpleNamespace(message=SimpleNamespace(content='{"ok": true}'))]
53-
)
65+
return SimpleNamespace(choices=[_ok_choice()])
5466

5567
client = SimpleNamespace(chat=SimpleNamespace(completions=_Create()))
5668

@@ -69,9 +81,7 @@ def test_call_openai_uses_max_tokens_for_legacy_models():
6981
class _Create:
7082
def create(self, **kwargs):
7183
calls.append(kwargs)
72-
return SimpleNamespace(
73-
choices=[SimpleNamespace(message=SimpleNamespace(content='{"ok": true}'))]
74-
)
84+
return SimpleNamespace(choices=[_ok_choice()])
7585

7686
client = SimpleNamespace(chat=SimpleNamespace(completions=_Create()))
7787

@@ -83,6 +93,130 @@ def create(self, **kwargs):
8393
assert calls[0]['temperature'] == 0.4
8494

8595

96+
def test_call_openai_escalates_budget_on_length_truncation():
97+
"""Reasoning model hits length on first try, succeeds on retry with bigger budget."""
98+
calls = []
99+
responses = [
100+
SimpleNamespace(choices=[_empty_choice(finish="length")]),
101+
SimpleNamespace(choices=[_empty_choice(finish="length")]),
102+
SimpleNamespace(choices=[_ok_choice(content='{"result": "success"}')]),
103+
]
104+
105+
class _Create:
106+
def create(self, **kwargs):
107+
calls.append(kwargs)
108+
return responses[len(calls) - 1]
109+
110+
client = SimpleNamespace(chat=SimpleNamespace(completions=_Create()))
111+
112+
raw = _call_openai(client, "gpt-5.4", "complex prompt", 0.4, 16000)
113+
114+
assert raw == '{"result": "success"}'
115+
assert len(calls) == 3
116+
# Each retry doubles the budget
117+
assert calls[0]["max_completion_tokens"] == 16000
118+
assert calls[1]["max_completion_tokens"] == 32000
119+
assert calls[2]["max_completion_tokens"] == 64000
120+
121+
122+
def test_call_openai_escalation_caps_at_128k():
123+
"""Auto-escalation must not grow unbounded."""
124+
calls = []
125+
126+
class _Create:
127+
def create(self, **kwargs):
128+
calls.append(kwargs)
129+
return SimpleNamespace(choices=[_empty_choice(finish="length")])
130+
131+
client = SimpleNamespace(chat=SimpleNamespace(completions=_Create()))
132+
133+
with pytest.raises(RuntimeError) as exc:
134+
_call_openai(client, "gpt-5.4", "prompt", 0.4, 64000)
135+
136+
msg = str(exc.value)
137+
assert "finish_reason=length" in msg
138+
assert "attempts=" in msg
139+
# Should escalate 64K -> 128K (cap), then give up
140+
budgets = [c["max_completion_tokens"] for c in calls]
141+
assert budgets[0] == 64000
142+
assert budgets[-1] == 128000
143+
# No budget should exceed the cap
144+
assert all(b <= 128000 for b in budgets)
145+
146+
147+
def test_call_openai_does_not_escalate_legacy_models():
148+
"""Legacy chat models with finish=length should NOT auto-escalate.
149+
150+
They have a separate input/output budget so length-truncation
151+
on the output side is not recoverable by giving more budget —
152+
it just means the response was cut off mid-stream.
153+
"""
154+
calls = []
155+
156+
class _Create:
157+
def create(self, **kwargs):
158+
calls.append(kwargs)
159+
return SimpleNamespace(choices=[_empty_choice(finish="length")])
160+
161+
client = SimpleNamespace(chat=SimpleNamespace(completions=_Create()))
162+
163+
with pytest.raises(RuntimeError):
164+
_call_openai(client, "gpt-4o-mini", "prompt", 0.4, 16000)
165+
166+
# No retry — single attempt
167+
assert len(calls) == 1
168+
169+
170+
def test_call_openai_does_not_escalate_on_content_filter():
171+
"""Content filter / refusal must not trigger budget escalation."""
172+
calls = []
173+
174+
class _Create:
175+
def create(self, **kwargs):
176+
calls.append(kwargs)
177+
return SimpleNamespace(choices=[
178+
_empty_choice(finish="content_filter", refusal="policy")
179+
])
180+
181+
client = SimpleNamespace(chat=SimpleNamespace(completions=_Create()))
182+
183+
with pytest.raises(RuntimeError) as exc:
184+
_call_openai(client, "gpt-5.4", "prompt", 0.4, 16000)
185+
186+
assert len(calls) == 1
187+
assert "finish_reason=content_filter" in str(exc.value)
188+
assert "refusal=policy" in str(exc.value)
189+
190+
191+
def test_llm_call_bumps_starting_budget_for_reasoning_models():
192+
"""llm_call should give reasoning models at least 32K headroom."""
193+
from llm_backend import llm_call
194+
calls = []
195+
196+
class _Create:
197+
def create(self, **kwargs):
198+
calls.append(kwargs)
199+
return SimpleNamespace(choices=[_ok_choice()])
200+
201+
client = SimpleNamespace(chat=SimpleNamespace(completions=_Create()))
202+
backend = {"type": "openai", "client": client}
203+
204+
# Caller passes default 16000, but reasoning models should be
205+
# bumped to 32000 starting budget.
206+
llm_call(backend, "gpt-5.4", "prompt", max_tokens=16000)
207+
assert calls[0]["max_completion_tokens"] == 32000
208+
209+
# Caller-provided value above 32000 is respected as-is.
210+
calls.clear()
211+
llm_call(backend, "gpt-5.4", "prompt", max_tokens=48000)
212+
assert calls[0]["max_completion_tokens"] == 48000
213+
214+
# Legacy models are NOT bumped.
215+
calls.clear()
216+
llm_call(backend, "gpt-4o-mini", "prompt", max_tokens=16000)
217+
assert calls[0]["max_tokens"] == 16000
218+
219+
86220

87221
def test_call_codex_surfaces_useful_stderr_tail(monkeypatch):
88222
failure = SimpleNamespace(

0 commit comments

Comments
 (0)