feat: automatic back-off + prompt/token optimizer (#691) #703

Status: Closed · wants to merge 3 commits
7 changes: 7 additions & 0 deletions .github/workflows/ci.yml
@@ -42,12 +42,19 @@ jobs:
- name: Install dependencies
run: pnpm install

- name: Fix code formatting
working-directory: codex-cli
run: pnpm run format:fix

# Run all tasks using workspace filters

- name: Check TypeScript code formatting
working-directory: codex-cli
run: pnpm run format

- name: Fix Markdown and config file formatting
run: pnpm run format:fix

- name: Check Markdown and config file formatting
run: pnpm run format

10 changes: 10 additions & 0 deletions README.md
@@ -222,6 +222,16 @@ The hardening mechanism Codex uses depends on your OS:

Key flags: `--model/-m`, `--approval-mode/-a`, `--quiet/-q`, and `--notify`.

**Retry & Rate-Limit Options**

- `--max-retries <number>` Maximum number of retry attempts for rate-limit and transient errors (default: 5)
- `--base-delay-ms <ms>` Base back-off delay in milliseconds (default: 2500)
- `--max-delay-ms <ms>` Maximum back-off delay in milliseconds (default: 60000)

**Token Usage Reporting**

- `--token-report` Print a token usage summary after each response

---

## Memory & Project Docs
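As a reading aid, here is the delay schedule the documented defaults imply, assuming plain exponential doubling capped at `--max-delay-ms` (a hand-written sketch, not code from this PR):

```ts
// Delays implied by the defaults: --base-delay-ms 2500, --max-delay-ms 60000,
// --max-retries 5 (so at most four waits between the five attempts).
const baseDelayMs = 2500;
const maxDelayMs = 60_000;
const maxRetries = 5;

for (let attempt = 1; attempt < maxRetries; attempt++) {
  const delayMs = Math.min(baseDelayMs * 2 ** (attempt - 1), maxDelayMs);
  console.log(`wait after attempt ${attempt}: ${delayMs} ms`);
}
// wait after attempt 1: 2500 ms
// wait after attempt 2: 5000 ms
// wait after attempt 3: 10000 ms
// wait after attempt 4: 20000 ms
```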
27 changes: 27 additions & 0 deletions codex-cli/src/cli.tsx
@@ -165,6 +165,28 @@ const cli = meow(
type: "boolean",
description: "Enable desktop notifications for responses",
},
// Retry/back-off options
maxRetries: {
type: "number",
default: 5,
description:
"Max number of retry attempts for rate-limit and transient errors",
},
baseDelayMs: {
type: "number",
default: 2500,
description: "Base delay in milliseconds for exponential back-off",
},
maxDelayMs: {
type: "number",
default: 60000,
description: "Maximum back-off delay in milliseconds",
},
tokenReport: {
type: "boolean",
default: false,
description: "Print token usage report after each request",
},

disableResponseStorage: {
type: "boolean",
@@ -294,6 +316,11 @@ config = {
cli.flags.disableResponseStorage !== undefined
? Boolean(cli.flags.disableResponseStorage)
: config.disableResponseStorage,
// Retry/back-off and token-report settings
maxRetries: cli.flags.maxRetries,
baseDelayMs: cli.flags.baseDelayMs,
maxDelayMs: cli.flags.maxDelayMs,
tokenReport: Boolean(cli.flags.tokenReport),
};

// Check for updates after loading config. This is important because we write state file in
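To make the flag wiring above concrete: meow exposes kebab-case CLI flags as camelCase keys, so the new definitions surface on `cli.flags` as in this standalone sketch (assumes meow v9+ under ESM; not code from the PR):

```ts
// Sketch: how `codex --max-retries 3 --token-report` surfaces in cli.flags.
// Flag names and defaults mirror the PR; everything else is illustrative.
import meow from "meow";

const cli = meow("Usage: codex [options]", {
  importMeta: import.meta,
  flags: {
    maxRetries: { type: "number", default: 5 },
    baseDelayMs: { type: "number", default: 2500 },
    maxDelayMs: { type: "number", default: 60000 },
    tokenReport: { type: "boolean", default: false },
  },
});

// With `--max-retries 3 --token-report` on argv:
// cli.flags.maxRetries === 3, cli.flags.tokenReport === true,
// and the remaining flags fall back to their defaults.
console.log(cli.flags);
```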
42 changes: 42 additions & 0 deletions codex-cli/src/components/chat/terminal-chat.tsx
@@ -12,6 +12,7 @@ import { useConfirmation } from "../../hooks/use-confirmation.js";
import { useTerminalSize } from "../../hooks/use-terminal-size.js";
import { AgentLoop } from "../../utils/agent/agent-loop.js";
import { ReviewDecision } from "../../utils/agent/review.js";
import { approximateTokensUsed } from "../../utils/approximate-tokens-used.js";
import { generateCompactSummary } from "../../utils/compact-summary.js";
import { getBaseUrl, getApiKey, saveConfig } from "../../utils/config.js";
import { extractAppliedPatches as _extractAppliedPatches } from "../../utils/extract-applied-patches.js";
@@ -148,10 +149,51 @@ export default function TerminalChat({
const [lastResponseId, setLastResponseId] = useState<string | null>(null);
const [items, setItems] = useState<Array<ResponseItem>>([]);
const [loading, setLoading] = useState<boolean>(false);
// Token-report refs to track per-turn usage
const prevPromptTokRef = useRef<number>(0);
const prevCompletionTokRef = useRef<number>(0);
const [approvalPolicy, setApprovalPolicy] = useState<ApprovalPolicy>(
initialApprovalPolicy,
);
const [thinkingSeconds, setThinkingSeconds] = useState(0);
// Inject a token usage report after each completed response
useEffect(() => {
if (config.tokenReport && lastResponseId) {
// Filter chat message items with role 'user'
const userItems = items.filter(
(it): it is ResponseItem & { type: "message"; role: "user" } => {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
return it.type === "message" && (it as any).role === "user";
},
);
// Filter messages with role 'assistant'
const aiItems = items.filter(
(it): it is ResponseItem & { type: "message"; role: "assistant" } => {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
return it.type === "message" && (it as any).role === "assistant";
},
);
const promptTok = approximateTokensUsed(userItems);
const completionTok = approximateTokensUsed(aiItems);
const deltaPrompt = promptTok - prevPromptTokRef.current;
const deltaCompletion = completionTok - prevCompletionTokRef.current;
const deltaTotal = deltaPrompt + deltaCompletion;
prevPromptTokRef.current = promptTok;
prevCompletionTokRef.current = completionTok;
const text = `Prompt: ${deltaPrompt} tok Completion: ${deltaCompletion} tok Total: ${deltaTotal} tok`;
setItems((prev) => [
...prev,
{
id: `token-report-${Date.now()}`,
type: "message",
role: "system",
content: [{ type: "input_text", text }],
} as ResponseItem,
]);
}
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [lastResponseId]);

const handleCompact = async () => {
setLoading(true);
Expand Down
31 changes: 22 additions & 9 deletions codex-cli/src/utils/agent/agent-loop.ts
@@ -670,8 +670,8 @@
// Send request to OpenAI with retry on timeout.
let stream;

- // Retry loop for transient errors. Up to MAX_RETRIES attempts.
- const MAX_RETRIES = 5;
+ // Retry loop for transient errors. Up to maxRetries attempts.
+ const MAX_RETRIES = this.config.maxRetries ?? 5;
for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
try {
let reasoning: Reasoning | undefined;
@@ -779,19 +779,25 @@
/rate limit/i.test(errCtx.message ?? "");
if (isRateLimit) {
if (attempt < MAX_RETRIES) {
- // Exponential backoff: base wait * 2^(attempt-1), or use suggested retry time
- // if provided.
- let delayMs = RATE_LIMIT_RETRY_WAIT_MS * 2 ** (attempt - 1);
+ // Exponential backoff: baseDelayMs * 2^(attempt-1), or use suggested retry time if provided.
+ const base = this.config.baseDelayMs ?? RATE_LIMIT_RETRY_WAIT_MS;
+ let delayMs = base * 2 ** (attempt - 1);
+ const maxDelay = this.config.maxDelayMs ?? delayMs;

// Parse suggested retry time from error message, e.g., "Please try again in 1.3s"
const msg = errCtx?.message ?? "";
const m = /(?:retry|try) again in ([\d.]+)s/i.exec(msg);
if (m && m[1]) {
const suggested = parseFloat(m[1]) * 1000;
if (!Number.isNaN(suggested)) {
- delayMs = suggested;
+ delayMs = Math.min(suggested, maxDelay);
}
}
+ // Cap delay to maxDelay
+ if (delayMs > maxDelay) {
+   delayMs = maxDelay;
+ }
log(
`OpenAI rate limit exceeded (attempt ${attempt}/${MAX_RETRIES}), retrying in ${Math.round(
delayMs,
@@ -907,7 +913,8 @@
return;
}

- const MAX_STREAM_RETRIES = 5;
+ // Max number of retries for streaming on rate-limit or transient errors
+ const MAX_STREAM_RETRIES = this.config.maxRetries ?? 5;
let streamRetryAttempt = 0;

// eslint-disable-next-line no-constant-condition
@@ -1037,8 +1044,14 @@
) {
streamRetryAttempt += 1;

- const waitMs =
-   RATE_LIMIT_RETRY_WAIT_MS * 2 ** (streamRetryAttempt - 1);
+ // Exponential backoff for stream retry
+ const baseStream = this.config.baseDelayMs ?? RATE_LIMIT_RETRY_WAIT_MS;
+ let waitMs = baseStream * 2 ** (streamRetryAttempt - 1);
+ const maxDelay = this.config.maxDelayMs ?? waitMs;
+ if (waitMs > maxDelay) {
+   waitMs = maxDelay;
+ }
log(
`OpenAI stream rate‑limited – retry ${streamRetryAttempt}/${MAX_STREAM_RETRIES} in ${waitMs} ms`,
);
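Both retry loops apply the same delay rule; factored out as a pure function it reads as below (a sketch for clarity only; the PR keeps this logic inline and defines no such helper):

```ts
// Exponential back-off from baseDelayMs, optionally overridden by a
// server-suggested retry time parsed from the error message, with the
// final wait always capped at maxDelayMs. Hypothetical helper, not in the PR.
function computeBackoffDelay(
  attempt: number, // 1-based attempt counter
  baseDelayMs: number,
  maxDelayMs: number,
  errorMessage?: string,
): number {
  let delayMs = baseDelayMs * 2 ** (attempt - 1);
  // Honor a suggestion such as "Please try again in 1.3s" when present.
  const m = /(?:retry|try) again in ([\d.]+)s/i.exec(errorMessage ?? "");
  if (m && m[1]) {
    const suggested = parseFloat(m[1]) * 1000;
    if (!Number.isNaN(suggested)) {
      delayMs = suggested;
    }
  }
  return Math.min(delayMs, maxDelayMs);
}

// computeBackoffDelay(3, 2500, 60000) === 10000
// computeBackoffDelay(1, 2500, 60000, "Please try again in 1.3s") === 1300
```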
8 changes: 8 additions & 0 deletions codex-cli/src/utils/config.ts
@@ -171,6 +171,14 @@ export type AppConfig = {

/** Enable the "flex-mode" processing mode for supported models (o3, o4-mini) */
flexMode?: boolean;
/** Max number of retry attempts for rate-limit and transient errors */
maxRetries?: number;
/** Base delay in milliseconds for exponential back-off */
baseDelayMs?: number;
/** Maximum back-off delay in milliseconds */
maxDelayMs?: number;
/** Print token usage report after each request */
tokenReport?: boolean;
providers?: Record<string, { name: string; baseURL: string; envKey: string }>;
history?: {
maxSize: number;
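For reference, the new optional fields slot into a config value like this (hand-written illustration; field names and defaults come from the PR, everything else is assumed):

```ts
import type { AppConfig } from "./utils/config";

// Illustrative only: the other AppConfig fields are elided via Partial.
const config: Partial<AppConfig> = {
  maxRetries: 5, // give up after five attempts
  baseDelayMs: 2500, // first wait; doubles on each retry
  maxDelayMs: 60_000, // ceiling for any single wait
  tokenReport: true, // print a per-turn token usage summary
};

export default config;
```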