Commit 4f7d0c4
fix(inference): prevent silent truncation of large streaming responses
The L7 inference proxy silently dropped tool_calls from large streaming
responses due to an aggressive 30s per-chunk idle timeout and a reqwest
total-request timeout that capped the entire body stream. Reasoning models
that pause during "thinking" phases triggered these timeouts, producing
valid-looking but truncated HTTP responses with no client-visible error.

- Extract prepare_backend_request() helper and create a streaming variant
  that omits the total request timeout; body stream liveness is now enforced
  solely by the per-chunk idle timeout
- Add 30s connect_timeout to the reqwest Client builder
- Increase CHUNK_IDLE_TIMEOUT from 30s to 120s for reasoning models
- Inject SSE error events (proxy_stream_error) before the HTTP chunked
  terminator on all truncation paths so clients can detect data loss
- Wrap the streaming relay in BufWriter to reduce per-chunk TLS flush overhead
- Bump OCSF severity for streaming truncation from Low to Medium

Closes #829
1 parent fdca543 commit 4f7d0c4

File tree

6 files changed (+316 −35 lines)

architecture/inference-routing.md

Lines changed: 38 additions & 8 deletions
@@ -28,10 +28,13 @@ sequenceDiagram
     Backend->>Router: Response headers + body stream
     Router->>Proxy: StreamingProxyResponse (headers first)
     Proxy->>Agent: HTTP/1.1 headers (chunked TE)
-    loop Each body chunk
+    loop Each body chunk (120s idle timeout per chunk)
     Router->>Proxy: chunk via next_chunk()
     Proxy->>Agent: Chunked-encoded frame
     end
+    alt Stream truncated (idle timeout, byte limit, upstream error)
+        Proxy->>Agent: SSE error event (proxy_stream_error)
+    end
     Proxy->>Agent: Chunk terminator (0\r\n\r\n)
 ```

@@ -102,7 +105,7 @@ Key messages:
 Files:

 - `crates/openshell-sandbox/src/proxy.rs` -- proxy interception, inference context, request routing
-- `crates/openshell-sandbox/src/l7/inference.rs` -- pattern detection, HTTP parsing, response formatting
+- `crates/openshell-sandbox/src/l7/inference.rs` -- pattern detection, HTTP parsing, response formatting, SSE error generation (`format_sse_error()`)
 - `crates/openshell-sandbox/src/lib.rs` -- inference context initialization, route refresh
 - `crates/openshell-sandbox/src/grpc_client.rs` -- `fetch_inference_bundle()`

@@ -156,7 +159,7 @@ If no pattern matches, the proxy returns `403 Forbidden` with `{"error": "connec
 Files:

 - `crates/openshell-router/src/lib.rs` -- `Router`, `proxy_with_candidates()`, `proxy_with_candidates_streaming()`
-- `crates/openshell-router/src/backend.rs` -- `proxy_to_backend()`, `proxy_to_backend_streaming()`, URL construction
+- `crates/openshell-router/src/backend.rs` -- `prepare_backend_request()`, `send_backend_request()`, `send_backend_request_streaming()`, `proxy_to_backend()`, `proxy_to_backend_streaming()`, URL construction
 - `crates/openshell-router/src/config.rs` -- `RouteConfig`, `ResolvedRoute`, YAML loading

 ### Route selection
@@ -165,7 +168,7 @@ Files:

 ### Request rewriting

-`proxy_to_backend()` rewrites outgoing requests:
+`prepare_backend_request()` (shared by both buffered and streaming paths) rewrites outgoing requests:

 1. **Auth injection**: Uses the route's `AuthHeader` -- either `Authorization: Bearer <key>` or a custom header (e.g. `x-api-key: <key>` for Anthropic).
 2. **Header stripping**: Removes `authorization`, `x-api-key`, `host`, and any header names that will be set from route defaults.
@@ -198,20 +201,47 @@ The sandbox proxy (`route_inference_request()` in `proxy.rs`) uses the streaming

 1. Calls `proxy_with_candidates_streaming()` to get headers immediately.
 2. Formats and sends the HTTP/1.1 response header with `Transfer-Encoding: chunked` via `format_http_response_header()`.
-3. Loops on `body.next_chunk()`, wrapping each fragment in HTTP chunked encoding via `format_chunk()`.
-4. Sends the chunk terminator (`0\r\n\r\n`) via `format_chunk_terminator()`.
+3. Wraps the TLS client stream in a `BufWriter` (16 KiB capacity) to coalesce small SSE chunks into fewer TLS records, reducing per-chunk flush overhead.
+4. Loops on `body.next_chunk()` with a per-chunk idle timeout (`CHUNK_IDLE_TIMEOUT`, 120 seconds), wrapping each fragment in HTTP chunked encoding via `format_chunk()`. The 120-second timeout accommodates reasoning models (e.g. nemotron-3-super, o1, o3) that pause 60+ seconds between thinking and output phases.
+5. Enforces a total streaming body cap (`MAX_STREAMING_BODY`, 32 MiB).
+6. On truncation (idle timeout, byte limit, or upstream read error), injects an SSE error event before the chunk terminator so clients can detect the truncation rather than silently losing data.
+7. Sends the chunk terminator (`0\r\n\r\n`) via `format_chunk_terminator()` and flushes the `BufWriter`.

 This eliminates full-body buffering for streaming responses (SSE). Time-to-first-byte is determined by the backend's first chunk latency rather than the full generation time.

+#### Truncation signaling
+
+When the proxy truncates a streaming response, it injects an SSE error event via `format_sse_error()` (in `crates/openshell-sandbox/src/l7/inference.rs`) before sending the HTTP chunked terminator:
+
+```
+data: {"error":{"message":"<reason>","type":"proxy_stream_error"}}
+```
+
+Three truncation paths exist:
+
+| Cause | SSE error message | OCSF severity |
+|-------|-------------------|---------------|
+| Per-chunk idle timeout (120s) | `response truncated: chunk idle timeout exceeded` | Medium |
+| Upstream read error | `response truncated: upstream read error` | Medium |
+| Streaming body exceeds 32 MiB | `response truncated: exceeded maximum streaming body size` | *(warn log only)* |
+
+The `reason` field in the SSE event is sanitized — it never contains internal URLs, hostnames, or credentials. Full details are captured server-side in the OCSF log.
+
 ### Mock routes

 File: `crates/openshell-router/src/mock.rs`

 Routes with `mock://` scheme endpoints return canned responses without making HTTP requests. Mock responses are protocol-aware (OpenAI chat completion, OpenAI completion, Anthropic messages, or generic JSON). Mock routes include an `x-openshell-mock: true` response header.

-### Per-request timeout
+### Timeout model
+
+The router uses a layered timeout strategy with separate handling for buffered and streaming responses.
+
+**Client connect timeout**: The `reqwest::Client` is built with a 30-second `connect_timeout` (in `crates/openshell-router/src/lib.rs`, `Router::new()`). This bounds TCP connection establishment and applies to all outgoing requests regardless of response mode.
+
+**Buffered responses** (`proxy_to_backend()` via `send_backend_request()`): Apply the route's `timeout` as a total request timeout covering the entire lifecycle (connect + headers + body). When `timeout_secs` is `0` in the proto message, the default of 60 seconds is used (defined as `DEFAULT_ROUTE_TIMEOUT` in `config.rs`). Timeouts and connection failures map to `RouterError::UpstreamUnavailable`.

-Each `ResolvedRoute` carries a `timeout` field (`Duration`). The `reqwest::Client` has no global timeout; instead, each outgoing request applies `.timeout(route.timeout)` on the request builder. When `timeout_secs` is `0` in the proto message, the default of 60 seconds is used (defined as `DEFAULT_ROUTE_TIMEOUT` in `config.rs`). Timeouts and connection failures map to `RouterError::UpstreamUnavailable`.
+**Streaming responses** (`proxy_to_backend_streaming()` via `send_backend_request_streaming()`): Do **not** apply a total request timeout. The total duration of a streaming response is unbounded — liveness is enforced by the sandbox proxy's per-chunk idle timeout (`CHUNK_IDLE_TIMEOUT`, 120 seconds in `proxy.rs`) instead. This separation exists because streaming inference responses (especially from reasoning models) can legitimately take minutes to complete while still sending data. The `prepare_backend_request()` helper in `backend.rs` builds the request identically for both paths; the caller decides whether to chain `.timeout()` before sending.

 Timeout changes propagate dynamically to running sandboxes. The bundle revision hash includes `timeout_secs`, so when the timeout is updated via `openshell inference update --timeout`, the refresh loop detects the revision change and updates the route cache within one polling interval (5 seconds by default).
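For reference, the chunked framing and truncation event described above can be sketched as follows. This is an illustrative sketch only: the helper names mirror `format_chunk()`, `format_chunk_terminator()`, and `format_sse_error()` from the doc, but these bodies are assumptions, not the repo's actual implementations.

```rust
// Illustrative sketch (not the repo's code): how each body fragment is framed
// for HTTP chunked transfer encoding, and what the injected SSE error event
// looks like on the wire.

fn format_chunk(data: &[u8]) -> Vec<u8> {
    // Chunked-encoding frame: hex length, CRLF, payload, CRLF.
    let mut out = format!("{:x}\r\n", data.len()).into_bytes();
    out.extend_from_slice(data);
    out.extend_from_slice(b"\r\n");
    out
}

fn format_chunk_terminator() -> &'static [u8] {
    // Zero-length chunk marks the end of the body.
    b"0\r\n\r\n"
}

fn format_sse_error(reason: &str) -> Vec<u8> {
    // `reason` is assumed pre-sanitized (no URLs, hostnames, or credentials).
    let event = format!(
        "data: {{\"error\":{{\"message\":\"{reason}\",\"type\":\"proxy_stream_error\"}}}}\n\n"
    );
    format_chunk(event.as_bytes())
}
```

Because the error event is itself a well-formed chunk sent before the `0\r\n\r\n` terminator, clients see a syntactically valid chunked response and can distinguish truncation from normal completion by the `proxy_stream_error` event type.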

crates/openshell-router/src/backend.rs

Lines changed: 61 additions & 19 deletions
@@ -83,18 +83,19 @@ impl StreamingProxyResponse {
     }
 }

-/// Build and send an HTTP request to the backend configured in `route`.
+/// Build an HTTP request to the backend configured in `route`.
 ///
-/// Returns the [`reqwest::Response`] with status, headers, and an un-consumed
-/// body stream. Shared by both the buffered and streaming public APIs.
-async fn send_backend_request(
+/// Returns the prepared [`reqwest::RequestBuilder`] with auth, headers, model
+/// rewrite, and body applied. The caller decides whether to apply a total
+/// request timeout before sending.
+fn prepare_backend_request(
     client: &reqwest::Client,
     route: &ResolvedRoute,
     method: &str,
     path: &str,
-    headers: Vec<(String, String)>,
+    headers: &[(String, String)],
     body: bytes::Bytes,
-) -> Result<reqwest::Response, RouterError> {
+) -> Result<(reqwest::RequestBuilder, String), RouterError> {
     let url = build_backend_url(&route.endpoint, path);

     let reqwest_method: reqwest::Method = method
@@ -118,7 +119,7 @@ async fn send_backend_request(
     let strip_headers: [&str; 3] = ["authorization", "x-api-key", "host"];

     // Forward non-sensitive headers.
-    for (name, value) in &headers {
+    for (name, value) in headers {
         let name_lc = name.to_ascii_lowercase();
         if strip_headers.contains(&name_lc.as_str()) {
             continue;
@@ -149,17 +150,57 @@
         }
         Err(_) => body,
     };
-    builder = builder.body(body).timeout(route.timeout);
-
-    builder.send().await.map_err(|e| {
-        if e.is_timeout() {
-            RouterError::UpstreamUnavailable(format!("request to {url} timed out"))
-        } else if e.is_connect() {
-            RouterError::UpstreamUnavailable(format!("failed to connect to {url}: {e}"))
-        } else {
-            RouterError::Internal(format!("HTTP request failed: {e}"))
-        }
-    })
+    builder = builder.body(body);
+
+    Ok((builder, url))
+}
+
+/// Map a `reqwest` send error into a `RouterError`; shared by both the
+/// buffered and streaming paths.
+fn map_send_error(e: reqwest::Error, url: &str) -> RouterError {
+    if e.is_timeout() {
+        RouterError::UpstreamUnavailable(format!("request to {url} timed out"))
+    } else if e.is_connect() {
+        RouterError::UpstreamUnavailable(format!("failed to connect to {url}: {e}"))
+    } else {
+        RouterError::Internal(format!("HTTP request failed: {e}"))
+    }
+}
+
+/// Build and send an HTTP request to the backend with a total request timeout.
+///
+/// The timeout covers the entire request lifecycle (connect + headers + body).
+/// Suitable for non-streaming responses where the body is buffered completely.
+async fn send_backend_request(
+    client: &reqwest::Client,
+    route: &ResolvedRoute,
+    method: &str,
+    path: &str,
+    headers: Vec<(String, String)>,
+    body: bytes::Bytes,
+) -> Result<reqwest::Response, RouterError> {
+    let (builder, url) = prepare_backend_request(client, route, method, path, &headers, body)?;
+    builder
+        .timeout(route.timeout)
+        .send()
+        .await
+        .map_err(|e| map_send_error(e, &url))
+}
+
+/// Build and send an HTTP request without a total request timeout.
+///
+/// For streaming responses, the total duration is unbounded — liveness is
+/// enforced by the caller's per-chunk idle timeout instead. Connection
+/// establishment is still bounded by the client-level `connect_timeout`.
+async fn send_backend_request_streaming(
+    client: &reqwest::Client,
+    route: &ResolvedRoute,
+    method: &str,
+    path: &str,
+    headers: Vec<(String, String)>,
+    body: bytes::Bytes,
+) -> Result<reqwest::Response, RouterError> {
+    let (builder, url) = prepare_backend_request(client, route, method, path, &headers, body)?;
+    builder.send().await.map_err(|e| map_send_error(e, &url))
 }

 fn validation_probe(route: &ResolvedRoute) -> Result<ValidationProbe, ValidationFailure> {
@@ -408,7 +449,8 @@ pub async fn proxy_to_backend_streaming(
     headers: Vec<(String, String)>,
     body: bytes::Bytes,
 ) -> Result<StreamingProxyResponse, RouterError> {
-    let response = send_backend_request(client, route, method, path, headers, body).await?;
+    let response =
+        send_backend_request_streaming(client, route, method, path, headers, body).await?;
     let (status, resp_headers) = extract_response_metadata(&response);

     Ok(StreamingProxyResponse {
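The per-chunk liveness model that replaces the total request timeout can be sketched with a plain std channel standing in for the async body stream. This is an illustrative sketch only: the real relay in `proxy.rs` is async, and `CHUNK_IDLE_TIMEOUT` is shortened here purely so the example runs quickly.

```rust
use std::sync::mpsc;
use std::thread;
use std::time::Duration;

// Shortened for illustration; the proxy uses 120 seconds.
const CHUNK_IDLE_TIMEOUT: Duration = Duration::from_millis(200);
const MAX_STREAMING_BODY: usize = 32 * 1024 * 1024;

/// Relay chunks until the stream ends, a chunk-to-chunk gap exceeds the idle
/// timeout, or the byte cap is hit. Note the total duration is unbounded:
/// only the gap between consecutive chunks is limited.
fn relay(rx: mpsc::Receiver<Vec<u8>>) -> Result<usize, String> {
    let mut total = 0usize;
    loop {
        match rx.recv_timeout(CHUNK_IDLE_TIMEOUT) {
            Ok(chunk) => {
                total += chunk.len();
                if total > MAX_STREAMING_BODY {
                    return Err("response truncated: exceeded maximum streaming body size".into());
                }
                // A real relay would frame and forward the chunk here.
            }
            // Sender dropped: upstream finished cleanly.
            Err(mpsc::RecvTimeoutError::Disconnected) => return Ok(total),
            Err(mpsc::RecvTimeoutError::Timeout) => {
                return Err("response truncated: chunk idle timeout exceeded".into());
            }
        }
    }
}
```

Any number of short gaps is fine as long as each stays under the idle timeout, which is why a reasoning model that streams for minutes is not cut off while a stalled upstream still is.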

crates/openshell-router/src/lib.rs

Lines changed: 2 additions & 0 deletions
@@ -10,6 +10,7 @@ pub use backend::{
     ValidationFailureKind, verify_backend_endpoint,
 };
 use config::{ResolvedRoute, RouterConfig};
+use std::time::Duration;
 use tracing::info;

 #[derive(Debug, thiserror::Error)]
@@ -37,6 +38,7 @@ pub struct Router {
 impl Router {
     pub fn new() -> Result<Self, RouterError> {
         let client = reqwest::Client::builder()
+            .connect_timeout(Duration::from_secs(30))
             .build()
             .map_err(|e| RouterError::Internal(format!("failed to build HTTP client: {e}")))?;
         Ok(Self {

crates/openshell-router/tests/backend_integration.rs

Lines changed: 133 additions & 0 deletions
@@ -468,3 +468,136 @@ fn config_resolves_routes_with_protocol() {
     let routes = config.resolve_routes().unwrap();
     assert_eq!(routes[0].protocols, vec!["openai_chat_completions"]);
 }
+
+/// Streaming proxy must not apply a total request timeout to the body stream.
+///
+/// This test simulates a slow-generating model: the backend delays its
+/// response past the route timeout. The streaming path must complete
+/// successfully because it relies on per-chunk idle timeouts (enforced by
+/// the sandbox relay loop) rather than a total request timeout.
+#[tokio::test]
+async fn streaming_proxy_completes_despite_exceeding_route_timeout() {
+    use std::time::Duration;
+
+    let mock_server = MockServer::start().await;
+
+    // SSE body returned by the mock backend.
+    let sse_body = concat!(
+        "data: {\"choices\":[{\"delta\":{\"content\":\"hello\"}}]}\n\n",
+        "data: {\"choices\":[{\"delta\":{\"content\":\" world\"}}]}\n\n",
+        "data: [DONE]\n\n",
+    );
+
+    Mock::given(method("POST"))
+        .and(path("/v1/chat/completions"))
+        .and(bearer_token("test-api-key"))
+        .respond_with(
+            ResponseTemplate::new(200)
+                .append_header("content-type", "text/event-stream")
+                .set_body_string(sse_body)
+                // Delay the response beyond the 2s route timeout set below.
+                .set_delay(Duration::from_secs(3)),
+        )
+        .mount(&mock_server)
+        .await;
+
+    let router = Router::new().unwrap();
+    let candidates = vec![ResolvedRoute {
+        name: "inference.local".to_string(),
+        endpoint: mock_server.uri(),
+        model: "test-model".to_string(),
+        api_key: "test-api-key".to_string(),
+        protocols: vec!["openai_chat_completions".to_string()],
+        auth: AuthHeader::Bearer,
+        default_headers: Vec::new(),
+        // Very short route timeout — streaming must NOT be constrained by this.
+        timeout: Duration::from_secs(2),
+    }];
+
+    let body = serde_json::to_vec(&serde_json::json!({
+        "model": "test-model",
+        "messages": [{"role": "user", "content": "hi"}],
+        "stream": true
+    }))
+    .unwrap();
+
+    // The streaming path should succeed — no total timeout applied.
+    let mut resp = router
+        .proxy_with_candidates_streaming(
+            "openai_chat_completions",
+            "POST",
+            "/v1/chat/completions",
+            vec![("content-type".to_string(), "application/json".to_string())],
+            bytes::Bytes::from(body),
+            &candidates,
+        )
+        .await
+        .expect("streaming proxy should not fail");
+
+    assert_eq!(resp.status, 200);
+
+    // Drain all chunks to verify the full body is received.
+    let mut total_bytes = 0;
+    while let Ok(Some(chunk)) = resp.next_chunk().await {
+        total_bytes += chunk.len();
+    }
+    assert!(total_bytes > 0, "should have received body chunks");
+}
+
+/// Non-streaming (buffered) proxy must still enforce the route timeout.
+#[tokio::test]
+async fn buffered_proxy_enforces_route_timeout() {
+    use std::time::Duration;
+
+    let mock_server = MockServer::start().await;
+
+    Mock::given(method("POST"))
+        .and(path("/v1/chat/completions"))
+        .respond_with(
+            ResponseTemplate::new(200)
+                .set_body_string("{}")
+                // Delay longer than the route timeout.
+                .set_delay(Duration::from_secs(5)),
+        )
+        .mount(&mock_server)
+        .await;
+
+    let router = Router::new().unwrap();
+    let candidates = vec![ResolvedRoute {
+        name: "inference.local".to_string(),
+        endpoint: mock_server.uri(),
+        model: "test-model".to_string(),
+        api_key: "test-api-key".to_string(),
+        protocols: vec!["openai_chat_completions".to_string()],
+        auth: AuthHeader::Bearer,
+        default_headers: Vec::new(),
+        timeout: Duration::from_secs(1),
+    }];
+
+    let body = serde_json::to_vec(&serde_json::json!({
+        "model": "test-model",
+        "messages": [{"role": "user", "content": "hi"}]
+    }))
+    .unwrap();
+
+    let result = router
+        .proxy_with_candidates(
+            "openai_chat_completions",
+            "POST",
+            "/v1/chat/completions",
+            vec![("content-type".to_string(), "application/json".to_string())],
+            bytes::Bytes::from(body),
+            &candidates,
+        )
+        .await;
+
+    assert!(result.is_err(), "buffered proxy should timeout");
+    let err = result.unwrap_err().to_string();
+    assert!(
+        err.contains("timed out"),
+        "error should mention timeout, got: {err}"
+    );
+}
