From 85495d5742e6eeb6123df81894fa85056a8baedc Mon Sep 17 00:00:00 2001 From: Thushan Fernando Date: Thu, 23 Apr 2026 11:56:10 +1000 Subject: [PATCH 01/11] revert anthropic_support default + fix stale comment --- config/profiles/openai-compatible.yaml | 10 ++++++++++ internal/app/handlers/handler_common.go | 20 +++++++++++++++++--- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/config/profiles/openai-compatible.yaml b/config/profiles/openai-compatible.yaml index c6ce14d8..d40be0ca 100644 --- a/config/profiles/openai-compatible.yaml +++ b/config/profiles/openai-compatible.yaml @@ -13,6 +13,16 @@ routing: # API compatibility api: openai_compatible: true + + # Anthropic Messages API support + # Most OpenAI-compatible backends (LiteLLM, generic proxies) do not natively serve + # /v1/messages, so passthrough is disabled by default to avoid 404s. Enable this + # per backend profile (e.g. vllm.yaml, lmstudio.yaml) when the server supports it. + anthropic_support: + enabled: false + messages_path: /v1/messages + token_count: false + paths: - /v1/models # 0: health check & models - /v1/chat/completions # 1: chat completions diff --git a/internal/app/handlers/handler_common.go b/internal/app/handlers/handler_common.go index 90a1370d..eef653ce 100644 --- a/internal/app/handlers/handler_common.go +++ b/internal/app/handlers/handler_common.go @@ -89,9 +89,23 @@ func (a *Application) isProviderSupported(provider string) bool { return staticProviders[normalised] } -// getProviderPrefix returns the URL prefix for a provider +// getProviderPrefix returns the canonical /olla/<provider>/ prefix for strip-and-forward routing. func getProviderPrefix(provider string) string { - // use the original provider name in the URL to maintain compatibility - // (e.g., if user accessed /olla/lmstudio/, keep that in the prefix) return constants.DefaultOllaProxyPathPrefix + provider } + +// getRawProviderPrefix extracts the URL prefix to strip from the incoming request path.
+// Unlike getProviderPrefix, this preserves the original spelling used by the caller +// (e.g., /olla/lmstudio/ rather than /olla/lm-studio/) so that path stripping works +// even when the caller uses an alias spelling. +func getRawProviderPrefix(path string) string { + if !strings.HasPrefix(path, constants.DefaultOllaProxyPathPrefix) { + return constants.DefaultOllaProxyPathPrefix + } + withoutBase := strings.TrimPrefix(path, constants.DefaultOllaProxyPathPrefix) + slashIdx := strings.Index(withoutBase, constants.DefaultPathPrefix) + if slashIdx == -1 { + return constants.DefaultOllaProxyPathPrefix + withoutBase + } + return constants.DefaultOllaProxyPathPrefix + withoutBase[:slashIdx] +} From 305548098e4856a781d5dde16b3fa1fd28bccae1 Mon Sep 17 00:00:00 2001 From: Thushan Fernando Date: Thu, 23 Apr 2026 11:56:11 +1000 Subject: [PATCH 02/11] fix sticky sessions for provider-scoped routes (#139) --- internal/adapter/balancer/sticky.go | 5 + .../adapter/balancer/sticky_metrics_test.go | 152 ++++++++++++++++++ internal/adapter/balancer/sticky_test.go | 24 ++- .../app/handlers/handler_provider_common.go | 14 +- .../app/handlers/handler_provider_test.go | 142 ++++++++++++++++ internal/app/services/discovery.go | 4 + 6 files changed, 324 insertions(+), 17 deletions(-) create mode 100644 internal/adapter/balancer/sticky_metrics_test.go diff --git a/internal/adapter/balancer/sticky.go b/internal/adapter/balancer/sticky.go index a36f067d..22369d96 100644 --- a/internal/adapter/balancer/sticky.go +++ b/internal/adapter/balancer/sticky.go @@ -199,11 +199,16 @@ func stickyKeyFromSessionHeader(r *http.Request, modelName string) (string, stri // stickyKeyFromPrefixHash hashes the first prefixBytes bytes of the messages // JSON array so requests with identical conversation prefixes are routed together. +// Falls back to the legacy completions "prompt" field so non-chat endpoints +// (e.g. /v1/completions, llamaswap-style passthroughs) also produce a key. 
func stickyKeyFromPrefixHash(body []byte, modelName string, prefixBytes int) (string, string) { if len(body) == 0 { return "", "" } raw := gjson.GetBytes(body, "messages").Raw + if raw == "" { + raw = gjson.GetBytes(body, "prompt").Raw + } if raw == "" { return "", "" } diff --git a/internal/adapter/balancer/sticky_metrics_test.go b/internal/adapter/balancer/sticky_metrics_test.go new file mode 100644 index 00000000..214cb14d --- /dev/null +++ b/internal/adapter/balancer/sticky_metrics_test.go @@ -0,0 +1,152 @@ +package balancer + +import ( + "context" + "net/http" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/thushan/olla/internal/config" + "github.com/thushan/olla/internal/core/constants" + "github.com/thushan/olla/internal/core/domain" +) + +// TestStickySessionWrapper_Stats_TracksMetrics verifies that Stats() reports +// non-zero counters after Select activity. Issue #139: the reporter saw +// /internal/stats/sticky returning all zero counters despite sticky being +// enabled and active. This guards against any future regression where the +// underlying ttlcache stops collecting metrics (e.g. an opt-in option being +// added in a future version). +func TestStickySessionWrapper_Stats_TracksMetrics(t *testing.T) { + t.Parallel() + + w := makeWrapper(t, defaultStickyConfig()) + ep1 := makeEndpoint("ep1", "http://backend1:8080") + endpoints := []*domain.Endpoint{ep1} + + // First select: miss → insertion + miss-style get (Get returns nil → not a hit). + ctx1, _ := injectKey(context.Background(), "stats-key:llama3", "session_header") + _, err := w.Select(ctx1, endpoints) + require.NoError(t, err) + + // Second select: hit on the same key → Hits++. 
+ ctx2, _ := injectKey(context.Background(), "stats-key:llama3", "session_header") + _, err = w.Select(ctx2, endpoints) + require.NoError(t, err) + + stats := w.Stats() + assert.True(t, stats.Enabled, "Stats.Enabled must be true for an active wrapper") + assert.Equal(t, 1, stats.ActiveSessions, "one unique key should produce one active session") + assert.GreaterOrEqual(t, stats.Insertions, uint64(1), "insertion counter must be tracked by the underlying ttlcache") + assert.GreaterOrEqual(t, stats.Hits, uint64(1), "hit counter must be tracked by the underlying ttlcache") +} + +// TestComputeStickyKey_PrefixHash_PromptFallback verifies that legacy completions +// requests (which use "prompt" instead of "messages") still produce a sticky key +// via the prefix_hash source. Without the fallback, /v1/completions and +// llamaswap-style passthrough requests bypass sticky entirely and the metrics +// counters never advance. +func TestComputeStickyKey_PrefixHash_PromptFallback(t *testing.T) { + t.Parallel() + + body := []byte(`{"model":"llama3","prompt":"why is the sky blue?","max_tokens":50}`) + req, _ := http.NewRequest(http.MethodPost, "/", nil) + + cfg := config.StickySessionConfig{ + KeySources: []string{"prefix_hash"}, + PrefixHashBytes: 512, + } + key, source := ComputeStickyKey(req, "llama3", cfg, body) + + assert.Equal(t, "prefix_hash", source, "prompt-only body must still resolve via prefix_hash") + assert.NotEmpty(t, key, "prompt fallback must produce a non-empty key") +} + +// TestComputeStickyKey_PrefixHash_PreferMessagesOverPrompt ensures the prompt +// fallback does not accidentally override the canonical messages path when both +// are present. 
+func TestComputeStickyKey_PrefixHash_PreferMessagesOverPrompt(t *testing.T) { + t.Parallel() + + cfg := config.StickySessionConfig{ + KeySources: []string{"prefix_hash"}, + PrefixHashBytes: 512, + } + + chatBody := []byte(`{"messages":[{"role":"user","content":"hi"}]}`) + mixedBody := []byte(`{"messages":[{"role":"user","content":"hi"}],"prompt":"different content"}`) + + keyChat, _ := ComputeStickyKey(&http.Request{}, "m", cfg, chatBody) + keyMixed, _ := ComputeStickyKey(&http.Request{}, "m", cfg, mixedBody) + + assert.Equal(t, keyChat, keyMixed, "messages must take precedence over prompt when both exist") +} + +// TestComputeStickyKey_SessionHeader_EmptyModel verifies that an unidentified +// model (e.g. llamaswap requests where no inspector populates pr.model) still +// produces a usable, distinct key per session ID. Reproduces a slice of +// issue #139 where requests without model identification appeared to bypass +// sticky entirely. +func TestComputeStickyKey_SessionHeader_EmptyModel(t *testing.T) { + t.Parallel() + + cfg := config.StickySessionConfig{ + KeySources: []string{"session_header"}, + PrefixHashBytes: 512, + } + + req1, _ := http.NewRequest(http.MethodPost, "/", nil) + req1.Header.Set(constants.HeaderXOllaSessionID, "session-A") + + req2, _ := http.NewRequest(http.MethodPost, "/", nil) + req2.Header.Set(constants.HeaderXOllaSessionID, "session-B") + + keyA, sourceA := ComputeStickyKey(req1, "", cfg, nil) + keyB, sourceB := ComputeStickyKey(req2, "", cfg, nil) + + assert.Equal(t, "session_header", sourceA) + assert.Equal(t, "session_header", sourceB) + assert.NotEmpty(t, keyA, "empty model must still produce a key when session header is present") + assert.NotEmpty(t, keyB) + assert.NotEqual(t, keyA, keyB, "distinct session IDs must produce distinct keys even with empty model") +} + +// TestStickySessionWrapper_EmptyModel_RoutesAndPins verifies the end-to-end path: +// a session header arrives with no identified model, the wrapper computes a key, 
+// pins a backend, and a second request with the same session ID hits the same +// backend. This is the scenario reported in issue #139 for llamaswap. +func TestStickySessionWrapper_EmptyModel_RoutesAndPins(t *testing.T) { + t.Parallel() + + w := makeWrapper(t, defaultStickyConfig()) + ep1 := makeEndpoint("ep1", "http://backend1:8080") + ep2 := makeEndpoint("ep2", "http://backend2:8080") + endpoints := []*domain.Endpoint{ep1, ep2} + + cfg := config.StickySessionConfig{ + KeySources: []string{"session_header"}, + PrefixHashBytes: 512, + } + + req, _ := http.NewRequest(http.MethodPost, "/", nil) + req.Header.Set(constants.HeaderXOllaSessionID, "llamaswap-session") + key, source := ComputeStickyKey(req, "", cfg, nil) + require.NotEmpty(t, key, "issue #139: empty model + session header must still yield a key") + + ctx1, out1 := injectKey(context.Background(), key, source) + first, err := w.Select(ctx1, endpoints) + require.NoError(t, err) + assert.Equal(t, "miss", out1.Result) + + ctx2, out2 := injectKey(context.Background(), key, source) + second, err := w.Select(ctx2, endpoints) + require.NoError(t, err) + assert.Equal(t, "hit", out2.Result) + assert.Equal(t, first.URLString, second.URLString, "same key must pin to same backend across calls") + + // Confirm the metrics counters reflect the activity — the symptom in #139. + stats := w.Stats() + assert.GreaterOrEqual(t, stats.Insertions, uint64(1)) + assert.GreaterOrEqual(t, stats.Hits, uint64(1)) +} diff --git a/internal/adapter/balancer/sticky_test.go b/internal/adapter/balancer/sticky_test.go index 89d6d5d3..e0720782 100644 --- a/internal/adapter/balancer/sticky_test.go +++ b/internal/adapter/balancer/sticky_test.go @@ -182,22 +182,16 @@ func TestStickySessionWrapper_TTLExpiry(t *testing.T) { assert.Equal(t, first.URLString, second.URLString) assert.Equal(t, "hit", outcome2.Result) - // Poll until the ttlcache TTL expires (1 s) rather than sleeping a fixed - // duration. 
Cap at 2 s to stay well above the TTL without being brittle. - deadline := time.Now().Add(2 * time.Second) - var outcome3 *StickyOutcome - for time.Now().Before(deadline) { - ctx3, o3 := injectKey(context.Background(), stickyKey, "session_header") - _, err = w.Select(ctx3, endpoints) - require.NoError(t, err) - outcome3 = o3 - if outcome3.Result == "miss" { - break - } - time.Sleep(50 * time.Millisecond) - } + // ttlcache uses sliding TTL (touch-on-hit) — every Get refreshes the + // entry. Polling with Select would keep the entry alive forever, so we + // must wait the full TTL without touching the store. Add a small buffer + // to allow the background janitor goroutine to run the eviction sweep. + time.Sleep(time.Duration(cfg.IdleTTLSeconds)*time.Second + 500*time.Millisecond) + + ctx3, outcome3 := injectKey(context.Background(), stickyKey, "session_header") + _, err = w.Select(ctx3, endpoints) + require.NoError(t, err) // After TTL the entry is gone, so it's a fresh miss not a repin. - require.NotNil(t, outcome3) assert.Equal(t, "miss", outcome3.Result) } diff --git a/internal/app/handlers/handler_provider_common.go b/internal/app/handlers/handler_provider_common.go index 837ceb9f..94a92095 100644 --- a/internal/app/handlers/handler_provider_common.go +++ b/internal/app/handlers/handler_provider_common.go @@ -78,14 +78,24 @@ func (a *Application) providerProxyHandler(w http.ResponseWriter, r *http.Reques ctx = context.WithValue(ctx, constants.ContextProviderTypeKey, providerType) // The proxy needs to know which prefix to strip before forwarding. - // This mimics the behaviour of the main router for consistency. - providerPrefix := getProviderPrefix(providerType) + // We must use the RAW (pre-normalisation) path segment as the strip prefix so + // that alias spellings like /olla/lmstudio/ strip correctly — using the + // normalised name (lm-studio) would produce a non-matching prefix and forward + // the full /olla/lmstudio/... 
path to the backend, causing a 404. + providerPrefix := getRawProviderPrefix(r.URL.Path) ctx = context.WithValue(ctx, constants.ContextRoutePrefixKey, providerPrefix) r = r.WithContext(ctx) ctx, r = a.setupRequestContext(r, pr.stats) a.analyzeRequest(ctx, r, pr) + // Sticky session key must be computed after analyzeRequest so the model name + // is populated; inject into context before endpoint selection so the sticky + // balancer wrapper observes the key during Select(). Mirrors proxyHandler. + if a.Config.Proxy.StickySessions.Enabled { + ctx, r, _ = a.injectStickyKey(ctx, r, pr.model) + } + endpoints, err := a.getProviderEndpoints(ctx, providerType, pr) if err != nil { a.handleEndpointError(w, pr, err) diff --git a/internal/app/handlers/handler_provider_test.go b/internal/app/handlers/handler_provider_test.go index 70af6803..8eaef2e9 100644 --- a/internal/app/handlers/handler_provider_test.go +++ b/internal/app/handlers/handler_provider_test.go @@ -4,14 +4,18 @@ import ( "context" "net/http" "net/http/httptest" + "net/url" "strings" "testing" "time" + "github.com/thushan/olla/internal/adapter/balancer" "github.com/thushan/olla/internal/adapter/inspector" "github.com/thushan/olla/internal/config" "github.com/thushan/olla/internal/core/constants" "github.com/thushan/olla/internal/core/domain" + "github.com/thushan/olla/internal/core/ports" + "github.com/thushan/olla/internal/logger" ) // mockDiscoveryService for testing @@ -143,6 +147,144 @@ func TestProviderRouting(t *testing.T) { } } +// mockDiscoveryServiceWithHealthy returns a single healthy endpoint matching the +// requested provider type so provider-scoped routing can reach the proxy stage. 
+type mockDiscoveryServiceWithHealthy struct { + endpoints []*domain.Endpoint +} + +func (m *mockDiscoveryServiceWithHealthy) GetEndpoints(ctx context.Context) ([]*domain.Endpoint, error) { + return m.endpoints, nil +} +func (m *mockDiscoveryServiceWithHealthy) GetHealthyEndpoints(ctx context.Context) ([]*domain.Endpoint, error) { + return m.endpoints, nil +} +func (m *mockDiscoveryServiceWithHealthy) RefreshEndpoints(ctx context.Context) error { return nil } +func (m *mockDiscoveryServiceWithHealthy) UpdateEndpointStatus(ctx context.Context, endpoint *domain.Endpoint) error { + return nil +} + +// captureProxyService records the request context so tests can assert which +// values the handler propagated to the proxy engine. +type captureProxyService struct { + capturedCtx context.Context +} + +func (m *captureProxyService) ProxyRequestToEndpoints(ctx context.Context, w http.ResponseWriter, r *http.Request, endpoints []*domain.Endpoint, stats *ports.RequestStats, rlog logger.StyledLogger) error { + m.capturedCtx = r.Context() + // Simulate the sticky wrapper writing outcome headers before the proxy flushes. + if outcome, ok := r.Context().Value(constants.ContextStickyOutcomeKey).(*balancer.StickyOutcome); ok && outcome != nil { + outcome.Result = "miss" + outcome.Source, _ = r.Context().Value(constants.ContextStickyKeySourceKey).(string) + } + w.WriteHeader(http.StatusOK) + return nil +} + +func (m *captureProxyService) ProxyRequest(ctx context.Context, w http.ResponseWriter, r *http.Request, stats *ports.RequestStats, rlog logger.StyledLogger) error { + return nil +} +func (m *captureProxyService) GetStats(ctx context.Context) (ports.ProxyStats, error) { + return ports.ProxyStats{}, nil +} +func (m *captureProxyService) UpdateConfig(configuration ports.ProxyConfiguration) {} + +// TestProviderProxyHandler_InjectsStickyKey verifies that provider-scoped routes +// (e.g. /olla/ollama/, /olla/lemonade/) invoke sticky key injection just like +// the main proxyHandler. 
Regression test for github.com/thushan/olla#139 where +// requests to provider URLs bypassed sticky sessions entirely — counters stayed +// at zero and no X-Olla-Sticky-Session header was ever emitted. +func TestProviderProxyHandler_InjectsStickyKey(t *testing.T) { + app := createTestApplication(t) + + // Enable sticky sessions; without this the handler intentionally skips injection. + app.Config.Proxy.StickySessions = config.StickySessionConfig{ + Enabled: true, + KeySources: []string{"session_header", "prefix_hash", "ip"}, + MaxSessions: 100, + IdleTTLSeconds: 60, + PrefixHashBytes: 512, + } + + u, _ := url.Parse("http://localhost:11434") + app.discoveryService = &mockDiscoveryServiceWithHealthy{ + endpoints: []*domain.Endpoint{{ + Name: "ollama-1", + URL: u, + URLString: u.String(), + Type: "ollama", + Status: domain.StatusHealthy, + }}, + } + + capture := &captureProxyService{} + app.proxyService = capture + + sessionID := "session-abc-123" + req := httptest.NewRequest(http.MethodPost, "/olla/ollama/api/chat", strings.NewReader(`{"model":"llama3"}`)) + req.Header.Set(constants.HeaderContentType, constants.ContentTypeJSON) + req.Header.Set(constants.HeaderXOllaSessionID, sessionID) + w := httptest.NewRecorder() + + app.providerProxyHandler(w, req) + + if capture.capturedCtx == nil { + t.Fatalf("proxy was never invoked; handler failed before reaching executeProxyRequest (status=%d body=%q)", w.Code, w.Body.String()) + } + + stickyKey, _ := capture.capturedCtx.Value(constants.ContextStickyKeyKey).(string) + if stickyKey == "" { + t.Fatal("expected sticky key to be injected into context, got empty string — providerProxyHandler is bypassing sticky sessions") + } + + source, _ := capture.capturedCtx.Value(constants.ContextStickyKeySourceKey).(string) + if source != "session_header" { + t.Errorf("expected key source 'session_header' (X-Olla-Session-ID was supplied), got %q", source) + } + + outcome, _ := 
capture.capturedCtx.Value(constants.ContextStickyOutcomeKey).(*balancer.StickyOutcome) + if outcome == nil { + t.Fatal("expected StickyOutcome pointer in context for the balancer wrapper to populate") + } +} + +// TestProviderProxyHandler_SkipsStickyWhenDisabled guards against accidental +// breakage of the config gate — requests must not pay the body-read cost or +// pollute the context when sticky sessions are disabled. +func TestProviderProxyHandler_SkipsStickyWhenDisabled(t *testing.T) { + app := createTestApplication(t) + + // StickySessions.Enabled defaults to false via createTestApplication. + + u, _ := url.Parse("http://localhost:11434") + app.discoveryService = &mockDiscoveryServiceWithHealthy{ + endpoints: []*domain.Endpoint{{ + Name: "ollama-1", + URL: u, + URLString: u.String(), + Type: "ollama", + Status: domain.StatusHealthy, + }}, + } + + capture := &captureProxyService{} + app.proxyService = capture + + req := httptest.NewRequest(http.MethodPost, "/olla/ollama/api/chat", strings.NewReader(`{"model":"llama3"}`)) + req.Header.Set(constants.HeaderContentType, constants.ContentTypeJSON) + req.Header.Set(constants.HeaderXOllaSessionID, "abc") + w := httptest.NewRecorder() + + app.providerProxyHandler(w, req) + + if capture.capturedCtx == nil { + t.Fatalf("proxy was never invoked (status=%d body=%q)", w.Code, w.Body.String()) + } + if key, _ := capture.capturedCtx.Value(constants.ContextStickyKeyKey).(string); key != "" { + t.Errorf("expected no sticky key when disabled, got %q", key) + } +} + // TestProviderPathStripping tests that provider prefixes are correctly stripped func TestProviderPathStripping(t *testing.T) { tests := []struct { diff --git a/internal/app/services/discovery.go b/internal/app/services/discovery.go index d040d8ec..627197f6 100644 --- a/internal/app/services/discovery.go +++ b/internal/app/services/discovery.go @@ -80,6 +80,10 @@ func (s *DiscoveryService) Start(ctx context.Context) error { Type: s.registryConfig.Type, 
EnableUnifier: s.registryConfig.EnableUnifier, UnificationConf: &s.registryConfig.Unification, + // Routing strategy must be forwarded so the configured type (strict, + // optimistic, discovery) takes effect. Without this, registries fall + // back to strict regardless of what the operator configured. + RoutingStrategy: &s.registryConfig.RoutingStrategy, } var err error s.registry, err = registry.NewModelRegistry(registryConfig, s.logger) From b900a20f3fbab9c2d33445b87a41ae6f61d281d4 Mon Sep 17 00:00:00 2001 From: Thushan Fernando Date: Thu, 23 Apr 2026 11:56:12 +1000 Subject: [PATCH 03/11] add aimock test harness for sticky sessions --- .claude/skills/test-sticky-sessions.md | 223 +++++++++++ makefile | 60 ++- test/manual/config.sticky.yaml | 285 ++++++++++++++ test/mock/compose.yaml | 64 ++++ test/mock/fixtures/instance-a.json | 11 + test/mock/fixtures/instance-b.json | 11 + test/mock/fixtures/instance-c.json | 11 + test/scripts/sticky/run-manual.sh | 54 +++ .../sticky/test-sticky-provider-routes.sh | 359 ++++++++++++++++++ 9 files changed, 1077 insertions(+), 1 deletion(-) create mode 100644 .claude/skills/test-sticky-sessions.md create mode 100644 test/manual/config.sticky.yaml create mode 100644 test/mock/compose.yaml create mode 100644 test/mock/fixtures/instance-a.json create mode 100644 test/mock/fixtures/instance-b.json create mode 100644 test/mock/fixtures/instance-c.json create mode 100644 test/scripts/sticky/run-manual.sh create mode 100644 test/scripts/sticky/test-sticky-provider-routes.sh diff --git a/.claude/skills/test-sticky-sessions.md b/.claude/skills/test-sticky-sessions.md new file mode 100644 index 00000000..7a2c93f0 --- /dev/null +++ b/.claude/skills/test-sticky-sessions.md @@ -0,0 +1,223 @@ +--- +name: test-sticky-sessions +description: > + Runs the Olla sticky session integration test harness end-to-end across all + provider-scoped routes. 
Trigger when the user asks to: verify sticky sessions + work, run the sticky session integration test, test provider-route affinity, + or check whether the providerProxyHandler bug fix is holding. + Delegable to Sonnet — does not require Opus. +--- + +# Sticky Session Integration Test + +This skill exercises sticky session affinity across **all** provider-scoped +routes that AIMock can serve: + +| Route | Request path | Status | +|---|---|---| +| Main proxy | `/olla/proxy/v1/chat/completions` | tested | +| openai-compatible | `/olla/openai-compatible/v1/chat/completions` | tested (primary regression target) | +| openai | `/olla/openai/v1/chat/completions` | tested | +| vllm | `/olla/vllm/v1/chat/completions` | tested | +| sglang | `/olla/sglang/v1/chat/completions` | tested | +| llamacpp | `/olla/llamacpp/v1/chat/completions` | tested | +| lmstudio | `/olla/lmstudio/v1/chat/completions` | tested | +| lm-studio (alt prefix) | `/olla/lm-studio/v1/chat/completions` | tested | +| litellm | `/olla/litellm/v1/chat/completions` | tested | +| dmr | `/olla/dmr/v1/chat/completions` | tested | +| vllm-mlx | `/olla/vllm-mlx/v1/chat/completions` | tested | +| anthropic translator | `/olla/anthropic/v1/messages` | tested + passthrough assertion | +| lemonade | `/olla/lemonade/api/v1/chat/completions` | **skipped** — AIMock does not serve `/api/v1/*` | +| ollama | `/olla/ollama/api/chat` | **skipped** — AIMock does not speak Ollama `/api/*` protocol | + +The `/olla/openai-compatible/` and `/olla/openai/` paths were affected by a bug +where `providerProxyHandler` never injected sticky session context — those +routes are the primary regression targets. + +## Steps + +### 1. Pre-flight: verify Docker is running + +```bash +docker info > /dev/null 2>&1 || { echo "Docker is not running — start Docker Desktop first"; exit 1; } +``` + +### 2. Start AIMock instances + +```bash +make mock-up +``` + +Waits until all three AIMock containers report healthy (ports 9300/9301/9302). 
+Each instance returns a unique `BACKEND:instance-{a,b,c}` marker so the test +can confirm which backend served each response. + +### 3. Build Olla and start with sticky config + +```bash +LOG="${TMPDIR:-/tmp}/olla-sticky.log" +go run . --config test/manual/config.sticky.yaml > "$LOG" 2>&1 & +OLLA_PID=$! +``` + +Wait until ready: +```bash +until curl -sf http://localhost:40114/internal/health > /dev/null; do sleep 1; done +echo "Olla ready (PID $OLLA_PID, log $LOG)" +``` + +### 4. Run the assertion script + +```bash +OLLA_URL=http://localhost:40114 bash test/scripts/sticky/test-sticky-provider-routes.sh +RESULT=$? +``` + +For each active (non-skipped) route, the script asserts: +- Turn 1: `X-Olla-Sticky-Session: miss`, `X-Olla-Sticky-Key-Source: session_header` +- Turn 2: `X-Olla-Sticky-Session: hit`, same `X-Olla-Endpoint` as Turn 1, same backend marker +- Turn 3: across 10 fresh sessions, at least one lands on a different backend +- Anthropic path additionally asserts `X-Olla-Mode: passthrough` +- Stats endpoint: `insertions > 0`, `hits > 0`, `active_sessions > 0` + +Skipped routes print clearly: `SKIP