From 85495d5742e6eeb6123df81894fa85056a8baedc Mon Sep 17 00:00:00 2001
From: Thushan Fernando <thushan@thushanfernando.com>
Date: Thu, 23 Apr 2026 11:56:10 +1000
Subject: [PATCH 01/11] revert anthropic_support default + fix stale comment

---
 config/profiles/openai-compatible.yaml  | 10 ++++++++++
 internal/app/handlers/handler_common.go | 20 +++++++++++++++++---
 2 files changed, 27 insertions(+), 3 deletions(-)
diff --git a/config/profiles/openai-compatible.yaml b/config/profiles/openai-compatible.yaml
index c6ce14d8..d40be0ca 100644
--- a/config/profiles/openai-compatible.yaml
+++ b/config/profiles/openai-compatible.yaml
@@ -13,6 +13,16 @@ routing:
 # API compatibility
 api:
   openai_compatible: true
+
+  # Anthropic Messages API support
+  # Most OpenAI-compatible backends (LiteLLM, generic proxies) do not natively serve
+  # /v1/messages, so passthrough is disabled by default to avoid 404s. Enable this
+  # per backend profile (e.g. vllm.yaml, lmstudio.yaml) when the server supports it.
+  anthropic_support:
+    enabled: false
+    messages_path: /v1/messages
+    token_count: false
+
   paths:
     - /v1/models          # 0: health check & models
     - /v1/chat/completions # 1: chat completions
diff --git a/internal/app/handlers/handler_common.go b/internal/app/handlers/handler_common.go
index 90a1370d..eef653ce 100644
--- a/internal/app/handlers/handler_common.go
+++ b/internal/app/handlers/handler_common.go
@@ -89,9 +89,23 @@ func (a *Application) isProviderSupported(provider string) bool {
 	return staticProviders[normalised]
 }
 
-// getProviderPrefix returns the URL prefix for a provider
+// getProviderPrefix returns the canonical /olla/<provider>/ prefix for strip-and-forward routing.
 func getProviderPrefix(provider string) string {
-	// use the original provider name in the URL to maintain compatibility
-	// (e.g., if user accessed /olla/lmstudio/, keep that in the prefix)
 	return constants.DefaultOllaProxyPathPrefix + provider
 }
+
+// getRawProviderPrefix extracts the URL prefix to strip from the incoming request path.
+// Unlike getProviderPrefix, this preserves the original spelling used by the caller
+// (e.g., /olla/lmstudio/ rather than /olla/lm-studio/) so that path stripping works
+// even when the caller uses an alias spelling.
+func getRawProviderPrefix(path string) string {
+	if !strings.HasPrefix(path, constants.DefaultOllaProxyPathPrefix) {
+		return constants.DefaultOllaProxyPathPrefix
+	}
+	withoutBase := strings.TrimPrefix(path, constants.DefaultOllaProxyPathPrefix)
+	slashIdx := strings.Index(withoutBase, constants.DefaultPathPrefix)
+	if slashIdx == -1 {
+		return constants.DefaultOllaProxyPathPrefix + withoutBase
+	}
+	return constants.DefaultOllaProxyPathPrefix + withoutBase[:slashIdx]
+}

From 305548098e4856a781d5dde16b3fa1fd28bccae1 Mon Sep 17 00:00:00 2001
From: Thushan Fernando <thushan@thushanfernando.com>
Date: Thu, 23 Apr 2026 11:56:11 +1000
Subject: [PATCH 02/11] fix sticky sessions for provider-scoped routes (#139)

---
 internal/adapter/balancer/sticky.go           |   5 +
 .../adapter/balancer/sticky_metrics_test.go   | 152 ++++++++++++++++++
 internal/adapter/balancer/sticky_test.go      |  24 ++-
 .../app/handlers/handler_provider_common.go   |  14 +-
 .../app/handlers/handler_provider_test.go     | 142 ++++++++++++++++
 internal/app/services/discovery.go            |   4 +
 6 files changed, 324 insertions(+), 17 deletions(-)
 create mode 100644 internal/adapter/balancer/sticky_metrics_test.go

diff --git a/internal/adapter/balancer/sticky.go b/internal/adapter/balancer/sticky.go
index a36f067d..22369d96 100644
--- a/internal/adapter/balancer/sticky.go
+++ b/internal/adapter/balancer/sticky.go
@@ -199,11 +199,16 @@ func stickyKeyFromSessionHeader(r *http.Request, modelName string) (string, stri
 
 // stickyKeyFromPrefixHash hashes the first prefixBytes bytes of the messages
 // JSON array so requests with identical conversation prefixes are routed together.
+// Falls back to the legacy completions "prompt" field so non-chat endpoints
+// (e.g. /v1/completions, llamaswap-style passthroughs) also produce a key.
 func stickyKeyFromPrefixHash(body []byte, modelName string, prefixBytes int) (string, string) {
 	if len(body) == 0 {
 		return "", ""
 	}
 	raw := gjson.GetBytes(body, "messages").Raw
+	if raw == "" {
+		raw = gjson.GetBytes(body, "prompt").Raw
+	}
 	if raw == "" {
 		return "", ""
 	}
diff --git a/internal/adapter/balancer/sticky_metrics_test.go b/internal/adapter/balancer/sticky_metrics_test.go
new file mode 100644
index 00000000..214cb14d
--- /dev/null
+++ b/internal/adapter/balancer/sticky_metrics_test.go
@@ -0,0 +1,152 @@
+package balancer
+
+import (
+	"context"
+	"net/http"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"github.com/thushan/olla/internal/config"
+	"github.com/thushan/olla/internal/core/constants"
+	"github.com/thushan/olla/internal/core/domain"
+)
+
+// TestStickySessionWrapper_Stats_TracksMetrics verifies that Stats() reports
+// non-zero counters after Select activity. Issue #139: the reporter saw
+// /internal/stats/sticky returning all zero counters despite sticky being
+// enabled and active. This guards against any future regression where the
+// underlying ttlcache stops collecting metrics (e.g. an opt-in option being
+// added in a future version).
+func TestStickySessionWrapper_Stats_TracksMetrics(t *testing.T) {
+	t.Parallel()
+
+	w := makeWrapper(t, defaultStickyConfig())
+	ep1 := makeEndpoint("ep1", "http://backend1:8080")
+	endpoints := []*domain.Endpoint{ep1}
+
+	// First select: miss → insertion + miss-style get (Get returns nil → not a hit).
+	ctx1, _ := injectKey(context.Background(), "stats-key:llama3", "session_header")
+	_, err := w.Select(ctx1, endpoints)
+	require.NoError(t, err)
+
+	// Second select: hit on the same key → Hits++.
+	ctx2, _ := injectKey(context.Background(), "stats-key:llama3", "session_header")
+	_, err = w.Select(ctx2, endpoints)
+	require.NoError(t, err)
+
+	stats := w.Stats()
+	assert.True(t, stats.Enabled, "Stats.Enabled must be true for an active wrapper")
+	assert.Equal(t, 1, stats.ActiveSessions, "one unique key should produce one active session")
+	assert.GreaterOrEqual(t, stats.Insertions, uint64(1), "insertion counter must be tracked by the underlying ttlcache")
+	assert.GreaterOrEqual(t, stats.Hits, uint64(1), "hit counter must be tracked by the underlying ttlcache")
+}
+
+// TestComputeStickyKey_PrefixHash_PromptFallback verifies that legacy completions
+// requests (which use "prompt" instead of "messages") still produce a sticky key
+// via the prefix_hash source. Without the fallback, /v1/completions and
+// llamaswap-style passthrough requests bypass sticky entirely and the metrics
+// counters never advance.
+func TestComputeStickyKey_PrefixHash_PromptFallback(t *testing.T) {
+	t.Parallel()
+
+	body := []byte(`{"model":"llama3","prompt":"why is the sky blue?","max_tokens":50}`)
+	req, _ := http.NewRequest(http.MethodPost, "/", nil)
+
+	cfg := config.StickySessionConfig{
+		KeySources:      []string{"prefix_hash"},
+		PrefixHashBytes: 512,
+	}
+	key, source := ComputeStickyKey(req, "llama3", cfg, body)
+
+	assert.Equal(t, "prefix_hash", source, "prompt-only body must still resolve via prefix_hash")
+	assert.NotEmpty(t, key, "prompt fallback must produce a non-empty key")
+}
+
+// TestComputeStickyKey_PrefixHash_PreferMessagesOverPrompt ensures the prompt
+// fallback does not accidentally override the canonical messages path when both
+// are present.
+func TestComputeStickyKey_PrefixHash_PreferMessagesOverPrompt(t *testing.T) {
+	t.Parallel()
+
+	cfg := config.StickySessionConfig{
+		KeySources:      []string{"prefix_hash"},
+		PrefixHashBytes: 512,
+	}
+
+	chatBody := []byte(`{"messages":[{"role":"user","content":"hi"}]}`)
+	mixedBody := []byte(`{"messages":[{"role":"user","content":"hi"}],"prompt":"different content"}`)
+
+	keyChat, _ := ComputeStickyKey(&http.Request{}, "m", cfg, chatBody)
+	keyMixed, _ := ComputeStickyKey(&http.Request{}, "m", cfg, mixedBody)
+
+	assert.Equal(t, keyChat, keyMixed, "messages must take precedence over prompt when both exist")
+}
+
+// TestComputeStickyKey_SessionHeader_EmptyModel verifies that an unidentified
+// model (e.g. llamaswap requests where no inspector populates pr.model) still
+// produces a usable, distinct key per session ID. Reproduces a slice of
+// issue #139 where requests without model identification appeared to bypass
+// sticky entirely.
+func TestComputeStickyKey_SessionHeader_EmptyModel(t *testing.T) {
+	t.Parallel()
+
+	cfg := config.StickySessionConfig{
+		KeySources:      []string{"session_header"},
+		PrefixHashBytes: 512,
+	}
+
+	req1, _ := http.NewRequest(http.MethodPost, "/", nil)
+	req1.Header.Set(constants.HeaderXOllaSessionID, "session-A")
+
+	req2, _ := http.NewRequest(http.MethodPost, "/", nil)
+	req2.Header.Set(constants.HeaderXOllaSessionID, "session-B")
+
+	keyA, sourceA := ComputeStickyKey(req1, "", cfg, nil)
+	keyB, sourceB := ComputeStickyKey(req2, "", cfg, nil)
+
+	assert.Equal(t, "session_header", sourceA)
+	assert.Equal(t, "session_header", sourceB)
+	assert.NotEmpty(t, keyA, "empty model must still produce a key when session header is present")
+	assert.NotEmpty(t, keyB)
+	assert.NotEqual(t, keyA, keyB, "distinct session IDs must produce distinct keys even with empty model")
+}
+
+// TestStickySessionWrapper_EmptyModel_RoutesAndPins verifies the end-to-end path:
+// a session header arrives with no identified model, the wrapper computes a key,
+// pins a backend, and a second request with the same session ID hits the same
+// backend. This is the scenario reported in issue #139 for llamaswap.
+func TestStickySessionWrapper_EmptyModel_RoutesAndPins(t *testing.T) {
+	t.Parallel()
+
+	w := makeWrapper(t, defaultStickyConfig())
+	ep1 := makeEndpoint("ep1", "http://backend1:8080")
+	ep2 := makeEndpoint("ep2", "http://backend2:8080")
+	endpoints := []*domain.Endpoint{ep1, ep2}
+
+	cfg := config.StickySessionConfig{
+		KeySources:      []string{"session_header"},
+		PrefixHashBytes: 512,
+	}
+
+	req, _ := http.NewRequest(http.MethodPost, "/", nil)
+	req.Header.Set(constants.HeaderXOllaSessionID, "llamaswap-session")
+	key, source := ComputeStickyKey(req, "", cfg, nil)
+	require.NotEmpty(t, key, "issue #139: empty model + session header must still yield a key")
+
+	ctx1, out1 := injectKey(context.Background(), key, source)
+	first, err := w.Select(ctx1, endpoints)
+	require.NoError(t, err)
+	assert.Equal(t, "miss", out1.Result)
+
+	ctx2, out2 := injectKey(context.Background(), key, source)
+	second, err := w.Select(ctx2, endpoints)
+	require.NoError(t, err)
+	assert.Equal(t, "hit", out2.Result)
+	assert.Equal(t, first.URLString, second.URLString, "same key must pin to same backend across calls")
+
+	// Confirm the metrics counters reflect the activity — the symptom in #139.
+	stats := w.Stats()
+	assert.GreaterOrEqual(t, stats.Insertions, uint64(1))
+	assert.GreaterOrEqual(t, stats.Hits, uint64(1))
+}
diff --git a/internal/adapter/balancer/sticky_test.go b/internal/adapter/balancer/sticky_test.go
index 89d6d5d3..e0720782 100644
--- a/internal/adapter/balancer/sticky_test.go
+++ b/internal/adapter/balancer/sticky_test.go
@@ -182,22 +182,16 @@ func TestStickySessionWrapper_TTLExpiry(t *testing.T) {
 	assert.Equal(t, first.URLString, second.URLString)
 	assert.Equal(t, "hit", outcome2.Result)
 
-	// Poll until the ttlcache TTL expires (1 s) rather than sleeping a fixed
-	// duration. Cap at 2 s to stay well above the TTL without being brittle.
-	deadline := time.Now().Add(2 * time.Second)
-	var outcome3 *StickyOutcome
-	for time.Now().Before(deadline) {
-		ctx3, o3 := injectKey(context.Background(), stickyKey, "session_header")
-		_, err = w.Select(ctx3, endpoints)
-		require.NoError(t, err)
-		outcome3 = o3
-		if outcome3.Result == "miss" {
-			break
-		}
-		time.Sleep(50 * time.Millisecond)
-	}
+	// ttlcache uses sliding TTL (touch-on-hit) — every Get refreshes the
+	// entry. Polling with Select would keep the entry alive forever, so we
+	// must wait the full TTL without touching the store. Add a small buffer
+	// to allow the background janitor goroutine to run the eviction sweep.
+	time.Sleep(time.Duration(cfg.IdleTTLSeconds)*time.Second + 500*time.Millisecond)
+
+	ctx3, outcome3 := injectKey(context.Background(), stickyKey, "session_header")
+	_, err = w.Select(ctx3, endpoints)
+	require.NoError(t, err)
 	// After TTL the entry is gone, so it's a fresh miss not a repin.
-	require.NotNil(t, outcome3)
 	assert.Equal(t, "miss", outcome3.Result)
 }
 
diff --git a/internal/app/handlers/handler_provider_common.go b/internal/app/handlers/handler_provider_common.go
index 837ceb9f..94a92095 100644
--- a/internal/app/handlers/handler_provider_common.go
+++ b/internal/app/handlers/handler_provider_common.go
@@ -78,14 +78,24 @@ func (a *Application) providerProxyHandler(w http.ResponseWriter, r *http.Reques
 	ctx = context.WithValue(ctx, constants.ContextProviderTypeKey, providerType)
 
 	// The proxy needs to know which prefix to strip before forwarding.
-	// This mimics the behaviour of the main router for consistency.
-	providerPrefix := getProviderPrefix(providerType)
+	// We must use the RAW (pre-normalisation) path segment as the strip prefix so
+	// that alias spellings like /olla/lmstudio/ strip correctly — using the
+	// normalised name (lm-studio) would produce a non-matching prefix and forward
+	// the full /olla/lmstudio/... path to the backend, causing a 404.
+	providerPrefix := getRawProviderPrefix(r.URL.Path)
 	ctx = context.WithValue(ctx, constants.ContextRoutePrefixKey, providerPrefix)
 	r = r.WithContext(ctx)
 
 	ctx, r = a.setupRequestContext(r, pr.stats)
 	a.analyzeRequest(ctx, r, pr)
 
+	// Sticky session key must be computed after analyzeRequest so the model name
+	// is populated; inject into context before endpoint selection so the sticky
+	// balancer wrapper observes the key during Select(). Mirrors proxyHandler.
+	if a.Config.Proxy.StickySessions.Enabled {
+		ctx, r, _ = a.injectStickyKey(ctx, r, pr.model)
+	}
+
 	endpoints, err := a.getProviderEndpoints(ctx, providerType, pr)
 	if err != nil {
 		a.handleEndpointError(w, pr, err)
diff --git a/internal/app/handlers/handler_provider_test.go b/internal/app/handlers/handler_provider_test.go
index 70af6803..8eaef2e9 100644
--- a/internal/app/handlers/handler_provider_test.go
+++ b/internal/app/handlers/handler_provider_test.go
@@ -4,14 +4,18 @@ import (
 	"context"
 	"net/http"
 	"net/http/httptest"
+	"net/url"
 	"strings"
 	"testing"
 	"time"
 
+	"github.com/thushan/olla/internal/adapter/balancer"
 	"github.com/thushan/olla/internal/adapter/inspector"
 	"github.com/thushan/olla/internal/config"
 	"github.com/thushan/olla/internal/core/constants"
 	"github.com/thushan/olla/internal/core/domain"
+	"github.com/thushan/olla/internal/core/ports"
+	"github.com/thushan/olla/internal/logger"
 )
 
 // mockDiscoveryService for testing
@@ -143,6 +147,144 @@ func TestProviderRouting(t *testing.T) {
 	}
 }
 
+// mockDiscoveryServiceWithHealthy returns a single healthy endpoint matching the
+// requested provider type so provider-scoped routing can reach the proxy stage.
+type mockDiscoveryServiceWithHealthy struct {
+	endpoints []*domain.Endpoint
+}
+
+func (m *mockDiscoveryServiceWithHealthy) GetEndpoints(ctx context.Context) ([]*domain.Endpoint, error) {
+	return m.endpoints, nil
+}
+func (m *mockDiscoveryServiceWithHealthy) GetHealthyEndpoints(ctx context.Context) ([]*domain.Endpoint, error) {
+	return m.endpoints, nil
+}
+func (m *mockDiscoveryServiceWithHealthy) RefreshEndpoints(ctx context.Context) error { return nil }
+func (m *mockDiscoveryServiceWithHealthy) UpdateEndpointStatus(ctx context.Context, endpoint *domain.Endpoint) error {
+	return nil
+}
+
+// captureProxyService records the request context so tests can assert which
+// values the handler propagated to the proxy engine.
+type captureProxyService struct {
+	capturedCtx context.Context
+}
+
+func (m *captureProxyService) ProxyRequestToEndpoints(ctx context.Context, w http.ResponseWriter, r *http.Request, endpoints []*domain.Endpoint, stats *ports.RequestStats, rlog logger.StyledLogger) error {
+	m.capturedCtx = r.Context()
+	// Simulate the sticky wrapper writing outcome headers before the proxy flushes.
+	if outcome, ok := r.Context().Value(constants.ContextStickyOutcomeKey).(*balancer.StickyOutcome); ok && outcome != nil {
+		outcome.Result = "miss"
+		outcome.Source, _ = r.Context().Value(constants.ContextStickyKeySourceKey).(string)
+	}
+	w.WriteHeader(http.StatusOK)
+	return nil
+}
+
+func (m *captureProxyService) ProxyRequest(ctx context.Context, w http.ResponseWriter, r *http.Request, stats *ports.RequestStats, rlog logger.StyledLogger) error {
+	return nil
+}
+func (m *captureProxyService) GetStats(ctx context.Context) (ports.ProxyStats, error) {
+	return ports.ProxyStats{}, nil
+}
+func (m *captureProxyService) UpdateConfig(configuration ports.ProxyConfiguration) {}
+
+// TestProviderProxyHandler_InjectsStickyKey verifies that provider-scoped routes
+// (e.g. /olla/ollama/, /olla/lemonade/) invoke sticky key injection just like
+// the main proxyHandler. Regression test for github.com/thushan/olla#139 where
+// requests to provider URLs bypassed sticky sessions entirely — counters stayed
+// at zero and no X-Olla-Sticky-Session header was ever emitted.
+func TestProviderProxyHandler_InjectsStickyKey(t *testing.T) {
+	app := createTestApplication(t)
+
+	// Enable sticky sessions; without this the handler intentionally skips injection.
+	app.Config.Proxy.StickySessions = config.StickySessionConfig{
+		Enabled:         true,
+		KeySources:      []string{"session_header", "prefix_hash", "ip"},
+		MaxSessions:     100,
+		IdleTTLSeconds:  60,
+		PrefixHashBytes: 512,
+	}
+
+	u, _ := url.Parse("http://localhost:11434")
+	app.discoveryService = &mockDiscoveryServiceWithHealthy{
+		endpoints: []*domain.Endpoint{{
+			Name:      "ollama-1",
+			URL:       u,
+			URLString: u.String(),
+			Type:      "ollama",
+			Status:    domain.StatusHealthy,
+		}},
+	}
+
+	capture := &captureProxyService{}
+	app.proxyService = capture
+
+	sessionID := "session-abc-123"
+	req := httptest.NewRequest(http.MethodPost, "/olla/ollama/api/chat", strings.NewReader(`{"model":"llama3"}`))
+	req.Header.Set(constants.HeaderContentType, constants.ContentTypeJSON)
+	req.Header.Set(constants.HeaderXOllaSessionID, sessionID)
+	w := httptest.NewRecorder()
+
+	app.providerProxyHandler(w, req)
+
+	if capture.capturedCtx == nil {
+		t.Fatalf("proxy was never invoked; handler failed before reaching executeProxyRequest (status=%d body=%q)", w.Code, w.Body.String())
+	}
+
+	stickyKey, _ := capture.capturedCtx.Value(constants.ContextStickyKeyKey).(string)
+	if stickyKey == "" {
+		t.Fatal("expected sticky key to be injected into context, got empty string — providerProxyHandler is bypassing sticky sessions")
+	}
+
+	source, _ := capture.capturedCtx.Value(constants.ContextStickyKeySourceKey).(string)
+	if source != "session_header" {
+		t.Errorf("expected key source 'session_header' (X-Olla-Session-ID was supplied), got %q", source)
+	}
+
+	outcome, _ := capture.capturedCtx.Value(constants.ContextStickyOutcomeKey).(*balancer.StickyOutcome)
+	if outcome == nil {
+		t.Fatal("expected StickyOutcome pointer in context for the balancer wrapper to populate")
+	}
+}
+
+// TestProviderProxyHandler_SkipsStickyWhenDisabled guards against accidental
+// breakage of the config gate — requests must not pay the body-read cost or
+// pollute the context when sticky sessions are disabled.
+func TestProviderProxyHandler_SkipsStickyWhenDisabled(t *testing.T) {
+	app := createTestApplication(t)
+
+	// StickySessions.Enabled defaults to false via createTestApplication.
+
+	u, _ := url.Parse("http://localhost:11434")
+	app.discoveryService = &mockDiscoveryServiceWithHealthy{
+		endpoints: []*domain.Endpoint{{
+			Name:      "ollama-1",
+			URL:       u,
+			URLString: u.String(),
+			Type:      "ollama",
+			Status:    domain.StatusHealthy,
+		}},
+	}
+
+	capture := &captureProxyService{}
+	app.proxyService = capture
+
+	req := httptest.NewRequest(http.MethodPost, "/olla/ollama/api/chat", strings.NewReader(`{"model":"llama3"}`))
+	req.Header.Set(constants.HeaderContentType, constants.ContentTypeJSON)
+	req.Header.Set(constants.HeaderXOllaSessionID, "abc")
+	w := httptest.NewRecorder()
+
+	app.providerProxyHandler(w, req)
+
+	if capture.capturedCtx == nil {
+		t.Fatalf("proxy was never invoked (status=%d body=%q)", w.Code, w.Body.String())
+	}
+	if key, _ := capture.capturedCtx.Value(constants.ContextStickyKeyKey).(string); key != "" {
+		t.Errorf("expected no sticky key when disabled, got %q", key)
+	}
+}
+
 // TestProviderPathStripping tests that provider prefixes are correctly stripped
 func TestProviderPathStripping(t *testing.T) {
 	tests := []struct {
diff --git a/internal/app/services/discovery.go b/internal/app/services/discovery.go
index d040d8ec..627197f6 100644
--- a/internal/app/services/discovery.go
+++ b/internal/app/services/discovery.go
@@ -80,6 +80,10 @@ func (s *DiscoveryService) Start(ctx context.Context) error {
 			Type:            s.registryConfig.Type,
 			EnableUnifier:   s.registryConfig.EnableUnifier,
 			UnificationConf: &s.registryConfig.Unification,
+			// Routing strategy must be forwarded so the configured type (strict,
+			// optimistic, discovery) takes effect. Without this, registries fall
+			// back to strict regardless of what the operator configured.
+			RoutingStrategy: &s.registryConfig.RoutingStrategy,
 		}
 		var err error
 		s.registry, err = registry.NewModelRegistry(registryConfig, s.logger)

From b900a20f3fbab9c2d33445b87a41ae6f61d281d4 Mon Sep 17 00:00:00 2001
From: Thushan Fernando <thushan@thushanfernando.com>
Date: Thu, 23 Apr 2026 11:56:12 +1000
Subject: [PATCH 03/11] add aimock test harness for sticky sessions

---
 .claude/skills/test-sticky-sessions.md        | 223 +++++++++++
 makefile                                      |  60 ++-
 test/manual/config.sticky.yaml                | 285 ++++++++++++++
 test/mock/compose.yaml                        |  64 ++++
 test/mock/fixtures/instance-a.json            |  11 +
 test/mock/fixtures/instance-b.json            |  11 +
 test/mock/fixtures/instance-c.json            |  11 +
 test/scripts/sticky/run-manual.sh             |  54 +++
 .../sticky/test-sticky-provider-routes.sh     | 359 ++++++++++++++++++
 9 files changed, 1077 insertions(+), 1 deletion(-)
 create mode 100644 .claude/skills/test-sticky-sessions.md
 create mode 100644 test/manual/config.sticky.yaml
 create mode 100644 test/mock/compose.yaml
 create mode 100644 test/mock/fixtures/instance-a.json
 create mode 100644 test/mock/fixtures/instance-b.json
 create mode 100644 test/mock/fixtures/instance-c.json
 create mode 100644 test/scripts/sticky/run-manual.sh
 create mode 100644 test/scripts/sticky/test-sticky-provider-routes.sh

diff --git a/.claude/skills/test-sticky-sessions.md b/.claude/skills/test-sticky-sessions.md
new file mode 100644
index 00000000..7a2c93f0
--- /dev/null
+++ b/.claude/skills/test-sticky-sessions.md
@@ -0,0 +1,223 @@
+---
+name: test-sticky-sessions
+description: >
+  Runs the Olla sticky session integration test harness end-to-end across all
+  provider-scoped routes. Trigger when the user asks to: verify sticky sessions
+  work, run the sticky session integration test, test provider-route affinity,
+  or check whether the providerProxyHandler bug fix is holding.
+  Delegable to Sonnet — does not require Opus.
+---
+
+# Sticky Session Integration Test
+
+This skill exercises sticky session affinity across **all** provider-scoped
+routes that AIMock can serve:
+
+| Route | Request path | Status |
+|---|---|---|
+| Main proxy | `/olla/proxy/v1/chat/completions` | tested |
+| openai-compatible | `/olla/openai-compatible/v1/chat/completions` | tested (primary regression target) |
+| openai | `/olla/openai/v1/chat/completions` | tested |
+| vllm | `/olla/vllm/v1/chat/completions` | tested |
+| sglang | `/olla/sglang/v1/chat/completions` | tested |
+| llamacpp | `/olla/llamacpp/v1/chat/completions` | tested |
+| lmstudio | `/olla/lmstudio/v1/chat/completions` | tested |
+| lm-studio (alt prefix) | `/olla/lm-studio/v1/chat/completions` | tested |
+| litellm | `/olla/litellm/v1/chat/completions` | tested |
+| dmr | `/olla/dmr/v1/chat/completions` | tested |
+| vllm-mlx | `/olla/vllm-mlx/v1/chat/completions` | tested |
+| anthropic translator | `/olla/anthropic/v1/messages` | tested + passthrough assertion |
+| lemonade | `/olla/lemonade/api/v1/chat/completions` | **skipped** — AIMock does not serve `/api/v1/*` |
+| ollama | `/olla/ollama/api/chat` | **skipped** — AIMock does not speak Ollama `/api/*` protocol |
+
+The `/olla/openai-compatible/` and `/olla/openai/` paths were affected by a bug
+where `providerProxyHandler` never injected sticky session context — those
+routes are the primary regression targets.
+
+## Steps
+
+### 1. Pre-flight: verify Docker is running
+
+```bash
+docker info > /dev/null 2>&1 || { echo "Docker is not running — start Docker Desktop first"; exit 1; }
+```
+
+### 2. Start AIMock instances
+
+```bash
+make mock-up
+```
+
+Waits until all three AIMock containers report healthy (ports 9300/9301/9302).
+Each instance returns a unique `BACKEND:instance-{a,b,c}` marker so the test
+can confirm which backend served each response.
+
+### 3. Build Olla and start with sticky config
+
+```bash
+LOG="${TMPDIR:-/tmp}/olla-sticky.log"
+go run . --config test/manual/config.sticky.yaml > "$LOG" 2>&1 &
+OLLA_PID=$!
+```
+
+Wait until ready:
+```bash
+until curl -sf http://localhost:40114/internal/health > /dev/null; do sleep 1; done
+echo "Olla ready (PID $OLLA_PID, log $LOG)"
+```
+
+### 4. Run the assertion script
+
+```bash
+OLLA_URL=http://localhost:40114 bash test/scripts/sticky/test-sticky-provider-routes.sh
+RESULT=$?
+```
+
+For each active (non-skipped) route, the script asserts:
+- Turn 1: `X-Olla-Sticky-Session: miss`, `X-Olla-Sticky-Key-Source: session_header`
+- Turn 2: `X-Olla-Sticky-Session: hit`, same `X-Olla-Endpoint` as Turn 1, same backend marker
+- Turn 3: across 10 fresh sessions, at least one lands on a different backend
+- Anthropic path additionally asserts `X-Olla-Mode: passthrough`
+- Stats endpoint: `insertions > 0`, `hits > 0`, `active_sessions > 0`
+
+Skipped routes print clearly: `SKIP <label> — <reason>` and do not count as failures.
+
+### 5. Teardown (bulletproof — always runs)
+
+```bash
+kill "$OLLA_PID" 2>/dev/null || true
+make mock-down
+exit "$RESULT"
+```
+
+### Fully automated (single command)
+
+```bash
+make test-sticky-manual
+```
+
+This target handles all five steps including the EXIT trap teardown.
+
+---
+
+## Expected output (passing run)
+
+```
+╔══════════════════════════════════════════════════════════════╗
+║  Olla Sticky Session — All Provider Routes Regression Test  ║
+╚══════════════════════════════════════════════════════════════╝
+
+── main-proxy ──
+  ✓ PASS — Turn 1 HTTP 200
+  ✓ PASS — Turn 1 sticky=miss
+  ✓ PASS — Turn 1 key-source=session_header
+  Pinned to: mock-compat-b (BACKEND:instance-b)
+  ✓ PASS — Turn 2 HTTP 200
+  ✓ PASS — Turn 2 sticky=hit
+  ✓ PASS — Turn 2 same endpoint (mock-compat-b)
+  ✓ PASS — Turn 2 same backend marker (BACKEND:instance-b)
+  ✓ PASS — Turn 3 load balancing reaches multiple backends
+
+── openai-compatible ──
+  ... same pattern ...
+
+  ... (vllm, sglang, llamacpp, lmstudio, lm-studio, litellm, dmr, vllm-mlx) ...
+
+SKIP lemonade (/olla/lemonade/api/v1/chat/completions) — AIMock does not serve /api/v1/* — Lemonade uses a non-standard path prefix
+SKIP ollama (/olla/ollama/api/chat) — AIMock does not speak the Ollama /api/* protocol
+
+── anthropic-translator ──
+  ✓ PASS — Turn 1 X-Olla-Mode=passthrough
+  ...
+
+── Sticky Session Stats ──
+  ✓ PASS — stats.insertions > 0
+  ✓ PASS — stats.hits > 0
+  ✓ PASS — stats.active_sessions > 0
+
+Results:  99 passed  0 failed  2 skipped  (99 total assertions)
+✓ All sticky session assertions passed.
+```
+
+---
+
+## Manual verification (troubleshooting)
+
+**Health check:**
+```bash
+curl -s http://localhost:40114/internal/health | python3 -m json.tool
+```
+
+**Turn 1 — main proxy:**
+```bash
+curl -s -D - -X POST http://localhost:40114/olla/proxy/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "X-Olla-Session-ID: debug-sess-001" \
+  -d '{"model":"test","messages":[{"role":"user","content":"ping"}],"max_tokens":20}'
+```
+
+Expected response headers:
+```
+X-Olla-Sticky-Session: miss
+X-Olla-Sticky-Key-Source: session_header
+X-Olla-Endpoint: mock-compat-{a,b,c}
+```
+
+**Turn 2 — same session, expect hit:**
+```bash
+curl -s -D - -X POST http://localhost:40114/olla/proxy/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "X-Olla-Session-ID: debug-sess-001" \
+  -d '{"model":"test","messages":[{"role":"user","content":"ping"}],"max_tokens":20}'
+```
+
+Expected: `X-Olla-Sticky-Session: hit`
+
+**Provider-scoped route (regression path):**
+```bash
+curl -s -D - -X POST http://localhost:40114/olla/openai-compatible/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "X-Olla-Session-ID: debug-sess-002" \
+  -d '{"model":"test","messages":[{"role":"user","content":"ping"}],"max_tokens":20}'
+```
+
+**vLLM-specific route:**
+```bash
+curl -s -D - -X POST http://localhost:40114/olla/vllm/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "X-Olla-Session-ID: debug-sess-vllm" \
+  -d '{"model":"test","messages":[{"role":"user","content":"ping"}],"max_tokens":20}'
+```
+
+**Anthropic passthrough:**
+```bash
+curl -s -D - -X POST http://localhost:40114/olla/anthropic/v1/messages \
+  -H "Content-Type: application/json" \
+  -H "x-api-key: test" \
+  -H "anthropic-version: 2023-06-01" \
+  -H "X-Olla-Session-ID: debug-sess-003" \
+  -d '{"model":"claude-3-haiku-20240307","max_tokens":20,"messages":[{"role":"user","content":"ping"}]}'
+```
+
+Expected: `X-Olla-Mode: passthrough`
+
+**Stats:**
+```bash
+curl -s http://localhost:40114/internal/stats/sticky | python3 -m json.tool
+```
+
+---
+
+## Notes
+
+- `test/manual/config.sticky.yaml` registers three endpoints per provider type
+  (all pointing at AIMock on 9300/9301/9302) so affinity checks are meaningful.
+- The `openai-compatible` profile declares `anthropic_support.enabled: true`,
+  enabling passthrough mode on the Anthropic translator path.
+- Lemonade and Ollama routes are skipped cleanly — they require a dedicated mock
+  that speaks their native protocols (`/api/v1/chat/completions` and
+  `/api/chat`/`/api/generate` respectively).
+- To test the Olla engine (high-performance), change `engine: "sherpa"` to
+  `engine: "olla"` in `test/manual/config.sticky.yaml` and re-run.
+- The script is portable: `#!/usr/bin/env bash`, no absolute paths, no
+  platform-specific constructs. Runs on macOS, Linux, and Git-Bash on Windows.
diff --git a/makefile b/makefile
index 12890a76..3113f02b 100644
--- a/makefile
+++ b/makefile
@@ -18,7 +18,7 @@ LDFLAGS := -ldflags "\
 	-X '$(PKG).Tool=$(TOOL)' \
 	-X '$(PKG).User=$(USER)'"
 
-.PHONY: run clean build test test-verbose test-short test-race test-cover bench version install-deps check-deps vet test-script-integration test-script-sticky
+.PHONY: run clean build test test-verbose test-short test-race test-cover bench version install-deps check-deps vet test-script-integration test-script-sticky mock-up mock-down mock-status mock-logs test-sticky-manual
 
 # Build the application with version info
 build:
@@ -357,6 +357,59 @@ test-script-sticky:
 	@echo "Running sticky session test scripts..."
 	@cd test/scripts && python sticky/test-sticky-sessions.py $(ARGS)
 
+# ── AIMock helpers ────────────────────────────────────────────────────────────
+# These targets manage the three AIMock containers used by the sticky session
+# manual test harness. AIMock (ghcr.io/copilotkit/aimock) serves OpenAI and
+# Anthropic endpoints with embedded BACKEND:<id> markers so affinity tests can
+# confirm which instance handled each response.
+#
+# Ports: mock-instance-a=9300, mock-instance-b=9301, mock-instance-c=9302
+.PHONY: mock-up mock-down mock-status mock-logs test-sticky-manual
+
+## mock-up: Start AIMock instances and wait until all are healthy
+mock-up:
+	@echo "Starting AIMock instances..."
+	@docker compose -f test/mock/compose.yaml up -d
+	@echo "Waiting for AIMock health checks..."
+	@attempt=0; \
+	while true; do \
+		attempt=$$((attempt+1)); \
+		if [ $$attempt -gt 40 ]; then \
+			echo "ERROR: AIMock did not become healthy after 40s"; \
+			docker compose -f test/mock/compose.yaml ps; \
+			exit 1; \
+		fi; \
+		unhealthy=$$(docker compose -f test/mock/compose.yaml ps --format json 2>/dev/null | \
+			python3 -c "import sys,json; data=sys.stdin.read(); items=[json.loads(l) for l in data.splitlines() if l.strip()]; print(sum(1 for i in items if i.get('Health','') not in ('healthy','')))" 2>/dev/null || echo "0"); \
+		total=$$(docker compose -f test/mock/compose.yaml ps -q 2>/dev/null | wc -l | tr -d ' '); \
+		healthy=$$(docker compose -f test/mock/compose.yaml ps --format json 2>/dev/null | \
+			python3 -c "import sys,json; data=sys.stdin.read(); items=[json.loads(l) for l in data.splitlines() if l.strip()]; print(sum(1 for i in items if i.get('Health','') == 'healthy'))" 2>/dev/null || echo "0"); \
+		if [ "$$healthy" = "3" ]; then \
+			echo "All 3 AIMock instances are healthy"; \
+			break; \
+		fi; \
+		sleep 1; \
+	done
+
+## mock-down: Stop and remove AIMock containers and volumes
+mock-down:
+	@echo "Stopping AIMock instances..."
+	@docker compose -f test/mock/compose.yaml down -v
+
+## mock-status: Show running state of AIMock instances
+mock-status:
+	@docker compose -f test/mock/compose.yaml ps
+
+## mock-logs: Tail logs from all three AIMock instances
+mock-logs:
+	@docker compose -f test/mock/compose.yaml logs -f
+
+## test-sticky-manual: Full end-to-end sticky session test (mock-up → olla → assert → teardown)
+## Starts AIMock, runs Olla with sticky config, executes assertion script, tears down on exit.
+test-sticky-manual:
+	@echo "Running full sticky session manual test..."
+	@bash test/scripts/sticky/run-manual.sh
+
 # Show help
 help:
 	@echo "Available targets:"
@@ -401,4 +454,9 @@ help:
 	@echo "  ci              - Run full CI pipeline locally"
 	@echo "  test-script-integration - Run integration test scripts (requires running Olla)"
 	@echo "  test-script-sticky      - Run sticky session test scripts (requires running Olla)"
+	@echo "  mock-up                 - Start AIMock instances and wait for health"
+	@echo "  mock-down               - Stop AIMock instances and remove volumes"
+	@echo "  mock-status             - Show AIMock container state"
+	@echo "  mock-logs               - Tail logs from all AIMock instances"
+	@echo "  test-sticky-manual      - Full sticky session end-to-end test (mock + olla + assert)"
 	@echo "  help            - Show this help"
\ No newline at end of file
diff --git a/test/manual/config.sticky.yaml b/test/manual/config.sticky.yaml
new file mode 100644
index 00000000..203a2cdb
--- /dev/null
+++ b/test/manual/config.sticky.yaml
@@ -0,0 +1,285 @@
+## Olla Sticky Session Manual Test Configuration
+## Points at three AIMock instances (test/mock/compose.yaml).
+##
+## One endpoint per provider type is registered so each provider-scoped route
+## (/olla/vllm/, /olla/sglang/, etc.) has a live backend to route to. All
+## endpoints point at the same three AIMock URLs; we vary the declared type so
+## Olla's per-provider filtering can select them. AIMock speaks
+## /v1/chat/completions and /v1/messages for all declared types.
+##
+## Lemonade and Ollama endpoints are intentionally absent: AIMock cannot serve
+## Lemonade's /api/v1/* prefix or Ollama's /api/* protocol, so those routes are
+## skipped by the test script with a printed explanation.
+
+server:
+  host: "127.0.0.1"
+  port: 40114
+  read_timeout: 10s
+  write_timeout: 0s
+  shutdown_timeout: 5s
+  request_logging: true
+  request_limits:
+    max_body_size: 10485760
+    max_header_size: 524288
+  rate_limits:
+    global_requests_per_minute: 1000
+    per_ip_requests_per_minute: 200
+    health_requests_per_minute: 1000
+    burst_size: 100
+    cleanup_interval: 5m
+    trust_proxy_headers: false
+    trusted_proxy_cidrs: ["127.0.0.0/8"]
+
+proxy:
+  # Sherpa is simpler to reason about for affinity debugging.
+  # Change engine to "olla" to exercise the high-performance path.
+  engine: "sherpa"
+  profile: "auto"
+  # least-connections distributes across all three instances rather than always
+  # picking the highest-priority one, which makes sticky affinity verifiable.
+  load_balancer: "least-connections"
+  stream_buffer_size: 8192
+  connection_timeout: 10s
+  response_timeout: 30s
+  read_timeout: 30s
+  retry:
+    enabled: true
+    on_connection_failure: true
+    max_attempts: 0
+
+  sticky_sessions:
+    enabled: true
+    idle_ttl_seconds: 600
+    max_sessions: 1000
+    # session_header: pin on X-Olla-Session-ID supplied by the client
+    # prefix_hash: fallback when no explicit session header is present
+    key_sources: [session_header, prefix_hash]
+    prefix_hash_bytes: 512
+
+discovery:
+  type: "static"
+  refresh_interval: 10s
+  health_check:
+    initial_delay: 1s
+  static:
+    endpoints:
+      # ── openai-compatible ─────────────────────────────────────────────────
+      # Three instances give the diversity check something real to find.
+      - url: "http://127.0.0.1:9300"
+        name: "mock-compat-a"
+        type: "openai-compatible"
+        model_url: "/v1/models"
+        health_check_url: "/v1/models"
+        check_interval: 5s
+        check_timeout: 2s
+      - url: "http://127.0.0.1:9301"
+        name: "mock-compat-b"
+        type: "openai-compatible"
+        model_url: "/v1/models"
+        health_check_url: "/v1/models"
+        check_interval: 5s
+        check_timeout: 2s
+      - url: "http://127.0.0.1:9302"
+        name: "mock-compat-c"
+        type: "openai-compatible"
+        model_url: "/v1/models"
+        health_check_url: "/v1/models"
+        check_interval: 5s
+        check_timeout: 2s
+
+      # ── vllm ──────────────────────────────────────────────────────────────
+      # Spread across all three AIMock instances so /olla/vllm/ has real affinity.
+      - url: "http://127.0.0.1:9300"
+        name: "mock-vllm-a"
+        type: "vllm"
+        model_url: "/v1/models"
+        health_check_url: "/v1/models"
+        check_interval: 5s
+        check_timeout: 2s
+      - url: "http://127.0.0.1:9301"
+        name: "mock-vllm-b"
+        type: "vllm"
+        model_url: "/v1/models"
+        health_check_url: "/v1/models"
+        check_interval: 5s
+        check_timeout: 2s
+      - url: "http://127.0.0.1:9302"
+        name: "mock-vllm-c"
+        type: "vllm"
+        model_url: "/v1/models"
+        health_check_url: "/v1/models"
+        check_interval: 5s
+        check_timeout: 2s
+
+      # ── sglang ────────────────────────────────────────────────────────────
+      - url: "http://127.0.0.1:9300"
+        name: "mock-sglang-a"
+        type: "sglang"
+        model_url: "/v1/models"
+        health_check_url: "/v1/models"
+        check_interval: 5s
+        check_timeout: 2s
+      - url: "http://127.0.0.1:9301"
+        name: "mock-sglang-b"
+        type: "sglang"
+        model_url: "/v1/models"
+        health_check_url: "/v1/models"
+        check_interval: 5s
+        check_timeout: 2s
+      - url: "http://127.0.0.1:9302"
+        name: "mock-sglang-c"
+        type: "sglang"
+        model_url: "/v1/models"
+        health_check_url: "/v1/models"
+        check_interval: 5s
+        check_timeout: 2s
+
+      # ── llamacpp ──────────────────────────────────────────────────────────
+      - url: "http://127.0.0.1:9300"
+        name: "mock-llamacpp-a"
+        type: "llamacpp"
+        model_url: "/v1/models"
+        health_check_url: "/v1/models"
+        check_interval: 5s
+        check_timeout: 2s
+      - url: "http://127.0.0.1:9301"
+        name: "mock-llamacpp-b"
+        type: "llamacpp"
+        model_url: "/v1/models"
+        health_check_url: "/v1/models"
+        check_interval: 5s
+        check_timeout: 2s
+      - url: "http://127.0.0.1:9302"
+        name: "mock-llamacpp-c"
+        type: "llamacpp"
+        model_url: "/v1/models"
+        health_check_url: "/v1/models"
+        check_interval: 5s
+        check_timeout: 2s
+
+      # ── lm-studio ─────────────────────────────────────────────────────────
+      # Covers /olla/lmstudio/, /olla/lm-studio/, /olla/lm_studio/ routes.
+      - url: "http://127.0.0.1:9300"
+        name: "mock-lmstudio-a"
+        type: "lm-studio"
+        model_url: "/v1/models"
+        health_check_url: "/v1/models"
+        check_interval: 5s
+        check_timeout: 2s
+      - url: "http://127.0.0.1:9301"
+        name: "mock-lmstudio-b"
+        type: "lm-studio"
+        model_url: "/v1/models"
+        health_check_url: "/v1/models"
+        check_interval: 5s
+        check_timeout: 2s
+      - url: "http://127.0.0.1:9302"
+        name: "mock-lmstudio-c"
+        type: "lm-studio"
+        model_url: "/v1/models"
+        health_check_url: "/v1/models"
+        check_interval: 5s
+        check_timeout: 2s
+
+      # ── litellm ───────────────────────────────────────────────────────────
+      - url: "http://127.0.0.1:9300"
+        name: "mock-litellm-a"
+        type: "litellm"
+        model_url: "/v1/models"
+        health_check_url: "/v1/models"
+        check_interval: 5s
+        check_timeout: 2s
+      - url: "http://127.0.0.1:9301"
+        name: "mock-litellm-b"
+        type: "litellm"
+        model_url: "/v1/models"
+        health_check_url: "/v1/models"
+        check_interval: 5s
+        check_timeout: 2s
+      - url: "http://127.0.0.1:9302"
+        name: "mock-litellm-c"
+        type: "litellm"
+        model_url: "/v1/models"
+        health_check_url: "/v1/models"
+        check_interval: 5s
+        check_timeout: 2s
+
+      # ── docker-model-runner ───────────────────────────────────────────────
+      - url: "http://127.0.0.1:9300"
+        name: "mock-dmr-a"
+        type: "docker-model-runner"
+        model_url: "/v1/models"
+        health_check_url: "/v1/models"
+        check_interval: 5s
+        check_timeout: 2s
+      - url: "http://127.0.0.1:9301"
+        name: "mock-dmr-b"
+        type: "docker-model-runner"
+        model_url: "/v1/models"
+        health_check_url: "/v1/models"
+        check_interval: 5s
+        check_timeout: 2s
+      - url: "http://127.0.0.1:9302"
+        name: "mock-dmr-c"
+        type: "docker-model-runner"
+        model_url: "/v1/models"
+        health_check_url: "/v1/models"
+        check_interval: 5s
+        check_timeout: 2s
+
+      # ── vllm-mlx ──────────────────────────────────────────────────────────
+      - url: "http://127.0.0.1:9300"
+        name: "mock-vllm-mlx-a"
+        type: "vllm-mlx"
+        model_url: "/v1/models"
+        health_check_url: "/v1/models"
+        check_interval: 5s
+        check_timeout: 2s
+      - url: "http://127.0.0.1:9301"
+        name: "mock-vllm-mlx-b"
+        type: "vllm-mlx"
+        model_url: "/v1/models"
+        health_check_url: "/v1/models"
+        check_interval: 5s
+        check_timeout: 2s
+      - url: "http://127.0.0.1:9302"
+        name: "mock-vllm-mlx-c"
+        type: "vllm-mlx"
+        model_url: "/v1/models"
+        health_check_url: "/v1/models"
+        check_interval: 5s
+        check_timeout: 2s
+
+  model_discovery:
+    enabled: false  # AIMock has no real model list; skip discovery noise
+
+model_registry:
+  type: "memory"
+  enable_unifier: false
+  unification:
+    enabled: false
+  routing_strategy:
+    # optimistic so requests are not rejected for unknown models
+    type: "optimistic"
+    options:
+      fallback_behavior: "all"
+      discovery_timeout: 2s
+      discovery_refresh_on_miss: false
+
+translators:
+  anthropic:
+    enabled: true
+    # passthrough_enabled lets us verify X-Olla-Mode: passthrough is set
+    # when the backend (openai-compatible + anthropic_support.enabled=true) is used
+    passthrough_enabled: true
+    max_message_size: 10485760
+    inspector:
+      enabled: false
+
+logging:
+  level: "debug"
+  format: "text"
+  output: "stdout"
+
+engineering:
+  show_nerdstats: false
diff --git a/test/mock/compose.yaml b/test/mock/compose.yaml
new file mode 100644
index 00000000..ed736049
--- /dev/null
+++ b/test/mock/compose.yaml
@@ -0,0 +1,64 @@
+# Olla Sticky Session Test Mock Backends (AIMock)
+#
+# Three independent AIMock instances, each responding with a unique BACKEND:<id>
+# marker embedded in the response content. This lets the sticky session test script
+# confirm that repeated requests with the same session land on the same backend.
+#
+# Each instance serves both OpenAI (/v1/chat/completions) and Anthropic (/v1/messages).
+# The fixture file name differs per instance so AIMock serves the correct fixture.
+#
+# Ports chosen to match the FoundryOS convention (9300-9302) and avoid Windows
+# Hyper-V exclusion ranges.
+#
+# Usage:
+#   make mock-up      — start all three instances (waits for health)
+#   make mock-down    — stop and remove containers + volumes
+#   make mock-status  — show running state
+#   make mock-logs    — tail all three logs
+
+services:
+  mock-instance-a:
+    image: ghcr.io/copilotkit/aimock:latest
+    ports:
+      - "9300:4010"
+    volumes:
+      - ./fixtures/instance-a.json:/fixtures/fixture.json:ro
+    command: ["--fixtures", "/fixtures", "--host", "0.0.0.0"]
+    healthcheck:
+      # AIMock image ships Node.js but not wget/curl — use node directly
+      test: ["CMD", "node", "-e",
+             "const h=require('http');h.get('http://localhost:4010/health',r=>{process.exit(r.statusCode===200?0:1)}).on('error',()=>process.exit(1))"]
+      interval: 5s
+      timeout: 3s
+      retries: 6
+      start_period: 5s
+
+  mock-instance-b:
+    image: ghcr.io/copilotkit/aimock:latest
+    ports:
+      - "9301:4010"
+    volumes:
+      - ./fixtures/instance-b.json:/fixtures/fixture.json:ro
+    command: ["--fixtures", "/fixtures", "--host", "0.0.0.0"]
+    healthcheck:
+      test: ["CMD", "node", "-e",
+             "const h=require('http');h.get('http://localhost:4010/health',r=>{process.exit(r.statusCode===200?0:1)}).on('error',()=>process.exit(1))"]
+      interval: 5s
+      timeout: 3s
+      retries: 6
+      start_period: 5s
+
+  mock-instance-c:
+    image: ghcr.io/copilotkit/aimock:latest
+    ports:
+      - "9302:4010"
+    volumes:
+      - ./fixtures/instance-c.json:/fixtures/fixture.json:ro
+    command: ["--fixtures", "/fixtures", "--host", "0.0.0.0"]
+    healthcheck:
+      test: ["CMD", "node", "-e",
+             "const h=require('http');h.get('http://localhost:4010/health',r=>{process.exit(r.statusCode===200?0:1)}).on('error',()=>process.exit(1))"]
+      interval: 5s
+      timeout: 3s
+      retries: 6
+      start_period: 5s
diff --git a/test/mock/fixtures/instance-a.json b/test/mock/fixtures/instance-a.json
new file mode 100644
index 00000000..9444e2e6
--- /dev/null
+++ b/test/mock/fixtures/instance-a.json
@@ -0,0 +1,11 @@
+{
+  "fixtures": [
+    {
+      "match": {},
+      "response": {
+        "content": "BACKEND:instance-a — synthetic response for sticky session affinity testing."
+      },
+      "streamingProfile": { "ttft": 50, "tps": 200, "jitter": 0.05 }
+    }
+  ]
+}
diff --git a/test/mock/fixtures/instance-b.json b/test/mock/fixtures/instance-b.json
new file mode 100644
index 00000000..d5c7b298
--- /dev/null
+++ b/test/mock/fixtures/instance-b.json
@@ -0,0 +1,11 @@
+{
+  "fixtures": [
+    {
+      "match": {},
+      "response": {
+        "content": "BACKEND:instance-b — synthetic response for sticky session affinity testing."
+      },
+      "streamingProfile": { "ttft": 50, "tps": 200, "jitter": 0.05 }
+    }
+  ]
+}
diff --git a/test/mock/fixtures/instance-c.json b/test/mock/fixtures/instance-c.json
new file mode 100644
index 00000000..c9f559fd
--- /dev/null
+++ b/test/mock/fixtures/instance-c.json
@@ -0,0 +1,11 @@
+{
+  "fixtures": [
+    {
+      "match": {},
+      "response": {
+        "content": "BACKEND:instance-c — synthetic response for sticky session affinity testing."
+      },
+      "streamingProfile": { "ttft": 50, "tps": 200, "jitter": 0.05 }
+    }
+  ]
+}
diff --git a/test/scripts/sticky/run-manual.sh b/test/scripts/sticky/run-manual.sh
new file mode 100644
index 00000000..a5366aac
--- /dev/null
+++ b/test/scripts/sticky/run-manual.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+# Full end-to-end sticky session manual test orchestrator.
+# Starts AIMock, runs Olla with the sticky config, invokes the assertion
+# script, and tears everything down on exit (success or failure).
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
+
+OLLA_PORT="${OLLA_PORT:-40114}"
+OLLA_URL="${OLLA_URL:-http://localhost:${OLLA_PORT}}"
+OLLA_LOG="${OLLA_LOG:-${TMPDIR:-/tmp}/olla-sticky.log}"
+CONFIG="${OLLA_CONFIG:-test/manual/config.sticky.yaml}"
+
+OLLA_PID=""
+
+cleanup() {
+    echo "Tearing down..."
+    if [ -n "$OLLA_PID" ] && kill -0 "$OLLA_PID" 2>/dev/null; then
+        kill "$OLLA_PID" 2>/dev/null || true
+        wait "$OLLA_PID" 2>/dev/null || true
+        echo "Olla stopped"
+    fi
+    (cd "$REPO_ROOT" && make mock-down 2>/dev/null) || true
+}
+trap cleanup EXIT INT TERM
+
+cd "$REPO_ROOT"
+
+echo "Starting AIMock containers..."
+make mock-up
+
+echo "Starting Olla with sticky session config..."
+# Background go run so we can signal the parent process on teardown.
+go run . --config "$CONFIG" >"$OLLA_LOG" 2>&1 &
+OLLA_PID=$!
+echo "Olla PID: $OLLA_PID (log: $OLLA_LOG)"
+
+echo "Waiting for Olla to become healthy..."
+attempt=0
+until curl -sf --max-time 2 "${OLLA_URL}/internal/health" >/dev/null 2>&1; do
+    attempt=$((attempt + 1))
+    if [ "$attempt" -ge 30 ]; then
+        echo "ERROR: Olla did not become healthy within 30s"
+        echo "--- last 80 log lines ---"
+        tail -n 80 "$OLLA_LOG" || true
+        exit 1
+    fi
+    sleep 1
+done
+echo "Olla is ready"
+
+OLLA_URL="$OLLA_URL" bash "$SCRIPT_DIR/test-sticky-provider-routes.sh"
diff --git a/test/scripts/sticky/test-sticky-provider-routes.sh b/test/scripts/sticky/test-sticky-provider-routes.sh
new file mode 100644
index 00000000..db1bb9d3
--- /dev/null
+++ b/test/scripts/sticky/test-sticky-provider-routes.sh
@@ -0,0 +1,359 @@
+#!/usr/bin/env bash
+# test-sticky-provider-routes.sh
+#
+# Verifies sticky session affinity across every provider-scoped route that
+# AIMock can serve. Routes are defined as a data table at the top; a single
+# run_sticky_test() function handles miss→hit→diversity assertions for each.
+#
+# Routes whose backend speaks a non-OpenAI protocol (Ollama /api/*, Lemonade
+# /api/v1/chat/completions) are explicitly skipped with a printed reason — they
+# require a dedicated mock server that is not included in this harness.
+#
+# The /olla/openai/ and /olla/openai-compatible/ routes were affected by the
+# sticky session context injection bug in providerProxyHandler — they are the
+# primary regression targets here.
+#
+# Usage:
+#   OLLA_URL=http://localhost:40114 bash test/scripts/sticky/test-sticky-provider-routes.sh
+#
+# Prerequisites:
+#   - AIMock running (make mock-up)
+#   - Olla running with test/manual/config.sticky.yaml (sticky sessions enabled)
+#   - curl, jq available
+
+set -euo pipefail
+
+OLLA_URL="${OLLA_URL:-http://localhost:40114}"
+CURL_TIMEOUT=30
+TOTAL=0
+PASSED=0
+FAILED=0
+SKIPPED=0
+
+# Colour codes
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+CYAN='\033[0;36m'
+PURPLE='\033[0;35m'
+WHITE='\033[1;37m'
+GREY='\033[0;37m'
+RESET='\033[0m'
+
+# ── helpers ───────────────────────────────────────────────────────────────────
+
+pass() { echo -e "  ${GREEN}✓ PASS${RESET} — $*"; TOTAL=$((TOTAL+1)); PASSED=$((PASSED+1)); }
+fail() { echo -e "  ${RED}✗ FAIL${RESET} — $*"; TOTAL=$((TOTAL+1)); FAILED=$((FAILED+1)); }
+
+skip() {
+    local label="$1"
+    local reason="$2"
+    echo -e "${YELLOW}SKIP${RESET} ${WHITE}${label}${RESET} — ${reason}"
+    SKIPPED=$((SKIPPED+1))
+    echo
+}
+
+banner() {
+    echo
+    echo -e "${PURPLE}╔══════════════════════════════════════════════════════════════╗${RESET}"
+    echo -e "${PURPLE}║${RESET}  ${CYAN}Olla Sticky Session — All Provider Routes Regression Test${RESET}  ${PURPLE}║${RESET}"
+    echo -e "${PURPLE}╚══════════════════════════════════════════════════════════════╝${RESET}"
+    echo
+}
+
+wait_for_olla() {
+    echo -e "${YELLOW}Waiting for Olla at ${OLLA_URL}...${RESET}"
+    local attempts=0
+    until curl -sf --max-time 2 "${OLLA_URL}/internal/health" > /dev/null 2>&1; do
+        attempts=$((attempts+1))
+        if [ "$attempts" -ge 30 ]; then
+            echo -e "${RED}ERROR: Olla did not become ready after 30s${RESET}"
+            exit 1
+        fi
+        sleep 1
+    done
+    echo -e "${GREEN}✓ Olla is ready${RESET}"
+    echo
+}
+
+# Extract a response header value (case-insensitive).
+# Returns empty string when header is absent — || true prevents set -e from
+# triggering when grep finds no match.
+extract_header() {
+    local file=$1
+    local header=$2
+    grep -i "^${header}:" "$file" | head -1 | cut -d' ' -f2- | tr -d '\r\n' || true
+}
+
+# Extract BACKEND:instance-X from a response body (handles both OpenAI and
+# Anthropic response shapes). Returns empty string when marker is absent.
+extract_backend_marker() {
+    local body=$1
+    echo "$body" | grep -oE 'BACKEND:instance-[a-z]+' | head -1 || true
+}
+
+# ── per-path test ─────────────────────────────────────────────────────────────
+#
+# run_sticky_test <label> <url_path> <body_json> [check_passthrough]
+#
+#   Three-turn sticky session verification:
+#     Turn 1: miss  — new session is pinned to a backend
+#     Turn 2: hit   — same session lands on the same backend
+#     Turn 3: diversity — across 10 fresh sessions at least one hits elsewhere
+#   Optionally asserts X-Olla-Mode: passthrough on turn 1 (Anthropic path).
+
+run_sticky_test() {
+    local label="$1"
+    local url_path="$2"
+    local body_json="$3"
+    local check_passthrough="${4:-false}"
+
+    local ts
+    ts=$(date +%s%3N)
+    local session_id="sess-${label}-${ts}"
+    local headers_file
+    headers_file=$(mktemp)
+    local body_file
+    body_file=$(mktemp)
+
+    echo -e "${WHITE}── ${label} ──${RESET}"
+    echo -e "  ${GREY}Path: ${url_path}${RESET}"
+
+    # ── Turn 1: expect miss ───────────────────────────────────────────────────
+    local http_code
+    http_code=$(curl -s -w "%{http_code}" -o "$body_file" -D "$headers_file" \
+        --max-time "$CURL_TIMEOUT" \
+        -X POST \
+        -H "Content-Type: application/json" \
+        -H "X-Olla-Session-ID: ${session_id}" \
+        -d "$body_json" \
+        "${OLLA_URL}${url_path}" 2>/dev/null)
+
+    if [[ ! "$http_code" =~ ^2 ]]; then
+        fail "Turn 1 HTTP ${http_code} (expected 2xx) — body: $(head -c 200 "$body_file")"
+        rm -f "$headers_file" "$body_file"
+        return
+    fi
+    pass "Turn 1 HTTP ${http_code}"
+
+    local sticky1 ep1 mode1 marker1
+    sticky1=$(extract_header "$headers_file" "X-Olla-Sticky-Session")
+    ep1=$(extract_header "$headers_file" "X-Olla-Endpoint")
+    mode1=$(extract_header "$headers_file" "X-Olla-Mode")
+    marker1=$(extract_backend_marker "$(cat "$body_file")")
+
+    [[ "$sticky1" == "miss" ]] && pass "Turn 1 sticky=miss" || fail "Turn 1 sticky='${sticky1}' (expected miss)"
+
+    local key_src1
+    key_src1=$(extract_header "$headers_file" "X-Olla-Sticky-Key-Source")
+    [[ "$key_src1" == "session_header" ]] && pass "Turn 1 key-source=session_header" || fail "Turn 1 key-source='${key_src1}' (expected session_header)"
+
+    if [[ "$check_passthrough" == "true" ]]; then
+        [[ "$mode1" == "passthrough" ]] && pass "Turn 1 X-Olla-Mode=passthrough" || fail "Turn 1 X-Olla-Mode='${mode1}' (expected passthrough)"
+    fi
+
+    echo -e "  ${GREY}Pinned to: ${ep1} (${marker1})${RESET}"
+
+    # ── Turn 2: expect hit ────────────────────────────────────────────────────
+    : > "$headers_file"
+    : > "$body_file"
+    http_code=$(curl -s -w "%{http_code}" -o "$body_file" -D "$headers_file" \
+        --max-time "$CURL_TIMEOUT" \
+        -X POST \
+        -H "Content-Type: application/json" \
+        -H "X-Olla-Session-ID: ${session_id}" \
+        -d "$body_json" \
+        "${OLLA_URL}${url_path}" 2>/dev/null)
+
+    [[ "$http_code" =~ ^2 ]] && pass "Turn 2 HTTP ${http_code}" || fail "Turn 2 HTTP ${http_code} (expected 2xx)"
+
+    local sticky2 ep2 marker2
+    sticky2=$(extract_header "$headers_file" "X-Olla-Sticky-Session")
+    ep2=$(extract_header "$headers_file" "X-Olla-Endpoint")
+    marker2=$(extract_backend_marker "$(cat "$body_file")")
+
+    [[ "$sticky2" == "hit" ]] && pass "Turn 2 sticky=hit" || fail "Turn 2 sticky='${sticky2}' (expected hit)"
+    [[ "$ep2" == "$ep1" ]] && pass "Turn 2 same endpoint (${ep1})" || fail "Turn 2 endpoint changed: '${ep1}' → '${ep2}'"
+    [[ "$marker2" == "$marker1" ]] && pass "Turn 2 same backend marker (${marker1})" || fail "Turn 2 backend marker changed: '${marker1}' → '${marker2}'"
+
+    # ── Turn 3: diversity — at least one new session lands elsewhere ──────────
+    # Without diversity validation, a single-instance deploy could trivially pass
+    # turns 1+2, masking a broken balancer.
+    local seen_other=false
+    local attempt
+    for attempt in $(seq 1 10); do
+        local new_session="sess-diversity-${label}-${ts}-${attempt}"
+        : > "$headers_file"; : > "$body_file"
+        http_code=$(curl -s -w "%{http_code}" -o "$body_file" -D "$headers_file" \
+            --max-time "$CURL_TIMEOUT" \
+            -X POST \
+            -H "Content-Type: application/json" \
+            -H "X-Olla-Session-ID: ${new_session}" \
+            -d "$body_json" \
+            "${OLLA_URL}${url_path}" 2>/dev/null)
+        local ep_div
+        ep_div=$(extract_header "$headers_file" "X-Olla-Endpoint")
+        if [[ "$ep_div" != "$ep1" ]]; then
+            seen_other=true
+            break
+        fi
+    done
+    $seen_other && pass "Turn 3 load balancing reaches multiple backends" || fail "Turn 3 all 10 attempts hit '${ep1}' only — balancer may be stuck"
+
+    rm -f "$headers_file" "$body_file"
+    echo
+}
+
+# ── stats assertion ────────────────────────────────────────────────────────────
+
+check_sticky_stats() {
+    echo -e "${WHITE}── Sticky Session Stats ──${RESET}"
+    local body
+    body=$(curl -sf --max-time 10 "${OLLA_URL}/internal/stats/sticky" 2>/dev/null || true)
+
+    if [[ -z "$body" ]]; then
+        fail "Could not reach /internal/stats/sticky"
+        return
+    fi
+
+    pass "Stats endpoint responded"
+
+    local insertions hits active
+    if command -v jq >/dev/null 2>&1; then
+        insertions=$(echo "$body" | jq -r '.insertions // 0')
+        hits=$(echo "$body" | jq -r '.hits // 0')
+        active=$(echo "$body" | jq -r '.active_sessions // 0')
+    else
+        insertions=$(echo "$body" | grep -oE '"insertions"[[:space:]]*:[[:space:]]*[0-9]+' | grep -oE '[0-9]+' || echo 0)
+        hits=$(echo "$body" | grep -oE '"hits"[[:space:]]*:[[:space:]]*[0-9]+' | grep -oE '[0-9]+' || echo 0)
+        active=$(echo "$body" | grep -oE '"active_sessions"[[:space:]]*:[[:space:]]*[0-9]+' | grep -oE '[0-9]+' || echo 0)
+    fi
+
+    echo -e "  ${GREY}insertions=${insertions}  hits=${hits}  active_sessions=${active}${RESET}"
+
+    [[ "${insertions:-0}" -gt 0 ]] && pass "stats.insertions > 0 (${insertions})" || fail "stats.insertions=0 (sticky sessions may not be recording)"
+    [[ "${hits:-0}" -gt 0 ]]       && pass "stats.hits > 0 (${hits})"             || fail "stats.hits=0 (no hits recorded)"
+    [[ "${active:-0}" -gt 0 ]]     && pass "stats.active_sessions > 0 (${active})" || fail "stats.active_sessions=0"
+    echo
+}
+
+# ── route table ───────────────────────────────────────────────────────────────
+#
+# Format per entry:
+#   LABEL|URL_PATH|BODY_TEMPLATE|CHECK_PASSTHROUGH|SKIP_REASON
+#
+# SKIP_REASON is non-empty when AIMock cannot serve the route's native protocol.
+# BODY_TEMPLATE is a key selecting a pre-defined body below.
+
+OPENAI_BODY='{"model":"test-model","messages":[{"role":"user","content":"ping"}],"max_tokens":50}'
+ANTHROPIC_BODY='{"model":"claude-3-haiku-20240307","max_tokens":50,"messages":[{"role":"user","content":"ping"}]}'
+
+# Each row: label|path|body_key|check_passthrough|skip_reason
+# body_key: "openai" or "anthropic"
+ROUTES=(
+    # ── main proxy (backward-compat baseline) ──────────────────────────────────
+    "main-proxy|/olla/proxy/v1/chat/completions|openai|false|"
+
+    # ── openai-compatible provider route (primary regression target) ───────────
+    # createProviderProfile("openai-compatible") widens to all OpenAI-compat types,
+    # so all three mock-instance-{a,b,c} endpoints are reachable.
+    "openai-compatible|/olla/openai-compatible/v1/chat/completions|openai|false|"
+
+    # ── openai provider route ──────────────────────────────────────────────────
+    # /olla/openai/ is registered via the openai-compatible profile (prefixes: openai, openai-compatible).
+    # createProviderProfile("openai") also widens to all OpenAI-compat backends.
+    "openai|/olla/openai/v1/chat/completions|openai|false|"
+
+    # ── vllm provider route ────────────────────────────────────────────────────
+    # Requires type: vllm endpoints. Config has dedicated vllm endpoints.
+    "vllm|/olla/vllm/v1/chat/completions|openai|false|"
+
+    # ── sglang provider route ──────────────────────────────────────────────────
+    # Requires type: sglang endpoints.
+    "sglang|/olla/sglang/v1/chat/completions|openai|false|"
+
+    # ── llamacpp provider route ────────────────────────────────────────────────
+    # Requires type: llamacpp endpoints.
+    "llamacpp|/olla/llamacpp/v1/chat/completions|openai|false|"
+
+    # ── lmstudio provider route ────────────────────────────────────────────────
+    # Registered under three prefixes; test the canonical lmstudio one.
+    # Requires type: lm-studio endpoints.
+    "lmstudio|/olla/lmstudio/v1/chat/completions|openai|false|"
+
+    # ── lm-studio alternate prefix ─────────────────────────────────────────────
+    "lm-studio|/olla/lm-studio/v1/chat/completions|openai|false|"
+
+    # ── litellm provider route ─────────────────────────────────────────────────
+    # Requires type: litellm endpoints.
+    "litellm|/olla/litellm/v1/chat/completions|openai|false|"
+
+    # ── dmr (Docker Model Runner) provider route ───────────────────────────────
+    # Requires type: docker-model-runner endpoints.
+    "dmr|/olla/dmr/v1/chat/completions|openai|false|"
+
+    # ── vllm-mlx provider route ────────────────────────────────────────────────
+    # Requires type: vllm-mlx endpoints.
+    "vllm-mlx|/olla/vllm-mlx/v1/chat/completions|openai|false|"
+
+    # ── lemonade provider route ────────────────────────────────────────────────
+    # Lemonade uses /api/v1/chat/completions, NOT /v1/chat/completions.
+    # AIMock does not serve that path prefix, so this route must be skipped.
+    "lemonade|/olla/lemonade/api/v1/chat/completions|openai|false|AIMock does not serve /api/v1/* — Lemonade uses a non-standard path prefix"
+
+    # ── ollama provider route ──────────────────────────────────────────────────
+    # Ollama speaks /api/chat or /api/generate, not OpenAI /v1/chat/completions.
+    # AIMock does not implement the Ollama protocol.
+    "ollama|/olla/ollama/api/chat|openai|false|AIMock does not speak the Ollama /api/* protocol"
+
+    # ── anthropic translator route ─────────────────────────────────────────────
+    # Registered via the translator layer, not the provider proxy.
+    # Exercises sticky session injection in translationHandler.
+    # Passthrough mode applies because openai-compatible endpoints declare
+    # anthropic_support.enabled=true in their profile.
+    "anthropic-translator|/olla/anthropic/v1/messages|anthropic|true|"
+)
+
+# ── main ──────────────────────────────────────────────────────────────────────
+
+main() {
+    banner
+    wait_for_olla
+
+    for entry in "${ROUTES[@]}"; do
+        # Parse the pipe-delimited row
+        IFS='|' read -r label url_path body_key check_passthrough skip_reason <<< "$entry"
+
+        if [[ -n "$skip_reason" ]]; then
+            skip "$label ($url_path)" "$skip_reason"
+            continue
+        fi
+
+        # Select the appropriate request body
+        local body
+        case "$body_key" in
+            anthropic) body="$ANTHROPIC_BODY" ;;
+            *)         body="$OPENAI_BODY" ;;
+        esac
+
+        run_sticky_test "$label" "$url_path" "$body" "$check_passthrough"
+    done
+
+    check_sticky_stats
+
+    # ── summary ───────────────────────────────────────────────────────────────
+    echo -e "${PURPLE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${RESET}"
+    echo -e "${WHITE}Results:${RESET}  ${GREEN}${PASSED} passed${RESET}  ${RED}${FAILED} failed${RESET}  ${YELLOW}${SKIPPED} skipped${RESET}  (${TOTAL} total assertions)"
+
+    if [[ "$FAILED" -eq 0 ]]; then
+        echo -e "${GREEN}✓ All sticky session assertions passed.${RESET}"
+        echo
+        exit 0
+    else
+        echo -e "${RED}✗ ${FAILED} assertion(s) failed — review output above.${RESET}"
+        echo
+        exit 1
+    fi
+}
+
+main "$@"

From 7985c7c61ea20a75ec9008199d95d2dd31e57e77 Mon Sep 17 00:00:00 2001
From: Thushan Fernando <thushan@thushanfernando.com>
Date: Thu, 23 Apr 2026 11:56:12 +1000
Subject: [PATCH 04/11] stabilise sticky harness turn-3 diversity

---
 .../sticky/test-sticky-provider-routes.sh     | 100 ++++++++++--------
 1 file changed, 58 insertions(+), 42 deletions(-)

diff --git a/test/scripts/sticky/test-sticky-provider-routes.sh b/test/scripts/sticky/test-sticky-provider-routes.sh
index db1bb9d3..ea964609 100644
--- a/test/scripts/sticky/test-sticky-provider-routes.sh
+++ b/test/scripts/sticky/test-sticky-provider-routes.sh
@@ -94,12 +94,13 @@ extract_backend_marker() {
 
 # ── per-path test ─────────────────────────────────────────────────────────────
 #
-# run_sticky_test <label> <url_path> <body_json> [check_passthrough]
+# run_sticky_test <label> <url_path> <body_json> [check_passthrough] [skip_turn3_reason]
 #
 #   Three-turn sticky session verification:
 #     Turn 1: miss  — new session is pinned to a backend
 #     Turn 2: hit   — same session lands on the same backend
-#     Turn 3: diversity — across 10 fresh sessions at least one hits elsewhere
+#     Turn 3: diversity — across 30 fresh sessions at least one hits elsewhere
+#              (skipped when skip_turn3_reason is non-empty)
 #   Optionally asserts X-Olla-Mode: passthrough on turn 1 (Anthropic path).
 
 run_sticky_test() {
@@ -107,6 +108,7 @@ run_sticky_test() {
     local url_path="$2"
     local body_json="$3"
     local check_passthrough="${4:-false}"
+    local skip_turn3_reason="${5:-}"
 
     local ts
     ts=$(date +%s%3N)
@@ -179,26 +181,33 @@ run_sticky_test() {
     # ── Turn 3: diversity — at least one new session lands elsewhere ──────────
     # Without diversity validation, a single-instance deploy could trivially pass
     # turns 1+2, masking a broken balancer.
-    local seen_other=false
-    local attempt
-    for attempt in $(seq 1 10); do
-        local new_session="sess-diversity-${label}-${ts}-${attempt}"
-        : > "$headers_file"; : > "$body_file"
-        http_code=$(curl -s -w "%{http_code}" -o "$body_file" -D "$headers_file" \
-            --max-time "$CURL_TIMEOUT" \
-            -X POST \
-            -H "Content-Type: application/json" \
-            -H "X-Olla-Session-ID: ${new_session}" \
-            -d "$body_json" \
-            "${OLLA_URL}${url_path}" 2>/dev/null)
-        local ep_div
-        ep_div=$(extract_header "$headers_file" "X-Olla-Endpoint")
-        if [[ "$ep_div" != "$ep1" ]]; then
-            seen_other=true
-            break
-        fi
-    done
-    $seen_other && pass "Turn 3 load balancing reaches multiple backends" || fail "Turn 3 all 10 attempts hit '${ep1}' only — balancer may be stuck"
+    # Skipped for routes whose pool is so large (e.g. main-proxy) that LCB
+    # tie-breaks deterministically at zero connections, making spread meaningless.
+    if [[ -n "$skip_turn3_reason" ]]; then
+        echo -e "  ${YELLOW}SKIP${RESET} Turn 3 diversity — ${skip_turn3_reason}"
+        SKIPPED=$((SKIPPED+1))
+    else
+        local seen_other=false
+        local attempt
+        for attempt in $(seq 1 30); do
+            local new_session="sess-diversity-${label}-${ts}-${attempt}"
+            : > "$headers_file"; : > "$body_file"
+            http_code=$(curl -s -w "%{http_code}" -o "$body_file" -D "$headers_file" \
+                --max-time "$CURL_TIMEOUT" \
+                -X POST \
+                -H "Content-Type: application/json" \
+                -H "X-Olla-Session-ID: ${new_session}" \
+                -d "$body_json" \
+                "${OLLA_URL}${url_path}" 2>/dev/null)
+            local ep_div
+            ep_div=$(extract_header "$headers_file" "X-Olla-Endpoint")
+            if [[ "$ep_div" != "$ep1" ]]; then
+                seen_other=true
+                break
+            fi
+        done
+        $seen_other && pass "Turn 3 load balancing reaches multiple backends" || fail "Turn 3 all 30 attempts hit '${ep1}' only — balancer may be stuck"
+    fi
 
     rm -f "$headers_file" "$body_file"
     echo
@@ -240,78 +249,85 @@ check_sticky_stats() {
 # ── route table ───────────────────────────────────────────────────────────────
 #
 # Format per entry:
-#   LABEL|URL_PATH|BODY_TEMPLATE|CHECK_PASSTHROUGH|SKIP_REASON
+#   LABEL|URL_PATH|BODY_TEMPLATE|CHECK_PASSTHROUGH|SKIP_REASON|SKIP_TURN3_REASON
 #
-# SKIP_REASON is non-empty when AIMock cannot serve the route's native protocol.
+# SKIP_REASON is non-empty when AIMock cannot serve the route's native protocol
+# (the entire route is skipped).
+# SKIP_TURN3_REASON is non-empty when the turn-3 diversity assertion is not
+# meaningful for this route (turns 1 and 2 still run).
 # BODY_TEMPLATE is a key selecting a pre-defined body below.
 
 OPENAI_BODY='{"model":"test-model","messages":[{"role":"user","content":"ping"}],"max_tokens":50}'
 ANTHROPIC_BODY='{"model":"claude-3-haiku-20240307","max_tokens":50,"messages":[{"role":"user","content":"ping"}]}'
 
-# Each row: label|path|body_key|check_passthrough|skip_reason
+# Each row: label|path|body_key|check_passthrough|skip_reason|skip_turn3_reason
 # body_key: "openai" or "anthropic"
 ROUTES=(
     # ── main proxy (backward-compat baseline) ──────────────────────────────────
-    "main-proxy|/olla/proxy/v1/chat/completions|openai|false|"
+    # Turn-3 diversity is skipped: the main-proxy pool spans all ~24 registered
+    # endpoints across every provider type; LCB tie-breaks deterministically at
+    # zero connections, so 30 fresh sessions consistently land on the same
+    # first-ranked instance — spread is not meaningful here.
+    "main-proxy|/olla/proxy/v1/chat/completions|openai|false||main-proxy pool is huge and LCB tie-break is deterministic at zero connections — spread not meaningful here"
 
     # ── openai-compatible provider route (primary regression target) ───────────
     # createProviderProfile("openai-compatible") widens to all OpenAI-compat types,
     # so all three mock-instance-{a,b,c} endpoints are reachable.
-    "openai-compatible|/olla/openai-compatible/v1/chat/completions|openai|false|"
+    "openai-compatible|/olla/openai-compatible/v1/chat/completions|openai|false||"
 
     # ── openai provider route ──────────────────────────────────────────────────
     # /olla/openai/ is registered via the openai-compatible profile (prefixes: openai, openai-compatible).
     # createProviderProfile("openai") also widens to all OpenAI-compat backends.
-    "openai|/olla/openai/v1/chat/completions|openai|false|"
+    "openai|/olla/openai/v1/chat/completions|openai|false||"
 
     # ── vllm provider route ────────────────────────────────────────────────────
     # Requires type: vllm endpoints. Config has dedicated vllm endpoints.
-    "vllm|/olla/vllm/v1/chat/completions|openai|false|"
+    "vllm|/olla/vllm/v1/chat/completions|openai|false||"
 
     # ── sglang provider route ──────────────────────────────────────────────────
     # Requires type: sglang endpoints.
-    "sglang|/olla/sglang/v1/chat/completions|openai|false|"
+    "sglang|/olla/sglang/v1/chat/completions|openai|false||"
 
     # ── llamacpp provider route ────────────────────────────────────────────────
     # Requires type: llamacpp endpoints.
-    "llamacpp|/olla/llamacpp/v1/chat/completions|openai|false|"
+    "llamacpp|/olla/llamacpp/v1/chat/completions|openai|false||"
 
     # ── lmstudio provider route ────────────────────────────────────────────────
     # Registered under three prefixes; test the canonical lmstudio one.
     # Requires type: lm-studio endpoints.
-    "lmstudio|/olla/lmstudio/v1/chat/completions|openai|false|"
+    "lmstudio|/olla/lmstudio/v1/chat/completions|openai|false||"
 
     # ── lm-studio alternate prefix ─────────────────────────────────────────────
-    "lm-studio|/olla/lm-studio/v1/chat/completions|openai|false|"
+    "lm-studio|/olla/lm-studio/v1/chat/completions|openai|false||"
 
     # ── litellm provider route ─────────────────────────────────────────────────
     # Requires type: litellm endpoints.
-    "litellm|/olla/litellm/v1/chat/completions|openai|false|"
+    "litellm|/olla/litellm/v1/chat/completions|openai|false||"
 
     # ── dmr (Docker Model Runner) provider route ───────────────────────────────
     # Requires type: docker-model-runner endpoints.
-    "dmr|/olla/dmr/v1/chat/completions|openai|false|"
+    "dmr|/olla/dmr/v1/chat/completions|openai|false||"
 
     # ── vllm-mlx provider route ────────────────────────────────────────────────
     # Requires type: vllm-mlx endpoints.
-    "vllm-mlx|/olla/vllm-mlx/v1/chat/completions|openai|false|"
+    "vllm-mlx|/olla/vllm-mlx/v1/chat/completions|openai|false||"
 
     # ── lemonade provider route ────────────────────────────────────────────────
     # Lemonade uses /api/v1/chat/completions, NOT /v1/chat/completions.
     # AIMock does not serve that path prefix, so this route must be skipped.
-    "lemonade|/olla/lemonade/api/v1/chat/completions|openai|false|AIMock does not serve /api/v1/* — Lemonade uses a non-standard path prefix"
+    "lemonade|/olla/lemonade/api/v1/chat/completions|openai|false|AIMock does not serve /api/v1/* — Lemonade uses a non-standard path prefix|"
 
     # ── ollama provider route ──────────────────────────────────────────────────
     # Ollama speaks /api/chat or /api/generate, not OpenAI /v1/chat/completions.
     # AIMock does not implement the Ollama protocol.
-    "ollama|/olla/ollama/api/chat|openai|false|AIMock does not speak the Ollama /api/* protocol"
+    "ollama|/olla/ollama/api/chat|openai|false|AIMock does not speak the Ollama /api/* protocol|"
 
     # ── anthropic translator route ─────────────────────────────────────────────
     # Registered via the translator layer, not the provider proxy.
     # Exercises sticky session injection in translationHandler.
     # Passthrough mode applies because openai-compatible endpoints declare
     # anthropic_support.enabled=true in their profile.
-    "anthropic-translator|/olla/anthropic/v1/messages|anthropic|true|"
+    "anthropic-translator|/olla/anthropic/v1/messages|anthropic|true||"
 )
 
 # ── main ──────────────────────────────────────────────────────────────────────
@@ -321,8 +337,8 @@ main() {
     wait_for_olla
 
     for entry in "${ROUTES[@]}"; do
-        # Parse the pipe-delimited row
-        IFS='|' read -r label url_path body_key check_passthrough skip_reason <<< "$entry"
+        # Parse the pipe-delimited row (6 fields)
+        IFS='|' read -r label url_path body_key check_passthrough skip_reason skip_turn3_reason <<< "$entry"
 
         if [[ -n "$skip_reason" ]]; then
             skip "$label ($url_path)" "$skip_reason"
@@ -336,7 +352,7 @@ main() {
             *)         body="$OPENAI_BODY" ;;
         esac
 
-        run_sticky_test "$label" "$url_path" "$body" "$check_passthrough"
+        run_sticky_test "$label" "$url_path" "$body" "$check_passthrough" "${skip_turn3_reason:-}"
     done
 
     check_sticky_stats

From a57501bae71165ad902250d2e14ea0a386e78bd7 Mon Sep 17 00:00:00 2001
From: Thushan Fernando <thushan@thushanfernando.com>
Date: Thu, 23 Apr 2026 12:26:01 +1000
Subject: [PATCH 05/11] fix prefix_hash fallback for empty messages array

---
 internal/adapter/balancer/sticky.go           |  2 +-
 .../adapter/balancer/sticky_metrics_test.go   | 21 +++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/internal/adapter/balancer/sticky.go b/internal/adapter/balancer/sticky.go
index 22369d96..08c28728 100644
--- a/internal/adapter/balancer/sticky.go
+++ b/internal/adapter/balancer/sticky.go
@@ -206,7 +206,7 @@ func stickyKeyFromPrefixHash(body []byte, modelName string, prefixBytes int) (st
 		return "", ""
 	}
 	raw := gjson.GetBytes(body, "messages").Raw
-	if raw == "" {
+	if raw == "" || raw == "[]" || raw == "null" {
 		raw = gjson.GetBytes(body, "prompt").Raw
 	}
 	if raw == "" {
diff --git a/internal/adapter/balancer/sticky_metrics_test.go b/internal/adapter/balancer/sticky_metrics_test.go
index 214cb14d..30e40058 100644
--- a/internal/adapter/balancer/sticky_metrics_test.go
+++ b/internal/adapter/balancer/sticky_metrics_test.go
@@ -112,6 +112,27 @@ func TestComputeStickyKey_SessionHeader_EmptyModel(t *testing.T) {
 	assert.NotEqual(t, keyA, keyB, "distinct session IDs must produce distinct keys even with empty model")
 }
 
+// TestComputeStickyKey_PrefixHash_EmptyMessagesArray verifies that an empty
+// messages array falls through to the prompt field, so requests like
+// {"messages":[],"prompt":"hi"} still produce a sticky key rather than
+// being treated as keyless (which would break sticky for completions endpoints
+// that serialise an empty messages slice alongside a prompt).
+func TestComputeStickyKey_PrefixHash_EmptyMessagesArray(t *testing.T) {
+	t.Parallel()
+
+	body := []byte(`{"model":"llama3","messages":[],"prompt":"hi","max_tokens":50}`)
+	req, _ := http.NewRequest(http.MethodPost, "/", nil)
+
+	cfg := config.StickySessionConfig{
+		KeySources:      []string{"prefix_hash"},
+		PrefixHashBytes: 512,
+	}
+	key, source := ComputeStickyKey(req, "llama3", cfg, body)
+
+	assert.Equal(t, "prefix_hash", source, "empty messages array must fall back to prompt via prefix_hash")
+	assert.NotEmpty(t, key, "prompt fallback must produce a non-empty key when messages is []")
+}
+
 // TestStickySessionWrapper_EmptyModel_RoutesAndPins verifies the end-to-end path:
 // a session header arrives with no identified model, the wrapper computes a key,
 // pins a backend, and a second request with the same session ID hits the same

From 45f9865a9369bb091d2ab6c0c205759a1b25fa4e Mon Sep 17 00:00:00 2001
From: Thushan Fernando <thushan@thushanfernando.com>
Date: Thu, 23 Apr 2026 12:26:16 +1000
Subject: [PATCH 06/11] clarify provider prefix helpers return no trailing
 slash

---
 internal/app/handlers/handler_common.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/internal/app/handlers/handler_common.go b/internal/app/handlers/handler_common.go
index eef653ce..ac6fb8df 100644
--- a/internal/app/handlers/handler_common.go
+++ b/internal/app/handlers/handler_common.go
@@ -89,14 +89,14 @@ func (a *Application) isProviderSupported(provider string) bool {
 	return staticProviders[normalised]
 }
 
-// getProviderPrefix returns the canonical /olla/<provider>/ prefix for strip-and-forward routing.
+// getProviderPrefix returns the canonical /olla/<provider> prefix (no trailing slash) for strip-and-forward routing.
 func getProviderPrefix(provider string) string {
 	return constants.DefaultOllaProxyPathPrefix + provider
 }
 
-// getRawProviderPrefix extracts the URL prefix to strip from the incoming request path.
+// getRawProviderPrefix extracts the /olla/<provider> prefix (no trailing slash) from the incoming request path.
 // Unlike getProviderPrefix, this preserves the original spelling used by the caller
-// (e.g., /olla/lmstudio/ rather than /olla/lm-studio/) so that path stripping works
+// (e.g., /olla/lmstudio rather than /olla/lm-studio) so that path stripping works
 // even when the caller uses an alias spelling.
 func getRawProviderPrefix(path string) string {
 	if !strings.HasPrefix(path, constants.DefaultOllaProxyPathPrefix) {

From 3fa18753f1489e33fb223d4e1b4ab9482d89c2b4 Mon Sep 17 00:00:00 2001
From: Thushan Fernando <thushan@thushanfernando.com>
Date: Thu, 23 Apr 2026 12:26:32 +1000
Subject: [PATCH 07/11] assert backend marker is non-empty in sticky tests

---
 test/scripts/sticky/test-sticky-provider-routes.sh | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/test/scripts/sticky/test-sticky-provider-routes.sh b/test/scripts/sticky/test-sticky-provider-routes.sh
index ea964609..ad196219 100644
--- a/test/scripts/sticky/test-sticky-provider-routes.sh
+++ b/test/scripts/sticky/test-sticky-provider-routes.sh
@@ -144,6 +144,12 @@ run_sticky_test() {
     mode1=$(extract_header "$headers_file" "X-Olla-Mode")
     marker1=$(extract_backend_marker "$(cat "$body_file")")
 
+    if [[ -z "$marker1" ]]; then
+        fail "Turn 1 fixture marker missing — AIMock fixture regression"
+        rm -f "$headers_file" "$body_file"
+        return
+    fi
+
     [[ "$sticky1" == "miss" ]] && pass "Turn 1 sticky=miss" || fail "Turn 1 sticky='${sticky1}' (expected miss)"
 
     local key_src1
@@ -176,7 +182,11 @@ run_sticky_test() {
 
     [[ "$sticky2" == "hit" ]] && pass "Turn 2 sticky=hit" || fail "Turn 2 sticky='${sticky2}' (expected hit)"
     [[ "$ep2" == "$ep1" ]] && pass "Turn 2 same endpoint (${ep1})" || fail "Turn 2 endpoint changed: '${ep1}' → '${ep2}'"
-    [[ "$marker2" == "$marker1" ]] && pass "Turn 2 same backend marker (${marker1})" || fail "Turn 2 backend marker changed: '${marker1}' → '${marker2}'"
+    if [[ -z "$marker2" ]]; then
+        fail "Turn 2 fixture marker missing — AIMock fixture regression"
+    else
+        [[ "$marker2" == "$marker1" ]] && pass "Turn 2 same backend marker (${marker1})" || fail "Turn 2 backend marker changed: '${marker1}' → '${marker2}'"
+    fi
 
     # ── Turn 3: diversity — at least one new session lands elsewhere ──────────
     # Without diversity validation, a single-instance deploy could trivially pass

From 85103410fb6eebfb3050ebfd0a6a7393b15ccfc3 Mon Sep 17 00:00:00 2001
From: Thushan Fernando <thushan@thushanfernando.com>
Date: Thu, 23 Apr 2026 12:26:41 +1000
Subject: [PATCH 08/11] fix sticky diversity check to ignore failed requests

---
 test/scripts/sticky/test-sticky-provider-routes.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/scripts/sticky/test-sticky-provider-routes.sh b/test/scripts/sticky/test-sticky-provider-routes.sh
index ad196219..bc1a1e0f 100644
--- a/test/scripts/sticky/test-sticky-provider-routes.sh
+++ b/test/scripts/sticky/test-sticky-provider-routes.sh
@@ -211,7 +211,7 @@ run_sticky_test() {
                 "${OLLA_URL}${url_path}" 2>/dev/null)
             local ep_div
             ep_div=$(extract_header "$headers_file" "X-Olla-Endpoint")
-            if [[ "$ep_div" != "$ep1" ]]; then
+            if [[ "$http_code" =~ ^2 ]] && [[ -n "$ep_div" ]] && [[ "$ep_div" != "$ep1" ]]; then
                 seen_other=true
                 break
             fi

From 4ac53f26a7e3d32b24f5efa95319ed17e3f268a8 Mon Sep 17 00:00:00 2001
From: Thushan Fernando <thushan@thushanfernando.com>
Date: Thu, 23 Apr 2026 12:26:52 +1000
Subject: [PATCH 09/11] pin aimock image digest for reproducible tests

---
 test/mock/compose.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/mock/compose.yaml b/test/mock/compose.yaml
index ed736049..a2e9dc87 100644
--- a/test/mock/compose.yaml
+++ b/test/mock/compose.yaml
@@ -18,7 +18,7 @@
 
 services:
   mock-instance-a:
-    image: ghcr.io/copilotkit/aimock:latest
+    image: ghcr.io/copilotkit/aimock@sha256:b9d1e5c809b0800491139f4b40f3e0de4d197062e2ba77a4cce7bec78a9db4f1
     ports:
       - "9300:4010"
     volumes:
@@ -34,7 +34,7 @@ services:
       start_period: 5s
 
   mock-instance-b:
-    image: ghcr.io/copilotkit/aimock:latest
+    image: ghcr.io/copilotkit/aimock@sha256:b9d1e5c809b0800491139f4b40f3e0de4d197062e2ba77a4cce7bec78a9db4f1
     ports:
       - "9301:4010"
     volumes:
@@ -49,7 +49,7 @@ services:
       start_period: 5s
 
   mock-instance-c:
-    image: ghcr.io/copilotkit/aimock:latest
+    image: ghcr.io/copilotkit/aimock@sha256:b9d1e5c809b0800491139f4b40f3e0de4d197062e2ba77a4cce7bec78a9db4f1
     ports:
       - "9302:4010"
     volumes:

From ac2386f8abacc2b9fcf3581f025dc969f09917e3 Mon Sep 17 00:00:00 2001
From: Thushan Fernando <thushan@thushanfernando.com>
Date: Thu, 23 Apr 2026 12:27:03 +1000
Subject: [PATCH 10/11] use docker compose --wait instead of custom poll

---
 makefile | 23 ++---------------------
 1 file changed, 2 insertions(+), 21 deletions(-)

diff --git a/makefile b/makefile
index 3113f02b..83dfdc8a 100644
--- a/makefile
+++ b/makefile
@@ -369,27 +369,8 @@ test-script-sticky:
 ## mock-up: Start AIMock instances and wait until all are healthy
 mock-up:
 	@echo "Starting AIMock instances..."
-	@docker compose -f test/mock/compose.yaml up -d
-	@echo "Waiting for AIMock health checks..."
-	@attempt=0; \
-	while true; do \
-		attempt=$$((attempt+1)); \
-		if [ $$attempt -gt 40 ]; then \
-			echo "ERROR: AIMock did not become healthy after 40s"; \
-			docker compose -f test/mock/compose.yaml ps; \
-			exit 1; \
-		fi; \
-		unhealthy=$$(docker compose -f test/mock/compose.yaml ps --format json 2>/dev/null | \
-			python3 -c "import sys,json; data=sys.stdin.read(); items=[json.loads(l) for l in data.splitlines() if l.strip()]; print(sum(1 for i in items if i.get('Health','') not in ('healthy','')))" 2>/dev/null || echo "0"); \
-		total=$$(docker compose -f test/mock/compose.yaml ps -q 2>/dev/null | wc -l | tr -d ' '); \
-		healthy=$$(docker compose -f test/mock/compose.yaml ps --format json 2>/dev/null | \
-			python3 -c "import sys,json; data=sys.stdin.read(); items=[json.loads(l) for l in data.splitlines() if l.strip()]; print(sum(1 for i in items if i.get('Health','') == 'healthy'))" 2>/dev/null || echo "0"); \
-		if [ "$$healthy" = "3" ]; then \
-			echo "All 3 AIMock instances are healthy"; \
-			break; \
-		fi; \
-		sleep 1; \
-	done
+	@docker compose -f test/mock/compose.yaml up -d --wait --wait-timeout 40
+	@echo "All 3 AIMock instances are healthy"
 
 ## mock-down: Stop and remove AIMock containers and volumes
 mock-down:

From f350dcd086d34707cdb149a78d4c1aad05b6ccae Mon Sep 17 00:00:00 2001
From: Thushan Fernando <thushan@thushanfernando.com>
Date: Thu, 23 Apr 2026 12:27:23 +1000
Subject: [PATCH 11/11] add language tags to skill code fences

---
 .claude/skills/test-sticky-sessions.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.claude/skills/test-sticky-sessions.md b/.claude/skills/test-sticky-sessions.md
index 7a2c93f0..6633e53a 100644
--- a/.claude/skills/test-sticky-sessions.md
+++ b/.claude/skills/test-sticky-sessions.md
@@ -102,7 +102,7 @@ This target handles all five steps including the EXIT trap teardown.
 
 ## Expected output (passing run)
 
-```
+```text
 ╔══════════════════════════════════════════════════════════════╗
 ║  Olla Sticky Session — All Provider Routes Regression Test  ║
 ╚══════════════════════════════════════════════════════════════╝
@@ -157,7 +157,7 @@ curl -s -D - -X POST http://localhost:40114/olla/proxy/v1/chat/completions \
 ```
 
 Expected response headers:
-```
+```text
 X-Olla-Sticky-Session: miss
 X-Olla-Sticky-Key-Source: session_header
 X-Olla-Endpoint: mock-compat-{a,b,c}