diff --git a/docs/content/api-reference/vllm-mlx.md b/docs/content/api-reference/vllm-mlx.md new file mode 100644 index 00000000..2ab53504 --- /dev/null +++ b/docs/content/api-reference/vllm-mlx.md @@ -0,0 +1,266 @@ +# vLLM-MLX API + +Proxy endpoints for vLLM-MLX inference servers running on Apple Silicon. Available through the `/olla/vllm-mlx/` prefix. + +vLLM-MLX serves a single model per instance using MLX-format weights from HuggingFace (e.g. `mlx-community/Llama-3.2-3B-Instruct-4bit`). It exposes a standard OpenAI-compatible API without guided generation or advanced vLLM features. + +## Endpoints Overview + +| Method | URI | Description | +|--------|-----|-------------| +| GET | `/olla/vllm-mlx/health` | Health check | +| GET | `/olla/vllm-mlx/v1/models` | List available models | +| POST | `/olla/vllm-mlx/v1/chat/completions` | Chat completion | +| POST | `/olla/vllm-mlx/v1/completions` | Text completion | +| POST | `/olla/vllm-mlx/v1/embeddings` | Generate embeddings | + +--- + +## GET /olla/vllm-mlx/health + +Check vLLM-MLX server health status. + +### Request + +```bash +curl -X GET http://localhost:40114/olla/vllm-mlx/health +``` + +### Response + +```json +{ + "status": "healthy" +} +``` + +--- + +## GET /olla/vllm-mlx/v1/models + +List the model available on the vLLM-MLX server. Each instance serves a single model. + +### Request + +```bash +curl -X GET http://localhost:40114/olla/vllm-mlx/v1/models +``` + +### Response + +```json +{ + "object": "list", + "data": [ + { + "id": "mlx-community/Llama-3.2-3B-Instruct-4bit", + "object": "model", + "created": 1705334400, + "owned_by": "vllm-mlx", + "root": "mlx-community/Llama-3.2-3B-Instruct-4bit", + "parent": null, + "permission": [] + } + ] +} +``` + +--- + +## POST /olla/vllm-mlx/v1/chat/completions + +OpenAI-compatible chat completion. + +### Request + +```bash +curl -X POST http://localhost:40114/olla/vllm-mlx/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "mlx-community/Llama-3.2-3B-Instruct-4bit", + "messages": [ + { + "role": "system", + "content": "You are a helpful AI assistant." + }, + { + "role": "user", + "content": "What is MLX?" + } + ], + "temperature": 0.7, + "max_tokens": 300, + "stream": false + }' +``` + +### Response + +```json +{ + "id": "chatcmpl-abc123", + "object": "chat.completion", + "created": 1705334400, + "model": "mlx-community/Llama-3.2-3B-Instruct-4bit", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "MLX is an array framework for machine learning on Apple Silicon, developed by Apple's machine learning research team. It provides efficient GPU-accelerated computation using the unified memory architecture of Apple's M-series chips." + }, + "logprobs": null, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 25, + "completion_tokens": 42, + "total_tokens": 67 + } +} +``` + +### Streaming Response + +When `"stream": true`: + +``` +data: {"id":"chatcmpl-abc123","object":"chat.completion.chunk","created":1705334400,"model":"mlx-community/Llama-3.2-3B-Instruct-4bit","choices":[{"index":0,"delta":{"role":"assistant"},"logprobs":null,"finish_reason":null}]} + +data: {"id":"chatcmpl-abc123","object":"chat.completion.chunk","created":1705334400,"model":"mlx-community/Llama-3.2-3B-Instruct-4bit","choices":[{"index":0,"delta":{"content":"MLX"},"logprobs":null,"finish_reason":null}]} + +... 
+ +data: {"id":"chatcmpl-abc123","object":"chat.completion.chunk","created":1705334401,"model":"mlx-community/Llama-3.2-3B-Instruct-4bit","choices":[{"index":0,"delta":{},"logprobs":null,"finish_reason":"stop"}]} + +data: [DONE] +``` + +--- + +## POST /olla/vllm-mlx/v1/completions + +Text completion. + +### Request + +```bash +curl -X POST http://localhost:40114/olla/vllm-mlx/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "mlx-community/Llama-3.2-3B-Instruct-4bit", + "prompt": "Apple Silicon is designed for", + "max_tokens": 200, + "temperature": 0.8, + "top_p": 0.95, + "stream": false + }' +``` + +### Response + +```json +{ + "id": "cmpl-xyz789", + "object": "text_completion", + "created": 1705334400, + "model": "mlx-community/Llama-3.2-3B-Instruct-4bit", + "choices": [ + { + "text": " high-performance computing with exceptional energy efficiency. The unified memory architecture allows the CPU, GPU, and Neural Engine to share the same memory pool, eliminating the overhead of copying data between processors.", + "index": 0, + "logprobs": null, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 6, + "completion_tokens": 38, + "total_tokens": 44 + } +} +``` + +--- + +## POST /olla/vllm-mlx/v1/embeddings + +Generate embeddings (if the loaded model supports embeddings). + +### Request + +```bash +curl -X POST http://localhost:40114/olla/vllm-mlx/v1/embeddings \ + -H "Content-Type: application/json" \ + -d '{ + "model": "mlx-community/Llama-3.2-3B-Instruct-4bit", + "input": "MLX is optimised for Apple Silicon", + "encoding_format": "float" + }' +``` + +### Response + +```json +{ + "object": "list", + "data": [ + { + "object": "embedding", + "index": 0, + "embedding": [0.0234, -0.0567, 0.0891, ...] + } + ], + "model": "mlx-community/Llama-3.2-3B-Instruct-4bit", + "usage": { + "prompt_tokens": 8, + "total_tokens": 8 + } +} +``` + +## Sampling Parameters + +Standard OpenAI-compatible sampling parameters are supported. 
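+
+For example, a single request can combine several of the parameters listed in the table below (values here are purely illustrative):
+
+```bash
+curl -X POST http://localhost:40114/olla/vllm-mlx/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "mlx-community/Llama-3.2-3B-Instruct-4bit",
+    "messages": [{"role": "user", "content": "Summarise MLX in one sentence."}],
+    "temperature": 0.7,
+    "top_p": 0.9,
+    "max_tokens": 120,
+    "stop": ["\n\n"],
+    "stream": false
+  }'
+```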
+ +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `temperature` | float | 1.0 | Sampling temperature | +| `top_p` | float | 1.0 | Nucleus sampling threshold | +| `max_tokens` | integer | - | Maximum tokens to generate | +| `stop` | string/array | - | Stop sequences | +| `stream` | boolean | false | Enable streaming response | +| `frequency_penalty` | float | 0.0 | Frequency penalty | +| `presence_penalty` | float | 0.0 | Presence penalty | + +## Configuration Example + +```yaml +endpoints: + - url: "http://192.168.0.100:8000" + name: "vllm-mlx-server" + type: "vllm-mlx" + priority: 80 + model_url: "/v1/models" + health_check_url: "/health" + check_interval: 5s + check_timeout: 2s +``` + +## Request Headers + +All requests are forwarded with: + +- `X-Olla-Request-ID` - Unique request identifier +- `X-Forwarded-For` - Client IP address +- Custom headers from endpoint configuration + +## Response Headers + +All responses include: + +- `X-Olla-Endpoint` - Backend endpoint name (e.g., "vllm-mlx-server") +- `X-Olla-Model` - Model used for the request +- `X-Olla-Backend-Type` - Always "vllm-mlx" for these endpoints +- `X-Olla-Response-Time` - Total processing time diff --git a/docs/content/index.md b/docs/content/index.md index 887590aa..dc9f6bd2 100644 --- a/docs/content/index.md +++ b/docs/content/index.md @@ -16,10 +16,12 @@ keywords: llm proxy, ollama proxy, lm studio proxy, vllm proxy, sglang, lemonade vLLM: Native Support SGLang: Native Support LiteLLM: Native Support - LM Deploy: OpenAI Compatible
+ LM Deploy: OpenAI Compatible
+ vLLM-MLX: Native Support + Docker Model Runner: Native Support
Ollama: Native Support LM Studio: Native Support - LemonadeSDK: Native Support + LemonadeSDK: Native Support

diff --git a/docs/content/integrations/backend/docker-model-runner.md b/docs/content/integrations/backend/docker-model-runner.md index 34b483e9..82164cb6 100644 --- a/docs/content/integrations/backend/docker-model-runner.md +++ b/docs/content/integrations/backend/docker-model-runner.md @@ -13,7 +13,7 @@ keywords: [Docker Model Runner, DMR, Olla proxy, Docker Desktop, LLM inference, Since - Olla v0.0.17 + Olla v0.0.23 Type diff --git a/docs/content/integrations/backend/vllm-mlx.md b/docs/content/integrations/backend/vllm-mlx.md new file mode 100644 index 00000000..84d62332 --- /dev/null +++ b/docs/content/integrations/backend/vllm-mlx.md @@ -0,0 +1,552 @@ +--- +title: vLLM-MLX Integration - Apple Silicon LLM Inference with Olla +description: Configure vLLM-MLX with Olla proxy for high-performance LLM serving on Apple Silicon. MLX framework acceleration, unified memory architecture, and OpenAI compatibility. +keywords: vLLM-MLX, Olla proxy, Apple Silicon, MLX, M1, M2, M3, M4, unified memory, LLM inference, macOS +--- + +# vLLM-MLX Integration + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Home: github.com/waybarrios/vllm-mlx
Since: Olla v0.0.23
Type: vllm-mlx (use in endpoint configuration)
Profile: vllm-mlx.yaml (see latest)
Features +
    +
  • Proxy Forwarding
  • Health Check (native)
  • Model Unification
  • Model Detection & Normalisation
  • OpenAI API Compatibility
  • Native Anthropic Messages API
  • Token Counting API
  • Embeddings API
+
Unsupported +
    +
  • Model Management (single model per instance, loaded at startup)
  • Prometheus Metrics
  • Tokenisation API
  • Reranking API
+
Attributes +
    +
  • Apple Silicon Only (M1/M2/M3/M4)
  • MLX Framework Acceleration
  • Unified Memory Architecture
  • Single Model Server
+
Prefixes + +
Endpoints + See below +
+ +## Configuration + +### Basic Setup + +Add vLLM-MLX to your Olla configuration: + +```yaml +discovery: + static: + endpoints: + - url: "http://localhost:8000" + name: "mlx-server" + type: "vllm-mlx" + priority: 85 + model_url: "/v1/models" + health_check_url: "/health" + check_interval: 5s + check_timeout: 2s +``` + +### Apple Silicon Network Setup + +Configure vLLM-MLX across multiple Mac instances behind Olla: + +```yaml +discovery: + static: + endpoints: + - url: "http://mac-mini:8000" + name: "mlx-llama" + type: "vllm-mlx" + priority: 85 + model_url: "/v1/models" + health_check_url: "/health" + check_interval: 5s + check_timeout: 2s + + - url: "http://mac-studio:8000" + name: "mlx-qwen" + type: "vllm-mlx" + priority: 90 + model_url: "/v1/models" + health_check_url: "/health" + check_interval: 5s + check_timeout: 2s + +proxy: + engine: "olla" # Use high-performance engine + load_balancer: "priority" +``` + +## Anthropic Messages API Support + +vLLM-MLX natively supports the Anthropic Messages API, enabling Olla to forward Anthropic-format requests directly without translation overhead (passthrough mode). + +When Olla detects that a vLLM-MLX endpoint supports native Anthropic format (via the `anthropic_support` section in `config/profiles/vllm-mlx.yaml`), it will bypass the Anthropic-to-OpenAI translation pipeline and forward requests directly to `/v1/messages` on the backend. + +**Profile configuration** (from `config/profiles/vllm-mlx.yaml`): + +```yaml +api: + anthropic_support: + enabled: true + messages_path: /v1/messages + token_count: true +``` + +**Key details**: + +- Token counting (`/v1/messages/count_tokens`): **Supported** (unlike standard vLLM) +- Passthrough mode is automatic -- no client-side configuration needed +- Responses include `X-Olla-Mode: passthrough` header when passthrough is active +- Falls back to translation mode if passthrough conditions are not met + +For more information, see [API Translation](../../concepts/api-translation.md#passthrough-mode) and [Anthropic API Reference](../../api-reference/anthropic.md). + +## Endpoints Supported + +The following endpoints are supported by the vLLM-MLX integration profile: + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Path | Description |
|------|-------------|
| `/health` | Health Check |
| `/v1/models` | List Models (OpenAI format) |
| `/v1/chat/completions` | Chat Completions (OpenAI format) |
| `/v1/completions` | Text Completions (OpenAI format) |
| `/v1/embeddings` | Embeddings API |
+ +## Usage Examples + +### Chat Completion + +```bash +curl -X POST http://localhost:40114/olla/vllm-mlx/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "mlx-community/Llama-3.2-3B-Instruct-4bit", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Explain quantum computing in simple terms"} + ], + "temperature": 0.7, + "max_tokens": 500 + }' +``` + +### Streaming Response + +```bash +curl -X POST http://localhost:40114/olla/vllm-mlx/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "mlx-community/Llama-3.2-3B-Instruct-4bit", + "messages": [ + {"role": "user", "content": "Write a story about a robot"} + ], + "stream": true, + "temperature": 0.8 + }' +``` + +### Anthropic Messages API (Passthrough) + +```bash +curl -X POST http://localhost:40114/olla/anthropic/v1/messages \ + -H "Content-Type: application/json" \ + -H "x-api-key: not-needed" \ + -H "anthropic-version: 2023-06-01" \ + -d '{ + "model": "mlx-community/Llama-3.2-3B-Instruct-4bit", + "max_tokens": 500, + "messages": [ + {"role": "user", "content": "Hello!"} + ] + }' +``` + +### Models and Health + +```bash +# List available models +curl http://localhost:40114/olla/vllm-mlx/v1/models + +# Check health status +curl http://localhost:40114/olla/vllm-mlx/health +``` + +## vLLM-MLX Specifics + +### Apple Silicon Features + +vLLM-MLX leverages the MLX framework for optimised inference on Apple Silicon: + +- **Unified Memory Architecture**: Model weights and KV cache share the same memory pool -- no CPU-to-GPU transfer overhead +- **MLX Framework Acceleration**: Purpose-built for Apple's M-series neural engine and GPU cores +- **Continuous Batching**: Dynamic request batching for improved multi-user throughput (2-3.4x speedup) +- **Quantisation Support**: 2bit, 3bit, 4bit (most common), 6bit, 8bit, and bf16 + +### Resource Configuration + +The vLLM-MLX profile includes Apple Silicon-optimised settings: + +```yaml +characteristics: + timeout: 2m + max_concurrent_requests: 50 + streaming_support: true + +resources: + defaults: + requires_gpu: false # Uses unified memory, not discrete GPU +``` + +### Memory Requirements + +vLLM-MLX uses unified memory shared between the system and model inference. 
There is no discrete GPU -- all memory is drawn from the Mac's unified pool: + +| Model Size | Min Memory | Recommended | Max Concurrent | +|------------|------------|-------------|----------------| +| 70B+ (4bit) | 40GB | 48GB | 2 | +| 30B+ (4bit) | 20GB | 24GB | 5 | +| 13B (4bit) | 10GB | 16GB | 10 | +| 7-8B (4bit) | 6GB | 8GB | 20 | +| 3B (4bit) | 3GB | 4GB | 20 | +| 1B (4bit) | 2GB | 3GB | 20 | + +### Performance (M4 Max 128GB) + +| Model | Throughput | +|-------|-----------| +| Llama-3.2-3B-4bit | ~200 tok/s | +| Llama-3.2-1B-4bit | ~464 tok/s | +| With continuous batching | 2-3.4x speedup | + +### Model Naming + +vLLM-MLX uses HuggingFace model names from the `mlx-community` organisation: + +- `mlx-community/Llama-3.2-3B-Instruct-4bit` +- `mlx-community/Llama-3.2-1B-Instruct-4bit` +- `mlx-community/Qwen2.5-7B-Instruct-4bit` +- `mlx-community/Mistral-7B-Instruct-v0.3-4bit` + +## Starting vLLM-MLX Server + +### Installation + +```bash +# Using pip +pip install vllm-mlx + +# Using uv (recommended) +uv tool install vllm-mlx +``` + +### Basic Start + +```bash +vllm-mlx serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000 +``` + +### With Options + +```bash +vllm-mlx serve mlx-community/Llama-3.2-3B-Instruct-4bit \ + --port 8000 \ + --host 0.0.0.0 \ + --continuous-batching \ + --max-tokens 4096 +``` + +### With Reasoning Support + +```bash +vllm-mlx serve mlx-community/QwQ-32B-4bit \ + --port 8000 \ + --host 0.0.0.0 \ + --continuous-batching \ + --reasoning-parser deepseek_r1 +``` + +### With Embeddings + +```bash +vllm-mlx serve mlx-community/Llama-3.2-3B-Instruct-4bit \ + --port 8000 \ + --embedding-model mlx-community/bge-small-en-v1.5 +``` + +### Key CLI Flags + +| Flag | Description | +|------|-------------| +| `--port` | Server port (default: 8000) | +| `--host` | Bind address (default: 127.0.0.1) | +| `--continuous-batching` | Enable dynamic request batching | +| `--cache-memory-percent` | Percentage of memory for KV cache | +| `--max-tokens` | Maximum token generation length | +| `--reasoning-parser` | Enable reasoning model support | +| `--embedding-model` | Load a separate embedding model | + +## Profile Customisation + +To customise vLLM-MLX behaviour, create `config/profiles/vllm-mlx-custom.yaml`. See [Profile Configuration](../../concepts/profile-system.md) for detailed explanations of each section. + +### Example Customisation + +```yaml +name: vllm-mlx +version: "1.0" + +# Add custom prefixes +routing: + prefixes: + - vllm-mlx + - mlx # Add custom prefix + +# Adjust for larger models +characteristics: + timeout: 5m # Increase for 70B+ models + +# Modify concurrency limits +resources: + concurrency_limits: + - min_memory_gb: 40 + max_concurrent: 2 # Reduce for very large models + - min_memory_gb: 16 + max_concurrent: 10 # Adjust based on unified memory +``` + +See [Profile Configuration](../../concepts/profile-system.md) for complete customisation options. + +## Troubleshooting + +### Apple Silicon Only + +**Issue**: vLLM-MLX fails to start or install + +**Solution**: vLLM-MLX only runs on Apple Silicon Macs (M1/M2/M3/M4). It does not support Intel Macs, Linux, or Windows. Verify your hardware: + +```bash +sysctl -n machdep.cpu.brand_string +# Should show "Apple M1", "Apple M2", etc. +``` + +### Single Model Server + +**Issue**: Cannot switch models without restarting + +**Solution**: vLLM-MLX loads a single model at startup. 
To serve multiple models, run separate instances on different ports and configure them as distinct endpoints in Olla: + +```yaml +discovery: + static: + endpoints: + - url: "http://localhost:8000" + name: "mlx-llama" + type: "vllm-mlx" + priority: 85 + + - url: "http://localhost:8001" + name: "mlx-qwen" + type: "vllm-mlx" + priority: 85 +``` + +### Memory Pressure + +**Issue**: macOS becomes sluggish or model inference slows down + +**Solution**: Unified memory is shared between macOS and model inference. If the model consumes too much memory, the system will swap to disk, degrading performance: + +1. Choose a smaller quantisation (e.g. 4bit instead of 8bit) +2. Use a smaller model that fits comfortably within available memory +3. Close memory-intensive applications +4. Monitor memory usage with `Activity Monitor` or `vm_stat` + +### Connection Timeout + +**Issue**: Requests timeout during model loading + +**Solution**: Increase timeout in profile: +```yaml +characteristics: + timeout: 10m # Increase for initial model load + +resources: + timeout_scaling: + base_timeout_seconds: 300 + load_time_buffer: true +``` + +## Best Practices + +### 1. Use 4bit Quantisation + +4bit quantisation provides the best balance of quality and performance on Apple Silicon: + +```bash +# Recommended: 4bit models from mlx-community +vllm-mlx serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000 +``` + +### 2. Match Model Size to Memory + +Choose models that fit comfortably within your Mac's unified memory, leaving headroom for macOS and other applications: + +| Mac | Unified Memory | Recommended Max Model | +|-----|---------------|----------------------| +| Mac Mini (base) | 16GB | 7-8B (4bit) | +| Mac Mini (max) | 64GB | 30B+ (4bit) | +| Mac Studio | 64-192GB | 70B+ (4bit) | +| MacBook Pro | 18-128GB | Varies by config | + +### 3. Enable Continuous Batching + +For multi-user scenarios, continuous batching significantly improves throughput: + +```bash +vllm-mlx serve mlx-community/Llama-3.2-3B-Instruct-4bit \ + --port 8000 \ + --continuous-batching +``` + +### 4. 
Deploy Multiple Instances + +Run different models on separate vLLM-MLX instances and let Olla handle routing: + +```yaml +discovery: + static: + endpoints: + - url: "http://mac-mini-1:8000" + name: "mlx-coding" + type: "vllm-mlx" + priority: 90 + + - url: "http://mac-mini-2:8000" + name: "mlx-general" + type: "vllm-mlx" + priority: 85 + +proxy: + load_balancer: "priority" +``` + +## Integration with Tools + +### OpenAI SDK + +```python +from openai import OpenAI + +client = OpenAI( + base_url="http://localhost:40114/olla/vllm-mlx/v1", + api_key="not-needed" # vLLM-MLX doesn't require API keys +) + +response = client.chat.completions.create( + model="mlx-community/Llama-3.2-3B-Instruct-4bit", + messages=[ + {"role": "user", "content": "Hello!"} + ] +) +``` + +### Claude Code + +```bash +# Set Olla as the Anthropic endpoint for Claude Code +export ANTHROPIC_BASE_URL="http://localhost:40114/olla/anthropic" + +# Claude Code will use passthrough mode automatically +claude +``` + +### LangChain + +```python +from langchain_openai import ChatOpenAI + +llm = ChatOpenAI( + base_url="http://localhost:40114/olla/vllm-mlx/v1", + api_key="not-needed", + model="mlx-community/Llama-3.2-3B-Instruct-4bit", + temperature=0.7 +) +``` + +## Next Steps + +- [Profile Configuration](../../concepts/profile-system.md) - Customise vLLM-MLX behaviour +- [Model Unification](../../concepts/model-unification.md) - Understand model management +- [Load Balancing](../../concepts/load-balancing.md) - Scale with multiple vLLM-MLX instances +- [API Translation](../../concepts/api-translation.md) - Anthropic passthrough and translation modes diff --git a/docs/content/integrations/overview.md b/docs/content/integrations/overview.md index 148f5565..0b9d0452 100644 --- a/docs/content/integrations/overview.md +++ b/docs/content/integrations/overview.md @@ -18,6 +18,7 @@ Olla natively supports the following backends: | [LM Studio](./backend/lmstudio.md) | `lm-studio` | Native support for [LM Studio](https://lmstudio.ai/), including model unification | | [llama.cpp](./backend/llamacpp.md) | `llamacpp` | Native support for [llama.cpp](https://github.com/ggml-org/llama.cpp) lightweight C++ inference server with GGUF models, including slot management, code infill, and CPU-first design for edge deployment | | [vLLM](./backend/vllm.md) | `vllm` | Native support for [vLLM](https://github.com/vllm-project/vllm), including model unification | +| [vLLM-MLX](./backend/vllm-mlx.md) | `vllm-mlx` | Native support for [vLLM-MLX](https://github.com/waybarrios/vllm-mlx), Apple Silicon inference using the MLX framework with native Anthropic Messages API | | [SGLang](./backend/sglang.md) | `sglang` | Native support for [SGLang](https://github.com/sgl-project/sglang) with RadixAttention and Frontend Language, including model unification and vision support | | [Lemonade SDK](./backend/lemonade.md) | `lemonade` | Native support for [Lemonade SDK](https://lemonade-server.ai/), AMD's local inference solution with Ryzen AI optimisation, including model unification | | [LiteLLM](./backend/litellm.md) | `litellm` | Native support for [LiteLLM](https://github.com/BerriAI/litellm), providing unified gateway to 100+ LLM providers | @@ -45,7 +46,7 @@ Olla provides Anthropic Messages API translation, enabling Claude-compatible cli | [OpenCode](./frontend/opencode.md) | Open-source AI coding assistant (SST fork) | OpenAI or Anthropic API | | [Crush CLI](./frontend/crush-cli.md) | Modern terminal AI assistant by Charmbracelet | Dual OpenAI/Anthropic support | 
-These clients can use local models (Ollama, LM Studio, vLLM, llama.cpp) through Olla's API translation layer. +These clients can use local models (Ollama, LM Studio, vLLM, vLLM-MLX, llama.cpp) through Olla's API translation layer. ### API Translation diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 3875b8e3..e62253a5 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -156,6 +156,7 @@ nav: - Ollama: integrations/backend/ollama.md - LM Studio: integrations/backend/lmstudio.md - vLLM: integrations/backend/vllm.md + - vLLM-MLX: integrations/backend/vllm-mlx.md - SGLang: integrations/backend/sglang.md - Lemonade SDK: integrations/backend/lemonade.md - LiteLLM: integrations/backend/litellm.md @@ -188,6 +189,7 @@ nav: - Ollama: api-reference/ollama.md - LM Studio: api-reference/lmstudio.md - vLLM: api-reference/vllm.md + - vLLM-MLX: api-reference/vllm-mlx.md - SGLang: api-reference/sglang.md - Lemonade SDK: api-reference/lemonade.md - LiteLLM: api-reference/litellm.md diff --git a/readme.md b/readme.md index 3dae1511..d025fe87 100644 --- a/readme.md +++ b/readme.md @@ -10,7 +10,8 @@ vLLM: Native Support SGLang: Native Support LiteLLM: Native Support - LM Deploy: OpenAI Compatible
+ LM Deploy: OpenAI Compatible
+ vLLM-MLX: Native Support Docker Model Runner: Native Support
Ollama: Native Support LM Studio: Native Support @@ -99,6 +100,8 @@ git clone https://github.com/thushan/olla.git && cd olla && make build-release ./bin/olla ``` +### Verification + When you have everything running, you can check it's all working with: ```bash @@ -114,6 +117,62 @@ curl http://localhost:40114/internal/status/models For detailed installation and deployment options, see [Getting Started Guide](https://thushan.github.io/olla/getting-started/quickstart/). +### Querying Olla + +Olla exposes multiple API paths depending on your use case: + +| Path | Format | Use Case | +|------|--------|----------| +| `/olla/proxy/` | OpenAI | Routes to any backend — universal endpoint | +| `/olla/anthropic/` | Anthropic | Claude-compatible clients (passthrough or translated) | +| `/olla/{provider}/` | OpenAI | Target a specific backend type (e.g. `/olla/vllm/`, `/olla/ollama/`) | + +#### OpenAI-Compatible (Universal Proxy) + +```bash +# Chat completion (routes to best available backend) +curl http://localhost:40114/olla/proxy/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model": "llama3.2", "messages": [{"role": "user", "content": "Hello!"}], "max_tokens": 100}' + +# Streaming +curl http://localhost:40114/olla/proxy/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model": "llama3.2", "messages": [{"role": "user", "content": "Hello!"}], "max_tokens": 100, "stream": true}' + +# List all models across backends +curl http://localhost:40114/olla/proxy/v1/models +``` + +#### Anthropic Messages API + +```bash +# Chat completion (passthrough for supported backends, translated for others) +curl http://localhost:40114/olla/anthropic/v1/messages \ + -H "Content-Type: application/json" \ + -H "x-api-key: not-needed" \ + -H "anthropic-version: 2023-06-01" \ + -d '{"model": "llama3.2", "max_tokens": 100, "messages": [{"role": "user", "content": "Hello!"}]}' + +# Streaming +curl http://localhost:40114/olla/anthropic/v1/messages \ + -H "Content-Type: application/json" \ + -H "x-api-key: not-needed" \ + -H "anthropic-version: 2023-06-01" \ + -d '{"model": "llama3.2", "max_tokens": 100, "messages": [{"role": "user", "content": "Hello!"}], "stream": true}' +``` + +#### Provider-Specific Endpoints + +```bash +# Target a specific backend type directly +curl http://localhost:40114/olla/ollama/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model": "llama3.2", "messages": [{"role": "user", "content": "Hello!"}], "max_tokens": 100}' + +# Other providers: /olla/vllm/, /olla/vllm-mlx/, /olla/lm-studio/, /olla/llamacpp/, etc. +``` + ## Examples We've also got ready-to-use Docker Compose setups for common scenarios: diff --git a/test/scripts/README.md b/test/scripts/README.md index c8aa3442..280b9747 100644 --- a/test/scripts/README.md +++ b/test/scripts/README.md @@ -57,6 +57,16 @@ Comprehensive test suites for automated validation. - Request size limits - Authentication and authorization +### `/passthrough` - Anthropic Passthrough Tests +Validates that the Anthropic Messages API translator correctly selects passthrough or translation mode based on backend capability. 
+- Auto-discovers available backends and models +- Verifies passthrough mode for natively supported backends (vLLM, LM Studio, Ollama, llama.cpp, Lemonade) +- Verifies translation mode for backends without native support (OpenAI-compatible, LiteLLM) +- Non-streaming and streaming response validation +- OpenAI baseline comparison per backend +- Edge cases: non-existent models, system parameters, multi-turn conversations +- Translator statistics reporting + ## Common Usage Patterns Most scripts support these common arguments: diff --git a/test/scripts/passthrough/README.md b/test/scripts/passthrough/README.md new file mode 100644 index 00000000..1e3d8a4c --- /dev/null +++ b/test/scripts/passthrough/README.md @@ -0,0 +1,119 @@ +# Passthrough Behaviour Test Script + +Validates that Olla's Anthropic Messages API translator correctly selects **passthrough** mode for backends with native Anthropic support, and falls back to **translation** mode for those without. + +## What It Tests + +The script auto-discovers available backends and models, then runs a test matrix covering: + +- **Non-streaming requests** - Verifies `X-Olla-Mode` header matches expected mode +- **Streaming requests** - Validates SSE event types and Content-Type headers +- **OpenAI baseline** - Confirms the standard proxy path still works alongside translation +- **Edge cases** - Non-existent model (expects 4xx), system parameter, multi-turn conversations +- **Translator stats** - Fetches and displays passthrough/translation rate metrics + +## Mode Selection + +Backends are categorised by whether they natively support the Anthropic Messages API: + +| Mode | Backend Types | Behaviour | +|------|--------------|-----------| +| **Passthrough** | `vllm`, `vllm-mlx`, `lm-studio`, `ollama`, `llamacpp`, `lemonade` | Request forwarded directly in Anthropic format | +| **Translation** | `openai-compatible`, `litellm` | Anthropic -> OpenAI -> backend -> OpenAI -> Anthropic | + +## Prerequisites + +- Python 3.8+ +- Olla running with at least one healthy backend +- Dependencies installed (`pip install -r ../requirements.txt`) + +## Usage + +```bash +# Run full test suite +python test-passthrough.py + +# Custom Olla URL +python test-passthrough.py --url http://localhost:8080 + +# Skip streaming tests (faster) +python test-passthrough.py --skip-streaming + +# Skip edge case tests +python test-passthrough.py --skip-edge-cases + +# Show full response bodies +python test-passthrough.py --verbose + +# Combine flags +python test-passthrough.py --url http://myhost:40114 --timeout 60 --skip-streaming +``` + +## CLI Arguments + +| Argument | Default | Description | +|----------|---------|-------------| +| `--url` | `http://localhost:40114` | Olla base URL | +| `--timeout` | `30` | Request timeout in seconds | +| `--skip-edge-cases` | off | Skip edge case tests | +| `--skip-streaming` | off | Skip streaming tests | +| `--verbose` | off | Show full response bodies | + +## Example Output + +``` +======================================================================== + Olla Anthropic Passthrough Test + Validates passthrough vs translation mode selection +======================================================================== + +Checking Olla availability... +[OK] Olla is reachable + +Discovering endpoints... +[OK] Found 2 endpoint(s) +Discovering models... 
+ +Configuration Summary +------------------------------------------------------------------------ + Backend Type Status Models Mode + my-vllm vllm healthy 3 passthrough + my-openai openai-compatible healthy 2 translation + +Test Matrix +======================================================================== + +Backend: my-vllm type=vllm model=llama3:latest + Expected mode: passthrough + Non-streaming: [PASS] + Streaming: .....[PASS] (5 events) + OpenAI check: [PASS] + +Edge Cases +======================================================================== + Non-existent model: [PASS] HTTP 404 + System parameter: [PASS] HTTP 200 + Multi-turn conversation: [PASS] HTTP 200 + +Translator Stats +======================================================================== + anthropic + Requests: 4 total | 3 passthrough | 1 translation + Passthrough rate: 75.00% | Streaming: 1 | Non-streaming: 3 + +======================================================================== + Results Summary +======================================================================== + + Total: 6 | Passed: 6 | Failed: 0 + + All tests passed. +``` + +## Exit Codes + +| Code | Meaning | +|------|---------| +| `0` | All tests passed | +| `1` | One or more tests failed, or Olla unreachable | +| `130` | User interrupted (Ctrl+C) | diff --git a/test/scripts/passthrough/test-passthrough.py b/test/scripts/passthrough/test-passthrough.py new file mode 100644 index 00000000..ef3ac4ed --- /dev/null +++ b/test/scripts/passthrough/test-passthrough.py @@ -0,0 +1,724 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Olla Anthropic Passthrough Test Script + +Validates that the Anthropic Messages API translator correctly uses passthrough +mode for backends with native support (vllm, vllm-mlx, lm-studio, ollama, +llamacpp) and falls back to translation for others (openai-compatible, litellm). + +Auto-discovers available backends and models, then runs a test matrix covering +non-streaming, streaming, OpenAI baseline, edge cases, and translator stats. 
+""" + +import sys +import json +import time +import argparse +import requests +import os +from typing import Dict, List, Optional, Any, Tuple + +# Fix Windows console encoding for Unicode +if sys.platform == 'win32': + import io + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8') + os.environ['PYTHONIOENCODING'] = 'utf-8' + +# ANSI colour codes +RED = '\033[0;31m' +GREEN = '\033[0;32m' +YELLOW = '\033[1;33m' +BLUE = '\033[0;34m' +PURPLE = '\033[0;35m' +CYAN = '\033[0;36m' +WHITE = '\033[1;37m' +GREY = '\033[0;37m' +RESET = '\033[0m' +BOLD = '\033[1m' + +# Configuration +TARGET_URL = "http://localhost:40114" +DEFAULT_TIMEOUT = 30 + +# Backend types that natively support the Anthropic Messages API +PASSTHROUGH_TYPES = {"vllm", "vllm-mlx", "lm-studio", "ollama", "llamacpp", "lemonade"} +# Backend types that require translation (Anthropic -> OpenAI -> backend -> OpenAI -> Anthropic) +TRANSLATION_TYPES = {"openai-compatible", "litellm"} + +# Anthropic SSE event types expected in a well-formed streaming response +PASSTHROUGH_EVENTS = { + "message_start", "content_block_start", "content_block_delta", + "content_block_stop", "message_delta", "message_stop", +} +TRANSLATION_MIN_EVENTS = {"message_start", "message_delta", "message_stop"} + + +class BackendInfo: + """Discovered backend with its health, type, and selected model.""" + __slots__ = ("name", "backend_type", "status", "models", "selected_model") + + def __init__(self, name: str, backend_type: str, status: str): + self.name = name + self.backend_type = backend_type + self.status = status + self.models: List[str] = [] + self.selected_model: Optional[str] = None + + @property + def expects_passthrough(self) -> bool: + return self.backend_type in PASSTHROUGH_TYPES + + @property + def mode_label(self) -> str: + return "passthrough" if self.expects_passthrough else "translation" + + +class TestResult: + """Outcome of a single test case.""" + __slots__ = ("name", "passed", "detail") + + def __init__(self, name: str, passed: bool, detail: str = ""): + self.name = name + self.passed = passed + self.detail = detail + + +class PassthroughTester: + def __init__(self, base_url: str, timeout: int, verbose: bool): + self.base_url = base_url + self.timeout = timeout + self.verbose = verbose + self.backends: List[BackendInfo] = [] + self.results: List[TestResult] = [] + + # ── Helpers ───────────────────────────────────────────────────────── + + def pcolor(self, color: str, msg: str, end: str = '\n'): + print(f"{color}{msg}{RESET}", end=end) + sys.stdout.flush() + + def print_header(self): + self.pcolor(PURPLE, "=" * 72) + self.pcolor(PURPLE, f" {CYAN}Olla Anthropic Passthrough Test{RESET}") + self.pcolor(PURPLE, f" {GREY}Validates passthrough vs translation mode selection{RESET}") + self.pcolor(PURPLE, "=" * 72) + print() + + def record(self, name: str, passed: bool, detail: str = "") -> bool: + self.results.append(TestResult(name, passed, detail)) + return passed + + def _anthropic_headers(self) -> Dict[str, str]: + return { + "Content-Type": "application/json", + "x-api-key": "test-key", + "anthropic-version": "2023-06-01", + } + + def _anthropic_body(self, model: str, stream: bool = False, + system: Optional[str] = None, + messages: Optional[List[Dict]] = None) -> Dict[str, Any]: + body: Dict[str, Any] = { + "model": model, + "messages": messages or [{"role": "user", "content": "Say hello briefly"}], + "max_tokens": 10, + } + if stream: + body["stream"] = True + if 
system: + body["system"] = system + return body + + # ── Discovery ─────────────────────────────────────────────────────── + + def check_health(self) -> bool: + self.pcolor(YELLOW, "Checking Olla availability...") + try: + r = requests.get(f"{self.base_url}/internal/health", timeout=5) + if r.status_code == 200: + self.pcolor(GREEN, "[OK] Olla is reachable") + return True + except Exception: + pass + self.pcolor(RED, f"[FAIL] Cannot reach Olla at {self.base_url}") + return False + + def discover(self) -> bool: + """Discover endpoints, map models to backends, pick one model each.""" + # Fetch endpoints + self.pcolor(YELLOW, "Discovering endpoints...") + try: + r = requests.get(f"{self.base_url}/internal/status/endpoints", timeout=self.timeout) + r.raise_for_status() + endpoints = r.json().get("endpoints", []) + except Exception as e: + self.pcolor(RED, f"[FAIL] Endpoint discovery failed: {e}") + return False + + ep_map: Dict[str, BackendInfo] = {} + for ep in endpoints: + name = ep.get("name", "unknown") + btype = ep.get("type", "unknown") + status = ep.get("status", "unknown") + bi = BackendInfo(name, btype, status) + ep_map[name] = bi + self.backends.append(bi) + + if not self.backends: + self.pcolor(RED, "[FAIL] No endpoints discovered") + return False + + self.pcolor(GREEN, f"[OK] Found {len(self.backends)} endpoint(s)") + + # Fetch models and map to endpoints + self.pcolor(YELLOW, "Discovering models...") + try: + r = requests.get(f"{self.base_url}/internal/status/models", timeout=self.timeout) + r.raise_for_status() + data = r.json() + recent = data.get("recent_models", []) + except Exception as e: + self.pcolor(RED, f"[FAIL] Model discovery failed: {e}") + return False + + # Build URL -> endpoint name reverse map since models reference + # endpoints by URL, not by name. 
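+        # e.g. {"http://localhost:8000": "mlx-server"} (illustrative), so a model's
+        # endpoint URL can be mapped back to that endpoint's name.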
+ url_to_name: Dict[str, str] = {} + for ep in endpoints: + url = ep.get("url", "") + name = ep.get("name", "") + if url and name: + url_to_name[url.rstrip("/")] = name + + # Also index backends by type for fallback matching + type_to_backends: Dict[str, List[str]] = {} + for ep in endpoints: + btype = ep.get("type", "") + name = ep.get("name", "") + if btype and name: + type_to_backends.setdefault(btype, []).append(name) + + for m in recent: + model_name = m.get("name", "") + model_endpoints = m.get("endpoints", []) + model_type = m.get("type", "") + matched = False + + # Primary: match by URL using the reverse map + for ep_url in model_endpoints: + normalised = ep_url.rstrip("/") + ep_name = url_to_name.get(normalised) + if ep_name and ep_name in ep_map: + ep_map[ep_name].models.append(model_name) + matched = True + + # Fallback: match by backend type if URL lookup missed + if not matched and model_type and model_type in type_to_backends: + for ep_name in type_to_backends[model_type]: + if ep_name in ep_map: + ep_map[ep_name].models.append(model_name) + matched = True + + # Select one non-embedding model per backend + for bi in self.backends: + candidates = [m for m in bi.models if "embed" not in m.lower()] + if candidates: + bi.selected_model = candidates[0] + + self._print_discovery_summary() + return True + + def _print_discovery_summary(self): + print() + self.pcolor(WHITE, "Configuration Summary") + self.pcolor(PURPLE, "-" * 72) + header = f" {'Backend':<20} {'Type':<18} {'Status':<10} {'Models':<6} {'Mode'}" + self.pcolor(GREY, header) + self.pcolor(GREY, f" {'-'*20} {'-'*18} {'-'*10} {'-'*6} {'-'*14}") + + for bi in self.backends: + status_c = GREEN if bi.status == "healthy" else RED + mode_c = CYAN if bi.expects_passthrough else YELLOW + model_str = str(len(bi.models)) + status_str = f"{status_c}{bi.status:<10}{RESET}" + mode_str = f"{mode_c}{bi.mode_label:<14}{RESET}" + print(f" {bi.name:<20} {bi.backend_type:<18} {status_str} {model_str:<6} {mode_str}") + + pt = [b for b in self.backends if b.expects_passthrough and b.status == "healthy"] + tr = [b for b in self.backends if not b.expects_passthrough and b.status == "healthy"] + print() + self.pcolor(GREY, f" Passthrough-capable: {len(pt)} | Translation-required: {len(tr)}") + + # ── Test Matrix ───────────────────────────────────────────────────── + + def _testable_backends(self) -> List[BackendInfo]: + return [b for b in self.backends if b.status == "healthy" and b.selected_model] + + def run_matrix(self, skip_streaming: bool): + """Run the core test matrix for every testable backend.""" + testable = self._testable_backends() + if not testable: + self.pcolor(YELLOW, "\nNo healthy backends with models to test") + return + + print() + self.pcolor(WHITE, "Test Matrix") + self.pcolor(PURPLE, "=" * 72) + + for bi in testable: + print() + self.pcolor(WHITE, f"Backend: {CYAN}{bi.name}{RESET} " + f"type={bi.backend_type} model={bi.selected_model}") + self.pcolor(GREY, f" Expected mode: {bi.mode_label}") + + self._test_anthropic_nonstreaming(bi) + if not skip_streaming: + self._test_anthropic_streaming(bi) + self._test_openai_baseline(bi) + + # -- Non-streaming Anthropic ---------------------------------------- + + def _test_anthropic_nonstreaming(self, bi: BackendInfo): + label = f"{bi.name}/anthropic-nonstream" + self.pcolor(YELLOW, f" Non-streaming: ", end="") + + try: + r = requests.post( + f"{self.base_url}/olla/anthropic/v1/messages", + headers=self._anthropic_headers(), + json=self._anthropic_body(bi.selected_model), + 
timeout=self.timeout, + ) + except Exception as e: + self.pcolor(RED, f"[FAIL] {e}") + self.record(label, False, str(e)) + return + + ok = True + notes = [] + + # HTTP status + if r.status_code != 200: + self.pcolor(RED, f"[FAIL] HTTP {r.status_code}") + self.record(label, False, f"HTTP {r.status_code}") + return + + # Mode header + mode = r.headers.get("X-Olla-Mode", "") + if bi.expects_passthrough: + if mode != "passthrough": + ok = False + notes.append(f"expected X-Olla-Mode=passthrough, got '{mode}'") + else: + # Translation mode: header should be absent + if mode == "passthrough": + ok = False + notes.append("unexpected X-Olla-Mode=passthrough for translation backend") + + # Endpoint header + ep = r.headers.get("X-Olla-Endpoint", "") + if ep and ep != bi.name: + notes.append(f"routed to {ep} (expected {bi.name})") + + # Backend type header + bt = r.headers.get("X-Olla-Backend-Type", "") + if bt and bt != bi.backend_type: + notes.append(f"backend-type={bt} (expected {bi.backend_type})") + + # Valid JSON + try: + r.json() + except Exception: + ok = False + notes.append("invalid JSON response") + + if self.verbose and ok: + self.pcolor(GREY, "") + self.pcolor(GREY, f" {json.dumps(r.json(), indent=2)[:300]}") + + status_str = f"{GREEN}[PASS]{RESET}" if ok else f"{RED}[FAIL]{RESET}" + detail = f" {' | '.join(notes)}" if notes else "" + print(f"{status_str}{GREY}{detail}{RESET}") + self.record(label, ok, "; ".join(notes)) + + # -- Streaming Anthropic ------------------------------------------- + + def _test_anthropic_streaming(self, bi: BackendInfo): + label = f"{bi.name}/anthropic-stream" + self.pcolor(YELLOW, f" Streaming: ", end="") + + try: + r = requests.post( + f"{self.base_url}/olla/anthropic/v1/messages", + headers=self._anthropic_headers(), + json=self._anthropic_body(bi.selected_model, stream=True), + timeout=self.timeout, + stream=True, + ) + except Exception as e: + self.pcolor(RED, f"[FAIL] {e}") + self.record(label, False, str(e)) + return + + ok = True + notes = [] + + if r.status_code != 200: + self.pcolor(RED, f"[FAIL] HTTP {r.status_code}") + self.record(label, False, f"HTTP {r.status_code}") + return + + # Mode header + mode = r.headers.get("X-Olla-Mode", "") + if bi.expects_passthrough: + if mode != "passthrough": + ok = False + notes.append(f"expected X-Olla-Mode=passthrough, got '{mode}'") + else: + if mode == "passthrough": + ok = False + notes.append("unexpected passthrough for translation backend") + + # Content type + ct = r.headers.get("Content-Type", "") + if "text/event-stream" not in ct: + ok = False + notes.append(f"content-type={ct}, expected text/event-stream") + + # Parse SSE events + event_types = set() + event_count = 0 + for line in r.iter_lines(decode_unicode=True): + if not line: + continue + if line.startswith("event: "): + event_types.add(line[7:].strip()) + if line.startswith("data: "): + event_count += 1 + self.pcolor(CYAN, ".", end="") + + # Validate event types + if bi.expects_passthrough: + missing = PASSTHROUGH_EVENTS - event_types + # Some events are optional depending on content length; require at least the minimum set + if TRANSLATION_MIN_EVENTS - event_types: + ok = False + notes.append(f"missing events: {TRANSLATION_MIN_EVENTS - event_types}") + else: + missing = TRANSLATION_MIN_EVENTS - event_types + if missing: + ok = False + notes.append(f"missing events: {missing}") + + if event_count == 0: + ok = False + notes.append("no SSE data lines received") + + status_str = f" {GREEN}[PASS]{RESET}" if ok else f" {RED}[FAIL]{RESET}" + detail = f" 
{' | '.join(notes)}" if notes else f" ({event_count} events)" + print(f"{status_str}{GREY}{detail}{RESET}") + self.record(label, ok, "; ".join(notes)) + + # -- OpenAI baseline ------------------------------------------------ + + def _test_openai_baseline(self, bi: BackendInfo): + label = f"{bi.name}/openai-baseline" + self.pcolor(YELLOW, f" OpenAI check: ", end="") + + try: + r = requests.post( + f"{self.base_url}/olla/proxy/v1/chat/completions", + headers={"Content-Type": "application/json"}, + json={ + "model": bi.selected_model, + "messages": [{"role": "user", "content": "Say hello briefly"}], + "max_tokens": 10, + }, + timeout=self.timeout, + ) + except Exception as e: + self.pcolor(RED, f"[FAIL] {e}") + self.record(label, False, str(e)) + return + + ok = r.status_code == 200 + ep = r.headers.get("X-Olla-Endpoint", "") + notes = [] + + if not ok: + notes.append(f"HTTP {r.status_code}") + if ep and ep != bi.name: + notes.append(f"routed to {ep} (expected {bi.name})") + + status_str = f"{GREEN}[PASS]{RESET}" if ok else f"{RED}[FAIL]{RESET}" + detail = f" {' | '.join(notes)}" if notes else "" + print(f"{status_str}{GREY}{detail}{RESET}") + self.record(label, ok, "; ".join(notes)) + + # ── Edge Cases ────────────────────────────────────────────────────── + + def run_edge_cases(self): + print() + self.pcolor(WHITE, "Edge Cases") + self.pcolor(PURPLE, "=" * 72) + + self._test_nonexistent_model() + + pt_backends = [b for b in self._testable_backends() if b.expects_passthrough] + if pt_backends: + bi = pt_backends[0] + self._test_system_param(bi) + self._test_multiturn(bi) + else: + self.pcolor(GREY, " Skipping system/multiturn tests (no passthrough backends)") + + def _test_nonexistent_model(self): + label = "edge/nonexistent-model" + self.pcolor(YELLOW, f" Non-existent model: ", end="") + + try: + r = requests.post( + f"{self.base_url}/olla/anthropic/v1/messages", + headers=self._anthropic_headers(), + json=self._anthropic_body("nonexistent-model-xyz-999"), + timeout=self.timeout, + ) + except Exception as e: + self.pcolor(RED, f"[FAIL] {e}") + self.record(label, False, str(e)) + return + + # Accept any 4xx as correct behaviour; the exact code may vary + ok = 400 <= r.status_code < 500 + status_str = f"{GREEN}[PASS]{RESET}" if ok else f"{RED}[FAIL]{RESET}" + detail = f" HTTP {r.status_code}" + print(f"{status_str}{GREY}{detail}{RESET}") + self.record(label, ok, f"HTTP {r.status_code}") + + def _test_system_param(self, bi: BackendInfo): + label = f"edge/system-param ({bi.name})" + self.pcolor(YELLOW, f" System parameter: ", end="") + + try: + r = requests.post( + f"{self.base_url}/olla/anthropic/v1/messages", + headers=self._anthropic_headers(), + json=self._anthropic_body( + bi.selected_model, + system="You are a pirate", + messages=[{"role": "user", "content": "Say hello"}], + ), + timeout=self.timeout, + ) + except Exception as e: + self.pcolor(RED, f"[FAIL] {e}") + self.record(label, False, str(e)) + return + + ok = r.status_code == 200 + status_str = f"{GREEN}[PASS]{RESET}" if ok else f"{RED}[FAIL]{RESET}" + print(f"{status_str}{GREY} HTTP {r.status_code}{RESET}") + self.record(label, ok, f"HTTP {r.status_code}") + + def _test_multiturn(self, bi: BackendInfo): + label = f"edge/multiturn ({bi.name})" + self.pcolor(YELLOW, f" Multi-turn conversation: ", end="") + + messages = [ + {"role": "user", "content": "My name is Test."}, + {"role": "assistant", "content": "Hello, Test!"}, + {"role": "user", "content": "What is my name?"}, + ] + body = self._anthropic_body(bi.selected_model, 
messages=messages) + body["max_tokens"] = 20 + + try: + r = requests.post( + f"{self.base_url}/olla/anthropic/v1/messages", + headers=self._anthropic_headers(), + json=body, + timeout=self.timeout, + ) + except Exception as e: + self.pcolor(RED, f"[FAIL] {e}") + self.record(label, False, str(e)) + return + + ok = r.status_code == 200 + status_str = f"{GREEN}[PASS]{RESET}" if ok else f"{RED}[FAIL]{RESET}" + print(f"{status_str}{GREY} HTTP {r.status_code}{RESET}") + self.record(label, ok, f"HTTP {r.status_code}") + + # ── Translator Stats ──────────────────────────────────────────────── + + def report_translator_stats(self): + print() + self.pcolor(WHITE, "Translator Stats") + self.pcolor(PURPLE, "=" * 72) + + try: + r = requests.get(f"{self.base_url}/internal/stats/translators", timeout=self.timeout) + r.raise_for_status() + data = r.json() + except Exception as e: + self.pcolor(RED, f" [FAIL] Could not fetch translator stats: {e}") + return + + translators = data.get("translators", []) + summary = data.get("summary", {}) + + if not translators: + self.pcolor(GREY, " No translator stats available yet") + return + + for t in translators: + name = t.get("translator_name", "?") + total = t.get("total_requests", 0) + pt_req = t.get("passthrough_requests", 0) + tr_req = t.get("translation_requests", 0) + pt_rate = t.get("passthrough_rate", "N/A") + stream = t.get("streaming_requests", 0) + nonstream = t.get("non_streaming_requests", 0) + + self.pcolor(CYAN, f" {name}") + self.pcolor(GREY, f" Requests: {total} total | " + f"{pt_req} passthrough | {tr_req} translation") + self.pcolor(GREY, f" Passthrough rate: {pt_rate} | " + f"Streaming: {stream} | Non-streaming: {nonstream}") + + fb1 = t.get("fallback_no_compatible_endpoints", 0) + fb2 = t.get("fallback_translator_does_not_support_passthrough", 0) + fb3 = t.get("fallback_cannot_passthrough", 0) + if fb1 or fb2 or fb3: + self.pcolor(YELLOW, f" Fallbacks: no_compatible={fb1} " + f"no_support={fb2} cannot={fb3}") + + overall_pt = summary.get("overall_passthrough_rate", "N/A") + overall_sr = summary.get("overall_success_rate", "N/A") + print() + self.pcolor(GREY, f" Overall: passthrough={overall_pt} success={overall_sr}") + + # ── Summary ───────────────────────────────────────────────────────── + + def print_summary(self) -> bool: + print() + self.pcolor(PURPLE, "=" * 72) + self.pcolor(WHITE, f" {BOLD}Results Summary{RESET}") + self.pcolor(PURPLE, "=" * 72) + + # Main results table + testable = self._testable_backends() + if testable: + print() + header = (f" {'Backend':<18} {'Type':<18} {'Model':<24} " + f"{'Mode':<14} {'Stream':<8} {'Result'}") + self.pcolor(GREY, header) + self.pcolor(GREY, f" {'-'*18} {'-'*18} {'-'*24} {'-'*14} {'-'*8} {'-'*6}") + + for bi in testable: + ns_ok = self._result_ok(f"{bi.name}/anthropic-nonstream") + st_ok = self._result_ok(f"{bi.name}/anthropic-stream") + + # Stream column: tick/cross for nonstream/stream + ns_mark = f"{GREEN}v{RESET}" if ns_ok else f"{RED}x{RESET}" + st_mark = f"{GREEN}v{RESET}" if st_ok else (f"{RED}x{RESET}" if st_ok is not None else f"{GREY}-{RESET}") + stream_col = f"{ns_mark}/{st_mark}" + + all_ok = ns_ok and (st_ok is not False) + result_c = GREEN if all_ok else RED + result_str = f"{result_c}{'PASS' if all_ok else 'FAIL'}{RESET}" + mode_c = CYAN if bi.expects_passthrough else YELLOW + + model_short = (bi.selected_model[:22] + "..") if len(bi.selected_model or "") > 24 else (bi.selected_model or "N/A") + + print(f" {bi.name:<18} {bi.backend_type:<18} {model_short:<24} " + 
f"{mode_c}{bi.mode_label:<14}{RESET} {stream_col} {result_str}") + + # Edge case results + edge_results = [r for r in self.results if r.name.startswith("edge/")] + if edge_results: + print() + self.pcolor(WHITE, " Edge Cases:") + for r in edge_results: + c = GREEN if r.passed else RED + mark = "PASS" if r.passed else "FAIL" + detail = f" ({r.detail})" if r.detail else "" + self.pcolor(c, f" [{mark}] {r.name}{GREY}{detail}{RESET}") + + # Totals + passed = sum(1 for r in self.results if r.passed) + failed = sum(1 for r in self.results if not r.passed) + total = len(self.results) + + print() + self.pcolor(GREY, f" Total: {total} | ", end="") + self.pcolor(GREEN, f"Passed: {passed}", end="") + self.pcolor(GREY, " | ", end="") + self.pcolor(RED if failed else GREEN, f"Failed: {failed}") + print() + + all_pass = failed == 0 + if all_pass: + self.pcolor(GREEN, " All tests passed.") + else: + self.pcolor(RED, f" {failed} test(s) failed.") + + return all_pass + + def _result_ok(self, label: str) -> Optional[bool]: + for r in self.results: + if r.name == label: + return r.passed + return None + + +def main(): + parser = argparse.ArgumentParser( + description="Test Olla Anthropic passthrough vs translation mode selection" + ) + parser.add_argument("--url", default=TARGET_URL, + help=f"Olla base URL (default: {TARGET_URL})") + parser.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT, + help=f"Request timeout in seconds (default: {DEFAULT_TIMEOUT})") + parser.add_argument("--skip-edge-cases", action="store_true", + help="Skip edge case tests") + parser.add_argument("--skip-streaming", action="store_true", + help="Skip streaming tests") + parser.add_argument("--verbose", action="store_true", + help="Show full response bodies") + + args = parser.parse_args() + + tester = PassthroughTester(args.url, args.timeout, args.verbose) + tester.print_header() + + # Phase 1: Health and discovery + if not tester.check_health(): + sys.exit(1) + print() + if not tester.discover(): + sys.exit(1) + + # Phase 2: Test matrix + tester.run_matrix(skip_streaming=args.skip_streaming) + + # Phase 3: Edge cases + if not args.skip_edge_cases: + tester.run_edge_cases() + + # Phase 4: Translator stats + tester.report_translator_stats() + + # Phase 5: Summary + all_pass = tester.print_summary() + sys.exit(0 if all_pass else 1) + + +if __name__ == "__main__": + try: + main() + except KeyboardInterrupt: + print(f"\n{YELLOW}Test interrupted by user (Ctrl+C){RESET}") + sys.exit(130)