diff --git a/config/profiles/lmdeploy.yaml b/config/profiles/lmdeploy.yaml new file mode 100644 index 0000000..78d3356 --- /dev/null +++ b/config/profiles/lmdeploy.yaml @@ -0,0 +1,173 @@ +# LMDeploy inference platform profile +name: lmdeploy +home: "https://github.com/InternLM/lmdeploy" +version: "1.0" +display_name: "LMDeploy" +description: "LMDeploy efficient inference and serving for LLMs and VLMs" + +# Routing configuration +routing: + prefixes: + - lmdeploy + +# API compatibility +api: + openai_compatible: true + paths: + # Health and system endpoints + - /health # 0: health check (HTTP 200, empty body) + + # Model management + - /v1/models # 1: list models (OpenAI-compatible ModelList) + + # Text generation endpoints (OpenAI-compatible) + - /v1/chat/completions # 2: chat completions with SSE streaming + - /v1/completions # 3: text completions + + # LMDeploy-specific endpoints + - /v1/encode # 4: token encoding (LMDeploy-specific) + - /generate # 5: native generation endpoint + - /pooling # 6: reward/score pooling (the real embeddings path) + # NOTE: /v1/embeddings is intentionally excluded — LMDeploy returns HTTP 400 + # unconditionally for all backends on that path. + + # Sleep state probe (LMDeploy-specific, used for auto-detection) + - /is_sleeping # 7: sleep state probe + + model_discovery_path: /v1/models + health_check_path: /health + +# Platform characteristics +characteristics: + timeout: 2m + max_concurrent_requests: 100 + default_priority: 82 # between SGLang (85) and vLLM (80) + streaming_support: true + +# Detection hints for auto-discovery +detection: + path_indicators: + - "/v1/encode" # LMDeploy-specific token encoding + - "/generate" # LMDeploy native generation + - "/pooling" # LMDeploy reward/score path + - "/is_sleeping" # distinct from vLLM and SGLang + default_ports: + - 23333 # api_server default (not 8000 which is proxy_server) + +# Request/response handling +request: + model_field_paths: + - "model" + response_format: "lmdeploy" + parsing_rules: + chat_completions_path: "/v1/chat/completions" + completions_path: "/v1/completions" + model_field_name: "model" + supports_streaming: true + +# Path indices for specific functions +path_indices: + health: 0 + models: 1 + chat_completions: 2 + completions: 3 + +# Model handling +models: + name_format: "{{.Name}}" + capability_patterns: + chat: + - "*-Chat-*" + - "*-Instruct*" + - "*-chat-*" + vision: + - "*vision*" + - "*llava*" + - "*VL*" + code: + - "*code*" + - "*Code*" + # Context window patterns for common LMDeploy models + context_patterns: + - pattern: "*llama-3.1*" + context: 131072 + - pattern: "*llama-3*" + context: 8192 + - pattern: "*internlm2_5*" + context: 32768 + - pattern: "*internlm2*" + context: 32768 + - pattern: "*mistral*" + context: 32768 + - pattern: "*qwen2*" + context: 32768 + +# Resource management +resources: + model_sizes: + - patterns: ["*70b*", "*72b*"] + min_memory_gb: 140 + recommended_memory_gb: 160 + min_gpu_memory_gb: 140 + estimated_load_time_ms: 60000 + - patterns: ["*34b*", "*33b*", "*30b*"] + min_memory_gb: 70 + recommended_memory_gb: 80 + min_gpu_memory_gb: 70 + estimated_load_time_ms: 45000 + - patterns: ["*13b*", "*14b*"] + min_memory_gb: 30 + recommended_memory_gb: 40 + min_gpu_memory_gb: 30 + estimated_load_time_ms: 30000 + - patterns: ["*7b*", "*8b*"] + min_memory_gb: 16 + recommended_memory_gb: 24 + min_gpu_memory_gb: 16 + estimated_load_time_ms: 20000 + - patterns: ["*3b*"] + min_memory_gb: 8 + recommended_memory_gb: 12 + min_gpu_memory_gb: 8 + estimated_load_time_ms: 
15000 + - patterns: ["*1b*", "*1.1b*", "*1.5b*"] + min_memory_gb: 4 + recommended_memory_gb: 8 + min_gpu_memory_gb: 4 + estimated_load_time_ms: 10000 + + defaults: + min_memory_gb: 8 + recommended_memory_gb: 16 + min_gpu_memory_gb: 8 + requires_gpu: true + estimated_load_time_ms: 30000 + + concurrency_limits: + - min_memory_gb: 100 + max_concurrent: 10 + - min_memory_gb: 50 + max_concurrent: 20 + - min_memory_gb: 20 + max_concurrent: 50 + - min_memory_gb: 0 + max_concurrent: 100 + + timeout_scaling: + base_timeout_seconds: 120 + load_time_buffer: true + +# Metrics extraction for LMDeploy responses +metrics: + extraction: + enabled: true + source: response_body + format: json + paths: + model: "$.model" + finish_reason: "$.choices[0].finish_reason" + input_tokens: "$.usage.prompt_tokens" + output_tokens: "$.usage.completion_tokens" + total_tokens: "$.usage.total_tokens" + calculations: + is_complete: 'len(finish_reason) > 0' diff --git a/docs/content/api-reference/lmdeploy.md b/docs/content/api-reference/lmdeploy.md new file mode 100644 index 0000000..5c3158d --- /dev/null +++ b/docs/content/api-reference/lmdeploy.md @@ -0,0 +1,313 @@ +# LMDeploy API + +Proxy endpoints for LMDeploy inference servers. Available through the `/olla/lmdeploy/` prefix. + +## Endpoints Overview + +| Method | URI | Description | +|--------|-----|-------------| +| GET | `/olla/lmdeploy/health` | Health check | +| GET | `/olla/lmdeploy/v1/models` | List available models | +| POST | `/olla/lmdeploy/v1/chat/completions` | Chat completion | +| POST | `/olla/lmdeploy/v1/completions` | Text completion | +| POST | `/olla/lmdeploy/v1/encode` | Token encoding | +| POST | `/olla/lmdeploy/generate` | Native generation | +| POST | `/olla/lmdeploy/pooling` | Reward/score pooling | +| GET | `/olla/lmdeploy/is_sleeping` | Sleep state probe | + +!!! warning "/v1/embeddings is not supported" + LMDeploy returns HTTP 400 on `/v1/embeddings` unconditionally across all backends. Use `/pooling` for reward/score tasks instead. + +--- + +## GET /olla/lmdeploy/health + +Check LMDeploy server health. + +### Request + +```bash +curl http://localhost:40114/olla/lmdeploy/health +``` + +### Response + +```json +{} +``` + +LMDeploy returns an empty body with HTTP 200 on a healthy server. + +--- + +## GET /olla/lmdeploy/v1/models + +List models available on the LMDeploy server. + +### Request + +```bash +curl http://localhost:40114/olla/lmdeploy/v1/models +``` + +### Response + +```json +{ + "object": "list", + "data": [ + { + "id": "internlm/internlm2_5-7b-chat", + "object": "model", + "created": 1705334400, + "owned_by": "lmdeploy", + "root": "internlm/internlm2_5-7b-chat", + "permission": [] + } + ] +} +``` + +--- + +## POST /olla/lmdeploy/v1/chat/completions + +OpenAI-compatible chat completion. + +### Request + +```bash +curl -X POST http://localhost:40114/olla/lmdeploy/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "internlm/internlm2_5-7b-chat", + "messages": [ + {"role": "user", "content": "Explain the TurboMind engine in one paragraph"} + ], + "temperature": 0.7, + "max_tokens": 200, + "stream": false + }' +``` + +### Response + +```json +{ + "id": "chatcmpl-lmdeploy-abc123", + "object": "chat.completion", + "created": 1705334400, + "model": "internlm/internlm2_5-7b-chat", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "TurboMind is LMDeploy's C++/CUDA inference engine..." 
+ }, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 18, + "completion_tokens": 64, + "total_tokens": 82 + } +} +``` + +### Streaming Response + +When `"stream": true`: + +```text +data: {"id":"chatcmpl-lmdeploy-abc123","object":"chat.completion.chunk","created":1705334400,"model":"internlm/internlm2_5-7b-chat","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]} + +data: {"id":"chatcmpl-lmdeploy-abc123","object":"chat.completion.chunk","created":1705334400,"model":"internlm/internlm2_5-7b-chat","choices":[{"index":0,"delta":{"content":"TurboMind"},"finish_reason":null}]} + +... + +data: [DONE] +``` + +--- + +## POST /olla/lmdeploy/v1/completions + +Text completion (OpenAI-compatible). + +### Request + +```bash +curl -X POST http://localhost:40114/olla/lmdeploy/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "internlm/internlm2_5-7b-chat", + "prompt": "LMDeploy uses TurboMind because", + "max_tokens": 100, + "temperature": 0.8, + "stream": false + }' +``` + +### Response + +```json +{ + "id": "cmpl-lmdeploy-xyz789", + "object": "text_completion", + "created": 1705334400, + "model": "internlm/internlm2_5-7b-chat", + "choices": [ + { + "text": " it provides efficient GPU utilisation through continuous batching...", + "index": 0, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 9, + "completion_tokens": 55, + "total_tokens": 64 + } +} +``` + +--- + +## POST /olla/lmdeploy/v1/encode + +Token encoding (LMDeploy-specific). Encodes input text to token IDs without running inference. + +### Request + +```bash +curl -X POST http://localhost:40114/olla/lmdeploy/v1/encode \ + -H "Content-Type: application/json" \ + -d '{ + "model": "internlm/internlm2_5-7b-chat", + "input": "Hello, world!" + }' +``` + +### Response + +```json +{ + "input_ids": [1, 22172, 29892, 3186, 29991], + "length": 5 +} +``` + +--- + +## POST /olla/lmdeploy/generate + +LMDeploy's native generation endpoint. + +### Request + +```bash +curl -X POST http://localhost:40114/olla/lmdeploy/generate \ + -H "Content-Type: application/json" \ + -d '{ + "inputs": "def fibonacci(n):\n ", + "parameters": { + "temperature": 0.2, + "max_new_tokens": 100 + } + }' +``` + +### Response + +```json +{ + "generated_text": "def fibonacci(n):\n if n <= 1:\n return n\n return fibonacci(n-1) + fibonacci(n-2)", + "finish_reason": "stop" +} +``` + +--- + +## POST /olla/lmdeploy/pooling + +Reward or score pooling for embedding-style tasks. This is the correct path for pooling operations — `/v1/embeddings` is not supported. + +### Request + +```bash +curl -X POST http://localhost:40114/olla/lmdeploy/pooling \ + -H "Content-Type: application/json" \ + -d '{ + "model": "internlm/internlm2_5-7b-chat", + "input": "The quick brown fox" + }' +``` + +### Response + +```json +{ + "object": "list", + "data": [ + { + "object": "embedding", + "index": 0, + "embedding": [0.0123, -0.0456, 0.0789, ...] + } + ], + "model": "internlm/internlm2_5-7b-chat", + "usage": { + "prompt_tokens": 5, + "total_tokens": 5 + } +} +``` + +--- + +## GET /olla/lmdeploy/is_sleeping + +Probe whether the LMDeploy engine is in sleep mode. Sleeping instances return HTTP 503 on generation endpoints — Olla's health checker treats this as a transient failure rather than a hard outage. 
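+
+As a readiness gate, a client can poll this probe until the engine reports awake. A minimal sketch, assuming `jq` is installed:
+
+```bash
+# Wait until the engine is awake before sending generation traffic
+until [ "$(curl -s http://localhost:40114/olla/lmdeploy/is_sleeping | jq -r '.is_sleeping')" = "false" ]; do
+  sleep 1
+done
+```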
+ +### Request + +```bash +curl http://localhost:40114/olla/lmdeploy/is_sleeping +``` + +### Response + +```json +{"is_sleeping": false} +``` + +--- + +## Configuration Example + +```yaml +discovery: + static: + endpoints: + - url: "http://192.168.0.100:23333" + name: "lmdeploy-server" + type: "lmdeploy" + priority: 82 + model_url: "/v1/models" + health_check_url: "/health" + check_interval: 5s + check_timeout: 2s +``` + +The default port for `lmdeploy serve api_server` is **23333**. The `proxy_server` component runs on 8000 but does not expose `/health` and is not supported by Olla. + +## Response Headers + +All responses include: + +- `X-Olla-Endpoint` - Backend endpoint name (e.g., `lmdeploy-server`) +- `X-Olla-Model` - Model used for the request +- `X-Olla-Backend-Type` - Always `lmdeploy` for these endpoints +- `X-Olla-Response-Time` - Total processing time diff --git a/docs/content/api-reference/overview.md b/docs/content/api-reference/overview.md index 242083d..d9a0db6 100644 --- a/docs/content/api-reference/overview.md +++ b/docs/content/api-reference/overview.md @@ -52,6 +52,12 @@ Proxy endpoints for OpenAI-compatible services. - `/olla/openai/*` - OpenAI API endpoints +### [LMDeploy API](lmdeploy.md) +Proxy endpoints for LMDeploy inference servers. + +- `/olla/lmdeploy/*` - LMDeploy API endpoints +- OpenAI-compatible endpoints plus token encoding and reward pooling + ### [vLLM API](vllm.md) Proxy endpoints for vLLM servers. @@ -147,7 +153,7 @@ All responses include: | `X-Olla-Request-ID` | Unique request identifier | | `X-Olla-Endpoint` | Backend endpoint name | | `X-Olla-Model` | Model used (if applicable) | -| `X-Olla-Backend-Type` | Provider type, examples:
`ollama/lm-studio/llamacpp/openai/openai-compatible/vllm/sglang/lemonade` | +| `X-Olla-Backend-Type` | Provider type, examples:
`ollama/lm-studio/llamacpp/openai/openai-compatible/vllm/sglang/lemonade/lmdeploy` | | `X-Olla-Response-Time` | Total processing time | | `X-Olla-Routing-Strategy` | Routing strategy used (when model routing is active) | | `X-Olla-Routing-Decision` | Routing decision made (routed/fallback/rejected) | diff --git a/docs/content/concepts/overview.md b/docs/content/concepts/overview.md index ed0f8ec..3726a46 100644 --- a/docs/content/concepts/overview.md +++ b/docs/content/concepts/overview.md @@ -66,7 +66,7 @@ Profiles optimise response handling for specific workload patterns. ### [Profile System](profile-system.md) Provider-specific configuration templates: -- Pre-configured profiles for Ollama, LM Studio, vLLM, SGLang, Lemonade SDK, LiteLLM, OpenAI +- Pre-configured profiles for Ollama, LM Studio, LMDeploy, vLLM, SGLang, Lemonade SDK, LiteLLM, OpenAI - Custom header mappings and endpoint patterns - Model format converters for each provider diff --git a/docs/content/faq.md b/docs/content/faq.md index 036f589..ef1d6d5 100644 --- a/docs/content/faq.md +++ b/docs/content/faq.md @@ -10,7 +10,7 @@ keywords: olla faq, troubleshooting, common questions, proxy help ### What is Olla? -Olla is a high-performance proxy and load balancer specifically designed for LLM infrastructure. It intelligently routes requests across multiple LLM backends (Ollama, LM Studio, vLLM, SGLang, Lemonade SDK, LiteLLM, and OpenAI-compatible endpoints) while providing load balancing, health checking, and unified model management. +Olla is a high-performance proxy and load balancer specifically designed for LLM infrastructure. It intelligently routes requests across multiple LLM backends (Ollama, LM Studio, LMDeploy, vLLM, SGLang, Lemonade SDK, LiteLLM, and OpenAI-compatible endpoints) while providing load balancing, health checking, and unified model management. See how Olla compares to [other tools](compare/overview.md) in the ecosystem. diff --git a/docs/content/index.md b/docs/content/index.md index dc9f6bd..7c918c1 100644 --- a/docs/content/index.md +++ b/docs/content/index.md @@ -16,7 +16,7 @@ keywords: llm proxy, ollama proxy, lm studio proxy, vllm proxy, sglang, lemonade vLLM: Native Support SGLang: Native Support LiteLLM: Native Support - LM Deploy: OpenAI Compatible
+ LMDeploy: Native Support
vLLM-MLX: Native Support Docker Model Runner: Native Support
Ollama: Native Support @@ -97,7 +97,7 @@ Olla provides detailed response headers for observability: |--------|-------------| | `X-Olla-Endpoint` | Backend endpoint name | | `X-Olla-Model` | Model used for the request | -| `X-Olla-Backend-Type` | Backend type (ollama/openai/openai-compatible/lm-studio/llamacpp/vllm/sglang/lemonade) | +| `X-Olla-Backend-Type` | Backend type (ollama/openai/openai-compatible/lm-studio/llamacpp/vllm/sglang/lemonade/lmdeploy) | | `X-Olla-Request-ID` | Unique request identifier | | `X-Olla-Response-Time` | Total processing time | diff --git a/docs/content/integrations/backend/lmdeploy.md b/docs/content/integrations/backend/lmdeploy.md new file mode 100644 index 0000000..afcbfd6 --- /dev/null +++ b/docs/content/integrations/backend/lmdeploy.md @@ -0,0 +1,377 @@ +--- +title: LMDeploy Integration - High-Performance LLM Inference with Olla +description: Configure LMDeploy with Olla proxy for efficient LLM serving. TurboMind engine, OpenAI-compatible API, VLM support, and GPU-optimised inference from InternLM. +keywords: LMDeploy, Olla proxy, TurboMind, InternLM, LLM inference, GPU optimisation, high throughput +--- + +# LMDeploy Integration + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| | |
+|------|-------------|
+| **Home** | [github.com/InternLM/lmdeploy](https://github.com/InternLM/lmdeploy) |
+| **Since** | Olla v0.0.21 |
+| **Type** | `lmdeploy` (use in endpoint configuration) |
+| **Profile** | `lmdeploy.yaml` (see latest) |
+| **Features** | • Proxy Forwarding<br>• Health Check (native)<br>• Model Unification<br>• Model Detection & Normalisation<br>• OpenAI API Compatibility<br>• Token Encoding API<br>• Reward/Score Pooling<br>• VLM Inference (same api_server) |
+| **Unsupported** | • `/v1/embeddings` (returns HTTP 400 — use `/pooling`)<br>• proxy_server component (no `/health` endpoint)<br>• Model Management (loading/unloading) |
+| **Attributes** | • OpenAI Compatible<br>• GPU Optimised (TurboMind C++/CUDA engine)<br>• Continuous Batching<br>• VLM Support |
+| **Prefixes** | `/olla/lmdeploy/` |
+| **Endpoints** | See below |
+ +## Configuration + +### Basic Setup + +Register an LMDeploy `api_server` instance with Olla: + +```yaml +discovery: + static: + endpoints: + - url: "http://localhost:23333" + name: "local-lmdeploy" + type: "lmdeploy" + priority: 82 + model_url: "/v1/models" + health_check_url: "/health" + check_interval: 5s + check_timeout: 2s +``` + +The default port for `lmdeploy serve api_server` is **23333**. Register individual `api_server` instances directly — do not point Olla at the `proxy_server` component, which lacks a `/health` endpoint and only forwards a subset of routes. + +### Authentication + +LMDeploy supports optional Bearer-token authentication via the `--api-keys` flag. Configure the token in Olla's endpoint headers so it is forwarded on every proxied request: + +```yaml +discovery: + static: + endpoints: + - url: "http://gpu-server:23333" + name: "lmdeploy-prod" + type: "lmdeploy" + priority: 82 + health_check_url: "/health" + check_interval: 10s + check_timeout: 5s + headers: + Authorization: "Bearer ${LMDEPLOY_API_KEY}" +``` + +The `/health` endpoint is auth-exempt on LMDeploy, so health checks will succeed even when a key is required for inference. + +### Multiple Instances + +```yaml +discovery: + static: + endpoints: + - url: "http://gpu1:23333" + name: "lmdeploy-1" + type: "lmdeploy" + priority: 100 + + - url: "http://gpu2:23333" + name: "lmdeploy-2" + type: "lmdeploy" + priority: 100 + +proxy: + engine: "olla" + load_balancer: "least-connections" +``` + +## Endpoints Supported + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Path | Description |
+|------|-------------|
+| `/health` | Health Check |
+| `/v1/models` | List Models (OpenAI format) |
+| `/v1/chat/completions` | Chat Completions (OpenAI format) |
+| `/v1/completions` | Text Completions (OpenAI format) |
+| `/v1/encode` | Token Encoding (LMDeploy-specific) |
+| `/generate` | Native Generation Endpoint |
+| `/pooling` | Reward/Score Pooling (not `/v1/embeddings`) |
+| `/is_sleeping` | Sleep State Probe |
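+
+Every route above is proxied under Olla's `/olla/lmdeploy/` prefix. As a quick sketch (the `inputs`/`parameters` shape mirrors the API reference), the native generation endpoint can be called directly:
+
+```bash
+# Native /generate through Olla; request shape as documented in the API reference
+curl -X POST http://localhost:40114/olla/lmdeploy/generate \
+  -H "Content-Type: application/json" \
+  -d '{
+    "inputs": "The capital of France is",
+    "parameters": {"temperature": 0.2, "max_new_tokens": 32}
+  }'
+```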
+ +## Usage Examples + +### Chat Completion + +```bash +curl -X POST http://localhost:40114/olla/lmdeploy/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "internlm/internlm2_5-7b-chat", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is TurboMind?"} + ], + "temperature": 0.7, + "max_tokens": 300 + }' +``` + +### Streaming + +```bash +curl -X POST http://localhost:40114/olla/lmdeploy/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "internlm/internlm2_5-7b-chat", + "messages": [{"role": "user", "content": "Write a short story"}], + "stream": true, + "temperature": 0.8 + }' +``` + +### Token Encoding + +```bash +curl -X POST http://localhost:40114/olla/lmdeploy/v1/encode \ + -H "Content-Type: application/json" \ + -d '{ + "model": "internlm/internlm2_5-7b-chat", + "input": "Hello, world!" + }' +``` + +### Pooling (Reward/Score) + +```bash +# Use /pooling — not /v1/embeddings (which returns HTTP 400) +curl -X POST http://localhost:40114/olla/lmdeploy/pooling \ + -H "Content-Type: application/json" \ + -d '{ + "model": "internlm/internlm2_5-7b-chat", + "input": "The quick brown fox" + }' +``` + +## Starting LMDeploy + +### Basic Start + +```bash +pip install lmdeploy + +lmdeploy serve api_server internlm/internlm2_5-7b-chat \ + --server-port 23333 +``` + +### TurboMind Backend (Default, GPU) + +```bash +lmdeploy serve api_server internlm/internlm2_5-7b-chat \ + --backend turbomind \ + --server-port 23333 \ + --tp 1 +``` + +### PyTorch Backend + +Use `pytorch` when a model is not supported by TurboMind, or for CPU inference: + +```bash +lmdeploy serve api_server internlm/internlm2_5-7b-chat \ + --backend pytorch \ + --server-port 23333 +``` + +### With Authentication + +```bash +lmdeploy serve api_server internlm/internlm2_5-7b-chat \ + --server-port 23333 \ + --api-keys my-secret-key +``` + +### VLM Inference + +Vision-language models use the same `api_server` entrypoint — no separate binary: + +```bash +lmdeploy serve api_server InternLM/internlm-xcomposer2-7b \ + --server-port 23333 +``` + +### Docker + +```bash +docker run --gpus all \ + -p 23333:23333 \ + openmmlab/lmdeploy:latest \ + lmdeploy serve api_server internlm/internlm2_5-7b-chat \ + --server-port 23333 +``` + +## LMDeploy Specifics + +### Sleep/Wake + +LMDeploy supports a sleep mode to release GPU memory when idle: + +```bash +# Suspend the engine (GPU memory freed) +curl -X POST http://localhost:23333/sleep + +# Resume the engine +curl -X POST http://localhost:23333/wakeup + +# Check state (proxied via Olla) +curl http://localhost:40114/olla/lmdeploy/is_sleeping +``` + +Olla treats a sleeping engine as transiently unavailable and will route around it if other healthy instances exist. Once the engine wakes, health checks recover it automatically. + +### Embeddings vs Pooling + +LMDeploy does not implement `/v1/embeddings`. The correct path for reward-model scoring and embedding-style pooling is `/pooling`. This is a deliberate upstream design decision — using TurboMind's native pooling path rather than the OpenAI embeddings spec. + +### Model Naming + +LMDeploy serves models by their HuggingFace identifiers: + +- `internlm/internlm2_5-7b-chat` +- `meta-llama/Meta-Llama-3.1-8B-Instruct` +- `mistralai/Mistral-7B-Instruct-v0.2` +- `Qwen/Qwen2.5-7B-Instruct` + +### Proxy Server vs API Server + +LMDeploy ships two server components: + +| Component | Port | Use with Olla? 
| +|-----------|------|----------------| +| `api_server` | 23333 | Yes — has `/health`, full route support | +| `proxy_server` | 8000 | No — no `/health`, limited routes | + +Always register individual `api_server` instances. The `proxy_server` is LMDeploy's own load balancer and is redundant when Olla is in the stack. + +## Profile Customisation + +Create `config/profiles/lmdeploy-custom.yaml` to override defaults. See [Profile Configuration](../../concepts/profile-system.md) for the full schema. + +```yaml +name: lmdeploy +version: "1.0" + +# Add a shorter routing prefix +routing: + prefixes: + - lmdeploy + - turbomind + +# Increase timeout for large 70B models +characteristics: + timeout: 5m +``` + +## OpenAI SDK + +```python +from openai import OpenAI + +client = OpenAI( + base_url="http://localhost:40114/olla/lmdeploy/v1", + api_key="not-needed" # omit if no --api-keys set on lmdeploy +) + +response = client.chat.completions.create( + model="internlm/internlm2_5-7b-chat", + messages=[{"role": "user", "content": "Hello!"}] +) +``` + +## Next Steps + +- [LMDeploy API Reference](../../api-reference/lmdeploy.md) - Endpoint details and response formats +- [Profile Configuration](../../concepts/profile-system.md) - Customise LMDeploy behaviour +- [Load Balancing](../../concepts/load-balancing.md) - Scale across multiple LMDeploy instances +- [Health Checking](../../concepts/health-checking.md) - Circuit breakers and failover diff --git a/docs/content/integrations/overview.md b/docs/content/integrations/overview.md index 0b9d045..eb3d985 100644 --- a/docs/content/integrations/overview.md +++ b/docs/content/integrations/overview.md @@ -17,6 +17,7 @@ Olla natively supports the following backends: | [Ollama](./backend/ollama.md) | `ollama` | Native support for [Ollama](https://github.com/ollama/ollama), including model unification | | [LM Studio](./backend/lmstudio.md) | `lm-studio` | Native support for [LM Studio](https://lmstudio.ai/), including model unification | | [llama.cpp](./backend/llamacpp.md) | `llamacpp` | Native support for [llama.cpp](https://github.com/ggml-org/llama.cpp) lightweight C++ inference server with GGUF models, including slot management, code infill, and CPU-first design for edge deployment | +| [LMDeploy](./backend/lmdeploy.md) | `lmdeploy` | Native support for [LMDeploy](https://github.com/InternLM/lmdeploy), InternLM's high-performance inference server with TurboMind C++/CUDA engine | | [vLLM](./backend/vllm.md) | `vllm` | Native support for [vLLM](https://github.com/vllm-project/vllm), including model unification | | [vLLM-MLX](./backend/vllm-mlx.md) | `vllm-mlx` | Native support for [vLLM-MLX](https://github.com/waybarrios/vllm-mlx), Apple Silicon inference using the MLX framework with native Anthropic Messages API | | [SGLang](./backend/sglang.md) | `sglang` | Native support for [SGLang](https://github.com/sgl-project/sglang) with RadixAttention and Frontend Language, including model unification and vision support | diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index f8d23f6..306e59c 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -157,6 +157,7 @@ nav: - Backends: - Ollama: integrations/backend/ollama.md - LM Studio: integrations/backend/lmstudio.md + - LMDeploy: integrations/backend/lmdeploy.md - vLLM: integrations/backend/vllm.md - vLLM-MLX: integrations/backend/vllm-mlx.md - SGLang: integrations/backend/sglang.md @@ -190,6 +191,7 @@ nav: - Provider APIs: - Ollama: api-reference/ollama.md - LM Studio: api-reference/lmstudio.md + - LMDeploy: 
api-reference/lmdeploy.md - vLLM: api-reference/vllm.md - vLLM-MLX: api-reference/vllm-mlx.md - SGLang: api-reference/sglang.md diff --git a/internal/adapter/converter/factory.go b/internal/adapter/converter/factory.go index 8c53812..b14def3 100644 --- a/internal/adapter/converter/factory.go +++ b/internal/adapter/converter/factory.go @@ -24,6 +24,7 @@ func NewConverterFactory() *ConverterFactory { factory.RegisterConverter(NewOllamaConverter()) factory.RegisterConverter(NewLemonadeConverter()) factory.RegisterConverter(NewLMStudioConverter()) + factory.RegisterConverter(NewLMDeployConverter()) factory.RegisterConverter(NewSGLangConverter()) factory.RegisterConverter(NewVLLMConverter()) factory.RegisterConverter(NewLlamaCppConverter()) diff --git a/internal/adapter/converter/factory_test.go b/internal/adapter/converter/factory_test.go index f976adb..7a3ccde 100644 --- a/internal/adapter/converter/factory_test.go +++ b/internal/adapter/converter/factory_test.go @@ -23,6 +23,7 @@ func TestConverterFactory(t *testing.T) { {"ollama", "*converter.OllamaConverter"}, {"lemonade", "*converter.LemonadeConverter"}, {"llamacpp", "*converter.LlamaCppConverter"}, + {"lmdeploy", "*converter.LMDeployConverter"}, {"lmstudio", "*converter.LMStudioConverter"}, {"sglang", "*converter.SGLangConverter"}, {"vllm", "*converter.VLLMConverter"}, @@ -56,6 +57,7 @@ func TestConverterFactory(t *testing.T) { assert.Contains(t, qpErr.Reason, "ollama") assert.Contains(t, qpErr.Reason, "lemonade") assert.Contains(t, qpErr.Reason, "llamacpp") + assert.Contains(t, qpErr.Reason, "lmdeploy") assert.Contains(t, qpErr.Reason, "lmstudio") assert.Contains(t, qpErr.Reason, "sglang") assert.Contains(t, qpErr.Reason, "vllm") @@ -65,7 +67,7 @@ func TestConverterFactory(t *testing.T) { t.Run("GetSupportedFormats returns all formats", func(t *testing.T) { formats := factory.GetSupportedFormats() - assert.Len(t, formats, 10) + assert.Len(t, formats, 11) // Check all expected formats are present formatMap := make(map[string]bool) @@ -78,6 +80,7 @@ func TestConverterFactory(t *testing.T) { assert.True(t, formatMap["ollama"]) assert.True(t, formatMap["lemonade"]) assert.True(t, formatMap["llamacpp"]) + assert.True(t, formatMap["lmdeploy"]) assert.True(t, formatMap["sglang"]) assert.True(t, formatMap["vllm"]) assert.True(t, formatMap["vllm-mlx"]) diff --git a/internal/adapter/converter/lmdeploy_converter.go b/internal/adapter/converter/lmdeploy_converter.go new file mode 100644 index 0000000..475f6ac --- /dev/null +++ b/internal/adapter/converter/lmdeploy_converter.go @@ -0,0 +1,108 @@ +package converter + +import ( + "strings" + "time" + + "github.com/thushan/olla/internal/adapter/registry/profile" + "github.com/thushan/olla/internal/core/constants" + "github.com/thushan/olla/internal/core/domain" + "github.com/thushan/olla/internal/core/ports" +) + +// Type aliases for backward compatibility with tests +type LMDeployModelResponse = profile.LMDeployResponse +type LMDeployModelData = profile.LMDeployModel + +// LMDeployConverter converts models to LMDeploy-compatible format. +// LMDeploy's /v1/models shape is OpenAI-compatible but without max_model_len +// and with owned_by defaulting to "lmdeploy". +type LMDeployConverter struct { + *BaseConverter +} + +// NewLMDeployConverter creates a new LMDeploy format converter. 
+func NewLMDeployConverter() ports.ModelResponseConverter { + return &LMDeployConverter{ + BaseConverter: NewBaseConverter(constants.ProviderTypeLMDeploy), + } +} + +func (c *LMDeployConverter) GetFormatName() string { + return constants.ProviderTypeLMDeploy +} + +func (c *LMDeployConverter) ConvertToFormat(models []*domain.UnifiedModel, filters ports.ModelFilters) (interface{}, error) { + filtered := filterModels(models, filters) + + data := make([]profile.LMDeployModel, 0, len(filtered)) + for _, model := range filtered { + m := c.convertModel(model) + if m != nil { + data = append(data, *m) + } + } + + return profile.LMDeployResponse{ + Object: "list", + Data: data, + }, nil +} + +func (c *LMDeployConverter) convertModel(model *domain.UnifiedModel) *profile.LMDeployModel { + now := time.Now().Unix() + + modelID := c.findLMDeployNativeName(model) + if modelID == "" { + if len(model.Aliases) > 0 { + modelID = model.Aliases[0].Name + } else { + modelID = model.ID + } + } + + m := &profile.LMDeployModel{ + ID: modelID, + Object: "model", + Created: now, + OwnedBy: c.determineOwner(modelID), + } + + // LMDeploy does not expose max_model_len on the wire; omit it here too. + + // Generate standard permissions mirroring the LMDeploy default. + m.Permission = []profile.LMDeployModelPermission{ + { + ID: "modelperm-olla-" + strings.ReplaceAll(modelID, "/", "-"), + Object: "model_permission", + Created: now, + AllowCreateEngine: false, + AllowSampling: true, + AllowLogprobs: true, + AllowSearchIndices: false, + AllowView: true, + AllowFineTuning: false, + Organization: "*", + IsBlocking: false, + }, + } + + return m +} + +func (c *LMDeployConverter) findLMDeployNativeName(model *domain.UnifiedModel) string { + alias, found := c.BaseConverter.FindProviderAlias(model) + if found { + return alias + } + return "" +} + +// determineOwner extracts the organisation from org/model-name style IDs, +// defaulting to "lmdeploy" when there is no slash. 
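+// For example, "internlm/internlm2_5-7b-chat" yields owner "internlm",
+// while "simple-model" falls back to the provider default "lmdeploy".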
+func (c *LMDeployConverter) determineOwner(modelID string) string { + if parts := strings.SplitN(modelID, "/", 2); len(parts) == 2 { + return parts[0] + } + return constants.ProviderTypeLMDeploy +} diff --git a/internal/adapter/converter/lmdeploy_converter_test.go b/internal/adapter/converter/lmdeploy_converter_test.go new file mode 100644 index 0000000..8ef5503 --- /dev/null +++ b/internal/adapter/converter/lmdeploy_converter_test.go @@ -0,0 +1,150 @@ +package converter + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/thushan/olla/internal/adapter/registry/profile" + "github.com/thushan/olla/internal/core/constants" + "github.com/thushan/olla/internal/core/domain" + "github.com/thushan/olla/internal/core/ports" +) + +func TestNewLMDeployConverter(t *testing.T) { + t.Parallel() + + c := NewLMDeployConverter() + assert.NotNil(t, c) + assert.Equal(t, constants.ProviderTypeLMDeploy, c.GetFormatName()) +} + +func TestLMDeployConverter_ConvertToFormat_Empty(t *testing.T) { + t.Parallel() + + c := NewLMDeployConverter() + result, err := c.ConvertToFormat([]*domain.UnifiedModel{}, ports.ModelFilters{}) + + require.NoError(t, err) + resp, ok := result.(profile.LMDeployResponse) + require.True(t, ok) + assert.Equal(t, "list", resp.Object) + assert.Empty(t, resp.Data) +} + +func TestLMDeployConverter_ConvertToFormat_SingleModel(t *testing.T) { + t.Parallel() + + c := NewLMDeployConverter() + + model := &domain.UnifiedModel{ + ID: "internlm/internlm2_5-7b-chat", + Aliases: []domain.AliasEntry{ + { + Name: "internlm/internlm2_5-7b-chat", + Source: constants.ProviderTypeLMDeploy, + }, + }, + } + + result, err := c.ConvertToFormat([]*domain.UnifiedModel{model}, ports.ModelFilters{}) + + require.NoError(t, err) + resp, ok := result.(profile.LMDeployResponse) + require.True(t, ok) + require.Len(t, resp.Data, 1) + + m := resp.Data[0] + assert.Equal(t, "internlm/internlm2_5-7b-chat", m.ID) + assert.Equal(t, "model", m.Object) + assert.NotZero(t, m.Created) + // org extracted from ID + assert.Equal(t, "internlm", m.OwnedBy) + // LMDeploy does not expose max_model_len — the field should remain zero-value + // (permissions are always generated) + require.Len(t, m.Permission, 1) + assert.True(t, m.Permission[0].AllowSampling) +} + +func TestLMDeployConverter_ConvertToFormat_NoOrgInID(t *testing.T) { + t.Parallel() + + c := NewLMDeployConverter() + + model := &domain.UnifiedModel{ + ID: "simple-model", + } + + result, err := c.ConvertToFormat([]*domain.UnifiedModel{model}, ports.ModelFilters{}) + + require.NoError(t, err) + resp, ok := result.(profile.LMDeployResponse) + require.True(t, ok) + require.Len(t, resp.Data, 1) + // Default owner when there is no org/model-name slash + assert.Equal(t, constants.ProviderTypeLMDeploy, resp.Data[0].OwnedBy) +} + +func TestLMDeployConverter_ConvertToFormat_MultipleModels(t *testing.T) { + t.Parallel() + + c := NewLMDeployConverter() + + models := []*domain.UnifiedModel{ + { + ID: "internlm/internlm2_5-7b-chat", + Aliases: []domain.AliasEntry{ + {Name: "internlm/internlm2_5-7b-chat", Source: constants.ProviderTypeLMDeploy}, + }, + }, + { + ID: "meta-llama/Meta-Llama-3.1-8B-Instruct", + Aliases: []domain.AliasEntry{ + {Name: "meta-llama/Meta-Llama-3.1-8B-Instruct", Source: constants.ProviderTypeLMDeploy}, + }, + }, + } + + result, err := c.ConvertToFormat(models, ports.ModelFilters{}) + + require.NoError(t, err) + resp, ok := result.(profile.LMDeployResponse) + require.True(t, ok) + assert.Equal(t, 
"list", resp.Object) + require.Len(t, resp.Data, 2) + assert.Equal(t, "internlm/internlm2_5-7b-chat", resp.Data[0].ID) + assert.Equal(t, "meta-llama/Meta-Llama-3.1-8B-Instruct", resp.Data[1].ID) +} + +func TestLMDeployConverter_FallbackToAliasOrID(t *testing.T) { + t.Parallel() + + c := NewLMDeployConverter() + + // No LMDeploy-sourced alias — should fall back to first alias + modelWithOtherAlias := &domain.UnifiedModel{ + ID: "fallback-id", + Aliases: []domain.AliasEntry{ + {Name: "alias-from-ollama", Source: constants.ProviderTypeOllama}, + }, + } + + result, err := c.ConvertToFormat([]*domain.UnifiedModel{modelWithOtherAlias}, ports.ModelFilters{}) + require.NoError(t, err) + resp, ok := result.(profile.LMDeployResponse) + require.True(t, ok) + require.Len(t, resp.Data, 1) + assert.Equal(t, "alias-from-ollama", resp.Data[0].ID) + + // No aliases at all — should use unified ID + modelWithNoAlias := &domain.UnifiedModel{ + ID: "bare-id", + } + + result2, err2 := c.ConvertToFormat([]*domain.UnifiedModel{modelWithNoAlias}, ports.ModelFilters{}) + require.NoError(t, err2) + resp2, ok2 := result2.(profile.LMDeployResponse) + require.True(t, ok2) + require.Len(t, resp2.Data, 1) + assert.Equal(t, "bare-id", resp2.Data[0].ID) +} diff --git a/internal/adapter/registry/profile/lmdeploy.go b/internal/adapter/registry/profile/lmdeploy.go new file mode 100644 index 0000000..63ce734 --- /dev/null +++ b/internal/adapter/registry/profile/lmdeploy.go @@ -0,0 +1,38 @@ +package profile + +// LMDeployResponse is the response structure from LMDeploy /v1/models endpoint. +// The shape follows the OpenAI ModelList format but with LMDeploy-specific field +// values — notably owned_by defaults to "lmdeploy" and there is no max_model_len. +type LMDeployResponse struct { + Object string `json:"object"` + Data []LMDeployModel `json:"data"` +} + +// LMDeployModel represents a single model entry in the LMDeploy /v1/models response. +// Key difference from vLLM: no max_model_len field. Key difference from vLLM/SGLang: +// owned_by defaults to "lmdeploy" (not "vllm" or "sglang"). +type LMDeployModel struct { + Root *string `json:"root,omitempty"` + Parent *string `json:"parent,omitempty"` + ID string `json:"id"` + Object string `json:"object"` + OwnedBy string `json:"owned_by"` + Permission []LMDeployModelPermission `json:"permission,omitempty"` + Created int64 `json:"created"` +} + +// LMDeployModelPermission mirrors the OpenAI permission shape that LMDeploy exposes. 
+type LMDeployModelPermission struct {
+	Group              *string `json:"group"`
+	ID                 string  `json:"id"`
+	Object             string  `json:"object"`
+	Organization       string  `json:"organization"`
+	Created            int64   `json:"created"`
+	AllowCreateEngine  bool    `json:"allow_create_engine"`
+	AllowSampling      bool    `json:"allow_sampling"`
+	AllowLogprobs      bool    `json:"allow_logprobs"`
+	AllowSearchIndices bool    `json:"allow_search_indices"`
+	AllowView          bool    `json:"allow_view"`
+	AllowFineTuning    bool    `json:"allow_fine_tuning"`
+	IsBlocking         bool    `json:"is_blocking"`
+}
diff --git a/internal/adapter/registry/profile/lmdeploy_parser.go b/internal/adapter/registry/profile/lmdeploy_parser.go
new file mode 100644
index 0000000..fce8757
--- /dev/null
+++ b/internal/adapter/registry/profile/lmdeploy_parser.go
@@ -0,0 +1,65 @@
+package profile
+
+import (
+	"encoding/json"
+	"fmt"
+	"time"
+
+	"github.com/thushan/olla/internal/core/domain"
+)
+
+type lmdeployParser struct{}
+
+func (p *lmdeployParser) Parse(data []byte) ([]*domain.ModelInfo, error) {
+	if len(data) == 0 {
+		return make([]*domain.ModelInfo, 0), nil
+	}
+
+	var response LMDeployResponse
+	if err := json.Unmarshal(data, &response); err != nil {
+		return nil, fmt.Errorf("failed to parse LMDeploy response: %w", err)
+	}
+
+	models := make([]*domain.ModelInfo, 0, len(response.Data))
+	now := time.Now()
+
+	for _, model := range response.Data {
+		if model.ID == "" {
+			continue
+		}
+
+		modelInfo := &domain.ModelInfo{
+			Name:     model.ID,
+			Type:     "lmdeploy",
+			LastSeen: now,
+		}
+
+		details := &domain.ModelDetails{}
+		hasDetails := false
+
+		if model.Created > 0 {
+			createdTime := time.Unix(model.Created, 0)
+			details.ModifiedAt = &createdTime
+			hasDetails = true
+		}
+
+		// Skip the default owned_by value to avoid storing noise.
+		if model.OwnedBy != "" && model.OwnedBy != "lmdeploy" {
+			details.Publisher = &model.OwnedBy
+			hasDetails = true
+		}
+
+		if model.Parent != nil {
+			details.ParentModel = model.Parent
+			hasDetails = true
+		}
+
+		if hasDetails {
+			modelInfo.Details = details
+		}
+
+		models = append(models, modelInfo)
+	}
+
+	return models, nil
+}
diff --git a/internal/adapter/registry/profile/lmdeploy_parser_test.go b/internal/adapter/registry/profile/lmdeploy_parser_test.go
new file mode 100644
index 0000000..3b7fbb3
--- /dev/null
+++ b/internal/adapter/registry/profile/lmdeploy_parser_test.go
@@ -0,0 +1,220 @@
+package profile
+
+import (
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestLMDeployParser_Parse(t *testing.T) {
+	t.Parallel()
+
+	parser := &lmdeployParser{}
+
+	t.Run("parses valid response with full metadata", func(t *testing.T) {
+		t.Parallel()
+
+		response := `{
+			"object": "list",
+			"data": [
+				{
+					"id": "internlm/internlm2_5-7b-chat",
+					"object": "model",
+					"created": 1754535984,
+					"owned_by": "lmdeploy",
+					"root": "internlm/internlm2_5-7b-chat",
+					"parent": null,
+					"permission": []
+				},
+				{
+					"id": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+					"object": "model",
+					"created": 1754535985,
+					"owned_by": "meta-llama",
+					"root": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+					"parent": null,
+					"permission": []
+				}
+			]
+		}`
+
+		models, err := parser.Parse([]byte(response))
+		require.NoError(t, err)
+		require.Len(t, models, 2)
+
+		internlm := models[0]
+		assert.Equal(t, "internlm/internlm2_5-7b-chat", internlm.Name)
+		assert.Equal(t, "lmdeploy", internlm.Type)
+		// owned_by "lmdeploy" is the default — publisher should not be set
+		require.NotNil(t, internlm.Details)
+		assert.Nil(t, internlm.Details.Publisher)
+
require.NotNil(t, internlm.Details.ModifiedAt) + assert.Equal(t, time.Unix(1754535984, 0), *internlm.Details.ModifiedAt) + + llama := models[1] + assert.Equal(t, "meta-llama/Meta-Llama-3.1-8B-Instruct", llama.Name) + assert.Equal(t, "lmdeploy", llama.Type) + require.NotNil(t, llama.Details) + require.NotNil(t, llama.Details.Publisher) + assert.Equal(t, "meta-llama", *llama.Details.Publisher) + }) + + t.Run("handles fine-tuned model with parent", func(t *testing.T) { + t.Parallel() + + response := `{ + "object": "list", + "data": [ + { + "id": "custom/fine-tuned-internlm", + "object": "model", + "created": 1754535986, + "owned_by": "custom-org", + "parent": "internlm/internlm2_5-7b-chat" + } + ] + }` + + models, err := parser.Parse([]byte(response)) + require.NoError(t, err) + require.Len(t, models, 1) + + model := models[0] + require.NotNil(t, model.Details) + require.NotNil(t, model.Details.ParentModel) + assert.Equal(t, "internlm/internlm2_5-7b-chat", *model.Details.ParentModel) + require.NotNil(t, model.Details.Publisher) + assert.Equal(t, "custom-org", *model.Details.Publisher) + }) + + t.Run("skips models without ID", func(t *testing.T) { + t.Parallel() + + response := `{ + "object": "list", + "data": [ + { + "object": "model", + "created": 1754535987, + "owned_by": "lmdeploy" + }, + { + "id": "valid-model", + "object": "model", + "created": 1754535988, + "owned_by": "lmdeploy" + } + ] + }` + + models, err := parser.Parse([]byte(response)) + require.NoError(t, err) + require.Len(t, models, 1) + assert.Equal(t, "valid-model", models[0].Name) + }) + + t.Run("handles empty response bytes", func(t *testing.T) { + t.Parallel() + + models, err := parser.Parse([]byte{}) + require.NoError(t, err) + assert.Empty(t, models) + }) + + t.Run("handles empty data array", func(t *testing.T) { + t.Parallel() + + response := `{"object": "list", "data": []}` + models, err := parser.Parse([]byte(response)) + require.NoError(t, err) + assert.Empty(t, models) + }) + + t.Run("returns error for malformed JSON", func(t *testing.T) { + t.Parallel() + + invalidJSON := `{"object": "list", "data": [{"id": "m", invalid}]}` + models, err := parser.Parse([]byte(invalidJSON)) + assert.Error(t, err) + assert.Contains(t, err.Error(), "failed to parse LMDeploy response") + assert.Nil(t, models) + }) + + t.Run("details nil when no metadata beyond default owned_by", func(t *testing.T) { + t.Parallel() + + response := `{ + "object": "list", + "data": [ + { + "id": "simple-model", + "object": "model", + "created": 0, + "owned_by": "lmdeploy" + } + ] + }` + + models, err := parser.Parse([]byte(response)) + require.NoError(t, err) + require.Len(t, models, 1) + assert.Equal(t, "simple-model", models[0].Name) + // No created timestamp and owned_by is the default — details should be nil + assert.Nil(t, models[0].Details) + }) + + t.Run("no max_model_len field — LMDeploy differs from vLLM here", func(t *testing.T) { + t.Parallel() + + // LMDeploy /v1/models does not include max_model_len; verify we tolerate + // the field being absent (and don't panic or error if it somehow appears). 
+ response := `{ + "object": "list", + "data": [ + { + "id": "qwen/Qwen2-7B-Instruct", + "object": "model", + "created": 1754535990, + "owned_by": "lmdeploy" + } + ] + }` + + models, err := parser.Parse([]byte(response)) + require.NoError(t, err) + require.Len(t, models, 1) + model := models[0] + assert.Equal(t, "qwen/Qwen2-7B-Instruct", model.Name) + // MaxContextLength is never populated from the wire response for LMDeploy + if model.Details != nil { + assert.Nil(t, model.Details.MaxContextLength) + } + }) + + t.Run("preserves LastSeen timestamp", func(t *testing.T) { + t.Parallel() + + response := `{ + "object": "list", + "data": [ + { + "id": "test-model", + "object": "model", + "created": 1754535991, + "owned_by": "lmdeploy" + } + ] + }` + + before := time.Now() + models, err := parser.Parse([]byte(response)) + after := time.Now() + + require.NoError(t, err) + require.Len(t, models, 1) + assert.True(t, !models[0].LastSeen.Before(before)) + assert.True(t, !models[0].LastSeen.After(after)) + }) +} diff --git a/internal/adapter/registry/profile/parsers.go b/internal/adapter/registry/profile/parsers.go index 501fc78..ea91e53 100644 --- a/internal/adapter/registry/profile/parsers.go +++ b/internal/adapter/registry/profile/parsers.go @@ -24,6 +24,8 @@ func getParserForFormat(format string) ModelResponseParser { return &lemonadeParser{} case constants.ProviderPrefixLMStudio1: return &lmStudioParser{} + case constants.ProviderTypeLMDeploy: + return &lmdeployParser{} case constants.ProviderTypeSGLang: return &sglangParser{} case constants.ProviderTypeVLLM: diff --git a/internal/app/handlers/handler_common.go b/internal/app/handlers/handler_common.go index ac6fb8d..1dccc62 100644 --- a/internal/app/handlers/handler_common.go +++ b/internal/app/handlers/handler_common.go @@ -80,6 +80,7 @@ func (a *Application) isProviderSupported(provider string) bool { // this ensures consistency between validation and route registration staticProviders := map[string]bool{ constants.ProviderTypeLemonade: true, + constants.ProviderTypeLMDeploy: true, constants.ProviderTypeLMStudio: true, constants.ProviderTypeOllama: true, constants.ProviderTypeOpenAI: true, diff --git a/internal/app/handlers/server_routes.go b/internal/app/handlers/server_routes.go index cd8d8d4..8e5cd05 100644 --- a/internal/app/handlers/server_routes.go +++ b/internal/app/handlers/server_routes.go @@ -315,6 +315,13 @@ func getStaticProviders(a *Application) map[string]staticProvider { {path: "", handler: a.providerProxyHandler, description: "OpenAI-compatible proxy", isProxy: true}, }, }, + constants.ProviderTypeLMDeploy: { + prefixes: []string{constants.ProviderTypeLMDeploy}, + routes: []staticRoute{ + {path: "v1/models", handler: a.genericProviderModelsHandler(constants.ProviderTypeLMDeploy, constants.ProviderTypeOpenAI), description: "LMDeploy models (OpenAI format)", method: "GET"}, + {path: "", handler: a.providerProxyHandler, description: "LMDeploy proxy", isProxy: true}, + }, + }, constants.ProviderTypeSGLang: { prefixes: []string{constants.ProviderTypeSGLang}, routes: []staticRoute{ diff --git a/internal/core/constants/providers.go b/internal/core/constants/providers.go index 1696ab1..32ad4b1 100644 --- a/internal/core/constants/providers.go +++ b/internal/core/constants/providers.go @@ -7,6 +7,7 @@ const ( ProviderTypeLMStudio = "lm-studio" ProviderTypeOpenAI = "openai" ProviderTypeOpenAICompat = "openai-compatible" + ProviderTypeLMDeploy = "lmdeploy" ProviderTypeSGLang = "sglang" ProviderTypeVLLM = "vllm" ProviderTypeVLLMMLX = 
"vllm-mlx" @@ -18,6 +19,7 @@ const ( ProviderDisplayLlamaCpp = "llama.cpp" ProviderDisplayLMStudio = "LM Studio" ProviderDisplayOpenAI = "OpenAI" + ProviderDisplayLMDeploy = "LMDeploy" ProviderDisplaySGLang = "SGLang" ProviderDisplayVLLM = "vLLM" ProviderDisplayVLLMMLX = "vLLM-MLX" diff --git a/internal/core/domain/profile.go b/internal/core/domain/profile.go index 47e0f89..17420ad 100644 --- a/internal/core/domain/profile.go +++ b/internal/core/domain/profile.go @@ -5,6 +5,7 @@ const ( ProfileLemonade = "lemonade" ProfileLlamaCpp = "llamacpp" ProfileLmStudio = "lm-studio" + ProfileLMDeploy = "lmdeploy" ProfileSGLang = "sglang" ProfileVLLM = "vllm" ProfileVLLMMLX = "vllm-mlx" diff --git a/internal/version/version.go b/internal/version/version.go index 31e9240..041bec1 100644 --- a/internal/version/version.go +++ b/internal/version/version.go @@ -37,6 +37,7 @@ var ( "lemonade", "litellm", "llamacpp", + "lmdeploy", "lm_studio", "sglang", "vllm", diff --git a/readme.md b/readme.md index f23e176..7d2497b 100644 --- a/readme.md +++ b/readme.md @@ -10,7 +10,7 @@ vLLM: Native Support SGLang: Native Support LiteLLM: Native Support - LM Deploy: OpenAI Compatible
+ LMDeploy: Native Support
vLLM-MLX: Native Support Docker Model Runner: Native Support
Ollama: Native Support