diff --git a/config/profiles/lmdeploy.yaml b/config/profiles/lmdeploy.yaml
new file mode 100644
index 0000000..78d3356
--- /dev/null
+++ b/config/profiles/lmdeploy.yaml
@@ -0,0 +1,173 @@
+# LMDeploy inference platform profile
+name: lmdeploy
+home: "https://github.com/InternLM/lmdeploy"
+version: "1.0"
+display_name: "LMDeploy"
+description: "LMDeploy efficient inference and serving for LLMs and VLMs"
+
+# Routing configuration
+routing:
+ prefixes:
+ - lmdeploy
+
+# API compatibility
+api:
+ openai_compatible: true
+ paths:
+ # Health and system endpoints
+ - /health # 0: health check (HTTP 200, empty body)
+
+ # Model management
+ - /v1/models # 1: list models (OpenAI-compatible ModelList)
+
+ # Text generation endpoints (OpenAI-compatible)
+ - /v1/chat/completions # 2: chat completions with SSE streaming
+ - /v1/completions # 3: text completions
+
+ # LMDeploy-specific endpoints
+ - /v1/encode # 4: token encoding (LMDeploy-specific)
+ - /generate # 5: native generation endpoint
+ - /pooling # 6: reward/score pooling (the real embeddings path)
+ # NOTE: /v1/embeddings is intentionally excluded — LMDeploy returns HTTP 400
+ # unconditionally for all backends on that path.
+
+ # Sleep state probe (LMDeploy-specific, used for auto-detection)
+ - /is_sleeping # 7: sleep state probe
+
+ model_discovery_path: /v1/models
+ health_check_path: /health
+
+# Platform characteristics
+characteristics:
+ timeout: 2m
+ max_concurrent_requests: 100
+ default_priority: 82 # between SGLang (85) and vLLM (80)
+ streaming_support: true
+
+# Detection hints for auto-discovery
+detection:
+ path_indicators:
+ - "/v1/encode" # LMDeploy-specific token encoding
+ - "/generate" # LMDeploy native generation
+ - "/pooling" # LMDeploy reward/score path
+ - "/is_sleeping" # distinct from vLLM and SGLang
+ default_ports:
+ - 23333 # api_server default (not 8000 which is proxy_server)
+
+# Request/response handling
+request:
+ model_field_paths:
+ - "model"
+ response_format: "lmdeploy"
+ parsing_rules:
+ chat_completions_path: "/v1/chat/completions"
+ completions_path: "/v1/completions"
+ model_field_name: "model"
+ supports_streaming: true
+
+# Path indices for specific functions
+path_indices:
+ health: 0
+ models: 1
+ chat_completions: 2
+ completions: 3
+
+# Model handling
+models:
+ name_format: "{{.Name}}"
+ capability_patterns:
+ chat:
+ - "*-Chat-*"
+ - "*-Instruct*"
+ - "*-chat-*"
+ vision:
+ - "*vision*"
+ - "*llava*"
+ - "*VL*"
+ code:
+ - "*code*"
+ - "*Code*"
+ # Context window patterns for common LMDeploy models
+ context_patterns:
+ - pattern: "*llama-3.1*"
+ context: 131072
+ - pattern: "*llama-3*"
+ context: 8192
+ - pattern: "*internlm2_5*"
+ context: 32768
+ - pattern: "*internlm2*"
+ context: 32768
+ - pattern: "*mistral*"
+ context: 32768
+ - pattern: "*qwen2*"
+ context: 32768
+
+# Resource management
+resources:
+ model_sizes:
+ - patterns: ["*70b*", "*72b*"]
+ min_memory_gb: 140
+ recommended_memory_gb: 160
+ min_gpu_memory_gb: 140
+ estimated_load_time_ms: 60000
+ - patterns: ["*34b*", "*33b*", "*30b*"]
+ min_memory_gb: 70
+ recommended_memory_gb: 80
+ min_gpu_memory_gb: 70
+ estimated_load_time_ms: 45000
+ - patterns: ["*13b*", "*14b*"]
+ min_memory_gb: 30
+ recommended_memory_gb: 40
+ min_gpu_memory_gb: 30
+ estimated_load_time_ms: 30000
+ - patterns: ["*7b*", "*8b*"]
+ min_memory_gb: 16
+ recommended_memory_gb: 24
+ min_gpu_memory_gb: 16
+ estimated_load_time_ms: 20000
+ - patterns: ["*3b*"]
+ min_memory_gb: 8
+ recommended_memory_gb: 12
+ min_gpu_memory_gb: 8
+ estimated_load_time_ms: 15000
+ - patterns: ["*1b*", "*1.1b*", "*1.5b*"]
+ min_memory_gb: 4
+ recommended_memory_gb: 8
+ min_gpu_memory_gb: 4
+ estimated_load_time_ms: 10000
+
+ defaults:
+ min_memory_gb: 8
+ recommended_memory_gb: 16
+ min_gpu_memory_gb: 8
+ requires_gpu: true
+ estimated_load_time_ms: 30000
+
+ concurrency_limits:
+ - min_memory_gb: 100
+ max_concurrent: 10
+ - min_memory_gb: 50
+ max_concurrent: 20
+ - min_memory_gb: 20
+ max_concurrent: 50
+ - min_memory_gb: 0
+ max_concurrent: 100
+
+ timeout_scaling:
+ base_timeout_seconds: 120
+ load_time_buffer: true
+
+# Metrics extraction for LMDeploy responses
+metrics:
+ extraction:
+ enabled: true
+ source: response_body
+ format: json
+ paths:
+ model: "$.model"
+ finish_reason: "$.choices[0].finish_reason"
+ input_tokens: "$.usage.prompt_tokens"
+ output_tokens: "$.usage.completion_tokens"
+ total_tokens: "$.usage.total_tokens"
+ calculations:
+ is_complete: 'len(finish_reason) > 0'
diff --git a/docs/content/api-reference/lmdeploy.md b/docs/content/api-reference/lmdeploy.md
new file mode 100644
index 0000000..5c3158d
--- /dev/null
+++ b/docs/content/api-reference/lmdeploy.md
@@ -0,0 +1,313 @@
+# LMDeploy API
+
+Proxy endpoints for LMDeploy inference servers. Available through the `/olla/lmdeploy/` prefix.
+
+## Endpoints Overview
+
+| Method | URI | Description |
+|--------|-----|-------------|
+| GET | `/olla/lmdeploy/health` | Health check |
+| GET | `/olla/lmdeploy/v1/models` | List available models |
+| POST | `/olla/lmdeploy/v1/chat/completions` | Chat completion |
+| POST | `/olla/lmdeploy/v1/completions` | Text completion |
+| POST | `/olla/lmdeploy/v1/encode` | Token encoding |
+| POST | `/olla/lmdeploy/generate` | Native generation |
+| POST | `/olla/lmdeploy/pooling` | Reward/score pooling |
+| GET | `/olla/lmdeploy/is_sleeping` | Sleep state probe |
+
+!!! warning "/v1/embeddings is not supported"
+ LMDeploy returns HTTP 400 on `/v1/embeddings` unconditionally across all backends. Use `/pooling` for reward/score tasks instead.
+
+---
+
+## GET /olla/lmdeploy/health
+
+Check LMDeploy server health.
+
+### Request
+
+```bash
+curl http://localhost:40114/olla/lmdeploy/health
+```
+
+### Response
+
+```json
+{}
+```
+
+A healthy server responds with HTTP 200; the body is empty (rendered above as an empty JSON object).
+
+---
+
+## GET /olla/lmdeploy/v1/models
+
+List models available on the LMDeploy server.
+
+### Request
+
+```bash
+curl http://localhost:40114/olla/lmdeploy/v1/models
+```
+
+### Response
+
+```json
+{
+ "object": "list",
+ "data": [
+ {
+ "id": "internlm/internlm2_5-7b-chat",
+ "object": "model",
+ "created": 1705334400,
+ "owned_by": "lmdeploy",
+ "root": "internlm/internlm2_5-7b-chat",
+ "permission": []
+ }
+ ]
+}
+```
+
+---
+
+## POST /olla/lmdeploy/v1/chat/completions
+
+OpenAI-compatible chat completion.
+
+### Request
+
+```bash
+curl -X POST http://localhost:40114/olla/lmdeploy/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "internlm/internlm2_5-7b-chat",
+ "messages": [
+ {"role": "user", "content": "Explain the TurboMind engine in one paragraph"}
+ ],
+ "temperature": 0.7,
+ "max_tokens": 200,
+ "stream": false
+ }'
+```
+
+### Response
+
+```json
+{
+ "id": "chatcmpl-lmdeploy-abc123",
+ "object": "chat.completion",
+ "created": 1705334400,
+ "model": "internlm/internlm2_5-7b-chat",
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "role": "assistant",
+ "content": "TurboMind is LMDeploy's C++/CUDA inference engine..."
+ },
+ "finish_reason": "stop"
+ }
+ ],
+ "usage": {
+ "prompt_tokens": 18,
+ "completion_tokens": 64,
+ "total_tokens": 82
+ }
+}
+```
+
+### Streaming Response
+
+When `"stream": true`:
+
+```text
+data: {"id":"chatcmpl-lmdeploy-abc123","object":"chat.completion.chunk","created":1705334400,"model":"internlm/internlm2_5-7b-chat","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}
+
+data: {"id":"chatcmpl-lmdeploy-abc123","object":"chat.completion.chunk","created":1705334400,"model":"internlm/internlm2_5-7b-chat","choices":[{"index":0,"delta":{"content":"TurboMind"},"finish_reason":null}]}
+
+...
+
+data: [DONE]
+```
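+
+A minimal Python sketch for consuming this stream through the OpenAI SDK is shown below; it assumes Olla's default port 40114 and the model used above, and simply prints content deltas as they arrive.
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    base_url="http://localhost:40114/olla/lmdeploy/v1",
+    api_key="not-needed",  # placeholder unless --api-keys is set on the server
+)
+
+stream = client.chat.completions.create(
+    model="internlm/internlm2_5-7b-chat",
+    messages=[{"role": "user", "content": "Explain the TurboMind engine in one paragraph"}],
+    stream=True,
+)
+
+for chunk in stream:
+    delta = chunk.choices[0].delta.content
+    if delta:
+        print(delta, end="", flush=True)
+```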
+
+---
+
+## POST /olla/lmdeploy/v1/completions
+
+Text completion (OpenAI-compatible).
+
+### Request
+
+```bash
+curl -X POST http://localhost:40114/olla/lmdeploy/v1/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "internlm/internlm2_5-7b-chat",
+ "prompt": "LMDeploy uses TurboMind because",
+ "max_tokens": 100,
+ "temperature": 0.8,
+ "stream": false
+ }'
+```
+
+### Response
+
+```json
+{
+ "id": "cmpl-lmdeploy-xyz789",
+ "object": "text_completion",
+ "created": 1705334400,
+ "model": "internlm/internlm2_5-7b-chat",
+ "choices": [
+ {
+ "text": " it provides efficient GPU utilisation through continuous batching...",
+ "index": 0,
+ "finish_reason": "stop"
+ }
+ ],
+ "usage": {
+ "prompt_tokens": 9,
+ "completion_tokens": 55,
+ "total_tokens": 64
+ }
+}
+```
+
+---
+
+## POST /olla/lmdeploy/v1/encode
+
+Token encoding (LMDeploy-specific). Encodes input text to token IDs without running inference.
+
+### Request
+
+```bash
+curl -X POST http://localhost:40114/olla/lmdeploy/v1/encode \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "internlm/internlm2_5-7b-chat",
+ "input": "Hello, world!"
+ }'
+```
+
+### Response
+
+```json
+{
+ "input_ids": [1, 22172, 29892, 3186, 29991],
+ "length": 5
+}
+```
+
+---
+
+## POST /olla/lmdeploy/generate
+
+LMDeploy's native generation endpoint.
+
+### Request
+
+```bash
+curl -X POST http://localhost:40114/olla/lmdeploy/generate \
+ -H "Content-Type: application/json" \
+ -d '{
+ "inputs": "def fibonacci(n):\n ",
+ "parameters": {
+ "temperature": 0.2,
+ "max_new_tokens": 100
+ }
+ }'
+```
+
+### Response
+
+```json
+{
+ "generated_text": "def fibonacci(n):\n if n <= 1:\n return n\n return fibonacci(n-1) + fibonacci(n-2)",
+ "finish_reason": "stop"
+}
+```
+
+---
+
+## POST /olla/lmdeploy/pooling
+
+Reward or score pooling for embedding-style tasks. This is the correct path for pooling operations — `/v1/embeddings` is not supported.
+
+### Request
+
+```bash
+curl -X POST http://localhost:40114/olla/lmdeploy/pooling \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "internlm/internlm2_5-7b-chat",
+ "input": "The quick brown fox"
+ }'
+```
+
+### Response
+
+```json
+{
+ "object": "list",
+ "data": [
+ {
+ "object": "embedding",
+ "index": 0,
+ "embedding": [0.0123, -0.0456, 0.0789, ...]
+ }
+ ],
+ "model": "internlm/internlm2_5-7b-chat",
+ "usage": {
+ "prompt_tokens": 5,
+ "total_tokens": 5
+ }
+}
+```
+
+---
+
+## GET /olla/lmdeploy/is_sleeping
+
+Probe whether the LMDeploy engine is in sleep mode. Sleeping instances return HTTP 503 on generation endpoints — Olla's health checker treats this as a transient failure rather than a hard outage.
+
+### Request
+
+```bash
+curl http://localhost:40114/olla/lmdeploy/is_sleeping
+```
+
+### Response
+
+```json
+{"is_sleeping": false}
+```
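+
+Clients that target a single LMDeploy instance can poll this probe before dispatching work rather than retrying 503s blindly. A small illustrative Python helper (the `wait_until_awake` name, timeout and poll interval are assumptions, not part of Olla or LMDeploy):
+
+```python
+import time
+
+import requests
+
+OLLA = "http://localhost:40114/olla/lmdeploy"
+
+
+def wait_until_awake(timeout_s: float = 30.0, poll_s: float = 1.0) -> bool:
+    """Poll the sleep-state probe until the engine reports it is awake."""
+    deadline = time.monotonic() + timeout_s
+    while time.monotonic() < deadline:
+        resp = requests.get(f"{OLLA}/is_sleeping", timeout=5)
+        if resp.ok and not resp.json().get("is_sleeping", False):
+            return True
+        time.sleep(poll_s)
+    return False
+```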
+
+---
+
+## Configuration Example
+
+```yaml
+discovery:
+ static:
+ endpoints:
+ - url: "http://192.168.0.100:23333"
+ name: "lmdeploy-server"
+ type: "lmdeploy"
+ priority: 82
+ model_url: "/v1/models"
+ health_check_url: "/health"
+ check_interval: 5s
+ check_timeout: 2s
+```
+
+The default port for `lmdeploy serve api_server` is **23333**. The `proxy_server` component runs on 8000 but does not expose `/health` and is not supported by Olla.
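+
+A quick way to confirm a URL points at an `api_server` instance (and not the `proxy_server`) is to probe `/health` before registering it. A small illustrative check in Python (the helper name and 2-second timeout are assumptions):
+
+```python
+import requests
+
+
+def looks_like_api_server(base_url: str) -> bool:
+    """Return True when the URL exposes LMDeploy's /health endpoint."""
+    try:
+        return requests.get(f"{base_url}/health", timeout=2).status_code == 200
+    except requests.RequestException:
+        return False
+
+
+print(looks_like_api_server("http://192.168.0.100:23333"))
+```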
+
+## Response Headers
+
+All responses include:
+
+- `X-Olla-Endpoint` - Backend endpoint name (e.g., `lmdeploy-server`)
+- `X-Olla-Model` - Model used for the request
+- `X-Olla-Backend-Type` - Always `lmdeploy` for these endpoints
+- `X-Olla-Response-Time` - Total processing time
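+
+These headers are visible from any HTTP client. A short Python example reading them off a proxied chat completion (the model name is illustrative):
+
+```python
+import requests
+
+resp = requests.post(
+    "http://localhost:40114/olla/lmdeploy/v1/chat/completions",
+    json={
+        "model": "internlm/internlm2_5-7b-chat",
+        "messages": [{"role": "user", "content": "Hello"}],
+        "max_tokens": 10,
+    },
+    timeout=60,
+)
+
+print(resp.headers.get("X-Olla-Endpoint"))
+print(resp.headers.get("X-Olla-Model"))
+print(resp.headers.get("X-Olla-Backend-Type"))  # "lmdeploy"
+print(resp.headers.get("X-Olla-Response-Time"))
+```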
diff --git a/docs/content/api-reference/overview.md b/docs/content/api-reference/overview.md
index 242083d..d9a0db6 100644
--- a/docs/content/api-reference/overview.md
+++ b/docs/content/api-reference/overview.md
@@ -52,6 +52,12 @@ Proxy endpoints for OpenAI-compatible services.
- `/olla/openai/*` - OpenAI API endpoints
+### [LMDeploy API](lmdeploy.md)
+Proxy endpoints for LMDeploy inference servers.
+
+- `/olla/lmdeploy/*` - LMDeploy API endpoints
+- OpenAI-compatible endpoints plus token encoding and reward pooling
+
### [vLLM API](vllm.md)
Proxy endpoints for vLLM servers.
@@ -147,7 +153,7 @@ All responses include:
| `X-Olla-Request-ID` | Unique request identifier |
| `X-Olla-Endpoint` | Backend endpoint name |
| `X-Olla-Model` | Model used (if applicable) |
-| `X-Olla-Backend-Type` | Provider type, examples: `ollama/lm-studio/llamacpp/openai/openai-compatible/vllm/sglang/lemonade` |
+| `X-Olla-Backend-Type` | Provider type, examples: `ollama/lm-studio/llamacpp/openai/openai-compatible/vllm/sglang/lemonade/lmdeploy` |
| `X-Olla-Response-Time` | Total processing time |
| `X-Olla-Routing-Strategy` | Routing strategy used (when model routing is active) |
| `X-Olla-Routing-Decision` | Routing decision made (routed/fallback/rejected) |
diff --git a/docs/content/concepts/overview.md b/docs/content/concepts/overview.md
index ed0f8ec..3726a46 100644
--- a/docs/content/concepts/overview.md
+++ b/docs/content/concepts/overview.md
@@ -66,7 +66,7 @@ Profiles optimise response handling for specific workload patterns.
### [Profile System](profile-system.md)
Provider-specific configuration templates:
-- Pre-configured profiles for Ollama, LM Studio, vLLM, SGLang, Lemonade SDK, LiteLLM, OpenAI
+- Pre-configured profiles for Ollama, LM Studio, LMDeploy, vLLM, SGLang, Lemonade SDK, LiteLLM, OpenAI
- Custom header mappings and endpoint patterns
- Model format converters for each provider
diff --git a/docs/content/faq.md b/docs/content/faq.md
index 036f589..ef1d6d5 100644
--- a/docs/content/faq.md
+++ b/docs/content/faq.md
@@ -10,7 +10,7 @@ keywords: olla faq, troubleshooting, common questions, proxy help
### What is Olla?
-Olla is a high-performance proxy and load balancer specifically designed for LLM infrastructure. It intelligently routes requests across multiple LLM backends (Ollama, LM Studio, vLLM, SGLang, Lemonade SDK, LiteLLM, and OpenAI-compatible endpoints) while providing load balancing, health checking, and unified model management.
+Olla is a high-performance proxy and load balancer specifically designed for LLM infrastructure. It intelligently routes requests across multiple LLM backends (Ollama, LM Studio, LMDeploy, vLLM, SGLang, Lemonade SDK, LiteLLM, and OpenAI-compatible endpoints) while providing load balancing, health checking, and unified model management.
See how Olla compares to [other tools](compare/overview.md) in the ecosystem.
diff --git a/docs/content/index.md b/docs/content/index.md
index dc9f6bd..7c918c1 100644
--- a/docs/content/index.md
+++ b/docs/content/index.md
@@ -16,7 +16,7 @@ keywords: llm proxy, ollama proxy, lm studio proxy, vllm proxy, sglang, lemonade
-
+

@@ -97,7 +97,7 @@ Olla provides detailed response headers for observability:
|--------|-------------|
| `X-Olla-Endpoint` | Backend endpoint name |
| `X-Olla-Model` | Model used for the request |
-| `X-Olla-Backend-Type` | Backend type (ollama/openai/openai-compatible/lm-studio/llamacpp/vllm/sglang/lemonade) |
+| `X-Olla-Backend-Type` | Backend type (ollama/openai/openai-compatible/lm-studio/llamacpp/vllm/sglang/lemonade/lmdeploy) |
| `X-Olla-Request-ID` | Unique request identifier |
| `X-Olla-Response-Time` | Total processing time |
diff --git a/docs/content/integrations/backend/lmdeploy.md b/docs/content/integrations/backend/lmdeploy.md
new file mode 100644
index 0000000..afcbfd6
--- /dev/null
+++ b/docs/content/integrations/backend/lmdeploy.md
@@ -0,0 +1,377 @@
+---
+title: LMDeploy Integration - High-Performance LLM Inference with Olla
+description: Configure LMDeploy with Olla proxy for efficient LLM serving. TurboMind engine, OpenAI-compatible API, VLM support, and GPU-optimised inference from InternLM.
+keywords: LMDeploy, Olla proxy, TurboMind, InternLM, LLM inference, GPU optimisation, high throughput
+---
+
+# LMDeploy Integration
+
+| Field | Details |
+|-------|---------|
+| Home | [github.com/InternLM/lmdeploy](https://github.com/InternLM/lmdeploy) |
+| Since | Olla v0.0.21 |
+| Type | `lmdeploy` (use in endpoint configuration) |
+| Profile | `lmdeploy.yaml` (see latest) |
+| Features | Proxy Forwarding, Health Check (native), Model Unification, Model Detection & Normalisation, OpenAI API Compatibility, Token Encoding API, Reward/Score Pooling, VLM Inference (same `api_server`) |
+| Unsupported | `/v1/embeddings` (returns HTTP 400, use `/pooling`), `proxy_server` component (no `/health` endpoint), Model Management (loading/unloading) |
+| Attributes | OpenAI Compatible, GPU Optimised (TurboMind C++/CUDA engine), Continuous Batching, VLM Support |
+| Prefixes | `/olla/lmdeploy/` |
+| Endpoints | See below |
+
+## Configuration
+
+### Basic Setup
+
+Register an LMDeploy `api_server` instance with Olla:
+
+```yaml
+discovery:
+ static:
+ endpoints:
+ - url: "http://localhost:23333"
+ name: "local-lmdeploy"
+ type: "lmdeploy"
+ priority: 82
+ model_url: "/v1/models"
+ health_check_url: "/health"
+ check_interval: 5s
+ check_timeout: 2s
+```
+
+The default port for `lmdeploy serve api_server` is **23333**. Register individual `api_server` instances directly — do not point Olla at the `proxy_server` component, which lacks a `/health` endpoint and only forwards a subset of routes.
+
+### Authentication
+
+LMDeploy supports optional Bearer-token authentication via the `--api-keys` flag. Configure the token in Olla's endpoint headers so it is forwarded on every proxied request:
+
+```yaml
+discovery:
+ static:
+ endpoints:
+ - url: "http://gpu-server:23333"
+ name: "lmdeploy-prod"
+ type: "lmdeploy"
+ priority: 82
+ health_check_url: "/health"
+ check_interval: 10s
+ check_timeout: 5s
+ headers:
+ Authorization: "Bearer ${LMDEPLOY_API_KEY}"
+```
+
+The `/health` endpoint is auth-exempt on LMDeploy, so health checks will succeed even when a key is required for inference.
+
+### Multiple Instances
+
+```yaml
+discovery:
+ static:
+ endpoints:
+ - url: "http://gpu1:23333"
+ name: "lmdeploy-1"
+ type: "lmdeploy"
+ priority: 100
+
+ - url: "http://gpu2:23333"
+ name: "lmdeploy-2"
+ type: "lmdeploy"
+ priority: 100
+
+proxy:
+ engine: "olla"
+ load_balancer: "least-connections"
+```
+
+## Endpoints Supported
+
+| Path | Description |
+|------|-------------|
+| `/health` | Health Check |
+| `/v1/models` | List Models (OpenAI format) |
+| `/v1/chat/completions` | Chat Completions (OpenAI format) |
+| `/v1/completions` | Text Completions (OpenAI format) |
+| `/v1/encode` | Token Encoding (LMDeploy-specific) |
+| `/generate` | Native Generation Endpoint |
+| `/pooling` | Reward/Score Pooling (not `/v1/embeddings`) |
+| `/is_sleeping` | Sleep State Probe |
+
+## Usage Examples
+
+### Chat Completion
+
+```bash
+curl -X POST http://localhost:40114/olla/lmdeploy/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "internlm/internlm2_5-7b-chat",
+ "messages": [
+ {"role": "system", "content": "You are a helpful assistant."},
+ {"role": "user", "content": "What is TurboMind?"}
+ ],
+ "temperature": 0.7,
+ "max_tokens": 300
+ }'
+```
+
+### Streaming
+
+```bash
+curl -X POST http://localhost:40114/olla/lmdeploy/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "internlm/internlm2_5-7b-chat",
+ "messages": [{"role": "user", "content": "Write a short story"}],
+ "stream": true,
+ "temperature": 0.8
+ }'
+```
+
+### Token Encoding
+
+```bash
+curl -X POST http://localhost:40114/olla/lmdeploy/v1/encode \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "internlm/internlm2_5-7b-chat",
+ "input": "Hello, world!"
+ }'
+```
+
+### Pooling (Reward/Score)
+
+```bash
+# Use /pooling — not /v1/embeddings (which returns HTTP 400)
+curl -X POST http://localhost:40114/olla/lmdeploy/pooling \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "internlm/internlm2_5-7b-chat",
+ "input": "The quick brown fox"
+ }'
+```
+
+## Starting LMDeploy
+
+### Basic Start
+
+```bash
+pip install lmdeploy
+
+lmdeploy serve api_server internlm/internlm2_5-7b-chat \
+ --server-port 23333
+```
+
+### TurboMind Backend (Default, GPU)
+
+```bash
+lmdeploy serve api_server internlm/internlm2_5-7b-chat \
+ --backend turbomind \
+ --server-port 23333 \
+ --tp 1
+```
+
+### PyTorch Backend
+
+Use `pytorch` when a model is not supported by TurboMind, or for CPU inference:
+
+```bash
+lmdeploy serve api_server internlm/internlm2_5-7b-chat \
+ --backend pytorch \
+ --server-port 23333
+```
+
+### With Authentication
+
+```bash
+lmdeploy serve api_server internlm/internlm2_5-7b-chat \
+ --server-port 23333 \
+ --api-keys my-secret-key
+```
+
+### VLM Inference
+
+Vision-language models use the same `api_server` entrypoint — no separate binary:
+
+```bash
+lmdeploy serve api_server InternLM/internlm-xcomposer2-7b \
+ --server-port 23333
+```
+
+### Docker
+
+```bash
+docker run --gpus all \
+ -p 23333:23333 \
+ openmmlab/lmdeploy:latest \
+ lmdeploy serve api_server internlm/internlm2_5-7b-chat \
+ --server-port 23333
+```
+
+## LMDeploy Specifics
+
+### Sleep/Wake
+
+LMDeploy supports a sleep mode to release GPU memory when idle:
+
+```bash
+# Suspend the engine (GPU memory freed)
+curl -X POST http://localhost:23333/sleep
+
+# Resume the engine
+curl -X POST http://localhost:23333/wakeup
+
+# Check state (proxied via Olla)
+curl http://localhost:40114/olla/lmdeploy/is_sleeping
+```
+
+Olla treats a sleeping engine as transiently unavailable and will route around it if other healthy instances exist. Once the engine wakes, health checks recover it automatically.
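+
+A small automation sketch that drives these endpoints directly against an instance, for example to free GPU memory overnight (the `set_sleeping` helper is illustrative; Olla plays no part in sleeping or waking the engine):
+
+```python
+import requests
+
+LMDEPLOY = "http://localhost:23333"  # the instance itself, not the Olla proxy
+
+
+def set_sleeping(sleep: bool) -> bool:
+    """Suspend or resume the engine, then confirm the state via /is_sleeping."""
+    path = "/sleep" if sleep else "/wakeup"
+    requests.post(f"{LMDEPLOY}{path}", timeout=10).raise_for_status()
+    state = requests.get(f"{LMDEPLOY}/is_sleeping", timeout=5).json()
+    return state.get("is_sleeping") == sleep
+
+
+set_sleeping(True)  # release GPU memory while idle
+```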
+
+### Embeddings vs Pooling
+
+LMDeploy does not implement `/v1/embeddings`. The correct path for reward-model scoring and embedding-style pooling is `/pooling`. This is a deliberate upstream design decision: LMDeploy exposes TurboMind's native pooling path rather than implementing the OpenAI embeddings spec.
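+
+A short Python sketch of the pooling call routed through Olla (response shape as shown in the API reference; exact fields can vary by model and backend):
+
+```python
+import requests
+
+resp = requests.post(
+    "http://localhost:40114/olla/lmdeploy/pooling",  # not /v1/embeddings
+    json={"model": "internlm/internlm2_5-7b-chat", "input": "The quick brown fox"},
+    timeout=30,
+)
+resp.raise_for_status()
+
+vector = resp.json()["data"][0]["embedding"]
+print(len(vector))
+```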
+
+### Model Naming
+
+LMDeploy serves models by their HuggingFace identifiers:
+
+- `internlm/internlm2_5-7b-chat`
+- `meta-llama/Meta-Llama-3.1-8B-Instruct`
+- `mistralai/Mistral-7B-Instruct-v0.2`
+- `Qwen/Qwen2.5-7B-Instruct`
+
+### Proxy Server vs API Server
+
+LMDeploy ships two server components:
+
+| Component | Port | Use with Olla? |
+|-----------|------|----------------|
+| `api_server` | 23333 | Yes — has `/health`, full route support |
+| `proxy_server` | 8000 | No — no `/health`, limited routes |
+
+Always register individual `api_server` instances. The `proxy_server` is LMDeploy's own load balancer and is redundant when Olla is in the stack.
+
+## Profile Customisation
+
+Create `config/profiles/lmdeploy-custom.yaml` to override defaults. See [Profile Configuration](../../concepts/profile-system.md) for the full schema.
+
+```yaml
+name: lmdeploy
+version: "1.0"
+
+# Add a shorter routing prefix
+routing:
+ prefixes:
+ - lmdeploy
+ - turbomind
+
+# Increase timeout for large 70B models
+characteristics:
+ timeout: 5m
+```
+
+## OpenAI SDK
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+ base_url="http://localhost:40114/olla/lmdeploy/v1",
+ api_key="not-needed"  # placeholder; supply the real key when --api-keys is set on lmdeploy
+)
+
+response = client.chat.completions.create(
+ model="internlm/internlm2_5-7b-chat",
+ messages=[{"role": "user", "content": "Hello!"}]
+)
+```
+
+## Next Steps
+
+- [LMDeploy API Reference](../../api-reference/lmdeploy.md) - Endpoint details and response formats
+- [Profile Configuration](../../concepts/profile-system.md) - Customise LMDeploy behaviour
+- [Load Balancing](../../concepts/load-balancing.md) - Scale across multiple LMDeploy instances
+- [Health Checking](../../concepts/health-checking.md) - Circuit breakers and failover
diff --git a/docs/content/integrations/overview.md b/docs/content/integrations/overview.md
index 0b9d045..eb3d985 100644
--- a/docs/content/integrations/overview.md
+++ b/docs/content/integrations/overview.md
@@ -17,6 +17,7 @@ Olla natively supports the following backends:
| [Ollama](./backend/ollama.md) | `ollama` | Native support for [Ollama](https://github.com/ollama/ollama), including model unification |
| [LM Studio](./backend/lmstudio.md) | `lm-studio` | Native support for [LM Studio](https://lmstudio.ai/), including model unification |
| [llama.cpp](./backend/llamacpp.md) | `llamacpp` | Native support for [llama.cpp](https://github.com/ggml-org/llama.cpp) lightweight C++ inference server with GGUF models, including slot management, code infill, and CPU-first design for edge deployment |
+| [LMDeploy](./backend/lmdeploy.md) | `lmdeploy` | Native support for [LMDeploy](https://github.com/InternLM/lmdeploy), InternLM's high-performance inference server with TurboMind C++/CUDA engine |
| [vLLM](./backend/vllm.md) | `vllm` | Native support for [vLLM](https://github.com/vllm-project/vllm), including model unification |
| [vLLM-MLX](./backend/vllm-mlx.md) | `vllm-mlx` | Native support for [vLLM-MLX](https://github.com/waybarrios/vllm-mlx), Apple Silicon inference using the MLX framework with native Anthropic Messages API |
| [SGLang](./backend/sglang.md) | `sglang` | Native support for [SGLang](https://github.com/sgl-project/sglang) with RadixAttention and Frontend Language, including model unification and vision support |
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index f8d23f6..306e59c 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -157,6 +157,7 @@ nav:
- Backends:
- Ollama: integrations/backend/ollama.md
- LM Studio: integrations/backend/lmstudio.md
+ - LMDeploy: integrations/backend/lmdeploy.md
- vLLM: integrations/backend/vllm.md
- vLLM-MLX: integrations/backend/vllm-mlx.md
- SGLang: integrations/backend/sglang.md
@@ -190,6 +191,7 @@ nav:
- Provider APIs:
- Ollama: api-reference/ollama.md
- LM Studio: api-reference/lmstudio.md
+ - LMDeploy: api-reference/lmdeploy.md
- vLLM: api-reference/vllm.md
- vLLM-MLX: api-reference/vllm-mlx.md
- SGLang: api-reference/sglang.md
diff --git a/internal/adapter/converter/factory.go b/internal/adapter/converter/factory.go
index 8c53812..b14def3 100644
--- a/internal/adapter/converter/factory.go
+++ b/internal/adapter/converter/factory.go
@@ -24,6 +24,7 @@ func NewConverterFactory() *ConverterFactory {
factory.RegisterConverter(NewOllamaConverter())
factory.RegisterConverter(NewLemonadeConverter())
factory.RegisterConverter(NewLMStudioConverter())
+ factory.RegisterConverter(NewLMDeployConverter())
factory.RegisterConverter(NewSGLangConverter())
factory.RegisterConverter(NewVLLMConverter())
factory.RegisterConverter(NewLlamaCppConverter())
diff --git a/internal/adapter/converter/factory_test.go b/internal/adapter/converter/factory_test.go
index f976adb..7a3ccde 100644
--- a/internal/adapter/converter/factory_test.go
+++ b/internal/adapter/converter/factory_test.go
@@ -23,6 +23,7 @@ func TestConverterFactory(t *testing.T) {
{"ollama", "*converter.OllamaConverter"},
{"lemonade", "*converter.LemonadeConverter"},
{"llamacpp", "*converter.LlamaCppConverter"},
+ {"lmdeploy", "*converter.LMDeployConverter"},
{"lmstudio", "*converter.LMStudioConverter"},
{"sglang", "*converter.SGLangConverter"},
{"vllm", "*converter.VLLMConverter"},
@@ -56,6 +57,7 @@ func TestConverterFactory(t *testing.T) {
assert.Contains(t, qpErr.Reason, "ollama")
assert.Contains(t, qpErr.Reason, "lemonade")
assert.Contains(t, qpErr.Reason, "llamacpp")
+ assert.Contains(t, qpErr.Reason, "lmdeploy")
assert.Contains(t, qpErr.Reason, "lmstudio")
assert.Contains(t, qpErr.Reason, "sglang")
assert.Contains(t, qpErr.Reason, "vllm")
@@ -65,7 +67,7 @@ func TestConverterFactory(t *testing.T) {
t.Run("GetSupportedFormats returns all formats", func(t *testing.T) {
formats := factory.GetSupportedFormats()
- assert.Len(t, formats, 10)
+ assert.Len(t, formats, 11)
// Check all expected formats are present
formatMap := make(map[string]bool)
@@ -78,6 +80,7 @@ func TestConverterFactory(t *testing.T) {
assert.True(t, formatMap["ollama"])
assert.True(t, formatMap["lemonade"])
assert.True(t, formatMap["llamacpp"])
+ assert.True(t, formatMap["lmdeploy"])
assert.True(t, formatMap["sglang"])
assert.True(t, formatMap["vllm"])
assert.True(t, formatMap["vllm-mlx"])
diff --git a/internal/adapter/converter/lmdeploy_converter.go b/internal/adapter/converter/lmdeploy_converter.go
new file mode 100644
index 0000000..475f6ac
--- /dev/null
+++ b/internal/adapter/converter/lmdeploy_converter.go
@@ -0,0 +1,108 @@
+package converter
+
+import (
+ "strings"
+ "time"
+
+ "github.com/thushan/olla/internal/adapter/registry/profile"
+ "github.com/thushan/olla/internal/core/constants"
+ "github.com/thushan/olla/internal/core/domain"
+ "github.com/thushan/olla/internal/core/ports"
+)
+
+// Type aliases for backward compatibility with tests
+type LMDeployModelResponse = profile.LMDeployResponse
+type LMDeployModelData = profile.LMDeployModel
+
+// LMDeployConverter converts models to LMDeploy-compatible format.
+// LMDeploy's /v1/models shape is OpenAI-compatible but without max_model_len
+// and with owned_by defaulting to "lmdeploy".
+type LMDeployConverter struct {
+ *BaseConverter
+}
+
+// NewLMDeployConverter creates a new LMDeploy format converter.
+func NewLMDeployConverter() ports.ModelResponseConverter {
+ return &LMDeployConverter{
+ BaseConverter: NewBaseConverter(constants.ProviderTypeLMDeploy),
+ }
+}
+
+func (c *LMDeployConverter) GetFormatName() string {
+ return constants.ProviderTypeLMDeploy
+}
+
+func (c *LMDeployConverter) ConvertToFormat(models []*domain.UnifiedModel, filters ports.ModelFilters) (interface{}, error) {
+ filtered := filterModels(models, filters)
+
+ data := make([]profile.LMDeployModel, 0, len(filtered))
+ for _, model := range filtered {
+ m := c.convertModel(model)
+ if m != nil {
+ data = append(data, *m)
+ }
+ }
+
+ return profile.LMDeployResponse{
+ Object: "list",
+ Data: data,
+ }, nil
+}
+
+func (c *LMDeployConverter) convertModel(model *domain.UnifiedModel) *profile.LMDeployModel {
+ now := time.Now().Unix()
+
+ modelID := c.findLMDeployNativeName(model)
+ if modelID == "" {
+ if len(model.Aliases) > 0 {
+ modelID = model.Aliases[0].Name
+ } else {
+ modelID = model.ID
+ }
+ }
+
+ m := &profile.LMDeployModel{
+ ID: modelID,
+ Object: "model",
+ Created: now,
+ OwnedBy: c.determineOwner(modelID),
+ }
+
+ // LMDeploy does not expose max_model_len on the wire; omit it here too.
+
+ // Generate standard permissions mirroring the LMDeploy default.
+ m.Permission = []profile.LMDeployModelPermission{
+ {
+ ID: "modelperm-olla-" + strings.ReplaceAll(modelID, "/", "-"),
+ Object: "model_permission",
+ Created: now,
+ AllowCreateEngine: false,
+ AllowSampling: true,
+ AllowLogprobs: true,
+ AllowSearchIndices: false,
+ AllowView: true,
+ AllowFineTuning: false,
+ Organization: "*",
+ IsBlocking: false,
+ },
+ }
+
+ return m
+}
+
+func (c *LMDeployConverter) findLMDeployNativeName(model *domain.UnifiedModel) string {
+ alias, found := c.BaseConverter.FindProviderAlias(model)
+ if found {
+ return alias
+ }
+ return ""
+}
+
+// determineOwner extracts the organisation from org/model-name style IDs,
+// defaulting to "lmdeploy" when there is no slash.
+func (c *LMDeployConverter) determineOwner(modelID string) string {
+ if parts := strings.SplitN(modelID, "/", 2); len(parts) == 2 {
+ return parts[0]
+ }
+ return constants.ProviderTypeLMDeploy
+}
diff --git a/internal/adapter/converter/lmdeploy_converter_test.go b/internal/adapter/converter/lmdeploy_converter_test.go
new file mode 100644
index 0000000..8ef5503
--- /dev/null
+++ b/internal/adapter/converter/lmdeploy_converter_test.go
@@ -0,0 +1,150 @@
+package converter
+
+import (
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+ "github.com/thushan/olla/internal/adapter/registry/profile"
+ "github.com/thushan/olla/internal/core/constants"
+ "github.com/thushan/olla/internal/core/domain"
+ "github.com/thushan/olla/internal/core/ports"
+)
+
+func TestNewLMDeployConverter(t *testing.T) {
+ t.Parallel()
+
+ c := NewLMDeployConverter()
+ assert.NotNil(t, c)
+ assert.Equal(t, constants.ProviderTypeLMDeploy, c.GetFormatName())
+}
+
+func TestLMDeployConverter_ConvertToFormat_Empty(t *testing.T) {
+ t.Parallel()
+
+ c := NewLMDeployConverter()
+ result, err := c.ConvertToFormat([]*domain.UnifiedModel{}, ports.ModelFilters{})
+
+ require.NoError(t, err)
+ resp, ok := result.(profile.LMDeployResponse)
+ require.True(t, ok)
+ assert.Equal(t, "list", resp.Object)
+ assert.Empty(t, resp.Data)
+}
+
+func TestLMDeployConverter_ConvertToFormat_SingleModel(t *testing.T) {
+ t.Parallel()
+
+ c := NewLMDeployConverter()
+
+ model := &domain.UnifiedModel{
+ ID: "internlm/internlm2_5-7b-chat",
+ Aliases: []domain.AliasEntry{
+ {
+ Name: "internlm/internlm2_5-7b-chat",
+ Source: constants.ProviderTypeLMDeploy,
+ },
+ },
+ }
+
+ result, err := c.ConvertToFormat([]*domain.UnifiedModel{model}, ports.ModelFilters{})
+
+ require.NoError(t, err)
+ resp, ok := result.(profile.LMDeployResponse)
+ require.True(t, ok)
+ require.Len(t, resp.Data, 1)
+
+ m := resp.Data[0]
+ assert.Equal(t, "internlm/internlm2_5-7b-chat", m.ID)
+ assert.Equal(t, "model", m.Object)
+ assert.NotZero(t, m.Created)
+ // org extracted from ID
+ assert.Equal(t, "internlm", m.OwnedBy)
+ // LMDeploy does not expose max_model_len — the field should remain zero-value
+ // (permissions are always generated)
+ require.Len(t, m.Permission, 1)
+ assert.True(t, m.Permission[0].AllowSampling)
+}
+
+func TestLMDeployConverter_ConvertToFormat_NoOrgInID(t *testing.T) {
+ t.Parallel()
+
+ c := NewLMDeployConverter()
+
+ model := &domain.UnifiedModel{
+ ID: "simple-model",
+ }
+
+ result, err := c.ConvertToFormat([]*domain.UnifiedModel{model}, ports.ModelFilters{})
+
+ require.NoError(t, err)
+ resp, ok := result.(profile.LMDeployResponse)
+ require.True(t, ok)
+ require.Len(t, resp.Data, 1)
+ // Default owner when there is no org/model-name slash
+ assert.Equal(t, constants.ProviderTypeLMDeploy, resp.Data[0].OwnedBy)
+}
+
+func TestLMDeployConverter_ConvertToFormat_MultipleModels(t *testing.T) {
+ t.Parallel()
+
+ c := NewLMDeployConverter()
+
+ models := []*domain.UnifiedModel{
+ {
+ ID: "internlm/internlm2_5-7b-chat",
+ Aliases: []domain.AliasEntry{
+ {Name: "internlm/internlm2_5-7b-chat", Source: constants.ProviderTypeLMDeploy},
+ },
+ },
+ {
+ ID: "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ Aliases: []domain.AliasEntry{
+ {Name: "meta-llama/Meta-Llama-3.1-8B-Instruct", Source: constants.ProviderTypeLMDeploy},
+ },
+ },
+ }
+
+ result, err := c.ConvertToFormat(models, ports.ModelFilters{})
+
+ require.NoError(t, err)
+ resp, ok := result.(profile.LMDeployResponse)
+ require.True(t, ok)
+ assert.Equal(t, "list", resp.Object)
+ require.Len(t, resp.Data, 2)
+ assert.Equal(t, "internlm/internlm2_5-7b-chat", resp.Data[0].ID)
+ assert.Equal(t, "meta-llama/Meta-Llama-3.1-8B-Instruct", resp.Data[1].ID)
+}
+
+func TestLMDeployConverter_FallbackToAliasOrID(t *testing.T) {
+ t.Parallel()
+
+ c := NewLMDeployConverter()
+
+ // No LMDeploy-sourced alias — should fall back to first alias
+ modelWithOtherAlias := &domain.UnifiedModel{
+ ID: "fallback-id",
+ Aliases: []domain.AliasEntry{
+ {Name: "alias-from-ollama", Source: constants.ProviderTypeOllama},
+ },
+ }
+
+ result, err := c.ConvertToFormat([]*domain.UnifiedModel{modelWithOtherAlias}, ports.ModelFilters{})
+ require.NoError(t, err)
+ resp, ok := result.(profile.LMDeployResponse)
+ require.True(t, ok)
+ require.Len(t, resp.Data, 1)
+ assert.Equal(t, "alias-from-ollama", resp.Data[0].ID)
+
+ // No aliases at all — should use unified ID
+ modelWithNoAlias := &domain.UnifiedModel{
+ ID: "bare-id",
+ }
+
+ result2, err2 := c.ConvertToFormat([]*domain.UnifiedModel{modelWithNoAlias}, ports.ModelFilters{})
+ require.NoError(t, err2)
+ resp2, ok2 := result2.(profile.LMDeployResponse)
+ require.True(t, ok2)
+ require.Len(t, resp2.Data, 1)
+ assert.Equal(t, "bare-id", resp2.Data[0].ID)
+}
diff --git a/internal/adapter/registry/profile/lmdeploy.go b/internal/adapter/registry/profile/lmdeploy.go
new file mode 100644
index 0000000..63ce734
--- /dev/null
+++ b/internal/adapter/registry/profile/lmdeploy.go
@@ -0,0 +1,38 @@
+package profile
+
+// LMDeployResponse is the response structure from LMDeploy /v1/models endpoint.
+// The shape follows the OpenAI ModelList format but with LMDeploy-specific field
+// values — notably owned_by defaults to "lmdeploy" and there is no max_model_len.
+type LMDeployResponse struct {
+ Object string `json:"object"`
+ Data []LMDeployModel `json:"data"`
+}
+
+// LMDeployModel represents a single model entry in the LMDeploy /v1/models response.
+// Unlike vLLM, there is no max_model_len field; unlike vLLM and SGLang,
+// owned_by defaults to "lmdeploy" rather than "vllm" or "sglang".
+type LMDeployModel struct {
+ Root *string `json:"root,omitempty"`
+ Parent *string `json:"parent,omitempty"`
+ ID string `json:"id"`
+ Object string `json:"object"`
+ OwnedBy string `json:"owned_by"`
+ Permission []LMDeployModelPermission `json:"permission,omitempty"`
+ Created int64 `json:"created"`
+}
+
+// LMDeployModelPermission mirrors the OpenAI permission shape that LMDeploy exposes.
+type LMDeployModelPermission struct {
+ Group *string `json:"group"`
+ ID string `json:"id"`
+ Object string `json:"object"`
+ Organization string `json:"organization"`
+ Created int64 `json:"created"`
+ AllowCreateEngine bool `json:"allow_create_engine"`
+ AllowSampling bool `json:"allow_sampling"`
+ AllowLogprobs bool `json:"allow_logprobs"`
+ AllowSearchIndices bool `json:"allow_search_indices"`
+ AllowView bool `json:"allow_view"`
+ AllowFineTuning bool `json:"allow_fine_tuning"`
+ IsBlocking bool `json:"is_blocking"`
+}
diff --git a/internal/adapter/registry/profile/lmdeploy_parser.go b/internal/adapter/registry/profile/lmdeploy_parser.go
new file mode 100644
index 0000000..fce8757
--- /dev/null
+++ b/internal/adapter/registry/profile/lmdeploy_parser.go
@@ -0,0 +1,64 @@
+package profile
+
+import (
+ "fmt"
+ "time"
+
+ "github.com/thushan/olla/internal/core/domain"
+)
+
+type lmdeployParser struct{}
+
+func (p *lmdeployParser) Parse(data []byte) ([]*domain.ModelInfo, error) {
+ if len(data) == 0 {
+ return make([]*domain.ModelInfo, 0), nil
+ }
+
+ var response LMDeployResponse
+ if err := json.Unmarshal(data, &response); err != nil {
+ return nil, fmt.Errorf("failed to parse LMDeploy response: %w", err)
+ }
+
+ models := make([]*domain.ModelInfo, 0, len(response.Data))
+ now := time.Now()
+
+ for _, model := range response.Data {
+ if model.ID == "" {
+ continue
+ }
+
+ modelInfo := &domain.ModelInfo{
+ Name: model.ID,
+ Type: "lmdeploy",
+ LastSeen: now,
+ }
+
+ details := &domain.ModelDetails{}
+ hasDetails := false
+
+ if model.Created > 0 {
+ createdTime := time.Unix(model.Created, 0)
+ details.ModifiedAt = &createdTime
+ hasDetails = true
+ }
+
+ // Skip the default owned_by value to avoid storing noise.
+ if model.OwnedBy != "" && model.OwnedBy != "lmdeploy" {
+ details.Publisher = &model.OwnedBy
+ hasDetails = true
+ }
+
+ if model.Parent != nil {
+ details.ParentModel = model.Parent
+ hasDetails = true
+ }
+
+ if hasDetails {
+ modelInfo.Details = details
+ }
+
+ models = append(models, modelInfo)
+ }
+
+ return models, nil
+}
diff --git a/internal/adapter/registry/profile/lmdeploy_parser_test.go b/internal/adapter/registry/profile/lmdeploy_parser_test.go
new file mode 100644
index 0000000..3b7fbb3
--- /dev/null
+++ b/internal/adapter/registry/profile/lmdeploy_parser_test.go
@@ -0,0 +1,220 @@
+package profile
+
+import (
+ "testing"
+ "time"
+
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+)
+
+func TestLMDeployParser_Parse(t *testing.T) {
+ t.Parallel()
+
+ parser := &lmdeployParser{}
+
+ t.Run("parses valid response with full metadata", func(t *testing.T) {
+ t.Parallel()
+
+ response := `{
+ "object": "list",
+ "data": [
+ {
+ "id": "internlm/internlm2_5-7b-chat",
+ "object": "model",
+ "created": 1754535984,
+ "owned_by": "lmdeploy",
+ "root": "internlm/internlm2_5-7b-chat",
+ "parent": null,
+ "permission": []
+ },
+ {
+ "id": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "object": "model",
+ "created": 1754535985,
+ "owned_by": "meta-llama",
+ "root": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "parent": null,
+ "permission": []
+ }
+ ]
+ }`
+
+ models, err := parser.Parse([]byte(response))
+ require.NoError(t, err)
+ require.Len(t, models, 2)
+
+ internlm := models[0]
+ assert.Equal(t, "internlm/internlm2_5-7b-chat", internlm.Name)
+ assert.Equal(t, "lmdeploy", internlm.Type)
+ // owned_by "lmdeploy" is the default — publisher should not be set
+ require.NotNil(t, internlm.Details)
+ assert.Nil(t, internlm.Details.Publisher)
+ require.NotNil(t, internlm.Details.ModifiedAt)
+ assert.Equal(t, time.Unix(1754535984, 0), *internlm.Details.ModifiedAt)
+
+ llama := models[1]
+ assert.Equal(t, "meta-llama/Meta-Llama-3.1-8B-Instruct", llama.Name)
+ assert.Equal(t, "lmdeploy", llama.Type)
+ require.NotNil(t, llama.Details)
+ require.NotNil(t, llama.Details.Publisher)
+ assert.Equal(t, "meta-llama", *llama.Details.Publisher)
+ })
+
+ t.Run("handles fine-tuned model with parent", func(t *testing.T) {
+ t.Parallel()
+
+ response := `{
+ "object": "list",
+ "data": [
+ {
+ "id": "custom/fine-tuned-internlm",
+ "object": "model",
+ "created": 1754535986,
+ "owned_by": "custom-org",
+ "parent": "internlm/internlm2_5-7b-chat"
+ }
+ ]
+ }`
+
+ models, err := parser.Parse([]byte(response))
+ require.NoError(t, err)
+ require.Len(t, models, 1)
+
+ model := models[0]
+ require.NotNil(t, model.Details)
+ require.NotNil(t, model.Details.ParentModel)
+ assert.Equal(t, "internlm/internlm2_5-7b-chat", *model.Details.ParentModel)
+ require.NotNil(t, model.Details.Publisher)
+ assert.Equal(t, "custom-org", *model.Details.Publisher)
+ })
+
+ t.Run("skips models without ID", func(t *testing.T) {
+ t.Parallel()
+
+ response := `{
+ "object": "list",
+ "data": [
+ {
+ "object": "model",
+ "created": 1754535987,
+ "owned_by": "lmdeploy"
+ },
+ {
+ "id": "valid-model",
+ "object": "model",
+ "created": 1754535988,
+ "owned_by": "lmdeploy"
+ }
+ ]
+ }`
+
+ models, err := parser.Parse([]byte(response))
+ require.NoError(t, err)
+ require.Len(t, models, 1)
+ assert.Equal(t, "valid-model", models[0].Name)
+ })
+
+ t.Run("handles empty response bytes", func(t *testing.T) {
+ t.Parallel()
+
+ models, err := parser.Parse([]byte{})
+ require.NoError(t, err)
+ assert.Empty(t, models)
+ })
+
+ t.Run("handles empty data array", func(t *testing.T) {
+ t.Parallel()
+
+ response := `{"object": "list", "data": []}`
+ models, err := parser.Parse([]byte(response))
+ require.NoError(t, err)
+ assert.Empty(t, models)
+ })
+
+ t.Run("returns error for malformed JSON", func(t *testing.T) {
+ t.Parallel()
+
+ invalidJSON := `{"object": "list", "data": [{"id": "m", invalid}]}`
+ models, err := parser.Parse([]byte(invalidJSON))
+ assert.Error(t, err)
+ assert.Contains(t, err.Error(), "failed to parse LMDeploy response")
+ assert.Nil(t, models)
+ })
+
+ t.Run("details nil when no metadata beyond default owned_by", func(t *testing.T) {
+ t.Parallel()
+
+ response := `{
+ "object": "list",
+ "data": [
+ {
+ "id": "simple-model",
+ "object": "model",
+ "created": 0,
+ "owned_by": "lmdeploy"
+ }
+ ]
+ }`
+
+ models, err := parser.Parse([]byte(response))
+ require.NoError(t, err)
+ require.Len(t, models, 1)
+ assert.Equal(t, "simple-model", models[0].Name)
+ // No created timestamp and owned_by is the default — details should be nil
+ assert.Nil(t, models[0].Details)
+ })
+
+ t.Run("no max_model_len field — LMDeploy differs from vLLM here", func(t *testing.T) {
+ t.Parallel()
+
+ // LMDeploy /v1/models does not include max_model_len; verify we tolerate
+ // the field being absent (and don't panic or error if it somehow appears).
+ response := `{
+ "object": "list",
+ "data": [
+ {
+ "id": "qwen/Qwen2-7B-Instruct",
+ "object": "model",
+ "created": 1754535990,
+ "owned_by": "lmdeploy"
+ }
+ ]
+ }`
+
+ models, err := parser.Parse([]byte(response))
+ require.NoError(t, err)
+ require.Len(t, models, 1)
+ model := models[0]
+ assert.Equal(t, "qwen/Qwen2-7B-Instruct", model.Name)
+ // MaxContextLength is never populated from the wire response for LMDeploy
+ if model.Details != nil {
+ assert.Nil(t, model.Details.MaxContextLength)
+ }
+ })
+
+ t.Run("preserves LastSeen timestamp", func(t *testing.T) {
+ t.Parallel()
+
+ response := `{
+ "object": "list",
+ "data": [
+ {
+ "id": "test-model",
+ "object": "model",
+ "created": 1754535991,
+ "owned_by": "lmdeploy"
+ }
+ ]
+ }`
+
+ before := time.Now()
+ models, err := parser.Parse([]byte(response))
+ after := time.Now()
+
+ require.NoError(t, err)
+ require.Len(t, models, 1)
+ assert.True(t, !models[0].LastSeen.Before(before))
+ assert.True(t, !models[0].LastSeen.After(after))
+ })
+}
diff --git a/internal/adapter/registry/profile/parsers.go b/internal/adapter/registry/profile/parsers.go
index 501fc78..ea91e53 100644
--- a/internal/adapter/registry/profile/parsers.go
+++ b/internal/adapter/registry/profile/parsers.go
@@ -24,6 +24,8 @@ func getParserForFormat(format string) ModelResponseParser {
return &lemonadeParser{}
case constants.ProviderPrefixLMStudio1:
return &lmStudioParser{}
+ case constants.ProviderTypeLMDeploy:
+ return &lmdeployParser{}
case constants.ProviderTypeSGLang:
return &sglangParser{}
case constants.ProviderTypeVLLM:
diff --git a/internal/app/handlers/handler_common.go b/internal/app/handlers/handler_common.go
index ac6fb8d..1dccc62 100644
--- a/internal/app/handlers/handler_common.go
+++ b/internal/app/handlers/handler_common.go
@@ -80,6 +80,7 @@ func (a *Application) isProviderSupported(provider string) bool {
// this ensures consistency between validation and route registration
staticProviders := map[string]bool{
constants.ProviderTypeLemonade: true,
+ constants.ProviderTypeLMDeploy: true,
constants.ProviderTypeLMStudio: true,
constants.ProviderTypeOllama: true,
constants.ProviderTypeOpenAI: true,
diff --git a/internal/app/handlers/server_routes.go b/internal/app/handlers/server_routes.go
index cd8d8d4..8e5cd05 100644
--- a/internal/app/handlers/server_routes.go
+++ b/internal/app/handlers/server_routes.go
@@ -315,6 +315,13 @@ func getStaticProviders(a *Application) map[string]staticProvider {
{path: "", handler: a.providerProxyHandler, description: "OpenAI-compatible proxy", isProxy: true},
},
},
+ constants.ProviderTypeLMDeploy: {
+ prefixes: []string{constants.ProviderTypeLMDeploy},
+ routes: []staticRoute{
+ {path: "v1/models", handler: a.genericProviderModelsHandler(constants.ProviderTypeLMDeploy, constants.ProviderTypeOpenAI), description: "LMDeploy models (OpenAI format)", method: "GET"},
+ {path: "", handler: a.providerProxyHandler, description: "LMDeploy proxy", isProxy: true},
+ },
+ },
constants.ProviderTypeSGLang: {
prefixes: []string{constants.ProviderTypeSGLang},
routes: []staticRoute{
diff --git a/internal/core/constants/providers.go b/internal/core/constants/providers.go
index 1696ab1..32ad4b1 100644
--- a/internal/core/constants/providers.go
+++ b/internal/core/constants/providers.go
@@ -7,6 +7,7 @@ const (
ProviderTypeLMStudio = "lm-studio"
ProviderTypeOpenAI = "openai"
ProviderTypeOpenAICompat = "openai-compatible"
+ ProviderTypeLMDeploy = "lmdeploy"
ProviderTypeSGLang = "sglang"
ProviderTypeVLLM = "vllm"
ProviderTypeVLLMMLX = "vllm-mlx"
@@ -18,6 +19,7 @@ const (
ProviderDisplayLlamaCpp = "llama.cpp"
ProviderDisplayLMStudio = "LM Studio"
ProviderDisplayOpenAI = "OpenAI"
+ ProviderDisplayLMDeploy = "LMDeploy"
ProviderDisplaySGLang = "SGLang"
ProviderDisplayVLLM = "vLLM"
ProviderDisplayVLLMMLX = "vLLM-MLX"
diff --git a/internal/core/domain/profile.go b/internal/core/domain/profile.go
index 47e0f89..17420ad 100644
--- a/internal/core/domain/profile.go
+++ b/internal/core/domain/profile.go
@@ -5,6 +5,7 @@ const (
ProfileLemonade = "lemonade"
ProfileLlamaCpp = "llamacpp"
ProfileLmStudio = "lm-studio"
+ ProfileLMDeploy = "lmdeploy"
ProfileSGLang = "sglang"
ProfileVLLM = "vllm"
ProfileVLLMMLX = "vllm-mlx"
diff --git a/internal/version/version.go b/internal/version/version.go
index 31e9240..041bec1 100644
--- a/internal/version/version.go
+++ b/internal/version/version.go
@@ -37,6 +37,7 @@ var (
"lemonade",
"litellm",
"llamacpp",
+ "lmdeploy",
"lm_studio",
"sglang",
"vllm",
diff --git a/readme.md b/readme.md
index f23e176..7d2497b 100644
--- a/readme.md
+++ b/readme.md
@@ -10,7 +10,7 @@
-
+
