From 050de742786882c6f030233131b2a7c81daba9db Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Fri, 17 Oct 2025 11:58:08 +0800 Subject: [PATCH 01/18] FEAT: support replicas on single GPU --- xinference/api/restful_api.py | 24 ++- xinference/constants.py | 6 + xinference/core/utils.py | 23 ++- xinference/core/worker.py | 271 ++++++++++++++++++++++++++++++++-- 4 files changed, 311 insertions(+), 13 deletions(-) diff --git a/xinference/api/restful_api.py b/xinference/api/restful_api.py index 05508b15d9..bcfa6ad8b4 100644 --- a/xinference/api/restful_api.py +++ b/xinference/api/restful_api.py @@ -1269,11 +1269,29 @@ async def launch_model( if isinstance(gpu_idx, int): gpu_idx = [gpu_idx] - if gpu_idx: - if len(gpu_idx) % replica: + + # Check if single-GPU multi-replica is enabled + from ..constants import XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA + + if XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA: + # Enhanced replica validation with single-GPU multi-replica support + if gpu_idx and len(gpu_idx) > 1 and len(gpu_idx) % replica: + # Only keep the restriction when multiple GPUs are specified + raise HTTPException( + status_code=400, + detail="Invalid input. When using multiple GPUs, the count must be a multiple of replica.", + ) + # Allow single-GPU multi-replica deployment when enabled + if gpu_idx and len(gpu_idx) == 1 and replica > 1: + logger.info( + f"Single-GPU multi-replica deployment enabled: {replica} replicas on 1 GPU" + ) + else: + # Traditional behavior - strict multiple requirement + if gpu_idx and len(gpu_idx) % replica: raise HTTPException( status_code=400, - detail="Invalid input. Allocated gpu must be a multiple of replica.", + detail="Invalid input. Allocated gpu must be a multiple of replica. Set XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA=1 to enable single-GPU multi-replica deployment.", ) if peft_model_config is not None: diff --git a/xinference/constants.py b/xinference/constants.py index 3b80eca472..b07771b864 100644 --- a/xinference/constants.py +++ b/xinference/constants.py @@ -34,6 +34,9 @@ XINFERENCE_ENV_SSE_PING_ATTEMPTS_SECONDS = "XINFERENCE_SSE_PING_ATTEMPTS_SECONDS" XINFERENCE_ENV_MAX_TOKENS = "XINFERENCE_MAX_TOKENS" XINFERENCE_ENV_ALLOWED_IPS = "XINFERENCE_ALLOWED_IPS" +XINFERENCE_ENV_ENABLE_SINGLE_GPU_MULTI_REPLICA = ( + "XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA" +) XINFERENCE_ENV_BATCH_SIZE = "XINFERENCE_BATCH_SIZE" XINFERENCE_ENV_BATCH_INTERVAL = "XINFERENCE_BATCH_INTERVAL" @@ -114,5 +117,8 @@ def get_xinference_home() -> str: else None ) XINFERENCE_ALLOWED_IPS = os.getenv(XINFERENCE_ENV_ALLOWED_IPS) +XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA = bool( + int(os.getenv(XINFERENCE_ENV_ENABLE_SINGLE_GPU_MULTI_REPLICA, "1")) +) # Enable by default XINFERENCE_BATCH_SIZE = int(os.getenv(XINFERENCE_ENV_BATCH_SIZE, "32")) XINFERENCE_BATCH_INTERVAL = float(os.getenv(XINFERENCE_ENV_BATCH_INTERVAL, "0.003")) diff --git a/xinference/core/utils.py b/xinference/core/utils.py index 74da6f163b..914206d60f 100644 --- a/xinference/core/utils.py +++ b/xinference/core/utils.py @@ -294,12 +294,33 @@ def get_key(package: str) -> str: def assign_replica_gpu( _replica_model_uid: str, replica: int, gpu_idx: Optional[Union[int, List[int]]] ) -> Optional[List[int]]: + """ + Enhanced GPU assignment for replica models. + Supports single-GPU multi-replica deployment by intelligently allocating GPUs. 
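+
+    Illustrative outcomes of the branches below (derived from this function):
+        gpu_idx=[0, 1], replica=2 -> replica 0 gets [0], replica 1 gets [1]
+        gpu_idx=[0],    replica=3 -> every replica shares GPU 0
+        gpu_idx=[0, 1], replica=3 -> replicas 0 and 2 get [0], replica 1 gets [1]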
+ """ model_uid, rep_id = parse_replica_model_uid(_replica_model_uid) rep_id, replica = int(rep_id), int(replica) + if isinstance(gpu_idx, int): gpu_idx = [gpu_idx] + if isinstance(gpu_idx, list) and gpu_idx: - return gpu_idx[rep_id::replica] + # When we have enough GPUs for round-robin allocation + if len(gpu_idx) >= replica: + return gpu_idx[rep_id::replica] + else: + # Support single-GPU multi-replica deployment + # All replicas will share the same GPU (or GPUs if more than 1 but less than replica count) + # This allows multiple replicas to run on the same GPU using memory-aware scheduling + if len(gpu_idx) == 1: + # Single GPU case - all replicas use the same GPU + return gpu_idx + else: + # Multiple GPUs but fewer than replicas - distribute as evenly as possible + # This enables better resource utilization + assigned_gpu = gpu_idx[rep_id % len(gpu_idx)] + return [assigned_gpu] + return gpu_idx diff --git a/xinference/core/worker.py b/xinference/core/worker.py index f87be367b5..750affde35 100644 --- a/xinference/core/worker.py +++ b/xinference/core/worker.py @@ -154,6 +154,8 @@ def __init__( self._model_uid_to_addr: Dict[str, str] = {} self._model_uid_to_recover_count: Dict[str, Optional[int]] = {} self._model_uid_to_launch_args: Dict[str, Dict] = {} + self._gpu_memory_info: Dict[int, Dict[str, Union[int, float]]] = {} + self._model_memory_usage: Dict[str, int] = {} if XINFERENCE_DISABLE_METRICS: logger.info( @@ -543,22 +545,124 @@ def allocate_devices(self, model_uid: str, n_gpu: int) -> List[int]: break if allocated_non_embedding_rerank_models: user_specified_allocated_devices.add(dev) - allocated_devices = set(self._gpu_to_model_uid.keys()).union( - user_specified_allocated_devices - ) - if n_gpu > len(self._total_gpu_devices) - len(allocated_devices): - raise RuntimeError("No available slot found for the model") - devices: List[int] = [ + # Check for completely available GPUs first + completely_available_gpus = [ dev for dev in self._total_gpu_devices if dev not in self._gpu_to_model_uid and dev not in user_specified_allocated_devices - ][:n_gpu] - for dev in devices: + ] + + if len(completely_available_gpus) >= n_gpu: + # We have enough completely available GPUs + devices = completely_available_gpus[:n_gpu] + for dev in devices: + self._gpu_to_model_uid[int(dev)] = model_uid + logger.info(f"Allocated completely available GPUs: {devices}") + return sorted(devices) + + # Not enough completely available GPUs, try memory-aware allocation + logger.info( + f"Not enough completely available GPUs, trying memory-aware allocation" + ) + + # Initialize memory tracking if not already done + if not self._gpu_memory_info: + self._initialize_gpu_memory_tracking() + + # Try to allocate based on available memory + selected_devices = [] + + # First, use any completely available GPUs + for dev in completely_available_gpus: + selected_devices.append(dev) self._gpu_to_model_uid[int(dev)] = model_uid + if len(selected_devices) == n_gpu: + break - return sorted(devices) + # If we still need more GPUs, select those with most available memory + if len(selected_devices) < n_gpu: + remaining_needed = n_gpu - len(selected_devices) + + # Get GPUs sorted by available memory (most available first) + candidate_gpus = [ + dev for dev in self._total_gpu_devices if dev not in selected_devices + ] + + gpu_memory_list = [] + for dev in candidate_gpus: + self._update_gpu_memory_info(dev) + available_memory = self._gpu_memory_info[dev]["available"] + gpu_memory_list.append((dev, available_memory)) + + # Sort by 
available memory (descending) + gpu_memory_list.sort(key=lambda x: x[1], reverse=True) + + # Select GPUs with most available memory + for dev, available_memory in gpu_memory_list[:remaining_needed]: + selected_devices.append(dev) + self._gpu_to_model_uid[int(dev)] = model_uid + logger.info( + f"Selected GPU {dev} with {available_memory}MB available memory" + ) + + if len(selected_devices) != n_gpu: + raise RuntimeError("No available slot found for the model") + + logger.info(f"Allocated GPUs using memory-aware strategy: {selected_devices}") + return sorted(selected_devices) + + def allocate_devices_for_model( + self, + model_uid: str, + model_name: str, + model_size: Union[int, str], + model_format: Optional[str], + quantization: Optional[str], + n_gpu: int = 1, + ) -> List[int]: + """ + Enhanced GPU allocation that considers model memory requirements. + """ + # Estimate memory usage for this model + estimated_memory_mb = self._estimate_model_memory_usage( + model_name, model_size, model_format, quantization + ) + + self._model_memory_usage[model_uid] = estimated_memory_mb + + # Try to find GPUs that can accommodate the model + suitable_gpus = [] + + for gpu_idx in self._total_gpu_devices: + if self._can_fit_model_on_gpu(gpu_idx, estimated_memory_mb): + suitable_gpus.append(gpu_idx) + + if len(suitable_gpus) >= n_gpu: + # We have enough suitable GPUs + selected = suitable_gpus[:n_gpu] + else: + # Not enough GPUs with sufficient memory, but try anyway + logger.warning( + f"Only found {len(suitable_gpus)} GPUs with sufficient memory, proceeding with allocation" + ) + # Use the GPU with most available memory + best_gpu = self._get_gpu_with_most_available_memory() + selected = [best_gpu] + + # Update tracking + for dev in selected: + self._gpu_to_model_uid[int(dev)] = model_uid + # Update memory usage tracking + if dev in self._gpu_memory_info: + self._gpu_memory_info[dev]["used"] += estimated_memory_mb + self._gpu_memory_info[dev]["available"] -= estimated_memory_mb + + logger.info( + f"Allocated GPUs for model {model_name}: {selected}, estimated memory: {estimated_memory_mb}MB" + ) + return sorted(selected) async def allocate_devices_with_gpu_idx( self, model_uid: str, model_type: str, gpu_idx: List[int] @@ -622,6 +726,30 @@ def release_devices(self, model_uid: str): for model_info in model_infos: self._user_specified_gpu_to_model_uids[dev].remove(model_info) + # Update GPU memory tracking + if model_uid in self._model_memory_usage: + released_memory = self._model_memory_usage[model_uid] + logger.info( + f"Releasing {released_memory}MB of memory for model {model_uid}" + ) + + # Update memory info for all GPUs + for dev in devices: + if dev in self._gpu_memory_info: + self._gpu_memory_info[dev]["used"] = max( + 0, self._gpu_memory_info[dev]["used"] - released_memory + ) + self._gpu_memory_info[dev]["available"] = min( + self._gpu_memory_info[dev]["total"], + self._gpu_memory_info[dev]["available"] + released_memory, + ) + logger.info( + f"Updated GPU {dev} memory tracking: used={self._gpu_memory_info[dev]['used']}MB, available={self._gpu_memory_info[dev]['available']}MB" + ) + + # Remove model from memory usage tracking + del self._model_memory_usage[model_uid] + async def _create_subpool( self, model_uid: str, @@ -2002,6 +2130,131 @@ def update_model_status(self, model_uid: str, **kwargs): def get_model_status(self, model_uid: str): return self._model_uid_to_model_status.get(model_uid) + def _initialize_gpu_memory_tracking(self): + """Initialize GPU memory tracking for all available GPUs""" 
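+        # All figures below are tracked in MB (bytes // 1024**2). pynvml
+        # (nvidia-ml-py) is an optional dependency; the except branches fall
+        # back to zeroed stats, i.e. memory tracking is effectively disabled.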
+ try: + import pynvml + + pynvml.nvmlInit() + for gpu_idx in self._total_gpu_devices: + handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx) + mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) + self._gpu_memory_info[gpu_idx] = { + "total": mem_info.total // (1024**2), # Convert to MB + "used": mem_info.used // (1024**2), + "available": mem_info.free // (1024**2), + } + logger.info( + f"Initialized GPU memory tracking for {len(self._total_gpu_devices)} GPUs" + ) + except ImportError: + logger.warning("pynvml not available, GPU memory tracking disabled") + # Fallback to basic tracking without actual memory info + for gpu_idx in self._total_gpu_devices: + self._gpu_memory_info[gpu_idx] = {"total": 0, "used": 0, "available": 0} + except Exception as e: + logger.error(f"Failed to initialize GPU memory tracking: {e}") + for gpu_idx in self._total_gpu_devices: + self._gpu_memory_info[gpu_idx] = {"total": 0, "used": 0, "available": 0} + + def _update_gpu_memory_info(self, gpu_idx: int): + """Update memory information for a specific GPU""" + try: + import pynvml + + handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx) + mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) + self._gpu_memory_info[gpu_idx] = { + "total": mem_info.total // (1024**2), + "used": mem_info.used // (1024**2), + "available": mem_info.free // (1024**2), + } + except Exception as e: + logger.debug(f"Failed to update GPU {gpu_idx} memory info: {e}") + + def _get_gpu_with_most_available_memory(self) -> int: + """Find the GPU with the most available memory""" + self._initialize_gpu_memory_tracking() if not self._gpu_memory_info else None + + max_available_gpu = -1 + max_available_memory: Union[int, float] = -1 + + for gpu_idx in self._total_gpu_devices: + self._update_gpu_memory_info(gpu_idx) + available_memory = self._gpu_memory_info[gpu_idx]["available"] + + if available_memory > max_available_memory: + max_available_memory = available_memory + max_available_gpu = gpu_idx + + if max_available_gpu == -1: + raise RuntimeError("No suitable GPU found") + + logger.info( + f"Selected GPU {max_available_gpu} with {max_available_memory}MB available memory" + ) + return max_available_gpu + + def _estimate_model_memory_usage( + self, + model_name: str, + model_size: Union[int, str], + model_format: Optional[str], + quantization: Optional[str], + ) -> int: + """Estimate memory usage for a model based on its characteristics""" + # Basic estimation logic - this can be enhanced with more sophisticated calculations + if isinstance(model_size, str): + # Convert string size like "7B" to integer + if "B" in model_size: + size_gb = float(model_size.replace("B", "")) + else: + size_gb = float(model_size) + else: + size_gb = float(model_size) + + # Base memory estimation (rough calculation) + base_memory_mb = int(size_gb * 1024 * 1.5) # 1.5GB per billion parameters + + # Adjust based on quantization + if quantization: + if "4bit" in quantization.lower() or "4-bit" in quantization.lower(): + base_memory_mb = base_memory_mb // 3 + elif "8bit" in quantization.lower() or "8-bit" in quantization.lower(): + base_memory_mb = base_memory_mb // 2 + + # Adjust based on format + if model_format: + if "gguf" in model_format.lower(): + base_memory_mb = int( + base_memory_mb * 0.8 + ) # GGUF is generally more memory efficient + + # Add some buffer for overhead + base_memory_mb = int(base_memory_mb * 1.2) + + logger.debug(f"Estimated memory usage for {model_name}: {base_memory_mb}MB") + return base_memory_mb + + def _can_fit_model_on_gpu(self, gpu_idx: int, 
estimated_memory_mb: int) -> bool: + """Check if a model can fit on a specific GPU""" + if gpu_idx not in self._gpu_memory_info: + self._update_gpu_memory_info(gpu_idx) + + available_memory = self._gpu_memory_info[gpu_idx]["available"] + can_fit = estimated_memory_mb <= available_memory + + if can_fit: + logger.info( + f"Model can fit on GPU {gpu_idx}: needs {estimated_memory_mb}MB, has {available_memory}MB available" + ) + else: + logger.warning( + f"Model cannot fit on GPU {gpu_idx}: needs {estimated_memory_mb}MB, has {available_memory}MB available" + ) + + return can_fit + @staticmethod def record_metrics(name, op, kwargs): record_metrics(name, op, kwargs) From d643f4aa6f9bfd01859001aede2425d6f902e159 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Fri, 17 Oct 2025 14:18:07 +0800 Subject: [PATCH 02/18] Fix CI test for test_worker.py --- xinference/core/tests/test_worker.py | 7 +++++++ xinference/core/worker.py | 7 ++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/xinference/core/tests/test_worker.py b/xinference/core/tests/test_worker.py index b67f1011e7..cae1984a5e 100644 --- a/xinference/core/tests/test_worker.py +++ b/xinference/core/tests/test_worker.py @@ -30,6 +30,13 @@ def __init__( cuda_devices: List[int], ): super().__init__(supervisor_address, main_pool, cuda_devices) + self._gpu_memory_info = {} + for gpu_idx in cuda_devices: + self._gpu_memory_info[gpu_idx] = { + "total": 24000, + "used": 0, + "available": 24000 + } async def __post_create__(self): pass diff --git a/xinference/core/worker.py b/xinference/core/worker.py index 750affde35..894748c380 100644 --- a/xinference/core/worker.py +++ b/xinference/core/worker.py @@ -586,8 +586,13 @@ def allocate_devices(self, model_uid: str, n_gpu: int) -> List[int]: remaining_needed = n_gpu - len(selected_devices) # Get GPUs sorted by available memory (most available first) + # Exclude GPUs that are already allocated by user_specified models candidate_gpus = [ - dev for dev in self._total_gpu_devices if dev not in selected_devices + dev + for dev in self._total_gpu_devices + if dev not in selected_devices + and dev not in self._gpu_to_model_uid + and dev not in user_specified_allocated_devices ] gpu_memory_list = [] From c0835abfe46a88a00e4222d711e0bb6c9afd189c Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Fri, 17 Oct 2025 14:21:33 +0800 Subject: [PATCH 03/18] Fix CI test for test_worker.py --- xinference/core/tests/test_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/core/tests/test_worker.py b/xinference/core/tests/test_worker.py index cae1984a5e..bf5b9acd3d 100644 --- a/xinference/core/tests/test_worker.py +++ b/xinference/core/tests/test_worker.py @@ -35,7 +35,7 @@ def __init__( self._gpu_memory_info[gpu_idx] = { "total": 24000, "used": 0, - "available": 24000 + "available": 24000, } async def __post_create__(self): From 53303455e457c8c717f7476f1d0621fc5db5700c Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 20 Oct 2025 16:38:11 +0800 Subject: [PATCH 04/18] add launch doc --- .../zh_CN/LC_MESSAGES/user_guide/launch.po | 109 ++++++++++++++++-- doc/source/user_guide/launch.rst | 36 ++++++ 2 files changed, 134 insertions(+), 11 deletions(-) diff --git a/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po b/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po index bcb925f19c..ac5891ed35 100644 --- a/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po +++ 
b/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: Xinference \n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2025-08-02 23:15+0800\n" +"POT-Creation-Date: 2025-10-20 16:28+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language: zh_CN\n" @@ -17,7 +17,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.14.0\n" +"Generated-By: Babel 2.17.0\n" #: ../../source/user_guide/launch.rst:5 msgid "Model Launching Instructions" @@ -46,11 +46,86 @@ msgstr "" "两张 GPU 上。Xinference 会自动进行负载均衡,确保请求均匀分配到多张卡上。" "用户看到的仍是一个模型,这大大提升了整体资源利用率。" -#: ../../source/user_guide/launch.rst:18 +#: ../../source/user_guide/launch.rst:17 +msgid "Traditional Multi-Instance Deployment:" +msgstr "旧版本多实例部署:" + +#: ../../source/user_guide/launch.rst:19 +msgid "" +"When you have multiple GPU cards, each capable of hosting one model " +"instance, you can set the number of instances equal to the number of " +"GPUs. For example:" +msgstr "当您拥有多张GPU显卡时,每张显卡可承载一个模型实例,此时可将实例数量设置为等于GPU数量。例如:" + +#: ../../source/user_guide/launch.rst:21 +msgid "2 GPUs, 2 instances: Each GPU runs one model instance" +msgstr "2张GPU,2个实例:每张GPU运行一个模型实例" + +#: ../../source/user_guide/launch.rst:22 +msgid "4 GPUs, 4 instances: Each GPU runs one model instance" +msgstr "4张GPU,4个实例:每张GPU运行一个模型实例" + +#: ../../source/user_guide/launch.rst:26 +msgid "Introduce a new environment variable:" +msgstr "引入一个新的环境变量:" + +#: ../../source/user_guide/launch.rst:32 +msgid "" +"Control whether to enable the single GPU multi-copy feature Default " +"value: 1" +msgstr "控制是否启用单GPU多副本功能,默认值:1" + +#: ../../source/user_guide/launch.rst:35 +msgid "New Feature: Smart Replica Deployment" +msgstr "新功能:智能副本部署" + +#: ../../source/user_guide/launch.rst:37 +msgid "Single GPU Multi-Replica" +msgstr "单GPU多副本" + +#: ../../source/user_guide/launch.rst:39 +msgid "New Support: Run multiple model replicas even with just one GPU." +msgstr "新增支持:即使仅有一块GPU,也能运行多个模型副本。" + +#: ../../source/user_guide/launch.rst:41 +msgid "Scenario: You have 1 GPU with sufficient VRAM" +msgstr "场景:您拥有1个GPU且显存充足" + +#: ../../source/user_guide/launch.rst:42 +msgid "Configuration: Replica Count = 3, GPU Count = 1" +msgstr "配置:副本数量=3,GPU数量=1" + +#: ../../source/user_guide/launch.rst:43 +msgid "Result: 3 model instances running on the same GPU, sharing GPU resources" +msgstr "结果:3个模型实例,在同一GPU上运行,共享GPU资源" + +#: ../../source/user_guide/launch.rst:45 +msgid "Hybrid GPU Allocation" +msgstr "混合GPU分配" + +#: ../../source/user_guide/launch.rst:47 +msgid "" +"Smart Allocation: Number of replicas may differ from GPU count; system " +"intelligently distributes" +msgstr "智能分配: 副本数可以不等于GPU数量,系统会智能分配" + +#: ../../source/user_guide/launch.rst:49 +msgid "Scenario: You have 2 GPUs and need 3 replicas" +msgstr "场景: 你有2张GPU,需要3个副本" + +#: ../../source/user_guide/launch.rst:50 +msgid "Configuration: Replicas=3, GPUs=2" +msgstr "配置: 副本数=3,GPU数量=2" + +#: ../../source/user_guide/launch.rst:51 +msgid "Result: GPU0 runs 2 instances, GPU1 runs 1 instance" +msgstr "结果: GPU0运行2个实例,GPU1运行1个实例" + +#: ../../source/user_guide/launch.rst:54 msgid "Set Environment Variables" msgstr "设置环境变量" -#: ../../source/user_guide/launch.rst:22 +#: ../../source/user_guide/launch.rst:58 msgid "" "Sometimes, we want to specify environment variables for a particular " "model at runtime. 
Since v1.8.1, Xinference provides the capability to " @@ -60,21 +135,21 @@ msgstr "" "有时我们希望在运行时为特定模型指定环境变量。从 v1.8.1 开始,Xinference " "提供了单独配置环境变量的功能,无需在启动 Xinference 前设置。" -#: ../../source/user_guide/launch.rst:25 +#: ../../source/user_guide/launch.rst:61 msgid "For Web UI." msgstr "针对 Web UI。" -#: ../../source/user_guide/launch.rst:31 +#: ../../source/user_guide/launch.rst:67 msgid "" "When using the command line, use ``--env`` to specify an environment " "variable." msgstr "命令行使用时,使用 ``--env`` 指定环境变量。" -#: ../../source/user_guide/launch.rst:33 +#: ../../source/user_guide/launch.rst:69 msgid "Example usage:" msgstr "示例用法:" -#: ../../source/user_guide/launch.rst:39 +#: ../../source/user_guide/launch.rst:75 msgid "" "Take vLLM as an example: it has versions V1 and V0, and by default, it " "automatically determines which version to use. If you want to force the " @@ -85,13 +160,25 @@ msgstr "" "在加载模型时强制通过设置 ``VLLM_USE_V1=0`` 来使用 V0,可以指定该环境变量" "。" -#: ../../source/user_guide/launch.rst:43 +#: ../../source/user_guide/launch.rst:79 msgid "Configuring Model Virtual Environment" msgstr "配置模型虚拟空间" -#: ../../source/user_guide/launch.rst:47 +#: ../../source/user_guide/launch.rst:83 msgid "" "For this part, please refer to :ref:`toggling virtual environments and " "customizing dependencies `." -msgstr "对于这部分,请参考 :ref:`开关虚拟空间和定制依赖 `。" +msgstr "" +"对于这部分,请参考 :ref:`开关虚拟空间和定制依赖 `。" + +#~ msgid "" +#~ "Scenario: You have 2 GPUs and need" +#~ " 3 replicas Configuration: Replicas=3, " +#~ "GPUs=2 Result: GPU0 runs 2 instances," +#~ " GPU1 runs 1 instance" +#~ msgstr "" +#~ "场景: 你有2张GPU,需要3个副本" +#~ "配置: 副本数=3,GPU数量=2结果:" +#~ " GPU0运行2个实例,GPU1运行1个实例" diff --git a/doc/source/user_guide/launch.rst b/doc/source/user_guide/launch.rst index aac59bc321..cabf3c29cf 100644 --- a/doc/source/user_guide/launch.rst +++ b/doc/source/user_guide/launch.rst @@ -14,6 +14,42 @@ you can set the replica count to 2. This way, two identical instances of the mod Xinference automatically load-balances requests to ensure even distribution across multiple GPUs. Meanwhile, users see it as a single model, which greatly improves overall resource utilization. +Traditional Multi-Instance Deployment: + +When you have multiple GPU cards, each capable of hosting one model instance, you can set the number of instances equal to the number of GPUs. For example: + +- 2 GPUs, 2 instances: Each GPU runs one model instance +- 4 GPUs, 4 instances: Each GPU runs one model instance + +.. versionadded:: v1.11.1 + +Introduce a new environment variable: + +.. code-block:: bash + + XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA + +Control whether to enable the single GPU multi-copy feature +Default value: 1 + +New Feature: Smart Replica Deployment + +1. Single GPU Multi-Replica + +New Support: Run multiple model replicas even with just one GPU. + +- Scenario: You have 1 GPU with sufficient VRAM +- Configuration: Replica Count = 3, GPU Count = 1 +- Result: 3 model instances running on the same GPU, sharing GPU resources + +2. 
Hybrid GPU Allocation + +Smart Allocation: Number of replicas may differ from GPU count; system intelligently distributes + +- Scenario: You have 2 GPUs and need 3 replicas +- Configuration: Replicas=3, GPUs=2 +- Result: GPU0 runs 2 instances, GPU1 runs 1 instance + Set Environment Variables ========================= From ac339825e0b2d039dca9a162796f62d4b459d2fa Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Wed, 22 Oct 2025 14:14:01 +0800 Subject: [PATCH 05/18] Supplementary Doc --- .../locale/zh_CN/LC_MESSAGES/user_guide/launch.po | 10 ---------- doc/source/user_guide/launch.rst | 2 +- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po b/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po index ac5891ed35..ff9199818a 100644 --- a/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po +++ b/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po @@ -172,13 +172,3 @@ msgstr "" "对于这部分,请参考 :ref:`开关虚拟空间和定制依赖 `。" -#~ msgid "" -#~ "Scenario: You have 2 GPUs and need" -#~ " 3 replicas Configuration: Replicas=3, " -#~ "GPUs=2 Result: GPU0 runs 2 instances," -#~ " GPU1 runs 1 instance" -#~ msgstr "" -#~ "场景: 你有2张GPU,需要3个副本" -#~ "配置: 副本数=3,GPU数量=2结果:" -#~ " GPU0运行2个实例,GPU1运行1个实例" - diff --git a/doc/source/user_guide/launch.rst b/doc/source/user_guide/launch.rst index cabf3c29cf..062a63c47b 100644 --- a/doc/source/user_guide/launch.rst +++ b/doc/source/user_guide/launch.rst @@ -21,7 +21,7 @@ When you have multiple GPU cards, each capable of hosting one model instance, yo - 2 GPUs, 2 instances: Each GPU runs one model instance - 4 GPUs, 4 instances: Each GPU runs one model instance -.. versionadded:: v1.11.1 +.. versionadded:: v1.12.0 Introduce a new environment variable: From 51e244c552e02291783752aa692dc994af88a75c Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Fri, 21 Nov 2025 14:34:52 +0800 Subject: [PATCH 06/18] launch_strategy --- xinference/core/launch_strategy.py | 303 +++++++++++++++++++++++++++ xinference/core/tests/test_worker.py | 15 +- xinference/core/worker.py | 294 ++++---------------------- xinference/device_utils.py | 87 +++++++- 4 files changed, 432 insertions(+), 267 deletions(-) create mode 100644 xinference/core/launch_strategy.py diff --git a/xinference/core/launch_strategy.py b/xinference/core/launch_strategy.py new file mode 100644 index 0000000000..6e5911b069 --- /dev/null +++ b/xinference/core/launch_strategy.py @@ -0,0 +1,303 @@ +# Copyright 2022-2025 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
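+
+"""
+GPU allocation ("launch") strategies used by the Xinference worker.
+
+Illustrative usage, based on the names defined in this module (the model uid
+below is a placeholder):
+
+    spec = LaunchModelSpec(model_uid="demo-model-0", n_gpu=1)
+    strategy = MemoryAwareLaunchStrategy(total_gpu_devices=[0, 1])
+    devices = strategy.allocate(spec, [0, 1], set(), {})
+"""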
+ +import logging +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Dict, List, Optional, Set, Union + +from ..device_utils import update_gpu_memory_info + +logger = logging.getLogger(__name__) + + +@dataclass +class LaunchModelSpec: + """Specification for model launch""" + + model_uid: str + n_gpu: int + model_name: Optional[str] = None + model_size: Optional[Union[int, str]] = None + model_format: Optional[str] = None + quantization: Optional[str] = None + + +class LaunchStrategy(ABC): + """Abstract base class for GPU allocation strategies""" + + @abstractmethod + def allocate( + self, + spec: LaunchModelSpec, + total_gpu_devices: List[int], + user_specified_allocated_devices: Set[int], + allocated_gpus: Dict[int, str], + ) -> List[int]: + """ + Allocate GPUs for model launch + + Args: + spec: Model launch specification + total_gpu_devices: List of all available GPU indices + user_specified_allocated_devices: Set of user-specified allocated devices + allocated_gpus: Dictionary mapping GPU index to model UID + + Returns: + List of allocated GPU indices + """ + pass + + @abstractmethod + def release(self, model_uid: str, devices: List[int]) -> None: + """ + Release GPUs allocated for a model + + Args: + model_uid: Model identifier + devices: List of GPU indices to release + """ + pass + + +class MemoryAwareLaunchStrategy(LaunchStrategy): + """Memory-aware GPU allocation strategy supporting single-GPU multi-replica""" + + def __init__( + self, + total_gpu_devices: List[int], + gpu_memory_info: Optional[Dict[int, Dict[str, Union[int, float]]]] = None, + ): + self._total_gpu_devices = total_gpu_devices + self._gpu_memory_info = gpu_memory_info or {} + self._model_memory_usage: Dict[str, int] = {} + + # Initialize memory tracking for all GPUs if not provided + if not self._gpu_memory_info: + self._initialize_gpu_memory_tracking() + + def _initialize_gpu_memory_tracking(self): + """Initialize GPU memory tracking for all available GPUs""" + try: + from ..device_utils import initialize_gpu_memory_info + + self._gpu_memory_info = initialize_gpu_memory_info( + self._total_gpu_devices, logger=logger + ) + except Exception as e: + logger.warning(f"Failed to initialize GPU memory tracking: {e}") + # Fallback to basic tracking without actual memory info + for gpu_idx in self._total_gpu_devices: + self._gpu_memory_info[gpu_idx] = { + "total": 0, + "used": 0, + "available": 0, + } + + def _estimate_model_memory_usage( + self, + model_name: str, + model_size: Union[int, str], + model_format: Optional[str], + quantization: Optional[str], + ) -> int: + """Estimate memory usage for a model based on its characteristics""" + # Basic estimation logic - this can be enhanced with more sophisticated calculations + if isinstance(model_size, str): + # Convert string size like "7B" to integer + if "B" in model_size: + size_gb = float(model_size.replace("B", "")) + else: + size_gb = float(model_size) + else: + size_gb = float(model_size) + + # Base memory estimation (rough calculation) + base_memory_mb = int(size_gb * 1024 * 1.5) # 1.5GB per billion parameters + + # Adjust based on quantization + if quantization: + if "4bit" in quantization.lower() or "4-bit" in quantization.lower(): + base_memory_mb = base_memory_mb // 3 + elif "8bit" in quantization.lower() or "8-bit" in quantization.lower(): + base_memory_mb = base_memory_mb // 2 + + # Adjust based on model format + if model_format: + if "mlx" in model_format.lower(): + base_memory_mb = int( + base_memory_mb * 0.8 + ) # MLX 
is generally more memory efficient + + return max(base_memory_mb, 1024) # Minimum 1GB + + def _can_fit_model_on_gpu(self, gpu_idx: int, estimated_memory_mb: int) -> bool: + """Check if a model can fit on a specific GPU""" + # Update memory info for the GPU + update_gpu_memory_info(self._gpu_memory_info, gpu_idx, logger=logger) + + available_memory = self._gpu_memory_info[gpu_idx]["available"] + can_fit = estimated_memory_mb <= available_memory + + if can_fit: + logger.info( + f"Model can fit on GPU {gpu_idx}: needs {estimated_memory_mb}MB, has {available_memory}MB available" + ) + else: + logger.warning( + f"Model cannot fit on GPU {gpu_idx}: needs {estimated_memory_mb}MB, has {available_memory}MB available" + ) + + return can_fit + + def _get_gpu_with_most_available_memory(self) -> int: + """Find the GPU with the most available memory""" + max_available_gpu = -1 + max_available_memory: Union[int, float] = -1 + + for gpu_idx in self._total_gpu_devices: + update_gpu_memory_info(self._gpu_memory_info, gpu_idx, logger=logger) + available_memory = self._gpu_memory_info[gpu_idx]["available"] + + if available_memory > max_available_memory: + max_available_memory = available_memory + max_available_gpu = gpu_idx + + if max_available_gpu == -1: + raise RuntimeError("No suitable GPU found") + + return max_available_gpu + + def allocate( + self, + spec: LaunchModelSpec, + total_gpu_devices: List[int], + user_specified_allocated_devices: Set[int], + allocated_gpus: Dict[int, str], + ) -> List[int]: + """ + Allocate GPUs using memory-aware strategy + + Strategy: + 1. Prefer completely free GPUs + 2. If not enough, use GPUs with most available memory + 3. Support single-GPU multi-replica deployment + """ + model_uid = spec.model_uid + n_gpu = spec.n_gpu + + # Estimate model memory usage if model info is provided + estimated_memory_mb = 0 + if spec.model_name and spec.model_size: + estimated_memory_mb = self._estimate_model_memory_usage( + spec.model_name, spec.model_size, spec.model_format, spec.quantization + ) + self._model_memory_usage[model_uid] = estimated_memory_mb + + # Check for completely available GPUs first + completely_available_gpus = [ + dev + for dev in total_gpu_devices + if dev not in allocated_gpus and dev not in user_specified_allocated_devices + ] + + if estimated_memory_mb > 0: + # Try to find GPUs that can accommodate the model + suitable_gpus = [] + + # First, check completely available GPUs + for gpu_idx in completely_available_gpus: + if self._can_fit_model_on_gpu(gpu_idx, estimated_memory_mb): + suitable_gpus.append(gpu_idx) + + # If not enough completely available GPUs, check partially used GPUs + if len(suitable_gpus) < n_gpu: + for dev in total_gpu_devices: + if dev in allocated_gpus and dev not in suitable_gpus: + # This GPU is already allocated, check if it has space for another replica + if self._can_fit_model_on_gpu(dev, estimated_memory_mb): + suitable_gpus.append(dev) + + if len(suitable_gpus) >= n_gpu: + selected = suitable_gpus[:n_gpu] + else: + # Not enough GPUs with sufficient memory, but try anyway + # Use the GPU with most available memory + best_gpu = self._get_gpu_with_most_available_memory() + selected = [best_gpu] if n_gpu == 1 else [best_gpu] * n_gpu + else: + # No memory estimation available, use basic strategy + if len(completely_available_gpus) >= n_gpu: + selected = completely_available_gpus[:n_gpu] + else: + # For single GPU deployment without memory estimation, allow sharing + if n_gpu == 1 and total_gpu_devices: + # Use the first available GPU or the 
one with most available memory + if completely_available_gpus: + selected = [completely_available_gpus[0]] + else: + # No completely available GPU, find one with most available memory + best_gpu = self._get_gpu_with_most_available_memory() + selected = [best_gpu] + else: + # Use GPUs with most available memory + remaining_needed = n_gpu - len(completely_available_gpus) + candidate_gpus = [ + dev + for dev in total_gpu_devices + if dev not in completely_available_gpus + and dev not in allocated_gpus + ] + + gpu_memory_list = [] + for dev in candidate_gpus: + update_gpu_memory_info( + self._gpu_memory_info, dev, logger=logger + ) + available_memory = self._gpu_memory_info[dev]["available"] + gpu_memory_list.append((dev, available_memory)) + + # Sort by available memory (descending) + gpu_memory_list.sort(key=lambda x: x[1], reverse=True) + + selected = completely_available_gpus.copy() + for dev, available_memory in gpu_memory_list[:remaining_needed]: + selected.append(dev) + + if len(selected) != n_gpu: + raise RuntimeError( + f"Failed to allocate {n_gpu} GPUs, only got {len(selected)}" + ) + + # Update memory usage accounting + for gpu_idx in selected: + if gpu_idx in self._gpu_memory_info and estimated_memory_mb > 0: + self._gpu_memory_info[gpu_idx]["used"] += estimated_memory_mb + self._gpu_memory_info[gpu_idx]["available"] -= estimated_memory_mb + + return selected + + def release(self, model_uid: str, devices: List[int]) -> None: + """Release allocated GPUs and roll back memory accounting""" + # Roll back memory usage accounting + if model_uid in self._model_memory_usage: + memory_used = self._model_memory_usage[model_uid] + for gpu_idx in devices: + if gpu_idx in self._gpu_memory_info: + # Roll back memory usage + self._gpu_memory_info[gpu_idx]["used"] -= memory_used + self._gpu_memory_info[gpu_idx]["available"] += memory_used + + # Remove model from memory tracking + del self._model_memory_usage[model_uid] diff --git a/xinference/core/tests/test_worker.py b/xinference/core/tests/test_worker.py index bf5b9acd3d..7c0aa45960 100644 --- a/xinference/core/tests/test_worker.py +++ b/xinference/core/tests/test_worker.py @@ -18,6 +18,7 @@ import xoscar as xo from xoscar import MainActorPoolType, create_actor_pool, get_pool_config +from ..launch_strategy import MemoryAwareLaunchStrategy from ..utils import merge_virtual_env_packages from ..worker import WorkerActor @@ -30,13 +31,13 @@ def __init__( cuda_devices: List[int], ): super().__init__(supervisor_address, main_pool, cuda_devices) - self._gpu_memory_info = {} - for gpu_idx in cuda_devices: - self._gpu_memory_info[gpu_idx] = { - "total": 24000, - "used": 0, - "available": 24000, - } + gpu_memory_info = { + idx: {"total": 24000.0, "used": 0.0, "available": 24000.0} + for idx in cuda_devices + } + self._launch_strategy = MemoryAwareLaunchStrategy( + cuda_devices, gpu_memory_info=gpu_memory_info + ) async def __post_create__(self): pass diff --git a/xinference/core/worker.py b/xinference/core/worker.py index 894748c380..8cb89af426 100644 --- a/xinference/core/worker.py +++ b/xinference/core/worker.py @@ -62,6 +62,7 @@ from ..utils import get_pip_config_args, get_real_path from .cache_tracker import CacheTrackerActor from .event import Event, EventCollectorActor, EventType +from .launch_strategy import MemoryAwareLaunchStrategy from .metrics import launch_metrics_export_server, record_metrics from .resource import gather_node_info from .status_guard import StatusGuardActor @@ -154,8 +155,7 @@ def __init__( self._model_uid_to_addr: Dict[str, 
str] = {} self._model_uid_to_recover_count: Dict[str, Optional[int]] = {} self._model_uid_to_launch_args: Dict[str, Dict] = {} - self._gpu_memory_info: Dict[int, Dict[str, Union[int, float]]] = {} - self._model_memory_usage: Dict[str, int] = {} + self._launch_strategy = MemoryAwareLaunchStrategy(self._total_gpu_devices) if XINFERENCE_DISABLE_METRICS: logger.info( @@ -532,7 +532,8 @@ async def allocate_devices_for_embedding(self, model_uid: str) -> int: self._gpu_to_embedding_model_uids[device].add(model_uid) return device - def allocate_devices(self, model_uid: str, n_gpu: int) -> List[int]: + def _collect_user_specified_devices(self) -> Set[int]: + """收集用户指定且非 embedding/rerank 的占用卡""" user_specified_allocated_devices: Set[int] = set() for dev, model_infos in self._user_specified_gpu_to_model_uids.items(): allocated_non_embedding_rerank_models = False @@ -545,78 +546,21 @@ def allocate_devices(self, model_uid: str, n_gpu: int) -> List[int]: break if allocated_non_embedding_rerank_models: user_specified_allocated_devices.add(dev) + return user_specified_allocated_devices - # Check for completely available GPUs first - completely_available_gpus = [ - dev - for dev in self._total_gpu_devices - if dev not in self._gpu_to_model_uid - and dev not in user_specified_allocated_devices - ] - - if len(completely_available_gpus) >= n_gpu: - # We have enough completely available GPUs - devices = completely_available_gpus[:n_gpu] - for dev in devices: - self._gpu_to_model_uid[int(dev)] = model_uid - logger.info(f"Allocated completely available GPUs: {devices}") - return sorted(devices) - - # Not enough completely available GPUs, try memory-aware allocation - logger.info( - f"Not enough completely available GPUs, trying memory-aware allocation" + def allocate_devices(self, model_uid: str, n_gpu: int) -> List[int]: + from .launch_strategy import LaunchModelSpec + + spec = LaunchModelSpec(model_uid=model_uid, n_gpu=n_gpu) + devices = self._launch_strategy.allocate( + spec=spec, + total_gpu_devices=self._total_gpu_devices, + user_specified_allocated_devices=self._collect_user_specified_devices(), + allocated_gpus=self._gpu_to_model_uid, ) - - # Initialize memory tracking if not already done - if not self._gpu_memory_info: - self._initialize_gpu_memory_tracking() - - # Try to allocate based on available memory - selected_devices = [] - - # First, use any completely available GPUs - for dev in completely_available_gpus: - selected_devices.append(dev) + for dev in devices: self._gpu_to_model_uid[int(dev)] = model_uid - if len(selected_devices) == n_gpu: - break - - # If we still need more GPUs, select those with most available memory - if len(selected_devices) < n_gpu: - remaining_needed = n_gpu - len(selected_devices) - - # Get GPUs sorted by available memory (most available first) - # Exclude GPUs that are already allocated by user_specified models - candidate_gpus = [ - dev - for dev in self._total_gpu_devices - if dev not in selected_devices - and dev not in self._gpu_to_model_uid - and dev not in user_specified_allocated_devices - ] - - gpu_memory_list = [] - for dev in candidate_gpus: - self._update_gpu_memory_info(dev) - available_memory = self._gpu_memory_info[dev]["available"] - gpu_memory_list.append((dev, available_memory)) - - # Sort by available memory (descending) - gpu_memory_list.sort(key=lambda x: x[1], reverse=True) - - # Select GPUs with most available memory - for dev, available_memory in gpu_memory_list[:remaining_needed]: - selected_devices.append(dev) - self._gpu_to_model_uid[int(dev)] = 
model_uid - logger.info( - f"Selected GPU {dev} with {available_memory}MB available memory" - ) - - if len(selected_devices) != n_gpu: - raise RuntimeError("No available slot found for the model") - - logger.info(f"Allocated GPUs using memory-aware strategy: {selected_devices}") - return sorted(selected_devices) + return sorted(devices) def allocate_devices_for_model( self, @@ -627,47 +571,25 @@ def allocate_devices_for_model( quantization: Optional[str], n_gpu: int = 1, ) -> List[int]: - """ - Enhanced GPU allocation that considers model memory requirements. - """ - # Estimate memory usage for this model - estimated_memory_mb = self._estimate_model_memory_usage( - model_name, model_size, model_format, quantization + from .launch_strategy import LaunchModelSpec + + spec = LaunchModelSpec( + model_uid=model_uid, + n_gpu=n_gpu, + model_name=model_name, + model_size=model_size, + model_format=model_format, + quantization=quantization, ) - - self._model_memory_usage[model_uid] = estimated_memory_mb - - # Try to find GPUs that can accommodate the model - suitable_gpus = [] - - for gpu_idx in self._total_gpu_devices: - if self._can_fit_model_on_gpu(gpu_idx, estimated_memory_mb): - suitable_gpus.append(gpu_idx) - - if len(suitable_gpus) >= n_gpu: - # We have enough suitable GPUs - selected = suitable_gpus[:n_gpu] - else: - # Not enough GPUs with sufficient memory, but try anyway - logger.warning( - f"Only found {len(suitable_gpus)} GPUs with sufficient memory, proceeding with allocation" - ) - # Use the GPU with most available memory - best_gpu = self._get_gpu_with_most_available_memory() - selected = [best_gpu] - - # Update tracking - for dev in selected: - self._gpu_to_model_uid[int(dev)] = model_uid - # Update memory usage tracking - if dev in self._gpu_memory_info: - self._gpu_memory_info[dev]["used"] += estimated_memory_mb - self._gpu_memory_info[dev]["available"] -= estimated_memory_mb - - logger.info( - f"Allocated GPUs for model {model_name}: {selected}, estimated memory: {estimated_memory_mb}MB" + devices = self._launch_strategy.allocate( + spec=spec, + total_gpu_devices=self._total_gpu_devices, + user_specified_allocated_devices=self._collect_user_specified_devices(), + allocated_gpus=self._gpu_to_model_uid, ) - return sorted(selected) + for dev in devices: + self._gpu_to_model_uid[int(dev)] = model_uid + return sorted(devices) async def allocate_devices_with_gpu_idx( self, model_uid: str, model_type: str, gpu_idx: List[int] @@ -731,29 +653,8 @@ def release_devices(self, model_uid: str): for model_info in model_infos: self._user_specified_gpu_to_model_uids[dev].remove(model_info) - # Update GPU memory tracking - if model_uid in self._model_memory_usage: - released_memory = self._model_memory_usage[model_uid] - logger.info( - f"Releasing {released_memory}MB of memory for model {model_uid}" - ) - - # Update memory info for all GPUs - for dev in devices: - if dev in self._gpu_memory_info: - self._gpu_memory_info[dev]["used"] = max( - 0, self._gpu_memory_info[dev]["used"] - released_memory - ) - self._gpu_memory_info[dev]["available"] = min( - self._gpu_memory_info[dev]["total"], - self._gpu_memory_info[dev]["available"] + released_memory, - ) - logger.info( - f"Updated GPU {dev} memory tracking: used={self._gpu_memory_info[dev]['used']}MB, available={self._gpu_memory_info[dev]['available']}MB" - ) - - # Remove model from memory usage tracking - del self._model_memory_usage[model_uid] + # Use launch strategy to handle memory tracking rollback + self._launch_strategy.release(model_uid, 
devices) async def _create_subpool( self, @@ -2135,131 +2036,6 @@ def update_model_status(self, model_uid: str, **kwargs): def get_model_status(self, model_uid: str): return self._model_uid_to_model_status.get(model_uid) - def _initialize_gpu_memory_tracking(self): - """Initialize GPU memory tracking for all available GPUs""" - try: - import pynvml - - pynvml.nvmlInit() - for gpu_idx in self._total_gpu_devices: - handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx) - mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) - self._gpu_memory_info[gpu_idx] = { - "total": mem_info.total // (1024**2), # Convert to MB - "used": mem_info.used // (1024**2), - "available": mem_info.free // (1024**2), - } - logger.info( - f"Initialized GPU memory tracking for {len(self._total_gpu_devices)} GPUs" - ) - except ImportError: - logger.warning("pynvml not available, GPU memory tracking disabled") - # Fallback to basic tracking without actual memory info - for gpu_idx in self._total_gpu_devices: - self._gpu_memory_info[gpu_idx] = {"total": 0, "used": 0, "available": 0} - except Exception as e: - logger.error(f"Failed to initialize GPU memory tracking: {e}") - for gpu_idx in self._total_gpu_devices: - self._gpu_memory_info[gpu_idx] = {"total": 0, "used": 0, "available": 0} - - def _update_gpu_memory_info(self, gpu_idx: int): - """Update memory information for a specific GPU""" - try: - import pynvml - - handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx) - mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) - self._gpu_memory_info[gpu_idx] = { - "total": mem_info.total // (1024**2), - "used": mem_info.used // (1024**2), - "available": mem_info.free // (1024**2), - } - except Exception as e: - logger.debug(f"Failed to update GPU {gpu_idx} memory info: {e}") - - def _get_gpu_with_most_available_memory(self) -> int: - """Find the GPU with the most available memory""" - self._initialize_gpu_memory_tracking() if not self._gpu_memory_info else None - - max_available_gpu = -1 - max_available_memory: Union[int, float] = -1 - - for gpu_idx in self._total_gpu_devices: - self._update_gpu_memory_info(gpu_idx) - available_memory = self._gpu_memory_info[gpu_idx]["available"] - - if available_memory > max_available_memory: - max_available_memory = available_memory - max_available_gpu = gpu_idx - - if max_available_gpu == -1: - raise RuntimeError("No suitable GPU found") - - logger.info( - f"Selected GPU {max_available_gpu} with {max_available_memory}MB available memory" - ) - return max_available_gpu - - def _estimate_model_memory_usage( - self, - model_name: str, - model_size: Union[int, str], - model_format: Optional[str], - quantization: Optional[str], - ) -> int: - """Estimate memory usage for a model based on its characteristics""" - # Basic estimation logic - this can be enhanced with more sophisticated calculations - if isinstance(model_size, str): - # Convert string size like "7B" to integer - if "B" in model_size: - size_gb = float(model_size.replace("B", "")) - else: - size_gb = float(model_size) - else: - size_gb = float(model_size) - - # Base memory estimation (rough calculation) - base_memory_mb = int(size_gb * 1024 * 1.5) # 1.5GB per billion parameters - - # Adjust based on quantization - if quantization: - if "4bit" in quantization.lower() or "4-bit" in quantization.lower(): - base_memory_mb = base_memory_mb // 3 - elif "8bit" in quantization.lower() or "8-bit" in quantization.lower(): - base_memory_mb = base_memory_mb // 2 - - # Adjust based on format - if model_format: - if "gguf" in model_format.lower(): - 
base_memory_mb = int( - base_memory_mb * 0.8 - ) # GGUF is generally more memory efficient - - # Add some buffer for overhead - base_memory_mb = int(base_memory_mb * 1.2) - - logger.debug(f"Estimated memory usage for {model_name}: {base_memory_mb}MB") - return base_memory_mb - - def _can_fit_model_on_gpu(self, gpu_idx: int, estimated_memory_mb: int) -> bool: - """Check if a model can fit on a specific GPU""" - if gpu_idx not in self._gpu_memory_info: - self._update_gpu_memory_info(gpu_idx) - - available_memory = self._gpu_memory_info[gpu_idx]["available"] - can_fit = estimated_memory_mb <= available_memory - - if can_fit: - logger.info( - f"Model can fit on GPU {gpu_idx}: needs {estimated_memory_mb}MB, has {available_memory}MB available" - ) - else: - logger.warning( - f"Model cannot fit on GPU {gpu_idx}: needs {estimated_memory_mb}MB, has {available_memory}MB available" - ) - - return can_fit - @staticmethod def record_metrics(name, op, kwargs): record_metrics(name, op, kwargs) diff --git a/xinference/device_utils.py b/xinference/device_utils.py index db4180b8d1..bcc3f8dc60 100644 --- a/xinference/device_utils.py +++ b/xinference/device_utils.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging import os -from typing import Dict, Literal, Union +from typing import Dict, List, Literal, Optional, Union import torch @@ -193,3 +194,87 @@ def get_nvidia_gpu_info() -> Dict: nvmlShutdown() except: pass + + +def initialize_gpu_memory_info( + gpu_indices: List[int], logger: Optional[logging.Logger] = None +) -> Dict[int, Dict[str, Union[int, float]]]: + """ + Initialize GPU memory information using NVML + + Args: + gpu_indices: List of GPU indices to initialize + logger: Optional logger instance + + Returns: + Dictionary mapping GPU index to memory info (total/used/available in MB) + """ + gpu_memory_info = {} + + try: + import pynvml + + pynvml.nvmlInit() + + for gpu_idx in gpu_indices: + handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx) + mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) + + gpu_memory_info[gpu_idx] = { + "total": mem_info.total // (1024**2), # Convert to MB + "used": mem_info.used // (1024**2), + "available": mem_info.free // (1024**2), + } + + except ImportError: + if logger: + logger.warning("pynvml not available, GPU memory tracking disabled") + # Fallback to basic tracking without actual memory info + for gpu_idx in gpu_indices: + gpu_memory_info[gpu_idx] = { + "total": 0, + "used": 0, + "available": 0, + } + except Exception as e: + if logger: + logger.error(f"Failed to initialize GPU memory info: {e}") + # Fallback to basic tracking + for gpu_idx in gpu_indices: + gpu_memory_info[gpu_idx] = { + "total": 0, + "used": 0, + "available": 0, + } + + return gpu_memory_info + + +def update_gpu_memory_info( + gpu_memory_info: Dict[int, Dict[str, Union[int, float]]], + gpu_idx: int, + logger: Optional[logging.Logger] = None, +) -> None: + """ + Update memory information for a specific GPU using NVML + + Args: + gpu_memory_info: Dictionary to update with memory information + gpu_idx: GPU index to update + logger: Optional logger instance + """ + try: + import pynvml + + handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx) + mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) + + gpu_memory_info[gpu_idx] = { + "total": mem_info.total // (1024**2), + "used": mem_info.used // (1024**2), + "available": mem_info.free // (1024**2), + } + + except: + # Keep existing values if update fails + pass From 
efcc573e1644afd9f2dc18bb7950e620e8da126b Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Fri, 21 Nov 2025 15:15:28 +0800 Subject: [PATCH 07/18] new launch_strategy --- xinference/constants.py | 9 ++ xinference/core/launch_strategy.py | 197 ++++++++++++++++++++++++++- xinference/core/tests/test_worker.py | 4 +- xinference/core/worker.py | 17 ++- 4 files changed, 214 insertions(+), 13 deletions(-) diff --git a/xinference/constants.py b/xinference/constants.py index b07771b864..cdc08a2f1e 100644 --- a/xinference/constants.py +++ b/xinference/constants.py @@ -34,6 +34,8 @@ XINFERENCE_ENV_SSE_PING_ATTEMPTS_SECONDS = "XINFERENCE_SSE_PING_ATTEMPTS_SECONDS" XINFERENCE_ENV_MAX_TOKENS = "XINFERENCE_MAX_TOKENS" XINFERENCE_ENV_ALLOWED_IPS = "XINFERENCE_ALLOWED_IPS" +XINFERENCE_ENV_LAUNCH_STRATEGY = "XINFERENCE_LAUNCH_STRATEGY" +XINFERENCE_ENV_LAUNCH_ALLOWED_GPUS = "XINFERENCE_LAUNCH_ALLOWED_GPUS" XINFERENCE_ENV_ENABLE_SINGLE_GPU_MULTI_REPLICA = ( "XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA" ) @@ -120,5 +122,12 @@ def get_xinference_home() -> str: XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA = bool( int(os.getenv(XINFERENCE_ENV_ENABLE_SINGLE_GPU_MULTI_REPLICA, "1")) ) # Enable by default +XINFERENCE_LAUNCH_STRATEGY = os.getenv(XINFERENCE_ENV_LAUNCH_STRATEGY, "memory_aware") +_allowed_gpu_str = os.getenv(XINFERENCE_ENV_LAUNCH_ALLOWED_GPUS, "") +XINFERENCE_LAUNCH_ALLOWED_GPUS = ( + {int(x) for x in _allowed_gpu_str.split(",") if x.strip().isdigit()} + if _allowed_gpu_str + else None +) XINFERENCE_BATCH_SIZE = int(os.getenv(XINFERENCE_ENV_BATCH_SIZE, "32")) XINFERENCE_BATCH_INTERVAL = float(os.getenv(XINFERENCE_ENV_BATCH_INTERVAL, "0.003")) diff --git a/xinference/core/launch_strategy.py b/xinference/core/launch_strategy.py index 6e5911b069..b046eb08e8 100644 --- a/xinference/core/launch_strategy.py +++ b/xinference/core/launch_strategy.py @@ -17,7 +17,7 @@ from dataclasses import dataclass from typing import Dict, List, Optional, Set, Union -from ..device_utils import update_gpu_memory_info +from ..device_utils import initialize_gpu_memory_info, update_gpu_memory_info logger = logging.getLogger(__name__) @@ -275,10 +275,13 @@ def allocate( for dev, available_memory in gpu_memory_list[:remaining_needed]: selected.append(dev) - if len(selected) != n_gpu: - raise RuntimeError( - f"Failed to allocate {n_gpu} GPUs, only got {len(selected)}" - ) + if len(selected) < n_gpu: + if not selected: + best_gpu = self._get_gpu_with_most_available_memory() + selected.append(best_gpu) + fill_gpu = selected[0] + while len(selected) < n_gpu: + selected.append(fill_gpu) # Update memory usage accounting for gpu_idx in selected: @@ -301,3 +304,187 @@ def release(self, model_uid: str, devices: List[int]) -> None: # Remove model from memory tracking del self._model_memory_usage[model_uid] + + +class PackingFirstLaunchStrategy(LaunchStrategy): + """ + Prefer filling one GPU before moving to the next highest-available GPU. + Allows GPU reuse when requested replicas exceed available distinct GPUs. 
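+
+    Example, following the selection loop below: with candidates sorted by
+    available memory as [1, 0] and n_gpu=3, the selection is [1, 0, 0].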
+ """ + + def __init__( + self, + total_gpu_devices: List[int], + gpu_memory_info: Optional[Dict[int, Dict[str, Union[int, float]]]] = None, + ): + self._total_gpu_devices = total_gpu_devices + self._gpu_memory_info = gpu_memory_info or initialize_gpu_memory_info( + total_gpu_devices, logger=logger + ) + + def allocate( + self, + spec: LaunchModelSpec, + total_gpu_devices: List[int], + user_specified_allocated_devices: Set[int], + allocated_gpus: Dict[int, str], + ) -> List[int]: + candidates = [ + dev + for dev in total_gpu_devices + if dev not in allocated_gpus and dev not in user_specified_allocated_devices + ] + if not candidates: + raise RuntimeError("No available slot found for the model") + + for dev in candidates: + update_gpu_memory_info(self._gpu_memory_info, dev, logger=logger) + candidates.sort( + key=lambda d: self._gpu_memory_info.get(d, {}).get("available", 0), + reverse=True, + ) + + selected: List[int] = [] + idx = 0 + while len(selected) < spec.n_gpu: + chosen = candidates[min(idx, len(candidates) - 1)] + selected.append(chosen) + if idx < len(candidates) - 1: + idx += 1 + + return selected + + def release(self, model_uid: str, devices: List[int]) -> None: + # No internal accounting maintained here + return + + +class SpreadFirstLaunchStrategy(LaunchStrategy): + """ + Prefer spreading replicas across distinct GPUs before reusing any GPU. + Falls back to reuse when replicas exceed distinct GPUs. + """ + + def __init__( + self, + total_gpu_devices: List[int], + gpu_memory_info: Optional[Dict[int, Dict[str, Union[int, float]]]] = None, + ): + self._total_gpu_devices = total_gpu_devices + self._gpu_memory_info = gpu_memory_info or initialize_gpu_memory_info( + total_gpu_devices, logger=logger + ) + + def allocate( + self, + spec: LaunchModelSpec, + total_gpu_devices: List[int], + user_specified_allocated_devices: Set[int], + allocated_gpus: Dict[int, str], + ) -> List[int]: + candidates = [ + dev + for dev in total_gpu_devices + if dev not in allocated_gpus and dev not in user_specified_allocated_devices + ] + if not candidates: + raise RuntimeError("No available slot found for the model") + + for dev in candidates: + update_gpu_memory_info(self._gpu_memory_info, dev, logger=logger) + candidates.sort( + key=lambda d: self._gpu_memory_info.get(d, {}).get("available", 0), + reverse=True, + ) + + selected: List[int] = [] + idx = 0 + while len(selected) < spec.n_gpu: + chosen = candidates[idx % len(candidates)] + selected.append(chosen) + idx += 1 + + return selected + + def release(self, model_uid: str, devices: List[int]) -> None: + return + + +class QuotaAwareLaunchStrategy(LaunchStrategy): + """ + Restrict allocation to an allowed set of GPUs, then spread-first within that set. 
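+
+    The allowed set is typically taken from the XINFERENCE_LAUNCH_ALLOWED_GPUS
+    environment variable (e.g. XINFERENCE_LAUNCH_ALLOWED_GPUS=0,1); when it is
+    unset, every visible GPU is eligible.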
+ """ + + def __init__( + self, + total_gpu_devices: List[int], + allowed_devices: Optional[Set[int]] = None, + gpu_memory_info: Optional[Dict[int, Dict[str, Union[int, float]]]] = None, + ): + self._total_gpu_devices = total_gpu_devices + self._allowed_devices = allowed_devices + self._gpu_memory_info = gpu_memory_info or initialize_gpu_memory_info( + total_gpu_devices, logger=logger + ) + + def allocate( + self, + spec: LaunchModelSpec, + total_gpu_devices: List[int], + user_specified_allocated_devices: Set[int], + allocated_gpus: Dict[int, str], + ) -> List[int]: + device_pool = ( + [dev for dev in total_gpu_devices if dev in self._allowed_devices] + if self._allowed_devices is not None + else total_gpu_devices + ) + candidates = [ + dev + for dev in device_pool + if dev not in allocated_gpus and dev not in user_specified_allocated_devices + ] + if not candidates: + raise RuntimeError("No available slot found for the model") + + for dev in candidates: + update_gpu_memory_info(self._gpu_memory_info, dev, logger=logger) + candidates.sort( + key=lambda d: self._gpu_memory_info.get(d, {}).get("available", 0), + reverse=True, + ) + + selected: List[int] = [] + idx = 0 + while len(selected) < spec.n_gpu: + chosen = candidates[idx % len(candidates)] + selected.append(chosen) + idx += 1 + + return selected + + def release(self, model_uid: str, devices: List[int]) -> None: + return + + +def create_launch_strategy( + strategy_name: str, + total_gpu_devices: List[int], + allowed_devices: Optional[Set[int]] = None, + gpu_memory_info: Optional[Dict[int, Dict[str, Union[int, float]]]] = None, +) -> LaunchStrategy: + strategy_name = strategy_name.lower() + if strategy_name == "memory_aware": + return MemoryAwareLaunchStrategy(total_gpu_devices, gpu_memory_info) + if strategy_name == "packing_first": + return PackingFirstLaunchStrategy(total_gpu_devices, gpu_memory_info) + if strategy_name == "spread_first": + return SpreadFirstLaunchStrategy(total_gpu_devices, gpu_memory_info) + if strategy_name == "quota_aware": + return QuotaAwareLaunchStrategy( + total_gpu_devices, allowed_devices, gpu_memory_info + ) + logger.warning( + f"Unknown launch strategy '{strategy_name}', falling back to memory_aware" + ) + return MemoryAwareLaunchStrategy(total_gpu_devices, gpu_memory_info) diff --git a/xinference/core/tests/test_worker.py b/xinference/core/tests/test_worker.py index 7c0aa45960..3c6849d83a 100644 --- a/xinference/core/tests/test_worker.py +++ b/xinference/core/tests/test_worker.py @@ -121,8 +121,8 @@ async def test_allocate_cuda_devices(setup_pool): devices = await worker.allocate_devices(model_uid="mock_model_3", n_gpu=3) assert devices == [5, 6, 7] - with pytest.raises(RuntimeError): - await worker.allocate_devices(model_uid="mock_model_4", n_gpu=5) + devices = await worker.allocate_devices(model_uid="mock_model_4", n_gpu=5) + assert len(devices) == 5 @pytest.mark.asyncio diff --git a/xinference/core/worker.py b/xinference/core/worker.py index 8cb89af426..5380977a76 100644 --- a/xinference/core/worker.py +++ b/xinference/core/worker.py @@ -62,7 +62,7 @@ from ..utils import get_pip_config_args, get_real_path from .cache_tracker import CacheTrackerActor from .event import Event, EventCollectorActor, EventType -from .launch_strategy import MemoryAwareLaunchStrategy +from .launch_strategy import LaunchModelSpec, create_launch_strategy from .metrics import launch_metrics_export_server, record_metrics from .resource import gather_node_info from .status_guard import StatusGuardActor @@ -155,7 +155,16 @@ def 
__init__( self._model_uid_to_addr: Dict[str, str] = {} self._model_uid_to_recover_count: Dict[str, Optional[int]] = {} self._model_uid_to_launch_args: Dict[str, Dict] = {} - self._launch_strategy = MemoryAwareLaunchStrategy(self._total_gpu_devices) + from ..constants import ( + XINFERENCE_LAUNCH_ALLOWED_GPUS, + XINFERENCE_LAUNCH_STRATEGY, + ) + + self._launch_strategy = create_launch_strategy( + strategy_name=XINFERENCE_LAUNCH_STRATEGY, + total_gpu_devices=self._total_gpu_devices, + allowed_devices=XINFERENCE_LAUNCH_ALLOWED_GPUS, + ) if XINFERENCE_DISABLE_METRICS: logger.info( @@ -549,8 +558,6 @@ def _collect_user_specified_devices(self) -> Set[int]: return user_specified_allocated_devices def allocate_devices(self, model_uid: str, n_gpu: int) -> List[int]: - from .launch_strategy import LaunchModelSpec - spec = LaunchModelSpec(model_uid=model_uid, n_gpu=n_gpu) devices = self._launch_strategy.allocate( spec=spec, @@ -571,8 +578,6 @@ def allocate_devices_for_model( quantization: Optional[str], n_gpu: int = 1, ) -> List[int]: - from .launch_strategy import LaunchModelSpec - spec = LaunchModelSpec( model_uid=model_uid, n_gpu=n_gpu, From 06f7bb6e8313ae5036bd5687d2c7cee043370cac Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Fri, 21 Nov 2025 16:11:58 +0800 Subject: [PATCH 08/18] new launch_strategy --- xinference/core/launch_strategy.py | 35 +++++++++++++++--------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/xinference/core/launch_strategy.py b/xinference/core/launch_strategy.py index b046eb08e8..2169c55c03 100644 --- a/xinference/core/launch_strategy.py +++ b/xinference/core/launch_strategy.py @@ -15,7 +15,7 @@ import logging from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Dict, List, Optional, Set, Union +from typing import Dict, List, Optional, Set, Tuple, Union from ..device_utils import initialize_gpu_memory_info, update_gpu_memory_info @@ -213,27 +213,26 @@ def allocate( ] if estimated_memory_mb > 0: - # Try to find GPUs that can accommodate the model - suitable_gpus = [] + suitable_with_mem: List[Tuple[int, Union[int, float]]] = [] - # First, check completely available GPUs + # Include completely available GPUs first for gpu_idx in completely_available_gpus: if self._can_fit_model_on_gpu(gpu_idx, estimated_memory_mb): - suitable_gpus.append(gpu_idx) - - # If not enough completely available GPUs, check partially used GPUs - if len(suitable_gpus) < n_gpu: - for dev in total_gpu_devices: - if dev in allocated_gpus and dev not in suitable_gpus: - # This GPU is already allocated, check if it has space for another replica - if self._can_fit_model_on_gpu(dev, estimated_memory_mb): - suitable_gpus.append(dev) - - if len(suitable_gpus) >= n_gpu: - selected = suitable_gpus[:n_gpu] + available = self._gpu_memory_info[gpu_idx]["available"] + suitable_with_mem.append((gpu_idx, available)) + + # Check already allocated GPUs for possible reuse + for dev in total_gpu_devices: + if dev in allocated_gpus: + if self._can_fit_model_on_gpu(dev, estimated_memory_mb): + available = self._gpu_memory_info[dev]["available"] + suitable_with_mem.append((dev, available)) + + if suitable_with_mem: + suitable_with_mem.sort(key=lambda x: x[1], reverse=True) + selected = [dev for dev, _ in suitable_with_mem[:n_gpu]] else: - # Not enough GPUs with sufficient memory, but try anyway - # Use the GPU with most available memory + # Not enough GPUs with sufficient memory, pick the best GPU and reuse it best_gpu = 
self._get_gpu_with_most_available_memory() selected = [best_gpu] if n_gpu == 1 else [best_gpu] * n_gpu else: From 23ef82b2f9c354f967a51b7aafcfc1e5540740ec Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 8 Dec 2025 11:19:43 +0800 Subject: [PATCH 09/18] fix CI error --- xinference/core/launch_strategy.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/xinference/core/launch_strategy.py b/xinference/core/launch_strategy.py index 2169c55c03..b2135f04ce 100644 --- a/xinference/core/launch_strategy.py +++ b/xinference/core/launch_strategy.py @@ -212,6 +212,11 @@ def allocate( if dev not in allocated_gpus and dev not in user_specified_allocated_devices ] + # If all visible GPUs are already occupied (by allocated or user-specified), + # keep legacy behavior and fail fast instead of oversubscribing. + if len(completely_available_gpus) < n_gpu: + raise RuntimeError("No available slot found for the model") + if estimated_memory_mb > 0: suitable_with_mem: List[Tuple[int, Union[int, float]]] = [] From b0dd2ba29e439942c75ab62f105975674ba39a8c Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 8 Dec 2025 12:18:49 +0800 Subject: [PATCH 10/18] fix CI error --- xinference/core/launch_strategy.py | 54 ++++++++++++++---------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/xinference/core/launch_strategy.py b/xinference/core/launch_strategy.py index b2135f04ce..b5d00f2daf 100644 --- a/xinference/core/launch_strategy.py +++ b/xinference/core/launch_strategy.py @@ -212,9 +212,11 @@ def allocate( if dev not in allocated_gpus and dev not in user_specified_allocated_devices ] - # If all visible GPUs are already occupied (by allocated or user-specified), - # keep legacy behavior and fail fast instead of oversubscribing. - if len(completely_available_gpus) < n_gpu: + if not total_gpu_devices: + raise RuntimeError("No available slot found for the model") + + # If there are user-specified allocations and no fully free GPUs, keep legacy behavior and fail fast to avoid oversubscription. 
+ if len(completely_available_gpus) < n_gpu and user_specified_allocated_devices: raise RuntimeError("No available slot found for the model") if estimated_memory_mb > 0: @@ -245,39 +247,35 @@ def allocate( if len(completely_available_gpus) >= n_gpu: selected = completely_available_gpus[:n_gpu] else: - # For single GPU deployment without memory estimation, allow sharing - if n_gpu == 1 and total_gpu_devices: - # Use the first available GPU or the one with most available memory - if completely_available_gpus: - selected = [completely_available_gpus[0]] - else: - # No completely available GPU, find one with most available memory - best_gpu = self._get_gpu_with_most_available_memory() - selected = [best_gpu] - else: - # Use GPUs with most available memory - remaining_needed = n_gpu - len(completely_available_gpus) - candidate_gpus = [ - dev - for dev in total_gpu_devices - if dev not in completely_available_gpus - and dev not in allocated_gpus - ] - - gpu_memory_list = [] + # Use GPUs (including already allocated ones) ordered by available memory + candidate_gpus = [ + dev + for dev in total_gpu_devices + if dev not in completely_available_gpus + ] + + ordered_candidates: List[int] + gpu_memory_list = [] + if self._gpu_memory_info: for dev in candidate_gpus: update_gpu_memory_info( self._gpu_memory_info, dev, logger=logger ) available_memory = self._gpu_memory_info[dev]["available"] gpu_memory_list.append((dev, available_memory)) - - # Sort by available memory (descending) gpu_memory_list.sort(key=lambda x: x[1], reverse=True) + ordered_candidates = [dev for dev, _ in gpu_memory_list] + else: + ordered_candidates = candidate_gpus + + if not ordered_candidates: + raise RuntimeError("No available slot found for the model") - selected = completely_available_gpus.copy() - for dev, available_memory in gpu_memory_list[:remaining_needed]: - selected.append(dev) + selected = completely_available_gpus.copy() + idx = 0 + while len(selected) < n_gpu: + selected.append(ordered_candidates[idx % len(ordered_candidates)]) + idx += 1 if len(selected) < n_gpu: if not selected: From ed9fe0ea104daf230fb91010541351cdf438ddbc Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 8 Dec 2025 15:24:13 +0800 Subject: [PATCH 11/18] modify launch strategy --- xinference/constants.py | 2 +- xinference/core/launch_strategy.py | 604 ++++++++++----------------- xinference/core/tests/test_worker.py | 144 ++++--- xinference/core/worker.py | 44 +- 4 files changed, 328 insertions(+), 466 deletions(-) diff --git a/xinference/constants.py b/xinference/constants.py index cdc08a2f1e..6d9afd0e79 100644 --- a/xinference/constants.py +++ b/xinference/constants.py @@ -122,7 +122,7 @@ def get_xinference_home() -> str: XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA = bool( int(os.getenv(XINFERENCE_ENV_ENABLE_SINGLE_GPU_MULTI_REPLICA, "1")) ) # Enable by default -XINFERENCE_LAUNCH_STRATEGY = os.getenv(XINFERENCE_ENV_LAUNCH_STRATEGY, "memory_aware") +XINFERENCE_LAUNCH_STRATEGY = os.getenv(XINFERENCE_ENV_LAUNCH_STRATEGY, "local_first") _allowed_gpu_str = os.getenv(XINFERENCE_ENV_LAUNCH_ALLOWED_GPUS, "") XINFERENCE_LAUNCH_ALLOWED_GPUS = ( {int(x) for x in _allowed_gpu_str.split(",") if x.strip().isdigit()} diff --git a/xinference/core/launch_strategy.py b/xinference/core/launch_strategy.py index b5d00f2daf..4a93d5167c 100644 --- a/xinference/core/launch_strategy.py +++ b/xinference/core/launch_strategy.py @@ -15,9 +15,13 @@ import logging from abc import ABC, abstractmethod from dataclasses import dataclass -from typing 
import Dict, List, Optional, Set, Tuple, Union +from typing import Dict, List, Mapping, Optional, Set, Tuple, Union + +import torch from ..device_utils import initialize_gpu_memory_info, update_gpu_memory_info +from ..model.llm.llm_family import DEFAULT_CONTEXT_LENGTH +from ..model.llm.memory import estimate_llm_gpu_memory logger = logging.getLogger(__name__) @@ -43,430 +47,268 @@ def allocate( spec: LaunchModelSpec, total_gpu_devices: List[int], user_specified_allocated_devices: Set[int], - allocated_gpus: Dict[int, str], + allocated_gpus: Mapping[int, Set[str]], ) -> List[int]: - """ - Allocate GPUs for model launch - - Args: - spec: Model launch specification - total_gpu_devices: List of all available GPU indices - user_specified_allocated_devices: Set of user-specified allocated devices - allocated_gpus: Dictionary mapping GPU index to model UID - - Returns: - List of allocated GPU indices - """ + """Allocate GPUs for model launch""" pass @abstractmethod def release(self, model_uid: str, devices: List[int]) -> None: - """ - Release GPUs allocated for a model - - Args: - model_uid: Model identifier - devices: List of GPU indices to release - """ + """Release GPUs allocated for a model""" pass -class MemoryAwareLaunchStrategy(LaunchStrategy): - """Memory-aware GPU allocation strategy supporting single-GPU multi-replica""" +class LocalFirstLaunchStrategy(LaunchStrategy): + """ + Prefer the GPU running Xinference, otherwise keep allocating onto the emptiest + remaining GPU. + """ def __init__( self, total_gpu_devices: List[int], + allowed_devices: Optional[Set[int]] = None, gpu_memory_info: Optional[Dict[int, Dict[str, Union[int, float]]]] = None, ): - self._total_gpu_devices = total_gpu_devices - self._gpu_memory_info = gpu_memory_info or {} - self._model_memory_usage: Dict[str, int] = {} + self._allowed_devices = allowed_devices + self._total_gpu_devices = self._filter_allowed(total_gpu_devices) + self._gpu_memory_info = gpu_memory_info or initialize_gpu_memory_info( + self._total_gpu_devices, logger=logger + ) + self._model_memory_usage: Dict[str, Tuple[int, Dict[int, int]]] = {} + self._preferred_gpu = self._detect_preferred_gpu() - # Initialize memory tracking for all GPUs if not provided - if not self._gpu_memory_info: - self._initialize_gpu_memory_tracking() + def _filter_allowed(self, total_gpu_devices: List[int]) -> List[int]: + if self._allowed_devices is None: + return total_gpu_devices + return [dev for dev in total_gpu_devices if dev in self._allowed_devices] - def _initialize_gpu_memory_tracking(self): - """Initialize GPU memory tracking for all available GPUs""" + def _detect_preferred_gpu(self) -> Optional[int]: try: - from ..device_utils import initialize_gpu_memory_info - - self._gpu_memory_info = initialize_gpu_memory_info( - self._total_gpu_devices, logger=logger - ) - except Exception as e: - logger.warning(f"Failed to initialize GPU memory tracking: {e}") - # Fallback to basic tracking without actual memory info - for gpu_idx in self._total_gpu_devices: - self._gpu_memory_info[gpu_idx] = { - "total": 0, - "used": 0, - "available": 0, - } + if torch.cuda.is_available(): + gpu_idx = torch.cuda.current_device() + if gpu_idx in self._total_gpu_devices: + return gpu_idx + except Exception: + pass + return self._total_gpu_devices[0] if self._total_gpu_devices else None def _estimate_model_memory_usage( self, - model_name: str, - model_size: Union[int, str], + model_name: Optional[str], + model_size: Optional[Union[int, str]], model_format: Optional[str], quantization: 
Optional[str], ) -> int: - """Estimate memory usage for a model based on its characteristics""" - # Basic estimation logic - this can be enhanced with more sophisticated calculations - if isinstance(model_size, str): - # Convert string size like "7B" to integer - if "B" in model_size: - size_gb = float(model_size.replace("B", "")) - else: - size_gb = float(model_size) - else: - size_gb = float(model_size) - - # Base memory estimation (rough calculation) - base_memory_mb = int(size_gb * 1024 * 1.5) # 1.5GB per billion parameters - - # Adjust based on quantization - if quantization: - if "4bit" in quantization.lower() or "4-bit" in quantization.lower(): - base_memory_mb = base_memory_mb // 3 - elif "8bit" in quantization.lower() or "8-bit" in quantization.lower(): - base_memory_mb = base_memory_mb // 2 - - # Adjust based on model format - if model_format: - if "mlx" in model_format.lower(): - base_memory_mb = int( - base_memory_mb * 0.8 - ) # MLX is generally more memory efficient - - return max(base_memory_mb, 1024) # Minimum 1GB - - def _can_fit_model_on_gpu(self, gpu_idx: int, estimated_memory_mb: int) -> bool: - """Check if a model can fit on a specific GPU""" - # Update memory info for the GPU - update_gpu_memory_info(self._gpu_memory_info, gpu_idx, logger=logger) - - available_memory = self._gpu_memory_info[gpu_idx]["available"] - can_fit = estimated_memory_mb <= available_memory - - if can_fit: - logger.info( - f"Model can fit on GPU {gpu_idx}: needs {estimated_memory_mb}MB, has {available_memory}MB available" - ) - else: - logger.warning( - f"Model cannot fit on GPU {gpu_idx}: needs {estimated_memory_mb}MB, has {available_memory}MB available" - ) - - return can_fit + """Estimate memory usage using the documented cal-model-mem algorithm.""" + if model_size is None: + return 1024 - def _get_gpu_with_most_available_memory(self) -> int: - """Find the GPU with the most available memory""" - max_available_gpu = -1 - max_available_memory: Union[int, float] = -1 + def _normalize_size(size: Union[int, str]) -> str: + if isinstance(size, str): + normalized = size.strip().lower().rstrip("b") + return normalized if normalized else "0" + return str(size) - for gpu_idx in self._total_gpu_devices: - update_gpu_memory_info(self._gpu_memory_info, gpu_idx, logger=logger) - available_memory = self._gpu_memory_info[gpu_idx]["available"] + size_in_billions = _normalize_size(model_size) + model_format = model_format or "pytorch" - if available_memory > max_available_memory: - max_available_memory = available_memory - max_available_gpu = gpu_idx - - if max_available_gpu == -1: - raise RuntimeError("No suitable GPU found") - - return max_available_gpu - - def allocate( - self, - spec: LaunchModelSpec, - total_gpu_devices: List[int], - user_specified_allocated_devices: Set[int], - allocated_gpus: Dict[int, str], - ) -> List[int]: - """ - Allocate GPUs using memory-aware strategy - - Strategy: - 1. Prefer completely free GPUs - 2. If not enough, use GPUs with most available memory - 3. 
Support single-GPU multi-replica deployment - """ - model_uid = spec.model_uid - n_gpu = spec.n_gpu - - # Estimate model memory usage if model info is provided - estimated_memory_mb = 0 - if spec.model_name and spec.model_size: - estimated_memory_mb = self._estimate_model_memory_usage( - spec.model_name, spec.model_size, spec.model_format, spec.quantization + try: + mem_info = estimate_llm_gpu_memory( + model_size_in_billions=size_in_billions, + quantization=quantization, + context_length=DEFAULT_CONTEXT_LENGTH, + model_format=model_format, + model_name=model_name, + kv_cache_dtype=16, ) - self._model_memory_usage[model_uid] = estimated_memory_mb - - # Check for completely available GPUs first - completely_available_gpus = [ - dev - for dev in total_gpu_devices - if dev not in allocated_gpus and dev not in user_specified_allocated_devices - ] - - if not total_gpu_devices: - raise RuntimeError("No available slot found for the model") - - # If there are user-specified allocations and no fully free GPUs, keep legacy behavior and fail fast to avoid oversubscription. - if len(completely_available_gpus) < n_gpu and user_specified_allocated_devices: - raise RuntimeError("No available slot found for the model") - - if estimated_memory_mb > 0: - suitable_with_mem: List[Tuple[int, Union[int, float]]] = [] - - # Include completely available GPUs first - for gpu_idx in completely_available_gpus: - if self._can_fit_model_on_gpu(gpu_idx, estimated_memory_mb): - available = self._gpu_memory_info[gpu_idx]["available"] - suitable_with_mem.append((gpu_idx, available)) - - # Check already allocated GPUs for possible reuse - for dev in total_gpu_devices: - if dev in allocated_gpus: - if self._can_fit_model_on_gpu(dev, estimated_memory_mb): - available = self._gpu_memory_info[dev]["available"] - suitable_with_mem.append((dev, available)) - - if suitable_with_mem: - suitable_with_mem.sort(key=lambda x: x[1], reverse=True) - selected = [dev for dev, _ in suitable_with_mem[:n_gpu]] - else: - # Not enough GPUs with sufficient memory, pick the best GPU and reuse it - best_gpu = self._get_gpu_with_most_available_memory() - selected = [best_gpu] if n_gpu == 1 else [best_gpu] * n_gpu - else: - # No memory estimation available, use basic strategy - if len(completely_available_gpus) >= n_gpu: - selected = completely_available_gpus[:n_gpu] - else: - # Use GPUs (including already allocated ones) ordered by available memory - candidate_gpus = [ - dev - for dev in total_gpu_devices - if dev not in completely_available_gpus - ] - - ordered_candidates: List[int] - gpu_memory_list = [] - if self._gpu_memory_info: - for dev in candidate_gpus: - update_gpu_memory_info( - self._gpu_memory_info, dev, logger=logger - ) - available_memory = self._gpu_memory_info[dev]["available"] - gpu_memory_list.append((dev, available_memory)) - gpu_memory_list.sort(key=lambda x: x[1], reverse=True) - ordered_candidates = [dev for dev, _ in gpu_memory_list] - else: - ordered_candidates = candidate_gpus - - if not ordered_candidates: - raise RuntimeError("No available slot found for the model") - - selected = completely_available_gpus.copy() - idx = 0 - while len(selected) < n_gpu: - selected.append(ordered_candidates[idx % len(ordered_candidates)]) - idx += 1 - - if len(selected) < n_gpu: - if not selected: - best_gpu = self._get_gpu_with_most_available_memory() - selected.append(best_gpu) - fill_gpu = selected[0] - while len(selected) < n_gpu: - selected.append(fill_gpu) - - # Update memory usage accounting - for gpu_idx in selected: - if gpu_idx 
in self._gpu_memory_info and estimated_memory_mb > 0: - self._gpu_memory_info[gpu_idx]["used"] += estimated_memory_mb - self._gpu_memory_info[gpu_idx]["available"] -= estimated_memory_mb - - return selected - - def release(self, model_uid: str, devices: List[int]) -> None: - """Release allocated GPUs and roll back memory accounting""" - # Roll back memory usage accounting - if model_uid in self._model_memory_usage: - memory_used = self._model_memory_usage[model_uid] - for gpu_idx in devices: - if gpu_idx in self._gpu_memory_info: - # Roll back memory usage - self._gpu_memory_info[gpu_idx]["used"] -= memory_used - self._gpu_memory_info[gpu_idx]["available"] += memory_used - - # Remove model from memory tracking - del self._model_memory_usage[model_uid] - - -class PackingFirstLaunchStrategy(LaunchStrategy): - """ - Prefer filling one GPU before moving to the next highest-available GPU. - Allows GPU reuse when requested replicas exceed available distinct GPUs. - """ + if mem_info is None and model_name: + mem_info = estimate_llm_gpu_memory( + model_size_in_billions=size_in_billions, + quantization=quantization, + context_length=DEFAULT_CONTEXT_LENGTH, + model_format=model_format, + model_name=None, + kv_cache_dtype=16, + ) + if mem_info is not None: + return max(int(mem_info.total), 1024) + except Exception: + logger.debug("Failed to estimate memory via cal-model-mem", exc_info=True) + + # If estimation fails, keep minimal guard to avoid zero/negative allocation. + return 1024 + + def _has_capacity( + self, + gpu_idx: int, + estimated_memory_mb: int, + pending_gpu_counts: Dict[int, int], + allocated_gpus: Mapping[int, Set[str]], + ) -> bool: + if estimated_memory_mb <= 0: + return True - def __init__( + update_gpu_memory_info(self._gpu_memory_info, gpu_idx, logger=logger) + gpu_info = self._gpu_memory_info.get(gpu_idx, {}) + available = gpu_info.get("available", 0) + total = gpu_info.get("total", 0) + # If we cannot get valid memory info, assume capacity is available to avoid false negatives. 
+ if total == 0 and available == 0: + return True + planned_usage = ( + pending_gpu_counts.get(gpu_idx, 0) + len(allocated_gpus.get(gpu_idx, set())) + ) * estimated_memory_mb + return available - planned_usage >= estimated_memory_mb + + def _is_available( self, - total_gpu_devices: List[int], - gpu_memory_info: Optional[Dict[int, Dict[str, Union[int, float]]]] = None, - ): - self._total_gpu_devices = total_gpu_devices - self._gpu_memory_info = gpu_memory_info or initialize_gpu_memory_info( - total_gpu_devices, logger=logger + gpu_idx: int, + user_specified_allocated_devices: Set[int], + allocated_gpus: Mapping[int, Set[str]], + estimated_memory_mb: int, + pending_gpu_counts: Dict[int, int], + ) -> bool: + if gpu_idx in user_specified_allocated_devices: + return False + return self._has_capacity( + gpu_idx, estimated_memory_mb, pending_gpu_counts, allocated_gpus ) - def allocate( + def _select_emptiest_gpu( self, - spec: LaunchModelSpec, - total_gpu_devices: List[int], - user_specified_allocated_devices: Set[int], - allocated_gpus: Dict[int, str], - ) -> List[int]: - candidates = [ - dev - for dev in total_gpu_devices - if dev not in allocated_gpus and dev not in user_specified_allocated_devices - ] + candidates: List[int], + estimated_memory_mb: int, + pending_gpu_counts: Dict[int, int], + allocated_gpus: Mapping[int, Set[str]], + ) -> Optional[int]: if not candidates: - raise RuntimeError("No available slot found for the model") + return None + scored: List[Tuple[int, Union[int, float]]] = [] for dev in candidates: + if not self._has_capacity( + dev, estimated_memory_mb, pending_gpu_counts, allocated_gpus + ): + continue update_gpu_memory_info(self._gpu_memory_info, dev, logger=logger) - candidates.sort( - key=lambda d: self._gpu_memory_info.get(d, {}).get("available", 0), - reverse=True, - ) - - selected: List[int] = [] - idx = 0 - while len(selected) < spec.n_gpu: - chosen = candidates[min(idx, len(candidates) - 1)] - selected.append(chosen) - if idx < len(candidates) - 1: - idx += 1 - - return selected - - def release(self, model_uid: str, devices: List[int]) -> None: - # No internal accounting maintained here - return + available = self._gpu_memory_info.get(dev, {}).get("available", 0) + available -= ( + pending_gpu_counts.get(dev, 0) + len(allocated_gpus.get(dev, set())) + ) * estimated_memory_mb + scored.append((dev, available)) - -class SpreadFirstLaunchStrategy(LaunchStrategy): - """ - Prefer spreading replicas across distinct GPUs before reusing any GPU. - Falls back to reuse when replicas exceed distinct GPUs. 
- """ - - def __init__( - self, - total_gpu_devices: List[int], - gpu_memory_info: Optional[Dict[int, Dict[str, Union[int, float]]]] = None, - ): - self._total_gpu_devices = total_gpu_devices - self._gpu_memory_info = gpu_memory_info or initialize_gpu_memory_info( - total_gpu_devices, logger=logger - ) + scored.sort(key=lambda item: item[1], reverse=True) + return scored[0][0] if scored else None def allocate( self, spec: LaunchModelSpec, total_gpu_devices: List[int], user_specified_allocated_devices: Set[int], - allocated_gpus: Dict[int, str], + allocated_gpus: Mapping[int, Set[str]], ) -> List[int]: - candidates = [ - dev - for dev in total_gpu_devices - if dev not in allocated_gpus and dev not in user_specified_allocated_devices - ] - if not candidates: + available_total = self._filter_allowed(total_gpu_devices) + if not available_total: raise RuntimeError("No available slot found for the model") - for dev in candidates: - update_gpu_memory_info(self._gpu_memory_info, dev, logger=logger) - candidates.sort( - key=lambda d: self._gpu_memory_info.get(d, {}).get("available", 0), - reverse=True, + model_uid = spec.model_uid + n_gpu = spec.n_gpu + estimated_memory_mb = self._estimate_model_memory_usage( + spec.model_name, spec.model_size, spec.model_format, spec.quantization + ) + logger.info( + "Launch estimate for %s: %s MB (name=%s, size=%s, format=%s, quant=%s)", + spec.model_uid, + estimated_memory_mb, + spec.model_name, + spec.model_size, + spec.model_format, + spec.quantization, ) + pending_gpu_counts: Dict[int, int] = {} selected: List[int] = [] - idx = 0 - while len(selected) < spec.n_gpu: - chosen = candidates[idx % len(candidates)] - selected.append(chosen) - idx += 1 - - return selected - def release(self, model_uid: str, devices: List[int]) -> None: - return - - -class QuotaAwareLaunchStrategy(LaunchStrategy): - """ - Restrict allocation to an allowed set of GPUs, then spread-first within that set. 
- """ - - def __init__( - self, - total_gpu_devices: List[int], - allowed_devices: Optional[Set[int]] = None, - gpu_memory_info: Optional[Dict[int, Dict[str, Union[int, float]]]] = None, - ): - self._total_gpu_devices = total_gpu_devices - self._allowed_devices = allowed_devices - self._gpu_memory_info = gpu_memory_info or initialize_gpu_memory_info( - total_gpu_devices, logger=logger + preferred_gpu = ( + self._preferred_gpu + if self._preferred_gpu in available_total + else (available_total[0] if available_total else None) ) - def allocate( - self, - spec: LaunchModelSpec, - total_gpu_devices: List[int], - user_specified_allocated_devices: Set[int], - allocated_gpus: Dict[int, str], - ) -> List[int]: - device_pool = ( - [dev for dev in total_gpu_devices if dev in self._allowed_devices] - if self._allowed_devices is not None - else total_gpu_devices - ) - candidates = [ - dev - for dev in device_pool - if dev not in allocated_gpus and dev not in user_specified_allocated_devices - ] - if not candidates: - raise RuntimeError("No available slot found for the model") + if preferred_gpu is not None and self._is_available( + preferred_gpu, + user_specified_allocated_devices, + allocated_gpus, + estimated_memory_mb, + pending_gpu_counts, + ): + while len(selected) < n_gpu and self._is_available( + preferred_gpu, + user_specified_allocated_devices, + allocated_gpus, + estimated_memory_mb, + pending_gpu_counts, + ): + selected.append(preferred_gpu) + pending_gpu_counts[preferred_gpu] = ( + pending_gpu_counts.get(preferred_gpu, 0) + 1 + ) - for dev in candidates: - update_gpu_memory_info(self._gpu_memory_info, dev, logger=logger) - candidates.sort( - key=lambda d: self._gpu_memory_info.get(d, {}).get("available", 0), - reverse=True, - ) + if len(selected) < n_gpu: + candidate_pool = [ + dev + for dev in available_total + if dev != preferred_gpu and dev not in user_specified_allocated_devices + ] + emptiest_gpu = self._select_emptiest_gpu( + candidate_pool, estimated_memory_mb, pending_gpu_counts, allocated_gpus + ) + if emptiest_gpu is None: + raise RuntimeError("No available slot found for the model") + + while len(selected) < n_gpu and self._is_available( + emptiest_gpu, + user_specified_allocated_devices, + allocated_gpus, + estimated_memory_mb, + pending_gpu_counts, + ): + selected.append(emptiest_gpu) + pending_gpu_counts[emptiest_gpu] = ( + pending_gpu_counts.get(emptiest_gpu, 0) + 1 + ) - selected: List[int] = [] - idx = 0 - while len(selected) < spec.n_gpu: - chosen = candidates[idx % len(candidates)] - selected.append(chosen) - idx += 1 + if len(selected) < n_gpu: + raise RuntimeError("No available slot found for the model") + if estimated_memory_mb > 0: + for gpu_idx, count in pending_gpu_counts.items(): + if gpu_idx in self._gpu_memory_info: + self._gpu_memory_info[gpu_idx]["used"] += ( + estimated_memory_mb * count + ) + self._gpu_memory_info[gpu_idx]["available"] -= ( + estimated_memory_mb * count + ) + + self._model_memory_usage[model_uid] = (estimated_memory_mb, pending_gpu_counts) return selected def release(self, model_uid: str, devices: List[int]) -> None: - return + record = self._model_memory_usage.pop(model_uid, None) + if not record: + return + estimated_memory_mb, gpu_counts = record + if estimated_memory_mb <= 0: + return + + for gpu_idx, count in gpu_counts.items(): + if gpu_idx in self._gpu_memory_info: + self._gpu_memory_info[gpu_idx]["used"] -= estimated_memory_mb * count + self._gpu_memory_info[gpu_idx]["available"] += ( + estimated_memory_mb * count + ) def 
create_launch_strategy( @@ -475,18 +317,20 @@ def create_launch_strategy( allowed_devices: Optional[Set[int]] = None, gpu_memory_info: Optional[Dict[int, Dict[str, Union[int, float]]]] = None, ) -> LaunchStrategy: - strategy_name = strategy_name.lower() - if strategy_name == "memory_aware": - return MemoryAwareLaunchStrategy(total_gpu_devices, gpu_memory_info) - if strategy_name == "packing_first": - return PackingFirstLaunchStrategy(total_gpu_devices, gpu_memory_info) - if strategy_name == "spread_first": - return SpreadFirstLaunchStrategy(total_gpu_devices, gpu_memory_info) - if strategy_name == "quota_aware": - return QuotaAwareLaunchStrategy( - total_gpu_devices, allowed_devices, gpu_memory_info + normalized = strategy_name.lower() + supported = { + "local_first", + "memory_aware", + "packing_first", + "spread_first", + "quota_aware", + } + if normalized not in supported: + logger.warning( + f"Unknown launch strategy '{strategy_name}', falling back to local_first" ) - logger.warning( - f"Unknown launch strategy '{strategy_name}', falling back to memory_aware" + return LocalFirstLaunchStrategy( + total_gpu_devices, + allowed_devices=allowed_devices, + gpu_memory_info=gpu_memory_info, ) - return MemoryAwareLaunchStrategy(total_gpu_devices, gpu_memory_info) diff --git a/xinference/core/tests/test_worker.py b/xinference/core/tests/test_worker.py index 3c6849d83a..1cae693945 100644 --- a/xinference/core/tests/test_worker.py +++ b/xinference/core/tests/test_worker.py @@ -18,6 +18,7 @@ import xoscar as xo from xoscar import MainActorPoolType, create_actor_pool, get_pool_config +from ..launch_strategy import LocalFirstLaunchStrategy from ..launch_strategy import MemoryAwareLaunchStrategy from ..utils import merge_virtual_env_packages from ..worker import WorkerActor @@ -35,7 +36,7 @@ def __init__( idx: {"total": 24000.0, "used": 0.0, "available": 24000.0} for idx in cuda_devices } - self._launch_strategy = MemoryAwareLaunchStrategy( + self._launch_strategy = LocalFirstLaunchStrategy( cuda_devices, gpu_memory_info=gpu_memory_info ) @@ -115,14 +116,14 @@ async def test_allocate_cuda_devices(setup_pool): devices = await worker.allocate_devices(model_uid="mock_model_1", n_gpu=1) assert devices == [0] - devices = await worker.allocate_devices(model_uid="mock_model_2", n_gpu=4) - assert devices == [1, 2, 3, 4] + devices = await worker.allocate_devices(model_uid="mock_model_2", n_gpu=2) + assert devices == [0, 0] - devices = await worker.allocate_devices(model_uid="mock_model_3", n_gpu=3) - assert devices == [5, 6, 7] + devices = await worker.allocate_devices(model_uid="mock_model_3", n_gpu=1) + assert devices == [0] - devices = await worker.allocate_devices(model_uid="mock_model_4", n_gpu=5) - assert len(devices) == 5 + devices = await worker.allocate_devices(model_uid="mock_model_4", n_gpu=1) + assert devices == [0] @pytest.mark.asyncio @@ -148,7 +149,7 @@ async def test_terminate_model_flag(setup_pool): ) devices = await worker.allocate_devices(model_uid="model_model_3", n_gpu=3) - assert devices == [5, 6, 7] + assert devices == [0, 0, 0] await worker.release_devices(model_uid="model_model_3") await worker.launch_builtin_model( @@ -166,13 +167,17 @@ async def test_terminate_model_flag(setup_pool): assert len(pool_config["pools"]) == 3 # A main pool and 2 sub pools. 
gpu_to_model_id = await worker.get_gpu_to_model_uid() - for dev in devices: - assert "model_model_3" == gpu_to_model_id[dev] + model3_devices = [ + dev for dev, uids in gpu_to_model_id.items() if "model_model_3" in uids + ] + assert model3_devices + for dev in model3_devices: + assert "model_model_3" in gpu_to_model_id[dev] await worker.terminate_model("model_model_3") gpu_to_model_id = await worker.get_gpu_to_model_uid() - for dev in devices: - assert dev not in gpu_to_model_id + for dev in model3_devices: + assert "model_model_3" not in gpu_to_model_id.get(dev, set()) def test_merge_virtual_env_packages_override_and_append(): @@ -219,9 +224,9 @@ async def test_launch_embedding_model(setup_pool): ) embedding_info = await worker.get_gpu_to_embedding_model_uids() - assert 3 in embedding_info - assert len(embedding_info[3]) == 1 - assert "model_model_2" in embedding_info[3] + assert 1 in embedding_info + assert len(embedding_info[1]) == 1 + assert "model_model_2" in embedding_info[1] # test terminate LLM model, then launch embedding model await worker.terminate_model("model_model_1") @@ -237,7 +242,7 @@ async def test_launch_embedding_model(setup_pool): await worker.terminate_model("model_model_3") embedding_info = await worker.get_gpu_to_embedding_model_uids() assert len(embedding_info[0]) == 0 - assert len(embedding_info[3]) == 0 + assert len(embedding_info[1]) == 0 # test embedding device candidates 2 await worker.launch_builtin_model( @@ -251,24 +256,28 @@ async def test_launch_embedding_model(setup_pool): await worker.launch_builtin_model( "model_model_3", "mock_model_name", None, None, None, "embedding", n_gpu=1 ) + embedding_info = await worker.get_gpu_to_embedding_model_uids() + assert 1 in embedding_info assert 2 in embedding_info - assert 3 in embedding_info + assert len(embedding_info[1]) == 1 assert len(embedding_info[2]) == 1 - assert len(embedding_info[3]) == 1 - assert "model_model_2" in embedding_info[2] - assert "model_model_3" in embedding_info[3] + assert "model_model_2" in embedding_info[1] + assert "model_model_3" in embedding_info[2] await worker.launch_builtin_model( "model_model_4", "mock_model_name", None, None, None, "embedding", n_gpu=1 ) - assert len(embedding_info[2]) == 2 + embedding_info = await worker.get_gpu_to_embedding_model_uids() + assert len(embedding_info[1]) == 1 + assert len(embedding_info[2]) == 1 assert len(embedding_info[3]) == 1 - assert "model_model_2" in embedding_info[2] - assert "model_model_4" in embedding_info[2] - assert "model_model_3" in embedding_info[3] + assert "model_model_2" in embedding_info[1] + assert "model_model_3" in embedding_info[2] + assert "model_model_4" in embedding_info[3] for i in range(1, 5): await worker.terminate_model(f"model_model_{i}") + embedding_info = await worker.get_gpu_to_embedding_model_uids() assert len(embedding_info[2]) == 0 assert len(embedding_info[3]) == 0 @@ -277,10 +286,9 @@ async def test_launch_embedding_model(setup_pool): await worker.launch_builtin_model( f"model_model_{i}", "mock_model_name", None, None, None, n_gpu=1 ) - with pytest.raises(RuntimeError): - await worker.launch_builtin_model( - "model_model_5", "mock_model_name", None, None, None, "embedding", n_gpu=1 - ) + await worker.launch_builtin_model( + "model_model_5", "mock_model_name", None, None, None, "embedding", n_gpu=1 + ) # launch CPU would work await worker.launch_builtin_model( "model_model_5", "mock_model_name", None, None, None, "embedding", n_gpu=None @@ -309,15 +317,15 @@ async def 
test_launch_model_with_gpu_idx(setup_pool): "normal_model_model_1", "mock_model_name", None, None, None, "LLM", n_gpu=1 ) llm_info = await worker.get_gpu_to_model_uid() - assert len(llm_info) == 1 assert 0 in llm_info + assert "normal_model_model_1" in llm_info[0] await worker.launch_builtin_model( "model_model_2", "mock_model_name", None, None, None, "LLM", gpu_idx=[0] ) llm_info = await worker.get_gpu_to_model_uid() - assert len(llm_info) == 1 assert 0 in llm_info + assert "model_model_2" in llm_info[0] user_specified_info = await worker.get_user_specified_gpu_to_model_uids() assert len(user_specified_info) == 1 @@ -331,46 +339,55 @@ async def test_launch_model_with_gpu_idx(setup_pool): "vllm_model_model_3", "mock_model_name", None, None, None, "LLM", n_gpu=1 ) llm_info = await worker.get_gpu_to_model_uid() - assert len(llm_info) == 2 - assert 0 in llm_info - assert 1 in llm_info + vllm_gpu = next( + dev for dev, uids in llm_info.items() if "vllm_model_model_3" in uids + ) + assert vllm_gpu != 0 with pytest.raises(RuntimeError): await worker.launch_builtin_model( - "model_model_4", "mock_model_name", None, None, None, "LLM", gpu_idx=[1] + "model_model_4", + "mock_model_name", + None, + None, + None, + "LLM", + gpu_idx=[vllm_gpu], ) + target_gpu = next(dev for dev in [1, 2, 3] if dev != vllm_gpu) await worker.launch_builtin_model( - "model_model_4", "mock_model_name", None, None, None, "LLM", gpu_idx=[2] + "model_model_4", + "mock_model_name", + None, + None, + None, + "LLM", + gpu_idx=[target_gpu], ) llm_info = await worker.get_gpu_to_model_uid() - assert len(llm_info) == 2 - assert 0 in llm_info - assert 1 in llm_info + assert target_gpu in llm_info + assert "model_model_4" in llm_info[target_gpu] user_specified_info = await worker.get_user_specified_gpu_to_model_uids() assert len(user_specified_info) == 2 assert 0 in user_specified_info - assert 2 in user_specified_info - assert len(user_specified_info[2]) == 1 - assert list(user_specified_info[2])[0][0] == "model_model_4" - assert list(user_specified_info[2])[0][1] == "LLM" + assert target_gpu in user_specified_info + assert len(user_specified_info[target_gpu]) == 1 + assert list(user_specified_info[target_gpu])[0][0] == "model_model_4" + assert list(user_specified_info[target_gpu])[0][1] == "LLM" # then launch a LLM without gpu_idx await worker.launch_builtin_model( "normal_model_model_5", "mock_model_name", None, None, None, "LLM", n_gpu=1 ) llm_info = await worker.get_gpu_to_model_uid() - assert len(llm_info) == 3 assert 0 in llm_info - assert 1 in llm_info - assert 3 in llm_info # launch without gpu_idx again, error - with pytest.raises(RuntimeError): - await worker.launch_builtin_model( - "normal_model_model_6", "mock_model_name", None, None, None, "LLM", n_gpu=1 - ) + await worker.launch_builtin_model( + "normal_model_model_6", "mock_model_name", None, None, None, "LLM", n_gpu=1 + ) # test terminate and cleanup await worker.terminate_model("normal_model_model_1") @@ -378,6 +395,7 @@ async def test_launch_model_with_gpu_idx(setup_pool): await worker.terminate_model("vllm_model_model_3") await worker.terminate_model("model_model_4") await worker.terminate_model("normal_model_model_5") + await worker.terminate_model("normal_model_model_6") llm_info = await worker.get_gpu_to_model_uid() assert len(llm_info) == 0 @@ -407,19 +425,18 @@ async def test_launch_model_with_gpu_idx(setup_pool): assert list(user_specified_info[0])[0][1] == "LLM" # never choose gpu 0 again - with pytest.raises(RuntimeError): - await 
worker.launch_builtin_model( - "normal_mock_model_3", "mock_model_name", None, None, None, "LLM", n_gpu=4 - ) + devices = await worker.allocate_devices(model_uid="normal_mock_model_3", n_gpu=4) + assert all(dev != 0 for dev in devices) # should be on gpu 1 await worker.launch_builtin_model( "embedding_3", "mock_model_name", None, None, None, "embedding", n_gpu=1 ) # should be on gpu 0 - await worker.launch_builtin_model( - "rerank_4", "mock_model_name", None, None, None, "rerank", gpu_idx=[0] - ) + with pytest.raises(RuntimeError): + await worker.launch_builtin_model( + "rerank_4", "mock_model_name", None, None, None, "rerank", gpu_idx=[0] + ) # should be on gpu 2 await worker.launch_builtin_model( "embedding_5", "mock_model_name", None, None, None, "embedding", n_gpu=1 @@ -434,21 +451,18 @@ async def test_launch_model_with_gpu_idx(setup_pool): ) embedding_info = await worker.get_gpu_to_embedding_model_uids() user_specified_info = await worker.get_user_specified_gpu_to_model_uids() - assert "rerank_7" in embedding_info[1] + rerank7_gpu = next( + dev for dev, uids in embedding_info.items() if "rerank_7" in uids + ) + assert rerank7_gpu != 0 assert len(embedding_info[0]) == 1 - assert len(user_specified_info[0]) == 2 - assert len(embedding_info[1]) == 2 - assert len(user_specified_info[1]) == 0 - assert len(embedding_info[2]) == 1 - assert len(user_specified_info[2]) == 0 - assert len(embedding_info[3]) == 1 - assert len(user_specified_info[3]) == 0 + assert len(user_specified_info[0]) == 1 + assert len(user_specified_info[rerank7_gpu]) == 0 # cleanup await worker.terminate_model("embedding_1") await worker.terminate_model("vllm_mock_model_2") await worker.terminate_model("embedding_3") - await worker.terminate_model("rerank_4") await worker.terminate_model("embedding_5") await worker.terminate_model("rerank_6") await worker.terminate_model("rerank_7") diff --git a/xinference/core/worker.py b/xinference/core/worker.py index 5380977a76..1c97cc8523 100644 --- a/xinference/core/worker.py +++ b/xinference/core/worker.py @@ -146,7 +146,7 @@ def __init__( self._model_uid_to_model: Dict[str, xo.ActorRefType["ModelActor"]] = {} self._model_uid_to_model_spec: Dict[str, Dict[str, Any]] = {} self._model_uid_to_model_status: Dict[str, ModelStatus] = {} - self._gpu_to_model_uid: Dict[int, str] = {} + self._gpu_to_model_uid: Dict[int, Set[str]] = defaultdict(set) self._gpu_to_embedding_model_uids: Dict[int, Set[str]] = defaultdict(set) # Dict structure: gpu_index: {(replica_model_uid, model_type)} self._user_specified_gpu_to_model_uids: Dict[int, Set[Tuple[str, str]]] = ( @@ -504,10 +504,12 @@ async def allocate_devices_for_embedding(self, model_uid: str) -> int: else: # need to judge that whether to have vllm model on this device has_vllm_model = False if _dev in self._gpu_to_model_uid: - existing_model_uid = self._gpu_to_model_uid[_dev] - has_vllm_model = await self.is_model_vllm_backend( - existing_model_uid - ) + for existing_model_uid in self._gpu_to_model_uid[_dev]: + has_vllm_model = await self.is_model_vllm_backend( + existing_model_uid + ) + if has_vllm_model: + break if ( not has_vllm_model and _dev in self._user_specified_gpu_to_model_uids @@ -532,7 +534,7 @@ async def allocate_devices_for_embedding(self, model_uid: str) -> int: if _dev in self._gpu_to_embedding_model_uids: existing_cnt += len(self._gpu_to_embedding_model_uids[_dev]) if _dev in self._gpu_to_model_uid: - existing_cnt += 1 + existing_cnt += len(self._gpu_to_model_uid[_dev]) if _dev in self._user_specified_gpu_to_model_uids: 
existing_cnt += len(self._user_specified_gpu_to_model_uids[_dev]) if min_cnt == -1 or existing_cnt < min_cnt: @@ -566,7 +568,7 @@ def allocate_devices(self, model_uid: str, n_gpu: int) -> List[int]: allocated_gpus=self._gpu_to_model_uid, ) for dev in devices: - self._gpu_to_model_uid[int(dev)] = model_uid + self._gpu_to_model_uid[int(dev)].add(model_uid) return sorted(devices) def allocate_devices_for_model( @@ -593,7 +595,7 @@ def allocate_devices_for_model( allocated_gpus=self._gpu_to_model_uid, ) for dev in devices: - self._gpu_to_model_uid[int(dev)] = model_uid + self._gpu_to_model_uid[int(dev)].add(model_uid) return sorted(devices) async def allocate_devices_with_gpu_idx( @@ -612,14 +614,14 @@ async def allocate_devices_with_gpu_idx( for idx in gpu_idx: existing_model_uids = [] if idx in self._gpu_to_model_uid: - rep_uid = self._gpu_to_model_uid[idx] - is_vllm_model = await self.is_model_vllm_backend(rep_uid) - if is_vllm_model: - raise RuntimeError( - f"GPU index {idx} has been occupied with a vLLM model: {rep_uid}, " - f"therefore cannot allocate GPU memory for a new model." - ) - existing_model_uids.append(rep_uid) + for rep_uid in self._gpu_to_model_uid[idx]: + is_vllm_model = await self.is_model_vllm_backend(rep_uid) + if is_vllm_model: + raise RuntimeError( + f"GPU index {idx} has been occupied with a vLLM model: {rep_uid}, " + f"therefore cannot allocate GPU memory for a new model." + ) + existing_model_uids.append(rep_uid) if idx in self._gpu_to_embedding_model_uids: existing_model_uids.extend(self._gpu_to_embedding_model_uids[idx]) @@ -631,16 +633,18 @@ async def allocate_devices_with_gpu_idx( for idx in gpu_idx: self._user_specified_gpu_to_model_uids[idx].add((model_uid, model_type)) + self._gpu_to_model_uid[idx].add(model_uid) return sorted(gpu_idx) def release_devices(self, model_uid: str): devices = [ - dev - for dev in self._gpu_to_model_uid - if self._gpu_to_model_uid[dev] == model_uid + dev for dev, uids in self._gpu_to_model_uid.items() if model_uid in uids ] for dev in devices: - del self._gpu_to_model_uid[dev] + if model_uid in self._gpu_to_model_uid[dev]: + self._gpu_to_model_uid[dev].remove(model_uid) + if not self._gpu_to_model_uid[dev]: + del self._gpu_to_model_uid[dev] # check embedding for dev in self._gpu_to_embedding_model_uids: From 9421d6969a67e45fca3d6d0937c8aeaaec6abd39 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Thu, 11 Dec 2025 16:44:02 +0800 Subject: [PATCH 12/18] some modify --- .../zh_CN/LC_MESSAGES/user_guide/launch.po | 43 ++- doc/source/user_guide/launch.rst | 9 +- xinference/constants.py | 2 +- xinference/core/launch_strategy.py | 294 +++++------------- xinference/core/tests/test_worker.py | 12 +- xinference/core/worker.py | 54 +++- 6 files changed, 162 insertions(+), 252 deletions(-) diff --git a/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po b/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po index ff9199818a..cbf6c58381 100644 --- a/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po +++ b/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: Xinference \n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2025-10-20 16:28+0800\n" +"POT-Creation-Date: 2025-12-11 16:32+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language: zh_CN\n" @@ -52,10 +52,12 @@ msgstr "旧版本多实例部署:" #: ../../source/user_guide/launch.rst:19 msgid "" -"When you have multiple GPU cards, each capable of hosting one model " -"instance, 
you can set the number of instances equal to the number of " -"GPUs. For example:" -msgstr "当您拥有多张GPU显卡时,每张显卡可承载一个模型实例,此时可将实例数量设置为等于GPU数量。例如:" +"Before v1.15.0:When you have multiple GPU cards, each capable of hosting" +" one model instance, you can set the number of instances equal to the " +"number of GPUs. For example:" +msgstr "" +"在v1.15.0版本前:当您拥有多张GPU显卡时,每张显卡可承载一个模型实例,此时可将实例数量设置为" +"等于GPU数量。例如:" #: ../../source/user_guide/launch.rst:21 msgid "2 GPUs, 2 instances: Each GPU runs one model instance" @@ -122,10 +124,27 @@ msgid "Result: GPU0 runs 2 instances, GPU1 runs 1 instance" msgstr "结果: GPU0运行2个实例,GPU1运行1个实例" #: ../../source/user_guide/launch.rst:54 +msgid "GPU Allocation Strategy" +msgstr "GPU分配策略" + +#: ../../source/user_guide/launch.rst:56 +msgid "" +"The current strategy is *idle-first with a first round spread*: the " +"scheduler first tries to place one replica on each available GPU (always " +"picking the emptiest unused GPU). Once every GPU has at least one " +"replica, remaining replicas keep stacking onto the GPU that is currently " +"the emptiest (single-GPU multi-replica is allowed). Use " +"``XINFERENCE_LAUNCH_ALLOWED_GPUS`` to limit which GPUs can be chosen." +msgstr "" +"当前策略为 *空闲优先且首轮分散* :调度器首先尝试将每个副本分配至可用GPU(始终选择最空闲的未用GPU)。" +"当每块GPU至少承载一个副本后,剩余副本将持续堆叠至当前最空闲的GPU(允许单GPU承载多个副本)。" +"使用 ``XINFERENCE_LAUNCH_ALLOWED_GPUS`` 参数限制可选GPU范围。" + +#: ../../source/user_guide/launch.rst:59 msgid "Set Environment Variables" msgstr "设置环境变量" -#: ../../source/user_guide/launch.rst:58 +#: ../../source/user_guide/launch.rst:63 msgid "" "Sometimes, we want to specify environment variables for a particular " "model at runtime. Since v1.8.1, Xinference provides the capability to " @@ -135,21 +154,21 @@ msgstr "" "有时我们希望在运行时为特定模型指定环境变量。从 v1.8.1 开始,Xinference " "提供了单独配置环境变量的功能,无需在启动 Xinference 前设置。" -#: ../../source/user_guide/launch.rst:61 +#: ../../source/user_guide/launch.rst:66 msgid "For Web UI." msgstr "针对 Web UI。" -#: ../../source/user_guide/launch.rst:67 +#: ../../source/user_guide/launch.rst:72 msgid "" "When using the command line, use ``--env`` to specify an environment " "variable." msgstr "命令行使用时,使用 ``--env`` 指定环境变量。" -#: ../../source/user_guide/launch.rst:69 +#: ../../source/user_guide/launch.rst:74 msgid "Example usage:" msgstr "示例用法:" -#: ../../source/user_guide/launch.rst:75 +#: ../../source/user_guide/launch.rst:80 msgid "" "Take vLLM as an example: it has versions V1 and V0, and by default, it " "automatically determines which version to use. If you want to force the " @@ -160,11 +179,11 @@ msgstr "" "在加载模型时强制通过设置 ``VLLM_USE_V1=0`` 来使用 V0,可以指定该环境变量" "。" -#: ../../source/user_guide/launch.rst:79 +#: ../../source/user_guide/launch.rst:84 msgid "Configuring Model Virtual Environment" msgstr "配置模型虚拟空间" -#: ../../source/user_guide/launch.rst:83 +#: ../../source/user_guide/launch.rst:88 msgid "" "For this part, please refer to :ref:`toggling virtual environments and " "customizing dependencies `." diff --git a/doc/source/user_guide/launch.rst b/doc/source/user_guide/launch.rst index 062a63c47b..dd7b2ded6f 100644 --- a/doc/source/user_guide/launch.rst +++ b/doc/source/user_guide/launch.rst @@ -16,12 +16,12 @@ Meanwhile, users see it as a single model, which greatly improves overall resour Traditional Multi-Instance Deployment: -When you have multiple GPU cards, each capable of hosting one model instance, you can set the number of instances equal to the number of GPUs. 
For example: +Before v1.15.0:When you have multiple GPU cards, each capable of hosting one model instance, you can set the number of instances equal to the number of GPUs. For example: - 2 GPUs, 2 instances: Each GPU runs one model instance - 4 GPUs, 4 instances: Each GPU runs one model instance -.. versionadded:: v1.12.0 +.. versionadded:: v1.15.0 Introduce a new environment variable: @@ -50,6 +50,11 @@ Smart Allocation: Number of replicas may differ from GPU count; system intellige - Configuration: Replicas=3, GPUs=2 - Result: GPU0 runs 2 instances, GPU1 runs 1 instance +GPU Allocation Strategy +======================= + +The current strategy is *idle-first with a first round spread*: the scheduler first tries to place one replica on each available GPU (always picking the emptiest unused GPU). Once every GPU has at least one replica, remaining replicas keep stacking onto the GPU that is currently the emptiest (single-GPU multi-replica is allowed). Use ``XINFERENCE_LAUNCH_ALLOWED_GPUS`` to limit which GPUs can be chosen. + Set Environment Variables ========================= diff --git a/xinference/constants.py b/xinference/constants.py index 6d9afd0e79..cc712febab 100644 --- a/xinference/constants.py +++ b/xinference/constants.py @@ -122,7 +122,7 @@ def get_xinference_home() -> str: XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA = bool( int(os.getenv(XINFERENCE_ENV_ENABLE_SINGLE_GPU_MULTI_REPLICA, "1")) ) # Enable by default -XINFERENCE_LAUNCH_STRATEGY = os.getenv(XINFERENCE_ENV_LAUNCH_STRATEGY, "local_first") +XINFERENCE_LAUNCH_STRATEGY = os.getenv(XINFERENCE_ENV_LAUNCH_STRATEGY, "idle_first") _allowed_gpu_str = os.getenv(XINFERENCE_ENV_LAUNCH_ALLOWED_GPUS, "") XINFERENCE_LAUNCH_ALLOWED_GPUS = ( {int(x) for x in _allowed_gpu_str.split(",") if x.strip().isdigit()} diff --git a/xinference/core/launch_strategy.py b/xinference/core/launch_strategy.py index 4a93d5167c..e5b972c0a2 100644 --- a/xinference/core/launch_strategy.py +++ b/xinference/core/launch_strategy.py @@ -13,15 +13,11 @@ # limitations under the License. import logging -from abc import ABC, abstractmethod from dataclasses import dataclass from typing import Dict, List, Mapping, Optional, Set, Tuple, Union -import torch - from ..device_utils import initialize_gpu_memory_info, update_gpu_memory_info -from ..model.llm.llm_family import DEFAULT_CONTEXT_LENGTH -from ..model.llm.memory import estimate_llm_gpu_memory +from .utils import parse_replica_model_uid logger = logging.getLogger(__name__) @@ -38,27 +34,7 @@ class LaunchModelSpec: quantization: Optional[str] = None -class LaunchStrategy(ABC): - """Abstract base class for GPU allocation strategies""" - - @abstractmethod - def allocate( - self, - spec: LaunchModelSpec, - total_gpu_devices: List[int], - user_specified_allocated_devices: Set[int], - allocated_gpus: Mapping[int, Set[str]], - ) -> List[int]: - """Allocate GPUs for model launch""" - pass - - @abstractmethod - def release(self, model_uid: str, devices: List[int]) -> None: - """Release GPUs allocated for a model""" - pass - - -class LocalFirstLaunchStrategy(LaunchStrategy): +class IdleFirstLaunchStrategy: """ Prefer the GPU running Xinference, otherwise keep allocating onto the emptiest remaining GPU. 
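The allocation order documented above ("idle-first with a first round spread") can be pictured with a small, self-contained sketch. This is illustrative only and not the actual IdleFirstLaunchStrategy code from this patch; free_mem is an assumed helper that returns a GPU's currently free memory in MB.

# Minimal sketch of the documented placement order, under the assumptions above.
from typing import Callable, Dict, List


def plan_replica_placement(
    gpus: List[int], n_replicas: int, free_mem: Callable[[int], float]
) -> List[int]:
    placement: List[int] = []
    planned: Dict[int, int] = {g: 0 for g in gpus}
    for _ in range(n_replicas):
        # First round: spread one replica per GPU, picking the emptiest unused GPU.
        untouched = [g for g in gpus if planned[g] == 0]
        # Afterwards: keep stacking onto whichever GPU is currently the emptiest,
        # lightly penalizing GPUs that already received replicas in this plan.
        pool = untouched if untouched else gpus
        target = max(pool, key=lambda g: free_mem(g) - planned[g])
        placement.append(target)
        planned[target] += 1
    return placement


# Example matching the docs: 3 replicas on 2 GPUs -> each GPU gets one replica
# in the first round, then the third replica lands on the emptier GPU,
# e.g. [0, 1, 0] when GPU 0 has more free memory than GPU 1.
print(plan_replica_placement([0, 1], 3, free_mem=lambda g: {0: 20000, 1: 18000}[g]))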
@@ -69,116 +45,31 @@ def __init__( total_gpu_devices: List[int], allowed_devices: Optional[Set[int]] = None, gpu_memory_info: Optional[Dict[int, Dict[str, Union[int, float]]]] = None, + model_spread_used_gpus: Optional[Dict[str, Set[int]]] = None, + active_model_counts: Optional[Dict[str, int]] = None, ): self._allowed_devices = allowed_devices self._total_gpu_devices = self._filter_allowed(total_gpu_devices) self._gpu_memory_info = gpu_memory_info or initialize_gpu_memory_info( self._total_gpu_devices, logger=logger ) - self._model_memory_usage: Dict[str, Tuple[int, Dict[int, int]]] = {} - self._preferred_gpu = self._detect_preferred_gpu() + # Track which GPUs have been used in the first round for each model + self._model_spread_used_gpus: Dict[str, Set[int]] = ( + model_spread_used_gpus if model_spread_used_gpus is not None else {} + ) + # Track active replicas per base model to clean spread history + self._active_model_counts: Dict[str, int] = ( + active_model_counts if active_model_counts is not None else {} + ) def _filter_allowed(self, total_gpu_devices: List[int]) -> List[int]: if self._allowed_devices is None: return total_gpu_devices return [dev for dev in total_gpu_devices if dev in self._allowed_devices] - def _detect_preferred_gpu(self) -> Optional[int]: - try: - if torch.cuda.is_available(): - gpu_idx = torch.cuda.current_device() - if gpu_idx in self._total_gpu_devices: - return gpu_idx - except Exception: - pass - return self._total_gpu_devices[0] if self._total_gpu_devices else None - - def _estimate_model_memory_usage( - self, - model_name: Optional[str], - model_size: Optional[Union[int, str]], - model_format: Optional[str], - quantization: Optional[str], - ) -> int: - """Estimate memory usage using the documented cal-model-mem algorithm.""" - if model_size is None: - return 1024 - - def _normalize_size(size: Union[int, str]) -> str: - if isinstance(size, str): - normalized = size.strip().lower().rstrip("b") - return normalized if normalized else "0" - return str(size) - - size_in_billions = _normalize_size(model_size) - model_format = model_format or "pytorch" - - try: - mem_info = estimate_llm_gpu_memory( - model_size_in_billions=size_in_billions, - quantization=quantization, - context_length=DEFAULT_CONTEXT_LENGTH, - model_format=model_format, - model_name=model_name, - kv_cache_dtype=16, - ) - if mem_info is None and model_name: - mem_info = estimate_llm_gpu_memory( - model_size_in_billions=size_in_billions, - quantization=quantization, - context_length=DEFAULT_CONTEXT_LENGTH, - model_format=model_format, - model_name=None, - kv_cache_dtype=16, - ) - if mem_info is not None: - return max(int(mem_info.total), 1024) - except Exception: - logger.debug("Failed to estimate memory via cal-model-mem", exc_info=True) - - # If estimation fails, keep minimal guard to avoid zero/negative allocation. - return 1024 - - def _has_capacity( - self, - gpu_idx: int, - estimated_memory_mb: int, - pending_gpu_counts: Dict[int, int], - allocated_gpus: Mapping[int, Set[str]], - ) -> bool: - if estimated_memory_mb <= 0: - return True - - update_gpu_memory_info(self._gpu_memory_info, gpu_idx, logger=logger) - gpu_info = self._gpu_memory_info.get(gpu_idx, {}) - available = gpu_info.get("available", 0) - total = gpu_info.get("total", 0) - # If we cannot get valid memory info, assume capacity is available to avoid false negatives. 
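To make the capacity rule of the removed memory-aware path concrete, here is a minimal standalone sketch; the function and the numbers below are illustrative stand-ins, not the actor's real bookkeeping:

from typing import Set


def has_capacity(available_mb: float, estimated_mb: int, pending: int, allocated: Set[str]) -> bool:
    # Memory already promised to replicas that are planned (pending) or placed (allocated) on this GPU.
    planned_usage = (pending + len(allocated)) * estimated_mb
    # The GPU qualifies only if one more replica still fits after those promises.
    return available_mb - planned_usage >= estimated_mb


# 24 GB free, 8 GB per replica: with two replicas promised, 24 - 16 = 8 >= 8, so a third still fits.
assert has_capacity(24_000, 8_000, pending=1, allocated={"model-a"})
# Three promised replicas consume the full 24 GB, so the next one is rejected.
assert not has_capacity(24_000, 8_000, pending=2, allocated={"model-a"})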
- if total == 0 and available == 0: - return True - planned_usage = ( - pending_gpu_counts.get(gpu_idx, 0) + len(allocated_gpus.get(gpu_idx, set())) - ) * estimated_memory_mb - return available - planned_usage >= estimated_memory_mb - - def _is_available( - self, - gpu_idx: int, - user_specified_allocated_devices: Set[int], - allocated_gpus: Mapping[int, Set[str]], - estimated_memory_mb: int, - pending_gpu_counts: Dict[int, int], - ) -> bool: - if gpu_idx in user_specified_allocated_devices: - return False - return self._has_capacity( - gpu_idx, estimated_memory_mb, pending_gpu_counts, allocated_gpus - ) - def _select_emptiest_gpu( self, candidates: List[int], - estimated_memory_mb: int, pending_gpu_counts: Dict[int, int], allocated_gpus: Mapping[int, Set[str]], ) -> Optional[int]: @@ -187,16 +78,13 @@ def _select_emptiest_gpu( scored: List[Tuple[int, Union[int, float]]] = [] for dev in candidates: - if not self._has_capacity( - dev, estimated_memory_mb, pending_gpu_counts, allocated_gpus - ): - continue update_gpu_memory_info(self._gpu_memory_info, dev, logger=logger) available = self._gpu_memory_info.get(dev, {}).get("available", 0) - available -= ( - pending_gpu_counts.get(dev, 0) + len(allocated_gpus.get(dev, set())) - ) * estimated_memory_mb - scored.append((dev, available)) + # Penalize GPUs already planned/allocated to avoid stacking too early + penalty = pending_gpu_counts.get(dev, 0) + len( + allocated_gpus.get(dev, set()) + ) + scored.append((dev, available - penalty)) scored.sort(key=lambda item: item[1], reverse=True) return scored[0][0] if scored else None @@ -213,102 +101,67 @@ def allocate( raise RuntimeError("No available slot found for the model") model_uid = spec.model_uid + try: + base_model_uid, _ = parse_replica_model_uid(model_uid) + except Exception: + base_model_uid = model_uid + used_in_spread = self._model_spread_used_gpus.setdefault(base_model_uid, set()) n_gpu = spec.n_gpu - estimated_memory_mb = self._estimate_model_memory_usage( - spec.model_name, spec.model_size, spec.model_format, spec.quantization - ) - logger.info( - "Launch estimate for %s: %s MB (name=%s, size=%s, format=%s, quant=%s)", - spec.model_uid, - estimated_memory_mb, - spec.model_name, - spec.model_size, - spec.model_format, - spec.quantization, - ) pending_gpu_counts: Dict[int, int] = {} selected: List[int] = [] - preferred_gpu = ( - self._preferred_gpu - if self._preferred_gpu in available_total - else (available_total[0] if available_total else None) - ) - - if preferred_gpu is not None and self._is_available( - preferred_gpu, - user_specified_allocated_devices, - allocated_gpus, - estimated_memory_mb, - pending_gpu_counts, - ): - while len(selected) < n_gpu and self._is_available( - preferred_gpu, - user_specified_allocated_devices, - allocated_gpus, - estimated_memory_mb, - pending_gpu_counts, - ): - selected.append(preferred_gpu) - pending_gpu_counts[preferred_gpu] = ( - pending_gpu_counts.get(preferred_gpu, 0) + 1 - ) - - if len(selected) < n_gpu: - candidate_pool = [ - dev - for dev in available_total - if dev != preferred_gpu and dev not in user_specified_allocated_devices - ] + while len(selected) < n_gpu: + # If some GPUs haven't received a replica for this model yet, try them first + if len(used_in_spread) < len(available_total): + candidate_pool = [ + dev + for dev in available_total + if dev not in user_specified_allocated_devices + and dev not in used_in_spread + ] + if not candidate_pool: + candidate_pool = [ + dev + for dev in available_total + if dev not in 
user_specified_allocated_devices + ] + else: + candidate_pool = [ + dev + for dev in available_total + if dev not in user_specified_allocated_devices + ] emptiest_gpu = self._select_emptiest_gpu( - candidate_pool, estimated_memory_mb, pending_gpu_counts, allocated_gpus + candidate_pool, pending_gpu_counts, allocated_gpus ) if emptiest_gpu is None: raise RuntimeError("No available slot found for the model") - while len(selected) < n_gpu and self._is_available( - emptiest_gpu, - user_specified_allocated_devices, - allocated_gpus, - estimated_memory_mb, - pending_gpu_counts, - ): - selected.append(emptiest_gpu) - pending_gpu_counts[emptiest_gpu] = ( - pending_gpu_counts.get(emptiest_gpu, 0) + 1 - ) - - if len(selected) < n_gpu: - raise RuntimeError("No available slot found for the model") - - if estimated_memory_mb > 0: - for gpu_idx, count in pending_gpu_counts.items(): - if gpu_idx in self._gpu_memory_info: - self._gpu_memory_info[gpu_idx]["used"] += ( - estimated_memory_mb * count - ) - self._gpu_memory_info[gpu_idx]["available"] -= ( - estimated_memory_mb * count - ) + selected.append(emptiest_gpu) + pending_gpu_counts[emptiest_gpu] = ( + pending_gpu_counts.get(emptiest_gpu, 0) + 1 + ) + used_in_spread.add(emptiest_gpu) - self._model_memory_usage[model_uid] = (estimated_memory_mb, pending_gpu_counts) + # Persist spread history for this base model + self._model_spread_used_gpus[base_model_uid] = used_in_spread + self._active_model_counts[base_model_uid] = ( + self._active_model_counts.get(base_model_uid, 0) + 1 + ) return selected def release(self, model_uid: str, devices: List[int]) -> None: - record = self._model_memory_usage.pop(model_uid, None) - if not record: - return - estimated_memory_mb, gpu_counts = record - if estimated_memory_mb <= 0: - return - - for gpu_idx, count in gpu_counts.items(): - if gpu_idx in self._gpu_memory_info: - self._gpu_memory_info[gpu_idx]["used"] -= estimated_memory_mb * count - self._gpu_memory_info[gpu_idx]["available"] += ( - estimated_memory_mb * count - ) + try: + base_model_uid, _ = parse_replica_model_uid(model_uid) + except Exception: + base_model_uid = model_uid + count = self._active_model_counts.get(base_model_uid, 0) + if count <= 1: + self._active_model_counts.pop(base_model_uid, None) + self._model_spread_used_gpus.pop(base_model_uid, None) + else: + self._active_model_counts[base_model_uid] = count - 1 def create_launch_strategy( @@ -316,21 +169,18 @@ def create_launch_strategy( total_gpu_devices: List[int], allowed_devices: Optional[Set[int]] = None, gpu_memory_info: Optional[Dict[int, Dict[str, Union[int, float]]]] = None, -) -> LaunchStrategy: + model_spread_used_gpus: Optional[Dict[str, Set[int]]] = None, + active_model_counts: Optional[Dict[str, int]] = None, +) -> IdleFirstLaunchStrategy: normalized = strategy_name.lower() - supported = { - "local_first", - "memory_aware", - "packing_first", - "spread_first", - "quota_aware", - } - if normalized not in supported: + if normalized != "idle_first": logger.warning( - f"Unknown launch strategy '{strategy_name}', falling back to local_first" + f"Unknown launch strategy '{strategy_name}', falling back to idle_first" ) - return LocalFirstLaunchStrategy( + return IdleFirstLaunchStrategy( total_gpu_devices, allowed_devices=allowed_devices, gpu_memory_info=gpu_memory_info, + model_spread_used_gpus=model_spread_used_gpus, + active_model_counts=active_model_counts, ) diff --git a/xinference/core/tests/test_worker.py b/xinference/core/tests/test_worker.py index 1cae693945..82f9e9b38a 100644 --- 
a/xinference/core/tests/test_worker.py +++ b/xinference/core/tests/test_worker.py @@ -18,6 +18,7 @@ import xoscar as xo from xoscar import MainActorPoolType, create_actor_pool, get_pool_config +from ..launch_strategy import IdleFirstLaunchStrategy from ..launch_strategy import LocalFirstLaunchStrategy from ..launch_strategy import MemoryAwareLaunchStrategy from ..utils import merge_virtual_env_packages @@ -32,12 +33,17 @@ def __init__( cuda_devices: List[int], ): super().__init__(supervisor_address, main_pool, cuda_devices) - gpu_memory_info = { + self._test_gpu_memory_info = { idx: {"total": 24000.0, "used": 0.0, "available": 24000.0} for idx in cuda_devices } - self._launch_strategy = LocalFirstLaunchStrategy( - cuda_devices, gpu_memory_info=gpu_memory_info + + def _create_launch_strategy_instance(self): + return IdleFirstLaunchStrategy( + self._total_gpu_devices, + gpu_memory_info=self._test_gpu_memory_info, + model_spread_used_gpus=self._model_spread_used_gpus, + active_model_counts=self._active_model_counts, ) async def __post_create__(self): diff --git a/xinference/core/worker.py b/xinference/core/worker.py index 1c97cc8523..800a48562a 100644 --- a/xinference/core/worker.py +++ b/xinference/core/worker.py @@ -64,7 +64,7 @@ from .event import Event, EventCollectorActor, EventType from .launch_strategy import LaunchModelSpec, create_launch_strategy from .metrics import launch_metrics_export_server, record_metrics -from .resource import gather_node_info +from .resource import GPUStatus, gather_node_info from .status_guard import StatusGuardActor from .utils import ( log_async, @@ -155,16 +155,16 @@ def __init__( self._model_uid_to_addr: Dict[str, str] = {} self._model_uid_to_recover_count: Dict[str, Optional[int]] = {} self._model_uid_to_launch_args: Dict[str, Dict] = {} + # Share launch spread/replica counts across strategy instances + self._model_spread_used_gpus: Dict[str, Set[int]] = {} + self._active_model_counts: Dict[str, int] = {} from ..constants import ( XINFERENCE_LAUNCH_ALLOWED_GPUS, XINFERENCE_LAUNCH_STRATEGY, ) - self._launch_strategy = create_launch_strategy( - strategy_name=XINFERENCE_LAUNCH_STRATEGY, - total_gpu_devices=self._total_gpu_devices, - allowed_devices=XINFERENCE_LAUNCH_ALLOWED_GPUS, - ) + self._launch_strategy_name = XINFERENCE_LAUNCH_STRATEGY + self._launch_allowed_gpus = XINFERENCE_LAUNCH_ALLOWED_GPUS if XINFERENCE_DISABLE_METRICS: logger.info( @@ -559,9 +559,37 @@ def _collect_user_specified_devices(self) -> Set[int]: user_specified_allocated_devices.add(dev) return user_specified_allocated_devices + def _create_launch_strategy_instance(self): + # Try to seed strategy with current GPU memory snapshot from NVML + initial_gpu_memory_info: Optional[Dict[int, Dict[str, float]]] = None + try: + node_info = gather_node_info() + gpu_info: Dict[int, Dict[str, float]] = {} + for dev in self._total_gpu_devices: + status = node_info.get(f"gpu-{dev}") + if isinstance(status, GPUStatus): + gpu_info[dev] = { + "total": status.mem_total // (1024**2), + "used": status.mem_used // (1024**2), + "available": status.mem_free // (1024**2), + } + initial_gpu_memory_info = gpu_info or None + except Exception: + initial_gpu_memory_info = None + + return create_launch_strategy( + strategy_name=self._launch_strategy_name, + total_gpu_devices=self._total_gpu_devices, + allowed_devices=self._launch_allowed_gpus, + gpu_memory_info=initial_gpu_memory_info, + model_spread_used_gpus=self._model_spread_used_gpus, + active_model_counts=self._active_model_counts, + ) + def 
allocate_devices(self, model_uid: str, n_gpu: int) -> List[int]: spec = LaunchModelSpec(model_uid=model_uid, n_gpu=n_gpu) - devices = self._launch_strategy.allocate( + strategy = self._create_launch_strategy_instance() + devices = strategy.allocate( spec=spec, total_gpu_devices=self._total_gpu_devices, user_specified_allocated_devices=self._collect_user_specified_devices(), @@ -588,7 +616,8 @@ def allocate_devices_for_model( model_format=model_format, quantization=quantization, ) - devices = self._launch_strategy.allocate( + strategy = self._create_launch_strategy_instance() + devices = strategy.allocate( spec=spec, total_gpu_devices=self._total_gpu_devices, user_specified_allocated_devices=self._collect_user_specified_devices(), @@ -659,11 +688,12 @@ def release_devices(self, model_uid: str): self._user_specified_gpu_to_model_uids[dev], ) ) - for model_info in model_infos: - self._user_specified_gpu_to_model_uids[dev].remove(model_info) + for model_info in model_infos: + self._user_specified_gpu_to_model_uids[dev].remove(model_info) - # Use launch strategy to handle memory tracking rollback - self._launch_strategy.release(model_uid, devices) + # Keep strategy bookkeeping in sync for spread逻辑 + strategy = self._create_launch_strategy_instance() + strategy.release(model_uid, devices) async def _create_subpool( self, From 55c90b039ea61b65fa7ceabef7e72a9b1aa3e018 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Thu, 11 Dec 2025 17:42:43 +0800 Subject: [PATCH 13/18] rebase --- xinference/core/tests/test_worker.py | 60 +++++++++++++++++++--------- xinference/core/worker.py | 19 ++++----- 2 files changed, 52 insertions(+), 27 deletions(-) diff --git a/xinference/core/tests/test_worker.py b/xinference/core/tests/test_worker.py index 82f9e9b38a..0a1199e875 100644 --- a/xinference/core/tests/test_worker.py +++ b/xinference/core/tests/test_worker.py @@ -19,12 +19,36 @@ from xoscar import MainActorPoolType, create_actor_pool, get_pool_config from ..launch_strategy import IdleFirstLaunchStrategy -from ..launch_strategy import LocalFirstLaunchStrategy -from ..launch_strategy import MemoryAwareLaunchStrategy from ..utils import merge_virtual_env_packages from ..worker import WorkerActor +class DeterministicIdleFirstLaunchStrategy(IdleFirstLaunchStrategy): + def _select_emptiest_gpu( + self, + candidates, + pending_gpu_counts, + allocated_gpus, + ): + """ + Deterministic tie-breaking for tests so we do not rely on real GPU state. + """ + if not candidates: + return None + + scored = [] + for dev in candidates: + available = self._gpu_memory_info.get(dev, {}).get("available", 0) + penalty = pending_gpu_counts.get(dev, 0) + len( + allocated_gpus.get(dev, set()) + ) + scored.append((dev, available - penalty)) + + # Prefer higher available memory, then the lowest GPU index. 
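The deterministic selection used in these tests can be reproduced in isolation; a small sketch with made-up memory figures (the dictionaries below are hypothetical inputs, not real NVML readings):

from typing import Dict, List, Set


def pick_gpu(
    candidates: List[int],
    available_mb: Dict[int, float],
    pending: Dict[int, int],
    allocated: Dict[int, Set[str]],
) -> int:
    scored = []
    for dev in candidates:
        # Penalize GPUs that already carry planned or running models.
        penalty = pending.get(dev, 0) + len(allocated.get(dev, set()))
        scored.append((dev, available_mb.get(dev, 0.0) - penalty))
    # Highest remaining score wins; equal scores fall back to the lowest index.
    scored.sort(key=lambda item: (-item[1], item[0]))
    return scored[0][0]


mem = {0: 24_000.0, 1: 24_000.0, 2: 24_000.0}
# All GPUs look identical, so the lowest index is chosen first ...
assert pick_gpu([0, 1, 2], mem, {}, {}) == 0
# ... and once GPU 0 has a pending replica, the next pick moves on to GPU 1.
assert pick_gpu([0, 1, 2], mem, {0: 1}, {}) == 1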
+ scored.sort(key=lambda item: (-item[1], item[0])) + return scored[0][0] + + class MockWorkerActor(WorkerActor): def __init__( self, @@ -39,7 +63,7 @@ def __init__( } def _create_launch_strategy_instance(self): - return IdleFirstLaunchStrategy( + return DeterministicIdleFirstLaunchStrategy( self._total_gpu_devices, gpu_memory_info=self._test_gpu_memory_info, model_spread_used_gpus=self._model_spread_used_gpus, @@ -123,13 +147,13 @@ async def test_allocate_cuda_devices(setup_pool): assert devices == [0] devices = await worker.allocate_devices(model_uid="mock_model_2", n_gpu=2) - assert devices == [0, 0] + assert devices == [1, 2] devices = await worker.allocate_devices(model_uid="mock_model_3", n_gpu=1) - assert devices == [0] + assert devices == [3] devices = await worker.allocate_devices(model_uid="mock_model_4", n_gpu=1) - assert devices == [0] + assert devices == [4] @pytest.mark.asyncio @@ -155,7 +179,7 @@ async def test_terminate_model_flag(setup_pool): ) devices = await worker.allocate_devices(model_uid="model_model_3", n_gpu=3) - assert devices == [0, 0, 0] + assert devices == [5, 6, 7] await worker.release_devices(model_uid="model_model_3") await worker.launch_builtin_model( @@ -230,9 +254,9 @@ async def test_launch_embedding_model(setup_pool): ) embedding_info = await worker.get_gpu_to_embedding_model_uids() - assert 1 in embedding_info - assert len(embedding_info[1]) == 1 - assert "model_model_2" in embedding_info[1] + assert 3 in embedding_info + assert len(embedding_info[3]) == 1 + assert "model_model_2" in embedding_info[3] # test terminate LLM model, then launch embedding model await worker.terminate_model("model_model_1") @@ -263,23 +287,23 @@ async def test_launch_embedding_model(setup_pool): "model_model_3", "mock_model_name", None, None, None, "embedding", n_gpu=1 ) embedding_info = await worker.get_gpu_to_embedding_model_uids() - assert 1 in embedding_info assert 2 in embedding_info - assert len(embedding_info[1]) == 1 + assert 3 in embedding_info assert len(embedding_info[2]) == 1 - assert "model_model_2" in embedding_info[1] - assert "model_model_3" in embedding_info[2] + assert len(embedding_info[3]) == 1 + assert "model_model_2" in embedding_info[2] + assert "model_model_3" in embedding_info[3] await worker.launch_builtin_model( "model_model_4", "mock_model_name", None, None, None, "embedding", n_gpu=1 ) embedding_info = await worker.get_gpu_to_embedding_model_uids() - assert len(embedding_info[1]) == 1 + assert len(embedding_info[0]) == 1 assert len(embedding_info[2]) == 1 assert len(embedding_info[3]) == 1 - assert "model_model_2" in embedding_info[1] - assert "model_model_3" in embedding_info[2] - assert "model_model_4" in embedding_info[3] + assert "model_model_2" in embedding_info[2] + assert "model_model_3" in embedding_info[3] + assert "model_model_4" in embedding_info[0] for i in range(1, 5): await worker.terminate_model(f"model_model_{i}") diff --git a/xinference/core/worker.py b/xinference/core/worker.py index 800a48562a..5530b1017b 100644 --- a/xinference/core/worker.py +++ b/xinference/core/worker.py @@ -681,15 +681,16 @@ def release_devices(self, model_uid: str): self._gpu_to_embedding_model_uids[dev].remove(model_uid) # check user-specified slots - for dev in self._user_specified_gpu_to_model_uids: - model_infos = list( - filter( - lambda x: x[0] == model_uid, - self._user_specified_gpu_to_model_uids[dev], - ) - ) - for model_info in model_infos: - self._user_specified_gpu_to_model_uids[dev].remove(model_info) + for dev in 
list(self._user_specified_gpu_to_model_uids): + model_infos = [ + info + for info in self._user_specified_gpu_to_model_uids[dev] + if info[0] == model_uid + ] + for model_info in model_infos: + self._user_specified_gpu_to_model_uids[dev].remove(model_info) + if not self._user_specified_gpu_to_model_uids[dev]: + del self._user_specified_gpu_to_model_uids[dev] # Keep strategy bookkeeping in sync for spread逻辑 strategy = self._create_launch_strategy_instance() From 7f894448247ef184895dccb29eab2a8a625014aa Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Thu, 11 Dec 2025 17:45:42 +0800 Subject: [PATCH 14/18] CI fix --- .github/workflows/python.yaml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index c8a0728e0e..c3f9d43c65 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -73,20 +73,17 @@ jobs: strategy: fail-fast: false matrix: - os: [ "ubuntu-latest", "macos-13", "windows-latest" ] + os: [ "ubuntu-latest", "macos-latest", "windows-latest" ] python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ] module: [ "xinference" ] exclude: - - { os: macos-13, python-version: 3.10 } - - { os: macos-13, python-version: 3.11 } - - { os: macos-13, python-version: 3.12 } - - { os: macos-13, python-version: 3.13 } - { os: windows-latest, python-version: 3.10 } - { os: windows-latest, python-version: 3.11 } - { os: windows-latest, python-version: 3.12 } include: - { os: self-hosted, module: gpu, python-version: "3.11"} - { os: macos-latest, module: metal, python-version: "3.10" } + - { os: macos-latest, python-version: "3.9" } - { os: macos-latest, python-version: "3.13" } steps: From d973a54c78868b39dc4c6eaf61ec968f8d124004 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Thu, 11 Dec 2025 17:51:03 +0800 Subject: [PATCH 15/18] CI fix --- .github/workflows/python.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index c3f9d43c65..a7583eaa32 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -80,11 +80,12 @@ jobs: - { os: windows-latest, python-version: 3.10 } - { os: windows-latest, python-version: 3.11 } - { os: windows-latest, python-version: 3.12 } + - { os: macos-latest, python-version: 3.10 } + - { os: macos-latest, python-version: 3.11 } + - { os: macos-latest, python-version: 3.12 } include: - { os: self-hosted, module: gpu, python-version: "3.11"} - { os: macos-latest, module: metal, python-version: "3.10" } - - { os: macos-latest, python-version: "3.9" } - - { os: macos-latest, python-version: "3.13" } steps: - name: Check out code From 486131fff2bc2ef095e9c0819d4b2bf6aa1498f5 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Fri, 12 Dec 2025 15:20:05 +0800 Subject: [PATCH 16/18] new strategy --- xinference/core/launch_strategy.py | 84 +++++++++++---- xinference/core/supervisor.py | 3 + xinference/core/tests/test_worker.py | 11 +- xinference/core/worker.py | 147 ++++++++++++++++++--------- xinference/device_utils.py | 56 +--------- 5 files changed, 174 insertions(+), 127 deletions(-) diff --git a/xinference/core/launch_strategy.py b/xinference/core/launch_strategy.py index e5b972c0a2..16f0d85d6b 100644 --- a/xinference/core/launch_strategy.py +++ b/xinference/core/launch_strategy.py @@ -16,7 +16,7 @@ from dataclasses import dataclass from typing import Dict, List, Mapping, Optional, Set, Tuple, Union 
-from ..device_utils import initialize_gpu_memory_info, update_gpu_memory_info +from ..device_utils import update_gpu_memory_info from .utils import parse_replica_model_uid logger = logging.getLogger(__name__) @@ -34,12 +34,36 @@ class LaunchModelSpec: quantization: Optional[str] = None -class IdleFirstLaunchStrategy: +class LaunchStrategy: + """ + Base class for launch strategies. + Concrete implementations should override allocate/release/is_idle. + """ + + def allocate( + self, + spec: LaunchModelSpec, + total_gpu_devices: List[int], + user_specified_allocated_devices: Set[int], + allocated_gpus: Mapping[int, Set[str]], + ) -> List[int]: + raise NotImplementedError + + def release(self, model_uid: str, devices: List[int]) -> None: + raise NotImplementedError + + def is_idle(self) -> bool: + raise NotImplementedError + + +class IdleFirstLaunchStrategy(LaunchStrategy): """ Prefer the GPU running Xinference, otherwise keep allocating onto the emptiest remaining GPU. """ + _DEFAULT_BOOKED_MB = 1024 # logical reservation per replica + def __init__( self, total_gpu_devices: List[int], @@ -50,9 +74,9 @@ def __init__( ): self._allowed_devices = allowed_devices self._total_gpu_devices = self._filter_allowed(total_gpu_devices) - self._gpu_memory_info = gpu_memory_info or initialize_gpu_memory_info( - self._total_gpu_devices, logger=logger - ) + if gpu_memory_info is None: + raise ValueError("gpu_memory_info must be provided for launch strategy") + self._gpu_memory_info = gpu_memory_info # Track which GPUs have been used in the first round for each model self._model_spread_used_gpus: Dict[str, Set[int]] = ( model_spread_used_gpus if model_spread_used_gpus is not None else {} @@ -61,6 +85,8 @@ def __init__( self._active_model_counts: Dict[str, int] = ( active_model_counts if active_model_counts is not None else {} ) + # Logical reservations (MB) per GPU for this strategy's base model + self._reserved_memory_mb: Dict[int, float] = {} def _filter_allowed(self, total_gpu_devices: List[int]) -> List[int]: if self._allowed_devices is None: @@ -80,13 +106,15 @@ def _select_emptiest_gpu( for dev in candidates: update_gpu_memory_info(self._gpu_memory_info, dev, logger=logger) available = self._gpu_memory_info.get(dev, {}).get("available", 0) + # Deduct logical reservations to avoid stacking replicas too quickly + available -= self._reserved_memory_mb.get(dev, 0) # Penalize GPUs already planned/allocated to avoid stacking too early penalty = pending_gpu_counts.get(dev, 0) + len( allocated_gpus.get(dev, set()) ) scored.append((dev, available - penalty)) - scored.sort(key=lambda item: item[1], reverse=True) + scored.sort(key=lambda item: (-item[1], item[0])) return scored[0][0] if scored else None def allocate( @@ -112,20 +140,23 @@ def allocate( selected: List[int] = [] while len(selected) < n_gpu: - # If some GPUs haven't received a replica for this model yet, try them first - if len(used_in_spread) < len(available_total): + # Prefer truly idle GPUs first: those without existing allocations + unoccupied_gpus = [ + dev + for dev in available_total + if dev not in user_specified_allocated_devices + and not allocated_gpus.get(dev) + ] + spreading_phase = bool(unoccupied_gpus) and len(used_in_spread) < len( + unoccupied_gpus + ) + if spreading_phase: + # First round: try to place replicas on distinct, unoccupied GPUs candidate_pool = [ - dev - for dev in available_total - if dev not in user_specified_allocated_devices - and dev not in used_in_spread + dev for dev in unoccupied_gpus if dev not in used_in_spread 
] if not candidate_pool: - candidate_pool = [ - dev - for dev in available_total - if dev not in user_specified_allocated_devices - ] + candidate_pool = [dev for dev in unoccupied_gpus] else: candidate_pool = [ dev @@ -149,6 +180,11 @@ def allocate( self._active_model_counts[base_model_uid] = ( self._active_model_counts.get(base_model_uid, 0) + 1 ) + # Reserve logical memory for selected GPUs + for dev in selected: + self._reserved_memory_mb[dev] = ( + self._reserved_memory_mb.get(dev, 0.0) + self._DEFAULT_BOOKED_MB + ) return selected def release(self, model_uid: str, devices: List[int]) -> None: @@ -160,8 +196,22 @@ def release(self, model_uid: str, devices: List[int]) -> None: if count <= 1: self._active_model_counts.pop(base_model_uid, None) self._model_spread_used_gpus.pop(base_model_uid, None) + for dev in devices: + if dev in self._reserved_memory_mb: + self._reserved_memory_mb[dev] -= self._DEFAULT_BOOKED_MB + if self._reserved_memory_mb[dev] <= 0: + self._reserved_memory_mb.pop(dev, None) else: self._active_model_counts[base_model_uid] = count - 1 + for dev in devices: + if dev in self._reserved_memory_mb: + self._reserved_memory_mb[dev] -= self._DEFAULT_BOOKED_MB + if self._reserved_memory_mb[dev] <= 0: + self._reserved_memory_mb.pop(dev, None) + + def is_idle(self) -> bool: + """Return True when no active models are tracked by this strategy.""" + return not self._active_model_counts def create_launch_strategy( diff --git a/xinference/core/supervisor.py b/xinference/core/supervisor.py index d7e3e0a0ba..205accf5ff 100644 --- a/xinference/core/supervisor.py +++ b/xinference/core/supervisor.py @@ -1096,6 +1096,9 @@ async def _launch_one_model(worker_ref, _replica_model_uid, rank: int): model_type = model_type or "LLM" try: + # Ensure per-base-model launch strategy is ready on worker before concurrent launches + await worker_ref.ensure_launch_strategy(model_uid) + subpool_address = await worker_ref.launch_builtin_model( model_uid=_replica_model_uid, model_name=model_name, diff --git a/xinference/core/tests/test_worker.py b/xinference/core/tests/test_worker.py index 0a1199e875..15163ae1e2 100644 --- a/xinference/core/tests/test_worker.py +++ b/xinference/core/tests/test_worker.py @@ -44,7 +44,7 @@ def _select_emptiest_gpu( ) scored.append((dev, available - penalty)) - # Prefer higher available memory, then the lowest GPU index. + # Prefer higher available memory, then lowest GPU index. 
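The logical reservations can also be simulated on their own; a simplified sketch that only models the 1024 MB per-replica booking (it ignores the first-round spread over unoccupied GPUs and the pending/allocated penalty, and the memory figures are made up):

from typing import Dict, List

BOOKED_MB = 1024  # mirrors the strategy's per-replica logical reservation


def place_replicas(n: int, available_mb: Dict[int, float]) -> List[int]:
    reserved: Dict[int, float] = {}
    placed: List[int] = []
    for _ in range(n):
        # Effective free memory = reported free minus what earlier replicas booked;
        # ties break toward the lowest GPU index.
        best = max(
            available_mb,
            key=lambda dev: (available_mb[dev] - reserved.get(dev, 0.0), -dev),
        )
        placed.append(best)
        reserved[best] = reserved.get(best, 0.0) + BOOKED_MB
    return placed


# With equal free memory the bookings alternate the picks between both GPUs.
assert place_replicas(4, {0: 24_000.0, 1: 24_000.0}) == [0, 1, 0, 1]
# A busier GPU 0 only starts receiving replicas once GPU 1 has been booked down to its level.
assert place_replicas(3, {0: 22_000.0, 1: 24_000.0}) == [1, 1, 0]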
scored.sort(key=lambda item: (-item[1], item[0])) return scored[0][0] @@ -62,12 +62,13 @@ def __init__( for idx in cuda_devices } - def _create_launch_strategy_instance(self): + def _gather_initial_gpu_memory_info(self): + return self._test_gpu_memory_info + + def _create_launch_strategy_instance(self, gpu_memory_info=None): return DeterministicIdleFirstLaunchStrategy( self._total_gpu_devices, - gpu_memory_info=self._test_gpu_memory_info, - model_spread_used_gpus=self._model_spread_used_gpus, - active_model_counts=self._active_model_counts, + gpu_memory_info=gpu_memory_info or self._test_gpu_memory_info, ) async def __post_create__(self): diff --git a/xinference/core/worker.py b/xinference/core/worker.py index 5530b1017b..ec5051e28a 100644 --- a/xinference/core/worker.py +++ b/xinference/core/worker.py @@ -158,6 +158,10 @@ def __init__( # Share launch spread/replica counts across strategy instances self._model_spread_used_gpus: Dict[str, Set[int]] = {} self._active_model_counts: Dict[str, int] = {} + # Cached launch strategies per base model + self._launch_strategies: Dict[str, Any] = {} + # Protect concurrent allocations/releases so bookings stay consistent + self._allocation_lock = threading.Lock() from ..constants import ( XINFERENCE_LAUNCH_ALLOWED_GPUS, XINFERENCE_LAUNCH_STRATEGY, @@ -559,7 +563,7 @@ def _collect_user_specified_devices(self) -> Set[int]: user_specified_allocated_devices.add(dev) return user_specified_allocated_devices - def _create_launch_strategy_instance(self): + def _gather_initial_gpu_memory_info(self) -> Optional[Dict[int, Dict[str, float]]]: # Try to seed strategy with current GPU memory snapshot from NVML initial_gpu_memory_info: Optional[Dict[int, Dict[str, float]]] = None try: @@ -576,27 +580,64 @@ def _create_launch_strategy_instance(self): initial_gpu_memory_info = gpu_info or None except Exception: initial_gpu_memory_info = None + return initial_gpu_memory_info + def _create_launch_strategy_instance( + self, gpu_memory_info: Optional[Dict[int, Dict[str, float]]] = None + ): + if gpu_memory_info is None: + raise ValueError("gpu_memory_info is required to create launch strategy") return create_launch_strategy( strategy_name=self._launch_strategy_name, total_gpu_devices=self._total_gpu_devices, allowed_devices=self._launch_allowed_gpus, - gpu_memory_info=initial_gpu_memory_info, - model_spread_used_gpus=self._model_spread_used_gpus, - active_model_counts=self._active_model_counts, + gpu_memory_info=gpu_memory_info, + ) + + def _get_base_model_uid(self, model_uid: str) -> str: + try: + base_model_uid, _ = parse_replica_model_uid(model_uid) + return base_model_uid + except Exception: + return model_uid + + def _get_or_create_launch_strategy(self, model_uid: str): + base_model_uid = self._get_base_model_uid(model_uid) + strategy = self._launch_strategies.get(base_model_uid) + if strategy is not None: + return strategy + strategy = self._create_launch_strategy_instance( + gpu_memory_info=self._gather_initial_gpu_memory_info() ) + self._launch_strategies[base_model_uid] = strategy + return strategy + + def ensure_launch_strategy(self, model_uid: str): + """ + Ensure a launch strategy exists for the given base model. + This is intended to be triggered from supervisor before concurrent launches. 
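The motivation for preparing the strategy before concurrent replica launches is a plain get-or-create race; a minimal self-contained sketch of the lock-guarded pattern (the class and names below are illustrative, not the actor's real attributes):

import threading
from typing import Dict


class StrategyCache:
    """Minimal stand-in for the per-base-model strategy bookkeeping."""

    def __init__(self) -> None:
        self._lock = threading.Lock()
        self._strategies: Dict[str, object] = {}

    def ensure(self, base_model_uid: str, factory) -> object:
        # The lock ensures two replicas launched at the same time do not both
        # build a strategy and overwrite each other's bookings.
        with self._lock:
            if base_model_uid not in self._strategies:
                self._strategies[base_model_uid] = factory()
            return self._strategies[base_model_uid]


cache = StrategyCache()
first = cache.ensure("qwen-chat", dict)
second = cache.ensure("qwen-chat", dict)
assert first is second  # both replicas share one strategy instance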
+ """ + base_model_uid = self._get_base_model_uid(model_uid) + with self._allocation_lock: + if base_model_uid in self._launch_strategies: + return + strategy = self._create_launch_strategy_instance( + gpu_memory_info=self._gather_initial_gpu_memory_info() + ) + self._launch_strategies[base_model_uid] = strategy def allocate_devices(self, model_uid: str, n_gpu: int) -> List[int]: spec = LaunchModelSpec(model_uid=model_uid, n_gpu=n_gpu) - strategy = self._create_launch_strategy_instance() - devices = strategy.allocate( - spec=spec, - total_gpu_devices=self._total_gpu_devices, - user_specified_allocated_devices=self._collect_user_specified_devices(), - allocated_gpus=self._gpu_to_model_uid, - ) - for dev in devices: - self._gpu_to_model_uid[int(dev)].add(model_uid) + strategy = self._get_or_create_launch_strategy(model_uid) + with self._allocation_lock: + devices = strategy.allocate( + spec=spec, + total_gpu_devices=self._total_gpu_devices, + user_specified_allocated_devices=self._collect_user_specified_devices(), + allocated_gpus=self._gpu_to_model_uid, + ) + for dev in devices: + self._gpu_to_model_uid[int(dev)].add(model_uid) return sorted(devices) def allocate_devices_for_model( @@ -616,15 +657,16 @@ def allocate_devices_for_model( model_format=model_format, quantization=quantization, ) - strategy = self._create_launch_strategy_instance() - devices = strategy.allocate( - spec=spec, - total_gpu_devices=self._total_gpu_devices, - user_specified_allocated_devices=self._collect_user_specified_devices(), - allocated_gpus=self._gpu_to_model_uid, - ) - for dev in devices: - self._gpu_to_model_uid[int(dev)].add(model_uid) + strategy = self._get_or_create_launch_strategy(model_uid) + with self._allocation_lock: + devices = strategy.allocate( + spec=spec, + total_gpu_devices=self._total_gpu_devices, + user_specified_allocated_devices=self._collect_user_specified_devices(), + allocated_gpus=self._gpu_to_model_uid, + ) + for dev in devices: + self._gpu_to_model_uid[int(dev)].add(model_uid) return sorted(devices) async def allocate_devices_with_gpu_idx( @@ -666,35 +708,40 @@ async def allocate_devices_with_gpu_idx( return sorted(gpu_idx) def release_devices(self, model_uid: str): - devices = [ - dev for dev, uids in self._gpu_to_model_uid.items() if model_uid in uids - ] - for dev in devices: - if model_uid in self._gpu_to_model_uid[dev]: - self._gpu_to_model_uid[dev].remove(model_uid) - if not self._gpu_to_model_uid[dev]: - del self._gpu_to_model_uid[dev] - - # check embedding - for dev in self._gpu_to_embedding_model_uids: - if model_uid in self._gpu_to_embedding_model_uids[dev]: - self._gpu_to_embedding_model_uids[dev].remove(model_uid) - - # check user-specified slots - for dev in list(self._user_specified_gpu_to_model_uids): - model_infos = [ - info - for info in self._user_specified_gpu_to_model_uids[dev] - if info[0] == model_uid + base_model_uid = self._get_base_model_uid(model_uid) + strategy = self._launch_strategies.get(base_model_uid) + with self._allocation_lock: + devices = [ + dev for dev, uids in self._gpu_to_model_uid.items() if model_uid in uids ] - for model_info in model_infos: - self._user_specified_gpu_to_model_uids[dev].remove(model_info) - if not self._user_specified_gpu_to_model_uids[dev]: - del self._user_specified_gpu_to_model_uids[dev] - - # Keep strategy bookkeeping in sync for spread逻辑 - strategy = self._create_launch_strategy_instance() - strategy.release(model_uid, devices) + for dev in devices: + if model_uid in self._gpu_to_model_uid[dev]: + 
self._gpu_to_model_uid[dev].remove(model_uid) + if not self._gpu_to_model_uid[dev]: + del self._gpu_to_model_uid[dev] + + # check embedding + for dev in self._gpu_to_embedding_model_uids: + if model_uid in self._gpu_to_embedding_model_uids[dev]: + self._gpu_to_embedding_model_uids[dev].remove(model_uid) + + # check user-specified slots + for dev in list(self._user_specified_gpu_to_model_uids): + model_infos = [ + info + for info in self._user_specified_gpu_to_model_uids[dev] + if info[0] == model_uid + ] + for model_info in model_infos: + self._user_specified_gpu_to_model_uids[dev].remove(model_info) + if not self._user_specified_gpu_to_model_uids[dev]: + del self._user_specified_gpu_to_model_uids[dev] + + # Keep strategy bookkeeping in sync for spread逻辑 + if strategy is not None: + strategy.release(model_uid, devices) + if strategy.is_idle(): + self._launch_strategies.pop(base_model_uid, None) async def _create_subpool( self, diff --git a/xinference/device_utils.py b/xinference/device_utils.py index bcc3f8dc60..6c234bea47 100644 --- a/xinference/device_utils.py +++ b/xinference/device_utils.py @@ -14,7 +14,7 @@ import logging import os -from typing import Dict, List, Literal, Optional, Union +from typing import Dict, Literal, Optional, Union import torch @@ -196,60 +196,6 @@ def get_nvidia_gpu_info() -> Dict: pass -def initialize_gpu_memory_info( - gpu_indices: List[int], logger: Optional[logging.Logger] = None -) -> Dict[int, Dict[str, Union[int, float]]]: - """ - Initialize GPU memory information using NVML - - Args: - gpu_indices: List of GPU indices to initialize - logger: Optional logger instance - - Returns: - Dictionary mapping GPU index to memory info (total/used/available in MB) - """ - gpu_memory_info = {} - - try: - import pynvml - - pynvml.nvmlInit() - - for gpu_idx in gpu_indices: - handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx) - mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) - - gpu_memory_info[gpu_idx] = { - "total": mem_info.total // (1024**2), # Convert to MB - "used": mem_info.used // (1024**2), - "available": mem_info.free // (1024**2), - } - - except ImportError: - if logger: - logger.warning("pynvml not available, GPU memory tracking disabled") - # Fallback to basic tracking without actual memory info - for gpu_idx in gpu_indices: - gpu_memory_info[gpu_idx] = { - "total": 0, - "used": 0, - "available": 0, - } - except Exception as e: - if logger: - logger.error(f"Failed to initialize GPU memory info: {e}") - # Fallback to basic tracking - for gpu_idx in gpu_indices: - gpu_memory_info[gpu_idx] = { - "total": 0, - "used": 0, - "available": 0, - } - - return gpu_memory_info - - def update_gpu_memory_info( gpu_memory_info: Dict[int, Dict[str, Union[int, float]]], gpu_idx: int, From 48ad42c39bf634cbd377a552311c656345b326c2 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Fri, 12 Dec 2025 18:18:48 +0800 Subject: [PATCH 17/18] new strategy --- xinference/core/launch_strategy.py | 53 ++++++++----------- xinference/core/supervisor.py | 55 ++++++++++++++++++-- xinference/core/tests/test_worker.py | 49 ++++++++++++++++++ xinference/core/worker.py | 77 ++++++++++------------------ xinference/device_utils.py | 33 +----------- 5 files changed, 149 insertions(+), 118 deletions(-) diff --git a/xinference/core/launch_strategy.py b/xinference/core/launch_strategy.py index 16f0d85d6b..c2e7fa057c 100644 --- a/xinference/core/launch_strategy.py +++ b/xinference/core/launch_strategy.py @@ -16,7 +16,6 @@ from dataclasses import dataclass from typing import 
Dict, List, Mapping, Optional, Set, Tuple, Union -from ..device_utils import update_gpu_memory_info from .utils import parse_replica_model_uid logger = logging.getLogger(__name__) @@ -57,10 +56,7 @@ def is_idle(self) -> bool: class IdleFirstLaunchStrategy(LaunchStrategy): - """ - Prefer the GPU running Xinference, otherwise keep allocating onto the emptiest - remaining GPU. - """ + """Always place replicas onto the currently emptiest GPU.""" _DEFAULT_BOOKED_MB = 1024 # logical reservation per replica @@ -104,7 +100,6 @@ def _select_emptiest_gpu( scored: List[Tuple[int, Union[int, float]]] = [] for dev in candidates: - update_gpu_memory_info(self._gpu_memory_info, dev, logger=logger) available = self._gpu_memory_info.get(dev, {}).get("available", 0) # Deduct logical reservations to avoid stacking replicas too quickly available -= self._reserved_memory_mb.get(dev, 0) @@ -112,9 +107,22 @@ def _select_emptiest_gpu( penalty = pending_gpu_counts.get(dev, 0) + len( allocated_gpus.get(dev, set()) ) - scored.append((dev, available - penalty)) - - scored.sort(key=lambda item: (-item[1], item[0])) + score = available - penalty + scored.append((dev, score)) + + # If scores are infinite (heartbeat missing => infinite available), + # fall back to smallest reserved/penalty; tie-break by GPU index. + if any(val[1] == float("inf") for val in scored): + scored.sort( + key=lambda item: ( + self._reserved_memory_mb.get(item[0], 0.0) + + pending_gpu_counts.get(item[0], 0) + + len(allocated_gpus.get(item[0], set())), + item[0], + ) + ) + else: + scored.sort(key=lambda item: (-item[1], item[0])) return scored[0][0] if scored else None def allocate( @@ -133,36 +141,18 @@ def allocate( base_model_uid, _ = parse_replica_model_uid(model_uid) except Exception: base_model_uid = model_uid - used_in_spread = self._model_spread_used_gpus.setdefault(base_model_uid, set()) n_gpu = spec.n_gpu pending_gpu_counts: Dict[int, int] = {} selected: List[int] = [] while len(selected) < n_gpu: - # Prefer truly idle GPUs first: those without existing allocations - unoccupied_gpus = [ + # Always pick the emptiest eligible GPU (excludes user-specified ones) + candidate_pool = [ dev for dev in available_total if dev not in user_specified_allocated_devices - and not allocated_gpus.get(dev) ] - spreading_phase = bool(unoccupied_gpus) and len(used_in_spread) < len( - unoccupied_gpus - ) - if spreading_phase: - # First round: try to place replicas on distinct, unoccupied GPUs - candidate_pool = [ - dev for dev in unoccupied_gpus if dev not in used_in_spread - ] - if not candidate_pool: - candidate_pool = [dev for dev in unoccupied_gpus] - else: - candidate_pool = [ - dev - for dev in available_total - if dev not in user_specified_allocated_devices - ] emptiest_gpu = self._select_emptiest_gpu( candidate_pool, pending_gpu_counts, allocated_gpus ) @@ -173,10 +163,9 @@ def allocate( pending_gpu_counts[emptiest_gpu] = ( pending_gpu_counts.get(emptiest_gpu, 0) + 1 ) - used_in_spread.add(emptiest_gpu) - # Persist spread history for this base model - self._model_spread_used_gpus[base_model_uid] = used_in_spread + # Persist spread history for compatibility with release bookkeeping + self._model_spread_used_gpus.setdefault(base_model_uid, set()).update(selected) self._active_model_counts[base_model_uid] = ( self._active_model_counts.get(base_model_uid, 0) + 1 ) diff --git a/xinference/core/supervisor.py b/xinference/core/supervisor.py index 205accf5ff..f24d3d14ec 100644 --- a/xinference/core/supervisor.py +++ b/xinference/core/supervisor.py @@ -30,6 
+30,7 @@ List, Literal, Optional, + Set, Tuple, Type, Union, @@ -48,6 +49,7 @@ from ..core.status_guard import InstanceInfo, LaunchStatus from ..model.utils import get_engine_params_by_name from ..types import PeftModelConfig +from .launch_strategy import create_launch_strategy from .metrics import record_metrics from .resource import GPUStatus, ResourceStatus from .utils import ( @@ -899,6 +901,44 @@ def _get_worker_refs_by_ip(self, ip: str) -> List[xo.ActorRefType["WorkerActor"] ) return refs + def _build_gpu_memory_info( + self, worker_ref + ) -> Optional[Dict[int, Dict[str, float]]]: + """Use latest heartbeat data for GPU memory snapshot.""" + worker_status = self._worker_status.get(worker_ref.address) + if worker_status is None: + return None + gpu_info: Dict[int, Dict[str, float]] = {} + for dev, status in worker_status.status.items(): + if isinstance(status, GPUStatus) and str(dev).startswith("gpu-"): + try: + idx = int(str(dev).split("-", 1)[1]) + except Exception: + continue + gpu_info[idx] = { + "total": status.mem_total // (1024**2), + "used": status.mem_used // (1024**2), + "available": status.mem_free // (1024**2), + } + return gpu_info or None + + async def _install_strategy_on_worker(self, model_uid: str, worker_ref) -> None: + ctx = await worker_ref.get_launch_strategy_context() + gpu_memory_info = self._build_gpu_memory_info(worker_ref) + if gpu_memory_info is None: + # Heartbeat disabled or missing: assume all visible GPUs are available with "infinite" mem + gpu_memory_info = { + dev: {"total": float("inf"), "used": 0.0, "available": float("inf")} + for dev in ctx["total_gpu_devices"] + } + strategy = create_launch_strategy( + strategy_name=ctx["launch_strategy_name"], + total_gpu_devices=ctx["total_gpu_devices"], + allowed_devices=ctx["allowed_devices"], + gpu_memory_info=gpu_memory_info, + ) + await worker_ref.install_launch_strategy(model_uid, strategy) + @log_async(logger=logger) async def launch_builtin_model( self, @@ -1096,9 +1136,6 @@ async def _launch_one_model(worker_ref, _replica_model_uid, rank: int): model_type = model_type or "LLM" try: - # Ensure per-base-model launch strategy is ready on worker before concurrent launches - await worker_ref.ensure_launch_strategy(model_uid) - subpool_address = await worker_ref.launch_builtin_model( model_uid=_replica_model_uid, model_name=model_name, @@ -1140,6 +1177,7 @@ async def _launch_model(): try: # Pre-fetch worker loads for balanced scheduling worker_candidates = [] + prepared_workers: Set[str] = set() if target_worker_refs: workers = target_worker_refs @@ -1188,6 +1226,11 @@ async def _launch_model(): _idx ].append(worker_ref) + # Prepare launch strategy per worker once before launching replicas + if worker_ref.address not in prepared_workers: + await self._install_strategy_on_worker(model_uid, worker_ref) + prepared_workers.add(worker_ref.address) + if enable_xavier and _idx == 0: """ Start the rank 0 model actor on the worker that holds the rank 1 replica, @@ -1359,6 +1402,7 @@ async def _launch_model(): "n_worker cannot be larger than the number of available workers." 
) try: + prepared_workers: Set[str] = set() for _idx, rep_model_uid in enumerate( iter_replica_model_uid(model_uid, replica) ): @@ -1375,6 +1419,11 @@ async def _launch_model(): ].replica_to_worker_refs[_idx].append(worker_ref) nonlocal model_type model_type = model_type or "LLM" + if worker_ref.address not in prepared_workers: + await self._install_strategy_on_worker( + model_uid, worker_ref + ) + prepared_workers.add(worker_ref.address) if i_worker > 1: assert ( driver_info is not None diff --git a/xinference/core/tests/test_worker.py b/xinference/core/tests/test_worker.py index 15163ae1e2..2e44570436 100644 --- a/xinference/core/tests/test_worker.py +++ b/xinference/core/tests/test_worker.py @@ -49,6 +49,20 @@ def _select_emptiest_gpu( return scored[0][0] +async def install_strategy_for_worker(worker, model_uid: str): + """ + Simulate supervisor-side strategy preparation for tests. + """ + ctx = await worker.get_launch_strategy_context() + gpu_memory_info = await worker.get_test_gpu_memory_info() + strategy = DeterministicIdleFirstLaunchStrategy( + ctx["total_gpu_devices"], + allowed_devices=ctx["allowed_devices"], + gpu_memory_info=gpu_memory_info, + ) + await worker.install_launch_strategy(model_uid, strategy) + + class MockWorkerActor(WorkerActor): def __init__( self, @@ -86,6 +100,9 @@ def get_gpu_to_embedding_model_uids(self): def get_user_specified_gpu_to_model_uids(self): return self._user_specified_gpu_to_model_uids + def get_test_gpu_memory_info(self): + return self._test_gpu_memory_info + async def is_model_vllm_backend(self, model_uid): if model_uid.startswith("normal_"): return False @@ -144,15 +161,19 @@ async def test_allocate_cuda_devices(setup_pool): cuda_devices=[i for i in range(8)], ) + await install_strategy_for_worker(worker, "mock_model_1") devices = await worker.allocate_devices(model_uid="mock_model_1", n_gpu=1) assert devices == [0] + await install_strategy_for_worker(worker, "mock_model_2") devices = await worker.allocate_devices(model_uid="mock_model_2", n_gpu=2) assert devices == [1, 2] + await install_strategy_for_worker(worker, "mock_model_3") devices = await worker.allocate_devices(model_uid="mock_model_3", n_gpu=1) assert devices == [3] + await install_strategy_for_worker(worker, "mock_model_4") devices = await worker.allocate_devices(model_uid="mock_model_4", n_gpu=1) assert devices == [4] @@ -171,18 +192,23 @@ async def test_terminate_model_flag(setup_pool): cuda_devices=[i for i in range(8)], ) + await install_strategy_for_worker(worker, "model_model_1") await worker.launch_builtin_model( "model_model_1", "mock_model_name", None, None, None, n_gpu=1 ) + await install_strategy_for_worker(worker, "model_model_2") await worker.launch_builtin_model( "model_model_2", "mock_model_name", None, None, None, n_gpu=4 ) + await install_strategy_for_worker(worker, "model_model_3") devices = await worker.allocate_devices(model_uid="model_model_3", n_gpu=3) assert devices == [5, 6, 7] await worker.release_devices(model_uid="model_model_3") + # ensure strategy is ready before relaunch + await install_strategy_for_worker(worker, "model_model_3") await worker.launch_builtin_model( "model_model_3", "mock_model_name", None, None, None, n_gpu=3 ) @@ -244,12 +270,14 @@ async def test_launch_embedding_model(setup_pool): ) # test embedding device candidates 1 + await install_strategy_for_worker(worker, "model_model_1") await worker.launch_builtin_model( "model_model_1", "mock_model_name", None, None, None, n_gpu=3 ) embedding_info = await 
worker.get_gpu_to_embedding_model_uids() assert len(embedding_info) == 0 + await install_strategy_for_worker(worker, "model_model_2") await worker.launch_builtin_model( "model_model_2", "mock_model_name", None, None, None, "embedding", n_gpu=1 ) @@ -261,6 +289,7 @@ async def test_launch_embedding_model(setup_pool): # test terminate LLM model, then launch embedding model await worker.terminate_model("model_model_1") + await install_strategy_for_worker(worker, "model_model_3") await worker.launch_builtin_model( "model_model_3", "mock_model_name", None, None, None, "embedding", n_gpu=1 ) @@ -276,14 +305,17 @@ async def test_launch_embedding_model(setup_pool): assert len(embedding_info[1]) == 0 # test embedding device candidates 2 + await install_strategy_for_worker(worker, "model_model_1") await worker.launch_builtin_model( "model_model_1", "mock_model_name", None, None, None, n_gpu=2 ) + await install_strategy_for_worker(worker, "model_model_2") await worker.launch_builtin_model( "model_model_2", "mock_model_name", None, None, None, "embedding", n_gpu=1 ) + await install_strategy_for_worker(worker, "model_model_3") await worker.launch_builtin_model( "model_model_3", "mock_model_name", None, None, None, "embedding", n_gpu=1 ) @@ -295,6 +327,7 @@ async def test_launch_embedding_model(setup_pool): assert "model_model_2" in embedding_info[2] assert "model_model_3" in embedding_info[3] + await install_strategy_for_worker(worker, "model_model_4") await worker.launch_builtin_model( "model_model_4", "mock_model_name", None, None, None, "embedding", n_gpu=1 ) @@ -314,9 +347,11 @@ async def test_launch_embedding_model(setup_pool): # test no slots for i in range(1, 5): + await install_strategy_for_worker(worker, f"model_model_{i}") await worker.launch_builtin_model( f"model_model_{i}", "mock_model_name", None, None, None, n_gpu=1 ) + await install_strategy_for_worker(worker, "model_model_5") await worker.launch_builtin_model( "model_model_5", "mock_model_name", None, None, None, "embedding", n_gpu=1 ) @@ -344,6 +379,7 @@ async def test_launch_model_with_gpu_idx(setup_pool): assert (await xo.actor_ref(addr, WorkerActor.default_uid())).uid == b"worker" # test normal model + await install_strategy_for_worker(worker, "normal_model_model_1") await worker.launch_builtin_model( "normal_model_model_1", "mock_model_name", None, None, None, "LLM", n_gpu=1 ) @@ -351,6 +387,7 @@ async def test_launch_model_with_gpu_idx(setup_pool): assert 0 in llm_info assert "normal_model_model_1" in llm_info[0] + await install_strategy_for_worker(worker, "model_model_2") await worker.launch_builtin_model( "model_model_2", "mock_model_name", None, None, None, "LLM", gpu_idx=[0] ) @@ -366,6 +403,7 @@ async def test_launch_model_with_gpu_idx(setup_pool): assert list(user_specified_info[0])[0][1] == "LLM" # test vllm model + await install_strategy_for_worker(worker, "vllm_model_model_3") await worker.launch_builtin_model( "vllm_model_model_3", "mock_model_name", None, None, None, "LLM", n_gpu=1 ) @@ -387,6 +425,7 @@ async def test_launch_model_with_gpu_idx(setup_pool): ) target_gpu = next(dev for dev in [1, 2, 3] if dev != vllm_gpu) + await install_strategy_for_worker(worker, "model_model_4") await worker.launch_builtin_model( "model_model_4", "mock_model_name", @@ -409,6 +448,7 @@ async def test_launch_model_with_gpu_idx(setup_pool): assert list(user_specified_info[target_gpu])[0][1] == "LLM" # then launch a LLM without gpu_idx + await install_strategy_for_worker(worker, "normal_model_model_5") await worker.launch_builtin_model( 
"normal_model_model_5", "mock_model_name", None, None, None, "LLM", n_gpu=1 ) @@ -416,6 +456,7 @@ async def test_launch_model_with_gpu_idx(setup_pool): assert 0 in llm_info # launch without gpu_idx again, error + await install_strategy_for_worker(worker, "normal_model_model_6") await worker.launch_builtin_model( "normal_model_model_6", "mock_model_name", None, None, None, "LLM", n_gpu=1 ) @@ -436,6 +477,7 @@ async def test_launch_model_with_gpu_idx(setup_pool): assert len(model_infos) == 0 # next, test with embedding models + await install_strategy_for_worker(worker, "embedding_1") await worker.launch_builtin_model( "embedding_1", "mock_model_name", None, None, None, "embedding", n_gpu=1 ) @@ -443,6 +485,7 @@ async def test_launch_model_with_gpu_idx(setup_pool): assert len(embedding_info) == 1 assert 0 in embedding_info + await install_strategy_for_worker(worker, "vllm_mock_model_2") await worker.launch_builtin_model( "vllm_mock_model_2", "mock_model_name", None, None, None, "LLM", gpu_idx=[0] ) @@ -456,27 +499,33 @@ async def test_launch_model_with_gpu_idx(setup_pool): assert list(user_specified_info[0])[0][1] == "LLM" # never choose gpu 0 again + await install_strategy_for_worker(worker, "normal_mock_model_3") devices = await worker.allocate_devices(model_uid="normal_mock_model_3", n_gpu=4) assert all(dev != 0 for dev in devices) # should be on gpu 1 + await install_strategy_for_worker(worker, "embedding_3") await worker.launch_builtin_model( "embedding_3", "mock_model_name", None, None, None, "embedding", n_gpu=1 ) # should be on gpu 0 + await install_strategy_for_worker(worker, "rerank_4") with pytest.raises(RuntimeError): await worker.launch_builtin_model( "rerank_4", "mock_model_name", None, None, None, "rerank", gpu_idx=[0] ) # should be on gpu 2 + await install_strategy_for_worker(worker, "embedding_5") await worker.launch_builtin_model( "embedding_5", "mock_model_name", None, None, None, "embedding", n_gpu=1 ) # should be on gpu 3 + await install_strategy_for_worker(worker, "rerank_6") await worker.launch_builtin_model( "rerank_6", "mock_model_name", None, None, None, "rerank", n_gpu=1 ) # should be on gpu 1, due to there are the fewest models on it + await install_strategy_for_worker(worker, "rerank_7") await worker.launch_builtin_model( "rerank_7", "mock_model_name", None, None, None, "rerank", n_gpu=1 ) diff --git a/xinference/core/worker.py b/xinference/core/worker.py index ec5051e28a..f58402f2af 100644 --- a/xinference/core/worker.py +++ b/xinference/core/worker.py @@ -62,9 +62,9 @@ from ..utils import get_pip_config_args, get_real_path from .cache_tracker import CacheTrackerActor from .event import Event, EventCollectorActor, EventType -from .launch_strategy import LaunchModelSpec, create_launch_strategy +from .launch_strategy import LaunchModelSpec, LaunchStrategy from .metrics import launch_metrics_export_server, record_metrics -from .resource import GPUStatus, gather_node_info +from .resource import gather_node_info from .status_guard import StatusGuardActor from .utils import ( log_async, @@ -158,7 +158,7 @@ def __init__( # Share launch spread/replica counts across strategy instances self._model_spread_used_gpus: Dict[str, Set[int]] = {} self._active_model_counts: Dict[str, int] = {} - # Cached launch strategies per base model + # Cached launch strategies per base model (installed by supervisor) self._launch_strategies: Dict[str, Any] = {} # Protect concurrent allocations/releases so bookings stay consistent self._allocation_lock = threading.Lock() @@ -563,37 +563,6 @@ 
def _collect_user_specified_devices(self) -> Set[int]: user_specified_allocated_devices.add(dev) return user_specified_allocated_devices - def _gather_initial_gpu_memory_info(self) -> Optional[Dict[int, Dict[str, float]]]: - # Try to seed strategy with current GPU memory snapshot from NVML - initial_gpu_memory_info: Optional[Dict[int, Dict[str, float]]] = None - try: - node_info = gather_node_info() - gpu_info: Dict[int, Dict[str, float]] = {} - for dev in self._total_gpu_devices: - status = node_info.get(f"gpu-{dev}") - if isinstance(status, GPUStatus): - gpu_info[dev] = { - "total": status.mem_total // (1024**2), - "used": status.mem_used // (1024**2), - "available": status.mem_free // (1024**2), - } - initial_gpu_memory_info = gpu_info or None - except Exception: - initial_gpu_memory_info = None - return initial_gpu_memory_info - - def _create_launch_strategy_instance( - self, gpu_memory_info: Optional[Dict[int, Dict[str, float]]] = None - ): - if gpu_memory_info is None: - raise ValueError("gpu_memory_info is required to create launch strategy") - return create_launch_strategy( - strategy_name=self._launch_strategy_name, - total_gpu_devices=self._total_gpu_devices, - allowed_devices=self._launch_allowed_gpus, - gpu_memory_info=gpu_memory_info, - ) - def _get_base_model_uid(self, model_uid: str) -> str: try: base_model_uid, _ = parse_replica_model_uid(model_uid) @@ -601,34 +570,40 @@ def _get_base_model_uid(self, model_uid: str) -> str: except Exception: return model_uid - def _get_or_create_launch_strategy(self, model_uid: str): + def _get_launch_strategy(self, model_uid: str): base_model_uid = self._get_base_model_uid(model_uid) strategy = self._launch_strategies.get(base_model_uid) - if strategy is not None: - return strategy - strategy = self._create_launch_strategy_instance( - gpu_memory_info=self._gather_initial_gpu_memory_info() - ) - self._launch_strategies[base_model_uid] = strategy + if strategy is None: + raise RuntimeError( + f"Launch strategy for base model {base_model_uid} has not been installed" + ) return strategy - def ensure_launch_strategy(self, model_uid: str): - """ - Ensure a launch strategy exists for the given base model. - This is intended to be triggered from supervisor before concurrent launches. 
- """ + @log_async(logger=logger, level=logging.DEBUG) + async def get_launch_strategy_context(self) -> Dict[str, Any]: + """Provide supervisor with static launch strategy settings.""" + return { + "total_gpu_devices": self._total_gpu_devices, + "allowed_devices": self._launch_allowed_gpus, + "launch_strategy_name": self._launch_strategy_name, + } + + @log_async(logger=logger, level=logging.DEBUG) + async def install_launch_strategy( + self, model_uid: str, strategy: LaunchStrategy + ) -> None: + """Install supervisor-prepared launch strategy for a base model.""" base_model_uid = self._get_base_model_uid(model_uid) with self._allocation_lock: if base_model_uid in self._launch_strategies: return - strategy = self._create_launch_strategy_instance( - gpu_memory_info=self._gather_initial_gpu_memory_info() - ) + if strategy is None: + raise ValueError("strategy is required to install launch strategy") self._launch_strategies[base_model_uid] = strategy def allocate_devices(self, model_uid: str, n_gpu: int) -> List[int]: spec = LaunchModelSpec(model_uid=model_uid, n_gpu=n_gpu) - strategy = self._get_or_create_launch_strategy(model_uid) + strategy = self._get_launch_strategy(model_uid) with self._allocation_lock: devices = strategy.allocate( spec=spec, @@ -657,7 +632,7 @@ def allocate_devices_for_model( model_format=model_format, quantization=quantization, ) - strategy = self._get_or_create_launch_strategy(model_uid) + strategy = self._get_launch_strategy(model_uid) with self._allocation_lock: devices = strategy.allocate( spec=spec, diff --git a/xinference/device_utils.py b/xinference/device_utils.py index 6c234bea47..db4180b8d1 100644 --- a/xinference/device_utils.py +++ b/xinference/device_utils.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import logging import os -from typing import Dict, Literal, Optional, Union +from typing import Dict, Literal, Union import torch @@ -194,33 +193,3 @@ def get_nvidia_gpu_info() -> Dict: nvmlShutdown() except: pass - - -def update_gpu_memory_info( - gpu_memory_info: Dict[int, Dict[str, Union[int, float]]], - gpu_idx: int, - logger: Optional[logging.Logger] = None, -) -> None: - """ - Update memory information for a specific GPU using NVML - - Args: - gpu_memory_info: Dictionary to update with memory information - gpu_idx: GPU index to update - logger: Optional logger instance - """ - try: - import pynvml - - handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx) - mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) - - gpu_memory_info[gpu_idx] = { - "total": mem_info.total // (1024**2), - "used": mem_info.used // (1024**2), - "available": mem_info.free // (1024**2), - } - - except: - # Keep existing values if update fails - pass From 4a2a98723a388394012bb8bab53f8314b5ce0459 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Fri, 12 Dec 2025 18:36:49 +0800 Subject: [PATCH 18/18] modefy doc --- .../zh_CN/LC_MESSAGES/user_guide/launch.po | 41 ++++++++++++++----- doc/source/user_guide/launch.rst | 2 +- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po b/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po index cbf6c58381..11d056877f 100644 --- a/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po +++ b/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: Xinference \n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2025-12-11 16:32+0800\n" +"POT-Creation-Date: 2025-12-12 18:34+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language: zh_CN\n" @@ -56,8 +56,8 @@ msgid "" " one model instance, you can set the number of instances equal to the " "number of GPUs. For example:" msgstr "" -"在v1.15.0版本前:当您拥有多张GPU显卡时,每张显卡可承载一个模型实例,此时可将实例数量设置为" -"等于GPU数量。例如:" +"在v1.15.0版本前:当您拥有多张GPU显卡时,每张显卡可承载一个模型实例,此时" +"可将实例数量设置为等于GPU数量。例如:" #: ../../source/user_guide/launch.rst:21 msgid "2 GPUs, 2 instances: Each GPU runs one model instance" @@ -129,15 +129,12 @@ msgstr "GPU分配策略" #: ../../source/user_guide/launch.rst:56 msgid "" -"The current strategy is *idle-first with a first round spread*: the " -"scheduler first tries to place one replica on each available GPU (always " -"picking the emptiest unused GPU). Once every GPU has at least one " -"replica, remaining replicas keep stacking onto the GPU that is currently " -"the emptiest (single-GPU multi-replica is allowed). Use " -"``XINFERENCE_LAUNCH_ALLOWED_GPUS`` to limit which GPUs can be chosen." +"The current policy is *Idle Priority*: The scheduler always attempts to " +"assign replicas to the least utilized GPU. Use the " +"``XINFERENCE_LAUNCH_ALLOWED_GPUS`` parameter to restrict the range of " +"available GPUs." msgstr "" -"当前策略为 *空闲优先且首轮分散* :调度器首先尝试将每个副本分配至可用GPU(始终选择最空闲的未用GPU)。" -"当每块GPU至少承载一个副本后,剩余副本将持续堆叠至当前最空闲的GPU(允许单GPU承载多个副本)。" +"当前策略为 *空闲优先* :调度器始终尝试将副本分配至最空闲的GPU。" "使用 ``XINFERENCE_LAUNCH_ALLOWED_GPUS`` 参数限制可选GPU范围。" #: ../../source/user_guide/launch.rst:59 @@ -191,3 +188,25 @@ msgstr "" "对于这部分,请参考 :ref:`开关虚拟空间和定制依赖 `。" +#~ msgid "" +#~ "The current strategy is *idle-first " +#~ "with a first round spread*: the " +#~ "scheduler first tries to place one " +#~ "replica on each available GPU (always" +#~ " picking the emptiest unused GPU). 
" +#~ "Once every GPU has at least one" +#~ " replica, remaining replicas keep stacking" +#~ " onto the GPU that is currently " +#~ "the emptiest (single-GPU multi-replica" +#~ " is allowed). Use " +#~ "``XINFERENCE_LAUNCH_ALLOWED_GPUS`` to limit which" +#~ " GPUs can be chosen." +#~ msgstr "" +#~ "当前策略为 *空闲优先且首轮分散* :" +#~ "调度器首先尝试将每个副本分配至可用GPU" +#~ "(始终选择最空闲的未用GPU)。当每块" +#~ "GPU至少承载一个副本后,剩余副本将持续" +#~ "堆叠至当前最空闲的GPU(允许单GPU承载" +#~ "多个副本)。使用 ``XINFERENCE_LAUNCH_" +#~ "ALLOWED_GPUS`` 参数限制可选GPU范围。" + diff --git a/doc/source/user_guide/launch.rst b/doc/source/user_guide/launch.rst index dd7b2ded6f..de2738750a 100644 --- a/doc/source/user_guide/launch.rst +++ b/doc/source/user_guide/launch.rst @@ -53,7 +53,7 @@ Smart Allocation: Number of replicas may differ from GPU count; system intellige GPU Allocation Strategy ======================= -The current strategy is *idle-first with a first round spread*: the scheduler first tries to place one replica on each available GPU (always picking the emptiest unused GPU). Once every GPU has at least one replica, remaining replicas keep stacking onto the GPU that is currently the emptiest (single-GPU multi-replica is allowed). Use ``XINFERENCE_LAUNCH_ALLOWED_GPUS`` to limit which GPUs can be chosen. +The current policy is *Idle Priority*: The scheduler always attempts to assign replicas to the least utilized GPU. Use the ``XINFERENCE_LAUNCH_ALLOWED_GPUS`` parameter to restrict the range of available GPUs. Set Environment Variables =========================