- 
- from vllm.distributed.group_coordinator import GroupCoordinator
- from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase
- from torch.distributed import Backend, ProcessGroup
+ from torch.distributed import Backend
  import torch
- from typing import Any, Dict, List, Optional, Tuple, Union
+ from typing import List, Optional, Union
  import threading
  from concurrent.futures import ThreadPoolExecutor
  import time
- import threading
- from collections import namedtuple
- from typing import Dict, Any, Tuple, List
- import pickle
  
  from vllm.logger import init_logger
  
- 
  logger = init_logger(__name__)
  
  
@@ -52,34 +44,32 @@ def __init__(self, message):
          self.message = message
          super().__init__(self.message)
  
- class TorchDistributedPipe(KVPipeBase):
- 
+ 
+ class TorchDistributedPipe:
+     METADATA_LENGTH = 16
+     MAX_TENSOR_DIMENSIONS = 14
+     METADATA_DTYPE = torch.int64
+ 
      def __init__(
          self,
          group_ranks: List[List[int]],
          local_rank: int,
-         torch_distributed_backend: Union[str, Backend]
+         torch_distributed_backend: Union[str, Backend],
      ):
- 
          self.rank = torch.distributed.get_rank()
          self.local_rank = local_rank
          self.device_group = None
-         self.cpu_group = None
  
          for ranks in group_ranks:
              device_group = torch.distributed.new_group(
-                 ranks, backend=torch_distributed_backend)
-             # a group with `gloo` backend, to allow direct coordination between
-             # processes through the CPU.
-             cpu_group = torch.distributed.new_group(ranks, backend="gloo")
+                 ranks, backend=torch_distributed_backend
+             )
              if self.rank in ranks:
                  self.ranks = ranks
                  self.world_size = len(ranks)
                  self.rank_in_group = ranks.index(self.rank)
                  self.device_group = device_group
-                 self.cpu_group = cpu_group
  
-         assert self.cpu_group is not None
          assert self.device_group is not None
          assert self.rank_in_group <= 1
  
@@ -88,120 +78,215 @@ def __init__(
          else:
              self.device = torch.device("cpu")
  
-         # if turned on, will use CPU-based communication to perform a series of sanity check.
-         # but it adds ~5ms delay, so please turn it off in performance-demanding usecases (e.g. disaggregated prefill)
-         self.target_rank_for_send = self.ranks[(self.rank_in_group + 1) %
-                                                self.world_size]
-         self.target_rank_for_recv = self.ranks[(self.rank_in_group - 1) %
-                                                self.world_size]
+         self.target_rank_for_send = self.ranks[
+             (self.rank_in_group + 1) % self.world_size
+         ]
+         self.target_rank_for_recv = self.ranks[
+             (self.rank_in_group - 1) % self.world_size
+         ]
+ 
+         # FIXME: why do we need this?
          torch.set_default_device(self.device)
  
-         self.kv_sending_thread = None
+         self.transport_thread = None
          self.buffer_size = 0
          self.buffer_size_lock = threading.Lock()
  
-         self.none_tensor = torch.tensor([NONE_INT]).to(self.device)
-         self.broken = False
+         self.none_tensor = torch.tensor([NONE_INT], device=self.device)
+ 
+         # On-device tensor to be reused for recv
+         self.rcv_metadata_buffer = torch.zeros(
+             self.METADATA_LENGTH, dtype=self.METADATA_DTYPE, device=self.device
+         )
+ 
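The peer-rank arithmetic above always resolves to the single other rank for the pairwise (two-rank) groups this pipe is built around (note the `assert self.rank_in_group <= 1`). A standalone sketch; the rank values are illustrative only:

```python
# For a two-rank group, e.g. global ranks [3, 7], seen from rank 3:
ranks, rank_in_group = [3, 7], 0
world_size = len(ranks)

send_to = ranks[(rank_in_group + 1) % world_size]    # -> 7
recv_from = ranks[(rank_in_group - 1) % world_size]  # -> 7; Python's % is non-negative

assert send_to == recv_from == 7  # both point at the peer
```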
+     def _make_metadata(self, tensor: torch.Tensor) -> torch.Tensor:
+         """
+         Create the metadata based on the input tensor, and move it to GPU.
+         The metadata's length is `TorchDistributedPipe.METADATA_LENGTH`.
  
- 
-     def quick_send(self, tensor):
+         Currently, the metadata is an int64 tensor and it includes the dtype,
+         number of dimensions, and the shape information of the input tensor.
  
-         group = self.device_group
  
-         # NCCL is NOT fully duplex
-         # so CPU communication is ALWAYS necessary
-         torch.distributed.send_object_list(
-             [tensor.dtype, tensor.shape, str(tensor.device)],
-             dst=self.target_rank_for_send,
-             group=self.cpu_group
+         The information follows the layout below:
+         - metadata[0] -- dtype
+         - metadata[1] -- number of dimensions
+         - metadata[2 : 2+ndims] -- the shape of the input tensor
+ 
+         Parameters:
+         - tensor: the input tensor
+ 
+         Returns:
+         - metadata: the metadata tensor, on self.device
+         """
+         buffer = torch.empty(self.METADATA_LENGTH, dtype=self.METADATA_DTYPE)
+         buffer[0] = DTYPE2INT[tensor.dtype]
+         ndims = len(tensor.shape)
+         buffer[1] = ndims
+         buffer[2 : 2 + ndims] = torch.tensor(
+             tensor.shape, dtype=self.METADATA_DTYPE
          )
+         return buffer.to(self.device)
+ 
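A CPU-only sketch of the metadata round trip performed by `_make_metadata` and `_prepare_recv_buffer` below. The `DTYPE2INT`/`INT2DTYPE` mappings are referenced but not defined in this hunk, so single-entry stand-ins are used here:

```python
import torch

METADATA_LENGTH, METADATA_DTYPE = 16, torch.int64
DTYPE2INT = {torch.float16: 0}  # stand-in; the module defines the real mapping
INT2DTYPE = {0: torch.float16}

t = torch.randn(2, 3, 4, dtype=torch.float16)

# Encode: [dtype id, ndims, shape..., padding] as a fixed-length int64 vector.
meta = torch.zeros(METADATA_LENGTH, dtype=METADATA_DTYPE)
meta[0] = DTYPE2INT[t.dtype]
meta[1] = t.dim()
meta[2:2 + t.dim()] = torch.tensor(t.shape, dtype=METADATA_DTYPE)

# Decode: rebuild an empty receive buffer with the same dtype and shape.
h = meta.numpy()
buffer = torch.empty(tuple(h[2:2 + h[1]]), dtype=INT2DTYPE[h[0]])

assert buffer.shape == t.shape and buffer.dtype == t.dtype
```

A fixed-length metadata tensor is what lets the receiver post one reusable `recv` buffer instead of negotiating message sizes through a separate CPU group, as the removed `send_object_list`-based code did.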
+     def _prepare_recv_buffer(
+         self, d_metadata_buffer: torch.Tensor
+     ) -> torch.Tensor:
+         """
+         Create a buffer to receive the tensor based on the metadata.
+ 
+         Parameters:
+         - d_metadata_buffer: the metadata tensor on self.device
+ 
+         Returns:
+         - buffer: the buffer tensor to receive the tensor, on self.device
+         """
+         h_buffer = d_metadata_buffer.cpu().numpy()
+         dtype = INT2DTYPE[h_buffer[0]]
+         ndims = h_buffer[1]
+         shape = tuple(h_buffer[2 : 2 + ndims])
+         return torch.empty(shape, dtype=dtype, device=self.device)
  
+     def _send_metadata(self, d_metadata_buffer: torch.Tensor):
+         """
+         Send the metadata buffer to the target rank.
+         """
          torch.distributed.send(
-             tensor,
+             d_metadata_buffer,
              dst=self.target_rank_for_send,
-             group=self.device_group
+             group=self.device_group,
          )
  
+     def _recv_metadata(self) -> torch.Tensor:
+         """
+         Receive the metadata buffer from the target rank.
  
-     def quick_recv(self):
+         Returns:
+         - metadata_buffer: the metadata buffer tensor, on self.device
  
-         # NCCL is NOT fully duplex
-         # so CPU communication is necessary
-         metadata = [None, None, None]
-         torch.distributed.recv_object_list(
-             metadata,
+         Note:
+             The current implementation assumes that there are no race
+             conditions during sending/receiving, so the metadata buffer
+             can be reused.
+         """
+         torch.distributed.recv(
+             self.rcv_metadata_buffer,
              src=self.target_rank_for_recv,
-             group=self.cpu_group
+             group=self.device_group,
          )
- 
-         dtype, shape, device = metadata
-         if 'cuda' in device:
-             device = self.device
-         else:
-             device = 'cpu'
-         buffer = torch.zeros(shape, dtype=dtype).to(device)
- 
+         return self.rcv_metadata_buffer
+ 
+     def _send_impl(self, tensor):
+         """
+         The actual implementation of sending the tensor to the target rank.
+         This function will first send the metadata, and then send the tensor.
+ 
+         Parameters:
+         - tensor: the input tensor to be sent
+         """
+ 
+         metadata = self._make_metadata(tensor)
+         self._send_metadata(metadata)
+ 
+         torch.distributed.send(
+             tensor, dst=self.target_rank_for_send, group=self.device_group
+         )
+ 
+     def _recv_impl(self) -> torch.Tensor:
+         """
+         The actual implementation of receiving the tensor from the target rank.
+         This function will first receive the metadata, then receive the tensor.
+ 
+         This function will block if there is no tensor to receive.
+ 
+         Returns:
+         - buffer: the received tensor, on self.device
+         """
+         d_metadata = self._recv_metadata()
+         buffer = self._prepare_recv_buffer(d_metadata)
+ 
          torch.distributed.recv(
-             buffer,
-             src=self.target_rank_for_recv,
-             group=self.device_group
+             buffer, src=self.target_rank_for_recv, group=self.device_group
          )
-         return buffer
- 
  
- 
-     def send_tensor_wrapper(self, tensor) -> None:
+         return buffer
  
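Both `send_tensor` and `recv_tensor` below funnel their work through a single-worker `ThreadPoolExecutor`, so transfers execute strictly in submission order even though `send_tensor` returns immediately. A standalone sketch of that ordering guarantee:

```python
from concurrent.futures import ThreadPoolExecutor

order = []
pool = ThreadPoolExecutor(max_workers=1)  # one worker thread => FIFO execution
for i in range(4):
    pool.submit(order.append, i)          # returns immediately, like send_tensor
pool.shutdown(wait=True)                  # like close(); drains the queue

assert order == [0, 1, 2, 3]
```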
+     def send_tensor_wrapper(self, tensor):
+         """Wrapper around _send_impl that also updates the buffer size."""
          try:
              tensor_size = tensor.element_size() * tensor.numel()
-             self.quick_send(tensor)
- 
+             self._send_impl(tensor)
+ 
              with self.buffer_size_lock:
                  self.buffer_size = self.buffer_size - tensor_size
          except Exception as e:
              logger.error("Encountering exception in KV sending thread")
              logger.error("%s", e)
- 
+ 
      def block_if_full(self):
- 
+         """
+         Block the current thread if the buffer size is larger than 1e9.
+         """
+         # TODO: replace this 1e9 with a configurable parameter or a constant
      while self.buffer_size > 1e9:
              logger.debug("KV cache transfer pipe is full. Waiting...")
              time.sleep(0.05)
  
-     def send_tensor(self,
-                     tensor: Optional[torch.Tensor]) -> None:
+     def send_tensor(self, tensor: Optional[torch.Tensor]) -> None:
          """
          Sends a tensor to the destination rank in a non-blocking way.
          Flow: send metadata (dtype, ndims, shape) -- send tensor data
          """
- 
-         if self.kv_sending_thread is None:
-             self.kv_sending_thread = ThreadPoolExecutor(max_workers=1)
+ 
+         if self.transport_thread is None:
+             self.transport_thread = ThreadPoolExecutor(max_workers=1)
  
          if tensor is None:
              tensor = self.none_tensor
              tensor_size = 0
          else:
              tensor_size = tensor.element_size() * tensor.numel()
  
+         assert (
+             0 < len(tensor.shape) < self.MAX_TENSOR_DIMENSIONS
+         ), f"Only support dimensions within 1-{self.MAX_TENSOR_DIMENSIONS}"
+ 
          self.block_if_full()
  
          with self.buffer_size_lock:
              self.buffer_size = self.buffer_size + tensor_size
- 
+ 
          # prepare the metadata before sending the tensor.
-         self.kv_sending_thread.submit(
-             self.send_tensor_wrapper,
-             tensor)
- 
+         self.transport_thread.submit(
+             self.send_tensor_wrapper,
+             tensor,
+         )
+ 
      def recv_tensor(self) -> Optional[torch.Tensor]:
          """Receives a tensor from the src rank. Blocking."""
- 
-         tensor = self.quick_recv()
+ 
+         if self.transport_thread is None:
+             self.transport_thread = ThreadPoolExecutor(max_workers=1)
+ 
+         future = self.transport_thread.submit(self._recv_impl)
+ 
+         try:
+             tensor = future.result()
+         except Exception as e:
+             logger.error("Encountering exception in KV receiving thread")
+             logger.error("%s", e)
+             raise e
+ 
          if tensor.numel() == 1 and tensor.item() == NONE_INT:
              return None
          else:
              return tensor
- 
  
- 
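`send_tensor(None)` travels over the wire as the one-element `none_tensor`, and the check at the end of `recv_tensor` maps it back to `None`. A sketch of that decoding; the module defines its own reserved `NONE_INT`, so `-1` here is only a placeholder:

```python
import torch

NONE_INT = -1  # placeholder; the real module reserves its own sentinel value

def decode(t: torch.Tensor):
    # Mirrors the numel()/item() check in recv_tensor above.
    if t.numel() == 1 and t.item() == NONE_INT:
        return None
    return t

assert decode(torch.tensor([NONE_INT])) is None
assert decode(torch.ones(2, 2)).shape == (2, 2)
# Caveat: a genuine 1-element tensor equal to NONE_INT would also decode to None.
```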
+     def close(self):
+         """
+         Close the pipe and release the resources.
+         """
+         if (
+             hasattr(self, "transport_thread")
+             and self.transport_thread is not None
+         ):
+             self.transport_thread.shutdown()
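A hypothetical end-to-end use of the pipe, e.g. launched with `torchrun --nproc_per_node=2`; the backend and group layout are illustrative, not prescribed by this diff:

```python
import torch
import torch.distributed as dist

dist.init_process_group(backend="gloo")  # "nccl" for GPU-to-GPU transfers

pipe = TorchDistributedPipe(
    group_ranks=[[0, 1]],                # one pairwise send/recv group
    local_rank=0,
    torch_distributed_backend="gloo",
)

if dist.get_rank() == 0:
    pipe.send_tensor(torch.arange(8, dtype=torch.float32))  # non-blocking
    pipe.send_tensor(None)                                  # sentinel
else:
    print(pipe.recv_tensor())  # tensor([0., 1., ..., 7.])
    print(pipe.recv_tensor())  # None

pipe.close()
```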