
Commit cb747b3

Narsil and mht-sharma authored
Add deepseekv3 (#2968)
* Add fp8 support for MoE models; add DeepSeek-V3; format code; update Dockerfile; update docs
* Small modifications.
* Moe kernels 0.8.1
* Upgrade to 0.8.1
* Fixing moe import.
* Black.
* Apply suggestions from code review

Co-authored-by: Mohit Sharma <[email protected]>

* Fixing Mixtral + Nits.
* Put link to ref.
* Fix other call locations.
* Scoring func `softmax` is the only one that works.

---------

Co-authored-by: Mohit Sharma <[email protected]>
1 parent 80e7d98 commit cb747b3

16 files changed (+864, -26 lines)


Dockerfile_amd

Lines changed: 1 addition & 1 deletion
@@ -279,7 +279,7 @@ RUN git clone https://github.com/danieldk/marlin-kernels.git && \
 
 FROM kernel-builder AS moe-kernels
 WORKDIR /usr/src
-ENV MOE_KERNELS_BRANCH=a67b35841774b2056a73806c36661134b5054edd
+ENV MOE_KERNELS_BRANCH=d7e042bf9f7aff10c631212fc71b24895d66eb59
 ENV VLLM_TARGET_DEVICE=rocm
 RUN git clone https://github.com/danieldk/moe-kernels.git && \
     cd moe-kernels && \

docs/source/supported_models.md

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@
 Text Generation Inference enables serving optimized models. The following sections list which models (VLMs & LLMs) are supported.
 
 - [Deepseek V2](https://huggingface.co/deepseek-ai/DeepSeek-V2)
+- [Deepseek V3](https://huggingface.co/deepseek-ai/DeepSeek-V3)
 - [Idefics 2](https://huggingface.co/HuggingFaceM4/idefics2-8b) (Multimodal)
 - [Idefics 3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3) (Multimodal)
 - [Llava Next (1.6)](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (Multimodal)

flake.lock

Lines changed: 4 additions & 4 deletions
Generated file; diff not rendered.

flake.nix

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
       inputs.nixpkgs.follows = "tgi-nix/nixpkgs";
     };
     nix-filter.url = "github:numtide/nix-filter";
-    tgi-nix.url = "github:huggingface/text-generation-inference-nix/moe-kernels-0.8.0";
+    tgi-nix.url = "github:huggingface/text-generation-inference-nix/moe_0_8_1";
     nixpkgs.follows = "tgi-nix/nixpkgs";
     flake-utils.url = "github:numtide/flake-utils";
     rust-overlay = {

launcher/src/main.rs

Lines changed: 7 additions & 2 deletions
@@ -1635,6 +1635,7 @@ enum Gpu {
     A40,
     H100,
     A100,
+    H200,
     Unknown(String),
 }
 
@@ -1661,6 +1662,7 @@ impl From<&str> for Gpu {
             "nvidia-a100-sxm4-40gb" => Gpu::A100,
             "nvidia-a100-80gb-pcie" => Gpu::A100,
             "nvidia-a100" => Gpu::A100,
+            "nvidia-h200" => Gpu::H200,
             card => Gpu::Unknown(card.to_string()),
         }
     }
@@ -1678,6 +1680,7 @@ impl std::fmt::Display for Gpu {
             Gpu::A40 => write!(f, "nvidia-a40"),
             Gpu::H100 => write!(f, "nvidia-h100-80fb-hbm3"),
             Gpu::A100 => write!(f, "nvida-a100-sxm4-80gb"),
+            Gpu::H200 => write!(f, "nvida-h200"),
             Gpu::Unknown(card) => write!(f, "{}", card),
         }
     }
@@ -1702,11 +1705,13 @@ impl ComputeType {
             // https://www.nvidia.com/en-us/data-center/a40/
             // https://images.nvidia.com/content/Solutions/data-center/a40/nvidia-a40-datasheet.pdf
             Gpu::A40 => Some(149 * 10u64.pow(12)),
+            // https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf
+            Gpu::A100 => Some(312 * 10u64.pow(12)),
             // https://www.nvidia.com/en-us/data-center/h100/
             // https://www.techpowerup.com/gpu-specs/docs/nvidia-gh100-architecture.pdf
             Gpu::H100 => Some(900 * 10u64.pow(12)),
-            // https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf
-            Gpu::A100 => Some(312 * 10u64.pow(12)),
+            // https://www.nvidia.com/en-us/data-center/h200/
+            Gpu::H200 => Some(989 * 10u64.pow(12)),
             Gpu::Unknown(card) => {
                 tracing::warn!("Unkown compute for card {card}");
                 None

router/src/config.rs

Lines changed: 2 additions & 0 deletions
@@ -224,6 +224,8 @@ pub enum Config {
     Qwen2,
     Opt,
     T5,
+    DeepseekV2,
+    DeepseekV3,
 }
 
 #[derive(Clone, Debug, Serialize, Deserialize)]

server/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -75,7 +75,7 @@ marlin-kernels = [
   { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" },
   { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" },
 ]
-moe-kernels.url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.0/moe_kernels-0.8.0+cu123torch2.5-cp39-abi3-linux_x86_64.whl"
+moe-kernels.url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.1/moe_kernels-0.8.1+cu123torch2.5-cp39-abi3-linux_x86_64.whl"
 
 [tool.pytest.ini_options]
 markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"]

server/text_generation_server/layers/fp8.py

Lines changed: 74 additions & 2 deletions
@@ -19,6 +19,12 @@
 except ImportError:
     marlin_kernels = None
 
+try:
+    from moe_kernels.fp8_utils import w8a8_block_fp8_matmul, per_token_group_quant_fp8
+except ImportError:
+    w8a8_block_fp8_matmul = None
+    per_token_group_quant_fp8 = None
+
 quant_dtype: torch.dtype = (
     torch.float8_e4m3fnuz if SYSTEM == "rocm" else torch.float8_e4m3fn
 )
@@ -38,7 +44,6 @@ def get_fp8_linear(force_w8a16: bool = False) -> Type[torch.nn.Module]:
     """
 
     if SYSTEM == "cuda":
-
         major, _ = torch.cuda.get_device_capability()
         # Marlin is W8A16, use it when:
         #
@@ -180,14 +185,29 @@ def fp8_quantize(
 class HybridFP8UnquantLoader(WeightsLoader):
     """Weight loader that loads FP8 and unquantized Torch tensors."""
 
-    def __init__(self, activation_scale_ub: Optional[float], to_fp8: bool):
+    def __init__(
+        self,
+        activation_scale_ub: Optional[float],
+        to_fp8: bool,
+        weight_block_size: Optional[List[int]] = None,
+    ):
         self.activation_scale_ub = activation_scale_ub
         self.to_fp8 = to_fp8
+        self.weight_block_size = weight_block_size
 
     def get_weights(self, weights: "Weights", prefix: str):
         w = weights.get_tensor(f"{prefix}.weight")
 
         if w.dtype == torch.float8_e4m3fn:
+            if self.weight_block_size is not None:
+                scale = weights.get_tensor(f"{prefix}.weight_scale_inv")
+                return Fp8Weight(
+                    weight=w,
+                    weight_scale=scale,
+                    activation_scale_ub=self.activation_scale_ub,
+                    dtype=weights.dtype,
+                    weight_block_size=self.weight_block_size,
+                )
             # FP8 branch
             scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
 
@@ -276,6 +296,21 @@ def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: in
 
         # FP8 branch
         if w.dtype == torch.float8_e4m3fn:
+            if self.weight_block_size is not None:
+                scale = [
+                    weights.get_sharded(f"{p}.weight_scale_inv", dim=0, to_device=False)
+                    for p in prefixes
+                ]
+                scale = torch.cat(scale, dim=dim)
+                scale = scale.to(weights.device)
+                return Fp8Weight(
+                    weight=w,
+                    weight_scale=scale,
+                    activation_scale_ub=self.activation_scale_ub,
+                    dtype=weights.dtype,
+                    weight_block_size=self.weight_block_size,
+                )
+
             scale = [
                 _load_scalar_or_matrix_scale(weights, f"{p}.weight_scale", shape)
                 for p, shape in zip(prefixes, shapes)
@@ -321,6 +356,18 @@ def get_weights_row(self, weights: "Weights", prefix: str):
         w = weights.get_sharded(f"{prefix}.weight", dim=1)
         # FP8 branch
         if w.dtype == torch.float8_e4m3fn:
+            if self.weight_block_size is not None:
+                # XXX: Yes the weights is named scale_inv, but corresponds to scale it seems.
+                scale = weights.get_sharded(f"{prefix}.weight_scale_inv", dim=1)
+
+                return Fp8Weight(
+                    weight=w,
+                    weight_scale=scale,
+                    activation_scale_ub=self.activation_scale_ub,
+                    dtype=weights.dtype,
+                    weight_block_size=self.weight_block_size,
+                )
+
             scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
 
             if SYSTEM == "cuda":
@@ -355,6 +402,7 @@ class Fp8Weight(Weight):
     input_scale: Optional[torch.Tensor] = None
     activation_scale_ub: Optional[float] = None
    force_w8a16: bool = False
+    weight_block_size: Optional[List[int]] = None
 
     def get_linear(self, bias: torch.Tensor):
         if self.weight_scale is None:
@@ -371,6 +419,7 @@ def get_linear(self, bias: torch.Tensor):
             bias=bias,
             input_scale=self.input_scale,
             scale_upper_bound=self.activation_scale_ub,
+            weight_block_size=self.weight_block_size,
         )
 
 
@@ -385,6 +434,7 @@ def __init__(
         bias: Optional[torch.Tensor] = None,
         input_scale: Optional[torch.Tensor] = None,
         scale_upper_bound: Optional[float] = None,
+        weight_block_size: Optional[List[int]] = None,
     ) -> None:
         super().__init__()
         if CUTLASS_FP8_AVAILABLE:
@@ -398,6 +448,7 @@ def __init__(
         self.qweight = qweight
         self.scale = scale.float()
         self.input_scale = input_scale.float() if input_scale is not None else None
+        self.weight_block_size = weight_block_size
 
         if CUTLASS_FP8_AVAILABLE and scale_upper_bound is not None:
             self.scale_upper_bound = torch.tensor(
@@ -431,6 +482,7 @@ def from_fp8(
     ) -> "Fp8Linear":
         input_scale = kwargs.get("input_scale", None)
         scale_upper_bound = kwargs.get("scale_upper_bound", None)
+        weight_block_size = kwargs.get("weight_block_size", None)
 
         return cls(
             qweight=weight,
@@ -439,6 +491,7 @@ def from_fp8(
             scale_upper_bound=scale_upper_bound,
             bias=bias,
             dtype=dtype,
+            weight_block_size=weight_block_size,
         )
 
     @classmethod
@@ -450,6 +503,25 @@ def get_shared_device_identity(cls, device):
         return cls._device_identity_cache[device]
 
     def forward(self, input: torch.Tensor) -> torch.Tensor:
+        if self.weight_block_size is not None:
+            # https://arxiv.org/pdf/2412.19437
+            # At a more granular level. As illustrated in Figure 7 (a), (1) for activations, we group and
+            # scale elements on a 1x128 tile basis (i.e., per token per 128 channels); and (2) for weights, we
+            # group and scale elements on a 128x128 block basis (i.e., per 128 input channels per 128 output
+            # channels).
+            qinput, scale = per_token_group_quant_fp8(input, self.weight_block_size[1])
+            output = w8a8_block_fp8_matmul(
+                qinput,
+                self.qweight,
+                scale,
+                self.scale,
+                self.weight_block_size,
+                output_dtype=input.dtype,
+            )
+
+            if self.bias is not None:
+                output = output + self.bias
+            return output.to(dtype=input.dtype)
         if CUTLASS_FP8_AVAILABLE:
             # cutlass FP8 supports per-token scales, so get non-scalar scales.
             qinput, scale = fp8_quantize(
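The block-quantized forward path above follows the DeepSeek-V3 recipe quoted in the comment: activations are quantized per token in groups of 128 channels, while each weight carries one scale per 128x128 block. As a rough illustration of what the fused per_token_group_quant_fp8 + w8a8_block_fp8_matmul call computes, here is a dequantize-then-matmul reference sketch in plain PyTorch; the helper names and the divisible-by-128 shapes are assumptions made for the sketch, not the moe-kernels API.

# Reference sketch only: approximates the block-wise FP8 matmul by dequantizing,
# whereas the real moe-kernels kernel fuses quantization, scaling, and the GEMM.
import torch

def per_token_group_quant_ref(x: torch.Tensor, group: int = 128):
    # One scale per (token, 128-channel group), mirroring the 1x128 activation tiles.
    tokens, in_features = x.shape
    xg = x.reshape(tokens, in_features // group, group)
    amax = xg.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12)
    scale = amax.float() / torch.finfo(torch.float8_e4m3fn).max
    qx = (xg / scale).to(torch.float8_e4m3fn).reshape(tokens, in_features)
    return qx, scale.squeeze(-1)  # scale: [tokens, in_features // group]

def w8a8_block_fp8_matmul_ref(qx, x_scale, qw, w_scale, block_size, out_dtype):
    # qw: [out_features, in_features] in FP8; w_scale: [out // bn, in // bk],
    # i.e. one scale per 128x128 weight block when block_size == [128, 128].
    bn, bk = block_size
    w = qw.float() * w_scale.repeat_interleave(bn, 0).repeat_interleave(bk, 1)
    x = qx.float() * x_scale.repeat_interleave(bk, 1)
    return (x @ w.T).to(out_dtype)

With weight_block_size = [128, 128], the activation scales broadcast over 128-channel groups and the weight scales over 128x128 blocks, which is exactly the granularity the loader reads from the checkpoint's weight_scale_inv tensors.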

server/text_generation_server/layers/moe/__init__.py

Lines changed: 22 additions & 10 deletions
@@ -52,6 +52,8 @@ def __init__(
         up_proj_name: str = "up_proj",
         down_proj_name: str = "down_proj",
         hidden_act: str = "silu",
+        scoring_func: Optional[str] = None,
+        e_score_correction_bias: Optional[float] = None,
     ): ...
 
     def forward(
@@ -81,9 +83,14 @@ def __init__(
         up_proj_name: str = "up_proj",
         down_proj_name: str = "down_proj",
         hidden_act: str = "silu",
+        scoring_func: Optional[str] = None,
+        e_score_correction_bias: Optional[float] = None,
     ):
         super().__init__()
 
+        assert scoring_func is None, "scoring func is not handled"
+        assert e_score_correction_bias is None, "scoring correction bias is not handled"
+
         log_once(
             logger.info,
             "No fused layers are available for this model type, using (slower) dense MoE layer",
@@ -199,21 +206,24 @@ def __init__(
         topk: int,
         topk_group: Optional[int],
         weights: Weights,
+        scoring_func: Optional[str] = "softmax",
+        e_score_correction_bias: Optional[float] = None,
         gate_proj_name: str = "gate_proj",
         up_proj_name: str = "up_proj",
         down_proj_name: str = "down_proj",
     ):
         super().__init__()
-        if isinstance(weights.loader, DefaultWeightsLoader) and isinstance(
-            weights.loader.weight_class, UnquantizedWeight
-        ):
-            cls = UnquantizedSparseMoELayer
-        elif isinstance(weights.loader, HybridFP8UnquantLoader):
-            cls = (
-                FP8SparseMoELayer
-                if weights.loader.to_fp8
-                else UnquantizedSparseMoELayer
-            )
+        if (
+            isinstance(weights.loader, DefaultWeightsLoader)
+            and isinstance(weights.loader.weight_class, UnquantizedWeight)
+        ) or isinstance(weights.loader, HybridFP8UnquantLoader):
+            if (
+                isinstance(weights.loader, HybridFP8UnquantLoader)
+                and weights.loader.to_fp8
+            ):
+                cls = FP8SparseMoELayer
+            else:
+                cls = UnquantizedSparseMoELayer
         elif isinstance(
             weights.loader, GPTQMarlinWeightsLoader
         ) and can_use_marlin_moe_gemm(
@@ -240,6 +250,8 @@ def __init__(
             topk=topk,
             topk_group=topk_group,
             weights=weights,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias,
             gate_proj_name=gate_proj_name,
             up_proj_name=up_proj_name,
             down_proj_name=down_proj_name,
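The new scoring_func and e_score_correction_bias arguments are passed through to the sparse MoE layers because DeepSeek-V3's router scores experts with a non-softmax function and adds a per-expert correction bias that influences which experts are chosen but not how their outputs are weighted; as the commit message notes, only `softmax` scoring is actually working here, and the dense fallback layer asserts that neither option is set. The following standalone sketch only illustrates the routing idea behind these two parameters, under assumed shapes; the served path delegates to moe-kernels' fused kernels and differs in detail.

# Illustration of scoring_func + e_score_correction_bias in top-k expert routing.
import torch

def route(gating_logits: torch.Tensor, topk: int, scoring_func: str = "softmax",
          e_score_correction_bias: torch.Tensor | None = None):
    # gating_logits: [tokens, n_experts]
    if scoring_func == "softmax":
        scores = gating_logits.softmax(dim=-1)
    elif scoring_func == "sigmoid":
        scores = gating_logits.sigmoid()
    else:
        raise ValueError(f"unsupported scoring_func: {scoring_func}")

    # The correction bias shifts expert *selection* only; the weights applied to
    # the selected experts' outputs still come from the uncorrected scores.
    selection = scores if e_score_correction_bias is None else scores + e_score_correction_bias
    _, expert_ids = selection.topk(topk, dim=-1)
    expert_weights = scores.gather(-1, expert_ids)
    return expert_ids, expert_weights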

server/text_generation_server/layers/moe/fp8.py

Lines changed: 8 additions & 1 deletion
@@ -28,6 +28,8 @@ def __init__(
         topk: int,
         topk_group: Optional[int],
         weights: Weights,
+        scoring_func: Optional[str] = "softmax",
+        e_score_correction_bias: Optional[float] = None,
         gate_proj_name: str = "gate_proj",
         up_proj_name: str = "up_proj",
         down_proj_name: str = "down_proj",
@@ -42,6 +44,9 @@ def __init__(
         self.topk = topk
         self.topk_group = topk_group
         self.renormalize = renormalize
+        self.weight_block_size = weights.weights_loader.weight_block_size
+        self.scoring_func = scoring_func
+        self.e_score_correction_bias = e_score_correction_bias
 
         (
             self.gate_up_proj,
@@ -76,6 +81,8 @@ def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tens
             use_grouped_topk=self.n_expert_group is not None,
             num_expert_group=self.n_expert_group,
             topk_group=self.topk_group,
+            scoring_func=self.scoring_func,
+            e_score_correction_bias=self.e_score_correction_bias,
             use_fp8_w8a8=True,
             w1_scale=self.gate_up_proj_weight_scale,
             w2_scale=self.down_proj_weight_scale,
@@ -109,7 +116,7 @@ def _load_expert_weights(
         )
         if all_weight_scales is None:
             all_weight_scales = torch.empty(
-                (n_experts,),
+                (n_experts,) + weight.weight_scale.shape,
                 dtype=torch.float32,
                 device=weight.weight.device,
             )
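The change from `(n_experts,)` to `(n_experts,) + weight.weight_scale.shape` is what lets the per-expert scale buffer hold block-wise scales: with per-tensor FP8 each expert contributed a single scalar, while block-wise FP8 contributes a whole grid of scales per expert weight. A small shape check, using toy dimensions and assuming 128x128 blocks:

# Toy shapes only, assuming 128x128 weight blocks as used for DeepSeek-V3.
import torch

n_experts, out_features, in_features = 8, 512, 256
one_expert_scales = torch.rand(out_features // 128, in_features // 128)  # shape [4, 2]

all_weight_scales = torch.empty(
    (n_experts,) + one_expert_scales.shape, dtype=torch.float32
)
all_weight_scales[0] = one_expert_scales  # stacked per expert, as in _load_expert_weights
print(all_weight_scales.shape)  # torch.Size([8, 4, 2]); previously just (n_experts,)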
