Merged
Commits (22):
- 52a69b7: bug fixed, test added (vthumbe1503, Nov 26, 2025)
- 5547e4c: fix contigous (vthumbe1503, Nov 26, 2025)
- 681ad87: Merge branch 'NVIDIA:main' into fix_split_tensor_bug (vthumbe1503, Nov 26, 2025)
- 0b8ccbb: revert unecessary change (vthumbe1503, Nov 26, 2025)
- 8adcbb3: revert another change (vthumbe1503, Nov 26, 2025)
- 18e85ce: [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Nov 26, 2025)
- 0ce6c7e: address review comments (vthumbe1503, Nov 26, 2025)
- d582d2b: Merge branch 'main' into fix_split_tensor_bug (vthumbe1503, Nov 26, 2025)
- bbc669a: Update transformer_engine/pytorch/tensor/mxfp8_tensor.py (vthumbe1503, Nov 26, 2025)
- 01b1cc2: Merge branch 'main' into fix_split_tensor_bug (vthumbe1503, Nov 30, 2025)
- fcb4c9f: address review comments (vthumbe1503, Dec 4, 2025)
- 6a61b08: missed adding renamed file (vthumbe1503, Dec 4, 2025)
- 2fd860b: [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Dec 4, 2025)
- 7fc1c6f: [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Dec 4, 2025)
- 107b461: Merge branch 'main' into fix_split_tensor_bug (vthumbe1503, Dec 4, 2025)
- afbed3d: Merge branch 'main' into fix_split_tensor_bug (vthumbe1503, Dec 7, 2025)
- f84e3f6: fix minor issue (vthumbe1503, Dec 7, 2025)
- 50b7c74: fix ci issue (vthumbe1503, Dec 8, 2025)
- 492156c: [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Dec 8, 2025)
- 5ccda23: fix the test for bfloat16 (vthumbe1503, Dec 8, 2025)
- 67b1a93: Merge branch 'fix_split_tensor_bug' of github.com:vthumbe1503/Transfo… (vthumbe1503, Dec 8, 2025)
- 4cd7895: [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Dec 8, 2025)
72 changes: 72 additions & 0 deletions tests/pytorch/test_numerics.py
@@ -2976,3 +2976,75 @@ def _run_module(m, inp):
    out = _run_module(g2, b)

    assert_allclose(out, outT, 1e-7)


@pytest.mark.parametrize(
    "quantization_type",
    [
        "fp8",
        "mxfp8",
    ],
)
@pytest.mark.parametrize(
    "shape,chunks,dim",
    [
        ((64, 128), 2, 0),  # Split along first dimension, needs padding for mxfp8
        ((64, 128), 2, 1),  # Split along second dimension, goes down dequantization path for mxfp8
    ],
)
def test_fp8_split_functionality(quantization_type, shape, chunks, dim):
    """Test torch.chunk on FP8 and MXFP8 tensors and verify correctness via dequantization."""
    if quantization_type == "fp8" and not fp8_available:
        pytest.skip(reason_for_no_fp8)
    if quantization_type == "mxfp8" and not mxfp8_available:
        pytest.skip(reason_for_no_mxfp8)

    device = "cuda"
    dtype = torch.bfloat16

    # Create reference tensor
    torch.manual_seed(1234)
    torch.cuda.manual_seed(1234)
    ref_tensor = torch.randn(shape, device=device, dtype=dtype)

    # Quantize the tensor
    if quantization_type == "fp8":
        quantizer = Float8Quantizer(
            scale=torch.ones(1, dtype=torch.float32, device=device).squeeze(),
            amax=torch.zeros(1, dtype=torch.float32, device=device),
            fp8_dtype=tex.DType.kFloat8E4M3,
        )
        quantized_tensor = quantizer(ref_tensor)
    elif quantization_type == "mxfp8":
        quantizer = MXFP8Quantizer(fp8_dtype=tex.DType.kFloat8E4M3)
        quantized_tensor = quantizer(ref_tensor)

    # Apply torch.chunk on the quantized tensor
    quantized_tensor_dispatch_out = torch.chunk(quantized_tensor, chunks, dim=dim)
    # Need to make the chunks contiguous for dim=1 splitting.
    outs = [out.contiguous() for out in quantized_tensor_dispatch_out]
    if dim == 0 or quantization_type == "fp8":
        # Dequantize the chunked results
        chunked_dequantized = [chunk.dequantize() for chunk in outs]
    else:
        # For MXFP8, splitting along the second dimension currently falls back to the
        # dequantization path, so the chunks are already high-precision tensors.
        chunked_dequantized = outs

    # Reference: chunk the dequantized tensor directly
    ref_dequantized = quantized_tensor.dequantize()
    ref_chunked = torch.chunk(ref_dequantized, chunks, dim=dim)

    # Compare results
    assert len(chunked_dequantized) == len(
        ref_chunked
    ), f"Number of chunks mismatch: {len(chunked_dequantized)} vs {len(ref_chunked)}"

    for i, (chunk_deq, ref_chunk) in enumerate(zip(chunked_dequantized, ref_chunked)):
        assert (
            chunk_deq.shape == ref_chunk.shape
        ), f"Chunk {i} shape mismatch: {chunk_deq.shape} vs {ref_chunk.shape}"
        torch.testing.assert_close(
            chunk_deq,
            ref_chunk,
        )
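For context, here is a minimal standalone sketch (not Transformer Engine code) of the mechanism this test exercises: torch.chunk on a tensor subclass is routed through `__torch_dispatch__`, where the subclass sees the underlying aten op (typically `aten.split.Tensor`) and can split its packed data and scale buffers itself. The `LoggingTensor` class and its unwrap helper below are illustrative names only.

```python
import torch
from torch.utils._pytree import tree_map


class LoggingTensor(torch.Tensor):
    """Toy subclass that prints every aten op it intercepts (illustration only)."""

    @classmethod
    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
        print(f"dispatched: {func}")  # torch.chunk typically arrives as aten.split.Tensor
        kwargs = kwargs or {}

        # Unwrap back to plain tensors so the call below does not re-enter this method.
        def unwrap(t):
            return t.as_subclass(torch.Tensor) if isinstance(t, LoggingTensor) else t

        return func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs))


x = torch.randn(4, 4).as_subclass(LoggingTensor)
chunks = torch.chunk(x, 2, dim=0)  # prints the intercepted op(s); MXFP8Tensor hooks this same path
```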
6 changes: 5 additions & 1 deletion transformer_engine/pytorch/tensor/float8_tensor.py
@@ -560,7 +560,11 @@ def contiguous(
             memory_format=memory_format
         ):
             return self
-        return Float8Tensor.make_like(tensor=self, data=self._data.contiguous())
+        return Float8Tensor.make_like(
+            tensor=self,
+            data=self._data.contiguous(),
+            data_transpose=self._transpose.contiguous() if self._transpose is not None else None,
+        )

         # raise ValueError("Float8Tensor does not support different memory formats!")

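The updated return makes both buffers contiguous: the packed FP8 data and, when present, the cached transpose. A minimal standalone sketch of that invariant, using a toy class and names rather than the Float8Tensor API:

```python
from typing import Optional

import torch


class PackedPair:
    """Toy stand-in for a tensor that carries a primary buffer plus a cached transpose."""

    def __init__(self, data: torch.Tensor, data_t: Optional[torch.Tensor] = None):
        self.data = data      # stand-in for Float8Tensor._data
        self.data_t = data_t  # stand-in for Float8Tensor._transpose

    def contiguous(self) -> "PackedPair":
        # Both buffers are made contiguous together; forwarding only `data` would
        # leave the cached transpose behind or in a non-contiguous layout.
        return PackedPair(
            self.data.contiguous(),
            self.data_t.contiguous() if self.data_t is not None else None,
        )
```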
13 changes: 8 additions & 5 deletions transformer_engine/pytorch/tensor/mxfp8_tensor.py
@@ -434,13 +434,16 @@ def __torch_dispatch__(cls, func, types, args, kwargs=None):
                     if scale_inv is not None
                     else None
                 )
+                scale_inv_out = list(scale_inv_out) if scale_inv_out is not None else None
                 # Pad scale_inv_out to be a multiple of pad_multiple
                 if scale_inv_out is not None:
-                    current_shape = scale_inv_out.shape
-                    pad_dim0 = (pad_multiple - current_shape[0] % pad_multiple) % pad_multiple
-                    if pad_dim0 > 0:
-                        scale_inv_out = torch.nn.functional.pad(scale_inv_out, (0, 0, 0, pad_dim0))
-
+                    for idx, split_scale_inv_out in enumerate(scale_inv_out):
+                        current_shape = split_scale_inv_out.shape
+                        pad_dim0 = (pad_multiple - current_shape[0] % pad_multiple) % pad_multiple
+                        if pad_dim0 > 0:
+                            scale_inv_out[idx] = torch.nn.functional.pad(
+                                split_scale_inv_out, (0, 0, 0, pad_dim0)
+                            )
                 out_data.append(scale_inv_out)
             return [
                 MXFP8Tensor(
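The new loop pads each per-split scale-inverse tensor independently instead of padding the pre-split tensor once. A standalone sketch of that row-padding step; the helper name is hypothetical, and `pad_multiple` comes from the surrounding code and is only assumed to be 128 here:

```python
import torch


def pad_split_scale_invs(split_scale_invs, pad_multiple=128):
    """Pad the row count of each per-chunk scale-inverse tensor up to a multiple of
    ``pad_multiple`` (illustrative helper, not the Transformer Engine implementation)."""
    padded = list(split_scale_invs)
    for idx, scale_inv in enumerate(padded):
        pad_dim0 = (pad_multiple - scale_inv.shape[0] % pad_multiple) % pad_multiple
        if pad_dim0 > 0:
            # For a 2-D input, torch.nn.functional.pad takes (left, right, top, bottom),
            # so (0, 0, 0, pad_dim0) appends pad_dim0 rows of zeros.
            padded[idx] = torch.nn.functional.pad(scale_inv, (0, 0, 0, pad_dim0))
    return padded


# Example: three even splits of a 96-row scale_inv tensor, each padded back up to 128 rows.
splits = torch.chunk(torch.ones(96, 4, dtype=torch.uint8), 3, dim=0)
print([s.shape for s in pad_split_scale_invs(splits)])
# [torch.Size([128, 4]), torch.Size([128, 4]), torch.Size([128, 4])]
```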