feat: Softmax free sampling (#1035)

kf-zhang · web-flow · commit 3f76969aced6 · 2025-04-28T10:11:37.000-04:00
diff --git a/benchmarks/bench_sampling.py b/benchmarks/bench_sampling.py
@@ -27,6 +27,18 @@ def init_seed_sampling(*args, **kwargs):
     return flashinfer.sampling.sampling_from_probs(*args, **kwargs)
 
 
+def init_seed_sampling_from_logits(*args, **kwargs):
+    """Reseed torch's RNG with 42, then forward all arguments to
+    ``flashinfer.sampling.sampling_from_logits`` and return its result."""
+    torch.manual_seed(42)
+    sampler = flashinfer.sampling.sampling_from_logits
+    return sampler(*args, **kwargs)
+
+
+def init_seed_sampling_from_softmax_logits(logits, *args, **kwargs):
+    """Reseed torch's RNG with 42, apply softmax over the last dimension of
+    ``logits``, and sample from the resulting probabilities.
+
+    Any extra positional/keyword arguments are forwarded unchanged to
+    ``flashinfer.sampling.sampling_from_probs``.
+    """
+    torch.manual_seed(42)
+    probs = torch.softmax(logits, dim=-1)
+    return flashinfer.sampling.sampling_from_probs(probs, *args, **kwargs)
+
+
 def init_seed_top_k_sampling(*args, **kwargs):
     torch.manual_seed(42)
     return flashinfer.sampling.top_k_sampling_from_probs(*args, **kwargs)
@@ -139,6 +151,69 @@ def main():
                             f"vocab_size: {vocab_size}, batch_size: {batch_size}, distrib: {distrib.__name__}, deterministic: {deterministic}, p: {p}, duration: {ms*1e3:.2f} us, effective bandwidth: {bandwidth:.2f} GB/s"
                         )
 
+    # Baseline: explicit torch.softmax followed by sampling_from_probs.
+    print("---")
+    print("sampling from softmax(logits)")
+    for vocab_size in [128512]:
+        for batch_size in [1, 16, 32, 64, 128, 256, 512]:
+            for distrib in [
+                normal_distribution(1),
+                normal_distribution(5),
+                gumbel_distribution(0.1),
+                gumbel_distribution(1),
+            ]:
+                for deterministic in [True, False]:
+                    logits = distrib((batch_size, vocab_size), device="cuda")
+                    # `samples` only sizes the output traffic in the bandwidth
+                    # estimate below. It must not be passed to the sampler: the
+                    # second positional parameter of sampling_from_probs is
+                    # `indices`, so an all-zero tensor would make every output
+                    # sample row 0 instead of its own row.
+                    samples = torch.zeros(
+                        batch_size, dtype=torch.int32, device=logits.device
+                    )
+                    ms = do_bench(
+                        lambda: init_seed_sampling_from_softmax_logits(
+                            logits, deterministic=deterministic
+                        ),
+                        warmup=100,
+                        rep=1000,
+                    )
+                    # Bytes of logits read plus bytes of sample ids written.
+                    io = (
+                        logits.numel() * logits.element_size()
+                        + samples.numel() * samples.element_size()
+                    )
+                    bandwidth = io * 1e-6 / ms
+                    print(
+                        f"vocab_size: {vocab_size}, batch_size: {batch_size}, distrib: {distrib.__name__}, deterministic: {deterministic}, duration: {ms*1e3:.2f} us, effective bandwidth: {bandwidth:.2f} GB/s"
+                    )
+
+    # Fused path: sample straight from logits with no separate softmax pass.
+    print("---")
+    print("sampling from logits")
+    for vocab_size in [128512]:
+        for batch_size in [1, 16, 32, 64, 128, 256, 512]:
+            for distrib in [
+                normal_distribution(1),
+                normal_distribution(5),
+                gumbel_distribution(0.1),
+                gumbel_distribution(1),
+            ]:
+                for deterministic in [True, False]:
+                    logits = distrib((batch_size, vocab_size), device="cuda")
+                    # `samples` only sizes the output traffic in the bandwidth
+                    # estimate below. It must not be passed to the sampler: the
+                    # second positional parameter of sampling_from_logits is
+                    # `indices`, so an all-zero tensor would make every output
+                    # sample row 0 instead of its own row.
+                    samples = torch.zeros(
+                        batch_size, dtype=torch.int32, device=logits.device
+                    )
+                    ms = do_bench(
+                        lambda: init_seed_sampling_from_logits(
+                            logits, deterministic=deterministic
+                        ),
+                        warmup=100,
+                        rep=1000,
+                    )
+
+                    # Bytes of logits read plus bytes of sample ids written.
+                    io = (
+                        logits.numel() * logits.element_size()
+                        + samples.numel() * samples.element_size()
+                    )
+                    bandwidth = io * 1e-6 / ms
+                    print(
+                        f"vocab_size: {vocab_size}, batch_size: {batch_size}, distrib: {distrib.__name__}, deterministic: {deterministic}, duration: {ms*1e3:.2f} us, effective bandwidth: {bandwidth:.2f} GB/s"
+                    )
+
 
 if __name__ == "__main__":
     main()
diff --git a/csrc/flashinfer_ops.cu b/csrc/flashinfer_ops.cu
@@ -176,6 +176,10 @@ void sampling_from_probs(at::Tensor probs, at::Tensor output,
                          std::optional<at::Tensor> maybe_indices, bool deterministic,
                          std::optional<at::Generator> gen);
 
+// Samples category ids from `logits` into `output`; the definition is in
+// csrc/sampling.cu. `maybe_indices`, when present, is forwarded to the kernel.
+void sampling_from_logits(at::Tensor logits, at::Tensor output,
+                          std::optional<at::Tensor> maybe_indices, bool deterministic,
+                          std::optional<at::Generator> gen);
+
 void top_p_sampling_from_probs(at::Tensor probs, at::Tensor output,
                                std::optional<at::Tensor> maybe_indices,
                                std::optional<at::Tensor> maybe_top_p_arr, double top_p_val,
@@ -294,6 +298,8 @@ TORCH_LIBRARY_FRAGMENT(TORCH_EXTENSION_NAME, m) {
   // sampling
   // Sample from probabilities
   m.def("sampling_from_probs", sampling_from_probs);
+  // Sample from logits
+  m.def("sampling_from_logits", sampling_from_logits);
   // Top-k sampling from probabilities
   m.def("top_k_sampling_from_probs", top_k_sampling_from_probs);
   // Min-p sampling from probabilities
diff --git a/csrc/flashinfer_sampling_ops.cu b/csrc/flashinfer_sampling_ops.cu
@@ -19,6 +19,10 @@ void sampling_from_probs(at::Tensor probs, at::Tensor output,
                          std::optional<at::Tensor> maybe_indices, bool deterministic,
                          std::optional<at::Generator> gen);
 
+// Samples category ids from `logits` into `output`; the definition is in
+// csrc/sampling.cu. `maybe_indices`, when present, is forwarded to the kernel.
+void sampling_from_logits(at::Tensor logits, at::Tensor output,
+                          std::optional<at::Tensor> maybe_indices, bool deterministic,
+                          std::optional<at::Generator> gen);
+
 void top_p_sampling_from_probs(at::Tensor probs, at::Tensor output,
                                std::optional<at::Tensor> maybe_indices,
                                std::optional<at::Tensor> maybe_top_p_arr, double top_p_val,
@@ -58,6 +62,8 @@ void chain_speculative_sampling(at::Tensor draft_probs, at::Tensor draft_token_i
 TORCH_LIBRARY_FRAGMENT(TORCH_EXTENSION_NAME, m) {
   // Sample from probabilities
   m.def("sampling_from_probs", sampling_from_probs);
+  // Sample from logits
+  m.def("sampling_from_logits", sampling_from_logits);
   // Top-k sampling from probabilities
   m.def("top_k_sampling_from_probs", top_k_sampling_from_probs);
   // Min-p sampling from probabilities
diff --git a/csrc/sampling.cu b/csrc/sampling.cu
@@ -25,6 +25,33 @@
 
 using namespace flashinfer;
 
+// Samples one category id per element of `output` directly from `logits`.
+//
+//   logits        : 2-D (batch_size, vocab_size) tensor, must be float32.
+//   output        : receives the sampled ids; its first dimension sets the
+//                   number of samples drawn.
+//   maybe_indices : optional tensor forwarded to the kernel (nullptr if absent).
+//   deterministic : forwarded unchanged to sampling::SamplingFromLogits.
+//   gen_          : optional generator; the default CUDA generator is used
+//                   when it is not provided.
+//
+// Raises (via TORCH_CHECK) if the input checks fail or the kernel launcher
+// returns an error.
+void sampling_from_logits(at::Tensor logits, at::Tensor output,
+                          std::optional<at::Tensor> maybe_indices, bool deterministic,
+                          std::optional<at::Generator> gen_) {
+  CHECK_INPUT(logits);
+  // The launcher below reinterprets the buffer as float*, so any other dtype
+  // would be silently misread.
+  TORCH_CHECK(logits.scalar_type() == at::kFloat, "logits must be a float32 tensor");
+  auto device = logits.device();
+  CHECK_DIM(2, logits);  // logits: (batch_size, vocab_size)
+  unsigned int batch_size = output.size(0);
+  unsigned int vocab_size = logits.size(1);
+
+  uint64_t philox_seed, philox_offset;
+  auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
+      gen_, at::cuda::detail::getDefaultCUDAGenerator());
+  std::lock_guard<std::mutex> lock(gen->mutex_);
+  // Widen before multiplying: a 32-bit product wraps once
+  // batch_size * vocab_size exceeds UINT_MAX, which would reserve too small a
+  // Philox offset increment.
+  at::PhiloxCudaState rng_engine_inputs =
+      gen->philox_cuda_state(static_cast<uint64_t>(batch_size) * vocab_size);
+  philox_seed = rng_engine_inputs.seed_.val;
+  philox_offset = rng_engine_inputs.offset_.val;
+
+  const c10::cuda::OptionalCUDAGuard device_guard(device);
+  auto stream = at::cuda::getCurrentCUDAStream();
+  cudaError_t status = sampling::SamplingFromLogits(
+      static_cast<float*>(logits.data_ptr()), static_cast<int*>(output.data_ptr()),
+      maybe_indices.has_value() ? static_cast<int*>(maybe_indices->data_ptr()) : nullptr,
+      batch_size, vocab_size, deterministic, philox_seed, philox_offset, stream);
+  TORCH_CHECK(status == cudaSuccess, "SamplingFromLogits failed with error code " +
+                                         std::string(cudaGetErrorString(status)));
+}
+
 void sampling_from_probs(at::Tensor probs, at::Tensor output,
                          std::optional<at::Tensor> maybe_indices, bool deterministic,
                          std::optional<at::Generator> gen_) {
diff --git a/flashinfer/__init__.py b/flashinfer/__init__.py
@@ -80,6 +80,7 @@
 )
 from .sampling import chain_speculative_sampling as chain_speculative_sampling
 from .sampling import min_p_sampling_from_probs as min_p_sampling_from_probs
+from .sampling import sampling_from_logits as sampling_from_logits
 from .sampling import sampling_from_probs as sampling_from_probs
 from .sampling import top_k_mask_logits as top_k_mask_logits
 from .sampling import top_k_renorm_probs as top_k_renorm_probs
diff --git a/flashinfer/sampling.py b/flashinfer/sampling.py
@@ -42,6 +42,39 @@ def get_sampling_module():
                 ],
             )
 
+        # torch library for sampling_from_logits
+        @register_custom_op("flashinfer::sampling_from_logits", mutates_args=())
+        def sampling_from_logits(
+            logits: torch.Tensor,
+            indices: Optional[torch.Tensor],
+            deterministic: bool,
+            generator: Optional[torch.Generator],
+        ) -> torch.Tensor:
+            """Allocate an int32 output on the logits' device, run the compiled
+            ``sampling_from_logits`` kernel into it, and return it."""
+            # One sample per entry of ``indices`` when given, else one per row.
+            num_samples = logits.size(0) if indices is None else indices.size(0)
+            out = torch.empty(num_samples, dtype=torch.int32, device=logits.device)
+            # TODO: support more data types in logits to avoid conversion
+            # to float32
+            logits_f32 = logits.float()
+            module.sampling_from_logits.default(
+                logits_f32, out, indices, deterministic, generator
+            )
+            return out
+
+        @register_fake_op("flashinfer::sampling_from_logits")
+        def _fake_sampling_from_logits(
+            logits: torch.Tensor,
+            indices: Optional[torch.Tensor],
+            deterministic: bool,
+            generator: Optional[torch.Generator],
+        ) -> torch.Tensor:
+            """Fake implementation: return an uninitialized int32 tensor with the
+            same length and device as the real op's output, without sampling."""
+            # Same output-length rule as the real op.
+            num_samples = logits.size(0) if indices is None else indices.size(0)
+            return torch.empty(num_samples, dtype=torch.int32, device=logits.device)
+
         # torch library for sampling_from_probs
 
         @register_custom_op("flashinfer::sampling_from_probs", mutates_args=())
@@ -64,6 +97,8 @@ def sampling_from_probs(
             )
             return samples
 
+        # fake op registration for sampling_from_probs
+
         @register_fake_op("flashinfer::sampling_from_probs")
         def _fake_sampling_from_probs(
             probs: torch.Tensor,
@@ -384,6 +419,7 @@ def _fake_chain_speculative_sampling(
         # Register the module
         _sampling_module = SimpleNamespace(
             sampling_from_probs=sampling_from_probs,
+            sampling_from_logits=sampling_from_logits,
             top_p_sampling_from_probs=top_p_sampling_from_probs,
             top_k_sampling_from_probs=top_k_sampling_from_probs,
             min_p_sampling_from_probs=min_p_sampling_from_probs,
@@ -404,6 +440,64 @@ def _to_tensor_scalar_tuple(x):
         return (None, x)
 
 
+def sampling_from_logits(
+    logits: torch.Tensor,
+    indices: Optional[torch.Tensor] = None,
+    deterministic: bool = True,
+    generator: Optional[torch.Generator] = None,
+    check_nan: bool = False,
+) -> torch.Tensor:
+    r"""Fused GPU kernel for categorical sampling from logits.
+
+    Equivalent to sampling from :attr:`logits` after applying softmax.
+
+    Parameters
+    ----------
+    logits: torch.Tensor
+        Logits to sample from. Without :attr:`indices`, the shape should be
+        ``(batch_size, num_classes)`` and the i-th output is sampled from the
+        i-th row. With :attr:`indices`, the shape should be
+        ``(unique_batch_size, num_classes)``, where ``unique_batch_size`` is the
+        number of unique distributions.
+    indices: Optional[torch.Tensor]
+        Optional tensor of shape ``(batch_size,)`` mapping each output to a row
+        of :attr:`logits`: if ``indices[i] = j``, the i-th output is sampled from
+        ``logits[j]``. This lets several outputs share one distribution. When
+        omitted, the i-th output is sampled from the i-th row of :attr:`logits`.
+    deterministic: bool
+        Kept for compatibility with the other sampling functions. This sampling
+        does not use cub's BlockScan, so it is deterministic.
+    generator: Optional[torch.Generator]
+        A random number generator for the operation.
+    check_nan: bool
+        Whether to check for NaN in :attr:`logits`; default is ``False``.
+
+    Returns
+    -------
+    samples: torch.Tensor
+        Sampled categories, shape ``(batch_size,)``.
+
+    Raises
+    ------
+    ValueError
+        If :attr:`check_nan` is set and :attr:`logits` contains NaN.
+
+    Examples
+    --------
+    >>> import torch
+    >>> import flashinfer
+    >>> torch.manual_seed(42)
+    >>> batch_size = 4
+    >>> vocab_size = 5
+    >>> logits = torch.rand(batch_size, vocab_size).to(0)
+    >>> logits
+    tensor([[0.8823, 0.9150, 0.3829, 0.9593, 0.3904],
+            [0.6009, 0.2566, 0.7936, 0.9408, 0.1332],
+            [0.9346, 0.5936, 0.8694, 0.5677, 0.7411],
+            [0.4294, 0.8854, 0.5739, 0.2666, 0.6274]], device='cuda:0')
+    >>> samples = flashinfer.sampling.sampling_from_logits(logits)
+    >>> samples
+    tensor([0, 1, 1, 1], device='cuda:0', dtype=torch.int32)
+    """
+    # The NaN scan is opt-in; it is skipped entirely when check_nan is False.
+    if check_nan and torch.any(torch.isnan(logits)):
+        raise ValueError("Input logits contains NaN.")
+    sampler = get_sampling_module().sampling_from_logits
+    return sampler(logits, indices, deterministic, generator)
+
+
 def sampling_from_probs(
     probs: torch.Tensor,
     indices: Optional[torch.Tensor] = None,
diff --git a/include/flashinfer/sampling.cuh b/include/flashinfer/sampling.cuh
diff --git a/tests/conftest.py b/tests/conftest.py
diff --git a/tests/test_sampling.py b/tests/test_sampling.py

Original file line number	Diff line number	Diff line change
`@@ -80,6 +80,7 @@`
`80`	`80`	`)`
`81`	`81`	`from .sampling import chain_speculative_sampling as chain_speculative_sampling`
`82`	`82`	`from .sampling import min_p_sampling_from_probs as min_p_sampling_from_probs`
	`83`	`+from .sampling import sampling_from_logits as sampling_from_logits`
`83`	`84`	`from .sampling import sampling_from_probs as sampling_from_probs`
`84`	`85`	`from .sampling import top_k_mask_logits as top_k_mask_logits`
`85`	`86`	`from .sampling import top_k_renorm_probs as top_k_renorm_probs`