feat(speculative): add typical-acceptance verify mode for Eagle3 draft

in-vm-agent · lujangus · commit d1378323372e · 2026-05-09T14:30:22.000Z
Squeeze pipeline Track B B1: a drop-in, flag-gated alternative to strict rejection-sampling verification.

Adds two server-args flags to ServerArgs in sglang/srt/server_args.py:

    --speculative-verify-mode {rejection_sampling, typical_acceptance}
    --speculative-typical-acceptance-alpha FLOAT (default 0.8)

When `speculative_verify_mode == "typical_acceptance"`, the Eagle3 verification path in `sglang/srt/speculative/eagle_info.py` overrides both `threshold_single` and `threshold_acc` with the alpha value before calling the existing `tree_speculative_sampling_target_only` kernel. The kernel acceptance condition

    if (coin <= prob_acc / threshold_acc || target_prob_single >= threshold_single) {
        // accept token
    }

(in `sgl-kernel/csrc/speculative/speculative_sampling.cuh:80`) acts as a Medusa-style typical-acceptance rule when threshold_single == threshold_acc == alpha and 0 < alpha <= 1. So the kernel math is already correct; this commit just exposes the alpha knob.

Defaults preserve existing behavior: rejection_sampling is the default mode, and the existing `--speculative-accept-threshold-{single,acc}` flags continue to work unchanged. alpha=1.0 in typical_acceptance mode also reproduces strict rejection sampling (i.e. the behavior with both thresholds at their default of 1.0).

Scope is intentionally narrow, per the squeeze B1 preflight at `experiments/MiniMax-M2.5/squeeze/relaxed/B1-typical-acceptance/preflight.md`:

- Eagle3 path only (eagle_info.py). ngram_info.py and dflash_utils.py also call tree_speculative_sampling_target_only but are not in the squeeze experiment scope; they continue to use the strict thresholds.
- Global server-args flag, not per-request. This avoids mixed-mode KV-cache state. A per-request override is deferred to a future revision if needed.

To use:

    python -m sglang.launch_server \
        --model-path <target> \
        --speculative-algorithm EAGLE3 \
        --speculative-draft-model-path thoughtworks/<Model>-Eagle3 \
        --speculative-verify-mode typical_acceptance \
        --speculative-typical-acceptance-alpha 0.8 \
        ...
Squeeze B1 alpha-sweep protocol: alpha in {0.7, 0.8, 0.9}, with alpha=1.0 as a control that reproduces the rejection-sampling baseline. Per-dataset quality must stay within 3% of the lossless Exp F baseline at every concurrency point, per the quality floor in squeeze plan §187.

The branch sits on top of `fix/llama-eagle3-fp8-aux-dtype-cast` (commit 71e0bf0), so it can run end-to-end on FP8 targets such as MiniMaxAI/MiniMax-M2.5 immediately.
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
@@ -514,6 +514,14 @@ class ServerArgs:
     speculative_adaptive_topk_recovery: float = 0.0
     speculative_adaptive_topk_window: int = 32
     speculative_adaptive_topk_alpha: float = 0.1  # EMA smoothing
+    # Track-B Squeeze: relaxed verification via Medusa-style typical acceptance.
+    # In "typical_acceptance" mode, `speculative_accept_threshold_{single,acc}` are
+    # both replaced by `speculative_typical_acceptance_alpha` at Eagle3 verify time.
+    # The default, "rejection_sampling", preserves the existing strict behavior.
+    speculative_verify_mode: Literal[
+        "rejection_sampling", "typical_acceptance"
+    ] = "rejection_sampling"
+    speculative_typical_acceptance_alpha: float = 0.8
     speculative_token_map: Optional[str] = None
     speculative_attention_mode: str = "prefill"
     speculative_draft_attention_backend: Optional[str] = None
@@ -5132,6 +5140,34 @@ def add_cli_args(parser: argparse.ArgumentParser):
             help="A3.1 — EMA smoothing factor in [0, 1] for accept-length tracking. Higher = more responsive to recent batches.",
             default=ServerArgs.speculative_adaptive_topk_alpha,
         )
+        parser.add_argument(
+            "--speculative-verify-mode",
+            type=str,
+            choices=["rejection_sampling", "typical_acceptance"],
+            default=ServerArgs.speculative_verify_mode,
+            help=(
+                "Verification regime for speculative decoding. "
+                "'rejection_sampling' (default) is strict: a draft token is accepted "
+                "iff coin <= prob_acc/threshold_acc OR target_prob_single >= "
+                "threshold_single, with both thresholds defaulting to 1.0. "
+                "'typical_acceptance' is the Medusa-style alpha-tunable mode: both "
+                "thresholds are set to --speculative-typical-acceptance-alpha at verify "
+                "time, trading a small amount of distributional fidelity for higher "
+                "accept rate (and hence higher throughput) on long-tail tokens. The "
+                "trade-off is alpha-tunable; alpha=1.0 reproduces rejection sampling, "
+                "alpha~0.7-0.9 is the typical Squeeze Track-B operating range."
+            ),
+        )
+        parser.add_argument(
+            "--speculative-typical-acceptance-alpha",
+            type=float,
+            default=ServerArgs.speculative_typical_acceptance_alpha,
+            help=(
+                "Alpha threshold for typical_acceptance verify mode. Ignored when "
+                "--speculative-verify-mode is rejection_sampling. Range (0, 1]. "
+                "Smaller values -> higher accept rate, lower distributional fidelity."
+            ),
+        )
         parser.add_argument(
             "--speculative-token-map",
             type=str,
diff --git a/python/sglang/srt/speculative/eagle_info.py b/python/sglang/srt/speculative/eagle_info.py
@@ -381,6 +381,17 @@ def verify(
             coins_for_final_sampling = torch.rand(
                 (bs,), dtype=torch.float32, device=batch.device
             )
+            # Track-B Squeeze: typical-acceptance verify mode overrides both
+            # threshold knobs with a single alpha. alpha=1.0 reproduces
+            # rejection sampling (the strict default).
+            _server_args = get_global_server_args()
+            if _server_args.speculative_verify_mode == "typical_acceptance":
+                _alpha = _server_args.speculative_typical_acceptance_alpha
+                _threshold_single = _alpha
+                _threshold_acc = _alpha
+            else:
+                _threshold_single = _server_args.speculative_accept_threshold_single
+                _threshold_acc = _server_args.speculative_accept_threshold_acc
             tree_speculative_sampling_target_only(
                 predicts=predict,  # mutable
                 accept_index=accept_index,  # mutable
@@ -393,8 +404,8 @@ def verify(
                 uniform_samples_for_final_sampling=coins_for_final_sampling,
                 target_probs=target_probs,
                 draft_probs=draft_probs,
-                threshold_single=get_global_server_args().speculative_accept_threshold_single,
-                threshold_acc=get_global_server_args().speculative_accept_threshold_acc,
+                threshold_single=_threshold_single,
+                threshold_acc=_threshold_acc,
                 deterministic=True,
             )