MustaphaU
diff --git a/‎benchmarks/cpp/gptManagerBenchmark.cpp
+44 b/‎benchmarks/cpp/gptManagerBenchmark.cpp
+44
diff --git a/‎benchmarks/python/build.py
+1 b/‎benchmarks/python/build.py
+1
diff --git a/‎cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a
+2-2 b/‎cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a
+2-2
diff --git a/‎cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a
+2-2 b/‎cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a
+2-2
diff --git a/‎cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt
+3-3 b/‎cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt
+3-3
diff --git a/‎cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a
+2-2 b/‎cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a
+2-2
diff --git a/‎cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a
+2-2 b/‎cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a
+2-2
diff --git a/‎cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib
+2-2 b/‎cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib
+2-2
diff --git a/‎cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a
+2-2 b/‎cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a
+2-2
diff --git a/‎cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a
+2-2 b/‎cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a
+2-2
diff --git a/‎cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt
+3-3 b/‎cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt
+3-3
diff --git a/‎cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a
+2-2 b/‎cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a
+2-2
diff --git a/‎cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a
+2-2 b/‎cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a
+2-2
diff --git a/‎cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib
+2-2 b/‎cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib
+2-2
diff --git a/‎cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template.h
+3-2 b/‎cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template.h
+3-2
diff --git a/‎cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt
+1-1 b/‎cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt
+1-1
diff --git a/‎cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll
+1-1 b/‎cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll
+1-1
diff --git a/‎cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_bf16 .cu ‎cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_bf16.cu b/‎cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_bf16 .cu ‎cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_bf16.cu
diff --git a/‎cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu
+53-13 b/‎cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu
+53-13
diff --git a/‎cpp/tensorrt_llm/pybind/executor/bindings.cpp
+4-2 b/‎cpp/tensorrt_llm/pybind/executor/bindings.cpp
+4-2
diff --git a/‎cpp/tensorrt_llm/runtime/medusaModule.cpp
+3 b/‎cpp/tensorrt_llm/runtime/medusaModule.cpp
+3
@@ -165,6 +165,9 @@ struct BenchmarkParams
 
     // Weights offloading
     float gpuWeightsPercent{1.0};
+
+    // Decoding params
+    std::optional<std::vector<std::vector<SizeType32>>> medusaChoices;
 };
 
 class InferenceRequestsSyncSend
@@ -791,6 +794,10 @@ class ExecutorServer
             executorConfig.setMaxBatchSize(benchmarkParams.maxBatchSize.value());
         }
 
+        executorConfig.setDecodingConfig(texec::DecodingConfig(
+            benchmarkParams.medusaChoices.has_value() ? texec::DecodingMode::Medusa() : texec::DecodingMode::Auto(),
+            std::nullopt, benchmarkParams.medusaChoices));
+
         mExecutor = std::make_unique<texec::Executor>(trtEnginePath, texec::ModelType::kDECODER_ONLY, executorConfig);
 
         if (logIterationData)
@@ -1346,6 +1353,9 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
     optionalParams.maxBeamWidth = beamWidth;
     optionalParams.maxBatchSize = benchmarkParams.maxBatchSize;
     optionalParams.schedulerConfig = texec::SchedulerConfig{capacitySchedulerPolicy};
+    optionalParams.decodingConfig = texec::DecodingConfig(
+        benchmarkParams.medusaChoices.has_value() ? texec::DecodingMode::Medusa() : texec::DecodingMode::Auto(),
+        std::nullopt, benchmarkParams.medusaChoices);
 
     auto const jsonConfig = GptJsonConfig::parse(engineDir / "config.json");
     SizeType32 deviceCount{0};
@@ -1600,6 +1610,32 @@ void benchmarkExecutor(std::filesystem::path const& engineDir, TrtGptModelType m
     }
 }
 
+std::vector<std::vector<SizeType32>> parseVectorOfVectors(std::string const& input)
+{
+    std::vector<std::vector<SizeType32>> result;
+    std::regex outer_regex(R"(\[(.*?)\])");
+    std::regex inner_regex(R"(\d+)");
+    auto outer_begin = std::sregex_iterator(input.begin(), input.end(), outer_regex);
+    auto outer_end = std::sregex_iterator();
+
+    for (std::sregex_iterator i = outer_begin; i != outer_end; ++i)
+    {
+        std::smatch match = *i;
+        std::string inner_str = match.str(1);
+        std::vector<int> inner_vec;
+        auto inner_begin = std::sregex_iterator(inner_str.begin(), inner_str.end(), inner_regex);
+        auto inner_end = std::sregex_iterator();
+
+        for (std::sregex_iterator j = inner_begin; j != inner_end; ++j)
+        {
+            std::smatch inner_match = *j;
+            inner_vec.push_back(std::stoi(inner_match.str()));
+        }
+        result.push_back(inner_vec);
+    }
+    return result;
+}
+
 } // namespace
 
 int main(int argc, char* argv[])
@@ -1692,6 +1728,8 @@ int main(int argc, char* argv[])
     options.add_options()("gpu_weights_percent",
         "Specify the percentage of weights that reside on GPU (from 0.0 to 1.0).",
         cxxopts::value<float>()->default_value("1.0"));
+    options.add_options()(
+        "medusa_choices", "Medusa choices in the format of [[0], [0, 1], [0, 0, 1]]", cxxopts::value<std::string>());
 
     auto result = options.parse(argc, argv);
 
@@ -1823,6 +1861,12 @@ int main(int argc, char* argv[])
     // Argument: If offloaded blocks should be onboarded to primary memory before they are reused.
     benchmarkParams.kvOnboardBlocks = !result["kv_dont_onboard_blocks"].as<bool>();
 
+    // Argument: Medusa choices for the Medusa speculative decoding.
+    if (result.count("medusa_choices"))
+    {
+        benchmarkParams.medusaChoices = parseVectorOfVectors(result["medusa_choices"].as<std::string>());
+    }
+
     std::optional<TokenIdType> padId;
     // Argument: Padding token id
     if (result.count("pad_id"))
 
@@ -944,6 +944,7 @@ def build_gpt(args):
     network = builder.create_network()
     network.trt_network.name = engine_name
     network.plugin_config.to_legacy_setting()
+    network.plugin_config.dtype = args.dtype
 
     # Plugins
     if args.mode in ['plugin', 'plugin-ifb']:
 
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1fec0fdc00c076761ec48eb5e2ea93473a329e844a8091e26c6e3e02fd14a8b1
-size 3931604
+oid sha256:8b6ad33047e2684c7d22471f87febbb96ae26f4eac6529e2f3b7c1469ec2ec6d
+size 3931504
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1fec0fdc00c076761ec48eb5e2ea93473a329e844a8091e26c6e3e02fd14a8b1
-size 3931604
+oid sha256:560f736af15a4dfba849ab29efc3520d6ec8c87bf2aa16589299b232dc171cca
+size 3989220
@@ -1,3 +1,3 @@
-93adf3003d7c422586a9bf892367371d libtensorrt_llm_batch_manager_static.a
-93adf3003d7c422586a9bf892367371d libtensorrt_llm_batch_manager_static.pre_cxx11.a
-c0bd2b69c932257678a2aad9bd8baba4b291795e commit
+f8538ac35803837e5d457ea8c1a58053 libtensorrt_llm_batch_manager_static.a
+dc6fc82dc4ba319899e1d6777bd8c3a4 libtensorrt_llm_batch_manager_static.pre_cxx11.a
+265b039443334094026fbd8f396d52fe29c2d9d1 commit
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bd757c26886a3ffd6947615d9f2829434e94839b693007a64b47c6b5c26416e4
-size 3812158
+oid sha256:74948e00ff7341914b1831ccfdce9ae242dd149603b1ba7e24ee993f08b63542
+size 3812960
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:87321383075adf2d87cfbdc8a12a3d3815ef058d5da9b6aaa8d7d3f3263af439
-size 3773896
+oid sha256:0421ceacd5d07bc172bb4d0979edaf466aa8950290b4d6d1a7d355dbcefc2c84
+size 3772832
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:58cdc0a330f8bfb7b50e3202aeac47bde0835b1dc600b4bfdcd2b30801e66e03
-size 22381766
+oid sha256:46eb1d351e3e8da3945a3f451166f12536aae3e440d57337d8891492424aff78
+size 22387798
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:18a967eaa1e9a7164e0b104a84b13ea95404f7c7c278375feb2513d5f063bafe
-size 1396404
+oid sha256:19585b7709736197d9c1762d1bb8e3099e298d6dcc1c521d51c83637cc624c20
+size 1397814
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:18a967eaa1e9a7164e0b104a84b13ea95404f7c7c278375feb2513d5f063bafe
-size 1396404
+oid sha256:f5d5475663640c068af2e9b5772b9b602656641dd17ca473ce7125ef7f2ec855
+size 1423172
@@ -1,3 +1,3 @@
-7d12b9c04cb6738bb5f7747a88b00c1c libtensorrt_llm_executor_static.a
-7d12b9c04cb6738bb5f7747a88b00c1c libtensorrt_llm_executor_static.pre_cxx11.a
-c0bd2b69c932257678a2aad9bd8baba4b291795e commit
+e18e84fb356995b11c04b79e55c4c3f5 libtensorrt_llm_executor_static.a
+f0555b76f21d43e676e5808bf197cc58 libtensorrt_llm_executor_static.pre_cxx11.a
+265b039443334094026fbd8f396d52fe29c2d9d1 commit
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e503b4cfb1c842850287a359ffed23a1773a67a96475d365b66d757a283ac218
-size 1448772
+oid sha256:8496c9e4a20efd3d2072520cf843dac70cbb0fe23621cfba2a1e0ef3e5fa22ed
+size 1450288
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f8c80cf7aca2b135a656a060456fb30a820e459b4b36560162b02fa65121ef50
-size 1375430
+oid sha256:1b76267834252836e26ddecc2e1b9449e33a67fb1981e5d42f721bc439be1c02
+size 1377018
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cc65971d6d74260cb49b354aa4b0b82f92863cc722fbf206bf8a4919a4897532
-size 14031364
+oid sha256:9bd0faf48175745d7aeff58f539ae021db365b73933dab9c51329de9e92f2d86
+size 14039826
@@ -424,7 +424,8 @@ std::vector<cutlass_extensions::CutlassGemmConfig> MoeGemmRunner<T, WeightType>:
 template <typename T, typename WeightType>
 bool MoeGemmRunner<T, WeightType>::isHopperSpecialised() const
 {
-    bool config_is_sm90 = best_config_ && best_config_->is_sm90;
+    TLLM_CHECK_WITH_INFO(best_config_, "Cannot determine if hopper is specialised without a selected config");
+    bool config_is_sm90 = best_config_->is_sm90;
     return supportsHopperSpecialisation() && config_is_sm90;
 }
 
@@ -440,7 +441,7 @@ int MoeGemmRunner<T, WeightType>::getSM() const
     return this->sm_;
 }
 
-// currently support sm80 bf16/fp16 gate ativation, only set predication tensor for m direction
+// currently support sm80 bf16/fp16 gate activation, only set predication tensor for m direction
 template <typename T, typename WeightType>
 bool MoeGemmRunner<T, WeightType>::isFusedGatedActivation(bool is_gated_activation, int gemm_n, int gemm_k) const
 {
 
@@ -1,2 +1,2 @@
 5b6c74ce66f62d2a58aa9cac16f11ad6 libtensorrt_llm_nvrtc_wrapper.so
-c0bd2b69c932257678a2aad9bd8baba4b291795e commit
+265b039443334094026fbd8f396d52fe29c2d9d1 commit
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:84319476e8ecf9666f40f69355f19ec3b585fc0987f940be14af9e11e3f524c3
+oid sha256:9f2f97eb5b4181917a47b6028a857d7a597ca93faa5846af42c4cb24797d7fa7
 size 1080832
@@ -1072,10 +1072,38 @@ std::vector<size_t> CutlassMoeFCRunner<T, WeightType, OutputType, Enable>::getWo
     size_t const hopper_size = using_hopper ? HopperGroupedGemmInput::workspaceSize(num_experts_per_node) : 0;
     size_t const gemm_workspace_size = moe_gemm_runner_.getMaxWorkspaceSize(num_experts_per_node);
 
-    std::vector<size_t> workspace{source_rows_size, permuted_rows_size, permuted_experts_size, permuted_data_size,
-        total_rows_before_expert_size, softmax_out_size, glu_inter_size,
+    // We do some overlapping of the large workspace buffers. Although we could overlap some of the other buffers, they
+    // are small enough (i.e no factor of hidden size) they will only be a couple MiB at most, so we don't bother
+    // in the case of fused activation we overlap permuted_data and fc2_result
+    // in the case of unfused activation we overlap permuted_data and fc1_result
+    // we need to calculate the max possible size, so use the max of all three
+    size_t overlapped_gemm1_gemm2_inputs = std::max(permuted_data_size, fc2_result_size);
+    // When glu_inter_elems is 0 we are always fused, otherwise we may need the un-fused case
+    if (glu_inter_elems > 0)
+    {
+        overlapped_gemm1_gemm2_inputs = std::max(overlapped_gemm1_gemm2_inputs, fc1_result_size);
+    }
+
+    // if we have glu_inter we overlap it with fc2_result, otherwise we use fc1_result by itself
+    size_t overlapped_gemm1_gemm2_outputs = fc1_result_size;
+    if (glu_inter_elems > 0)
+    {
+        overlapped_gemm1_gemm2_outputs
+            = std::max(std::max(glu_inter_size, fc2_result_size), overlapped_gemm1_gemm2_outputs);
+    }
+
+    std::vector<size_t> workspace{     //
+        source_rows_size,              //
+        permuted_rows_size,            //
+        permuted_experts_size,         //
+        total_rows_before_expert_size, //
+        softmax_out_size,              //
+        sorter_size,                   //
         // These pointers reuse the same memory
-        std::max(fc1_result_size, sorter_size), fc2_result_size, hopper_size, gemm_workspace_size};
+        overlapped_gemm1_gemm2_inputs,  //
+        overlapped_gemm1_gemm2_outputs, //
+        hopper_size,                    //
+        gemm_workspace_size};
     return workspace;
 }
 
@@ -1088,7 +1116,9 @@ size_t CutlassMoeFCRunner<T, WeightType, OutputType, Enable>::getWorkspaceSize(i
     TLLM_CHECK_WITH_INFO(num_experts % ep_size == 0, "Number of experts must be a multiple of ep size");
     auto workspace = getWorkspaceBufferSizes(
         num_rows, hidden_size, inter_size, num_experts, num_experts / ep_size, k, activation_type);
-    return tensorrt_llm::common::calculateTotalWorkspaceSize(workspace.data(), workspace.size());
+    auto ws_size = tensorrt_llm::common::calculateTotalWorkspaceSize(workspace.data(), workspace.size());
+    TLLM_LOG_DEBUG("Mixture Of Experts Plugin requires workspace of %2f MiB", ws_size / 1024.f / 1024.f);
+    return ws_size;
 }
 
 template <class T, class WeightType, class OutputType, class Enable>
@@ -1109,29 +1139,38 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, Enable>::configureWsPtrs(char
     source_rows_ = (int*) ws_sliced[0];
     permuted_rows_ = (int*) ws_sliced[1];
     permuted_experts_ = (int*) ws_sliced[2];
-    permuted_data_ = (T*) ws_sliced[3];
 
-    total_rows_before_expert_ = (int64_t*) ws_sliced[4];
+    total_rows_before_expert_ = (int64_t*) ws_sliced[3];
 
     softmax_out_ = nullptr;
     bool const is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
     if (!is_pow_2 || num_experts > 256)
     {
-        softmax_out_ = (float*) ws_sliced[5];
+        softmax_out_ = (float*) ws_sliced[4];
     }
 
-    glu_inter_result_ = (T*) ws_sliced[6];
+    sorter_ws_ = (char*) ws_sliced[5];
 
-    // These pointers are aliased. Since the sort ws can be overwritten after it is finished
-    sorter_ws_ = (char*) ws_sliced[7];
-    fc1_result_ = (T*) ws_sliced[7];
+    // Always 6, but overlapped with either fc1_result_ or fc2_result_
+    permuted_data_ = (T*) ws_sliced[6];
 
-    fc2_result_ = (T*) ws_sliced[8];
+    bool const is_gated_activation = isGatedActivation(activation_type);
+    bool const use_fused_moe = moe_gemm_runner_.isFusedGatedActivation(is_gated_activation, inter_size, hidden_size);
+    bool const using_hopper = moe_gemm_runner_.isHopperSpecialised();
+    bool const hopper_has_glu = using_hopper && (mayHaveDifferentGEMMOutputType() || is_gated_activation);
+    bool const non_hopper_has_glu = !using_hopper && !use_fused_moe && is_gated_activation;
+    bool const has_glu_inter_result = hopper_has_glu || non_hopper_has_glu;
+    // Always 7, ignored if not needed
+    glu_inter_result_ = has_glu_inter_result ? (T*) ws_sliced[7] : nullptr;
+
+    // fc1 and fc2 alias one of the above pointers, but it depends on if actfn is fused/unfused which is overlapped
+    fc1_result_ = has_glu_inter_result ? (T*) ws_sliced[6] : (T*) ws_sliced[7];
+    fc2_result_ = has_glu_inter_result ? (T*) ws_sliced[7] : (T*) ws_sliced[6];
 
     hopper_grouped_gemm_input_ = {};
     if (moe_gemm_runner_.isHopperSpecialised())
     {
-        hopper_grouped_gemm_input_.configureWorkspace(ws_sliced[9], num_experts_per_node, ws_sliced[10], ws_sizes[10]);
+        hopper_grouped_gemm_input_.configureWorkspace(ws_sliced[8], num_experts_per_node, ws_sliced[9], ws_sizes[9]);
     }
 }
 
@@ -1293,6 +1332,7 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, Enable>::runMoe(void const* i
     }
     else
     {
+
         // Run the GEMM with activation function overridden with `Identity`, we do the activation separately
         ActivationType activation_type = (use_fused_moe) ? fc1_activation_type : ActivationType::Identity;
         T* gemm_result = (use_fused_moe) ? fc1_result_ : static_cast<T*>(glu_inter_result_);
 
@@ -431,7 +431,8 @@ void InitBindings(pybind11::module_& m)
             &tle::DecodingConfig::setLookaheadDecoding)
         .def_property("medusa_choices", &tle::DecodingConfig::getMedusaChoices, &tle::DecodingConfig::setMedusaChoices);
 
-    auto executorConfigGetState = [&](tle::ExecutorConfig const& self)
+    auto executorConfigGetState = [&peftCacheConfigGetstate, &kvCacheConfigGetstate, &schedulerConfigGetstate,
+                                      &parallelConfigGetstate](tle::ExecutorConfig const& self)
     {
         py::object peftCacheConfigState = py::none();
 
@@ -453,7 +454,8 @@ void InitBindings(pybind11::module_& m)
             peftCacheConfigState, self.getLogitsPostProcessorMap(), self.getLogitsPostProcessorBatched(),
             self.getDecodingConfig(), self.getGpuWeightsPercent());
     };
-    auto executorConfigSetState = [&](py::tuple state)
+    auto executorConfigSetState = [&kvCacheConfigSetstate, &peftCacheConfigSetstate, &schedulerConfigSetstate,
+                                      &parallelConfigSetstate](py::tuple state)
     {
         if (state.size() != 15)
         {
 
@@ -96,8 +96,11 @@ void MedusaModule::initMedusaTensorsFromChoices(MedusaChoices const& choices, st
         if (curDepth != depth)
         {
             TLLM_CHECK(depth + 1 == curDepth);
+            TLLM_CHECK_WITH_INFO(depth <= getMaxDraftPathLen(),
+                "Medusa choices require more Medusa heads than the engine was built with.");
             // Save TopK
             topKs[depth - 1] = maxTopK;
+
             // Accumulate TopK for global indexing in tree
             globalNodeInTreeIdx += maxTopK;
Original file line number	Diff line number	Diff line change
`@@ -424,7 +424,8 @@ std::vector<cutlass_extensions::CutlassGemmConfig> MoeGemmRunner<T, WeightType>:`
`424`	`424`	`template <typename T, typename WeightType>`
`425`	`425`	`bool MoeGemmRunner<T, WeightType>::isHopperSpecialised() const`
`426`	`426`	`{`
`427`		`- bool config_is_sm90 = best_config_ && best_config_->is_sm90;`
	`427`	`+ TLLM_CHECK_WITH_INFO(best_config_, "Cannot determine if hopper is specialised without a selected config");`
	`428`	`+ bool config_is_sm90 = best_config_->is_sm90;`
`428`	`429`	`return supportsHopperSpecialisation() && config_is_sm90;`
`429`	`430`	`}`
`430`	`431`
`@@ -440,7 +441,7 @@ int MoeGemmRunner<T, WeightType>::getSM() const`
`440`	`441`	`return this->sm_;`
`441`	`442`	`}`
`442`	`443`
`443`		`-// currently support sm80 bf16/fp16 gate ativation, only set predication tensor for m direction`
	`444`	`+// currently support sm80 bf16/fp16 gate activation, only set predication tensor for m direction`
`444`	`445`	`template <typename T, typename WeightType>`
`445`	`446`	`bool MoeGemmRunner<T, WeightType>::isFusedGatedActivation(bool is_gated_activation, int gemm_n, int gemm_k) const`
`446`	`447`	`{`
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`5b6c74ce66f62d2a58aa9cac16f11ad6 libtensorrt_llm_nvrtc_wrapper.so`
`2`		`-c0bd2b69c932257678a2aad9bd8baba4b291795e commit`
	`2`	`+265b039443334094026fbd8f396d52fe29c2d9d1 commit`
Original file line number	Diff line number	Diff line change
`@@ -431,7 +431,8 @@ void InitBindings(pybind11::module_& m)`
`431`	`431`	`&tle::DecodingConfig::setLookaheadDecoding)`
`432`	`432`	`.def_property("medusa_choices", &tle::DecodingConfig::getMedusaChoices, &tle::DecodingConfig::setMedusaChoices);`
`433`	`433`
`434`		`- auto executorConfigGetState = [&](tle::ExecutorConfig const& self)`
	`434`	`+ auto executorConfigGetState = [&peftCacheConfigGetstate, &kvCacheConfigGetstate, &schedulerConfigGetstate,`
	`435`	`+ &parallelConfigGetstate](tle::ExecutorConfig const& self)`
`435`	`436`	`{`
`436`	`437`	`py::object peftCacheConfigState = py::none();`
`437`	`438`
`@@ -453,7 +454,8 @@ void InitBindings(pybind11::module_& m)`
`453`	`454`	`peftCacheConfigState, self.getLogitsPostProcessorMap(), self.getLogitsPostProcessorBatched(),`
`454`	`455`	`self.getDecodingConfig(), self.getGpuWeightsPercent());`
`455`	`456`	`};`
`456`		`- auto executorConfigSetState = [&](py::tuple state)`
	`457`	`+ auto executorConfigSetState = [&kvCacheConfigSetstate, &peftCacheConfigSetstate, &schedulerConfigSetstate,`
	`458`	`+ &parallelConfigSetstate](py::tuple state)`
`457`	`459`	`{`
`458`	`460`	`if (state.size() != 15)`
`459`	`461`	`{`