Commit aaacc9b

kaiyux and StarrickLiu authored

Update TensorRT-LLM (NVIDIA#2562)

* Update TensorRT-LLM

---------

Co-authored-by: Starrick Liu <[email protected]>

1 parent 340a1b6 commit aaacc9b

File tree: 459 files changed (+858777, -364419 lines)


.github/workflows/auto-assign.yml (-43)
This file was deleted.

@@ -1,29 +1,25 @@
 # Ref: https://docs.github.com/en/actions/managing-issues-and-pull-requests/closing-inactive-issues
 name: Close inactive issues
 on:
-  workflow_dispatch:
   schedule:
-    - cron: "0 * * * *"
+    - cron: "30 1 * * *"
 
 jobs:
   stale:
     runs-on: ubuntu-latest
     permissions:
       issues: write
       pull-requests: write
-
     steps:
       - uses: actions/stale@v9
         with:
+          days-before-issue-stale: 30
+          days-before-issue-close: 15
+          stale-issue-label: "stale"
+          exempt-issue-labels: ""
+          stale-issue-message: This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 15 days."
+          close-issue-message: "This issue was closed because it has been stalled for 15 days with no activity."
+          days-before-pr-stale: -1
+          days-before-pr-close: -1
           repo-token: ${{ secrets.GITHUB_TOKEN }}
-          stale-issue-message: 'Issue has not received an update in over 14 days. Adding stale label.'
-          stale-pr-message: 'PR has not received an update in over 14 days. Adding stale label.'
-          close-issue-message: 'This issue was closed because it has been 14 days without activity since it has been marked as stale.'
-          close-pr-message: 'This PR was closed because it has been 14 days without activity since it has been marked as stale.'
-          days-before-issue-stale: 14
-          days-before-close: 14
-          only-labels: 'waiting for feedback'
-          labels-to-add-when-unstale: 'investigating'
-          labels-to-remove-when-unstale: 'stale,waiting for feedback'
-          stale-issue-label: 'stale'
-          stale-pr-label: 'stale'
+          debug-only: false

.github/workflows/module-owners.json (-13)
This file was deleted.

README.md (+5, -4)

@@ -5,13 +5,13 @@ TensorRT-LLM
 <h4> A TensorRT Toolbox for Optimized Large Language Model Inference</h4>
 
 [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/)
-[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
-[![cuda](https://img.shields.io/badge/cuda-12.6.2-green)](https://developer.nvidia.com/cuda-downloads)
-[![trt](https://img.shields.io/badge/TRT-10.6.0-green)](https://developer.nvidia.com/tensorrt)
+[![python](https://img.shields.io/badge/python-3.12.3-green)](https://www.python.org/downloads/release/python-3123/)
+[![cuda](https://img.shields.io/badge/cuda-12.6.3-green)](https://developer.nvidia.com/cuda-downloads)
+[![trt](https://img.shields.io/badge/TRT-10.7.0-green)](https://developer.nvidia.com/tensorrt)
 [![version](https://img.shields.io/badge/release-0.16.0.dev-green)](./tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)
 
-[Architecture](./docs/source/architecture/overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Results](./docs/source/performance/perf-overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](./examples/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](./docs/source/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Roadmap](https://docs.google.com/presentation/d/1gycPmtdh7uUcH6laOvW65Dbp9F1McUkGDIcAyjicBZs/edit?usp=sharing)
+[Architecture](./docs/source/architecture/overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Performance](./docs/source/performance/perf-overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](./examples/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](./docs/source/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Roadmap](https://docs.google.com/presentation/d/1gycPmtdh7uUcH6laOvW65Dbp9F1McUkGDIcAyjicBZs/edit?usp=sharing)
 
 ---
 <div align="left">

@@ -151,6 +151,7 @@ To get started with TensorRT-LLM, visit our documentation:
 - [Release Notes](https://nvidia.github.io/TensorRT-LLM/release-notes.html)
 - [Installation Guide for Linux](https://nvidia.github.io/TensorRT-LLM/installation/linux.html)
 - [Installation Guide for Windows](https://nvidia.github.io/TensorRT-LLM/installation/windows.html)
+- [Installation Guide for Grace Hopper](https://nvidia.github.io/TensorRT-LLM/installation/grace-hopper.html)
 - [Supported Hardware, Models, and other Software](https://nvidia.github.io/TensorRT-LLM/reference/support-matrix.html)
 
 ## Community

benchmarks/cpp/gptManagerBenchmark.cpp (+37, -14)

@@ -259,6 +259,9 @@ class Recorder
         const std::lock_guard<std::mutex> lock(mRequestBenchInfosMutex);
         mRequestBenchInfos[requestId].outputLength = outSeqLen;
         mRequestBenchInfos[requestId].decodingIter = response.getResult().decodingIter;
+
+        // We record the first beam for the response file
+        mResponseTensors[requestId] = outputTokenIds[0];
     }
     else
     {

@@ -492,14 +495,19 @@ class Recorder
         nlohmann::json jsonResponses = nlohmann::json::array();
         for (auto const& [respId, respTokensTensor] : mResponseTensors)
         {
-            int inputLength = mRequestBenchInfos[respId].inputLength;
-            int outputLength = mRequestBenchInfos[respId].outputLength;
-            std::vector<int32_t> outputTokens(outputLength);
+            auto respTokens = mResponseTensors[respId];
+            int respLength = respTokens.size();
+            int* respBufferPtr = respTokens.data();
 
-            int32_t* outputToksBufferPtr = bufferCast<int32_t>(*respTokensTensor);
             if (mOutputHasInput)
-                outputToksBufferPtr += inputLength;
-            std::copy(outputToksBufferPtr, outputToksBufferPtr + outputLength, outputTokens.begin());
+            {
+                int inputSeqLen = mRequestBenchInfos[respId].inputLength;
+                respBufferPtr += inputSeqLen;
+                respLength -= inputSeqLen;
+            }
+
+            std::vector<int32_t> outputTokens(respLength);
+            std::copy(respBufferPtr, respBufferPtr + respLength, outputTokens.begin());
 
             nlohmann::json currResp;
             currResp["response_id"] = respId;

@@ -552,7 +560,7 @@ class Recorder
     bool mStreaming;
     int mBeamWidth;
    std::string mRespJsonFile;
-    std::unordered_map<uint64_t, TensorPtr> mResponseTensors;
+    std::unordered_map<uint64_t, texec::VecTokens> mResponseTensors;
     bool mOutputHasInput;
     std::mutex mRequestBenchInfosMutex;
 

@@ -792,7 +800,8 @@ void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngine
     std::optional<int32_t> const& eosId, std::optional<int32_t> const& padId, BenchmarkParams const& benchmarkParams,
     texec::CapacitySchedulerPolicy capacitySchedulerPolicy, std::chrono::milliseconds waitSleep,
     bool returnContextLogits, bool returnGenerationLogits, std::optional<int> const staticEmulatedBatchSize,
-    bool logIterationData, std::optional<SizeType32> const maxPromptLen, texec::ModelType executorModelType)
+    bool logIterationData, std::optional<SizeType32> const maxPromptLen, texec::ModelType executorModelType,
+    std::string const& responsesJsonFile)
 {
     auto const& world = tensorrt_llm::mpi::MpiComm::world();
     auto worldRank = world.getRank();

@@ -801,7 +810,7 @@ void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngine
     auto const samples = parseWorkloadJson(datasetPath, maxNumSamples, maxPromptLen);
     auto const numSamples = samples.size();
 
-    auto recorder = std::make_shared<Recorder>(opCsvFile, benchmarkParams.streaming, beamWidth);
+    auto recorder = std::make_shared<Recorder>(opCsvFile, benchmarkParams.streaming, beamWidth, responsesJsonFile);
     int32_t decoderStartTokenId = 0;
     std::shared_ptr<ExecutorServer> executorServer;
 

@@ -989,6 +998,7 @@ void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngine
         recorder->calculateMetrics();
         recorder->report();
         recorder->writeOpMetricsToCsv();
+        recorder->dumpResponseSeqs();
         // Send terminateReqId to terminate servers on all ranks
         // Sever on rank 0 will broadcast the terminate signal to other servers on multi-GPU cases
         // gptServer->enqueue(std::make_shared<InferenceRequest>(terminateReqId));

@@ -1047,11 +1057,13 @@ int main(int argc, char* argv[])
         cxxopts::value<bool>()->default_value("false"));
     options.add_options()("enable_exp_delays", "Enables exponential delay distr to mimic real world request arrival",
         cxxopts::value<bool>()->default_value("false"));
-    options.add_options()("streaming", "Operate in streaming mode", cxxopts::value<bool>()->default_value("false"));
+    options.add_options()("streaming",
+        "Operate in streaming mode. Note: it reflects time-to-first-token and inter-token-latency",
+        cxxopts::value<bool>()->default_value("false"));
     options.add_options()(
         "enable_kv_cache_reuse", "Enables the KV cache reuse.", cxxopts::value<bool>()->default_value("false"));
-    options.add_options()("enable_chunked_context", "Whether to enable context chunking.",
-        cxxopts::value<bool>()->default_value("false"));
+    options.add_options()(
+        "enable_chunked_context", "Whether to enable context chunking.", cxxopts::value<bool>()->default_value("true"));
     options.add_options()(
         "return_context_logits", "Whether to return context logits.", cxxopts::value<bool>()->default_value("false"));
     options.add_options()("return_generation_logits", "Whether to return generation logits.",

@@ -1064,7 +1076,7 @@ int main(int argc, char* argv[])
     options.add_options()("static_emulated_batch_size",
         "Emulate static batching performance with the provided batch size.", cxxopts::value<SizeType32>());
     options.add_options()("log_level", "Choose log level between verbose/info/warning/error/internal_error.",
-        cxxopts::value<std::string>()->default_value("error"));
+        cxxopts::value<std::string>()->default_value("warning"));
     options.add_options()("log_iteration_data", "On each decoder iteration, print batch state metadata.",
         cxxopts::value<bool>()->default_value("false"));
     options.add_options()("wait_sleep", "Specify how many milliseconds to sleep each iteration of waitForEmpty loop.",

@@ -1111,6 +1123,8 @@ int main(int argc, char* argv[])
         "lookahead config in the format of [max_window_size, max_ngram_size, max_verification_set_size], and each <= "
         "executor lookahead config",
         cxxopts::value<std::string>());
+    options.add_options()("responses_json", "Write output response sequences to a json file",
+        cxxopts::value<std::string>()->default_value(""));
 
     auto result = options.parse(argc, argv);
 

@@ -1137,6 +1151,12 @@ int main(int argc, char* argv[])
         {
             TLLM_LOG_WARNING("type option \"V1\" is going to be renamed to \"static\".");
         }
+        bool streaming = result["streaming"].as<bool>();
+        if (streaming)
+        {
+            TLLM_LOG_ERROR("Streaming is not supported in static batching.\n");
+            return 1;
+        }
         batchingType = texec::BatchingType::kSTATIC;
     }
     else if (type == "IFB" || type == "inflight")

@@ -1419,6 +1439,9 @@ int main(int argc, char* argv[])
 
     initTrtLlmPlugins(logger.get());
 
+    // Argument: output sequences JSON
+    auto const responsesJsonFile = result["responses_json"].as<std::string>();
+
     // Argument: API
     auto const api = result["api"].as<std::string>();
     if (api == "executor")

@@ -1449,7 +1472,7 @@ int main(int argc, char* argv[])
             benchmarkExecutor(decoderEngineDir, encoderEngineDir, batchingType, datasetPath, opCsvFile, maxNumSamples,
                 beamWidth, result["warm_up"].as<int>(), eosId, padId, benchmarkParams, capacitySchedulerPolicy,
                 waitSleep, returnContextLogits, returnGenerationLogits, staticEmulatedBatchSize, logIterationData,
-                maxPromptLen, executorModelType);
+                maxPromptLen, executorModelType, responsesJsonFile);
         }
         catch (std::exception const& e)
         {
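A note on the Recorder change above: the benchmark now keeps the first beam's token ids per request and, when --responses_json is given, writes them out after stripping the echoed input prefix whenever the engine output still contains the prompt (mOutputHasInput). Below is a minimal, self-contained sketch of that trim-and-dump pattern using nlohmann::json; the trimResponse helper, the sample token values, and the "tokens" field name are illustrative assumptions, not taken from the benchmark source.

// Sketch only: mirrors the trimming logic added to the Recorder above,
// but with a free function and std::vector so it compiles on its own.
#include <nlohmann/json.hpp>

#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical helper: drop the echoed input prefix when the output contains it.
std::vector<int32_t> trimResponse(std::vector<int32_t> const& respTokens, int inputSeqLen, bool outputHasInput)
{
    auto const* ptr = respTokens.data();
    auto length = static_cast<int>(respTokens.size());
    if (outputHasInput)
    {
        ptr += inputSeqLen;
        length -= inputSeqLen;
    }
    return std::vector<int32_t>(ptr, ptr + length);
}

int main()
{
    // Pretend the first 3 tokens are the echoed prompt.
    std::vector<int32_t> respTokens{1, 2, 3, 100, 101, 102};

    nlohmann::json jsonResponses = nlohmann::json::array();
    nlohmann::json currResp;
    currResp["response_id"] = 42;
    // The "tokens" key is illustrative; the benchmark's actual key is not shown in this hunk.
    currResp["tokens"] = trimResponse(respTokens, /*inputSeqLen=*/3, /*outputHasInput=*/true);
    jsonResponses.push_back(currResp);

    // Prints a JSON array with response_id 42 and tokens [100, 101, 102].
    std::cout << jsonResponses.dump(2) << std::endl;
    return 0;
}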

benchmarks/cpp/utils/utils.cpp (+7)

@@ -94,6 +94,13 @@ Samples parseWorkloadJson(
         }
         samples.emplace_back(Sample{std::move(input_ids), sample["output_len"], taskId});
     }
+
+    if (samples.size() < maxNumSamples)
+    {
+        TLLM_LOG_WARNING(
+            "Dataset size %zu is smaller than given max_num_samples %d, max_num_samples will be ignored.\n",
+            samples.size(), maxNumSamples);
+    }
     return samples;
 }
 

benchmarks/cpp/utils/utils.h (+1, -1)

@@ -52,7 +52,7 @@ struct BenchmarkParams
     bool enableBatchSizeTuning{false};
     bool enableMaxNumTokensTuning{false};
     bool enableBlockReuse{false};
-    bool enableChunkedContext{false};
+    bool enableChunkedContext{true};
     bool streaming{false};
     bool enableExpDelays{false};
     std::optional<float> requestRate{std::nullopt};

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h (+4, -7)

@@ -35,6 +35,7 @@
 #include <list>
 #include <memory>
 #include <optional>
+#include <set>
 #include <unordered_map>
 #include <vector>
 

@@ -52,6 +53,7 @@ static constexpr SizeType32 kSecondaryLevel = 1;
 
 class KVCacheBlock;
 class KVCacheManager;
+class KVCacheTransferManager;
 
 using SizeType32 = tensorrt_llm::runtime::SizeType32;
 using TokenIdType = tensorrt_llm::runtime::TokenIdType;

@@ -622,13 +624,6 @@ class BlockManager
     void claimLeafBlock(BlockPtr block, std::optional<executor::RetentionPriority> priority = std::nullopt,
         std::optional<std::chrono::milliseconds> durationMs = std::nullopt);
 
-    //! \brief Compute pointer to raw KV block (K & V, all layers).
-    [[nodiscard]] runtime::ITensor::SharedPtr computeBlockPointer(
-        std::shared_ptr<KVCacheBlock> block, SizeType32 poolIdx) const;
-
-    //! \brief Copy content of src block to dst.
-    void copyBlock(BlockPtr src, BlockPtr dst);
-
 private:
     // Number of blocks in pools
     SizeType32 mNumPrimaryBlocks;

@@ -667,6 +662,8 @@ class BlockManager
     std::shared_ptr<BaseEvictionPolicy> mEvictionPolicy;
     // Event manager
     std::shared_ptr<KVCacheEventManager> mEventManager;
+    // Transfer manager
+    std::shared_ptr<KVCacheTransferManager> mTransferManager;
 
     // Statistics for block allocations/reuse
     // Total number of blocks allocated by all requests
New file (+62)

@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/batch_manager/kvCacheManager.h"
+#include "tensorrt_llm/runtime/bufferManager.h"
+#include "tensorrt_llm/runtime/cudaEvent.h"
+
+namespace tr = tensorrt_llm::runtime;
+
+#pragma once
+
+namespace tensorrt_llm::batch_manager::kv_cache_manager
+{
+
+// The TransferManager accelerates transfers to/from the GPU by overlapping HtoD and DtoH transfers, and tracks ongoing
+// transfers in order to avoid race conditions. It is functionally equivalent to the prior approach of putting all
+// transfers into the forward pass stream. This is only ever used as a component of a KVCacheManager.
+class KVCacheTransferManager
+{
+public:
+    explicit KVCacheTransferManager(tr::BufferManager const& bufferManager);
+
+    //! \brief Onboard a block to gpu memory.
+    void onboard(BlockPtr const& offloadBlock, BlockPtr const& block, std::vector<KVCacheBlockPool> const& pools);
+
+    //! \brief Offload a block to cpu memory.
+    void offload(BlockPtr const& block, BlockPtr const& offloadBlock, std::vector<KVCacheBlockPool> const& pools);
+
+    //! \brief Synchronize the offload/onboard streams with the bufferManager stream.
+    void syncTransfers();
+
+private:
+    //! \brief Get pointer to pool specified by cache block.
+    static tr::ITensor::SharedPtr computeBlockPointer(
+        BlockPtr const& block, std::vector<KVCacheBlockPool> const& pools, size_t poolIdx);
+
+    //! \brief Copy content of src block to dst.
+    void copyBlock(
+        BlockPtr const& src, BlockPtr const& dst, std::vector<KVCacheBlockPool> const& pools, bool isOffload);
+
+    runtime::BufferManager mBufferManager;
+    runtime::BufferManager mOnboardManager;
+    runtime::BufferManager mOffloadManager;
+
+    // Track the block ids offloaded in this iteration.
+    std::unordered_map<int32_t, tr::CudaEvent> mPendingOffloads;
+};
+
+} // namespace tensorrt_llm::batch_manager::kv_cache_manager
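The class comment above summarizes the design: DtoH offloads and HtoD onboards run on dedicated streams so copies can overlap, and each offload records an event keyed by block id so a later onboard of the same block never reads a partially written host copy. Below is a rough, self-contained sketch of that pattern using the raw CUDA runtime API rather than the project's BufferManager; the block size, single buffer pair, and error-check macro are illustrative assumptions, not the actual KVCacheTransferManager implementation.

// Sketch only: overlapping DtoH offload and HtoD onboard on separate streams,
// with an event recorded per offload so a later onboard of the same block waits for it.
#include <cuda_runtime.h>

#include <cstdio>
#include <cstdlib>
#include <unordered_map>

#define CUDA_CHECK(call)                                                              \
    do                                                                                \
    {                                                                                 \
        cudaError_t status = (call);                                                  \
        if (status != cudaSuccess)                                                    \
        {                                                                             \
            std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));     \
            std::exit(1);                                                             \
        }                                                                             \
    } while (0)

int main()
{
    constexpr size_t blockBytes = 1 << 20; // arbitrary block size for the sketch
    void* devBlock = nullptr;
    void* hostBlock = nullptr;
    CUDA_CHECK(cudaMalloc(&devBlock, blockBytes));
    CUDA_CHECK(cudaMallocHost(&hostBlock, blockBytes)); // pinned memory, required for async copies

    cudaStream_t offloadStream;
    cudaStream_t onboardStream;
    CUDA_CHECK(cudaStreamCreate(&offloadStream));
    CUDA_CHECK(cudaStreamCreate(&onboardStream));

    std::unordered_map<int32_t, cudaEvent_t> pendingOffloads; // analogous role to mPendingOffloads

    // Offload block 0 to host memory on the offload stream and record an event for it.
    int32_t const blockId = 0;
    cudaEvent_t offloadDone;
    CUDA_CHECK(cudaEventCreate(&offloadDone));
    CUDA_CHECK(cudaMemcpyAsync(hostBlock, devBlock, blockBytes, cudaMemcpyDeviceToHost, offloadStream));
    CUDA_CHECK(cudaEventRecord(offloadDone, offloadStream));
    pendingOffloads[blockId] = offloadDone;

    // Onboard the same block later: wait on the recorded event so the HtoD copy
    // never reads a half-written host buffer, then copy on the onboard stream.
    auto it = pendingOffloads.find(blockId);
    if (it != pendingOffloads.end())
    {
        CUDA_CHECK(cudaStreamWaitEvent(onboardStream, it->second, 0));
    }
    CUDA_CHECK(cudaMemcpyAsync(devBlock, hostBlock, blockBytes, cudaMemcpyHostToDevice, onboardStream));

    // Simplified stand-in for syncTransfers(): drain both transfer streams before the blocks are reused.
    CUDA_CHECK(cudaStreamSynchronize(offloadStream));
    CUDA_CHECK(cudaStreamSynchronize(onboardStream));

    CUDA_CHECK(cudaStreamDestroy(offloadStream));
    CUDA_CHECK(cudaStreamDestroy(onboardStream));
    CUDA_CHECK(cudaEventDestroy(offloadDone));
    CUDA_CHECK(cudaFreeHost(hostBlock));
    CUDA_CHECK(cudaFree(devBlock));
    return 0;
}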
