@@ -491,11 +491,14 @@ class BlockManager
491
491
492
492
void replaceSharedBlock (GenerationRequest& sequence, SizeType32 blockIdx);
493
493
494
+ // ! \brief Get the ids of all newly allocated (not reused) blocks for the sequence.
495
+ std::vector<KVCacheBlock::IdType> getNewlyAllocatedBlockIds (GenerationRequest const & sequence) const ;
496
+
494
497
// ! \brief Release blocks of the sequence. Store blocks for reuse if llmRequest is provided.
495
498
void releaseBlocks (GenerationRequest& sequence, OptionalRef<LlmRequest const > llmRequest = std::nullopt);
496
499
497
500
// ! \brief Simulate freeing all blocks for that sequence to check impact on number of free blocks
498
- void schedulingReleaseBlocks (GenerationRequest& sequence );
501
+ void schedulingReleaseBlocks (LlmRequest::RequestIdType requestId );
499
502
500
503
// ! \brief Release last block in the sequence
501
504
void releaseLastBlock (GenerationRequest& sequence);
@@ -658,6 +661,11 @@ class BlockManager
658
661
659
662
[[nodiscard]] static bool blockInRadixTree (BlockPtr const & block);
660
663
664
// ! \brief Return the current value of the mEnableHashKey flag.
+ [[nodiscard]] bool isEnableHashKey () const
665
+ {
666
+ return mEnableHashKey ;
667
+ }
668
+
661
669
private:
662
670
// ! \brief Add single block to beam of sequence and mAllocatedBlocksPerSeq.
663
671
void addBlockToBeam (BlockPtr& block, GenerationRequest& sequence, SizeType32 beamIdx);
@@ -849,6 +857,7 @@ class BaseKVCacheManager
849
857
virtual void rewindKVCache (LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) = 0;
850
858
851
859
[[nodiscard]] virtual GenerationRequest const & getSequence (LlmRequest::RequestIdType requestId) const = 0;
860
+ [[nodiscard]] virtual GenerationRequest& getSequence (LlmRequest::RequestIdType requestId) = 0;
852
861
853
862
[[nodiscard]] virtual bool isCrossKv () const = 0;
854
863
@@ -872,6 +881,10 @@ class BaseKVCacheManager
872
881
std::vector<LlmRequest::RequestIdType> const & requestIds) const
873
882
= 0;
874
883
884
+ [[nodiscard]] virtual std::vector<KVCacheBlock::IdType> getNewlyAllocatedBlockIds (
885
+ LlmRequest::RequestIdType requestId) const
886
+ = 0;
887
+
875
888
[[nodiscard]] virtual runtime::ITensor::SharedPtr getPrimaryPool (SizeType32 layer_idx) const = 0;
876
889
[[nodiscard]] virtual SizeType32 getPoolLayerIdx (SizeType32 layer_idx) const = 0;
877
890
@@ -904,6 +917,8 @@ class BaseKVCacheManager
904
917
// / @param outputLength The number of output tokens in each sequence in the batch.
905
918
// / @return SizeType32 A number of sequences per batch.
906
919
[[nodiscard]] virtual SizeType32 getMaxCapacityBatchSize (SizeType32 inputLength, SizeType32 outputLength) const = 0;
920
+
921
+ [[nodiscard]] virtual CacheType getCacheType () const = 0;
907
922
};
908
923
909
924
class KVCacheManager : public BaseKVCacheManager
@@ -935,7 +950,7 @@ class KVCacheManager : public BaseKVCacheManager
935
950
SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional<SizeType32> maxSequenceLength,
936
951
bool enableBlockReuse = true , bool onboardBlocks = true , CacheType cacheType = CacheType::kSELF ,
937
952
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
938
- std::shared_ptr<KVCacheEventManager> eventManager = nullptr );
953
+ std::shared_ptr<KVCacheEventManager> eventManager = nullptr , bool enableHashKey = false );
939
954
940
955
KVCacheManager (SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
941
956
SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences,
@@ -1100,12 +1115,18 @@ class KVCacheManager : public BaseKVCacheManager
1100
1115
void rewindKVCache (LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) override ;
1101
1116
1102
1117
[[nodiscard]] GenerationRequest const & getSequence (LlmRequest::RequestIdType requestId) const override ;
1118
+ [[nodiscard]] GenerationRequest& getSequence (LlmRequest::RequestIdType requestId) override ;
1103
1119
1104
1120
// ! \brief True when the block manager reports its cache type as CacheType::kCROSS.
[[nodiscard]] bool isCrossKv () const override
1105
1121
{
1106
1122
return mBlockManager .getCacheType () == CacheType::kCROSS ;
1107
1123
}
1108
1124
1125
// ! \brief Return the cache type reported by mBlockManager.
+ [[nodiscard]] CacheType getCacheType () const override
1126
+ {
1127
+ return mBlockManager .getCacheType ();
1128
+ }
1129
+
1109
1130
// ! \brief Find first new block that must be allocated for context phase and return its concatenated token vector.
1110
1131
// ! \details Only full blocks are considered.
1111
1132
[[nodiscard]] std::optional<BlockKey> findNewContextBlock (
@@ -1148,6 +1169,8 @@ class KVCacheManager : public BaseKVCacheManager
1148
1169
std::vector<std::vector<std::vector<SizeType32>>> getBatchCacheBlockIds (
1149
1170
std::vector<LlmRequest::RequestIdType> const & requestIds) const override ;
1150
1171
1172
+ // ! \brief Get the ids of the newly allocated (not reused) blocks for the given request.
+ // ! \details Return type is spelled as in the BaseKVCacheManager declaration being overridden.
+ std::vector<KVCacheBlock::IdType> getNewlyAllocatedBlockIds (LlmRequest::RequestIdType requestId) const override ;
1173
+
1151
1174
runtime::ITensor::SharedPtr getPrimaryPool (SizeType32 layer_idx) const override ;
1152
1175
1153
1176
SizeType32 getPoolLayerIdx (SizeType32 layer_idx) const override
@@ -1219,6 +1242,8 @@ class KVCacheManager : public BaseKVCacheManager
1219
1242
bool mEnableHashKey ;
1220
1243
// Whether to use one more block for each sequence
1221
1244
bool mUseOneMoreBlock ;
1245
+ // Mutex to protect access to mSequences
1246
+ mutable std::mutex mSequencesMtx ;
1222
1247
// buffers for static tensors, will be created after allocating pools
1223
1248
runtime::ITensor::SharedPtr mBlockPoolPointers ;
1224
1249
runtime::ITensor::SharedPtr mLayerToPoolMapping ;
0 commit comments