ROCm and sliding windows fixes #2033

Merged · 10 commits · Jun 10, 2024
Changes from 5 commits
4 changes: 2 additions & 2 deletions server/Makefile-vllm
@@ -1,5 +1,5 @@
 commit_cuda := b5dfc61db88a81069e45b44f7cc99bd9e62a60fa
-commit_rocm := ca6913b3c2ffacdcb7d15e914dc34adbc6c89479
+commit_rocm := 559200c1a028de990c1ddea761b0ccd62109e3a0
 build-vllm-cuda:
     if [ ! -d 'vllm' ]; then \
         pip install -U ninja packaging --no-cache-dir && \
@@ -19,5 +19,5 @@ build-vllm-rocm:
     PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build

 install-vllm-rocm: build-vllm-rocm
-    cd vllm && git fetch && git checkout $(commit_rocm) && \
+    cd vllm && git fetch && git checkout $(commit_rocm) && \
     PYTORCH_ROCM_ARCH="gfx90a;gfx942" pip install -e .
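
Not part of the PR, but as a quick sanity check the new commit_rocm pin can be compared against a local vllm checkout. A minimal sketch, assuming the checkout lives in a ./vllm directory as in the Makefile; the helper name is hypothetical:

# Hypothetical helper (not part of this PR): confirm the vllm checkout used for the
# ROCm build matches the commit_rocm pin from server/Makefile-vllm.
import subprocess

EXPECTED_ROCM_COMMIT = "559200c1a028de990c1ddea761b0ccd62109e3a0"  # commit_rocm above

def checked_out_commit(repo_dir: str = "vllm") -> str:
    # `git rev-parse HEAD` prints the full SHA of the current checkout.
    return subprocess.check_output(
        ["git", "rev-parse", "HEAD"], cwd=repo_dir, text=True
    ).strip()

if __name__ == "__main__":
    if checked_out_commit() != EXPECTED_ROCM_COMMIT:
        raise SystemExit("vllm checkout does not match the pinned ROCm commit")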
3 changes: 1 addition & 2 deletions server/text_generation_server/models/__init__.py
@@ -412,9 +412,8 @@ def get_model(
     sliding_window = config_dict.get("sliding_window", -1)
     if sliding_window != -1 and not SUPPORTS_WINDOWING:
         logger.warning(
-            f"Flash attention is available, but doesn't support windowing which is required by model {model_id}"
+            f"Flash attention is available, but doesn't support windowing which is required by model {model_id} for long contexts."
         )
-        FLASH_ATTENTION = False

     if model_type == MAMBA:
         return Mamba(
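
The behavioural change in this hunk: when a model requests a sliding window but the attention backend lacks windowing support, the server now only warns instead of also disabling flash attention. A minimal sketch of that gate, assuming SUPPORTS_WINDOWING comes from the attention backend and loguru is the logger; the standalone wrapper function is hypothetical:

# Sketch of the sliding-window gate after this change; the wrapper is illustrative only.
from loguru import logger

SUPPORTS_WINDOWING = False  # e.g. a backend without sliding-window attention

def warn_if_windowing_unsupported(config_dict: dict, model_id: str) -> None:
    sliding_window = config_dict.get("sliding_window", -1)
    if sliding_window != -1 and not SUPPORTS_WINDOWING:
        # Flash attention stays enabled; previously FLASH_ATTENTION was also set to False here.
        logger.warning(
            f"Flash attention is available, but doesn't support windowing "
            f"which is required by model {model_id} for long contexts."
        )

warn_if_windowing_unsupported({"sliding_window": 4096}, "some/sliding-window-model")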
7 changes: 6 additions & 1 deletion server/text_generation_server/models/flash_causal_lm.py
@@ -899,6 +899,8 @@ def warmup(self, batch: FlashCausalLMBatch):
                 os.environ.get("PYTORCH_TUNABLEOP_ENABLED") is None
                 or os.environ.get("PYTORCH_TUNABLEOP_ENABLED") == "1"
             ):
+                torch.cuda.tunable.enable()
+
                 if os.environ.get("PYTORCH_TUNABLEOP_TUNING") != "0":
                     torch.cuda.tunable.tuning_enable(True)

@@ -907,8 +909,11 @@ def warmup(self, batch: FlashCausalLMBatch):
                         int(val)
                         for val in os.environ["PYTORCH_TUNABLEOP_SEQLENS"].split(",")
                     ]
-                else:
+                elif CUDA_GRAPHS is not None:
                     tuning_sequences = CUDA_GRAPHS
+                else:
+                    # For seqlen = 1, we dispatch to LLMM1 kernel.
+                    tuning_sequences = [2, 3, 4, 5, 6, 7]

                 tunableop_filepath = os.path.join(
                     HUGGINGFACE_HUB_CACHE,
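
Taken together, the two hunks above enable TunableOp explicitly (torch.cuda.tunable.enable()) and pick the sequence lengths to tune with a three-way precedence: an explicit PYTORCH_TUNABLEOP_SEQLENS override, then the configured CUDA graph sizes if any, then a small default range that skips length 1 because that case is dispatched to the LLMM1 kernel. A standalone sketch of the selection, with CUDA_GRAPHS stubbed as an example value (in TGI it comes from the launcher configuration):

# Illustrative version of the tuning-sequence selection above; CUDA_GRAPHS is a stub here.
import os

CUDA_GRAPHS = [1, 2, 4, 8]  # example value; may also be None

def select_tuning_sequences() -> list:
    if os.environ.get("PYTORCH_TUNABLEOP_SEQLENS") is not None:
        # Explicit override: comma-separated sequence lengths to tune.
        return [
            int(val) for val in os.environ["PYTORCH_TUNABLEOP_SEQLENS"].split(",")
        ]
    if CUDA_GRAPHS is not None:
        # Fall back to the configured CUDA graph sizes.
        return CUDA_GRAPHS
    # Sequence length 1 is dispatched to the LLMM1 kernel, so tuning starts at 2.
    return [2, 3, 4, 5, 6, 7]

print(select_tuning_sequences())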