GeeeekExplorer · chakpongchung · May 7, 2026
diff --git a/bench.py b/bench.py
@@ -12,7 +12,7 @@ def main():
     max_ouput_len = 1024
 
     path = os.path.expanduser("~/huggingface/Qwen3-0.6B/")
-    llm = LLM(path, enforce_eager=False, max_model_len=4096)
+    llm = LLM(path, enforce_eager=False, max_model_len=4096, gpu_memory_utilization=0.93)
 
     prompt_token_ids = [[randint(0, 10000) for _ in range(randint(100, max_input_len))] for _ in range(num_seqs)]
     sampling_params = [SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=randint(100, max_ouput_len)) for _ in range(num_seqs)]

diff --git a/example.py b/example.py
@@ -6,7 +6,7 @@
 def main():
     path = os.path.expanduser("~/huggingface/Qwen3-0.6B/")
     tokenizer = AutoTokenizer.from_pretrained(path)
-    llm = LLM(path, enforce_eager=True, tensor_parallel_size=1)
+    llm = LLM(path, enforce_eager=True, tensor_parallel_size=1, gpu_memory_utilization=0.93)
 
     sampling_params = SamplingParams(temperature=0.6, max_tokens=256)
     prompts = [

diff --git a/nanovllm/layers/attention.py b/nanovllm/layers/attention.py
@@ -3,7 +3,7 @@
 import triton
 import triton.language as tl
 
-from flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache
+from nanovllm.layers.cutile_attention import flash_attn_varlen_func, flash_attn_with_kvcache
 from nanovllm.utils.context import get_context