
Commit 638d7ab

Simplify the attention function
- Use one definition rather than multiple.
- Add `key`/`value` arguments, so that we don't need the `PREFILL_IN_KV_CACHE` constant.
- Make it kwargs-only (to avoid mixing up the various `Tensor` args).
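A minimal sketch of what a single, kwargs-only `attention` entry point in this spirit could look like is shown below. The parameter names (`query`, `key`, `value`, `softmax_scale`, `causal`) and the plain PyTorch fallback are illustrative assumptions, not the actual text-generation-inference signature.

```python
# Illustrative sketch only: one kwargs-only attention function, as the commit
# message describes. Names and the SDPA fallback are assumptions, not the real
# text-generation-inference API.
from typing import Optional

import torch


def attention(
    *,  # bare * makes every argument keyword-only, so Tensor args can't be swapped
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    softmax_scale: Optional[float] = None,
    causal: bool = True,
) -> torch.Tensor:
    # Stand-in for the backend-specific kernels (CUDA / ROCm / IPEX) that the
    # real __init__.py selects at import time.
    return torch.nn.functional.scaled_dot_product_attention(
        query, key, value, is_causal=causal, scale=softmax_scale
    )


# Callers must name each tensor, e.g. attention(query=q, key=k, value=v);
# a positional call like attention(q, k, v) raises a TypeError.
```

Because the caller passes `key`/`value` explicitly, it can presumably hand in either the freshly computed tensors or the cached ones, which is what makes a separate `PREFILL_IN_KV_CACHE` flag unnecessary.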
1 parent a6a0c97 commit 638d7ab

21 files changed: +310 −466 lines

server/text_generation_server/layers/attention/__init__.py

Lines changed: 0 additions & 4 deletions
@@ -8,23 +8,20 @@
     raise ImportError("`USE_FLASH_ATTENTION` is false.")
 if SYSTEM == "cuda":
     from .cuda import (
-        PREFILL_IN_KV_CACHE,
         SUPPORTS_WINDOWING,
         attention,
         paged_attention,
         reshape_and_cache,
     )
 elif SYSTEM == "rocm":
     from .rocm import (
-        PREFILL_IN_KV_CACHE,
         SUPPORTS_WINDOWING,
         attention,
         paged_attention,
         reshape_and_cache,
     )
 elif SYSTEM == "ipex":
     from .ipex import (
-        PREFILL_IN_KV_CACHE,
         SUPPORTS_WINDOWING,
         attention,
         paged_attention,
@@ -40,7 +37,6 @@
     "attention",
     "paged_attention",
     "reshape_and_cache",
-    "PREFILL_IN_KV_CACHE",
     "SUPPORTS_WINDOWING",
     "KVCache",
     "Seqlen",
