Simplify the attention function
- Use one definition rather than multiple.
- Add `key`/`value` arguments, so that we don't need the
  `PREFILL_IN_KV_CACHE` constant.
- Make it kwargs-only (to avoid mixing up the various `Tensor` args).
danieldk committed Oct 16, 2024
1 parent a6a0c97 commit 638d7ab
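For context, a minimal, self-contained sketch of what a single kwargs-only `attention` entry point along these lines might look like. The parameter names other than `query`/`key`/`value`, and the use of PyTorch's `scaled_dot_product_attention` as the backend, are assumptions for illustration, not code from this commit:

```python
import math
from typing import Optional

import torch


def attention(
    *,  # keyword-only: callers cannot mix up the Tensor arguments by position
    query: torch.Tensor,   # [num_heads, q_len, head_dim]
    key: torch.Tensor,     # [num_heads, kv_len, head_dim]
    value: torch.Tensor,   # [num_heads, kv_len, head_dim]
    softmax_scale: Optional[float] = None,
    causal: bool = True,
) -> torch.Tensor:
    """One attention definition shared by all call sites (illustrative sketch)."""
    if softmax_scale is None:
        softmax_scale = 1.0 / math.sqrt(query.shape[-1])
    # `scale=` requires PyTorch 2.1 or newer.
    return torch.nn.functional.scaled_dot_product_attention(
        query, key, value, is_causal=causal, scale=softmax_scale
    )


# Call sites must spell out every argument:
# out = attention(query=q, key=k, value=v, causal=True)
```

Because every `Tensor` argument has to be named at the call site, swapping `key` and `value` becomes a visible error instead of a silent bug.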
Showing 21 changed files with 310 additions and 466 deletions.
4 changes: 0 additions & 4 deletions server/text_generation_server/layers/attention/__init__.py
@@ -8,23 +8,20 @@
     raise ImportError("`USE_FLASH_ATTENTION` is false.")
 if SYSTEM == "cuda":
     from .cuda import (
-        PREFILL_IN_KV_CACHE,
         SUPPORTS_WINDOWING,
         attention,
         paged_attention,
         reshape_and_cache,
     )
 elif SYSTEM == "rocm":
     from .rocm import (
-        PREFILL_IN_KV_CACHE,
         SUPPORTS_WINDOWING,
         attention,
         paged_attention,
         reshape_and_cache,
     )
 elif SYSTEM == "ipex":
     from .ipex import (
-        PREFILL_IN_KV_CACHE,
         SUPPORTS_WINDOWING,
         attention,
         paged_attention,
@@ -40,7 +37,6 @@
"attention",
"paged_attention",
"reshape_and_cache",
"PREFILL_IN_KV_CACHE",
"SUPPORTS_WINDOWING",
"KVCache",
"Seqlen",
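The remaining changed files (not expanded above) are the model implementations that call `attention`. The reason explicit `key`/`value` arguments make the constant unnecessary: `PREFILL_IN_KV_CACHE` told callers whether a backend wanted the freshly computed tensors or the ones already written to the cache, and every call site branched on it. With both handed to `attention`, that choice can be made inside the one definition. A toy, self-contained illustration of the idea; the cache class and the `prefill` flag are invented for the example, not taken from the repository:

```python
import math
from dataclasses import dataclass

import torch


@dataclass
class ToyKVCache:
    """Minimal stand-in for the real paged KV cache (illustration only)."""
    key: torch.Tensor    # [num_heads, cached_len, head_dim]
    value: torch.Tensor  # [num_heads, cached_len, head_dim]


def attention(*, query, key, value, kv_cache, prefill):
    # The single definition picks the tensors to attend over, so call sites
    # no longer branch on a PREFILL_IN_KV_CACHE constant.
    if prefill:
        k, v = key, value                      # freshly computed K/V
    else:
        k, v = kv_cache.key, kv_cache.value    # K/V already in the cache
    scale = 1.0 / math.sqrt(query.shape[-1])
    return torch.nn.functional.scaled_dot_product_attention(
        query, k, v, is_causal=prefill, scale=scale
    )


# Prefill over 16 new tokens; the cache holds the same K/V after being written.
q = k = v = torch.randn(8, 16, 64)
cache = ToyKVCache(key=k.clone(), value=v.clone())
out = attention(query=q, key=k, value=v, kv_cache=cache, prefill=True)
print(out.shape)  # torch.Size([8, 16, 64])
```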
