Commit 4eea5e1 (1 parent: 4583051)
torch_xla/experimental/tuned_block_sizes.py
@@ -33,6 +33,7 @@ def _simplify_key_ragged_paged_attention(q_head_num, kv_head_num, token_num,
 
 
 # TODO: add more tuned block sizes in the table
+# q_head_num, kv_head_num, token_num, max_model_len
 _ragged_attention_table = {
     (32, 8, 4096, 2048): (128, 64),
     (4, 1, 4096, 2048): (128, 128),
@@ -58,6 +59,12 @@ def _simplify_key_ragged_paged_attention(q_head_num, kv_head_num, token_num,
     (4, 1, 2048, 128): (32, 32),
     (32, 8, 1024, 128): (32, 32),
     (1, 1, 1024, 128): (32, 32),
+    (10, 2, 4096, 2048): (128, 32),  # Qwen/Qwen2.5-32B
+    (10, 2, 2048, 2048): (128, 32),  # Qwen/Qwen2.5-32B
+    (10, 2, 1024, 2048): (128, 32),  # Qwen/Qwen2.5-32B
+    (5, 1, 4098, 2048): (128, 64),  # Qwen/Qwen2.5-32B
+    (5, 1, 2048, 2048): (128, 32),  # Qwen/Qwen2.5-32B
+    (5, 1, 1024, 2048): (128, 32),  # Qwen/Qwen2.5-32B
 }
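As the newly added comment notes, the table is keyed by (q_head_num, kv_head_num, token_num, max_model_len) and maps each key to a pair of tuned block sizes; this commit adds entries for Qwen/Qwen2.5-32B shapes. For illustration only, a minimal sketch of how such a table might be consulted is shown below; the lookup function name, the fallback default, and the usage example are assumptions, not torch_xla's actual API.

# Minimal illustrative sketch, not torch_xla's actual lookup code.
_ragged_attention_table = {
    # (q_head_num, kv_head_num, token_num, max_model_len): tuned block sizes
    (32, 8, 4096, 2048): (128, 64),
    (10, 2, 4096, 2048): (128, 32),  # Qwen/Qwen2.5-32B
}

_DEFAULT_BLOCK_SIZES = (128, 64)  # hypothetical fallback for untuned shapes


def lookup_tuned_block_sizes(q_head_num, kv_head_num, token_num, max_model_len):
    """Return the tuned block-size pair for the key, or the fallback default."""
    key = (q_head_num, kv_head_num, token_num, max_model_len)
    return _ragged_attention_table.get(key, _DEFAULT_BLOCK_SIZES)


# One of the entries added in this commit:
print(lookup_tuned_block_sizes(10, 2, 4096, 2048))  # -> (128, 32)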