
Commit 7078cb8

FFFrog committed
Fix bugs in operator registration with the PyTorch Dispatcher

**Background:** There are two principles governing operator registration in PyTorch:

- A namespace can only be registered once by `TORCH_LIBRARY`.
- An operator signature can only be registered once by `def`.

Therefore:

- For the first problem, we can use `TORCH_LIBRARY_FRAGMENT` to add operators to an existing namespace.
- For the second problem, the best fix is to define all the general operator schemas in vLLM itself instead of in every plugin repo.

Signed-off-by: FFFrog <[email protected]>
1 parent 0c0789b commit 7078cb8
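To make the two principles concrete, here is a minimal, compilable sketch; the namespace `my_ns` and the op names are placeholders invented for illustration, not part of this commit:

```cpp
#include <torch/library.h>

// Exactly one TORCH_LIBRARY block may own a namespace. A second
// TORCH_LIBRARY(my_ns, ...) block, e.g. in a plugin shared library,
// aborts at load time with a duplicate-registration error.
TORCH_LIBRARY(my_ns, m) {
    m.def("op_a(Tensor x) -> Tensor");
}

// TORCH_LIBRARY_FRAGMENT extends the namespace instead of claiming it,
// so any number of fragments can be spread across files and plugins.
TORCH_LIBRARY_FRAGMENT(my_ns, m) {
    m.def("op_b(Tensor x) -> Tensor");
}
```

The same restriction applies at the operator level: `def` may introduce a given schema only once per process, which is why the shared schemas belong centrally in vLLM while each plugin registers only its backend kernels.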

File tree

8 files changed: +79 −59 lines changed

csrc/torch_binding.cpp

Lines changed: 36 additions & 42 deletions
@@ -38,7 +38,7 @@ AscendType get_dtype_from_torch(at::ScalarType scalarType)
     }
 }
 
-std::tuple<at::Tensor, at::Tensor> rotary_embedding(at::Tensor &positions, at::Tensor &query, at::Tensor &key,
+void rotary_embedding(at::Tensor &positions, at::Tensor &query, std::optional<at::Tensor> key,
                       int64_t head_size, at::Tensor &cos_sin_cache, bool is_neox)
 {
     int32_t deviceId = 0;
@@ -47,22 +47,23 @@ std::tuple<at::Tensor, at::Tensor> rotary_embedding(at::Tensor &positions, at::T
     TORCH_CHECK(
         positions_ndim == 1 || positions_ndim == 2,
         "positions must have shape [num_tokens] or [batch_size, seq_len]");
+    TORCH_CHECK(key.has_value(), "rotary_embedding: key must have a value");
     if (positions_ndim == 1) {
         TORCH_CHECK(
-            query.size(0) == positions.size(0) && key.size(0) == positions.size(0),
+            query.size(0) == positions.size(0) && key.value().size(0) == positions.size(0),
             "query, key and positions must have the same number of tokens");
     }
     if (positions_ndim == 2) {
         TORCH_CHECK(
             query.size(0) == positions.size(0) &&
-                key.size(0) == positions.size(0) &&
+                key.value().size(0) == positions.size(0) &&
                 query.size(1) == positions.size(1) &&
-                key.size(1) == positions.size(1),
+                key.value().size(1) == positions.size(1),
             "query, key and positions must have the same batch_size and seq_len");
     }
     TORCH_CHECK(head_size % 32 == 0, "rotary_embedding: headSize should be divisible by 32");
     int query_hidden_size = query.numel() / num_tokens;
-    int key_hidden_size = key.numel() / num_tokens;
+    int key_hidden_size = key.value().numel() / num_tokens;
     TORCH_CHECK(query_hidden_size % head_size == 0);
     TORCH_CHECK(key_hidden_size % head_size == 0);
     TORCH_CHECK(is_neox == true, "rotary_embedding: neox=false is not supported as custom kernel in vllm-ascend");
@@ -72,18 +73,18 @@ std::tuple<at::Tensor, at::Tensor> rotary_embedding(at::Tensor &positions, at::T
     int num_kv_heads = key_hidden_size / head_size;
     TORCH_CHECK(num_heads % num_kv_heads == 0);
     at::Tensor query_dst = at::empty({num_tokens, num_heads, head_size}, query.options());
-    at::Tensor key_dst = at::empty({num_tokens, num_kv_heads, head_size}, key.options());
+    at::Tensor key_dst = at::empty({num_tokens, num_kv_heads, head_size}, key.value().options());
 
     int rot_dim = cos_sin_cache.size(1);
     int seq_dim_idx = positions_ndim - 1;
     int64_t *position_ids_ptr = positions.data_ptr<int64_t>();
     void *query_dst_ptr = query_dst.data_ptr();
     void *key_dst_ptr = key_dst.data_ptr();
     void *query_ptr = query.data_ptr();
-    void *key_ptr = key.data_ptr();
+    void *key_ptr = key.value().data_ptr();
     void *cos_sin_cache_ptr = cos_sin_cache.data_ptr();
     int64_t query_stride = query.stride(seq_dim_idx);
-    int64_t key_stride = key.stride(seq_dim_idx);
+    int64_t key_stride = key.value().stride(seq_dim_idx);
     int64_t dst_query_stride = query_dst.stride(0);
     int64_t dst_key_stride = key_dst.stride(0);
     at::ScalarType scalar_type = query.scalar_type();
@@ -104,7 +105,9 @@ std::tuple<at::Tensor, at::Tensor> rotary_embedding(at::Tensor &positions, at::T
         return 0;
     });
     cmd.Run();
-    return {query_dst, key_dst};
+
+    query.copy_(query_dst);
+    key.value().copy_(key_dst);
 }
 
 std::tuple<at::Tensor, at::Tensor> get_masked_input_and_mask(
@@ -142,7 +145,7 @@ std::tuple<at::Tensor, at::Tensor> get_masked_input_and_mask(
     TP2, rank 1:
     |< -----------BASE----------- >|< -BASE PADDING- >|< -----------LORA PADDING----------- >|
     corresponding token_id: | 512 | 513 | 514 | ... | 1009 | -1 | ... | -1 | -1 | ... | -1 | -1 | ... | -1 |
-    index: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 519 | 520 | ... | 543 | 
+    index: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 519 | 520 | ... | 543 |
     Parameters:
     org_vocab_start_index //base embeddings start
     org_vocab_end_index //base embeddings end
@@ -165,22 +168,22 @@ std::tuple<at::Tensor, at::Tensor> get_masked_input_and_mask(
     // Create output tensors
     at::Tensor masked_input = at::empty_like(input);
     at::Tensor mask = at::empty_like(input).to(at::kBool);
-    
+
     // Get data pointers
     void *input_ptr = input.data_ptr();
     void *masked_input_ptr = masked_input.data_ptr();
     void *mask_ptr = mask.data_ptr();
-    
+
     // Get current stream
     aclrtStream stream = c10_npu::getCurrentNPUStream().stream();
-    
+
     // Get scalar type
     at::ScalarType scalar_type = input.scalar_type();
-    
+
     // Create and configure OpCommand
     at_npu::native::OpCommand cmd;
     cmd.Name("get_masked_input_and_mask");
-    cmd.SetCustomHandler([scalar_type, size, stream, 
+    cmd.SetCustomHandler([scalar_type, size, stream,
                           input_ptr, masked_input_ptr, mask_ptr,
                           org_vocab_start_index, org_vocab_end_index,
                           num_org_vocab_padding, added_vocab_start_index,
@@ -194,7 +197,7 @@ std::tuple<at::Tensor, at::Tensor> get_masked_input_and_mask(
             get_masked_input_and_mask_impl(
                 stream,
                 input_ptr,
-                masked_input_ptr, 
+                masked_input_ptr,
                 mask_ptr,
                 org_vocab_start_index,
                 org_vocab_end_index,
@@ -204,7 +207,7 @@ std::tuple<at::Tensor, at::Tensor> get_masked_input_and_mask(
                 size,
                 loop_cnt,
                 aiv_num);
-        
+
             return 0;
         });
     cmd.Run();
@@ -321,8 +324,8 @@ void sgmv_shrink(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indices, at
     aclrtStream stream = c10_npu::getCurrentNPUStream().stream();
     at_npu::native::OpCommand cmd;
     cmd.Name("sgmv_shrink");
-    cmd.SetCustomHandler([scalar_type, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size, 
-                          seq_len_ptr, seq_len_size, y_ptr, 
+    cmd.SetCustomHandler([scalar_type, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size,
+                          seq_len_ptr, seq_len_size, y_ptr,
                           batch_size, input_hidden_token, lora_rank, scale_f]() -> int {
         auto dtype = get_dtype_from_torch(scalar_type);
         int device_id = 0;
@@ -331,7 +334,7 @@ void sgmv_shrink(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indices, at
         int num_tokens_per_core = (batch_size + aiv_num - 1) / aiv_num;
         TORCH_CHECK("num_tokens_per_core != 0", "num_tokens_per_core should not be 0");
         sgmv_shrink_impl(dtype, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size, seq_len_ptr, seq_len_size,
-                         y_ptr, batch_size, 
+                         y_ptr, batch_size,
                          num_tokens_per_core, input_hidden_token, lora_rank, scale_f);
         return 0;
     });
@@ -368,15 +371,15 @@ at::Tensor sgmv_expand(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indic
     aclrtStream stream = c10_npu::getCurrentNPUStream().stream();
     at_npu::native::OpCommand cmd;
     cmd.Name("sgmv_expand");
-    cmd.SetCustomHandler([scalar_type, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size, seq_len_ptr, seq_len_size, y_ptr, y_out_ptr, 
+    cmd.SetCustomHandler([scalar_type, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size, seq_len_ptr, seq_len_size, y_ptr, y_out_ptr,
                           batch_size, lora_rank, slice_offset, slice_size, output_full_dim]() -> int {
         auto dtype = get_dtype_from_torch(scalar_type);
         int device_id = 0;
         int64_t aiv_num = 0;
         TORCH_CHECK(aclGetDeviceCapability(device_id, ACL_DEVICE_INFO_VECTOR_CORE_NUM, &aiv_num) == ACL_SUCCESS);
         int num_tokens_per_core = (batch_size + aiv_num - 1) / aiv_num;
         TORCH_CHECK("num_tokens_per_core != 0", "num_tokens_per_core should not be 0");
-        sgmv_expand_impl(dtype, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size, seq_len_ptr, seq_len_size, y_ptr, y_out_ptr, 
+        sgmv_expand_impl(dtype, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size, seq_len_ptr, seq_len_size, y_ptr, y_out_ptr,
                          batch_size, num_tokens_per_core, lora_rank, slice_size, slice_offset, output_full_dim);
         return 0;
     });
@@ -385,43 +388,34 @@ at::Tensor sgmv_expand(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indic
 }
 } // namespace vllm_ascend
 
-TORCH_LIBRARY_EXPAND(_C, ops)
+TORCH_LIBRARY_FRAGMENT_EXPAND(_C, ops)
 {
-    // vLLM-Ascend custom ops
     ops.def("weak_ref_tensor(Tensor input) -> Tensor");
-    ops.impl("weak_ref_tensor", torch::kPrivateUse1, &vllm_ascend::weak_ref_tensor);
-
-    // Rotary embedding
-    // Apply GPT-NeoX style rotary embedding to query and key.
-    ops.def(
-        "rotary_embedding(Tensor positions, Tensor! query,"
-        " Tensor! key, int head_size,"
-        " Tensor cos_sin_cache, bool is_neox) -> (Tensor query, Tensor key)");
-    ops.impl("rotary_embedding", torch::kPrivateUse1, &vllm_ascend::rotary_embedding);
-
     ops.def(
         "get_masked_input_and_mask(Tensor input, "
         "                          int org_vocab_start_index, "
        "                          int org_vocab_end_index, "
        "                          int num_org_vocab_padding, "
        "                          int added_vocab_start_index, "
        "                          int added_vocab_end_index) -> (Tensor masked_input, Tensor mask)");
-    ops.impl("get_masked_input_and_mask", torch::kPrivateUse1, &vllm_ascend::get_masked_input_and_mask);
-
     ops.def("bgmv_shrink(Tensor! x, Tensor! weight, Tensor! indices, Tensor! y, float scale) -> ()");
-    ops.impl("bgmv_shrink", torch::kPrivateUse1, &vllm_ascend::bgmv_shrink);
-
     ops.def(
         "bgmv_expand(Tensor! x, Tensor! weight, Tensor! indices, Tensor! y,"
         " int slice_offset, int slice_size) -> Tensor");
-    ops.impl("bgmv_expand", torch::kPrivateUse1, &vllm_ascend::bgmv_expand);
-
     ops.def("sgmv_shrink(Tensor! x, Tensor! weight, Tensor! lora_indices, Tensor! seq_len, Tensor! y, float scale) -> ()");
-    ops.impl("sgmv_shrink", torch::kPrivateUse1, &vllm_ascend::sgmv_shrink);
-
     ops.def(
         "sgmv_expand(Tensor! x, Tensor! weight, Tensor! lora_indices, Tensor! seq_len, Tensor! y,"
         " int slice_offset, int slice_size) -> Tensor");
+}
+
+TORCH_LIBRARY_IMPL_EXPAND(_C, PrivateUse1, ops)
+{
+    ops.impl("weak_ref_tensor", torch::kPrivateUse1, &vllm_ascend::weak_ref_tensor);
+    ops.impl("rotary_embedding", torch::kPrivateUse1, &vllm_ascend::rotary_embedding);
+    ops.impl("get_masked_input_and_mask", torch::kPrivateUse1, &vllm_ascend::get_masked_input_and_mask);
+    ops.impl("bgmv_shrink", torch::kPrivateUse1, &vllm_ascend::bgmv_shrink);
+    ops.impl("bgmv_expand", torch::kPrivateUse1, &vllm_ascend::bgmv_expand);
+    ops.impl("sgmv_shrink", torch::kPrivateUse1, &vllm_ascend::sgmv_shrink);
     ops.impl("sgmv_expand", torch::kPrivateUse1, &vllm_ascend::sgmv_expand);
 }
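In summary, this file now follows the def/impl split. A hedged sketch of the pattern, with a placeholder namespace and names rather than the literal vllm-ascend code: the schema marks mutable arguments with `!` and returns nothing, so the kernel writes its results back into the inputs, while the backend implementation is registered separately under the `PrivateUse1` dispatch key.

```cpp
#include <ATen/ATen.h>
#include <torch/library.h>
#include <optional>

// In-place kernel: results are copied back into the mutable inputs.
void rope_inplace(at::Tensor &positions, at::Tensor &query,
                  std::optional<at::Tensor> key, int64_t head_size,
                  at::Tensor &cos_sin_cache, bool is_neox) {
    at::Tensor query_dst = at::empty_like(query);
    // ... compute the rotated values into query_dst (and a key_dst) ...
    query.copy_(query_dst);  // callers observe the mutation in place
}

// Schemas: defined once per process (per the commit message, ideally by
// vLLM itself rather than by each plugin).
TORCH_LIBRARY_FRAGMENT(my_ns, ops) {
    ops.def("rope_inplace(Tensor positions, Tensor! query, Tensor!? key,"
            " int head_size, Tensor cos_sin_cache, bool is_neox) -> ()");
}

// Kernels: each backend registers its own implementation for the schema.
TORCH_LIBRARY_IMPL(my_ns, PrivateUse1, ops) {
    ops.impl("rope_inplace", &rope_inplace);
}
```

Splitting `def` from `impl` means the schema can later move into vLLM core while this repo keeps only the `PrivateUse1` implementations.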

csrc/torch_binding_meta.cpp

Lines changed: 9 additions & 7 deletions
@@ -36,23 +36,25 @@
 namespace vllm_ascend {
 namespace meta {
 
-std::tuple<at::Tensor, at::Tensor> rotary_embedding_meta(
+void rotary_embedding_meta(
     at::Tensor &positions,
     at::Tensor &query,
-    at::Tensor &key,
+    std::optional<at::Tensor> key,
     int64_t head_size,
     at::Tensor &cos_sin_cache,
     bool is_neox) {
+    TORCH_CHECK(key.has_value(), "rotary_embedding_meta: key must have a value");
     auto num_tokens = positions.sym_numel();
     auto query_hidden_size = query.sym_numel() / num_tokens;
-    auto key_hidden_size = key.sym_numel() / num_tokens;
+    auto key_hidden_size = key.value().sym_numel() / num_tokens;
 
     auto num_heads = query_hidden_size / head_size;
     auto num_kv_heads = key_hidden_size / head_size;
-    at::Tensor query_dst = at::empty_symint({num_tokens, num_heads, head_size}, query.options());
-    at::Tensor key_dst = at::empty_symint({num_tokens, num_kv_heads, head_size}, key.options());
 
-    return {query_dst, key_dst};
+    c10::SymIntArrayRef query_shape({num_tokens, num_heads, head_size});
+    c10::SymIntArrayRef key_shape({num_tokens, num_kv_heads, head_size});
+    query.resize__symint(query_shape);
+    key.value().resize__symint(key_shape);
 }
 
 std::tuple<at::Tensor, at::Tensor> get_masked_input_and_mask_meta(
@@ -99,4 +101,4 @@ namespace {
     ops.impl("sgmv_expand", &vllm_ascend::meta::sgmv_expand_meta);
 
 }
-}
\ No newline at end of file
+}
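For context, a meta ("fake tensor") kernel runs during tracing and compilation without real storage; it exists only to propagate shapes and dtypes. Since the real kernel is now in-place, the meta version must mutate its inputs the same way instead of allocating and returning fresh outputs. A minimal sketch under the same placeholder names as above, assuming the `resize__symint` method this commit uses:

```cpp
#include <ATen/ATen.h>
#include <torch/library.h>
#include <optional>

// Meta kernel for an in-place op: resize the (fake) inputs symbolically so
// the traced graph observes the same output shape as the real kernel.
void rope_inplace_meta(at::Tensor &positions, at::Tensor &query,
                       std::optional<at::Tensor> key, int64_t head_size,
                       at::Tensor &cos_sin_cache, bool is_neox) {
    auto num_tokens = positions.sym_numel();
    auto num_heads = query.sym_numel() / num_tokens / head_size;
    query.resize__symint({num_tokens, num_heads, c10::SymInt(head_size)});
}

// Fake-tensor tracing dispatches to the Meta key.
TORCH_LIBRARY_IMPL(my_ns, Meta, ops) {
    ops.impl("rope_inplace", &rope_inplace_meta);
}
```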

csrc/utils.h

Lines changed: 4 additions & 0 deletions
@@ -13,6 +13,10 @@
 // A version of the TORCH_LIBRARY macro that expands the NAME, i.e. so NAME
 // could be a macro instead of a literal token.
 #define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE)
+//
+// A version of the TORCH_LIBRARY_FRAGMENT macro that expands the NAME, i.e. so NAME
+// could be a macro instead of a literal token.
+#define TORCH_LIBRARY_FRAGMENT_EXPAND(NAME, MODULE) TORCH_LIBRARY_FRAGMENT(NAME, MODULE)
 
 // A version of the TORCH_LIBRARY_IMPL macro that expands the NAME, i.e. so NAME
 // could be a macro instead of a literal token.
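A short illustration of the indirection these wrappers provide (`MY_LIB` and `my_ns` are hypothetical): the `TORCH_LIBRARY*` macros paste NAME into generated identifiers, and the preprocessor's `##` operator suppresses argument expansion, so passing a macro directly can leave it unexpanded; routing it through a wrapper whose body does no pasting forces one expansion pass first.

```cpp
#include <torch/library.h>

#define MY_LIB my_ns  // the library name is itself a macro

// Passed straight to the underlying macro, MY_LIB could be token-pasted
// before being expanded; the _EXPAND wrapper expands MY_LIB to my_ns first.
TORCH_LIBRARY_FRAGMENT_EXPAND(MY_LIB, ops) {
    ops.def("noop(Tensor x) -> Tensor");
}
```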

tests/e2e/singlecard/ops/test_rotary_embedding.py

Lines changed: 4 additions & 4 deletions
@@ -182,7 +182,7 @@ def test_rotary_embedding_quant_with_leading_dim(
     )
 
     ref_query, ref_key = rope.forward_native(positions, query, key)
-    query, key = torch.ops._C.rotary_embedding(
+    torch.ops._C.rotary_embedding(
         positions,
         query,
         key,
@@ -239,16 +239,16 @@ def forward(
         # we simulated a simple attention layer to test if it can be seamlessly captured into aclgraph
         qkv = self.qkv_proj(hidden_states)
         q, k, v = qkv.chunk(3, dim=-1)
-        query, key = torch.ops._C.rotary_embedding(
+        q_shape = q.shape
+        torch.ops._C.rotary_embedding(
             positions,
             q,
             k,
             self.rope.head_size,
             self.rope.cos_sin_cache,
             self.rope.is_neox_style,
         )
-        query = query.view(q.shape)
-        key = key.view(k.shape)
+        query = q.view(q_shape)
         o = self.o_proj(query)
         return o
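A note on the second hunk: the callers stop unpacking return values because `rotary_embedding` is now in-place, and `q.shape` is saved in `q_shape` before the call. Presumably this is because the op's meta function resizes its inputs to `[num_tokens, num_heads, head_size]` under fake-tensor tracing, so the pre-call shape is needed to view the mutated `q` back to the projection layout.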

tests/ut/test_utils.py

Lines changed: 14 additions & 0 deletions
@@ -261,6 +261,20 @@ def test_update_aclgraph_sizes(self):
         self.assertEqual(
             147,
             len(test_vllm_config.compilation_config.cudagraph_capture_sizes))
+
+        test_vllm_config.speculative_config = mock.MagicMock()
+        test_vllm_config.speculative_config.draft_model_config = mock.MagicMock(
+        )
+        test_vllm_config.speculative_config.draft_model_config.hf_config = mock.MagicMock(
+        )
+        test_vllm_config.speculative_config.draft_model_config.hf_config.num_hidden_layers = 2
+        os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
+        utils.update_aclgraph_sizes(test_vllm_config)
+        del os.environ['HCCL_OP_EXPANSION_MODE']
+        self.assertEqual(
+            120,
+            len(test_vllm_config.compilation_config.cudagraph_capture_sizes))
+
         # max_num_batch_sizes >= len(original_sizes)
         test_compilation_config = CompilationConfig(
             cudagraph_capture_sizes=[1, 2, 3])

vllm_ascend/ops/rotary_embedding.py

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ def _rope_forward_oot(
     # adopt custom kernel path for rotary_embedding
     if _custom_rotary_embedding_enabled(query, neox_style,
                                         self.head_size) and not is_310p():
-        query, key = torch.ops._C.rotary_embedding(
+        torch.ops._C.rotary_embedding(
            positions,
            query,
            key,

vllm_ascend/torchair/ops/torchair_rotary_embedding.py

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@ def rope_forward_oot(
     # adopt custom kernel path for rotary_embedding
     if custom_rotary_embedding_enabled(query, neox_style,
                                        self.head_size) and not is_310p():
-        query, key = torch.ops._C.rotary_embedding(
+        torch.ops._C.rotary_embedding(
            positions,
            query,
            key,

vllm_ascend/utils.py

Lines changed: 10 additions & 4 deletions
@@ -304,6 +304,12 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
     num_hidden_layers = get_max_hidden_layers(hf_config)
     parallel_config = vllm_config.parallel_config
 
+    # Calculate maximum supported batch sizes considering model architecture
+    resources_per_graph = num_hidden_layers + 1
+    if vllm_config.speculative_config is not None:
+        draft_model_hf_config = vllm_config.speculative_config.draft_model_config.hf_config
+        resources_per_graph += draft_model_hf_config.num_hidden_layers + 1
+
     # TODO: Find out whether we need to take into account the pp_size
     num_comm_groups = sum(size > 1 for size in [
         parallel_config.data_parallel_size,
@@ -318,8 +324,8 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
         # Assume the following case:
         # MAX_CAPTURE_SIZE = 1920, num_hidden_layers = 48, data_parallel_size is 1, tensor_parallel_size is 4,
         # According to the formula, max_num_batch_sizes = math.floor(1920 / (48 + 1) / 2) = 19
-        max_num_batch_sizes = math.floor(
-            MAX_CAPTURE_SIZE / (num_hidden_layers + 1) / parallel_factor)
+        max_num_batch_sizes = math.floor(MAX_CAPTURE_SIZE /
+                                         resources_per_graph / parallel_factor)
         logger.info(
             "Calculated maximum supported batch sizes for ACL graph: %s",
             max_num_batch_sizes)
@@ -335,8 +341,8 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
         # MAX_CAPTURE_SIZE = 1920, num_hidden_layers = 48, data_parallel_size is 1, tensor_parallel_size is 4,
         # According to the formula, max_num_batch_sizes = math.floor((1920 - 1 * 40) / (48 + 1) / (1 + 1 * 2)) = 12
         max_num_batch_sizes = math.floor(
-            (MAX_CAPTURE_SIZE - num_comm_groups * 40) /
-            (num_hidden_layers + 1) / (1 + num_comm_groups * 2))
+            (MAX_CAPTURE_SIZE - num_comm_groups * 40) / resources_per_graph /
+            (1 + num_comm_groups * 2))
         logger.info(
             "Calculated maximum supported batch sizes for ACL graph: %s",
             max_num_batch_sizes)
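A worked example of the new formula, reusing the numbers from the comments above and assuming a hypothetical 2-layer draft model: without speculative decoding, resources_per_graph = 48 + 1 = 49 and max_num_batch_sizes = math.floor(1920 / 49 / 2) = 19; with the draft model, resources_per_graph = 49 + (2 + 1) = 52 and max_num_batch_sizes = math.floor(1920 / 52 / 2) = 18, so each captured graph now also pays for the draft model's layers.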
