Commit 660494a

Add Ascend NPU support for generate and chat
1 parent 3c7e839 commit 660494a

15 files changed: +441 -68 lines

Diff for: README.md (+5 -3)

@@ -582,11 +582,13 @@ We provide

 ## Community Contributions

-We really value our community and the contributions made by our wonderful users. We'll use this section to call out some of these contributions! If you'd like to help out as well, please see the [CONTRIBUTING](CONTRIBUTING.md) guide.
+We really value our community and the contributions made by our wonderful users!

-To connect with us and other community members, we invite you to join our Slack community by filling out this [form](https://docs.google.com/forms/d/e/1FAIpQLSeADnUNW36fjKjYzyHDOzEB_abKQE9b6gqqW9NXse6O0MWh0A/viewform). Once you've joined, you can:
+If you'd like to help out, connect with us and other community members by joining our [Discord](https://discord.gg/hm2Keduk3v). Once you've joined, you can:
 * Head to the `#torchchat-general` channel for general questions, discussion, and community support.
-* Join the `#torchchat-contributors` channel if you're interested in contributing directly to project development.
+* Hop in the `#torchchat-contributors` channel if you're interested in contributing directly to project development.
+
+Also give our [CONTRIBUTING](CONTRIBUTING.md) guide a read.

 Looking forward to discussing with you about torchchat future!

Diff for: install/.pins/et-pin.txt (+1 -1)

@@ -1 +1 @@
-791472d6706b027552f39f11b28d034e4839c9af
+73740e9268a4a47baeaedc58a1f75597038d2377

Diff for: install/install_requirements.sh (+19 -9)

@@ -51,26 +51,29 @@ echo "Using pip executable: $PIP_EXECUTABLE"
 # NOTE: If a newly-fetched version of the executorch repo changes the value of
 # PYTORCH_NIGHTLY_VERSION, you should re-run this script to install the necessary
 # package versions.
-PYTORCH_NIGHTLY_VERSION=dev20250131
+PYTORCH_NIGHTLY_VERSION=dev20250327

 # Nightly version for torchvision
-VISION_NIGHTLY_VERSION=dev20250131
+VISION_NIGHTLY_VERSION=dev20250327

 # Nightly version for torchtune
-TUNE_NIGHTLY_VERSION=dev20250131
+TUNE_NIGHTLY_VERSION=dev20250327

 # The pip repository that hosts nightly torch packages. cpu by default.
 # If cuda is available, based on presence of nvidia-smi, install the pytorch nightly
 # with cuda for faster execution on cuda GPUs.
 if [[ -x "$(command -v nvidia-smi)" ]];
 then
-  TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/cu124"
+  TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/cu126"
 elif [[ -x "$(command -v rocminfo)" ]];
 then
   TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/rocm6.2"
 elif [[ -x "$(command -v xpu-smi)" ]];
 then
   TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/xpu"
+elif [[ -x "$(command -v npu-smi)" ]]
+then
+  TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/test/cpu"
 else
   TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/cpu"
 fi

@@ -79,15 +82,22 @@ fi
 if [[ -x "$(command -v xpu-smi)" ]];
 then
   REQUIREMENTS_TO_INSTALL=(
-    torch=="2.7.0.${PYTORCH_NIGHTLY_VERSION}"
+    torch=="2.8.0.${PYTORCH_NIGHTLY_VERSION}"
     torchvision=="0.22.0.${VISION_NIGHTLY_VERSION}"
-    #torchtune=="0.6.0" # no 0.6.0 on xpu nightly
+    #torchtune=="0.7.0" # no 0.6.0 on xpu nightly
+  )
+elif [[ -x "$(command -v npu-smi)" ]];
+then
+  REQUIREMENTS_TO_INSTALL=(
+    torch=="2.7.0"
+    torchvision=="0.22.0"
+    torchtune=="0.6.0"
   )
 else
   REQUIREMENTS_TO_INSTALL=(
-    torch=="2.7.0.${PYTORCH_NIGHTLY_VERSION}"
+    torch=="2.8.0.${PYTORCH_NIGHTLY_VERSION}"
     torchvision=="0.22.0.${VISION_NIGHTLY_VERSION}"
-    torchtune=="0.6.0.${TUNE_NIGHTLY_VERSION}"
+    torchtune=="0.7.0.${TUNE_NIGHTLY_VERSION}"
   )
 fi

@@ -136,5 +146,5 @@ if [[ -x "$(command -v nvidia-smi)" ]]; then
 fi
 (
   set -x
-  $PIP_EXECUTABLE install evaluate=="0.4.3" lm-eval=="0.4.2" psutil=="6.0.0"
+  $PIP_EXECUTABLE install evaluate=="0.4.3" lm-eval=="0.4.7" psutil=="6.0.0"
 )
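
Worth noting for the `npu-smi` branch above: it pulls wheels from the `test/cpu` index and pins stable versions rather than nightlies. Below is a minimal post-install sanity check; it is my own sketch, not part of the script, and it assumes Ascend's separate `torch_npu` plugin (not installed here) is what registers the `torch.npu` namespace.

```python
# Minimal post-install sanity check (assumption: the Ascend `torch_npu` plugin,
# installed separately from this script, registers `torch.npu`).
import torch

try:
    import torch_npu  # noqa: F401  # optional Ascend plugin
except ImportError:
    torch_npu = None

print("torch:", torch.__version__)
has_npu = hasattr(torch, "npu") and torch.npu.is_available()
print("NPU available:", has_npu)
```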

Diff for: install/requirements.txt (+2 -2)

@@ -23,7 +23,7 @@ openai

 # Build tools
 wheel
-cmake>=3.24
+cmake>=3.24, < 4.0.0 # 4.0 is BC breaking
 ninja
 zstd

@@ -34,4 +34,4 @@ streamlit
 flask

 # eval
-lm_eval==0.4.2
+lm_eval==0.4.7

Diff for: runner/run.cpp (+23 -1)

@@ -53,6 +53,9 @@ using executorch::extension::TensorPtr;
 using torch::executor::EValue;
 using torch::executor::Module;
 using torch::executor::Result;
+using executorch::runtime::MemoryManager;
+using executorch::runtime::MemoryAllocator;
+using executorch::runtime::Error;
 #endif

 using tokenizers::SPTokenizer;

@@ -867,7 +870,26 @@ int main(int argc, char *argv[]) {
       : torch::Device(torch::kCUDA);
   ModelType model_type = get_model_type(std::stoi(aoti_metadata["tokenizer_type"]));
 #else // __ET_MODEL__
-  ModelType model_type = get_model_type(llama_ver);
+  Error load_status = transformer.runner->load();
+  ET_CHECK_MSG(
+      load_status == torch::executor::Error::Ok,
+      "program::load() failed with status 0x%" PRIx32,
+      static_cast<uint32_t>(load_status));
+
+  static std::array<uint8_t, 4 * 1024U * 1024U> method_allocator_pool; // 4MB
+  MemoryAllocator method_allocator{MemoryAllocator(
+      sizeof(method_allocator_pool), method_allocator_pool.data())};
+  MemoryManager memory_manager(&method_allocator, nullptr);
+  auto tokenizer_method = transformer.runner->program()->load_method("tokenizer_type", &memory_manager);
+
+  Error execute_status = tokenizer_method->execute();
+  ET_CHECK_MSG(
+      execute_status == torch::executor::Error::Ok,
+      "method::execute() failed with status 0x%" PRIx32,
+      static_cast<uint32_t>(execute_status));
+
+  auto tokenizer_type = tokenizer_method->get_output(0).toInt();
+  ModelType model_type = get_model_type(tokenizer_type);
 #endif

   if (model_type == UNKNOWN_MODEL) {

Diff for: torchchat/cli/builder.py (+5 -3)

@@ -29,7 +29,7 @@
 from torchchat.utils.build_utils import (
     device_sync,
     is_cpu_device,
-    is_cuda_or_cpu_or_xpu_device,
+    is_supported_device,
     name_to_dtype,
 )
 from torchchat.utils.measure_time import measure_time

@@ -78,6 +78,8 @@ def __post_init__(self):
                 self.device = "cuda"
             elif torch.xpu.is_available():
                 self.device = "xpu"
+            elif hasattr(torch, "npu") and torch.npu.is_available():
+                self.device = "npu"
             else:
                 self.device = "cpu"

@@ -539,7 +541,7 @@ def _initialize_model(
         _set_gguf_kwargs(builder_args, is_et=is_pte, context="generate")

     if builder_args.dso_path:
-        if not is_cuda_or_cpu_or_xpu_device(builder_args.device):
+        if not is_supported_device(builder_args.device):
             print(
                 f"Cannot load specified DSO to {builder_args.device}. Attempting to load model to CPU instead"
             )

@@ -573,7 +575,7 @@ def do_nothing(max_batch_size, max_seq_length):
             raise RuntimeError(f"Failed to load AOTI compiled {builder_args.dso_path}")

     elif builder_args.aoti_package_path:
-        if not is_cuda_or_cpu_or_xpu_device(builder_args.device):
+        if not is_supported_device(builder_args.device):
             print(
                 f"Cannot load specified PT2 to {builder_args.device}. Attempting to load model to CPU instead"
             )
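
The hunk in `__post_init__` extends automatic device resolution. Here is a standalone sketch of that resolution order; the helper is hypothetical and covers only the portion visible in the hunk. The `hasattr` guard matters because `torch.npu` only exists once Ascend's `torch_npu` plugin has been imported.

```python
# Sketch of the device resolution order the diff establishes: CUDA, then XPU,
# then Ascend NPU, falling back to CPU. Hypothetical helper, not torchchat API.
import torch


def resolve_default_device() -> str:
    if torch.cuda.is_available():
        return "cuda"
    if torch.xpu.is_available():
        return "xpu"
    # torch.npu appears only after the Ascend torch_npu plugin is imported,
    # hence the hasattr guard before touching it.
    if hasattr(torch, "npu") and torch.npu.is_available():
        return "npu"
    return "cpu"
```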

Diff for: torchchat/cli/cli.py (+10 -2)

@@ -176,8 +176,8 @@ def _add_model_config_args(parser, verb: str) -> None:
         "--device",
         type=str,
         default=None,
-        choices=["fast", "cpu", "cuda", "mps", "xpu"],
-        help="Hardware device to use. Options: fast, cpu, cuda, mps, xpu",
+        choices=["fast", "cpu", "cuda", "mps", "xpu", "npu"],
+        help="Hardware device to use. Options: fast, cpu, cuda, mps, xpu, npu",
     )
     model_config_parser.add_argument(
         "--attention-backend",

@@ -432,6 +432,14 @@ def _add_evaluation_args(parser) -> None:
         help="Maximum length sequence to evaluate",
     )

+    eval_parser.add_argument(
+        "--modality",
+        type=str,
+        default="text",
+        choices=["text", "text-image"],
+        help="Modality of the model. Options: text, text-image",
+    )
+

 # Add CLI Args related to distributed inference
 # This feature is currently a [WIP] and hidden from --help
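
To see the effect of the widened `--device` choices and the new `--modality` evaluation flag, here is a self-contained argparse sketch that mirrors the added options (a hypothetical parser, not torchchat's actual CLI code):

```python
# Illustrative parser mirroring the options this diff adds.
import argparse

parser = argparse.ArgumentParser("torchchat-sketch")
parser.add_argument(
    "--device",
    type=str,
    default=None,
    choices=["fast", "cpu", "cuda", "mps", "xpu", "npu"],
    help="Hardware device to use.",
)
parser.add_argument(
    "--modality",
    type=str,
    default="text",
    choices=["text", "text-image"],
    help="Modality of the model.",
)

args = parser.parse_args(["--device", "npu", "--modality", "text"])
print(args.device, args.modality)  # -> npu text
```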

Diff for: torchchat/export.py (+24 -12)

@@ -313,7 +313,7 @@ def export_to_edge(
         core_aten_ep, edge_constant_methods, edge_compile_config, verbose=verbose
     )

-def export_for_et(model, device, output_path) -> str:
+def export_for_et(model, device, output_path, edge_constant_methods) -> str:

     input = (
         torch.tensor([[1]], dtype=torch.long, device=device),

@@ -344,12 +344,15 @@ def export_for_et(model, device, output_path) -> str:
     with torch.nn.attention.sdpa_kernel(
         [torch.nn.attention.SDPBackend.MATH]
     ), torch.no_grad():
-        m = export_for_training(model, input, dynamic_shapes=dynamic_shapes).module()
+        m = export_for_training(
+            model, input, dynamic_shapes=dynamic_shapes
+        ).module()

         edge_manager = export_to_edge(
             m,
             input,
             dynamic_shapes=dynamic_shapes,
+            edge_constant_methods=edge_constant_methods,
             edge_compile_config=edge_config,
         )
     edge_manager = edge_manager.to_backend(XnnpackDynamicallyQuantizedPartitioner())

@@ -365,6 +368,7 @@ def export_for_et(model, device, output_path) -> str:
     )

     print("The methods are: ", export_program.methods)
+    print("The config methods are: ", export_program.config_methods)
     with open(output_path, "wb") as f:
         export_program.write_to_file(f)

@@ -407,7 +411,9 @@ def main(args):
             f"Warning! ExecuTorch export target is controlled by export recipe, not device setting. Ignoring device={builder_args.device} setting."
         )
         builder_args.device = "cpu"
-    elif (output_pte_path or output_dso_path or output_aoti_package_path) and "mps" in builder_args.device:
+    elif (
+        output_pte_path or output_dso_path or output_aoti_package_path
+    ) and "mps" in builder_args.device:
         print("Warning! Device MPS not supported for export. Exporting for device CPU.")
         builder_args.device = "cpu"

@@ -473,13 +479,26 @@ def main(args):
             support_tensor_subclass=False,
         )
         _unset_gguf_kwargs(builder_args)
-
+
+    if tokenizer_args is None:
+        tokenizer_type = "0"
+    elif tokenizer_args.is_sentencepiece:
+        tokenizer_type = "2" # Corresponding to llama2
+    else:
+        tokenizer_type = "3" # Corresponding to llama3
+
     with torch.no_grad():
         if output_pte_path:
             output_pte_path = str(os.path.abspath(output_pte_path))
             if executorch_export_available:
                 print(f"Exporting model using ExecuTorch to {output_pte_path}")
-                export_for_et(model_to_pte, builder_args.device, args.output_pte_path)
+                print(f"Tokenizer type is {tokenizer_type}")
+                export_for_et(
+                    model_to_pte,
+                    builder_args.device,
+                    args.output_pte_path,
+                    {"tokenizer_type": int(tokenizer_type)},
+                )
             else:
                 print(
                     "Export with executorch requested but ExecuTorch could not be loaded"

@@ -503,13 +522,6 @@ def main(args):
         if output_aoti_package_path:
             output_aoti_package_path = str(os.path.abspath(output_aoti_package_path))

-            if tokenizer_args is None:
-                tokenizer_type = "0"
-            elif tokenizer_args.is_sentencepiece:
-                tokenizer_type = "2" # Corresponding to llama2
-            else:
-                tokenizer_type = "3" # Corresponding to llama3
-
             metadata = {"tokenizer_type": tokenizer_type}
             print(
                 "Exporting model using AOT Inductor to " f"{output_aoti_package_path}."

Diff for: torchchat/generate.py (+8 -3)

@@ -1213,6 +1213,8 @@ def callback(x, *, done_generating=False):
                 print(prof.key_averages().table(sort_by="self_cpu_time_total"))
             elif self.builder_args.device == "cuda":
                 print(prof.key_averages().table(sort_by="self_cuda_time_total"))
+            elif self.builder_args.device == "npu":
+                print(prof.key_averages().table(sort_by="self_npu_time_total"))
             else:
                 print(prof.key_averages().table(sort_by="self_xpu_time_total"))
             prof.export_chrome_trace(f"{self.profile}.json")

@@ -1299,8 +1301,10 @@ def callback(x, *, done_generating=False):
         )
         if torch.cuda.is_available():
             print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
-        if torch.xpu.is_available():
+        elif torch.xpu.is_available():
            print(f"Memory used: {torch.xpu.max_memory_reserved() / 1e9:.02f} GB")
+        elif hasattr(torch, "npu") and torch.npu.is_available():
+            print(f"Memory used: {torch.npu.max_memory_reserved() / 1e9:.02f} GB")

@@ -1595,7 +1599,6 @@ def sample(

     return idx_next, probs

-
 def run_generator(
     args,
     rank: Optional[int] =None

@@ -1628,8 +1631,10 @@ def run_generator(
     )
     if torch.cuda.is_available():
         torch.cuda.reset_peak_memory_stats()
-    if torch.xpu.is_available():
+    elif torch.xpu.is_available():
         torch.xpu.reset_peak_memory_stats()
+    elif hasattr(torch, "npu") and torch.npu.is_available():
+        torch.npu.reset_peak_memory_stats()

     for _ in gen.chat(generator_args):
         pass
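
The memory-stats and profiler changes all follow one pattern: probe at most one accelerator backend, preferring CUDA, then XPU, then NPU. A small sketch of that pattern as reusable helpers (hypothetical, not torchchat code):

```python
# Hypothetical helpers mirroring the accelerator-memory pattern in the diff.
import torch


def _npu_ready() -> bool:
    # torch.npu exists only after Ascend's torch_npu plugin has been imported.
    return hasattr(torch, "npu") and torch.npu.is_available()


def reset_peak_memory() -> None:
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    elif torch.xpu.is_available():
        torch.xpu.reset_peak_memory_stats()
    elif _npu_ready():
        torch.npu.reset_peak_memory_stats()


def report_peak_memory() -> None:
    if torch.cuda.is_available():
        print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
    elif torch.xpu.is_available():
        print(f"Memory used: {torch.xpu.max_memory_reserved() / 1e9:.02f} GB")
    elif _npu_ready():
        print(f"Memory used: {torch.npu.max_memory_reserved() / 1e9:.02f} GB")
```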

Diff for: torchchat/model.py (+6)

@@ -608,6 +608,12 @@ def setup_caches(self, batch_size, dtype, encoder_max_seq_len, decoder_max_seq_l
             decoder_max_seq_len=decoder_max_seq_len,
         )

+    def caches_are_setup(self) -> bool:
+        return self.model.caches_are_setup()
+
+    def caches_are_enabled(self) -> bool:
+        return self.model.caches_are_enabled()
+
     def reset_caches(self):
         self.model.reset_caches()
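
These two additions are thin pass-throughs, presumably so that code probing the torchchat wrapper for KV-cache state sees the inner torchtune model's answer instead of hitting an AttributeError. A minimal sketch of the delegation pattern, with a hypothetical wrapper name:

```python
# Minimal sketch of the delegation pattern; DecoderWrapper is hypothetical,
# standing in for torchchat's torchtune-backed model wrapper.
class DecoderWrapper:
    def __init__(self, model):
        self.model = model  # e.g. a torchtune-style decoder

    def caches_are_setup(self) -> bool:
        return self.model.caches_are_setup()

    def caches_are_enabled(self) -> bool:
        return self.model.caches_are_enabled()

    def reset_caches(self):
        self.model.reset_caches()
```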
