Merge branch 'Rust-GPU:main' into main

Schmiedium · web-flow · commit 0da1a4946ef6 · 2025-03-07T14:33:57.000-05:00
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
@@ -7,8 +7,6 @@ on:
   push:
     paths-ignore:
       - '**.md'
-    branches:
-      - master
 
 env:
   RUST_LOG: info
@@ -33,7 +31,7 @@ jobs:
         uses: actions/checkout@v2
 
       - name: Install CUDA 
-        uses: Jimver/cuda-toolkit@v0.2.4
+        uses: Jimver/cuda-toolkit@v0.2.21
         id: cuda-toolkit
         with:
           cuda: '11.2.2'
@@ -74,4 +72,4 @@ jobs:
       - name: Check documentation
         env:
           RUSTDOCFLAGS: -Dwarnings
-        run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix" --exclude "path_tracer" --exclude "denoiser" --exclude "add" --exclude "ex*"
+        run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix" --exclude "path_tracer" --exclude "denoiser" --exclude "add" --exclude "ex*"
diff --git a/README.md b/README.md
@@ -82,6 +82,16 @@ Other projects related to using Rust on the GPU:
 - 2020: [rlsl](https://github.com/MaikKlein/rlsl) Experimental Rust -> SPIR-V compiler (predecessor to rust-gpu)
 - 2020: [rust-gpu](https://github.com/Rust-GPU/rust-gpu) `rustc` compiler backend to compile Rust to SPIR-V for use in shaders, similar mechanism as our project.
 
+## Usage
+```bash
+## Set up your environment, for example:
+### export OPTIX_ROOT=/opt/NVIDIA-OptiX-SDK-9.0.0-linux64-x86_64
+### export OPTIX_ROOT_DIR=/opt/NVIDIA-OptiX-SDK-9.0.0-linux64-x86_64
+
+## Build the project
+cargo build
+```
+
 ## License
 
 Licensed under either of
diff --git a/crates/cuda_std/src/lib.rs b/crates/cuda_std/src/lib.rs
@@ -76,15 +76,15 @@ pub mod prelude {
     };
 }
 
-#[cfg(any(target_arch = "nvptx", target_arch = "nvptx64"))]
+#[cfg(target_arch = "nvptx64")]
 #[alloc_error_handler]
 fn alloc_handler(layout: core::alloc::Layout) -> ! {
     core::panic!("Memory allocation of {} bytes failed", layout.size());
 }
 
 // FIXME(RDambrosio016): For some very odd reason, this function causes an InvalidAddress error when called,
 // despite it having no reason for doing that. It needs more debugging to see what is causing it exactly. For now we just trap.
-#[cfg(any(target_arch = "nvptx", target_arch = "nvptx64"))]
+#[cfg(target_arch = "nvptx64")]
 #[panic_handler]
 fn panic(_info: &core::panic::PanicInfo) -> ! {
     // use crate::prelude::*;
diff --git a/crates/cuda_std_macros/src/lib.rs b/crates/cuda_std_macros/src/lib.rs
@@ -27,7 +27,7 @@ pub fn kernel(input: proc_macro::TokenStream, item: proc_macro::TokenStream) ->
     let mut item = parse_macro_input!(item as ItemFn);
     let no_mangle = parse_quote!(#[no_mangle]);
     item.attrs.push(no_mangle);
-    let internal = parse_quote!(#[cfg_attr(any(target_arch="nvptx", target_arch="nvptx64"), nvvm_internal(kernel(#input)))]);
+    let internal = parse_quote!(#[cfg_attr(target_arch="nvptx64", nvvm_internal(kernel(#input)))]);
     item.attrs.push(internal);
 
     // used to guarantee some things about how params are passed in the codegen.
@@ -170,13 +170,13 @@ pub fn gpu_only(_attr: proc_macro::TokenStream, item: proc_macro::TokenStream) -
     };
 
     let output = quote::quote! {
-        #[cfg(not(any(target_arch="nvptx", target_arch="nvptx64")))]
+        #[cfg(not(target_arch="nvptx64"))]
         #[allow(unused_variables)]
         #(#cloned_attrs)* #vis #sig_cpu {
             unimplemented!(concat!("`", stringify!(#fn_name), "` can only be used on the GPU with rustc_codegen_nvvm"))
         }
 
-        #[cfg(any(target_arch="nvptx", target_arch="nvptx64"))]
+        #[cfg(target_arch="nvptx64")]
         #(#attrs)* #vis #sig {
             #block
         }
diff --git a/crates/cust/src/memory/device/device_buffer.rs b/crates/cust/src/memory/device/device_buffer.rs
@@ -314,7 +314,7 @@ impl<A: DeviceCopy + Pod> DeviceBuffer<A> {
     /// whole number of elements. Such as `3` x [`u16`] -> `1.5` x [`u32`].
     /// - If either type is a ZST (but not both).
     #[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))]
-    pub fn try_cast<B: Pod + DeviceCopy>(self) -> Result<DeviceBuffer<B>, PodCastError> {
+    pub fn try_cast<B: Pod + DeviceCopy>(mut self) -> Result<DeviceBuffer<B>, PodCastError> {
         if align_of::<B>() > align_of::<A>() && (self.buf.as_raw() as usize) % align_of::<B>() != 0
         {
             Err(PodCastError::TargetAlignmentGreaterAndInputNotAligned)
@@ -325,10 +325,12 @@ impl<A: DeviceCopy + Pod> DeviceBuffer<A> {
             Err(PodCastError::SizeMismatch)
         } else if (size_of::<A>() * self.len) % size_of::<B>() == 0 {
             let new_len = (size_of::<A>() * self.len) / size_of::<B>();
-            Ok(DeviceBuffer {
+            let ret = Ok(DeviceBuffer {
                 buf: self.buf.cast(),
                 len: new_len,
-            })
+            });
+            unsafe{std::mem::forget(self);}
+            ret
         } else {
             Err(PodCastError::OutputSliceWouldHaveSlop)
         }