diff --git a/CHANGELOG.md b/CHANGELOG.md index 92300b29..bfd61687 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased ### Changed +## v0.16.0 +### Changed +- PyTorch v2.3 support + ## v0.15.0 ### Changed - PyTorch v2.2 support diff --git a/Cargo.toml b/Cargo.toml index 49286e9c..c1513dba 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tch" -version = "0.15.0" +version = "0.16.0" authors = ["Laurent Mazare "] edition = "2021" build = "build.rs" @@ -22,7 +22,7 @@ libc = "0.2.0" ndarray = "0.15" rand = "0.8" thiserror = "1" -torch-sys = { version = "0.15.0", path = "torch-sys" } +torch-sys = { version = "0.16.0", path = "torch-sys" } zip = "0.6" half = "2" safetensors = "0.3.0" diff --git a/README.md b/README.md index 6761797a..7986e8d2 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ The code generation part for the C api on top of libtorch comes from ## Getting Started -This crate requires the C++ PyTorch library (libtorch) in version *v2.2.0* to be available on +This crate requires the C++ PyTorch library (libtorch) in version *v2.3.0* to be available on your system. You can either: - Use the system-wide libtorch installation (default). @@ -85,7 +85,7 @@ seem to include `libtorch.a` by default so this would have to be compiled manually, e.g. via the following: ```bash -git clone -b v2.2.0 --recurse-submodule https://github.com/pytorch/pytorch.git pytorch-static --depth 1 +git clone -b v2.3.0 --recurse-submodule https://github.com/pytorch/pytorch.git pytorch-static --depth 1 cd pytorch-static USE_CUDA=OFF BUILD_SHARED_LIBS=OFF python setup.py build # export LIBTORCH to point at the build directory in pytorch-static. 
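As a quick local check of the upgrade (not part of this diff), a downstream crate can bump its dependency to `tch = "0.16.0"`, make a libtorch v2.3.0 install visible (system-wide, downloaded, or via `LIBTORCH` as described above), and run a minimal smoke test using the crate's basic tensor API; a sketch along these lines, assuming that setup:

```rust
// Minimal sketch, assuming tch 0.16 and libtorch v2.3.0 are configured as
// described in the README section above.
use tch::Tensor;

fn main() {
    // Build a 1-D tensor from a slice, double it, and print the result;
    // if this links and runs, the libtorch setup is working.
    let t = Tensor::from_slice(&[3, 1, 4, 1, 5]);
    let t = t * 2;
    t.print();
}
```

This only exercises basic tensor creation and arithmetic as a sanity check; a version mismatch against the expected libtorch 2.3.0 is reported at build time by `version_check` in `torch-sys/build.rs`, which this change updates.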
diff --git a/examples/python-extension/Cargo.toml b/examples/python-extension/Cargo.toml index f56fe003..a312e10c 100644 --- a/examples/python-extension/Cargo.toml +++ b/examples/python-extension/Cargo.toml @@ -18,6 +18,6 @@ crate-type = ["cdylib"] [dependencies] pyo3 = { version = "0.21", features = ["extension-module"] } -pyo3-tch = { path = "../../pyo3-tch", version = "0.15.0" } -tch = { path = "../..", features = ["python-extension"], version = "0.15.0" } -torch-sys = { path = "../../torch-sys", features = ["python-extension"], version = "0.15.0" } +pyo3-tch = { path = "../../pyo3-tch", version = "0.16.0" } +tch = { path = "../..", features = ["python-extension"], version = "0.16.0" } +torch-sys = { path = "../../torch-sys", features = ["python-extension"], version = "0.16.0" } \ No newline at end of file diff --git a/gen/gen.ml b/gen/gen.ml index 009d82b5..b0357481 100644 --- a/gen/gen.ml +++ b/gen/gen.ml @@ -93,6 +93,7 @@ let excluded_prefixes = ; "_nested_tensor" ; "_fused_adam" ; "sym_" + ; "_fused_sgd" ] let excluded_suffixes = [ "_forward"; "_forward_out" ] @@ -878,7 +879,7 @@ let run let () = run - ~yaml_filename:"third_party/pytorch/Declarations-v2.2.0.yaml" + ~yaml_filename:"third_party/pytorch/Declarations-v2.3.0.yaml" ~cpp_filename:"torch-sys/libtch/torch_api_generated" ~ffi_filename:"torch-sys/src/c_generated.rs" ~wrapper_filename:"src/wrappers/tensor_generated.rs" diff --git a/pyo3-tch/Cargo.toml b/pyo3-tch/Cargo.toml index c1fde9f2..d833aff6 100644 --- a/pyo3-tch/Cargo.toml +++ b/pyo3-tch/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pyo3-tch" -version = "0.15.0" +version = "0.16.0" authors = ["Laurent Mazare "] edition = "2021" build = "build.rs" @@ -12,6 +12,6 @@ categories = ["science"] license = "MIT/Apache-2.0" [dependencies] -tch = { path = "..", features = ["python-extension"], version = "0.15.0" } -torch-sys = { path = "../torch-sys", features = ["python-extension"], version = "0.15.0" } -pyo3 = { version = "0.21", features = ["extension-module"] } +tch = { path = "..", features = ["python-extension"], version = "0.16.0" } +torch-sys = { path = "../torch-sys", features = ["python-extension"], version = "0.16.0" } +pyo3 = { version = "0.21", features = ["extension-module"] } \ No newline at end of file diff --git a/src/wrappers/tensor_fallible_generated.rs b/src/wrappers/tensor_fallible_generated.rs index 4939ea59..c28a5f40 100644 --- a/src/wrappers/tensor_fallible_generated.rs +++ b/src/wrappers/tensor_fallible_generated.rs @@ -618,6 +618,18 @@ impl Tensor { Ok(Tensor { c_tensor: c_tensors[0] }) } + pub fn f_internal_assert_scalar>( + self_scalar: S, + assert_msg: &str, + ) -> Result<(), TchError> { + unsafe_torch_err!(atg__assert_scalar( + self_scalar.into().c_scalar, + assert_msg.as_ptr(), + assert_msg.len() as i32 + )); + Ok(()) + } + pub fn f_internal_assert_tensor_metadata( a: &Tensor, size: impl IntListOption, @@ -821,6 +833,40 @@ impl Tensor { Ok(Tensor { c_tensor: c_tensors[0] }) } + pub fn f_internal_chunk_cat>( + tensors: &[T], + dim: i64, + num_chunks: i64, + ) -> Result { + let mut c_tensors = [std::ptr::null_mut(); 1]; + unsafe_torch_err!(atg__chunk_cat( + c_tensors.as_mut_ptr(), + ptr_list(tensors).as_ptr(), + tensors.len() as i32, + dim, + num_chunks + )); + Ok(Tensor { c_tensor: c_tensors[0] }) + } + + pub fn f_internal_chunk_cat_out>( + out: &Tensor, + tensors: &[T], + dim: i64, + num_chunks: i64, + ) -> Result { + let mut c_tensors = [std::ptr::null_mut(); 1]; + unsafe_torch_err!(atg__chunk_cat_out( + c_tensors.as_mut_ptr(), + out.c_tensor, + 
ptr_list(tensors).as_ptr(), + tensors.len() as i32, + dim, + num_chunks + )); + Ok(Tensor { c_tensor: c_tensors[0] }) + } + pub fn f_internal_coalesce(&self) -> Result { let mut c_tensors = [std::ptr::null_mut(); 1]; unsafe_torch_err!(atg__coalesce(c_tensors.as_mut_ptr(), self.c_tensor)); @@ -1280,6 +1326,7 @@ impl Tensor { alpha: Option, out_dtype: impl Into>, transpose_result: bool, + alg_id: i64, ) -> Result { let mut c_tensors = [std::ptr::null_mut(); 1]; unsafe_torch_err!(atg__cslt_sparse_mm( @@ -1289,11 +1336,34 @@ impl Tensor { bias.as_ref().map_or(std::ptr::null_mut(), |t| t.borrow().c_tensor), alpha.as_ref().map_or(std::ptr::null_mut(), |t| t.borrow().c_tensor), out_dtype.into().map_or(-1, |s| s.c_int()), - if transpose_result { 1 } else { 0 } + if transpose_result { 1 } else { 0 }, + alg_id )); Ok(Tensor { c_tensor: c_tensors[0] }) } + pub fn f_internal_cslt_sparse_mm_search>( + compressed_a: &Tensor, + dense_b: &Tensor, + bias: Option, + alpha: Option, + out_dtype: impl Into>, + transpose_result: bool, + ) -> Result { + let return_; + unsafe_torch_err!( + return_ = atg__cslt_sparse_mm_search( + compressed_a.c_tensor, + dense_b.c_tensor, + bias.as_ref().map_or(std::ptr::null_mut(), |t| t.borrow().c_tensor), + alpha.as_ref().map_or(std::ptr::null_mut(), |t| t.borrow().c_tensor), + out_dtype.into().map_or(-1, |s| s.c_int()), + if transpose_result { 1 } else { 0 } + ) + ); + Ok(return_) + } + pub fn f_internal_ctc_loss( log_probs: &Tensor, targets: &Tensor, @@ -2729,6 +2799,22 @@ impl Tensor { Ok(Tensor { c_tensor: c_tensors[0] }) } + pub fn f_internal_functional_assert_scalar>( + self_scalar: S, + assert_msg: &str, + dep_token: &Tensor, + ) -> Result { + let mut c_tensors = [std::ptr::null_mut(); 1]; + unsafe_torch_err!(atg__functional_assert_scalar( + c_tensors.as_mut_ptr(), + self_scalar.into().c_scalar, + assert_msg.as_ptr(), + assert_msg.len() as i32, + dep_token.c_tensor + )); + Ok(Tensor { c_tensor: c_tensors[0] }) + } + pub fn f_internal_functional_sym_constrain_range>( size: S, min: impl Into>, @@ -3311,6 +3397,12 @@ impl Tensor { Ok(return_ != 0) } + pub fn f_internal_lazy_clone(&self) -> Result { + let mut c_tensors = [std::ptr::null_mut(); 1]; + unsafe_torch_err!(atg__lazy_clone(c_tensors.as_mut_ptr(), self.c_tensor)); + Ok(Tensor { c_tensor: c_tensors[0] }) + } + pub fn f_internal_linalg_check_errors( info: &Tensor, api_name: &str, @@ -3392,6 +3484,12 @@ impl Tensor { Ok((Tensor { c_tensor: c_tensors[0] }, Tensor { c_tensor: c_tensors[1] })) } + pub fn f_internal_linalg_eigvals(&self) -> Result { + let mut c_tensors = [std::ptr::null_mut(); 1]; + unsafe_torch_err!(atg__linalg_eigvals(c_tensors.as_mut_ptr(), self.c_tensor)); + Ok(Tensor { c_tensor: c_tensors[0] }) + } + pub fn f_internal_linalg_slogdet( a: &Tensor, ) -> Result<(Tensor, Tensor, Tensor, Tensor), TchError> { @@ -4522,6 +4620,52 @@ impl Tensor { Ok(Tensor { c_tensor: c_tensors[0] }) } + pub fn f_internal_nested_get_jagged_dummy(any: &Tensor) -> Result { + let mut c_tensors = [std::ptr::null_mut(); 1]; + unsafe_torch_err!(atg__nested_get_jagged_dummy(c_tensors.as_mut_ptr(), any.c_tensor)); + Ok(Tensor { c_tensor: c_tensors[0] }) + } + + pub fn f_internal_nested_get_lengths(&self) -> Result { + let mut c_tensors = [std::ptr::null_mut(); 1]; + unsafe_torch_err!(atg__nested_get_lengths(c_tensors.as_mut_ptr(), self.c_tensor)); + Ok(Tensor { c_tensor: c_tensors[0] }) + } + + pub fn f_internal_nested_get_offsets(&self) -> Result { + let mut c_tensors = [std::ptr::null_mut(); 1]; + 
unsafe_torch_err!(atg__nested_get_offsets(c_tensors.as_mut_ptr(), self.c_tensor)); + Ok(Tensor { c_tensor: c_tensors[0] }) + } + + pub fn f_internal_nested_get_ragged_idx(&self) -> Result { + let return_; + unsafe_torch_err!(return_ = atg__nested_get_ragged_idx(self.c_tensor)); + Ok(return_) + } + + pub fn f_internal_nested_get_values(&self) -> Result { + let mut c_tensors = [std::ptr::null_mut(); 1]; + unsafe_torch_err!(atg__nested_get_values(c_tensors.as_mut_ptr(), self.c_tensor)); + Ok(Tensor { c_tensor: c_tensors[0] }) + } + + pub fn f_internal_nested_get_values_copy(&self) -> Result { + let mut c_tensors = [std::ptr::null_mut(); 1]; + unsafe_torch_err!(atg__nested_get_values_copy(c_tensors.as_mut_ptr(), self.c_tensor)); + Ok(Tensor { c_tensor: c_tensors[0] }) + } + + pub fn f_internal_nested_get_values_copy_out(&self, out: &Tensor) -> Result { + let mut c_tensors = [std::ptr::null_mut(); 1]; + unsafe_torch_err!(atg__nested_get_values_copy_out( + c_tensors.as_mut_ptr(), + out.c_tensor, + self.c_tensor + )); + Ok(Tensor { c_tensor: c_tensors[0] }) + } + pub fn f_internal_nested_select_backward( &self, grad_output: &Tensor, @@ -4610,6 +4754,65 @@ impl Tensor { Ok(Tensor { c_tensor: c_tensors[0] }) } + pub fn f_internal_nested_view_from_jagged>( + &self, + offsets: &Tensor, + dummy: &Tensor, + lengths: Option, + ragged_idx: i64, + ) -> Result { + let mut c_tensors = [std::ptr::null_mut(); 1]; + unsafe_torch_err!(atg__nested_view_from_jagged( + c_tensors.as_mut_ptr(), + self.c_tensor, + offsets.c_tensor, + dummy.c_tensor, + lengths.as_ref().map_or(std::ptr::null_mut(), |t| t.borrow().c_tensor), + ragged_idx + )); + Ok(Tensor { c_tensor: c_tensors[0] }) + } + + pub fn f_internal_nested_view_from_jagged_copy>( + &self, + offsets: &Tensor, + dummy: &Tensor, + lengths: Option, + ragged_idx: i64, + ) -> Result { + let mut c_tensors = [std::ptr::null_mut(); 1]; + unsafe_torch_err!(atg__nested_view_from_jagged_copy( + c_tensors.as_mut_ptr(), + self.c_tensor, + offsets.c_tensor, + dummy.c_tensor, + lengths.as_ref().map_or(std::ptr::null_mut(), |t| t.borrow().c_tensor), + ragged_idx + )); + Ok(Tensor { c_tensor: c_tensors[0] }) + } + + pub fn f_internal_nested_view_from_jagged_copy_out>( + &self, + out: &Tensor, + offsets: &Tensor, + dummy: &Tensor, + lengths: Option, + ragged_idx: i64, + ) -> Result { + let mut c_tensors = [std::ptr::null_mut(); 1]; + unsafe_torch_err!(atg__nested_view_from_jagged_copy_out( + c_tensors.as_mut_ptr(), + out.c_tensor, + self.c_tensor, + offsets.c_tensor, + dummy.c_tensor, + lengths.as_ref().map_or(std::ptr::null_mut(), |t| t.borrow().c_tensor), + ragged_idx + )); + Ok(Tensor { c_tensor: c_tensors[0] }) + } + pub fn f_internal_new_zeros_with_same_feature_meta( &self, other: &Tensor, @@ -4882,6 +5085,11 @@ impl Tensor { Ok((Tensor { c_tensor: c_tensors[0] }, Tensor { c_tensor: c_tensors[1] })) } + pub fn f_internal_print(s: &str) -> Result<(), TchError> { + unsafe_torch_err!(atg__print(s.as_ptr(), s.len() as i32)); + Ok(()) + } + pub fn f_internal_propagate_xla_data(&self, output: &Tensor) -> Result<(), TchError> { unsafe_torch_err!(atg__propagate_xla_data(self.c_tensor, output.c_tensor)); Ok(()) @@ -5092,6 +5300,36 @@ impl Tensor { Ok((Tensor { c_tensor: c_tensors[0] }, Tensor { c_tensor: c_tensors[1] })) } + pub fn f_internal_scaled_dot_product_cudnn_attention( + query: &Tensor, + key: &Tensor, + value: &Tensor, + dropout_p: f64, + is_causal: bool, + return_debug_mask: bool, + scale: impl Into>, + ) -> Result<(Tensor, Tensor, Tensor, Tensor), TchError> { + let scale 
= scale.into(); + let mut c_tensors = [std::ptr::null_mut(); 4]; + unsafe_torch_err!(atg__scaled_dot_product_cudnn_attention( + c_tensors.as_mut_ptr(), + query.c_tensor, + key.c_tensor, + value.c_tensor, + dropout_p, + if is_causal { 1 } else { 0 }, + if return_debug_mask { 1 } else { 0 }, + scale.unwrap_or(std::f64::NAN), + scale.is_none() as i8 + )); + Ok(( + Tensor { c_tensor: c_tensors[0] }, + Tensor { c_tensor: c_tensors[1] }, + Tensor { c_tensor: c_tensors[2] }, + Tensor { c_tensor: c_tensors[3] }, + )) + } + pub fn f_internal_scaled_dot_product_efficient_attention>( query: &Tensor, key: &Tensor, @@ -5169,6 +5407,66 @@ impl Tensor { )) } + pub fn f_internal_scaled_dot_product_flash_attention_for_cpu>( + query: &Tensor, + key: &Tensor, + value: &Tensor, + dropout_p: f64, + is_causal: bool, + attn_mask: Option, + scale: impl Into>, + ) -> Result<(Tensor, Tensor), TchError> { + let scale = scale.into(); + let mut c_tensors = [std::ptr::null_mut(); 2]; + unsafe_torch_err!(atg__scaled_dot_product_flash_attention_for_cpu( + c_tensors.as_mut_ptr(), + query.c_tensor, + key.c_tensor, + value.c_tensor, + dropout_p, + if is_causal { 1 } else { 0 }, + attn_mask.as_ref().map_or(std::ptr::null_mut(), |t| t.borrow().c_tensor), + scale.unwrap_or(std::f64::NAN), + scale.is_none() as i8 + )); + Ok((Tensor { c_tensor: c_tensors[0] }, Tensor { c_tensor: c_tensors[1] })) + } + + pub fn f_internal_scaled_dot_product_flash_attention_for_cpu_backward>( + grad_out: &Tensor, + query: &Tensor, + key: &Tensor, + value: &Tensor, + out: &Tensor, + logsumexp: &Tensor, + dropout_p: f64, + is_causal: bool, + attn_mask: Option, + scale: impl Into>, + ) -> Result<(Tensor, Tensor, Tensor), TchError> { + let scale = scale.into(); + let mut c_tensors = [std::ptr::null_mut(); 3]; + unsafe_torch_err!(atg__scaled_dot_product_flash_attention_for_cpu_backward( + c_tensors.as_mut_ptr(), + grad_out.c_tensor, + query.c_tensor, + key.c_tensor, + value.c_tensor, + out.c_tensor, + logsumexp.c_tensor, + dropout_p, + if is_causal { 1 } else { 0 }, + attn_mask.as_ref().map_or(std::ptr::null_mut(), |t| t.borrow().c_tensor), + scale.unwrap_or(std::f64::NAN), + scale.is_none() as i8 + )); + Ok(( + Tensor { c_tensor: c_tensors[0] }, + Tensor { c_tensor: c_tensors[1] }, + Tensor { c_tensor: c_tensors[2] }, + )) + } + pub fn f_internal_scaled_mm>( &self, mat2: &Tensor, @@ -6037,6 +6335,7 @@ impl Tensor { meta: &Tensor, bias: Option, activation: &str, + out_dtype: impl Into>, ) -> Result { let mut c_tensors = [std::ptr::null_mut(); 1]; unsafe_torch_err!(atg__sparse_semi_structured_linear( @@ -6046,7 +6345,8 @@ impl Tensor { meta.c_tensor, bias.as_ref().map_or(std::ptr::null_mut(), |t| t.borrow().c_tensor), activation.as_ptr(), - activation.len() as i32 + activation.len() as i32, + out_dtype.into().map_or(-1, |s| s.c_int()) )); Ok(Tensor { c_tensor: c_tensors[0] }) } @@ -6579,6 +6879,21 @@ impl Tensor { Ok(Tensor { c_tensor: c_tensors[0] }) } + pub fn f_internal_test_parallel_materialize( + &self, + num_parallel: i64, + skip_first: bool, + ) -> Result { + let mut c_tensors = [std::ptr::null_mut(); 1]; + unsafe_torch_err!(atg__test_parallel_materialize( + c_tensors.as_mut_ptr(), + self.c_tensor, + num_parallel, + if skip_first { 1 } else { 0 } + )); + Ok(Tensor { c_tensor: c_tensors[0] }) + } + pub fn f_internal_test_serialization_subcmul( &self, other: &Tensor, @@ -8160,6 +8475,21 @@ impl Tensor { Ok(Tensor { c_tensor: c_tensors[0] }) } + pub fn f_internal_weight_int8pack_mm( + &self, + mat2: &Tensor, + scales: &Tensor, + ) -> Result { + let 
mut c_tensors = [std::ptr::null_mut(); 1]; + unsafe_torch_err!(atg__weight_int8pack_mm( + c_tensors.as_mut_ptr(), + self.c_tensor, + mat2.c_tensor, + scales.c_tensor + )); + Ok(Tensor { c_tensor: c_tensors[0] }) + } + pub fn f_internal_weight_norm(v: &Tensor, g: &Tensor, dim: i64) -> Result { let mut c_tensors = [std::ptr::null_mut(); 1]; unsafe_torch_err!(atg__weight_norm(c_tensors.as_mut_ptr(), v.c_tensor, g.c_tensor, dim)); @@ -30749,6 +31079,31 @@ impl Tensor { Ok(Tensor { c_tensor: c_tensors[0] }) } + pub fn f_slice_inverse( + &self, + src: &Tensor, + dim: i64, + start: impl Into>, + end: impl Into>, + step: i64, + ) -> Result { + let start = start.into(); + let end = end.into(); + let mut c_tensors = [std::ptr::null_mut(); 1]; + unsafe_torch_err!(atg_slice_inverse( + c_tensors.as_mut_ptr(), + self.c_tensor, + src.c_tensor, + dim, + start.unwrap_or(0i64), + start.is_none() as i8, + end.unwrap_or(0i64), + end.is_none() as i8, + step + )); + Ok(Tensor { c_tensor: c_tensors[0] }) + } + pub fn f_slice_scatter( &self, src: &Tensor, diff --git a/src/wrappers/tensor_generated.rs b/src/wrappers/tensor_generated.rs index a4471129..56a2217f 100644 --- a/src/wrappers/tensor_generated.rs +++ b/src/wrappers/tensor_generated.rs @@ -275,6 +275,10 @@ impl Tensor { .unwrap() } + pub fn internal_assert_scalar>(self_scalar: S, assert_msg: &str) { + Tensor::f_internal_assert_scalar(self_scalar, assert_msg).unwrap() + } + pub fn internal_assert_tensor_metadata( a: &Tensor, size: impl IntListOption, @@ -374,6 +378,23 @@ impl Tensor { self.f_internal_cholesky_solve_helper_out(out, a, upper).unwrap() } + pub fn internal_chunk_cat>( + tensors: &[T], + dim: i64, + num_chunks: i64, + ) -> Tensor { + Tensor::f_internal_chunk_cat(tensors, dim, num_chunks).unwrap() + } + + pub fn internal_chunk_cat_out>( + out: &Tensor, + tensors: &[T], + dim: i64, + num_chunks: i64, + ) -> Tensor { + Tensor::f_internal_chunk_cat_out(out, tensors, dim, num_chunks).unwrap() + } + pub fn internal_coalesce(&self) -> Tensor { self.f_internal_coalesce().unwrap() } @@ -645,6 +666,7 @@ impl Tensor { alpha: Option, out_dtype: impl Into>, transpose_result: bool, + alg_id: i64, ) -> Tensor { Tensor::f_internal_cslt_sparse_mm( compressed_a, @@ -653,6 +675,26 @@ impl Tensor { alpha, out_dtype, transpose_result, + alg_id, + ) + .unwrap() + } + + pub fn internal_cslt_sparse_mm_search>( + compressed_a: &Tensor, + dense_b: &Tensor, + bias: Option, + alpha: Option, + out_dtype: impl Into>, + transpose_result: bool, + ) -> i64 { + Tensor::f_internal_cslt_sparse_mm_search( + compressed_a, + dense_b, + bias, + alpha, + out_dtype, + transpose_result, ) .unwrap() } @@ -1727,6 +1769,14 @@ impl Tensor { self.f_internal_functional_assert_async(assert_msg, dep_token).unwrap() } + pub fn internal_functional_assert_scalar>( + self_scalar: S, + assert_msg: &str, + dep_token: &Tensor, + ) -> Tensor { + Tensor::f_internal_functional_assert_scalar(self_scalar, assert_msg, dep_token).unwrap() + } + pub fn internal_functional_sym_constrain_range>( size: S, min: impl Into>, @@ -2079,6 +2129,10 @@ impl Tensor { self.f_internal_is_zerotensor().unwrap() } + pub fn internal_lazy_clone(&self) -> Tensor { + self.f_internal_lazy_clone().unwrap() + } + pub fn internal_linalg_check_errors(info: &Tensor, api_name: &str, is_matrix: bool) { Tensor::f_internal_linalg_check_errors(info, api_name, is_matrix).unwrap() } @@ -2111,6 +2165,10 @@ impl Tensor { .unwrap() } + pub fn internal_linalg_eigvals(&self) -> Tensor { + self.f_internal_linalg_eigvals().unwrap() + } + pub fn 
internal_linalg_slogdet(a: &Tensor) -> (Tensor, Tensor, Tensor, Tensor) { Tensor::f_internal_linalg_slogdet(a).unwrap() } @@ -2755,6 +2813,34 @@ impl Tensor { .unwrap() } + pub fn internal_nested_get_jagged_dummy(any: &Tensor) -> Tensor { + Tensor::f_internal_nested_get_jagged_dummy(any).unwrap() + } + + pub fn internal_nested_get_lengths(&self) -> Tensor { + self.f_internal_nested_get_lengths().unwrap() + } + + pub fn internal_nested_get_offsets(&self) -> Tensor { + self.f_internal_nested_get_offsets().unwrap() + } + + pub fn internal_nested_get_ragged_idx(&self) -> i64 { + self.f_internal_nested_get_ragged_idx().unwrap() + } + + pub fn internal_nested_get_values(&self) -> Tensor { + self.f_internal_nested_get_values().unwrap() + } + + pub fn internal_nested_get_values_copy(&self) -> Tensor { + self.f_internal_nested_get_values_copy().unwrap() + } + + pub fn internal_nested_get_values_copy_out(&self, out: &Tensor) -> Tensor { + self.f_internal_nested_get_values_copy_out(out).unwrap() + } + pub fn internal_nested_select_backward( &self, grad_output: &Tensor, @@ -2802,6 +2888,38 @@ impl Tensor { .unwrap() } + pub fn internal_nested_view_from_jagged>( + &self, + offsets: &Tensor, + dummy: &Tensor, + lengths: Option, + ragged_idx: i64, + ) -> Tensor { + self.f_internal_nested_view_from_jagged(offsets, dummy, lengths, ragged_idx).unwrap() + } + + pub fn internal_nested_view_from_jagged_copy>( + &self, + offsets: &Tensor, + dummy: &Tensor, + lengths: Option, + ragged_idx: i64, + ) -> Tensor { + self.f_internal_nested_view_from_jagged_copy(offsets, dummy, lengths, ragged_idx).unwrap() + } + + pub fn internal_nested_view_from_jagged_copy_out>( + &self, + out: &Tensor, + offsets: &Tensor, + dummy: &Tensor, + lengths: Option, + ragged_idx: i64, + ) -> Tensor { + self.f_internal_nested_view_from_jagged_copy_out(out, offsets, dummy, lengths, ragged_idx) + .unwrap() + } + pub fn internal_new_zeros_with_same_feature_meta( &self, other: &Tensor, @@ -2941,6 +3059,10 @@ impl Tensor { self.f_internal_prelu_kernel_backward(grad_output, weight).unwrap() } + pub fn internal_print(s: &str) { + Tensor::f_internal_print(s).unwrap() + } + pub fn internal_propagate_xla_data(&self, output: &Tensor) { self.f_internal_propagate_xla_data(output).unwrap() } @@ -3034,6 +3156,27 @@ impl Tensor { .unwrap() } + pub fn internal_scaled_dot_product_cudnn_attention( + query: &Tensor, + key: &Tensor, + value: &Tensor, + dropout_p: f64, + is_causal: bool, + return_debug_mask: bool, + scale: impl Into>, + ) -> (Tensor, Tensor, Tensor, Tensor) { + Tensor::f_internal_scaled_dot_product_cudnn_attention( + query, + key, + value, + dropout_p, + is_causal, + return_debug_mask, + scale, + ) + .unwrap() + } + pub fn internal_scaled_dot_product_efficient_attention>( query: &Tensor, key: &Tensor, @@ -3094,6 +3237,39 @@ impl Tensor { .unwrap() } + pub fn internal_scaled_dot_product_flash_attention_for_cpu>( + query: &Tensor, + key: &Tensor, + value: &Tensor, + dropout_p: f64, + is_causal: bool, + attn_mask: Option, + scale: impl Into>, + ) -> (Tensor, Tensor) { + Tensor::f_internal_scaled_dot_product_flash_attention_for_cpu( + query, key, value, dropout_p, is_causal, attn_mask, scale, + ) + .unwrap() + } + + pub fn internal_scaled_dot_product_flash_attention_for_cpu_backward>( + grad_out: &Tensor, + query: &Tensor, + key: &Tensor, + value: &Tensor, + out: &Tensor, + logsumexp: &Tensor, + dropout_p: f64, + is_causal: bool, + attn_mask: Option, + scale: impl Into>, + ) -> (Tensor, Tensor, Tensor) { + 
Tensor::f_internal_scaled_dot_product_flash_attention_for_cpu_backward( + grad_out, query, key, value, out, logsumexp, dropout_p, is_causal, attn_mask, scale, + ) + .unwrap() + } + pub fn internal_scaled_mm>( &self, mat2: &Tensor, @@ -3596,8 +3772,10 @@ impl Tensor { meta: &Tensor, bias: Option, activation: &str, + out_dtype: impl Into>, ) -> Tensor { - self.f_internal_sparse_semi_structured_linear(weight, meta, bias, activation).unwrap() + self.f_internal_sparse_semi_structured_linear(weight, meta, bias, activation, out_dtype) + .unwrap() } pub fn internal_sparse_softmax(&self, dim: i64, half_to_float: bool) -> Tensor { @@ -3806,6 +3984,14 @@ impl Tensor { Tensor::f_internal_test_optional_intlist_out(out, values, addends).unwrap() } + pub fn internal_test_parallel_materialize( + &self, + num_parallel: i64, + skip_first: bool, + ) -> Tensor { + self.f_internal_test_parallel_materialize(num_parallel, skip_first).unwrap() + } + pub fn internal_test_serialization_subcmul(&self, other: &Tensor) -> Tensor { self.f_internal_test_serialization_subcmul(other).unwrap() } @@ -4708,6 +4894,10 @@ impl Tensor { self.f_internal_weight_int4pack_mm(mat2, qgroupsize, qscaleandzeros).unwrap() } + pub fn internal_weight_int8pack_mm(&self, mat2: &Tensor, scales: &Tensor) -> Tensor { + self.f_internal_weight_int8pack_mm(mat2, scales).unwrap() + } + pub fn internal_weight_norm(v: &Tensor, g: &Tensor, dim: i64) -> Tensor { Tensor::f_internal_weight_norm(v, g, dim).unwrap() } @@ -15651,6 +15841,17 @@ impl Tensor { self.f_slice_copy_tensor_out(out, dim, start, end, step).unwrap() } + pub fn slice_inverse( + &self, + src: &Tensor, + dim: i64, + start: impl Into>, + end: impl Into>, + step: i64, + ) -> Tensor { + self.f_slice_inverse(src, dim, start, end, step).unwrap() + } + pub fn slice_scatter( &self, src: &Tensor, diff --git a/torch-sys/Cargo.toml b/torch-sys/Cargo.toml index 5990458d..b23bf0e9 100644 --- a/torch-sys/Cargo.toml +++ b/torch-sys/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "torch-sys" -version = "0.15.0" +version = "0.16.0" authors = ["Laurent Mazare "] edition = "2021" build = "build.rs" diff --git a/torch-sys/build.rs b/torch-sys/build.rs index c66087ca..637bb50e 100644 --- a/torch-sys/build.rs +++ b/torch-sys/build.rs @@ -10,7 +10,7 @@ use anyhow::{Context, Result}; use std::path::{Path, PathBuf}; use std::{env, fs, io}; -const TORCH_VERSION: &str = "2.2.0"; +const TORCH_VERSION: &str = "2.3.0"; const PYTHON_PRINT_PYTORCH_DETAILS: &str = r" import torch from torch.utils import cpp_extension @@ -158,7 +158,7 @@ fn version_check(version: &str) -> Result<()> { return Ok(()); } let version = version.trim(); - // Typical version number is 2.2.0+cpu or 2.2.0+cu121 + // Typical version number is 2.3.0+cpu or 2.3.0+cu121 let version = match version.split_once('+') { None => version, Some((version, _)) => version, diff --git a/torch-sys/libtch/torch_api_generated.cpp b/torch-sys/libtch/torch_api_generated.cpp index bbf5d251..b54d6b0c 100644 --- a/torch-sys/libtch/torch_api_generated.cpp +++ b/torch-sys/libtch/torch_api_generated.cpp @@ -342,6 +342,12 @@ void atg__amp_update_scale_out(tensor *out__, tensor out, tensor self, tensor gr ) } +void atg__assert_scalar(scalar self_scalar, char* assert_msg_ptr, int assert_msg_len) { + PROTECT( + torch::_assert_scalar(*self_scalar, std::string(assert_msg_ptr, assert_msg_len)); + ) +} + void atg__assert_tensor_metadata(tensor a, int64_t *size_data, int size_len, int64_t *stride_data, int stride_len, int dtype) { PROTECT( torch::_assert_tensor_metadata(*a, size_data 
== nullptr ? c10::nullopt : c10::optional(torch::IntArrayRef(size_data, size_len)), stride_data == nullptr ? c10::nullopt : c10::optional(torch::IntArrayRef(stride_data, stride_len)), dtype < 0 ? c10::nullopt : c10::optional(at::ScalarType(dtype))); @@ -446,6 +452,20 @@ void atg__cholesky_solve_helper_out(tensor *out__, tensor out, tensor self, tens ) } +void atg__chunk_cat(tensor *out__, tensor *tensors_data, int tensors_len, int64_t dim, int64_t num_chunks) { + PROTECT( + auto outputs__ = torch::_chunk_cat(of_carray_tensor(tensors_data, tensors_len), dim, num_chunks); + out__[0] = new torch::Tensor(outputs__); + ) +} + +void atg__chunk_cat_out(tensor *out__, tensor out, tensor *tensors_data, int tensors_len, int64_t dim, int64_t num_chunks) { + PROTECT( + auto outputs__ = torch::_chunk_cat_out(*out, of_carray_tensor(tensors_data, tensors_len), dim, num_chunks); + out__[0] = new torch::Tensor(outputs__); + ) +} + void atg__coalesce(tensor *out__, tensor self) { PROTECT( auto outputs__ = torch::_coalesce(*self); @@ -642,13 +662,20 @@ void atg__cslt_compress(tensor *out__, tensor input) { ) } -void atg__cslt_sparse_mm(tensor *out__, tensor compressed_A, tensor dense_B, tensor bias, tensor alpha, int out_dtype, int transpose_result) { +void atg__cslt_sparse_mm(tensor *out__, tensor compressed_A, tensor dense_B, tensor bias, tensor alpha, int out_dtype, int transpose_result, int64_t alg_id) { PROTECT( - auto outputs__ = torch::_cslt_sparse_mm(*compressed_A, *dense_B, (bias ? *bias : torch::Tensor()), (alpha ? *alpha : torch::Tensor()), out_dtype < 0 ? c10::nullopt : c10::optional(at::ScalarType(out_dtype)), (bool)transpose_result); + auto outputs__ = torch::_cslt_sparse_mm(*compressed_A, *dense_B, (bias ? *bias : torch::Tensor()), (alpha ? *alpha : torch::Tensor()), out_dtype < 0 ? c10::nullopt : c10::optional(at::ScalarType(out_dtype)), (bool)transpose_result, alg_id); out__[0] = new torch::Tensor(outputs__); ) } +int64_t atg__cslt_sparse_mm_search(tensor compressed_A, tensor dense_B, tensor bias, tensor alpha, int out_dtype, int transpose_result) { + PROTECT( + return torch::_cslt_sparse_mm_search(*compressed_A, *dense_B, (bias ? *bias : torch::Tensor()), (alpha ? *alpha : torch::Tensor()), out_dtype < 0 ? c10::nullopt : c10::optional(at::ScalarType(out_dtype)), (bool)transpose_result); + ) + return 0; +} + void atg__ctc_loss(tensor *out__, tensor log_probs, tensor targets, int64_t *input_lengths_data, int input_lengths_len, int64_t *target_lengths_data, int target_lengths_len, int64_t blank, int zero_infinity) { PROTECT( auto outputs__ = torch::_ctc_loss(*log_probs, *targets, torch::IntArrayRef(input_lengths_data, input_lengths_len), torch::IntArrayRef(target_lengths_data, target_lengths_len), blank, (bool)zero_infinity); @@ -1107,6 +1134,13 @@ void atg__functional_assert_async(tensor *out__, tensor self, char* assert_msg_p ) } +void atg__functional_assert_scalar(tensor *out__, scalar self_scalar, char* assert_msg_ptr, int assert_msg_len, tensor dep_token) { + PROTECT( + auto outputs__ = torch::_functional_assert_scalar(*self_scalar, std::string(assert_msg_ptr, assert_msg_len), *dep_token); + out__[0] = new torch::Tensor(outputs__); + ) +} + void atg__functional_sym_constrain_range(tensor *out__, scalar size, int64_t min_v, uint8_t min_null, int64_t max_v, uint8_t max_null, tensor dep_token) { PROTECT( auto outputs__ = torch::_functional_sym_constrain_range(*size, min_null ? c10::nullopt : c10::optional(min_v), max_null ? 
c10::nullopt : c10::optional(max_v), *dep_token); @@ -1360,6 +1394,13 @@ int atg__is_zerotensor(tensor self) { return 0; } +void atg__lazy_clone(tensor *out__, tensor self) { + PROTECT( + auto outputs__ = torch::_lazy_clone(*self); + out__[0] = new torch::Tensor(outputs__); + ) +} + void atg__linalg_check_errors(tensor info, char* api_name_ptr, int api_name_len, int is_matrix) { PROTECT( torch::_linalg_check_errors(*info, std::string(api_name_ptr, api_name_len), (bool)is_matrix); @@ -1400,6 +1441,13 @@ void atg__linalg_eigh_eigenvalues(tensor *out__, tensor eigenvalues, tensor eige ) } +void atg__linalg_eigvals(tensor *out__, tensor self) { + PROTECT( + auto outputs__ = torch::_linalg_eigvals(*self); + out__[0] = new torch::Tensor(outputs__); + ) +} + void atg__linalg_slogdet(tensor *out__, tensor A) { PROTECT( auto outputs__ = torch::_linalg_slogdet(*A); @@ -1831,6 +1879,55 @@ void atg__nested_from_padded_out(tensor *out__, tensor out, tensor padded, tenso ) } +void atg__nested_get_jagged_dummy(tensor *out__, tensor any) { + PROTECT( + auto outputs__ = torch::_nested_get_jagged_dummy(*any); + out__[0] = new torch::Tensor(outputs__); + ) +} + +void atg__nested_get_lengths(tensor *out__, tensor self) { + PROTECT( + auto outputs__ = torch::_nested_get_lengths(*self); + out__[0] = new torch::Tensor(outputs__); + ) +} + +void atg__nested_get_offsets(tensor *out__, tensor self) { + PROTECT( + auto outputs__ = torch::_nested_get_offsets(*self); + out__[0] = new torch::Tensor(outputs__); + ) +} + +int64_t atg__nested_get_ragged_idx(tensor self) { + PROTECT( + return torch::_nested_get_ragged_idx(*self); + ) + return 0; +} + +void atg__nested_get_values(tensor *out__, tensor self) { + PROTECT( + auto outputs__ = torch::_nested_get_values(*self); + out__[0] = new torch::Tensor(outputs__); + ) +} + +void atg__nested_get_values_copy(tensor *out__, tensor self) { + PROTECT( + auto outputs__ = torch::_nested_get_values_copy(*self); + out__[0] = new torch::Tensor(outputs__); + ) +} + +void atg__nested_get_values_copy_out(tensor *out__, tensor out, tensor self) { + PROTECT( + auto outputs__ = torch::_nested_get_values_copy_out(*out, *self); + out__[0] = new torch::Tensor(outputs__); + ) +} + void atg__nested_select_backward(tensor *out__, tensor grad_output, tensor self, int64_t dim, int64_t index) { PROTECT( auto outputs__ = torch::_nested_select_backward(*grad_output, *self, dim, index); @@ -1866,6 +1963,27 @@ void atg__nested_view_from_buffer_copy_out(tensor *out__, tensor out, tensor sel ) } +void atg__nested_view_from_jagged(tensor *out__, tensor self, tensor offsets, tensor dummy, tensor lengths, int64_t ragged_idx) { + PROTECT( + auto outputs__ = torch::_nested_view_from_jagged(*self, *offsets, *dummy, (lengths ? *lengths : torch::Tensor()), ragged_idx); + out__[0] = new torch::Tensor(outputs__); + ) +} + +void atg__nested_view_from_jagged_copy(tensor *out__, tensor self, tensor offsets, tensor dummy, tensor lengths, int64_t ragged_idx) { + PROTECT( + auto outputs__ = torch::_nested_view_from_jagged_copy(*self, *offsets, *dummy, (lengths ? *lengths : torch::Tensor()), ragged_idx); + out__[0] = new torch::Tensor(outputs__); + ) +} + +void atg__nested_view_from_jagged_copy_out(tensor *out__, tensor out, tensor self, tensor offsets, tensor dummy, tensor lengths, int64_t ragged_idx) { + PROTECT( + auto outputs__ = torch::_nested_view_from_jagged_copy_out(*out, *self, *offsets, *dummy, (lengths ? 
*lengths : torch::Tensor()), ragged_idx); + out__[0] = new torch::Tensor(outputs__); + ) +} + void atg__new_zeros_with_same_feature_meta(tensor *out__, tensor self, tensor other, int64_t self_num_batch_dims) { PROTECT( auto outputs__ = torch::_new_zeros_with_same_feature_meta(*self, *other, self_num_batch_dims); @@ -1996,6 +2114,12 @@ void atg__prelu_kernel_backward(tensor *out__, tensor grad_output, tensor self, ) } +void atg__print(char* s_ptr, int s_len) { + PROTECT( + torch::_print(std::string(s_ptr, s_len)); + ) +} + void atg__propagate_xla_data(tensor input, tensor output) { PROTECT( torch::_propagate_xla_data(*input, *output); @@ -2102,6 +2226,16 @@ void atg__scaled_dot_product_attention_math(tensor *out__, tensor query, tensor ) } +void atg__scaled_dot_product_cudnn_attention(tensor *out__, tensor query, tensor key, tensor value, double dropout_p, int is_causal, int return_debug_mask, double scale_v, uint8_t scale_null) { + PROTECT( + auto outputs__ = torch::_scaled_dot_product_cudnn_attention(*query, *key, *value, dropout_p, (bool)is_causal, (bool)return_debug_mask, scale_null ? c10::nullopt : c10::optional(scale_v)); + out__[0] = new torch::Tensor(std::get<0>(outputs__)); + out__[1] = new torch::Tensor(std::get<1>(outputs__)); + out__[2] = new torch::Tensor(std::get<2>(outputs__)); + out__[3] = new torch::Tensor(std::get<3>(outputs__)); + ) +} + void atg__scaled_dot_product_efficient_attention(tensor *out__, tensor query, tensor key, tensor value, tensor attn_bias, int compute_log_sumexp, double dropout_p, int is_causal, double scale_v, uint8_t scale_null) { PROTECT( auto outputs__ = torch::_scaled_dot_product_efficient_attention(*query, *key, *value, (attn_bias ? *attn_bias : torch::Tensor()), (bool)compute_log_sumexp, dropout_p, (bool)is_causal, scale_null ? c10::nullopt : c10::optional(scale_v)); @@ -2121,6 +2255,23 @@ void atg__scaled_dot_product_flash_attention_backward(tensor *out__, tensor grad ) } +void atg__scaled_dot_product_flash_attention_for_cpu(tensor *out__, tensor query, tensor key, tensor value, double dropout_p, int is_causal, tensor attn_mask, double scale_v, uint8_t scale_null) { + PROTECT( + auto outputs__ = torch::_scaled_dot_product_flash_attention_for_cpu(*query, *key, *value, dropout_p, (bool)is_causal, (attn_mask ? *attn_mask : torch::Tensor()), scale_null ? c10::nullopt : c10::optional(scale_v)); + out__[0] = new torch::Tensor(std::get<0>(outputs__)); + out__[1] = new torch::Tensor(std::get<1>(outputs__)); + ) +} + +void atg__scaled_dot_product_flash_attention_for_cpu_backward(tensor *out__, tensor grad_out, tensor query, tensor key, tensor value, tensor out, tensor logsumexp, double dropout_p, int is_causal, tensor attn_mask, double scale_v, uint8_t scale_null) { + PROTECT( + auto outputs__ = torch::_scaled_dot_product_flash_attention_for_cpu_backward(*grad_out, *query, *key, *value, *out, *logsumexp, dropout_p, (bool)is_causal, (attn_mask ? *attn_mask : torch::Tensor()), scale_null ? c10::nullopt : c10::optional(scale_v)); + out__[0] = new torch::Tensor(std::get<0>(outputs__)); + out__[1] = new torch::Tensor(std::get<1>(outputs__)); + out__[2] = new torch::Tensor(std::get<2>(outputs__)); + ) +} + void atg__scaled_mm(tensor *out__, tensor self, tensor mat2, tensor bias, int out_dtype, tensor scale_a, tensor scale_b, tensor scale_result, int use_fast_accum) { PROTECT( auto outputs__ = torch::_scaled_mm(*self, *mat2, (bias ? *bias : torch::Tensor()), out_dtype < 0 ? c10::nullopt : c10::optional(at::ScalarType(out_dtype)), (scale_a ? 
*scale_a : torch::Tensor()), (scale_b ? *scale_b : torch::Tensor()), (scale_result ? *scale_result : torch::Tensor()), (bool)use_fast_accum); @@ -2449,9 +2600,9 @@ void atg__sparse_mm_reduce_impl(tensor *out__, tensor self, tensor other, char* ) } -void atg__sparse_semi_structured_linear(tensor *out__, tensor input, tensor weight, tensor meta, tensor bias, char* activation_ptr, int activation_len) { +void atg__sparse_semi_structured_linear(tensor *out__, tensor input, tensor weight, tensor meta, tensor bias, char* activation_ptr, int activation_len, int out_dtype) { PROTECT( - auto outputs__ = torch::_sparse_semi_structured_linear(*input, *weight, *meta, (bias ? *bias : torch::Tensor()), std::string(activation_ptr, activation_len)); + auto outputs__ = torch::_sparse_semi_structured_linear(*input, *weight, *meta, (bias ? *bias : torch::Tensor()), std::string(activation_ptr, activation_len), out_dtype < 0 ? c10::nullopt : c10::optional(at::ScalarType(out_dtype))); out__[0] = new torch::Tensor(outputs__); ) } @@ -2729,6 +2880,13 @@ void atg__test_optional_intlist_out(tensor *out__, tensor out, tensor values, in ) } +void atg__test_parallel_materialize(tensor *out__, tensor self, int64_t num_parallel, int skip_first) { + PROTECT( + auto outputs__ = torch::_test_parallel_materialize(*self, num_parallel, (bool)skip_first); + out__[0] = new torch::Tensor(outputs__); + ) +} + void atg__test_serialization_subcmul(tensor *out__, tensor self, tensor other) { PROTECT( auto outputs__ = torch::_test_serialization_subcmul(*self, *other); @@ -3301,6 +3459,13 @@ void atg__weight_int4pack_mm(tensor *out__, tensor self, tensor mat2, int64_t qG ) } +void atg__weight_int8pack_mm(tensor *out__, tensor self, tensor mat2, tensor scales) { + PROTECT( + auto outputs__ = torch::_weight_int8pack_mm(*self, *mat2, *scales); + out__[0] = new torch::Tensor(outputs__); + ) +} + void atg__weight_norm(tensor *out__, tensor v, tensor g, int64_t dim) { PROTECT( auto outputs__ = torch::_weight_norm(*v, *g, dim); @@ -14954,6 +15119,13 @@ void atg_slice_copy_tensor_out(tensor *out__, tensor out, tensor self, int64_t d ) } +void atg_slice_inverse(tensor *out__, tensor self, tensor src, int64_t dim, int64_t start_v, uint8_t start_null, int64_t end_v, uint8_t end_null, int64_t step) { + PROTECT( + auto outputs__ = torch::slice_inverse(*self, *src, dim, start_null ? c10::nullopt : c10::optional(start_v), end_null ? c10::nullopt : c10::optional(end_v), step); + out__[0] = new torch::Tensor(outputs__); + ) +} + void atg_slice_scatter(tensor *out__, tensor self, tensor src, int64_t dim, int64_t start_v, uint8_t start_null, int64_t end_v, uint8_t end_null, int64_t step) { PROTECT( auto outputs__ = torch::slice_scatter(*self, *src, dim, start_null ? c10::nullopt : c10::optional(start_v), end_null ? 
c10::nullopt : c10::optional(end_v), step); diff --git a/torch-sys/libtch/torch_api_generated.h b/torch-sys/libtch/torch_api_generated.h index 97beb08c..da3bc167 100644 --- a/torch-sys/libtch/torch_api_generated.h +++ b/torch-sys/libtch/torch_api_generated.h @@ -50,6 +50,7 @@ void atg__aminmax_out(tensor *, tensor out0, tensor out1, tensor self); void atg__amp_update_scale(tensor *, tensor self, tensor growth_tracker, tensor found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval); void atg__amp_update_scale_(tensor *, tensor self, tensor growth_tracker, tensor found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval); void atg__amp_update_scale_out(tensor *, tensor out, tensor self, tensor growth_tracker, tensor found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval); +void atg__assert_scalar(scalar self_scalar, char* assert_msg_ptr, int assert_msg_len); void atg__assert_tensor_metadata(tensor a, int64_t *size_data, int size_len, int64_t *stride_data, int stride_len, int dtype); void atg__autocast_to_full_precision(tensor *, tensor self, int cuda_enabled, int cpu_enabled); void atg__autocast_to_reduced_precision(tensor *, tensor self, int cuda_enabled, int cpu_enabled, int cuda_dtype, int cpu_dtype); @@ -65,6 +66,8 @@ void atg__cdist_backward(tensor *, tensor grad, tensor x1, tensor x2, double p, void atg__cdist_backward_out(tensor *, tensor out, tensor grad, tensor x1, tensor x2, double p, tensor cdist); void atg__cholesky_solve_helper(tensor *, tensor self, tensor A, int upper); void atg__cholesky_solve_helper_out(tensor *, tensor out, tensor self, tensor A, int upper); +void atg__chunk_cat(tensor *, tensor *tensors_data, int tensors_len, int64_t dim, int64_t num_chunks); +void atg__chunk_cat_out(tensor *, tensor out, tensor *tensors_data, int tensors_len, int64_t dim, int64_t num_chunks); void atg__coalesce(tensor *, tensor self); void atg__coalesce_out(tensor *, tensor out, tensor self); void atg__coalesced(tensor *, tensor self, int coalesced); @@ -93,7 +96,8 @@ void atg__copy_from_and_resize(tensor *, tensor self, tensor dst); void atg__copy_from_and_resize_out(tensor *, tensor out, tensor self, tensor dst); void atg__copy_from_out(tensor *, tensor out, tensor self, tensor dst, int non_blocking); void atg__cslt_compress(tensor *, tensor input); -void atg__cslt_sparse_mm(tensor *, tensor compressed_A, tensor dense_B, tensor bias, tensor alpha, int out_dtype, int transpose_result); +void atg__cslt_sparse_mm(tensor *, tensor compressed_A, tensor dense_B, tensor bias, tensor alpha, int out_dtype, int transpose_result, int64_t alg_id); +int64_t atg__cslt_sparse_mm_search(tensor compressed_A, tensor dense_B, tensor bias, tensor alpha, int out_dtype, int transpose_result); void atg__ctc_loss(tensor *, tensor log_probs, tensor targets, int64_t *input_lengths_data, int input_lengths_len, int64_t *target_lengths_data, int target_lengths_len, int64_t blank, int zero_infinity); void atg__ctc_loss_backward(tensor *, tensor grad, tensor log_probs, tensor targets, int64_t *input_lengths_data, int input_lengths_len, int64_t *target_lengths_data, int target_lengths_len, tensor neg_log_likelihood, tensor log_alpha, int64_t blank, int zero_infinity); void atg__ctc_loss_backward_out(tensor *, tensor out, tensor grad, tensor log_probs, tensor targets, int64_t *input_lengths_data, int input_lengths_len, int64_t *target_lengths_data, int target_lengths_len, tensor neg_log_likelihood, tensor 
log_alpha, int64_t blank, int zero_infinity); @@ -154,6 +158,7 @@ void atg__flash_attention_backward(tensor *, tensor grad_out, tensor query, tens void atg__foobar(tensor *, tensor self, int arg1, int arg2, int arg3); void atg__foobar_out(tensor *, tensor out, tensor self, int arg1, int arg2, int arg3); void atg__functional_assert_async(tensor *, tensor self, char* assert_msg_ptr, int assert_msg_len, tensor dep_token); +void atg__functional_assert_scalar(tensor *, scalar self_scalar, char* assert_msg_ptr, int assert_msg_len, tensor dep_token); void atg__functional_sym_constrain_range(tensor *, scalar size, int64_t min_v, uint8_t min_null, int64_t max_v, uint8_t max_null, tensor dep_token); void atg__functional_sym_constrain_range_for_size(tensor *, scalar size, int64_t min_v, uint8_t min_null, int64_t max_v, uint8_t max_null, tensor dep_token); void atg__fused_dropout(tensor *, tensor self, double p); @@ -188,11 +193,13 @@ void atg__int_mm_out(tensor *, tensor out, tensor self, tensor mat2); void atg__is_all_true(tensor *, tensor self); void atg__is_any_true(tensor *, tensor self); int atg__is_zerotensor(tensor self); +void atg__lazy_clone(tensor *, tensor self); void atg__linalg_check_errors(tensor info, char* api_name_ptr, int api_name_len, int is_matrix); void atg__linalg_det(tensor *, tensor A); void atg__linalg_det_result(tensor *, tensor result, tensor LU, tensor pivots, tensor A); void atg__linalg_eigh(tensor *, tensor A, char* UPLO_ptr, int UPLO_len, int compute_v); void atg__linalg_eigh_eigenvalues(tensor *, tensor eigenvalues, tensor eigenvectors, tensor A, char* UPLO_ptr, int UPLO_len, int compute_v); +void atg__linalg_eigvals(tensor *, tensor self); void atg__linalg_slogdet(tensor *, tensor A); void atg__linalg_slogdet_sign(tensor *, tensor sign, tensor logabsdet, tensor LU, tensor pivots, tensor A); void atg__linalg_solve_ex(tensor *, tensor A, tensor B, int left, int check_errors); @@ -248,11 +255,21 @@ void atg__nested_from_padded(tensor *, tensor padded, tensor cpu_nested_shape_ex void atg__nested_from_padded_and_nested_example(tensor *, tensor padded, tensor nt_example); void atg__nested_from_padded_and_nested_example_out(tensor *, tensor out, tensor padded, tensor nt_example); void atg__nested_from_padded_out(tensor *, tensor out, tensor padded, tensor cpu_nested_shape_example, int fuse_transform_0213); +void atg__nested_get_jagged_dummy(tensor *, tensor any); +void atg__nested_get_lengths(tensor *, tensor self); +void atg__nested_get_offsets(tensor *, tensor self); +int64_t atg__nested_get_ragged_idx(tensor self); +void atg__nested_get_values(tensor *, tensor self); +void atg__nested_get_values_copy(tensor *, tensor self); +void atg__nested_get_values_copy_out(tensor *, tensor out, tensor self); void atg__nested_select_backward(tensor *, tensor grad_output, tensor self, int64_t dim, int64_t index); void atg__nested_sum_backward(tensor *, tensor grad, tensor self, int64_t *dim_data, int dim_len, int keepdim); void atg__nested_view_from_buffer(tensor *, tensor self, tensor nested_size, tensor nested_strides, tensor offsets); void atg__nested_view_from_buffer_copy(tensor *, tensor self, tensor nested_size, tensor nested_strides, tensor offsets); void atg__nested_view_from_buffer_copy_out(tensor *, tensor out, tensor self, tensor nested_size, tensor nested_strides, tensor offsets); +void atg__nested_view_from_jagged(tensor *, tensor self, tensor offsets, tensor dummy, tensor lengths, int64_t ragged_idx); +void atg__nested_view_from_jagged_copy(tensor *, tensor self, tensor 
offsets, tensor dummy, tensor lengths, int64_t ragged_idx); +void atg__nested_view_from_jagged_copy_out(tensor *, tensor out, tensor self, tensor offsets, tensor dummy, tensor lengths, int64_t ragged_idx); void atg__new_zeros_with_same_feature_meta(tensor *, tensor self, tensor other, int64_t self_num_batch_dims); void atg__new_zeros_with_same_feature_meta_out(tensor *, tensor out, tensor self, tensor other, int64_t self_num_batch_dims); int atg__nnpack_available(); @@ -271,6 +288,7 @@ void atg__pin_memory(tensor *, tensor self, int device); void atg__pin_memory_out(tensor *, tensor out, tensor self, int device); void atg__prelu_kernel(tensor *, tensor self, tensor weight); void atg__prelu_kernel_backward(tensor *, tensor grad_output, tensor self, tensor weight); +void atg__print(char* s_ptr, int s_len); void atg__propagate_xla_data(tensor input, tensor output); void atg__remove_batch_dim(tensor *, tensor self, int64_t level, int64_t batch_size, int64_t out_dim); void atg__reshape_alias(tensor *, tensor self, int64_t *size_data, int size_len, int64_t *stride_data, int stride_len); @@ -286,8 +304,11 @@ void atg__sample_dirichlet(tensor *, tensor self); void atg__sample_dirichlet_out(tensor *, tensor out, tensor self); void atg__saturate_weight_to_fp16(tensor *, tensor weight); void atg__scaled_dot_product_attention_math(tensor *, tensor query, tensor key, tensor value, tensor attn_mask, double dropout_p, int is_causal, tensor dropout_mask, double scale_v, uint8_t scale_null); +void atg__scaled_dot_product_cudnn_attention(tensor *, tensor query, tensor key, tensor value, double dropout_p, int is_causal, int return_debug_mask, double scale_v, uint8_t scale_null); void atg__scaled_dot_product_efficient_attention(tensor *, tensor query, tensor key, tensor value, tensor attn_bias, int compute_log_sumexp, double dropout_p, int is_causal, double scale_v, uint8_t scale_null); void atg__scaled_dot_product_flash_attention_backward(tensor *, tensor grad_out, tensor query, tensor key, tensor value, tensor out, tensor logsumexp, tensor cum_seq_q, tensor cum_seq_k, int64_t max_q, int64_t max_k, double dropout_p, int is_causal, tensor philox_seed, tensor philox_offset, double scale_v, uint8_t scale_null); +void atg__scaled_dot_product_flash_attention_for_cpu(tensor *, tensor query, tensor key, tensor value, double dropout_p, int is_causal, tensor attn_mask, double scale_v, uint8_t scale_null); +void atg__scaled_dot_product_flash_attention_for_cpu_backward(tensor *, tensor grad_out, tensor query, tensor key, tensor value, tensor out, tensor logsumexp, double dropout_p, int is_causal, tensor attn_mask, double scale_v, uint8_t scale_null); void atg__scaled_mm(tensor *, tensor self, tensor mat2, tensor bias, int out_dtype, tensor scale_a, tensor scale_b, tensor scale_result, int use_fast_accum); void atg__scaled_mm_out(tensor *, tensor out, tensor out_amax, tensor self, tensor mat2, tensor bias, int out_dtype, tensor scale_a, tensor scale_b, tensor scale_result, int use_fast_accum); void atg__scatter_reduce(tensor *, tensor self, int64_t dim, tensor index, tensor src, char* reduce_ptr, int reduce_len, int include_self); @@ -334,7 +355,7 @@ void atg__sparse_mask_projection_out(tensor *, tensor out, tensor self, tensor m void atg__sparse_mm(tensor *, tensor sparse, tensor dense); void atg__sparse_mm_reduce(tensor *, tensor sparse, tensor dense, char* reduce_ptr, int reduce_len); void atg__sparse_mm_reduce_impl(tensor *, tensor self, tensor other, char* reduce_ptr, int reduce_len); -void 
atg__sparse_semi_structured_linear(tensor *, tensor input, tensor weight, tensor meta, tensor bias, char* activation_ptr, int activation_len); +void atg__sparse_semi_structured_linear(tensor *, tensor input, tensor weight, tensor meta, tensor bias, char* activation_ptr, int activation_len, int out_dtype); void atg__sparse_softmax(tensor *, tensor self, int64_t dim, int half_to_float); void atg__sparse_softmax_backward_data(tensor *, tensor grad_output, tensor output, int64_t dim, tensor self); void atg__sparse_softmax_backward_data_out(tensor *, tensor out, tensor grad_output, tensor output, int64_t dim, tensor self); @@ -374,6 +395,7 @@ void atg__test_optional_floatlist(tensor *, tensor values, double *addends_data, void atg__test_optional_floatlist_out(tensor *, tensor out, tensor values, double *addends_data, int addends_len); void atg__test_optional_intlist(tensor *, tensor values, int64_t *addends_data, int addends_len); void atg__test_optional_intlist_out(tensor *, tensor out, tensor values, int64_t *addends_data, int addends_len); +void atg__test_parallel_materialize(tensor *, tensor self, int64_t num_parallel, int skip_first); void atg__test_serialization_subcmul(tensor *, tensor self, tensor other); void atg__test_string_default(tensor *, tensor dummy, char* a_ptr, int a_len, char* b_ptr, int b_len); void atg__test_warn_in_autograd(tensor *, tensor self); @@ -454,6 +476,7 @@ void atg__values_copy(tensor *, tensor self); void atg__values_copy_out(tensor *, tensor out, tensor self); int64_t atg__version(tensor self); void atg__weight_int4pack_mm(tensor *, tensor self, tensor mat2, int64_t qGroupSize, tensor qScaleAndZeros); +void atg__weight_int8pack_mm(tensor *, tensor self, tensor mat2, tensor scales); void atg__weight_norm(tensor *, tensor v, tensor g, int64_t dim); void atg__weight_norm_differentiable_backward(tensor *, tensor grad_w, tensor saved_v, tensor saved_g, tensor saved_norms, int64_t dim); void atg__weight_norm_interface(tensor *, tensor v, tensor g, int64_t dim); @@ -2079,6 +2102,7 @@ void atg_slice_backward(tensor *, tensor grad_output, int64_t *input_sizes_data, void atg_slice_backward_out(tensor *, tensor out, tensor grad_output, int64_t *input_sizes_data, int input_sizes_len, int64_t dim, int64_t start, int64_t end, int64_t step); void atg_slice_copy(tensor *, tensor self, int64_t dim, int64_t start_v, uint8_t start_null, int64_t end_v, uint8_t end_null, int64_t step); void atg_slice_copy_tensor_out(tensor *, tensor out, tensor self, int64_t dim, int64_t start_v, uint8_t start_null, int64_t end_v, uint8_t end_null, int64_t step); +void atg_slice_inverse(tensor *, tensor self, tensor src, int64_t dim, int64_t start_v, uint8_t start_null, int64_t end_v, uint8_t end_null, int64_t step); void atg_slice_scatter(tensor *, tensor self, tensor src, int64_t dim, int64_t start_v, uint8_t start_null, int64_t end_v, uint8_t end_null, int64_t step); void atg_slice_scatter_out(tensor *, tensor out, tensor self, tensor src, int64_t dim, int64_t start_v, uint8_t start_null, int64_t end_v, uint8_t end_null, int64_t step); void atg_slogdet(tensor *, tensor self); diff --git a/torch-sys/src/c_generated.rs b/torch-sys/src/c_generated.rs index 883f61ad..a84e9808 100644 --- a/torch-sys/src/c_generated.rs +++ b/torch-sys/src/c_generated.rs @@ -230,6 +230,11 @@ extern "C" { scale_backoff_factor_: f64, growth_interval_: i64, ); + pub fn atg__assert_scalar( + self_scalar_: *mut C_scalar, + assert_msg_ptr: *const u8, + assert_msg_len: c_int, + ); pub fn atg__assert_tensor_metadata( a_: *mut 
C_tensor, size_data: *const i64, @@ -290,6 +295,21 @@ extern "C" { A_: *mut C_tensor, upper_: c_int, ); + pub fn atg__chunk_cat( + out__: *mut *mut C_tensor, + tensors_data: *const *mut C_tensor, + tensors_len: c_int, + dim_: i64, + num_chunks_: i64, + ); + pub fn atg__chunk_cat_out( + out__: *mut *mut C_tensor, + out_: *mut C_tensor, + tensors_data: *const *mut C_tensor, + tensors_len: c_int, + dim_: i64, + num_chunks_: i64, + ); pub fn atg__coalesce(out__: *mut *mut C_tensor, self_: *mut C_tensor); pub fn atg__coalesce_out(out__: *mut *mut C_tensor, out_: *mut C_tensor, self_: *mut C_tensor); pub fn atg__coalesced(out__: *mut *mut C_tensor, self_: *mut C_tensor, coalesced_: c_int); @@ -488,7 +508,16 @@ extern "C" { alpha_: *mut C_tensor, out_dtype_: c_int, transpose_result_: c_int, + alg_id_: i64, ); + pub fn atg__cslt_sparse_mm_search( + compressed_A_: *mut C_tensor, + dense_B_: *mut C_tensor, + bias_: *mut C_tensor, + alpha_: *mut C_tensor, + out_dtype_: c_int, + transpose_result_: c_int, + ) -> i64; pub fn atg__ctc_loss( out__: *mut *mut C_tensor, log_probs_: *mut C_tensor, @@ -1107,6 +1136,13 @@ extern "C" { assert_msg_len: c_int, dep_token_: *mut C_tensor, ); + pub fn atg__functional_assert_scalar( + out__: *mut *mut C_tensor, + self_scalar_: *mut C_scalar, + assert_msg_ptr: *const u8, + assert_msg_len: c_int, + dep_token_: *mut C_tensor, + ); pub fn atg__functional_sym_constrain_range( out__: *mut *mut C_tensor, size_: *mut C_scalar, @@ -1342,6 +1378,7 @@ extern "C" { pub fn atg__is_all_true(out__: *mut *mut C_tensor, self_: *mut C_tensor); pub fn atg__is_any_true(out__: *mut *mut C_tensor, self_: *mut C_tensor); pub fn atg__is_zerotensor(self_: *mut C_tensor) -> c_int; + pub fn atg__lazy_clone(out__: *mut *mut C_tensor, self_: *mut C_tensor); pub fn atg__linalg_check_errors( info_: *mut C_tensor, api_name_ptr: *const u8, @@ -1372,6 +1409,7 @@ extern "C" { UPLO_len: c_int, compute_v_: c_int, ); + pub fn atg__linalg_eigvals(out__: *mut *mut C_tensor, self_: *mut C_tensor); pub fn atg__linalg_slogdet(out__: *mut *mut C_tensor, A_: *mut C_tensor); pub fn atg__linalg_slogdet_sign( out__: *mut *mut C_tensor, @@ -1838,6 +1876,17 @@ extern "C" { cpu_nested_shape_example_: *mut C_tensor, fuse_transform_0213_: c_int, ); + pub fn atg__nested_get_jagged_dummy(out__: *mut *mut C_tensor, any_: *mut C_tensor); + pub fn atg__nested_get_lengths(out__: *mut *mut C_tensor, self_: *mut C_tensor); + pub fn atg__nested_get_offsets(out__: *mut *mut C_tensor, self_: *mut C_tensor); + pub fn atg__nested_get_ragged_idx(self_: *mut C_tensor) -> i64; + pub fn atg__nested_get_values(out__: *mut *mut C_tensor, self_: *mut C_tensor); + pub fn atg__nested_get_values_copy(out__: *mut *mut C_tensor, self_: *mut C_tensor); + pub fn atg__nested_get_values_copy_out( + out__: *mut *mut C_tensor, + out_: *mut C_tensor, + self_: *mut C_tensor, + ); pub fn atg__nested_select_backward( out__: *mut *mut C_tensor, grad_output_: *mut C_tensor, @@ -1875,6 +1924,31 @@ extern "C" { nested_strides_: *mut C_tensor, offsets_: *mut C_tensor, ); + pub fn atg__nested_view_from_jagged( + out__: *mut *mut C_tensor, + self_: *mut C_tensor, + offsets_: *mut C_tensor, + dummy_: *mut C_tensor, + lengths_: *mut C_tensor, + ragged_idx_: i64, + ); + pub fn atg__nested_view_from_jagged_copy( + out__: *mut *mut C_tensor, + self_: *mut C_tensor, + offsets_: *mut C_tensor, + dummy_: *mut C_tensor, + lengths_: *mut C_tensor, + ragged_idx_: i64, + ); + pub fn atg__nested_view_from_jagged_copy_out( + out__: *mut *mut C_tensor, + out_: *mut C_tensor, 
+ self_: *mut C_tensor, + offsets_: *mut C_tensor, + dummy_: *mut C_tensor, + lengths_: *mut C_tensor, + ragged_idx_: i64, + ); pub fn atg__new_zeros_with_same_feature_meta( out__: *mut *mut C_tensor, self_: *mut C_tensor, @@ -1989,6 +2063,7 @@ extern "C" { self_: *mut C_tensor, weight_: *mut C_tensor, ); + pub fn atg__print(s_ptr: *const u8, s_len: c_int); pub fn atg__propagate_xla_data(input_: *mut C_tensor, output_: *mut C_tensor); pub fn atg__remove_batch_dim( out__: *mut *mut C_tensor, @@ -2080,6 +2155,17 @@ extern "C" { scale_v: f64, scale_null: i8, ); + pub fn atg__scaled_dot_product_cudnn_attention( + out__: *mut *mut C_tensor, + query_: *mut C_tensor, + key_: *mut C_tensor, + value_: *mut C_tensor, + dropout_p_: f64, + is_causal_: c_int, + return_debug_mask_: c_int, + scale_v: f64, + scale_null: i8, + ); pub fn atg__scaled_dot_product_efficient_attention( out__: *mut *mut C_tensor, query_: *mut C_tensor, @@ -2111,6 +2197,31 @@ extern "C" { scale_v: f64, scale_null: i8, ); + pub fn atg__scaled_dot_product_flash_attention_for_cpu( + out__: *mut *mut C_tensor, + query_: *mut C_tensor, + key_: *mut C_tensor, + value_: *mut C_tensor, + dropout_p_: f64, + is_causal_: c_int, + attn_mask_: *mut C_tensor, + scale_v: f64, + scale_null: i8, + ); + pub fn atg__scaled_dot_product_flash_attention_for_cpu_backward( + out__: *mut *mut C_tensor, + grad_out_: *mut C_tensor, + query_: *mut C_tensor, + key_: *mut C_tensor, + value_: *mut C_tensor, + out_: *mut C_tensor, + logsumexp_: *mut C_tensor, + dropout_p_: f64, + is_causal_: c_int, + attn_mask_: *mut C_tensor, + scale_v: f64, + scale_null: i8, + ); pub fn atg__scaled_mm( out__: *mut *mut C_tensor, self_: *mut C_tensor, @@ -2499,6 +2610,7 @@ extern "C" { bias_: *mut C_tensor, activation_ptr: *const u8, activation_len: c_int, + out_dtype_: c_int, ); pub fn atg__sparse_softmax( out__: *mut *mut C_tensor, @@ -2717,6 +2829,12 @@ extern "C" { addends_data: *const i64, addends_len: c_int, ); + pub fn atg__test_parallel_materialize( + out__: *mut *mut C_tensor, + self_: *mut C_tensor, + num_parallel_: i64, + skip_first_: c_int, + ); pub fn atg__test_serialization_subcmul( out__: *mut *mut C_tensor, self_: *mut C_tensor, @@ -3409,6 +3527,12 @@ extern "C" { qGroupSize_: i64, qScaleAndZeros_: *mut C_tensor, ); + pub fn atg__weight_int8pack_mm( + out__: *mut *mut C_tensor, + self_: *mut C_tensor, + mat2_: *mut C_tensor, + scales_: *mut C_tensor, + ); pub fn atg__weight_norm( out__: *mut *mut C_tensor, v_: *mut C_tensor, @@ -12691,6 +12815,17 @@ extern "C" { end_null: i8, step_: i64, ); + pub fn atg_slice_inverse( + out__: *mut *mut C_tensor, + self_: *mut C_tensor, + src_: *mut C_tensor, + dim_: i64, + start_v: i64, + start_null: i8, + end_v: i64, + end_null: i8, + step_: i64, + ); pub fn atg_slice_scatter( out__: *mut *mut C_tensor, self_: *mut C_tensor,