zama-ai · agnesLeroy · Sep 15, 2025
@@ -56,6 +56,7 @@ typedef struct {
   uint32_t num_radix_blocks;
   uint32_t max_num_radix_blocks;
   uint32_t lwe_dimension;
+  uint32_t num_radix_ciphertexts;
 } CudaRadixCiphertextFFI;
 
 typedef struct {

@@ -24,33 +24,43 @@ __host__ void host_integer_radix_bitop_kb(
           lwe_array_out->num_radix_blocks == lwe_array_2->num_radix_blocks,
       "Cuda error: input and output num radix blocks must be equal");
 
+  PANIC_IF_FALSE(
+      lwe_array_out->num_radix_ciphertexts ==
+              lwe_array_1->num_radix_ciphertexts &&
+          lwe_array_out->num_radix_ciphertexts ==
+              lwe_array_2->num_radix_ciphertexts,
+      "Cuda error: input and output num radix ciphertexts must be equal");
+
   PANIC_IF_FALSE(lwe_array_out->lwe_dimension == lwe_array_1->lwe_dimension &&
                      lwe_array_out->lwe_dimension == lwe_array_2->lwe_dimension,
                  "Cuda error: input and output lwe dimension must be equal");
 
   auto lut = mem_ptr->lut;
-  uint64_t degrees[lwe_array_1->num_radix_blocks];
+  uint64_t degrees[lwe_array_1->num_radix_blocks *
+                   lwe_array_1->num_radix_ciphertexts];
   if (mem_ptr->op == BITOP_TYPE::BITAND) {
-    update_degrees_after_bitand(degrees, lwe_array_1->degrees,
-                                lwe_array_2->degrees,
-                                lwe_array_1->num_radix_blocks);
+    update_degrees_after_bitand(
+        degrees, lwe_array_1->degrees, lwe_array_2->degrees,
+        lwe_array_1->num_radix_blocks * lwe_array_1->num_radix_ciphertexts);
   } else if (mem_ptr->op == BITOP_TYPE::BITOR) {
-    update_degrees_after_bitor(degrees, lwe_array_1->degrees,
-                               lwe_array_2->degrees,
-                               lwe_array_1->num_radix_blocks);
+    update_degrees_after_bitor(
+        degrees, lwe_array_1->degrees, lwe_array_2->degrees,
+        lwe_array_1->num_radix_blocks * lwe_array_1->num_radix_ciphertexts);
   } else if (mem_ptr->op == BITOP_TYPE::BITXOR) {
-    update_degrees_after_bitxor(degrees, lwe_array_1->degrees,
-                                lwe_array_2->degrees,
-                                lwe_array_1->num_radix_blocks);
+    update_degrees_after_bitxor(
+        degrees, lwe_array_1->degrees, lwe_array_2->degrees,
+        lwe_array_1->num_radix_blocks * lwe_array_1->num_radix_ciphertexts);
   }
 
   integer_radix_apply_bivariate_lookup_table_kb<Torus>(
       streams, lwe_array_out, lwe_array_1, lwe_array_2, bsks, ksks,
-      ms_noise_reduction_key, lut, lwe_array_out->num_radix_blocks,
+      ms_noise_reduction_key, lut,
+      lwe_array_out->num_radix_blocks * lwe_array_out->num_radix_ciphertexts,
       lut->params.message_modulus);
 
   memcpy(lwe_array_out->degrees, degrees,
-         lwe_array_out->num_radix_blocks * sizeof(uint64_t));
+         lwe_array_out->num_radix_blocks *
+             lwe_array_out->num_radix_ciphertexts * sizeof(uint64_t));
 }
 
 template <typename Torus>

@@ -417,9 +417,12 @@ __host__ void host_pack_bivariate_blocks(
       lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
     PANIC("Cuda error: input and output radix ciphertexts should have the same "
           "lwe dimension")
-  if (num_radix_blocks > lwe_array_out->num_radix_blocks ||
-      num_radix_blocks > lwe_array_1->num_radix_blocks ||
-      num_radix_blocks > lwe_array_2->num_radix_blocks)
+  if (num_radix_blocks > lwe_array_out->num_radix_blocks *
+                             lwe_array_out->num_radix_ciphertexts ||
+      num_radix_blocks >
+          lwe_array_1->num_radix_blocks * lwe_array_1->num_radix_ciphertexts ||
+      num_radix_blocks >
+          lwe_array_2->num_radix_blocks * lwe_array_2->num_radix_ciphertexts)
     PANIC("Cuda error: num radix blocks on which packing is applied should be "
           "smaller or equal to the number of input & output radix blocks")
 
@@ -530,7 +533,8 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
   if (num_radix_blocks > lut->num_blocks)
     PANIC("Cuda error: num radix blocks on which lut is applied should be "
           "smaller or equal to the number of lut radix blocks")
-  if (num_radix_blocks > lwe_array_out->num_radix_blocks)
+  if (num_radix_blocks >
+      lwe_array_out->num_radix_blocks * lwe_array_out->num_radix_ciphertexts)
     PANIC("Cuda error: num radix blocks on which lut is applied should be "
           "smaller or equal to the number of input & output radix blocks")
 
@@ -756,11 +760,14 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
   if (num_radix_blocks > lut->num_blocks)
     PANIC("Cuda error: num radix blocks on which lut is applied should be "
           "smaller or equal to the number of lut radix blocks")
-  if (num_radix_blocks > lwe_array_out->num_radix_blocks ||
-      num_radix_blocks > lwe_array_1->num_radix_blocks ||
-      num_radix_blocks > lwe_array_2->num_radix_blocks)
+  if (num_radix_blocks > lwe_array_out->num_radix_blocks *
+                             lwe_array_out->num_radix_ciphertexts ||
+      num_radix_blocks >
+          lwe_array_1->num_radix_blocks * lwe_array_1->num_radix_ciphertexts ||
+      num_radix_blocks >
+          lwe_array_2->num_radix_blocks * lwe_array_2->num_radix_ciphertexts)
     PANIC("Cuda error: num radix blocks on which lut is applied should be "
-          "smaller or equal to the number of input & output radix blocks")
+          "smaller or equal to the number of total input & output radix blocks")
 
   auto params = lut->params;
   auto pbs_type = params.pbs_type;

@@ -25,6 +25,7 @@ void into_radix_ciphertext(CudaRadixCiphertextFFI *radix, void *lwe_array,
   radix->num_radix_blocks = num_radix_blocks;
   radix->max_num_radix_blocks = num_radix_blocks;
   radix->ptr = lwe_array;
+  radix->num_radix_ciphertexts = 1;
 
   radix->degrees = (uint64_t *)(calloc(num_radix_blocks, sizeof(uint64_t)));
   radix->noise_levels =

@@ -19,6 +19,7 @@ void create_zero_radix_ciphertext_async(cudaStream_t const stream,
   radix->lwe_dimension = lwe_dimension;
   radix->num_radix_blocks = num_radix_blocks;
   radix->max_num_radix_blocks = num_radix_blocks;
+  radix->num_radix_ciphertexts = 1;
   uint64_t size = (lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
   radix->ptr = (void *)cuda_malloc_with_size_tracking_async(
       size, stream, gpu_index, size_tracker, allocate_gpu_memory);
@@ -63,6 +64,7 @@ void as_radix_ciphertext_slice(CudaRadixCiphertextFFI *output_radix,
 
   auto lwe_size = input_radix->lwe_dimension + 1;
   output_radix->num_radix_blocks = end_input_lwe_index - start_input_lwe_index;
+  output_radix->num_radix_ciphertexts = input_radix->num_radix_ciphertexts;
   output_radix->max_num_radix_blocks = input_radix->max_num_radix_blocks;
   output_radix->lwe_dimension = input_radix->lwe_dimension;
   Torus *in_ptr = (Torus *)input_radix->ptr;

@@ -183,6 +183,7 @@ pub struct CudaRadixCiphertextFFI {
     pub num_radix_blocks: u32,
     pub max_num_radix_blocks: u32,
     pub lwe_dimension: u32,
+    pub num_radix_ciphertexts: u32,
 }
 #[allow(clippy::unnecessary_operation, clippy::identity_op)]
 const _: () = {
@@ -201,6 +202,8 @@ const _: () = {
         [::std::mem::offset_of!(CudaRadixCiphertextFFI, max_num_radix_blocks) - 28usize];
     ["Offset of field: CudaRadixCiphertextFFI::lwe_dimension"]
         [::std::mem::offset_of!(CudaRadixCiphertextFFI, lwe_dimension) - 32usize];
+    ["Offset of field: CudaRadixCiphertextFFI::num_radix_ciphertexts"]
+        [::std::mem::offset_of!(CudaRadixCiphertextFFI, num_radix_ciphertexts) - 36usize];
 };
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]

@@ -90,6 +90,12 @@ path = "benches/high_level_api/noise_squash.rs"
 harness = false
 required-features = ["integer", "internal-keycache"]
 
+[[bench]]
+name = "hlapi-arrays"
+path = "benches/high_level_api/arrays.rs"
+harness = false
+required-features = ["integer", "internal-keycache"]
+
 [[bench]]
 name = "glwe_packing_compression-integer-bench"
 path = "benches/integer/glwe_packing_compression.rs"

@@ -0,0 +1,63 @@
+use benchmark::params_aliases::BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+use benchmark::utilities::{write_to_json, OperatorType};
+use criterion::Criterion;
+use rand::prelude::*;
+use tfhe::array::GpuFheUint64Array;
+use tfhe::keycache::NamedParam;
+use tfhe::prelude::*;
+use tfhe::{ClientKey, CompressedServerKey};
+
+/// Benchmarks the element-wise `&` of two encrypted 32x32 `GpuFheUint64Array`s and
+/// records the run's metadata through `write_to_json`.
+#[cfg(feature = "gpu")]
+fn main() {
+    // Generate the client key and install the server key decompressed for the GPU.
+    let cks = {
+        use tfhe::{set_server_key, ConfigBuilder};
+        let config = ConfigBuilder::with_custom_parameters(
+            BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+        )
+        .build();
+        let cks = ClientKey::generate(config);
+        let compressed_sks = CompressedServerKey::new(&cks);
+
+        set_server_key(compressed_sks.decompress_to_gpu());
+        cks
+    };
+
+    // Two random cleartext operands with `array_dim * array_dim` elements each.
+    let array_dim = 32;
+    let num_elems = array_dim * array_dim;
+    let mut rng = thread_rng();
+    let clear_xs: Vec<u64> = (0..num_elems).map(|_| rng.gen()).collect();
+    let clear_ys: Vec<u64> = (0..num_elems).map(|_| rng.gen()).collect();
+
+    let xs =
+        GpuFheUint64Array::try_encrypt((clear_xs.as_slice(), vec![array_dim, array_dim]), &cks)
+            .expect("failed to encrypt the left-hand operand array");
+    let ys =
+        GpuFheUint64Array::try_encrypt((clear_ys.as_slice(), vec![array_dim, array_dim]), &cks)
+            .expect("failed to encrypt the right-hand operand array");
+
+    let mut c = Criterion::default().configure_from_args();
+    // A plain literal: `format!` without arguments only adds an allocation.
+    let bench_id = "bench::hlapi::array::cuda::bitand::";
+    // Return the result so Criterion receives it instead of the closure discarding it.
+    c.bench_function(bench_id, |b| b.iter(|| &xs & &ys));
+
+    let params = cks.computation_parameters();
+
+    write_to_json::<u64, _>(
+        bench_id,
+        params,
+        params.name(),
+        // Label naming the benchmarked operation; the previous "erc20-transfer"
+        // label did not describe this bitand benchmark.
+        "bitand",
+        &OperatorType::Atomic,
+        64,
+        vec![],
+    );
+
+    c.final_summary();
+}
@@ -104,12 +104,19 @@ impl<T: UnsignedInteger> CudaLweCiphertextList<T> {
                 .map(|list| list.0.lwe_ciphertext_count.0)
                 .sum(),
         );
-
         assert_ne!(
             lwe_ciphertext_count.0, 0,
             "Empty iterator of CudaLweCiphertextList"
         );
 
+        let stream_count = lwe_ciphertext_count.0.min(6);
+        let mut new_streams: Vec<CudaStreams> = Vec::with_capacity(stream_count);
+
+        for _ in 0..stream_count {
+            let stream = CudaStreams::new_single_gpu(streams.gpu_indexes[0]);
+            new_streams.push(stream);
+        }
+
         let first_item = cuda_ciphertexts_list_vec.next().unwrap();
         let lwe_dimension = first_item.lwe_dimension();
         let mut d_vec = CudaVec::new(
@@ -123,25 +130,20 @@ impl<T: UnsignedInteger> CudaLweCiphertextList<T> {
             * std::mem::size_of::<T>();
         // Concatenate gpu_index memory
         unsafe {
-            cuda_memcpy_async_gpu_to_gpu(
-                ptr,
-                first_item.0.d_vec.as_c_ptr(0),
-                size as u64,
-                streams.ptr[0],
-                streams.gpu_indexes[0].get(),
-            );
-            ptr = ptr.wrapping_byte_add(size);
-            for list in cuda_ciphertexts_list_vec {
+            for (i, list) in cuda_ciphertexts_list_vec.enumerate() {
                 cuda_memcpy_async_gpu_to_gpu(
                     ptr,
                     list.0.d_vec.as_c_ptr(0),
                     size as u64,
-                    streams.ptr[0],
-                    streams.gpu_indexes[0].get(),
+                    new_streams[i % stream_count].ptr[0],
+                    new_streams[i % stream_count].gpu_indexes[0].get(),
                 );
                 ptr = ptr.wrapping_byte_add(size);
             }
         }
+        for s in new_streams.iter() {
+            s.synchronize();
+        }
 
         let cuda_lwe_list = CudaLweList {
             d_vec,

@@ -840,6 +840,7 @@ pub unsafe fn add_lwe_ciphertext_vector_async<T: UnsignedInteger>(
         num_radix_blocks: num_samples,
         max_num_radix_blocks: num_samples,
         lwe_dimension: lwe_dimension.0 as u32,
+        num_radix_ciphertexts: 1u32,
     };
     let lwe_array_in_1_data = CudaRadixCiphertextFFI {
         ptr: lwe_array_in_1.get_mut_c_ptr(0),
@@ -848,6 +849,7 @@ pub unsafe fn add_lwe_ciphertext_vector_async<T: UnsignedInteger>(
         num_radix_blocks: num_samples,
         max_num_radix_blocks: num_samples,
         lwe_dimension: lwe_dimension.0 as u32,
+        num_radix_ciphertexts: 1u32,
     };
     let lwe_array_in_2_data = CudaRadixCiphertextFFI {
         ptr: lwe_array_in_2.get_mut_c_ptr(0),
@@ -856,6 +858,7 @@ pub unsafe fn add_lwe_ciphertext_vector_async<T: UnsignedInteger>(
         num_radix_blocks: num_samples,
         max_num_radix_blocks: num_samples,
         lwe_dimension: lwe_dimension.0 as u32,
+        num_radix_ciphertexts: 1u32,
     };
     cuda_add_lwe_ciphertext_vector_64(
         streams.ptr[0],
@@ -890,6 +893,7 @@ pub unsafe fn add_lwe_ciphertext_vector_assign_async<T: UnsignedInteger>(
         num_radix_blocks: num_samples,
         max_num_radix_blocks: num_samples,
         lwe_dimension: lwe_dimension.0 as u32,
+        num_radix_ciphertexts: 1u32,
     };
     let lwe_array_in_data = CudaRadixCiphertextFFI {
         ptr: lwe_array_in.get_mut_c_ptr(0),
@@ -898,6 +902,7 @@ pub unsafe fn add_lwe_ciphertext_vector_assign_async<T: UnsignedInteger>(
         num_radix_blocks: num_samples,
         max_num_radix_blocks: num_samples,
         lwe_dimension: lwe_dimension.0 as u32,
+        num_radix_ciphertexts: 1u32,
     };
     cuda_add_lwe_ciphertext_vector_64(
         streams.ptr[0],

@@ -19,7 +19,8 @@ use crate::integer::block_decomposition::{
     DecomposableInto, RecomposableFrom, RecomposableSignedInteger,
 };
 use crate::integer::gpu::ciphertext::{
-    CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext,
+    CudaIntegerRadixCiphertext, CudaRadixCiphertext, CudaSignedRadixCiphertext,
+    CudaUnsignedRadixCiphertext,
 };
 use crate::integer::server_key::radix_parallel::scalar_div_mod::SignedReciprocable;
 use crate::integer::server_key::{Reciprocable, ScalarMultiplier};
@@ -83,6 +84,12 @@ impl<'a, T> TensorSlice<'a, GpuSlice<'a, T>> {
     pub fn par_iter(self) -> ParStridedIter<'a, T> {
         ParStridedIter::new(self.slice.0, self.dims.clone())
     }
+    /// Returns the flattened length reported by this slice's dimensions
+    /// (`self.dims.flattened_len()`).
+    pub fn len(&self) -> usize {
+        self.dims.flattened_len()
+    }
+    /// Returns the slice wrapped by the inner `GpuSlice`, borrowed for `'a`.
+    pub fn as_slice(&self) -> &'a [T] {
+        self.slice.0
+    }
 }
 
 impl<'a, T> TensorSlice<'a, GpuSliceMut<'a, T>> {
@@ -316,7 +323,25 @@ where
         lhs: TensorSlice<'_, Self::Slice<'a>>,
         rhs: TensorSlice<'_, Self::Slice<'a>>,
     ) -> Self::Owned {
-        par_map_sks_op_on_pair_of_elements(lhs, rhs, crate::integer::gpu::CudaServerKey::bitand)
+        GpuOwned(global_state::with_cuda_internal_keys(|cuda_key| {
+            let streams = &cuda_key.streams;
+            let num_ciphertexts = lhs.len() as u32;
+            let lhs_slice: &[T] = lhs.as_slice();
+            let rhs_slice: &[T] = rhs.as_slice();
+            let mut lhs_aligned = T::from(CudaRadixCiphertext::from_radix_ciphertext_vec(
+                lhs_slice, streams,
+            ));
+            let rhs_aligned = T::from(CudaRadixCiphertext::from_radix_ciphertext_vec(
+                rhs_slice, streams,
+            ));
+            crate::integer::gpu::CudaServerKey::bitand_vec(
+                cuda_key.pbs_key(),
+                &mut lhs_aligned,
+                &rhs_aligned,
+                num_ciphertexts,
+                streams,
+            )
+        }))
     }
 
     fn bitor<'a>(

@@ -28,6 +28,12 @@ impl<'a, T> TensorSlice<'a, &'a [T]> {
     pub fn par_iter(self) -> ParStridedIter<'a, T> {
         ParStridedIter::new(self.slice, self.dims.clone())
     }
+    /// Returns the flattened length reported by this slice's dimensions
+    /// (`self.dims.flattened_len()`).
+    pub fn len(&self) -> usize {
+        self.dims.flattened_len()
+    }
+    /// Returns the underlying `&'a [T]` held by this tensor slice.
+    pub fn as_slice(&self) -> &'a [T] {
+        self.slice
+    }
 }
 
 impl<'a, T> TensorSlice<'a, &'a mut [T]> {