diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
index eac7ac8e1c..cb03d9abe0 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -56,6 +56,7 @@ typedef struct {
   uint32_t num_radix_blocks;
   uint32_t max_num_radix_blocks;
   uint32_t lwe_dimension;
+  uint32_t num_radix_ciphertexts;
 } CudaRadixCiphertextFFI;
 
 typedef struct {
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
index 5539b16711..d7463a13e8 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
@@ -24,33 +24,43 @@ __host__ void host_integer_radix_bitop_kb(
           lwe_array_out->num_radix_blocks == lwe_array_2->num_radix_blocks,
       "Cuda error: input and output num radix blocks must be equal");
 
+  PANIC_IF_FALSE(
+      lwe_array_out->num_radix_ciphertexts ==
+              lwe_array_1->num_radix_ciphertexts &&
+          lwe_array_out->num_radix_ciphertexts ==
+              lwe_array_2->num_radix_ciphertexts,
+      "Cuda error: input and output num radix ciphertexts must be equal");
+
   PANIC_IF_FALSE(lwe_array_out->lwe_dimension == lwe_array_1->lwe_dimension &&
                      lwe_array_out->lwe_dimension == lwe_array_2->lwe_dimension,
                  "Cuda error: input and output lwe dimension must be equal");
 
   auto lut = mem_ptr->lut;
-  uint64_t degrees[lwe_array_1->num_radix_blocks];
+  uint64_t degrees[lwe_array_1->num_radix_blocks *
+                   lwe_array_1->num_radix_ciphertexts];
   if (mem_ptr->op == BITOP_TYPE::BITAND) {
-    update_degrees_after_bitand(degrees, lwe_array_1->degrees,
-                                lwe_array_2->degrees,
-                                lwe_array_1->num_radix_blocks);
+    update_degrees_after_bitand(
+        degrees, lwe_array_1->degrees, lwe_array_2->degrees,
+        lwe_array_1->num_radix_blocks * lwe_array_1->num_radix_ciphertexts);
   } else if (mem_ptr->op == BITOP_TYPE::BITOR) {
-    update_degrees_after_bitor(degrees, lwe_array_1->degrees,
-                               lwe_array_2->degrees,
-                               lwe_array_1->num_radix_blocks);
+    update_degrees_after_bitor(
+        degrees, lwe_array_1->degrees, lwe_array_2->degrees,
+        lwe_array_1->num_radix_blocks * lwe_array_1->num_radix_ciphertexts);
   } else if (mem_ptr->op == BITOP_TYPE::BITXOR) {
-    update_degrees_after_bitxor(degrees, lwe_array_1->degrees,
-                                lwe_array_2->degrees,
-                                lwe_array_1->num_radix_blocks);
+    update_degrees_after_bitxor(
+        degrees, lwe_array_1->degrees, lwe_array_2->degrees,
+        lwe_array_1->num_radix_blocks * lwe_array_1->num_radix_ciphertexts);
   }
 
   integer_radix_apply_bivariate_lookup_table_kb<Torus>(
       streams, lwe_array_out, lwe_array_1, lwe_array_2, bsks, ksks,
-      ms_noise_reduction_key, lut, lwe_array_out->num_radix_blocks,
+      ms_noise_reduction_key, lut,
+      lwe_array_out->num_radix_blocks * lwe_array_out->num_radix_ciphertexts,
       lut->params.message_modulus);
 
   memcpy(lwe_array_out->degrees, degrees,
-         lwe_array_out->num_radix_blocks * sizeof(uint64_t));
+         lwe_array_out->num_radix_blocks *
+             lwe_array_out->num_radix_ciphertexts * sizeof(uint64_t));
 }
 
 template <typename Torus>
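Note on the hunk above: the per-block degree rule is unchanged; only the element count is flattened from `num_radix_blocks` to `num_radix_blocks * num_radix_ciphertexts`, so one call now covers every block of every packed ciphertext. A minimal Rust sketch of that flattening, assuming the usual `min`-based degree rule for BITAND (the exact rule lives in `update_degrees_after_bitand` on the CUDA side, which this diff does not show):

```rust
// Sketch only: flattened per-block degree update over
// num_radix_blocks * num_radix_ciphertexts entries. `min` is an assumption
// used for illustration of the BITAND case.
fn update_degrees_after_bitand(out: &mut [u64], lhs: &[u64], rhs: &[u64]) {
    for ((o, &a), &b) in out.iter_mut().zip(lhs).zip(rhs) {
        *o = a.min(b);
    }
}

fn main() {
    let (num_radix_blocks, num_radix_ciphertexts) = (4usize, 2usize);
    let total = num_radix_blocks * num_radix_ciphertexts;
    let lhs = vec![3u64; total];
    let rhs = vec![1u64; total];
    let mut out = vec![0u64; total];
    // One call now spans all packed ciphertexts.
    update_degrees_after_bitand(&mut out, &lhs, &rhs);
    assert!(out.iter().all(|&d| d == 1));
}
```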
the same " "lwe dimension") - if (num_radix_blocks > lwe_array_out->num_radix_blocks || - num_radix_blocks > lwe_array_1->num_radix_blocks || - num_radix_blocks > lwe_array_2->num_radix_blocks) + if (num_radix_blocks > lwe_array_out->num_radix_blocks * + lwe_array_out->num_radix_ciphertexts || + num_radix_blocks > + lwe_array_1->num_radix_blocks * lwe_array_1->num_radix_ciphertexts || + num_radix_blocks > + lwe_array_2->num_radix_blocks * lwe_array_2->num_radix_ciphertexts) PANIC("Cuda error: num radix blocks on which packing is applied should be " "smaller or equal to the number of input & output radix blocks") @@ -530,7 +533,8 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( if (num_radix_blocks > lut->num_blocks) PANIC("Cuda error: num radix blocks on which lut is applied should be " "smaller or equal to the number of lut radix blocks") - if (num_radix_blocks > lwe_array_out->num_radix_blocks) + if (num_radix_blocks > + lwe_array_out->num_radix_blocks * lwe_array_out->num_radix_ciphertexts) PANIC("Cuda error: num radix blocks on which lut is applied should be " "smaller or equal to the number of input & output radix blocks") @@ -756,11 +760,14 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( if (num_radix_blocks > lut->num_blocks) PANIC("Cuda error: num radix blocks on which lut is applied should be " "smaller or equal to the number of lut radix blocks") - if (num_radix_blocks > lwe_array_out->num_radix_blocks || - num_radix_blocks > lwe_array_1->num_radix_blocks || - num_radix_blocks > lwe_array_2->num_radix_blocks) + if (num_radix_blocks > lwe_array_out->num_radix_blocks * + lwe_array_out->num_radix_ciphertexts || + num_radix_blocks > + lwe_array_1->num_radix_blocks * lwe_array_1->num_radix_ciphertexts || + num_radix_blocks > + lwe_array_2->num_radix_blocks * lwe_array_2->num_radix_ciphertexts) PANIC("Cuda error: num radix blocks on which lut is applied should be " - "smaller or equal to the number of input & output radix blocks") + "smaller or equal to the number of total input & output radix blocks") auto params = lut->params; auto pbs_type = params.pbs_type; diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cu b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cu index 6e0a334b0b..7b4793314e 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cu @@ -25,6 +25,7 @@ void into_radix_ciphertext(CudaRadixCiphertextFFI *radix, void *lwe_array, radix->num_radix_blocks = num_radix_blocks; radix->max_num_radix_blocks = num_radix_blocks; radix->ptr = lwe_array; + radix->num_radix_ciphertexts = 1; radix->degrees = (uint64_t *)(calloc(num_radix_blocks, sizeof(uint64_t))); radix->noise_levels = diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh index 0960bb90fa..4998da3d97 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh @@ -19,6 +19,7 @@ void create_zero_radix_ciphertext_async(cudaStream_t const stream, radix->lwe_dimension = lwe_dimension; radix->num_radix_blocks = num_radix_blocks; radix->max_num_radix_blocks = num_radix_blocks; + radix->num_radix_ciphertexts = 1; uint64_t size = (lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); radix->ptr = (void *)cuda_malloc_with_size_tracking_async( size, stream, gpu_index, size_tracker, 
diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs
index 91660f4db2..133f244e01 100644
--- a/backends/tfhe-cuda-backend/src/bindings.rs
+++ b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -183,6 +183,7 @@ pub struct CudaRadixCiphertextFFI {
     pub num_radix_blocks: u32,
    pub max_num_radix_blocks: u32,
     pub lwe_dimension: u32,
+    pub num_radix_ciphertexts: u32,
 }
 #[allow(clippy::unnecessary_operation, clippy::identity_op)]
 const _: () = {
@@ -201,6 +202,8 @@ const _: () = {
         [::std::mem::offset_of!(CudaRadixCiphertextFFI, max_num_radix_blocks) - 28usize];
     ["Offset of field: CudaRadixCiphertextFFI::lwe_dimension"]
         [::std::mem::offset_of!(CudaRadixCiphertextFFI, lwe_dimension) - 32usize];
+    ["Offset of field: CudaRadixCiphertextFFI::num_radix_ciphertexts"]
+        [::std::mem::offset_of!(CudaRadixCiphertextFFI, num_radix_ciphertexts) - 36usize];
 };
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
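The offset assertions above pin the new field at byte 36. A self-contained sketch of the expected `repr(C)` layout on a 64-bit target (a mirror of the FFI struct for illustration, not the generated bindings):

```rust
use std::ffi::c_void;
use std::mem::{offset_of, size_of};

// Mirror of CudaRadixCiphertextFFI: three 8-byte pointers (offsets 0, 8, 16)
// followed by four u32 fields (offsets 24, 28, 32, 36).
#[repr(C)]
struct CudaRadixCiphertextFFI {
    ptr: *mut c_void,
    degrees: *mut u64,
    noise_levels: *mut u64,
    num_radix_blocks: u32,
    max_num_radix_blocks: u32,
    lwe_dimension: u32,
    num_radix_ciphertexts: u32,
}

fn main() {
    assert_eq!(offset_of!(CudaRadixCiphertextFFI, num_radix_ciphertexts), 36);
    // The new u32 occupies what was previously 4 bytes of tail padding, so
    // the struct stays 40 bytes with 8-byte alignment on a 64-bit target.
    assert_eq!(size_of::<CudaRadixCiphertextFFI>(), 40);
}
```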
format!("bench::hlapi::array::cuda::bitand::"); + c.bench_function(&bench_id, |b| { + b.iter(|| { + let _ = &xs & &ys; + }) + }); + + let params = cks.computation_parameters(); + + write_to_json::( + &bench_id, + params, + params.name(), + "erc20-transfer", + &OperatorType::Atomic, + 64, + vec![], + ); + + c.final_summary(); +} diff --git a/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs b/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs index 9b666a49de..d9f9269388 100644 --- a/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs +++ b/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs @@ -104,12 +104,19 @@ impl CudaLweCiphertextList { .map(|list| list.0.lwe_ciphertext_count.0) .sum(), ); - assert_ne!( lwe_ciphertext_count.0, 0, "Empty iterator of CudaLweCiphertextList" ); + let stream_count = lwe_ciphertext_count.0.min(6); + let mut new_streams: Vec = Vec::with_capacity(stream_count); + + for _ in 0..stream_count { + let stream = CudaStreams::new_single_gpu(streams.gpu_indexes[0]); + new_streams.push(stream); + } + let first_item = cuda_ciphertexts_list_vec.next().unwrap(); let lwe_dimension = first_item.lwe_dimension(); let mut d_vec = CudaVec::new( @@ -123,25 +130,20 @@ impl CudaLweCiphertextList { * std::mem::size_of::(); // Concatenate gpu_index memory unsafe { - cuda_memcpy_async_gpu_to_gpu( - ptr, - first_item.0.d_vec.as_c_ptr(0), - size as u64, - streams.ptr[0], - streams.gpu_indexes[0].get(), - ); - ptr = ptr.wrapping_byte_add(size); - for list in cuda_ciphertexts_list_vec { + for (i, list) in cuda_ciphertexts_list_vec.enumerate() { cuda_memcpy_async_gpu_to_gpu( ptr, list.0.d_vec.as_c_ptr(0), size as u64, - streams.ptr[0], - streams.gpu_indexes[0].get(), + new_streams[i % stream_count].ptr[0], + new_streams[i % stream_count].gpu_indexes[0].get(), ); ptr = ptr.wrapping_byte_add(size); } } + for s in new_streams.iter() { + s.synchronize(); + } let cuda_lwe_list = CudaLweList { d_vec, diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index 497010eef4..de74cbcab5 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -840,6 +840,7 @@ pub unsafe fn add_lwe_ciphertext_vector_async( num_radix_blocks: num_samples, max_num_radix_blocks: num_samples, lwe_dimension: lwe_dimension.0 as u32, + num_radix_ciphertexts: 1u32, }; let lwe_array_in_1_data = CudaRadixCiphertextFFI { ptr: lwe_array_in_1.get_mut_c_ptr(0), @@ -848,6 +849,7 @@ pub unsafe fn add_lwe_ciphertext_vector_async( num_radix_blocks: num_samples, max_num_radix_blocks: num_samples, lwe_dimension: lwe_dimension.0 as u32, + num_radix_ciphertexts: 1u32, }; let lwe_array_in_2_data = CudaRadixCiphertextFFI { ptr: lwe_array_in_2.get_mut_c_ptr(0), @@ -856,6 +858,7 @@ pub unsafe fn add_lwe_ciphertext_vector_async( num_radix_blocks: num_samples, max_num_radix_blocks: num_samples, lwe_dimension: lwe_dimension.0 as u32, + num_radix_ciphertexts: 1u32, }; cuda_add_lwe_ciphertext_vector_64( streams.ptr[0], @@ -890,6 +893,7 @@ pub unsafe fn add_lwe_ciphertext_vector_assign_async( num_radix_blocks: num_samples, max_num_radix_blocks: num_samples, lwe_dimension: lwe_dimension.0 as u32, + num_radix_ciphertexts: 1u32, }; let lwe_array_in_data = CudaRadixCiphertextFFI { ptr: lwe_array_in.get_mut_c_ptr(0), @@ -898,6 +902,7 @@ pub unsafe fn add_lwe_ciphertext_vector_assign_async( num_radix_blocks: num_samples, max_num_radix_blocks: num_samples, lwe_dimension: lwe_dimension.0 as u32, + num_radix_ciphertexts: 1u32, }; cuda_add_lwe_ciphertext_vector_64( streams.ptr[0], 
diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs
index 497010eef4..de74cbcab5 100644
--- a/tfhe/src/core_crypto/gpu/mod.rs
+++ b/tfhe/src/core_crypto/gpu/mod.rs
@@ -840,6 +840,7 @@ pub unsafe fn add_lwe_ciphertext_vector_async<T: UnsignedInteger>(
         num_radix_blocks: num_samples,
         max_num_radix_blocks: num_samples,
         lwe_dimension: lwe_dimension.0 as u32,
+        num_radix_ciphertexts: 1u32,
     };
     let lwe_array_in_1_data = CudaRadixCiphertextFFI {
         ptr: lwe_array_in_1.get_mut_c_ptr(0),
@@ -848,6 +849,7 @@ pub unsafe fn add_lwe_ciphertext_vector_async<T: UnsignedInteger>(
         num_radix_blocks: num_samples,
         max_num_radix_blocks: num_samples,
         lwe_dimension: lwe_dimension.0 as u32,
+        num_radix_ciphertexts: 1u32,
     };
     let lwe_array_in_2_data = CudaRadixCiphertextFFI {
         ptr: lwe_array_in_2.get_mut_c_ptr(0),
@@ -856,6 +858,7 @@ pub unsafe fn add_lwe_ciphertext_vector_async<T: UnsignedInteger>(
         num_radix_blocks: num_samples,
         max_num_radix_blocks: num_samples,
         lwe_dimension: lwe_dimension.0 as u32,
+        num_radix_ciphertexts: 1u32,
     };
     cuda_add_lwe_ciphertext_vector_64(
         streams.ptr[0],
@@ -890,6 +893,7 @@ pub unsafe fn add_lwe_ciphertext_vector_assign_async<T: UnsignedInteger>(
         num_radix_blocks: num_samples,
         max_num_radix_blocks: num_samples,
         lwe_dimension: lwe_dimension.0 as u32,
+        num_radix_ciphertexts: 1u32,
     };
     let lwe_array_in_data = CudaRadixCiphertextFFI {
         ptr: lwe_array_in.get_mut_c_ptr(0),
@@ -898,6 +902,7 @@ pub unsafe fn add_lwe_ciphertext_vector_assign_async<T: UnsignedInteger>(
         num_radix_blocks: num_samples,
         max_num_radix_blocks: num_samples,
         lwe_dimension: lwe_dimension.0 as u32,
+        num_radix_ciphertexts: 1u32,
     };
     cuda_add_lwe_ciphertext_vector_64(
         streams.ptr[0],
diff --git a/tfhe/src/high_level_api/array/gpu/integers.rs b/tfhe/src/high_level_api/array/gpu/integers.rs
index 3fc4cbdc8a..53a023c62b 100644
--- a/tfhe/src/high_level_api/array/gpu/integers.rs
+++ b/tfhe/src/high_level_api/array/gpu/integers.rs
@@ -19,7 +19,8 @@ use crate::integer::block_decomposition::{
     DecomposableInto, RecomposableFrom, RecomposableSignedInteger,
 };
 use crate::integer::gpu::ciphertext::{
-    CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext,
+    CudaIntegerRadixCiphertext, CudaRadixCiphertext, CudaSignedRadixCiphertext,
+    CudaUnsignedRadixCiphertext,
 };
 use crate::integer::server_key::radix_parallel::scalar_div_mod::SignedReciprocable;
 use crate::integer::server_key::{Reciprocable, ScalarMultiplier};
@@ -83,6 +84,12 @@ impl<'a, T> TensorSlice<'a, GpuSlice<'a, T>> {
     pub fn par_iter(self) -> ParStridedIter<'a, T> {
         ParStridedIter::new(self.slice.0, self.dims.clone())
     }
+    pub fn len(&self) -> usize {
+        self.dims.flattened_len()
+    }
+    pub fn as_slice(&self) -> &'a [T] {
+        self.slice.0
+    }
 }
 
 impl<'a, T> TensorSlice<'a, GpuSliceMut<'a, T>> {
@@ -316,7 +323,25 @@ where
         lhs: TensorSlice<'_, Self::Slice<'a>>,
         rhs: TensorSlice<'_, Self::Slice<'a>>,
     ) -> Self::Owned {
-        par_map_sks_op_on_pair_of_elements(lhs, rhs, crate::integer::gpu::CudaServerKey::bitand)
+        GpuOwned(global_state::with_cuda_internal_keys(|cuda_key| {
+            let streams = &cuda_key.streams;
+            let num_ciphertexts = lhs.len() as u32;
+            let lhs_slice: &[T] = lhs.as_slice();
+            let rhs_slice: &[T] = rhs.as_slice();
+            let mut lhs_aligned = T::from(CudaRadixCiphertext::from_radix_ciphertext_vec(
+                lhs_slice, streams,
+            ));
+            let rhs_aligned = T::from(CudaRadixCiphertext::from_radix_ciphertext_vec(
+                rhs_slice, streams,
+            ));
+            crate::integer::gpu::CudaServerKey::bitand_vec(
+                cuda_key.pbs_key(),
+                &mut lhs_aligned,
+                &rhs_aligned,
+                num_ciphertexts,
+                streams,
+            )
+        }))
     }
 
     fn bitor<'a>(
diff --git a/tfhe/src/high_level_api/array/traits.rs b/tfhe/src/high_level_api/array/traits.rs
index fed40d1aab..1104cc634b 100644
--- a/tfhe/src/high_level_api/array/traits.rs
+++ b/tfhe/src/high_level_api/array/traits.rs
@@ -28,6 +28,12 @@ impl<'a, T> TensorSlice<'a, &'a [T]> {
     pub fn par_iter(self) -> ParStridedIter<'a, T> {
         ParStridedIter::new(self.slice, self.dims.clone())
     }
+    pub fn len(&self) -> usize {
+        self.dims.flattened_len()
+    }
+    pub fn as_slice(&self) -> &'a [T] {
+        self.slice
+    }
 }
 
 impl<'a, T> TensorSlice<'a, &'a mut [T]> {
diff --git a/tfhe/src/integer/gpu/ciphertext/mod.rs b/tfhe/src/integer/gpu/ciphertext/mod.rs
index 47bfb58fc7..f25bc164fd 100644
--- a/tfhe/src/integer/gpu/ciphertext/mod.rs
+++ b/tfhe/src/integer/gpu/ciphertext/mod.rs
@@ -7,7 +7,7 @@ pub mod squashed_noise;
 
 use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
 use crate::core_crypto::gpu::vec::CudaVec;
-use crate::core_crypto::gpu::CudaStreams;
+use crate::core_crypto::gpu::{CudaLweList, CudaStreams};
 use crate::core_crypto::prelude::{LweCiphertextList, LweCiphertextOwned};
 use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo};
 use crate::integer::parameters::LweDimension;
@@ -15,6 +15,7 @@ use crate::integer::{IntegerCiphertext, RadixCiphertext, SignedRadixCiphertext};
 use crate::shortint::{Ciphertext, EncryptionKeyChoice};
 use crate::GpuIndex;
 
+use crate::shortint::parameters::LweCiphertextCount;
 pub use compressed_noise_squashed_ciphertext_list::*;
 
 pub trait CudaIntegerRadixCiphertext: Sized {
@@ -70,8 +71,68 @@ pub trait CudaIntegerRadixCiphertext: Sized {
     fn gpu_indexes(&self) -> &[GpuIndex] {
         &self.as_ref().d_blocks.0.d_vec.gpu_indexes
     }
+
+    /// Converts a `CudaIntegerRadixCiphertext` holding
+    /// `num_blocks * num_radix_ciphertexts` LWEs into a `Vec` of
+    /// `num_radix_ciphertexts` ciphertexts of `num_blocks` LWEs each
+    fn to_integer_radix_ciphertext_vec(
+        &self,
+        num_radix_ciphertexts: u32,
+        streams: &CudaStreams,
+    ) -> Vec<Self> {
+        let total_blocks = self.as_ref().d_blocks.0.lwe_ciphertext_count.0;
+        assert_eq!(total_blocks % num_radix_ciphertexts as usize, 0, "Total number of blocks ({total_blocks}) is not divisible by the number of radix ciphertexts ({num_radix_ciphertexts})");
+
+        let num_blocks = total_blocks / num_radix_ciphertexts as usize;
+
+        let mut result = Vec::with_capacity(num_radix_ciphertexts as usize);
+        let lwe_dimension = self.as_ref().d_blocks.lwe_dimension();
+
+        for i in 0..num_radix_ciphertexts as usize {
+            let block_start = i * num_blocks;
+            let block_end = block_start + num_blocks;
+
+            let d_vec = unsafe {
+                let mut d_vec =
+                    CudaVec::new_async(lwe_dimension.to_lwe_size().0 * num_blocks, streams, 0);
+
+                let copy_start = block_start * lwe_dimension.to_lwe_size().0;
+                let copy_end = block_end * lwe_dimension.to_lwe_size().0;
+                d_vec.copy_src_range_gpu_to_gpu_async(
+                    copy_start..copy_end,
+                    &self.as_ref().d_blocks.0.d_vec,
+                    streams,
+                    0,
+                );
+
+                streams.synchronize();
+                d_vec
+            };
+            let lwe_list = CudaLweList::<u64> {
+                d_vec,
+                lwe_ciphertext_count: LweCiphertextCount(num_blocks),
+                lwe_dimension,
+                ciphertext_modulus: self.as_ref().d_blocks.ciphertext_modulus(),
+            };
+
+            // Copy the associated block metadata
+            let block_info = self.as_ref().info.blocks[block_start..block_end].to_vec();
+
+            let info = CudaRadixCiphertextInfo { blocks: block_info };
+
+            result.push(Self::from(CudaRadixCiphertext::new(
+                CudaLweCiphertextList(lwe_list),
+                info,
+            )));
+        }
+
+        result
+    }
 }
 
+/// Pointers to GPU memory, plus the associated metadata, representing an
+/// array of LWEs that make up one or more radix ciphertexts
 pub struct CudaRadixCiphertext {
     pub d_blocks: CudaLweCiphertextList,
     pub info: CudaRadixCiphertextInfo,
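For reference, the index arithmetic `to_integer_radix_ciphertext_vec` performs, as a std-only sketch (the dimension values are placeholders):

```rust
// Sketch only: splitting a packed list of num_radix_ciphertexts * num_blocks
// LWEs into per-ciphertext word ranges, each LWE being lwe_size =
// lwe_dimension + 1 u64 words, exactly as computed in the method above.
fn main() {
    let num_radix_ciphertexts = 4usize;
    let total_blocks = 32usize; // d_blocks.0.lwe_ciphertext_count.0
    let lwe_dimension = 879usize; // placeholder value
    assert_eq!(total_blocks % num_radix_ciphertexts, 0);

    let num_blocks = total_blocks / num_radix_ciphertexts;
    let lwe_size = lwe_dimension + 1;

    for i in 0..num_radix_ciphertexts {
        let block_start = i * num_blocks;
        let block_end = block_start + num_blocks;
        let copy_start = block_start * lwe_size;
        let copy_end = block_end * lwe_size;
        println!("ciphertext {i}: blocks {block_start}..{block_end}, words {copy_start}..{copy_end}");
    }
}
```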
diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs
index 80380b8104..6a4925beba 100644
--- a/tfhe/src/integer/gpu/mod.rs
+++ b/tfhe/src/integer/gpu/mod.rs
@@ -183,6 +183,25 @@ fn prepare_cuda_radix_ffi(
         num_radix_blocks: input.d_blocks.0.lwe_ciphertext_count.0 as u32,
         max_num_radix_blocks: input.d_blocks.0.lwe_ciphertext_count.0 as u32,
         lwe_dimension: input.d_blocks.0.lwe_dimension.0 as u32,
+        num_radix_ciphertexts: 1u32,
+    }
+}
+
+fn prepare_cuda_radix_vec_ffi(
+    input: &CudaRadixCiphertext,
+    degrees_vec: &mut Vec<u64>,
+    noise_levels_vec: &mut Vec<u64>,
+    num_radix_ciphertexts: u32,
+) -> CudaRadixCiphertextFFI {
+    CudaRadixCiphertextFFI {
+        ptr: input.d_blocks.0.d_vec.get_mut_c_ptr(0),
+        degrees: degrees_vec.as_mut_ptr(),
+        noise_levels: noise_levels_vec.as_mut_ptr(),
+        num_radix_blocks: input.d_blocks.0.lwe_ciphertext_count.0 as u32 / num_radix_ciphertexts,
+        max_num_radix_blocks: input.d_blocks.0.lwe_ciphertext_count.0 as u32
+            / num_radix_ciphertexts,
+        lwe_dimension: input.d_blocks.0.lwe_dimension.0 as u32,
+        num_radix_ciphertexts,
     }
 }
 
@@ -200,6 +219,7 @@ fn prepare_cuda_radix_ffi_from_slice(
         num_radix_blocks,
         max_num_radix_blocks: num_radix_blocks,
         lwe_dimension,
+        num_radix_ciphertexts: 1u32,
     }
 }
 
@@ -217,6 +237,7 @@ fn prepare_cuda_radix_ffi_from_slice_mut(
         num_radix_blocks,
         max_num_radix_blocks: num_radix_blocks,
         lwe_dimension,
+        num_radix_ciphertexts: 1u32,
     }
 }
 
@@ -7604,3 +7625,143 @@ pub unsafe fn expand_async(
     );
     cleanup_expand_without_verification_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr));
 }
+
+#[allow(clippy::too_many_arguments)]
+/// # Safety
+///
+/// This operation modifies GPU memory through raw pointers
+pub unsafe fn unchecked_bitop_vec_radix_kb_assign<T: UnsignedInteger, B: Numeric>(
+    streams: &CudaStreams,
+    radix_lwe_left: &mut CudaRadixCiphertext,
+    radix_lwe_right: &CudaRadixCiphertext,
+    bootstrapping_key: &CudaVec<B>,
+    keyswitch_key: &CudaVec<T>,
+    message_modulus: MessageModulus,
+    carry_modulus: CarryModulus,
+    glwe_dimension: GlweDimension,
+    polynomial_size: PolynomialSize,
+    big_lwe_dimension: LweDimension,
+    small_lwe_dimension: LweDimension,
+    ks_level: DecompositionLevelCount,
+    ks_base_log: DecompositionBaseLog,
+    pbs_level: DecompositionLevelCount,
+    pbs_base_log: DecompositionBaseLog,
+    op: BitOpType,
+    num_blocks: u32,
+    num_radix_ciphertexts: u32,
+    pbs_type: PBSType,
+    grouping_factor: LweBskGroupingFactor,
+    ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
+) {
+    assert_eq!(
+        streams.gpu_indexes[0],
+        radix_lwe_left.d_blocks.0.d_vec.gpu_index(0),
+        "GPU error: first stream is on GPU {}, first lhs pointer is on GPU {}",
+        streams.gpu_indexes[0].get(),
+        radix_lwe_left.d_blocks.0.d_vec.gpu_index(0).get(),
+    );
+    assert_eq!(
+        streams.gpu_indexes[0],
+        radix_lwe_right.d_blocks.0.d_vec.gpu_index(0),
+        "GPU error: first stream is on GPU {}, first rhs pointer is on GPU {}",
+        streams.gpu_indexes[0].get(),
+        radix_lwe_right.d_blocks.0.d_vec.gpu_index(0).get(),
+    );
+    assert_eq!(
+        streams.gpu_indexes[0],
+        bootstrapping_key.gpu_index(0),
+        "GPU error: first stream is on GPU {}, first bsk pointer is on GPU {}",
+        streams.gpu_indexes[0].get(),
+        bootstrapping_key.gpu_index(0).get(),
+    );
+    assert_eq!(
+        streams.gpu_indexes[0],
+        keyswitch_key.gpu_index(0),
+        "GPU error: first stream is on GPU {}, first ksk pointer is on GPU {}",
+        streams.gpu_indexes[0].get(),
+        keyswitch_key.gpu_index(0).get(),
+    );
+    let ct_modulus = radix_lwe_left
+        .d_blocks
+        .ciphertext_modulus()
+        .raw_modulus_float();
+    let (noise_reduction_type, ms_noise_reduction_key_ffi) =
+        resolve_ms_noise_reduction_config(ms_noise_reduction_configuration, ct_modulus);
+
+    let mut mem_ptr: *mut i8 = std::ptr::null_mut();
+    let mut radix_lwe_left_degrees = radix_lwe_left
+        .info
+        .blocks
+        .iter()
+        .map(|b| b.degree.0)
+        .collect();
+    let mut radix_lwe_left_noise_levels = radix_lwe_left
+        .info
+        .blocks
+        .iter()
+        .map(|b| b.noise_level.0)
+        .collect();
+    let mut cuda_ffi_radix_lwe_left = prepare_cuda_radix_vec_ffi(
+        radix_lwe_left,
+        &mut radix_lwe_left_degrees,
+        &mut radix_lwe_left_noise_levels,
+        num_radix_ciphertexts,
+    );
+    // Even though the right-hand input is not modified, its data is passed as
+    // mutable. This avoids maintaining two structs for the CudaRadixCiphertext
+    // pointers, one const and one mutable: with both variants on the Cuda
+    // side, we would have to guarantee the const structure is always the one
+    // passed as input, which leads to convoluted mutability handling in the
+    // C++ code.
+    let mut radix_lwe_right_degrees = radix_lwe_right
+        .info
+        .blocks
+        .iter()
+        .map(|b| b.degree.0)
+        .collect();
+    let mut radix_lwe_right_noise_levels = radix_lwe_right
+        .info
+        .blocks
+        .iter()
+        .map(|b| b.noise_level.0)
+        .collect();
+    let cuda_ffi_radix_lwe_right = prepare_cuda_radix_vec_ffi(
+        radix_lwe_right,
+        &mut radix_lwe_right_degrees,
+        &mut radix_lwe_right_noise_levels,
+        num_radix_ciphertexts,
+    );
+    scratch_cuda_integer_radix_bitop_kb_64(
+        streams.ffi(),
+        std::ptr::addr_of_mut!(mem_ptr),
+        glwe_dimension.0 as u32,
+        polynomial_size.0 as u32,
+        big_lwe_dimension.0 as u32,
+        small_lwe_dimension.0 as u32,
+        ks_level.0 as u32,
+        ks_base_log.0 as u32,
+        pbs_level.0 as u32,
+        pbs_base_log.0 as u32,
+        grouping_factor.0 as u32,
+        num_blocks * num_radix_ciphertexts,
+        message_modulus.0 as u32,
+        carry_modulus.0 as u32,
+        pbs_type as u32,
+        op as u32,
+        true,
+        noise_reduction_type as u32,
+    );
+    cuda_bitop_integer_radix_ciphertext_kb_64(
+        streams.ffi(),
+        &raw mut cuda_ffi_radix_lwe_left,
+        &raw const cuda_ffi_radix_lwe_left,
+        &raw const cuda_ffi_radix_lwe_right,
+        mem_ptr,
+        bootstrapping_key.ptr.as_ptr(),
+        keyswitch_key.ptr.as_ptr(),
+        &raw const ms_noise_reduction_key_ffi,
+    );
+    cleanup_cuda_integer_bitop(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr));
+    update_noise_degree(radix_lwe_left, &cuda_ffi_radix_lwe_left);
+    streams.synchronize();
+}
diff --git a/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs b/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs
index 2210b75c19..80dece9b6b 100644
--- a/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs
@@ -10,7 +10,8 @@ use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
 use crate::integer::gpu::server_key::CudaBootstrappingKey;
 use crate::integer::gpu::{
     get_bitop_integer_radix_kb_size_on_gpu, get_full_propagate_assign_size_on_gpu,
-    unchecked_bitop_integer_radix_kb_assign_async, BitOpType, CudaServerKey, PBSType,
+    unchecked_bitop_integer_radix_kb_assign_async, unchecked_bitop_vec_radix_kb_assign, BitOpType,
+    CudaServerKey, PBSType,
 };
 
 impl CudaServerKey {
@@ -977,4 +978,142 @@ impl CudaServerKey {
         let bitnot_mem = (lwe_ciphertext_count.0 * size_of::<u64>()) as u64;
         full_prop_mem.max(bitnot_mem)
     }
+
+    pub fn unchecked_bitop_vec_assign<T: CudaIntegerRadixCiphertext>(
+        &self,
+        ct_left: &mut T,
+        ct_right: &T,
+        op: BitOpType,
+        num_radix_ciphertexts: u32,
+        streams: &CudaStreams,
+    ) {
+        assert_eq!(
+            ct_left.as_ref().d_blocks.lwe_dimension(),
+            ct_right.as_ref().d_blocks.lwe_dimension()
+        );
+        assert_eq!(
+            ct_left.as_ref().d_blocks.lwe_ciphertext_count(),
+            ct_right.as_ref().d_blocks.lwe_ciphertext_count()
+        );
+
+        let num_blocks =
+            ct_left.as_ref().d_blocks.lwe_ciphertext_count().0 as u32 / num_radix_ciphertexts;
+
+        unsafe {
+            match &self.bootstrapping_key {
+                CudaBootstrappingKey::Classic(d_bsk) => {
+                    unchecked_bitop_vec_radix_kb_assign(
+                        streams,
+                        ct_left.as_mut(),
+                        ct_right.as_ref(),
+                        &d_bsk.d_vec,
+                        &self.key_switching_key.d_vec,
+                        self.message_modulus,
+                        self.carry_modulus,
+                        d_bsk.glwe_dimension,
+                        d_bsk.polynomial_size,
+                        self.key_switching_key
+                            .input_key_lwe_size()
+                            .to_lwe_dimension(),
+                        self.key_switching_key
+                            .output_key_lwe_size()
+                            .to_lwe_dimension(),
+                        self.key_switching_key.decomposition_level_count(),
+                        self.key_switching_key.decomposition_base_log(),
+                        d_bsk.decomp_level_count,
+                        d_bsk.decomp_base_log,
+                        op,
+                        num_blocks,
+                        num_radix_ciphertexts,
+                        PBSType::Classical,
+                        LweBskGroupingFactor(0),
+                        d_bsk.ms_noise_reduction_configuration.as_ref(),
+                    );
+                }
+                CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
+                    unchecked_bitop_vec_radix_kb_assign(
+                        streams,
+                        ct_left.as_mut(),
+                        ct_right.as_ref(),
+                        &d_multibit_bsk.d_vec,
+                        &self.key_switching_key.d_vec,
+                        self.message_modulus,
+                        self.carry_modulus,
+                        d_multibit_bsk.glwe_dimension,
+                        d_multibit_bsk.polynomial_size,
+                        self.key_switching_key
+                            .input_key_lwe_size()
+                            .to_lwe_dimension(),
+                        self.key_switching_key
+                            .output_key_lwe_size()
+                            .to_lwe_dimension(),
+                        self.key_switching_key.decomposition_level_count(),
+                        self.key_switching_key.decomposition_base_log(),
+                        d_multibit_bsk.decomp_level_count,
+                        d_multibit_bsk.decomp_base_log,
+                        op,
+                        num_blocks,
+                        num_radix_ciphertexts,
+                        PBSType::MultiBit,
+                        d_multibit_bsk.grouping_factor,
+                        None,
+                    );
+                }
+            }
+        }
+    }
+
+    pub fn unchecked_bitand_vec<T: CudaIntegerRadixCiphertext>(
+        &self,
+        ct_left: &T,
+        ct_right: &T,
+        num_radix_ciphertexts: u32,
+        streams: &CudaStreams,
+    ) -> T {
+        let mut result = unsafe { ct_left.duplicate_async(streams) };
+        self.unchecked_bitop_vec_assign(
+            &mut result,
+            ct_right,
+            BitOpType::And,
+            num_radix_ciphertexts,
+            streams,
+        );
+        result
+    }
+
+    pub fn bitand_vec<T: CudaIntegerRadixCiphertext>(
+        &self,
+        ct_left: &mut T,
+        ct_right: &T,
+        num_radix_ciphertexts: u32,
+        streams: &CudaStreams,
+    ) -> Vec<T> {
+        let mut tmp_rhs;
+
+        let (lhs, rhs) = unsafe {
+            match (
+                ct_left.block_carries_are_empty(),
+                ct_right.block_carries_are_empty(),
+            ) {
+                (true, true) => (ct_left, ct_right),
+                (true, false) => {
+                    tmp_rhs = ct_right.duplicate_async(streams);
+                    self.full_propagate_assign_async(&mut tmp_rhs, streams);
+                    (ct_left, &tmp_rhs)
+                }
+                (false, true) => {
+                    self.full_propagate_assign_async(ct_left, streams);
+                    (ct_left, ct_right)
+                }
+                (false, false) => {
+                    tmp_rhs = ct_right.duplicate_async(streams);
+
+                    self.full_propagate_assign_async(ct_left, streams);
+                    self.full_propagate_assign_async(&mut tmp_rhs, streams);
+                    (ct_left, &tmp_rhs)
+                }
+            }
+        };
+        let result = self.unchecked_bitand_vec(lhs, rhs, num_radix_ciphertexts, streams);
+        result.to_integer_radix_ciphertext_vec(num_radix_ciphertexts, streams)
+    }
 }
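`bitand_vec` mirrors the existing `bitand` entry point: an operand only goes through a full carry propagation when its block carries are not already empty. A std-only sketch of that dispatch, with booleans standing in for `block_carries_are_empty`:

```rust
// Sketch only: which operand(s) `bitand_vec` propagates before calling
// unchecked_bitand_vec. `true` means the operand's block carries are empty.
fn needs_propagation(lhs_clean: bool, rhs_clean: bool) -> (bool, bool) {
    match (lhs_clean, rhs_clean) {
        (true, true) => (false, false),  // both clean: no work
        (true, false) => (false, true),  // propagate a duplicated rhs
        (false, true) => (true, false),  // propagate lhs in place
        (false, false) => (true, true),  // propagate lhs and a duplicated rhs
    }
}

fn main() {
    assert_eq!(needs_propagation(true, true), (false, false));
    assert_eq!(needs_propagation(false, false), (true, true));
}
```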