Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ typedef struct {
uint32_t num_radix_blocks;
uint32_t max_num_radix_blocks;
uint32_t lwe_dimension;
uint32_t num_radix_ciphertexts;
} CudaRadixCiphertextFFI;

typedef struct {
Expand Down
34 changes: 22 additions & 12 deletions backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -24,33 +24,43 @@ __host__ void host_integer_radix_bitop_kb(
lwe_array_out->num_radix_blocks == lwe_array_2->num_radix_blocks,
"Cuda error: input and output num radix blocks must be equal");

PANIC_IF_FALSE(
lwe_array_out->num_radix_ciphertexts ==
lwe_array_1->num_radix_ciphertexts &&
lwe_array_out->num_radix_ciphertexts ==
lwe_array_2->num_radix_ciphertexts,
"Cuda error: input and output num radix ciphertexts must be equal");

PANIC_IF_FALSE(lwe_array_out->lwe_dimension == lwe_array_1->lwe_dimension &&
lwe_array_out->lwe_dimension == lwe_array_2->lwe_dimension,
"Cuda error: input and output lwe dimension must be equal");

auto lut = mem_ptr->lut;
uint64_t degrees[lwe_array_1->num_radix_blocks];
uint64_t degrees[lwe_array_1->num_radix_blocks *
lwe_array_1->num_radix_ciphertexts];
if (mem_ptr->op == BITOP_TYPE::BITAND) {
update_degrees_after_bitand(degrees, lwe_array_1->degrees,
lwe_array_2->degrees,
lwe_array_1->num_radix_blocks);
update_degrees_after_bitand(
degrees, lwe_array_1->degrees, lwe_array_2->degrees,
lwe_array_1->num_radix_blocks * lwe_array_1->num_radix_ciphertexts);
} else if (mem_ptr->op == BITOP_TYPE::BITOR) {
update_degrees_after_bitor(degrees, lwe_array_1->degrees,
lwe_array_2->degrees,
lwe_array_1->num_radix_blocks);
update_degrees_after_bitor(
degrees, lwe_array_1->degrees, lwe_array_2->degrees,
lwe_array_1->num_radix_blocks * lwe_array_1->num_radix_ciphertexts);
} else if (mem_ptr->op == BITOP_TYPE::BITXOR) {
update_degrees_after_bitxor(degrees, lwe_array_1->degrees,
lwe_array_2->degrees,
lwe_array_1->num_radix_blocks);
update_degrees_after_bitxor(
degrees, lwe_array_1->degrees, lwe_array_2->degrees,
lwe_array_1->num_radix_blocks * lwe_array_1->num_radix_ciphertexts);
}

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, lwe_array_out, lwe_array_1, lwe_array_2, bsks, ksks,
ms_noise_reduction_key, lut, lwe_array_out->num_radix_blocks,
ms_noise_reduction_key, lut,
lwe_array_out->num_radix_blocks * lwe_array_out->num_radix_ciphertexts,
lut->params.message_modulus);

memcpy(lwe_array_out->degrees, degrees,
lwe_array_out->num_radix_blocks * sizeof(uint64_t));
lwe_array_out->num_radix_blocks *
lwe_array_out->num_radix_ciphertexts * sizeof(uint64_t));
}

template <typename Torus>
Expand Down
23 changes: 15 additions & 8 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -417,9 +417,12 @@ __host__ void host_pack_bivariate_blocks(
lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
PANIC("Cuda error: input and output radix ciphertexts should have the same "
"lwe dimension")
if (num_radix_blocks > lwe_array_out->num_radix_blocks ||
num_radix_blocks > lwe_array_1->num_radix_blocks ||
num_radix_blocks > lwe_array_2->num_radix_blocks)
if (num_radix_blocks > lwe_array_out->num_radix_blocks *
lwe_array_out->num_radix_ciphertexts ||
num_radix_blocks >
lwe_array_1->num_radix_blocks * lwe_array_1->num_radix_ciphertexts ||
num_radix_blocks >
lwe_array_2->num_radix_blocks * lwe_array_2->num_radix_ciphertexts)
PANIC("Cuda error: num radix blocks on which packing is applied should be "
"smaller or equal to the number of input & output radix blocks")

Expand Down Expand Up @@ -530,7 +533,8 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
if (num_radix_blocks > lut->num_blocks)
PANIC("Cuda error: num radix blocks on which lut is applied should be "
"smaller or equal to the number of lut radix blocks")
if (num_radix_blocks > lwe_array_out->num_radix_blocks)
if (num_radix_blocks >
lwe_array_out->num_radix_blocks * lwe_array_out->num_radix_ciphertexts)
PANIC("Cuda error: num radix blocks on which lut is applied should be "
"smaller or equal to the number of input & output radix blocks")

Expand Down Expand Up @@ -756,11 +760,14 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
if (num_radix_blocks > lut->num_blocks)
PANIC("Cuda error: num radix blocks on which lut is applied should be "
"smaller or equal to the number of lut radix blocks")
if (num_radix_blocks > lwe_array_out->num_radix_blocks ||
num_radix_blocks > lwe_array_1->num_radix_blocks ||
num_radix_blocks > lwe_array_2->num_radix_blocks)
if (num_radix_blocks > lwe_array_out->num_radix_blocks *
lwe_array_out->num_radix_ciphertexts ||
num_radix_blocks >
lwe_array_1->num_radix_blocks * lwe_array_1->num_radix_ciphertexts ||
num_radix_blocks >
lwe_array_2->num_radix_blocks * lwe_array_2->num_radix_ciphertexts)
PANIC("Cuda error: num radix blocks on which lut is applied should be "
"smaller or equal to the number of input & output radix blocks")
"smaller or equal to the number of total input & output radix blocks")

auto params = lut->params;
auto pbs_type = params.pbs_type;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ void into_radix_ciphertext(CudaRadixCiphertextFFI *radix, void *lwe_array,
radix->num_radix_blocks = num_radix_blocks;
radix->max_num_radix_blocks = num_radix_blocks;
radix->ptr = lwe_array;
radix->num_radix_ciphertexts = 1;

radix->degrees = (uint64_t *)(calloc(num_radix_blocks, sizeof(uint64_t)));
radix->noise_levels =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ void create_zero_radix_ciphertext_async(cudaStream_t const stream,
radix->lwe_dimension = lwe_dimension;
radix->num_radix_blocks = num_radix_blocks;
radix->max_num_radix_blocks = num_radix_blocks;
radix->num_radix_ciphertexts = 1;
uint64_t size = (lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
radix->ptr = (void *)cuda_malloc_with_size_tracking_async(
size, stream, gpu_index, size_tracker, allocate_gpu_memory);
Expand Down Expand Up @@ -63,6 +64,7 @@ void as_radix_ciphertext_slice(CudaRadixCiphertextFFI *output_radix,

auto lwe_size = input_radix->lwe_dimension + 1;
output_radix->num_radix_blocks = end_input_lwe_index - start_input_lwe_index;
output_radix->num_radix_ciphertexts = input_radix->num_radix_ciphertexts;
output_radix->max_num_radix_blocks = input_radix->max_num_radix_blocks;
output_radix->lwe_dimension = input_radix->lwe_dimension;
Torus *in_ptr = (Torus *)input_radix->ptr;
Expand Down
3 changes: 3 additions & 0 deletions backends/tfhe-cuda-backend/src/bindings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ pub struct CudaRadixCiphertextFFI {
pub num_radix_blocks: u32,
pub max_num_radix_blocks: u32,
pub lwe_dimension: u32,
pub num_radix_ciphertexts: u32,
}
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
const _: () = {
Expand All @@ -201,6 +202,8 @@ const _: () = {
[::std::mem::offset_of!(CudaRadixCiphertextFFI, max_num_radix_blocks) - 28usize];
["Offset of field: CudaRadixCiphertextFFI::lwe_dimension"]
[::std::mem::offset_of!(CudaRadixCiphertextFFI, lwe_dimension) - 32usize];
["Offset of field: CudaRadixCiphertextFFI::num_radix_ciphertexts"]
[::std::mem::offset_of!(CudaRadixCiphertextFFI, num_radix_ciphertexts) - 36usize];
};
#[repr(C)]
#[derive(Debug, Copy, Clone)]
Expand Down
6 changes: 6 additions & 0 deletions tfhe-benchmark/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,12 @@ path = "benches/high_level_api/noise_squash.rs"
harness = false
required-features = ["integer", "internal-keycache"]

# GPU-only benchmark: benches/high_level_api/arrays.rs imports
# `tfhe::array::GpuFheUint64Array` unconditionally and only defines `main`
# under `feature = "gpu"`, so the target must be skipped when `gpu` is off.
[[bench]]
name = "hlapi-arrays"
path = "benches/high_level_api/arrays.rs"
harness = false
required-features = ["integer", "gpu", "internal-keycache"]

[[bench]]
name = "glwe_packing_compression-integer-bench"
path = "benches/integer/glwe_packing_compression.rs"
Expand Down
63 changes: 63 additions & 0 deletions tfhe-benchmark/benches/high_level_api/arrays.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
use benchmark::params_aliases::BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
use benchmark::utilities::{write_to_json, OperatorType};
use criterion::Criterion;
use rand::prelude::*;
use tfhe::array::GpuFheUint64Array;
use tfhe::keycache::NamedParam;
use tfhe::prelude::*;
use tfhe::{ClientKey, CompressedServerKey};

/// Benchmarks the element-wise `&` of two encrypted `GpuFheUint64Array`s.
///
/// The function generates a client key from the GPU multi-bit benchmark
/// parameters, installs the decompressed server key, encrypts two
/// `array_dim x array_dim` arrays of random `u64` values, measures `&xs & &ys`
/// with Criterion, and finally records the benchmark metadata through
/// `write_to_json`.
///
/// # Panics
///
/// Panics if either clear array cannot be encrypted with the requested shape.
#[cfg(feature = "gpu")]
fn main() {
    let cks = {
        use tfhe::{set_server_key, ConfigBuilder};

        let config = ConfigBuilder::with_custom_parameters(
            BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
        )
        .build();
        let cks = ClientKey::generate(config);
        let compressed_sks = CompressedServerKey::new(&cks);

        // Operators on the encrypted arrays rely on the globally installed server key.
        set_server_key(compressed_sks.decompress_to_gpu());
        cks
    };

    // Both operands are square arrays of `array_dim * array_dim` elements.
    let array_dim = 32;
    let num_elems = array_dim * array_dim;
    let mut rng = thread_rng();
    let clear_xs = (0..num_elems as u64)
        .map(|_| rng.gen::<u64>())
        .collect::<Vec<_>>();
    let clear_ys = (0..num_elems as u64)
        .map(|_| rng.gen::<u64>())
        .collect::<Vec<_>>();

    let xs =
        GpuFheUint64Array::try_encrypt((clear_xs.as_slice(), vec![array_dim, array_dim]), &cks)
            .expect("clear_xs holds exactly array_dim * array_dim elements");
    let ys =
        GpuFheUint64Array::try_encrypt((clear_ys.as_slice(), vec![array_dim, array_dim]), &cks)
            .expect("clear_ys holds exactly array_dim * array_dim elements");

    let mut c = Criterion::default().configure_from_args();
    // Plain literal: there is nothing to interpolate, so `format!` is not needed.
    let bench_id = "bench::hlapi::array::cuda::bitand::";
    c.bench_function(bench_id, |b| {
        // Returning the result hands it to Criterion, which passes it through
        // `black_box` before dropping it.
        b.iter(|| &xs & &ys)
    });

    let params = cks.computation_parameters();

    write_to_json::<u64, _>(
        bench_id,
        params,
        params.name(),
        // Display name of the measured operation (was a leftover from the ERC20 bench).
        "bitand",
        &OperatorType::Atomic,
        64,
        vec![],
    );

    c.final_summary();
}
26 changes: 14 additions & 12 deletions tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,19 @@ impl<T: UnsignedInteger> CudaLweCiphertextList<T> {
.map(|list| list.0.lwe_ciphertext_count.0)
.sum(),
);

assert_ne!(
lwe_ciphertext_count.0, 0,
"Empty iterator of CudaLweCiphertextList"
);

let stream_count = lwe_ciphertext_count.0.min(6);
let mut new_streams: Vec<CudaStreams> = Vec::with_capacity(stream_count);

for _ in 0..stream_count {
let stream = CudaStreams::new_single_gpu(streams.gpu_indexes[0]);
new_streams.push(stream);
}

let first_item = cuda_ciphertexts_list_vec.next().unwrap();
let lwe_dimension = first_item.lwe_dimension();
let mut d_vec = CudaVec::new(
Expand All @@ -123,25 +130,20 @@ impl<T: UnsignedInteger> CudaLweCiphertextList<T> {
* std::mem::size_of::<T>();
// Concatenate gpu_index memory
unsafe {
cuda_memcpy_async_gpu_to_gpu(
ptr,
first_item.0.d_vec.as_c_ptr(0),
size as u64,
streams.ptr[0],
streams.gpu_indexes[0].get(),
);
ptr = ptr.wrapping_byte_add(size);
for list in cuda_ciphertexts_list_vec {
for (i, list) in cuda_ciphertexts_list_vec.enumerate() {
cuda_memcpy_async_gpu_to_gpu(
ptr,
list.0.d_vec.as_c_ptr(0),
size as u64,
streams.ptr[0],
streams.gpu_indexes[0].get(),
new_streams[i % stream_count].ptr[0],
new_streams[i % stream_count].gpu_indexes[0].get(),
);
ptr = ptr.wrapping_byte_add(size);
}
}
for s in new_streams.iter() {
s.synchronize();
}

let cuda_lwe_list = CudaLweList {
d_vec,
Expand Down
5 changes: 5 additions & 0 deletions tfhe/src/core_crypto/gpu/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -840,6 +840,7 @@ pub unsafe fn add_lwe_ciphertext_vector_async<T: UnsignedInteger>(
num_radix_blocks: num_samples,
max_num_radix_blocks: num_samples,
lwe_dimension: lwe_dimension.0 as u32,
num_radix_ciphertexts: 1u32,
};
let lwe_array_in_1_data = CudaRadixCiphertextFFI {
ptr: lwe_array_in_1.get_mut_c_ptr(0),
Expand All @@ -848,6 +849,7 @@ pub unsafe fn add_lwe_ciphertext_vector_async<T: UnsignedInteger>(
num_radix_blocks: num_samples,
max_num_radix_blocks: num_samples,
lwe_dimension: lwe_dimension.0 as u32,
num_radix_ciphertexts: 1u32,
};
let lwe_array_in_2_data = CudaRadixCiphertextFFI {
ptr: lwe_array_in_2.get_mut_c_ptr(0),
Expand All @@ -856,6 +858,7 @@ pub unsafe fn add_lwe_ciphertext_vector_async<T: UnsignedInteger>(
num_radix_blocks: num_samples,
max_num_radix_blocks: num_samples,
lwe_dimension: lwe_dimension.0 as u32,
num_radix_ciphertexts: 1u32,
};
cuda_add_lwe_ciphertext_vector_64(
streams.ptr[0],
Expand Down Expand Up @@ -890,6 +893,7 @@ pub unsafe fn add_lwe_ciphertext_vector_assign_async<T: UnsignedInteger>(
num_radix_blocks: num_samples,
max_num_radix_blocks: num_samples,
lwe_dimension: lwe_dimension.0 as u32,
num_radix_ciphertexts: 1u32,
};
let lwe_array_in_data = CudaRadixCiphertextFFI {
ptr: lwe_array_in.get_mut_c_ptr(0),
Expand All @@ -898,6 +902,7 @@ pub unsafe fn add_lwe_ciphertext_vector_assign_async<T: UnsignedInteger>(
num_radix_blocks: num_samples,
max_num_radix_blocks: num_samples,
lwe_dimension: lwe_dimension.0 as u32,
num_radix_ciphertexts: 1u32,
};
cuda_add_lwe_ciphertext_vector_64(
streams.ptr[0],
Expand Down
29 changes: 27 additions & 2 deletions tfhe/src/high_level_api/array/gpu/integers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ use crate::integer::block_decomposition::{
DecomposableInto, RecomposableFrom, RecomposableSignedInteger,
};
use crate::integer::gpu::ciphertext::{
CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext,
CudaIntegerRadixCiphertext, CudaRadixCiphertext, CudaSignedRadixCiphertext,
CudaUnsignedRadixCiphertext,
};
use crate::integer::server_key::radix_parallel::scalar_div_mod::SignedReciprocable;
use crate::integer::server_key::{Reciprocable, ScalarMultiplier};
Expand Down Expand Up @@ -83,6 +84,12 @@ impl<'a, T> TensorSlice<'a, GpuSlice<'a, T>> {
pub fn par_iter(self) -> ParStridedIter<'a, T> {
ParStridedIter::new(self.slice.0, self.dims.clone())
}
pub fn len(&self) -> usize {
self.dims.flattened_len()
}
/// Returns the slice wrapped by this view, borrowed for the view's lifetime `'a`.
///
/// NOTE(review): `dims` is not applied here, whereas `par_iter` walks the same
/// storage through `ParStridedIter` using `dims`. Presumably the returned slice
/// only matches the logical element order for contiguous views; confirm
/// before relying on it for strided ones.
pub fn as_slice(&self) -> &'a [T] {
self.slice.0
}
}

impl<'a, T> TensorSlice<'a, GpuSliceMut<'a, T>> {
Expand Down Expand Up @@ -316,7 +323,25 @@ where
lhs: TensorSlice<'_, Self::Slice<'a>>,
rhs: TensorSlice<'_, Self::Slice<'a>>,
) -> Self::Owned {
par_map_sks_op_on_pair_of_elements(lhs, rhs, crate::integer::gpu::CudaServerKey::bitand)
GpuOwned(global_state::with_cuda_internal_keys(|cuda_key| {
let streams = &cuda_key.streams;
let num_ciphertexts = lhs.len() as u32;
let lhs_slice: &[T] = lhs.as_slice();
let rhs_slice: &[T] = rhs.as_slice();
let mut lhs_aligned = T::from(CudaRadixCiphertext::from_radix_ciphertext_vec(
lhs_slice, streams,
));
let rhs_aligned = T::from(CudaRadixCiphertext::from_radix_ciphertext_vec(
rhs_slice, streams,
));
crate::integer::gpu::CudaServerKey::bitand_vec(
cuda_key.pbs_key(),
&mut lhs_aligned,
&rhs_aligned,
num_ciphertexts,
streams,
)
}))
}

fn bitor<'a>(
Expand Down
6 changes: 6 additions & 0 deletions tfhe/src/high_level_api/array/traits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@ impl<'a, T> TensorSlice<'a, &'a [T]> {
pub fn par_iter(self) -> ParStridedIter<'a, T> {
ParStridedIter::new(self.slice, self.dims.clone())
}
pub fn len(&self) -> usize {
self.dims.flattened_len()
}
/// Returns the slice backing this view, borrowed for the view's lifetime `'a`.
///
/// NOTE(review): `dims` is not applied here, whereas `par_iter` walks the same
/// storage through `ParStridedIter` using `dims`. Presumably the returned slice
/// only matches the logical element order for contiguous views; confirm
/// before relying on it for strided ones.
pub fn as_slice(&self) -> &'a [T] {
self.slice
}
}

impl<'a, T> TensorSlice<'a, &'a mut [T]> {
Expand Down
Loading
Loading