Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -1454,6 +1454,13 @@ bench_integer_aes256_gpu: install_rs_check_toolchain
--bench integer-aes256 \
--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --

# bench_integer_trivium_gpu
#   Runs the `integer-trivium` bench of the tfhe-benchmark package with the
#   `integer`, `internal-keycache` and `gpu` features under the
#   `release_lto_off` profile.
#   Prerequisite: install_rs_check_toolchain.
#   Environment: forwards $(RUSTFLAGS) as RUSTFLAGS and $(BENCH_TYPE) as
#   __TFHE_RS_BENCH_TYPE to the cargo invocation.
.PHONY: bench_integer_trivium_gpu # Run benchmarks for trivium on GPU backend
bench_integer_trivium_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-trivium \
--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --

.PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
bench_integer_multi_bit: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
Expand Down
1 change: 1 addition & 0 deletions backends/tfhe-cuda-backend/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ fn main() {
"cuda/include/integer/integer.h",
"cuda/include/integer/rerand.h",
"cuda/include/aes/aes.h",
"cuda/include/trivium/trivium.h",
"cuda/include/zk/zk.h",
"cuda/include/keyswitch/keyswitch.h",
"cuda/include/keyswitch/ks_enums.h",
Expand Down
24 changes: 24 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/trivium/trivium.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#ifndef TRIVIUM_H
#define TRIVIUM_H

#include "../integer/integer.h"

// C-linkage entry points for the 64-bit Trivium GPU implementation.
// The three functions share one opaque handle (`mem_ptr`): the scratch call
// and the cleanup call receive its address, the keystream call receives its
// value. The implementations cast that handle to
// `int_trivium_buffer<uint64_t> *`.
extern "C" {
// Builds an `int_radix_params` from the scalar cryptographic parameters and
// forwards to `scratch_cuda_trivium_encrypt<uint64_t>`, passing `mem_ptr`
// reinterpreted as `int_trivium_buffer<uint64_t> **`.
// Returns whatever that helper returns.
// NOTE(review): presumably the helper allocates the buffer into `*mem_ptr` and
// returns the tracked allocation size in bytes — confirm against trivium.cuh.
uint64_t scratch_cuda_trivium_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_inputs);

// Forwards to `host_trivium_generate_keystream<uint64_t>` with `mem_ptr`
// reinterpreted as `int_trivium_buffer<uint64_t> *` and `ksks` reinterpreted
// as `uint64_t *const *`; `bsks` is passed through unchanged.
// `key` and `iv` are read-only inputs, `keystream_output` is the destination.
// NOTE(review): the exact block layout expected for key / iv / output and the
// unit of `num_steps` are defined by the host function — confirm there.
void cuda_trivium_generate_keystream_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *keystream_output,
const CudaRadixCiphertextFFI *key, const CudaRadixCiphertextFFI *iv,
uint32_t num_inputs, uint32_t num_steps, int8_t *mem_ptr, void *const *bsks,
void *const *ksks);

// Calls `release` on the `int_trivium_buffer<uint64_t>` stored in
// `*mem_ptr_void`, deletes it, and resets `*mem_ptr_void` to nullptr.
void cleanup_cuda_trivium_64(CudaStreamsFFI streams, int8_t **mem_ptr_void);
}

#endif
305 changes: 305 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/trivium/trivium_utilities.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,305 @@
#ifndef TRIVIUM_UTILITIES_H
#define TRIVIUM_UTILITIES_H
#include "../integer/integer_utilities.h"

/// Struct to hold the LUTs.
template <typename Torus> struct int_trivium_lut_buffers {
// Bivariate AND Gate LUT:
// AND operation: f(a, b) = (a & 1) & (b & 1).
// This is a Bivariate PBS used for the non-linear parts of Trivium.
int_radix_lut<Torus> *and_lut;

// Univariate Identity LUT:
// MESSAGE EXTRACTION operation: f(x) = x & 1.
// This is a Univariate PBS used to "flush" the state: it resets the noise
// after additions and ensures the message stays within the binary message
// space.
int_radix_lut<Torus> *flush_lut;

int_trivium_lut_buffers(CudaStreams streams, const int_radix_params &params,
bool allocate_gpu_memory, uint32_t num_trivium_inputs,
uint64_t &size_tracker) {

constexpr uint32_t BATCH_SIZE = 64;
constexpr uint32_t MAX_AND_PER_STEP = 3;
uint32_t total_lut_ops = num_trivium_inputs * BATCH_SIZE * MAX_AND_PER_STEP;

this->and_lut = new int_radix_lut<Torus>(streams, params, 1, total_lut_ops,
allocate_gpu_memory, size_tracker);

std::function<Torus(Torus, Torus)> and_lambda =
[](Torus a, Torus b) -> Torus { return (a & 1) & (b & 1); };

generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, and_lambda, allocate_gpu_memory);

auto active_streams_and =
streams.active_gpu_subset(total_lut_ops, params.pbs_type);
this->and_lut->broadcast_lut(active_streams_and);
this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);

uint32_t total_flush_ops = num_trivium_inputs * BATCH_SIZE * 4;

this->flush_lut = new int_radix_lut<Torus>(
streams, params, 1, total_flush_ops, allocate_gpu_memory, size_tracker);

std::function<Torus(Torus)> flush_lambda = [](Torus x) -> Torus {
return x & 1;
};

generate_device_accumulator(
streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, flush_lambda, allocate_gpu_memory);

auto active_streams_flush =
streams.active_gpu_subset(total_flush_ops, params.pbs_type);
this->flush_lut->broadcast_lut(active_streams_flush);
this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
}

void release(CudaStreams streams) {
this->and_lut->release(streams);
delete this->and_lut;
this->and_lut = nullptr;

this->flush_lut->release(streams);
delete this->flush_lut;
this->flush_lut = nullptr;
}
};

/// Struct to hold the state and temporary workspaces required for
/// Trivium execution on the GPU.
///
/// This struct manages the memory for the internal registers (A, B, C),
/// temporary buffers used during the update function, and buffers used for
/// packing data before and after PBS.
template <typename Torus> struct int_trivium_state_workspaces {
// Trivium Internal State Registers:
// Register A: 93 bits
CudaRadixCiphertextFFI *a_reg;
// Register B: 84 bits
CudaRadixCiphertextFFI *b_reg;
// Register C: 111 bits
CudaRadixCiphertextFFI *c_reg;

// Shift Workspace:
// Used to manage bitshifting operations on the registers
CudaRadixCiphertextFFI *shift_workspace;

// Temporary Update Buffers:
// Intermediate buffers for the trivium update logic (t1, t2, t3)
CudaRadixCiphertextFFI *temp_t1;
CudaRadixCiphertextFFI *temp_t2;
CudaRadixCiphertextFFI *temp_t3;

// Buffers to hold the new values for the registers after an update step
CudaRadixCiphertextFFI *new_a;
CudaRadixCiphertextFFI *new_b;
CudaRadixCiphertextFFI *new_c;

// PBS Packing Buffers:
// Buffers for packing inputs into the bivariate lookup table (AND gate)
CudaRadixCiphertextFFI *packed_pbs_lhs;
CudaRadixCiphertextFFI *packed_pbs_rhs;
// Buffer for the output of the bivariate PBS
CudaRadixCiphertextFFI *packed_pbs_out;

// Flush/Cleanup Packing Buffers:
// Buffers for the "flush" LUT which cleans up noise after additions
CudaRadixCiphertextFFI *packed_flush_in;
CudaRadixCiphertextFFI *packed_flush_out;

int_trivium_state_workspaces(CudaStreams streams,
const int_radix_params &params,
bool allocate_gpu_memory, uint32_t num_inputs,
uint64_t &size_tracker) {

this->a_reg = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->a_reg, 93 * num_inputs,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);

this->b_reg = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->b_reg, 84 * num_inputs,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);

this->c_reg = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->c_reg, 111 * num_inputs,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);

this->shift_workspace = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->shift_workspace,
128 * num_inputs, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);

uint32_t batch_blocks = 64 * num_inputs;

this->temp_t1 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->temp_t1, batch_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);

this->temp_t2 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->temp_t2, batch_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);

this->temp_t3 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->temp_t3, batch_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);

this->new_a = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->new_a, batch_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);

this->new_b = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->new_b, batch_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);

this->new_c = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->new_c, batch_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);

this->packed_pbs_lhs = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->packed_pbs_lhs,
3 * batch_blocks, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);

this->packed_pbs_rhs = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->packed_pbs_rhs,
3 * batch_blocks, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);

this->packed_pbs_out = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->packed_pbs_out,
3 * batch_blocks, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);

this->packed_flush_in = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->packed_flush_in,
4 * batch_blocks, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);

this->packed_flush_out = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->packed_flush_out,
4 * batch_blocks, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
}

void release(CudaStreams streams, bool allocate_gpu_memory) {
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->a_reg, allocate_gpu_memory);
delete this->a_reg;

release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->b_reg, allocate_gpu_memory);
delete this->b_reg;

release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->c_reg, allocate_gpu_memory);
delete this->c_reg;

release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->shift_workspace, allocate_gpu_memory);
delete this->shift_workspace;

release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->temp_t1, allocate_gpu_memory);
delete this->temp_t1;

release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->temp_t2, allocate_gpu_memory);
delete this->temp_t2;

release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->temp_t3, allocate_gpu_memory);
delete this->temp_t3;

release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->new_a, allocate_gpu_memory);
delete this->new_a;

release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->new_b, allocate_gpu_memory);
delete this->new_b;

release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->new_c, allocate_gpu_memory);
delete this->new_c;

release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->packed_pbs_lhs, allocate_gpu_memory);
delete this->packed_pbs_lhs;

release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->packed_pbs_rhs, allocate_gpu_memory);
delete this->packed_pbs_rhs;

release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->packed_pbs_out, allocate_gpu_memory);
delete this->packed_pbs_out;

release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->packed_flush_in, allocate_gpu_memory);
delete this->packed_flush_in;

release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->packed_flush_out, allocate_gpu_memory);
delete this->packed_flush_out;
}
};

/// Top-level scratch object for Trivium on the GPU.
///
/// Bundles the parameters the buffer was created with together with the two
/// owned sub-objects: the lookup tables (`luts`) and the register/workspace
/// ciphertexts (`state`).
template <typename Torus> struct int_trivium_buffer {
  // Copy of the radix parameters passed to the constructor.
  int_radix_params params;
  // Remembered so that release() can forward it to the state workspaces.
  bool allocate_gpu_memory;
  // Number of Trivium inputs this buffer was sized for.
  uint32_t num_inputs;

  // Owned; created in the constructor, destroyed in release().
  int_trivium_lut_buffers<Torus> *luts;
  int_trivium_state_workspaces<Torus> *state;

  /// Stores the configuration, then builds the LUTs followed by the state
  /// workspaces. `size_tracker` is passed on to both sub-objects.
  int_trivium_buffer(CudaStreams streams, const int_radix_params &params,
                     bool allocate_gpu_memory, uint32_t num_inputs,
                     uint64_t &size_tracker)
      : params(params), allocate_gpu_memory(allocate_gpu_memory),
        num_inputs(num_inputs), luts(nullptr), state(nullptr) {
    this->luts = new int_trivium_lut_buffers<Torus>(
        streams, params, allocate_gpu_memory, num_inputs, size_tracker);

    this->state = new int_trivium_state_workspaces<Torus>(
        streams, params, allocate_gpu_memory, num_inputs, size_tracker);
  }

  /// Tears down the LUTs, then the state workspaces, and finally blocks until
  /// stream 0 on GPU 0 has finished.
  void release(CudaStreams streams) {
    this->luts->release(streams);
    delete this->luts;
    this->luts = nullptr;

    this->state->release(streams, this->allocate_gpu_memory);
    delete this->state;
    this->state = nullptr;

    // Wait for the work queued on stream 0 before returning to the caller.
    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
  }
};

#endif
45 changes: 45 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/trivium/trivium.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#include "../../include/trivium/trivium.h"
#include "trivium.cuh"

// FFI entry point: packs the scalar parameters into an `int_radix_params` and
// delegates to `scratch_cuda_trivium_encrypt<uint64_t>`.
//
// `mem_ptr` is the address of the caller's opaque handle; it is reinterpreted
// as `int_trivium_buffer<uint64_t> **` before being handed to the helper.
// Returns the helper's return value unchanged.
uint64_t scratch_cuda_trivium_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_inputs) {

  // NOTE(review): presumably the "big" LWE dimension expected by
  // int_radix_params — confirm against its constructor.
  const uint32_t glwe_poly_product = glwe_dimension * polynomial_size;

  int_radix_params radix_params(pbs_type, glwe_dimension, polynomial_size,
                                glwe_poly_product, lwe_dimension, ks_level,
                                ks_base_log, pbs_level, pbs_base_log,
                                grouping_factor, message_modulus, carry_modulus,
                                noise_reduction_type);

  auto **buffer_slot =
      reinterpret_cast<int_trivium_buffer<uint64_t> **>(mem_ptr);

  return scratch_cuda_trivium_encrypt<uint64_t>(
      CudaStreams(streams), buffer_slot, radix_params, allocate_gpu_memory,
      num_inputs);
}

// FFI entry point: recovers the typed Trivium buffer from the opaque
// `mem_ptr` handle and delegates to `host_trivium_generate_keystream<uint64_t>`.
//
// `ksks` is reinterpreted as an array of `uint64_t` pointers; `bsks` is passed
// through as-is.
void cuda_trivium_generate_keystream_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *keystream_output,
    const CudaRadixCiphertextFFI *key, const CudaRadixCiphertextFFI *iv,
    uint32_t num_inputs, uint32_t num_steps, int8_t *mem_ptr, void *const *bsks,
    void *const *ksks) {

  auto *trivium_mem = reinterpret_cast<int_trivium_buffer<uint64_t> *>(mem_ptr);
  auto typed_ksks = reinterpret_cast<uint64_t *const *>(ksks);

  host_trivium_generate_keystream<uint64_t>(
      CudaStreams(streams), keystream_output, key, iv, num_inputs, num_steps,
      trivium_mem, bsks, typed_ksks);
}

// FFI entry point: destroys the Trivium buffer referenced by `*mem_ptr_void`.
//
// The buffer's `release()` is called with the given streams, the object is
// deleted, and `*mem_ptr_void` is reset to nullptr so the caller's handle
// cannot dangle.
//
// A null `mem_ptr_void` or an already-null `*mem_ptr_void` is a no-op, which
// makes a repeated cleanup call on the same handle safe.
void cleanup_cuda_trivium_64(CudaStreamsFFI streams, int8_t **mem_ptr_void) {

  // Nothing to free: never initialised, or already cleaned up.
  if (mem_ptr_void == nullptr || *mem_ptr_void == nullptr)
    return;

  int_trivium_buffer<uint64_t> *mem_ptr =
      (int_trivium_buffer<uint64_t> *)(*mem_ptr_void);

  mem_ptr->release(CudaStreams(streams));

  delete mem_ptr;
  *mem_ptr_void = nullptr;
}
Loading
Loading