zama-ai · enzodimaria · Jan 15, 2026 · Dec 19, 2025
@@ -1454,6 +1454,13 @@ bench_integer_aes256_gpu: install_rs_check_toolchain
 	--bench integer-aes256 \
 	--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
 
+.PHONY: bench_integer_trivium_gpu # Run the integer-trivium benchmarks on the GPU backend
+bench_integer_trivium_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-trivium \
+	--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
+
 .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
 bench_integer_multi_bit: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \

@@ -86,6 +86,7 @@ fn main() {
             "cuda/include/integer/integer.h",
             "cuda/include/integer/rerand.h",
             "cuda/include/aes/aes.h",
+            "cuda/include/trivium/trivium.h",
             "cuda/include/zk/zk.h",
             "cuda/include/keyswitch/keyswitch.h",
             "cuda/include/keyswitch/ks_enums.h",

@@ -0,0 +1,24 @@
+#ifndef TRIVIUM_H
+#define TRIVIUM_H
+
+#include "../integer/integer.h"
+
+extern "C" {
+/// Sets up the scratch buffer used by the 64-bit Trivium entry points.
+///
+/// The arguments from `glwe_dimension` to `noise_reduction_type` describe the
+/// radix/PBS parameter set; `num_inputs` is forwarded to the buffer setup.
+/// The buffer is expected to be handed back through `*mem_ptr` as an opaque
+/// pointer, to be passed to `cuda_trivium_generate_keystream_64` and later
+/// freed with `cleanup_cuda_trivium_64`.
+///
+/// Returns the value produced by the internal scratch routine (presumably the
+/// tracked allocation size -- confirm against the implementation).
+uint64_t scratch_cuda_trivium_64(
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
+    uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_inputs);
+
+/// Generates a Trivium keystream into `keystream_output` from the encrypted
+/// `key` and `iv`.
+///
+/// `mem_ptr` is the opaque buffer obtained from `scratch_cuda_trivium_64`;
+/// `bsks` and `ksks` are the bootstrapping and keyswitching key arrays.
+/// `num_inputs` and `num_steps` are forwarded unchanged to the host
+/// implementation (their exact units are defined there).
+void cuda_trivium_generate_keystream_64(
+    CudaStreamsFFI streams,
+    CudaRadixCiphertextFFI *keystream_output,
+    const CudaRadixCiphertextFFI *key,
+    const CudaRadixCiphertextFFI *iv,
+    uint32_t num_inputs, uint32_t num_steps,
+    int8_t *mem_ptr,
+    void *const *bsks, void *const *ksks);
+
+/// Releases the Trivium scratch buffer stored in `*mem_ptr_void` and resets
+/// `*mem_ptr_void` to null.
+void cleanup_cuda_trivium_64(CudaStreamsFFI streams, int8_t **mem_ptr_void);
+}
+
+#endif
@@ -0,0 +1,305 @@
+#ifndef TRIVIUM_UTILITIES_H
+#define TRIVIUM_UTILITIES_H
+#include "../integer/integer_utilities.h"
+
+/// Lookup tables (LUTs) used by the Trivium GPU implementation.
+template <typename Torus> struct int_trivium_lut_buffers {
+  // Bivariate LUT implementing the AND gate f(a, b) = (a & 1) & (b & 1).
+  // It is applied through a bivariate PBS for the non-linear parts of Trivium.
+  int_radix_lut<Torus> *and_lut;
+
+  // Univariate LUT implementing message extraction f(x) = x & 1.
+  // Its PBS "flushes" the state: noise is reset after additions and the
+  // message is kept inside the binary message space.
+  int_radix_lut<Torus> *flush_lut;
+
+  /// Builds both LUTs.
+  ///
+  /// For each LUT: allocate the `int_radix_lut`, write its accumulator using
+  /// stream 0 / GPU 0, broadcast it to the active GPU subset for the LUT's
+  /// block count, then set up the gemm batch keyswitch temporary buffers.
+  /// `size_tracker` is passed by reference to every allocating call.
+  int_trivium_lut_buffers(CudaStreams streams, const int_radix_params &params,
+                          bool allocate_gpu_memory, uint32_t num_trivium_inputs,
+                          uint64_t &size_tracker) {
+
+    constexpr uint32_t BLOCKS_PER_INPUT = 64;
+    constexpr uint32_t AND_FACTOR = 3;
+    // Same factor as the one applied to the flush packing buffers.
+    constexpr uint32_t FLUSH_FACTOR = 4;
+
+    uint32_t batch_blocks = num_trivium_inputs * BLOCKS_PER_INPUT;
+
+    // ---- AND LUT (bivariate) ----
+    uint32_t and_block_count = batch_blocks * AND_FACTOR;
+
+    and_lut = new int_radix_lut<Torus>(streams, params, 1, and_block_count,
+                                       allocate_gpu_memory, size_tracker);
+
+    std::function<Torus(Torus, Torus)> and_gate =
+        [](Torus lhs, Torus rhs) -> Torus { return (lhs & 1) & (rhs & 1); };
+
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0), and_lut->get_lut(0, 0),
+        and_lut->get_degree(0), and_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, and_gate, allocate_gpu_memory);
+
+    auto and_active_streams =
+        streams.active_gpu_subset(and_block_count, params.pbs_type);
+    and_lut->broadcast_lut(and_active_streams);
+    and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
+
+    // ---- Flush LUT (univariate) ----
+    uint32_t flush_block_count = batch_blocks * FLUSH_FACTOR;
+
+    flush_lut = new int_radix_lut<Torus>(streams, params, 1, flush_block_count,
+                                         allocate_gpu_memory, size_tracker);
+
+    std::function<Torus(Torus)> extract_bit = [](Torus value) -> Torus {
+      return value & 1;
+    };
+
+    generate_device_accumulator(
+        streams.stream(0), streams.gpu_index(0), flush_lut->get_lut(0, 0),
+        flush_lut->get_degree(0), flush_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, extract_bit, allocate_gpu_memory);
+
+    auto flush_active_streams =
+        streams.active_gpu_subset(flush_block_count, params.pbs_type);
+    flush_lut->broadcast_lut(flush_active_streams);
+    flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
+  }
+
+  /// Releases and deletes both LUTs (AND first, then flush) and resets the
+  /// pointers to null.
+  void release(CudaStreams streams) {
+    and_lut->release(streams);
+    delete and_lut;
+    and_lut = nullptr;
+
+    flush_lut->release(streams);
+    delete flush_lut;
+    flush_lut = nullptr;
+  }
+};
+
+/// State and temporary workspaces required to run Trivium on the GPU.
+///
+/// Owns the ciphertext buffers for the internal registers (A, B, C), the
+/// temporaries used by the update function, and the buffers used to pack data
+/// before and after PBS. Every buffer is a heap-allocated
+/// `CudaRadixCiphertextFFI` created in the constructor and destroyed by
+/// `release`.
+template <typename Torus> struct int_trivium_state_workspaces {
+  // Trivium internal state registers.
+  // Register A: 93 bits
+  CudaRadixCiphertextFFI *a_reg;
+  // Register B: 84 bits
+  CudaRadixCiphertextFFI *b_reg;
+  // Register C: 111 bits
+  CudaRadixCiphertextFFI *c_reg;
+
+  // Shift workspace: used to manage bitshifting operations on the registers.
+  CudaRadixCiphertextFFI *shift_workspace;
+
+  // Temporary update buffers: intermediate values of the Trivium update
+  // logic (t1, t2, t3).
+  CudaRadixCiphertextFFI *temp_t1;
+  CudaRadixCiphertextFFI *temp_t2;
+  CudaRadixCiphertextFFI *temp_t3;
+
+  // New register values produced by an update step.
+  CudaRadixCiphertextFFI *new_a;
+  CudaRadixCiphertextFFI *new_b;
+  CudaRadixCiphertextFFI *new_c;
+
+  // PBS packing buffers: inputs of the bivariate lookup table (AND gate) ...
+  CudaRadixCiphertextFFI *packed_pbs_lhs;
+  CudaRadixCiphertextFFI *packed_pbs_rhs;
+  // ... and the output of the bivariate PBS.
+  CudaRadixCiphertextFFI *packed_pbs_out;
+
+  // Flush packing buffers: input/output of the "flush" LUT which cleans up
+  // noise after additions.
+  CudaRadixCiphertextFFI *packed_flush_in;
+  CudaRadixCiphertextFFI *packed_flush_out;
+
+  /// Allocates every workspace on stream 0 / GPU 0.
+  ///
+  /// All block counts scale linearly with `num_inputs`. `size_tracker` is
+  /// passed by reference to each allocation, and `allocate_gpu_memory` is
+  /// forwarded unchanged to `create_zero_radix_ciphertext_async`.
+  int_trivium_state_workspaces(CudaStreams streams,
+                               const int_radix_params &params,
+                               bool allocate_gpu_memory, uint32_t num_inputs,
+                               uint64_t &size_tracker) {
+
+    // Per-input block counts of each buffer family.
+    constexpr uint32_t A_REG_BLOCKS = 93;
+    constexpr uint32_t B_REG_BLOCKS = 84;
+    constexpr uint32_t C_REG_BLOCKS = 111;
+    constexpr uint32_t SHIFT_BLOCKS = 128;
+    constexpr uint32_t BATCH_SIZE = 64;
+    // Packing factors of the AND (bivariate PBS) and flush buffers.
+    constexpr uint32_t AND_PACK_FACTOR = 3;
+    constexpr uint32_t FLUSH_PACK_FACTOR = 4;
+
+    // Creates one radix ciphertext of `num_blocks` blocks through
+    // create_zero_radix_ciphertext_async and returns its host-side descriptor.
+    auto allocate = [&](uint32_t num_blocks) -> CudaRadixCiphertextFFI * {
+      auto *ct = new CudaRadixCiphertextFFI;
+      create_zero_radix_ciphertext_async<Torus>(
+          streams.stream(0), streams.gpu_index(0), ct, num_blocks,
+          params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+      return ct;
+    };
+
+    this->a_reg = allocate(A_REG_BLOCKS * num_inputs);
+    this->b_reg = allocate(B_REG_BLOCKS * num_inputs);
+    this->c_reg = allocate(C_REG_BLOCKS * num_inputs);
+
+    this->shift_workspace = allocate(SHIFT_BLOCKS * num_inputs);
+
+    uint32_t batch_blocks = BATCH_SIZE * num_inputs;
+
+    this->temp_t1 = allocate(batch_blocks);
+    this->temp_t2 = allocate(batch_blocks);
+    this->temp_t3 = allocate(batch_blocks);
+
+    this->new_a = allocate(batch_blocks);
+    this->new_b = allocate(batch_blocks);
+    this->new_c = allocate(batch_blocks);
+
+    this->packed_pbs_lhs = allocate(AND_PACK_FACTOR * batch_blocks);
+    this->packed_pbs_rhs = allocate(AND_PACK_FACTOR * batch_blocks);
+    this->packed_pbs_out = allocate(AND_PACK_FACTOR * batch_blocks);
+
+    this->packed_flush_in = allocate(FLUSH_PACK_FACTOR * batch_blocks);
+    this->packed_flush_out = allocate(FLUSH_PACK_FACTOR * batch_blocks);
+  }
+
+  /// Releases every workspace on stream 0 / GPU 0, deletes the host-side
+  /// descriptors and resets the pointers to null.
+  ///
+  /// `allocate_gpu_memory` must match the value used at construction; it is
+  /// forwarded to `release_radix_ciphertext_async`. Buffers that are already
+  /// null are skipped, so calling `release` a second time is a no-op.
+  void release(CudaStreams streams, bool allocate_gpu_memory) {
+    // Frees one buffer and nulls the owning pointer so that no dangling
+    // pointer is left behind.
+    auto destroy = [&](CudaRadixCiphertextFFI *&ct) {
+      if (ct == nullptr)
+        return;
+      release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                     ct, allocate_gpu_memory);
+      delete ct;
+      ct = nullptr;
+    };
+
+    destroy(this->a_reg);
+    destroy(this->b_reg);
+    destroy(this->c_reg);
+
+    destroy(this->shift_workspace);
+
+    destroy(this->temp_t1);
+    destroy(this->temp_t2);
+    destroy(this->temp_t3);
+
+    destroy(this->new_a);
+    destroy(this->new_b);
+    destroy(this->new_c);
+
+    destroy(this->packed_pbs_lhs);
+    destroy(this->packed_pbs_rhs);
+    destroy(this->packed_pbs_out);
+
+    destroy(this->packed_flush_in);
+    destroy(this->packed_flush_out);
+  }
+};
+
+/// Top-level scratch object for Trivium: keeps the parameter set together
+/// with the LUT buffers and the state workspaces built from it.
+template <typename Torus> struct int_trivium_buffer {
+  int_radix_params params;
+  bool allocate_gpu_memory;
+  uint32_t num_inputs;
+
+  int_trivium_lut_buffers<Torus> *luts;
+  int_trivium_state_workspaces<Torus> *state;
+
+  /// Stores the configuration, then builds the LUT buffers followed by the
+  /// state workspaces. Both receive the same `size_tracker` reference.
+  int_trivium_buffer(CudaStreams streams, const int_radix_params &params,
+                     bool allocate_gpu_memory, uint32_t num_inputs,
+                     uint64_t &size_tracker)
+      : params(params), allocate_gpu_memory(allocate_gpu_memory),
+        num_inputs(num_inputs),
+        luts(new int_trivium_lut_buffers<Torus>(
+            streams, params, allocate_gpu_memory, num_inputs, size_tracker)),
+        state(new int_trivium_state_workspaces<Torus>(
+            streams, params, allocate_gpu_memory, num_inputs, size_tracker)) {}
+
+  /// Releases the LUT buffers, then the state workspaces, and finally
+  /// synchronizes stream 0 on GPU 0 before returning.
+  void release(CudaStreams streams) {
+    this->luts->release(streams);
+    delete this->luts;
+    this->luts = nullptr;
+
+    this->state->release(streams, this->allocate_gpu_memory);
+    delete this->state;
+    this->state = nullptr;
+
+    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
+  }
+};
+
+#endif
@@ -0,0 +1,45 @@
+#include "../../include/trivium/trivium.h"
+#include "trivium.cuh"
+
+/// C entry point: builds the radix parameter set from the scalar arguments
+/// and forwards to `scratch_cuda_trivium_encrypt<uint64_t>`, whose result is
+/// returned unchanged. `mem_ptr` is reinterpreted as the address of an
+/// `int_trivium_buffer<uint64_t> *` slot.
+uint64_t scratch_cuda_trivium_64(
+    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_inputs) {
+
+  // Fourth constructor argument; presumably the big LWE dimension -- confirm
+  // against int_radix_params.
+  auto big_lwe_dimension = glwe_dimension * polynomial_size;
+
+  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
+                          big_lwe_dimension, lwe_dimension, ks_level,
+                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
+                          message_modulus, carry_modulus, noise_reduction_type);
+
+  auto **buffer_slot =
+      reinterpret_cast<int_trivium_buffer<uint64_t> **>(mem_ptr);
+
+  return scratch_cuda_trivium_encrypt<uint64_t>(
+      CudaStreams(streams), buffer_slot, params, allocate_gpu_memory,
+      num_inputs);
+}
+
+/// C entry point: reinterprets the opaque `mem_ptr` as an
+/// `int_trivium_buffer<uint64_t>` and the keyswitching keys as
+/// `uint64_t` arrays, then forwards everything to
+/// `host_trivium_generate_keystream<uint64_t>`.
+void cuda_trivium_generate_keystream_64(
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *keystream_output,
+    const CudaRadixCiphertextFFI *key, const CudaRadixCiphertextFFI *iv,
+    uint32_t num_inputs, uint32_t num_steps, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks) {
+
+  auto *trivium_mem = reinterpret_cast<int_trivium_buffer<uint64_t> *>(mem_ptr);
+  auto typed_ksks = reinterpret_cast<uint64_t *const *>(ksks);
+
+  host_trivium_generate_keystream<uint64_t>(
+      CudaStreams(streams), keystream_output, key, iv, num_inputs, num_steps,
+      trivium_mem, bsks, typed_ksks);
+}
+
+/// C entry point: releases the `int_trivium_buffer<uint64_t>` stored in
+/// `*mem_ptr_void`, deletes it and resets `*mem_ptr_void` to null.
+///
+/// A null `mem_ptr_void` or a null `*mem_ptr_void` is treated as "nothing to
+/// clean up". Since this function itself nulls the handle, this makes a
+/// repeated cleanup call a no-op instead of a null-pointer dereference.
+void cleanup_cuda_trivium_64(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
+
+  if (mem_ptr_void == nullptr || *mem_ptr_void == nullptr)
+    return;
+
+  auto *mem_ptr =
+      reinterpret_cast<int_trivium_buffer<uint64_t> *>(*mem_ptr_void);
+
+  mem_ptr->release(CudaStreams(streams));
+
+  delete mem_ptr;
+  *mem_ptr_void = nullptr;
+}