Skip to content

Commit

Permalink
upstream update
Browse files Browse the repository at this point in the history
  • Loading branch information
krichprollsch committed Feb 11, 2025
1 parent c073b9e commit e87df11
Show file tree
Hide file tree
Showing 11 changed files with 3,946 additions and 346 deletions.
205 changes: 205 additions & 0 deletions checksum.zig
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
//! This file implements vsr.checksum. TigerBeetle uses this checksum to:
//!
//! - detect bitrot in data on disk,
//! - validate network messages before casting raw bytes to an `extern struct` type,
//! - hash-chain prepares and client requests to have strong consistency and ordering guarantees.
//!
//! As this checksum is stored on disk, it is set in stone and impossible to change.
//!
//! We need this checksum to be fast (it's in all our hotpaths) and strong (it's our ultimate line
//! of defense against storage failures and some classes of software bugs).
//!
//! Our checksum of choice is based on Aegis:
//!
//! <https://datatracker.ietf.org/doc/draft-irtf-cfrg-aegis-aead/>
//!
//! We use the implementation from the Zig standard library, but here's an overview of how the
//! thing works:
//!
//! - AES-block is a symmetric encryption primitive that is widely supported in hardware
//! (`vaesenc`, `vaesdec` instructions). Hardware acceleration is what provides speed.
//! - Aegis is a modern Authenticated Encryption with Associated Data (AEAD) scheme based on
//! AES-block.
//! - In AEAD, the user provides a key, a nonce, a secret message, and associated data, and gets
//! a ciphertext and an authentication tag back. Associated data is expected to be sent as plain
//! text (eg, it could be routing information). The tag authenticates _both_ the secret message
//! and associated data.
//! - AEAD can be specialized to be a MAC by using an empty secret message and zero nonce. NB:
//! in mac mode, message to sign is treated as AD, not as a secret message.
//! - A MAC can further be specialized to be a checksum by setting the secret key to zero.
//! And that's what we do here!

const std = @import("std");
const builtin = @import("builtin");
const mem = std.mem;
const testing = std.testing;
const assert = std.debug.assert;

const Aegis128LMac_128 = std.crypto.auth.aegis.Aegis128LMac_128;

var seed_once = std.once(seed_init);
var seed_state: Aegis128LMac_128 = undefined;

comptime {
    // As described above, TigerBeetle uses Aegis (and thus AES blocks) for its checksumming.
    // While there is a software implementation, it's much slower and we don't expect to ever be
    // using it considering we target platforms with AES hardware acceleration.
    //
    // If you're trying to compile TigerBeetle for an older CPU without AES hardware acceleration,
    // you'll need to disable the following check.
    //
    // `@compileError` is used so that the build fails with an explanation, rather than with the
    // generic diagnostic of a failed comptime assertion.
    if (!std.crypto.core.aes.has_hardware_support) {
        @compileError("checksum requires a target CPU with AES hardware acceleration");
    }
}

/// Initializes the shared `seed_state` from an all-zero 16-byte key.
/// Invoked through `seed_once`, see `ChecksumStream.init()`.
fn seed_init() void {
    const zero_key = [_]u8{0} ** 16;
    seed_state = Aegis128LMac_128.init(&zero_key);
}

/// Returns the 128-bit checksum of `source`.
///
/// The seeded Aegis state is initialized lazily and only once (see `ChecksumStream.init()`), so
/// each call copies that state instead of recomputing it, and then hashes `source` with the copy.
pub fn checksum(source: []const u8) u128 {
    // Aegis128 uses hardware accelerated AES via inline asm which isn't available at comptime.
    // For the empty input, use a hard-coded value instead and verify it via a test.
    if (@inComptime() and source.len == 0) return 0x49F174618255402DE6E7E3C40D60CC83;

    var hasher = ChecksumStream.init();
    hasher.add(source);
    return hasher.checksum();
}

// Verifies the hard-coded comptime checksum of the empty input against the value that the
// streaming implementation computes at runtime.
test "checksum empty" {
    var stream = ChecksumStream.init();
    stream.add(&.{});
    const computed = stream.checksum();

    const hard_coded = comptime checksum(&.{});
    try std.testing.expectEqual(computed, hard_coded);
}

/// Incremental form of `checksum()`: feed the input in pieces with `add()`, then call
/// `checksum()` once to obtain the result.
pub const ChecksumStream = struct {
    state: Aegis128LMac_128,

    /// Returns a stream that starts from a copy of the shared seed state.
    /// The seed state itself is initialized on the first call, via `seed_once`.
    pub fn init() ChecksumStream {
        seed_once.call();
        return .{ .state = seed_state };
    }

    /// Feeds `bytes` into the running checksum.
    pub fn add(stream: *ChecksumStream, bytes: []const u8) void {
        stream.state.update(bytes);
    }

    /// Finalizes the stream and returns the checksum.
    /// The stream is overwritten with `undefined` afterwards and must not be used again.
    pub fn checksum(stream: *ChecksumStream) u128 {
        var digest: [@sizeOf(u128)]u8 = undefined;
        stream.state.final(&digest);
        stream.* = undefined;
        // Reinterpret the tag bytes in native byte order.
        return @bitCast(digest);
    }
};

// Note: these test vectors are not independent --- there are test vectors in AEAD papers, but they
// don't zero all of (nonce, key, secret message). However, as the underlying AEAD implementation
// matches those test vectors, the entries here are correct.
//
// They can be used to smoke-test independent implementations of TigerBeetle checksum.
//
// The "checksum stability" test further nails down the exact behavior.
test "checksum test vectors" {
    const TestVector = struct {
        source: []const u8,
        hash: u128,
    };

    const vectors = [_]TestVector{
        .{
            .source = &[_]u8{0x00} ** 16,
            .hash = @byteSwap(@as(u128, 0xf72ad48dd05dd1656133101cd4be3a26)),
        },
        .{
            .source = &[_]u8{},
            .hash = @byteSwap(@as(u128, 0x83cc600dc4e3e7e62d4055826174f149)),
        },
    };

    for (vectors) |vector| {
        const actual = checksum(vector.source);
        try testing.expectEqual(vector.hash, actual);
    }
}

// Fuzz smoke test: for pseudo-random messages of pseudo-random length, the checksum must be
// repeatable, and must change when one byte of the message changes.
test "checksum simple fuzzing" {
    var prng = std.rand.DefaultPrng.init(42);

    const msg_min = 1;
    const msg_max = 1 * 1024 * 1024;

    // A single buffer is reused by every iteration; each message is a prefix of it.
    const msg_buf = try testing.allocator.alloc(u8, msg_max);
    defer testing.allocator.free(msg_buf);

    for (0..1_000) |_| {
        const msg_len = prng.random().intRangeAtMostBiased(usize, msg_min, msg_max);
        const msg = msg_buf[0..msg_len];
        prng.fill(msg);

        const msg_checksum = checksum(msg);

        // Sanity check that it's a pure function.
        const msg_checksum_again = checksum(msg);
        try testing.expectEqual(msg_checksum, msg_checksum_again);

        // Change the message and make sure the checksum changes.
        // The wrapping add guarantees that the chosen byte really differs from its old value.
        msg[prng.random().uintLessThan(usize, msg.len)] +%= 1;
        const changed_checksum = checksum(msg);
        try testing.expect(changed_checksum != msg_checksum);
    }
}

// Change detector test to ensure we don't inadvertently modify our checksum function.
//
// It checksums a fixed set of inputs, then checksums the resulting checksums and compares that
// final value against a hard-coded constant.
test "checksum stability" {
    var buf: [1024]u8 = undefined;
    var cases: [896]u128 = undefined;
    var case_index: usize = 0;

    // Zeros of various lengths.
    for (0..128) |len| {
        const message = buf[0..len];
        @memset(message, 0);

        cases[case_index] = checksum(message);
        case_index += 1;
    }

    // 64 bytes with exactly one bit set.
    for (0..64 * 8) |bit| {
        const message = buf[0..64];
        @memset(message, 0);
        message[@divFloor(bit, 8)] = @shlExact(@as(u8, 1), @as(u3, @intCast(bit % 8)));

        cases[case_index] = checksum(message);
        case_index += 1;
    }

    // Pseudo-random data from a specific PRNG of various lengths.
    var prng = std.rand.Xoshiro256.init(92);
    for (0..256) |index| {
        const message = buf[0 .. index + 13];
        prng.fill(message);

        cases[case_index] = checksum(message);
        case_index += 1;
    }

    // Every slot must have been written, otherwise the final hash would cover undefined memory.
    try testing.expectEqual(@as(usize, cases.len), case_index);

    // Sanity check that we are not getting trivial answers.
    for (cases, 0..) |case_a, i| {
        try testing.expect(case_a != 0);
        try testing.expect(case_a != std.math.maxInt(u128));
        for (cases[0..i]) |case_b| try testing.expect(case_a != case_b);
    }

    // Hash me, baby, one more time! If this final hash changes, we broke compatibility in a major
    // way.
    // The cases are hashed as raw bytes, so the expected value is only valid on little-endian.
    comptime assert(builtin.target.cpu.arch.endian() == .little);
    const hash = checksum(mem.sliceAsBytes(&cases));
    try testing.expectEqual(@as(u128, 0x82dcaacf4875b279446825b6830d1263), hash);
}
25 changes: 25 additions & 0 deletions constants.zig
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,28 @@ pub const sector_size = 4096;
/// when they were never written to disk.
pub const direct_io = true;
pub const direct_io_required = true;

/// The number of milliseconds between each replica tick, the basic unit of time in TigerBeetle.
/// Used to regulate heartbeats, retries and timeouts, all specified as multiples of a tick.
pub const tick_ms = 10;

/// TigerBeetle uses asserts proactively, unless they severely degrade performance. For production,
/// 5% slow down might be deemed critical, tests tolerate slowdowns up to 5x. Tests should be
/// reasonably fast to make deterministic simulation effective. `constants.verify` disambiguates
/// the two cases.
///
/// In the control plane (eg, vsr proper) assert unconditionally. Due to batching, control plane
/// overhead is negligible. It is acceptable to spend O(N) time to verify O(1) computation.
///
/// In the data plane (eg, lsm tree), finer grained judgement is required. Do an unconditional O(1)
/// assert before an O(N) loop (e.g, a bounds check). Inside the loop, it might or might not be
/// feasible to add an extra assert per iteration. In the latter case, guard the assert with `if
/// (constants.verify)`, but prefer an unconditional assert unless benchmarks prove it to be costly.
///
/// In the data plane, never use O(N) asserts for O(1) computations --- due to randomized testing
/// the overall coverage is proportional to the number of tests run. Slow thorough assertions
/// decrease the overall test coverage.
///
/// Specific data structures might use a comptime parameter, to enable extra costly verification
/// only during unit tests of the data structure.
pub const verify = false;
59 changes: 50 additions & 9 deletions fifo.zig
Original file line number Diff line number Diff line change
@@ -1,16 +1,28 @@
const std = @import("std");
const assert = std.debug.assert;

const constants = @import("./constants.zig");

/// An intrusive first in/first out linked list.
/// The element type T must have a field called "next" of type ?*T
pub fn FIFO(comptime T: type) type {
pub fn FIFOType(comptime T: type) type {
return struct {
const Self = @This();
const FIFO = @This();

in: ?*T = null,
out: ?*T = null,
count: u64 = 0,

// This should only be null if you're sure we'll never want to monitor `count`.
name: ?[]const u8,

// If the number of elements is large, the constants.verify check in push() can be too
// expensive. Allow the user to gate it. Could also be a comptime param?
verify_push: bool = true,

pub fn push(self: *FIFO, elem: *T) void {
if (constants.verify and self.verify_push) assert(!self.contains(elem));

pub fn push(self: *Self, elem: *T) void {
assert(elem.next == null);
if (self.in) |in| {
in.next = elem;
Expand All @@ -20,28 +32,43 @@ pub fn FIFO(comptime T: type) type {
self.in = elem;
self.out = elem;
}
self.count += 1;
}

pub fn pop(self: *Self) ?*T {
pub fn pop(self: *FIFO) ?*T {
const ret = self.out orelse return null;
self.out = ret.next;
ret.next = null;
if (self.in == ret) self.in = null;
self.count -= 1;
return ret;
}

pub fn peek(self: Self) ?*T {
pub fn peek_last(self: FIFO) ?*T {
return self.in;
}

pub fn peek(self: FIFO) ?*T {
return self.out;
}

pub fn empty(self: Self) bool {
pub fn empty(self: FIFO) bool {
return self.peek() == null;
}

/// Returns whether the linked list contains the given *exact element* (pointer comparison).
pub fn contains(self: *const FIFO, elem_needle: *const T) bool {
var iterator = self.peek();
while (iterator) |elem| : (iterator = elem.next) {
if (elem == elem_needle) return true;
}
return false;
}

/// Remove an element from the FIFO. Asserts that the element is
/// in the FIFO. This operation is O(N), if this is done often you
/// probably want a different data structure.
pub fn remove(self: *Self, to_remove: *T) void {
pub fn remove(self: *FIFO, to_remove: *T) void {
if (to_remove == self.out) {
_ = self.pop();
return;
Expand All @@ -52,14 +79,19 @@ pub fn FIFO(comptime T: type) type {
if (to_remove == self.in) self.in = elem;
elem.next = to_remove.next;
to_remove.next = null;
self.count -= 1;
break;
}
} else unreachable;
}

pub fn reset(self: *FIFO) void {
self.* = .{ .name = self.name };
}
};
}

test "push/pop/peek/remove/empty" {
test "FIFO: push/pop/peek/remove/empty" {
const testing = @import("std").testing;

const Foo = struct { next: ?*@This() = null };
Expand All @@ -68,24 +100,33 @@ test "push/pop/peek/remove/empty" {
var two: Foo = .{};
var three: Foo = .{};

var fifo: FIFO(Foo) = .{};
var fifo: FIFOType(Foo) = .{ .name = null };
try testing.expect(fifo.empty());

fifo.push(&one);
try testing.expect(!fifo.empty());
try testing.expectEqual(@as(?*Foo, &one), fifo.peek());
try testing.expect(fifo.contains(&one));
try testing.expect(!fifo.contains(&two));
try testing.expect(!fifo.contains(&three));

fifo.push(&two);
fifo.push(&three);
try testing.expect(!fifo.empty());
try testing.expectEqual(@as(?*Foo, &one), fifo.peek());
try testing.expect(fifo.contains(&one));
try testing.expect(fifo.contains(&two));
try testing.expect(fifo.contains(&three));

fifo.remove(&one);
try testing.expect(!fifo.empty());
try testing.expectEqual(@as(?*Foo, &two), fifo.pop());
try testing.expectEqual(@as(?*Foo, &three), fifo.pop());
try testing.expectEqual(@as(?*Foo, null), fifo.pop());
try testing.expect(fifo.empty());
try testing.expect(!fifo.contains(&one));
try testing.expect(!fifo.contains(&two));
try testing.expect(!fifo.contains(&three));

fifo.push(&one);
fifo.push(&two);
Expand Down
7 changes: 6 additions & 1 deletion io.zig
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
const std = @import("std");
const builtin = @import("builtin");
const assert = std.debug.assert;
const os = std.os;

const FIFO = @import("fifo.zig").FIFO;
Expand All @@ -13,6 +12,12 @@ pub const IO = switch (builtin.target.os.tag) {
else => @compileError("IO is not supported for platform"),
};

/// Three-way selector for direct I/O.
/// NOTE(review): judging by the variant names alone, these presumably mean "fail unless direct
/// I/O is available", "use direct I/O when available" and "never use direct I/O" --- confirm
/// against the code that consumes this enum, which is not visible here.
pub const DirectIO = enum {
direct_io_required,
direct_io_optional,
direct_io_disabled,
};

pub fn buffer_limit(buffer_len: usize) usize {
// Linux limits how much may be written in a `pwrite()/pread()` call, which is `0x7ffff000` on
// both 64-bit and 32-bit systems, due to using a signed C int as the return value, as well as
Expand Down
Loading

0 comments on commit e87df11

Please sign in to comment.