Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

upstream update #13

Merged
merged 2 commits into from
Feb 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
205 changes: 205 additions & 0 deletions checksum.zig
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
//! This file implements vsr.checksum. TigerBeetle uses this checksum to:
//!
//! - detect bitrot in data on disk,
//! - validate network messages before casting raw bytes to an `extern struct` type,
//! - hash-chain prepares and client requests to have strong consistency and ordering guarantees.
//!
//! As this checksum is stored on disk, it is set in stone and impossible to change.
//!
//! We need this checksum to be fast (it's in all our hotpaths) and strong (it's our ultimate line
//! of defense against storage failures and some classes of software bugs).
//!
//! Our checksum of choice is based on Aegis:
//!
//! <https://datatracker.ietf.org/doc/draft-irtf-cfrg-aegis-aead/>
//!
//! We use the implementation from the Zig standard library, but here's an overview of how the
//! thing works:
//!
//! - AES-block is a symmetric encryption primitive that is widely supported in hardware
//! (`vaesenc`, `vaesdec` instructions). Hardware acceleration is what provides speed.
//! - Aegis is a modern Authenticated Encryption with Associated Data (AEAD) scheme based on
//! AES-block.
//! - In AEAD, the user provides a key, a nonce, a secret message, and associated data, and gets
//! a ciphertext and an authentication tag back. Associated data is expected to be sent as plain
//! text (eg, it could be routing information). The tag authenticates _both_ the secret message
//! and associated data.
//! - AEAD can be specialized to be a MAC by using an empty secret message and zero nonce. NB:
//! in mac mode, message to sign is treated as AD, not as a secret message.
//! - A MAC can further be specialized to be a checksum by setting the secret key to zero.
//! And that's what we do here!

const std = @import("std");
const builtin = @import("builtin");
const mem = std.mem;
const testing = std.testing;
const assert = std.debug.assert;

const Aegis128LMac_128 = std.crypto.auth.aegis.Aegis128LMac_128;

// Guard that invokes `seed_init` the first time `seed_once.call()` is reached (see
// `ChecksumStream.init`); later calls do not run it again.
var seed_once = std.once(seed_init);
// Template MAC state. Left undefined here and assigned by `seed_init`; `ChecksumStream.init`
// copies it into each new stream.
var seed_state: Aegis128LMac_128 = undefined;

comptime {
    // Checksumming is built on Aegis, which in turn is built on AES blocks (see the module docs).
    // A software AES implementation exists, but it is much slower and is not expected to be used,
    // since the targeted platforms provide AES hardware acceleration.
    //
    // To compile TigerBeetle for an older CPU without AES hardware acceleration, disable the
    // assert below.
    const aes_has_hardware_support = std.crypto.core.aes.has_hardware_support;
    assert(aes_has_hardware_support);
}

/// Assigns `seed_state` an Aegis-128L MAC state initialized with a key of sixteen zero bytes.
fn seed_init() void {
    const zero_key = [_]u8{0} ** 16;
    seed_state = Aegis128LMac_128.init(&zero_key);
}

/// Returns the 128-bit checksum of `source`.
///
/// The Aegis state is initialized lazily (once) by `ChecksumStream.init` rather than being
/// recomputed on every call; each call then hashes `source` with its own copy of that state.
pub fn checksum(source: []const u8) u128 {
    // Aegis128 uses hardware accelerated AES via inline asm, which isn't available at comptime.
    // For the empty input, return a hard-coded value instead; the "checksum empty" test verifies
    // it against the runtime implementation.
    if (@inComptime() and source.len == 0) return 0x49F174618255402DE6E7E3C40D60CC83;

    var hasher = ChecksumStream.init();
    hasher.add(source);
    return hasher.checksum();
}

test "checksum empty" {
    // `checksum` returns a hard-coded constant for empty input when evaluated at comptime.
    // Compare that constant with what the streaming implementation computes at runtime.
    var hasher = ChecksumStream.init();
    hasher.add(&.{});
    const runtime_value = hasher.checksum();

    const comptime_value = comptime checksum(&.{});
    try testing.expectEqual(runtime_value, comptime_value);
}

/// Incremental interface to the checksum: feed bytes with `add`, then finish with `checksum`.
pub const ChecksumStream = struct {
    state: Aegis128LMac_128,

    /// Returns a stream whose state is a copy of `seed_state`, running the one-time seed
    /// initialization first if it has not happened yet.
    pub fn init() ChecksumStream {
        seed_once.call();
        return .{ .state = seed_state };
    }

    /// Feeds `bytes` into the running MAC state.
    pub fn add(stream: *ChecksumStream, bytes: []const u8) void {
        stream.state.update(bytes);
    }

    /// Finalizes the MAC and returns the tag as a native-endian `u128`.
    /// The stream is overwritten with `undefined` afterwards and must not be reused.
    pub fn checksum(stream: *ChecksumStream) u128 {
        var tag: [16]u8 = undefined;
        stream.state.final(&tag);
        stream.* = undefined;
        return @bitCast(tag);
    }
};

// Note: these test vectors are not independent --- there are test vectors in AEAD papers, but they
// don't zero all of (nonce, key, secret message). However, as the underlying AEAD implementation
// matches those test vectors, the entries here are correct.
//
// They can be used to smoke-test independent implementations of TigerBeetle checksum.
//
// The "checksum stability" test further nails down the exact behavior.
test "checksum test vectors" {
    const Vector = struct {
        source: []const u8,
        hash: u128,
    };

    const vectors = [_]Vector{
        .{
            .source = &[_]u8{0x00} ** 16,
            .hash = @byteSwap(@as(u128, 0xf72ad48dd05dd1656133101cd4be3a26)),
        },
        .{
            .source = &[_]u8{},
            .hash = @byteSwap(@as(u128, 0x83cc600dc4e3e7e62d4055826174f149)),
        },
    };

    for (vectors) |vector| {
        const actual = checksum(vector.source);
        try testing.expectEqual(vector.hash, actual);
    }
}

// Randomized smoke test: for messages of random length and content, the checksum must be
// deterministic and must change when a single byte of the message changes.
test "checksum simple fuzzing" {
    var prng = std.rand.DefaultPrng.init(42);

    const msg_min = 1;
    const msg_max = 1 * 1024 * 1024;

    // One scratch buffer, re-sliced to a random length on every iteration.
    const msg_buf = try testing.allocator.alloc(u8, msg_max);
    defer testing.allocator.free(msg_buf);

    for (0..1_000) |_| {
        const msg_len = prng.random().intRangeAtMostBiased(usize, msg_min, msg_max);
        const msg = msg_buf[0..msg_len];
        prng.fill(msg);

        const msg_checksum = checksum(msg);

        // Sanity check that it's a pure function.
        const msg_checksum_again = checksum(msg);
        try testing.expectEqual(msg_checksum, msg_checksum_again);

        // Change the message and make sure the checksum changes. The wrapping increment
        // guarantees the chosen byte differs from its previous value.
        msg[prng.random().uintLessThan(usize, msg.len)] +%= 1;
        const changed_checksum = checksum(msg);
        try testing.expect(changed_checksum != msg_checksum);
    }
}

// Change detector test to ensure we don't inadvertently modify our checksum function.
//
// Collects checksums of a fixed set of inputs into `cases`, checks they are non-trivial and
// pairwise distinct, and then pins the checksum of the whole `cases` array to a known constant.
test "checksum stability" {
    var buf: [1024]u8 = undefined;
    var cases: [896]u128 = undefined;
    var case_index: usize = 0;

    // Zeros of various lengths.
    for (0..128) |len| {
        const message = buf[0..len];
        @memset(message, 0);

        cases[case_index] = checksum(message);
        case_index += 1;
    }

    // 64 bytes with exactly one bit set.
    for (0..64 * 8) |bit| {
        const message = buf[0..64];
        @memset(message, 0);
        message[@divFloor(bit, 8)] = @shlExact(@as(u8, 1), @as(u3, @intCast(bit % 8)));

        cases[case_index] = checksum(message);
        case_index += 1;
    }

    // Pseudo-random data from a specific PRNG of various lengths.
    var prng = std.rand.Xoshiro256.init(92);
    for (0..256) |extra| {
        const message = buf[0 .. extra + 13];
        prng.fill(message);

        cases[case_index] = checksum(message);
        case_index += 1;
    }

    // Every slot must have been written; otherwise the final hash below would cover
    // `undefined` entries.
    try testing.expectEqual(cases.len, case_index);

    // Sanity check that we are not getting trivial answers.
    for (cases, 0..) |case_a, i| {
        try testing.expect(case_a != 0);
        try testing.expect(case_a != std.math.maxInt(u128));
        for (cases[0..i]) |case_b| try testing.expect(case_a != case_b);
    }

    // Hash me, baby, one more time! If this final hash changes, we broke compatibility in a major
    // way. The byte view of `cases` depends on endianness, hence the comptime check.
    comptime assert(builtin.target.cpu.arch.endian() == .little);
    const hash = checksum(mem.sliceAsBytes(&cases));
    try testing.expectEqual(@as(u128, 0x82dcaacf4875b279446825b6830d1263), hash);
}
25 changes: 25 additions & 0 deletions constants.zig
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,28 @@ pub const sector_size = 4096;
/// when they were never written to disk.
pub const direct_io = true;
pub const direct_io_required = true;

/// The number of milliseconds between each replica tick, the basic unit of time in TigerBeetle.
/// Used to regulate heartbeats, retries and timeouts, all specified as multiples of a tick.
pub const tick_ms = 10;

/// TigerBeetle uses asserts proactively, unless they severely degrade performance. For production,
/// 5% slow down might be deemed critical, tests tolerate slowdowns up to 5x. Tests should be
/// reasonably fast to make deterministic simulation effective. `constants.verify` disambiguates
/// the two cases.
///
/// In the control plane (e.g., vsr proper) assert unconditionally. Due to batching, control plane
/// overhead is negligible. It is acceptable to spend O(N) time to verify O(1) computation.
///
/// In the data plane (e.g., lsm tree), finer grained judgement is required. Do an unconditional
/// O(1) assert before an O(N) loop (e.g., a bounds check). Inside the loop, it might or might not
/// be feasible to add an extra assert per iteration. In the latter case, guard the assert with `if
/// (constants.verify)`, but prefer an unconditional assert unless benchmarks prove it to be costly.
///
/// In the data plane, never use O(N) asserts for O(1) computations --- due to randomized testing
/// the overall coverage is proportional to the number of tests run. Slow thorough assertions
/// decrease the overall test coverage.
///
/// Specific data structures might use a comptime parameter, to enable extra costly verification
/// only during unit tests of the data structure.
pub const verify = false;
59 changes: 50 additions & 9 deletions fifo.zig
Original file line number Diff line number Diff line change
@@ -1,16 +1,28 @@
const std = @import("std");
const assert = std.debug.assert;

const constants = @import("./constants.zig");

/// An intrusive first in/first out linked list.
/// The element type T must have a field called "next" of type ?*T
pub fn FIFO(comptime T: type) type {
pub fn FIFOType(comptime T: type) type {
return struct {
const Self = @This();
const FIFO = @This();

in: ?*T = null,
out: ?*T = null,
count: u64 = 0,

// This should only be null if you're sure we'll never want to monitor `count`.
name: ?[]const u8,

// If the number of elements is large, the constants.verify check in push() can be too
// expensive. Allow the user to gate it. Could also be a comptime param?
verify_push: bool = true,

pub fn push(self: *FIFO, elem: *T) void {
if (constants.verify and self.verify_push) assert(!self.contains(elem));

pub fn push(self: *Self, elem: *T) void {
assert(elem.next == null);
if (self.in) |in| {
in.next = elem;
Expand All @@ -20,28 +32,43 @@ pub fn FIFO(comptime T: type) type {
self.in = elem;
self.out = elem;
}
self.count += 1;
}

pub fn pop(self: *Self) ?*T {
pub fn pop(self: *FIFO) ?*T {
const ret = self.out orelse return null;
self.out = ret.next;
ret.next = null;
if (self.in == ret) self.in = null;
self.count -= 1;
return ret;
}

pub fn peek(self: Self) ?*T {
pub fn peek_last(self: FIFO) ?*T {
return self.in;
}

pub fn peek(self: FIFO) ?*T {
return self.out;
}

pub fn empty(self: Self) bool {
pub fn empty(self: FIFO) bool {
return self.peek() == null;
}

/// Returns whether the linked list contains the given *exact element* (pointer comparison).
pub fn contains(self: *const FIFO, elem_needle: *const T) bool {
var iterator = self.peek();
while (iterator) |elem| : (iterator = elem.next) {
if (elem == elem_needle) return true;
}
return false;
}

/// Remove an element from the FIFO. Asserts that the element is
/// in the FIFO. This operation is O(N), if this is done often you
/// probably want a different data structure.
pub fn remove(self: *Self, to_remove: *T) void {
pub fn remove(self: *FIFO, to_remove: *T) void {
if (to_remove == self.out) {
_ = self.pop();
return;
Expand All @@ -52,14 +79,19 @@ pub fn FIFO(comptime T: type) type {
if (to_remove == self.in) self.in = elem;
elem.next = to_remove.next;
to_remove.next = null;
self.count -= 1;
break;
}
} else unreachable;
}

pub fn reset(self: *FIFO) void {
self.* = .{ .name = self.name };
}
};
}

test "push/pop/peek/remove/empty" {
test "FIFO: push/pop/peek/remove/empty" {
const testing = @import("std").testing;

const Foo = struct { next: ?*@This() = null };
Expand All @@ -68,24 +100,33 @@ test "push/pop/peek/remove/empty" {
var two: Foo = .{};
var three: Foo = .{};

var fifo: FIFO(Foo) = .{};
var fifo: FIFOType(Foo) = .{ .name = null };
try testing.expect(fifo.empty());

fifo.push(&one);
try testing.expect(!fifo.empty());
try testing.expectEqual(@as(?*Foo, &one), fifo.peek());
try testing.expect(fifo.contains(&one));
try testing.expect(!fifo.contains(&two));
try testing.expect(!fifo.contains(&three));

fifo.push(&two);
fifo.push(&three);
try testing.expect(!fifo.empty());
try testing.expectEqual(@as(?*Foo, &one), fifo.peek());
try testing.expect(fifo.contains(&one));
try testing.expect(fifo.contains(&two));
try testing.expect(fifo.contains(&three));

fifo.remove(&one);
try testing.expect(!fifo.empty());
try testing.expectEqual(@as(?*Foo, &two), fifo.pop());
try testing.expectEqual(@as(?*Foo, &three), fifo.pop());
try testing.expectEqual(@as(?*Foo, null), fifo.pop());
try testing.expect(fifo.empty());
try testing.expect(!fifo.contains(&one));
try testing.expect(!fifo.contains(&two));
try testing.expect(!fifo.contains(&three));

fifo.push(&one);
fifo.push(&two);
Expand Down
7 changes: 6 additions & 1 deletion io.zig
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
const std = @import("std");
const builtin = @import("builtin");
const assert = std.debug.assert;
const os = std.os;

const FIFO = @import("fifo.zig").FIFO;
Expand All @@ -13,6 +12,12 @@ pub const IO = switch (builtin.target.os.tag) {
else => @compileError("IO is not supported for platform"),
};

/// Direct I/O mode selector with three variants: required, optional, and disabled.
/// NOTE(review): how each variant is interpreted is decided by code outside this view --- confirm
/// against the platform IO implementations that consume it.
pub const DirectIO = enum {
direct_io_required,
direct_io_optional,
direct_io_disabled,
};

pub fn buffer_limit(buffer_len: usize) usize {
// Linux limits how much may be written in a `pwrite()/pread()` call, which is `0x7ffff000` on
// both 64-bit and 32-bit systems, due to using a signed C int as the return value, as well as
Expand Down
Loading