upstream update

lightpanda-io · Feb 11, 2025 · e87df11 · e87df11
1 parent c073b9e
commit e87df11
Show file tree

Hide file tree

Showing 11 changed files with 3,946 additions and 346 deletions.
diff --git a/checksum.zig b/checksum.zig
@@ -0,0 +1,205 @@
+//! This file implements vsr.checksum. TigerBeetle uses this checksum to:
+//!
+//! - detect bitrot in data on disk,
+//! - validate network messages before casting raw bytes to an `extern struct` type,
+//! - hash-chain prepares and client requests to have strong consistency and ordering guarantees.
+//!
+//! As this checksum is stored on disk, it is set in stone and impossible to change.
+//!
+//! We need this checksum to be fast (it's in all our hotpaths) and strong (it's our ultimate line
+//! of defense against storage failures and some classes of software bugs).
+//!
+//! Our checksum of choice is based on Aegis:
+//!
+//! <https://datatracker.ietf.org/doc/draft-irtf-cfrg-aegis-aead/>
+//!
+//! We use the implementation from the Zig standard library, but here's an overview of how the
+//! thing works:
+//!
+//! - AES-block is a symmetric encryption primitive widely supported in hardware (`vaesenc`,
+//!   `vaesdec` instructions). Hardware acceleration is what provides speed.
+//! - Aegis is a modern Authenticated Encryption with Associated Data (AEAD) scheme based on
+//!   AES-block.
+//! - In AEAD, the user provides a key, a nonce, a secret message, and associated data, and gets
+//!   a ciphertext and an authentication tag back. Associated data is expected to be sent as plain
+//!   text (eg, it could be routing information). The tag authenticates _both_ the secret message
+//!   and associated data.
+//! - AEAD can be specialized to be a MAC by using an empty secret message and zero nonce. NB:
+//!   in MAC mode, the message to sign is treated as AD, not as a secret message.
+//! - A MAC can further be specialized to be a checksum by setting the secret key to zero.
+//!   And that's what we do here!
+
+const std = @import("std");
+const builtin = @import("builtin");
+const mem = std.mem;
+const testing = std.testing;
+const assert = std.debug.assert;
+
+const Aegis128LMac_128 = std.crypto.auth.aegis.Aegis128LMac_128;
+
+var seed_once = std.once(seed_init);
+var seed_state: Aegis128LMac_128 = undefined;
+
+comptime {
+    // As described above, TigerBeetle uses Aegis (and thus AES blocks) for its checksumming.
+    // While there is a software implementation, it's much slower and we don't expect to ever be
+    // using it considering we target platforms with AES hardware acceleration.
+    //
+    // If you're trying to compile TigerBeetle for an older CPU without AES hardware acceleration,
+    // you'll need to disable the following assert.
+    assert(std.crypto.core.aes.has_hardware_support);
+}
+
+fn seed_init() void {
+    const key = mem.zeroes([16]u8);
+    seed_state = Aegis128LMac_128.init(&key);
+}
+
+// Lazily initialize the Aegis State instead of recomputing it on each call to checksum().
+// Then, make a copy of the state and use that to hash the source input bytes.
+pub fn checksum(source: []const u8) u128 {
+    if (@inComptime()) {
+        // Aegis128 uses hardware accelerated AES via inline asm which isn't available at comptime.
+        // Use a hard-coded value instead and verify via a test.
+        if (source.len == 0) return 0x49F174618255402DE6E7E3C40D60CC83;
+    }
+    var stream = ChecksumStream.init();
+    stream.add(source);
+    return stream.checksum();
+}
+
+test "checksum empty" {
+    var stream = ChecksumStream.init();
+    stream.add(&.{});
+    try std.testing.expectEqual(stream.checksum(), comptime checksum(&.{}));
+}
+
+pub const ChecksumStream = struct {
+    state: Aegis128LMac_128,
+
+    pub fn init() ChecksumStream {
+        seed_once.call();
+        return ChecksumStream{ .state = seed_state };
+    }
+
+    pub fn add(stream: *ChecksumStream, bytes: []const u8) void {
+        stream.state.update(bytes);
+    }
+
+    pub fn checksum(stream: *ChecksumStream) u128 {
+        var result: u128 = undefined;
+        stream.state.final(mem.asBytes(&result));
+        stream.* = undefined;
+        return result;
+    }
+};
+
+// Note: these test vectors are not independent --- there are test vectors in AEAD papers, but they
+// don't zero all of (nonce, key, secret message). However, as the underlying AEAD implementation
+// matches those test vectors, the entries here are correct.
+//
+// They can be used to smoke-test independent implementations of TigerBeetle checksum.
+//
+// "checksum stability" test further nails down the exact behavior.
+test "checksum test vectors" {
+    const TestVector = struct {
+        source: []const u8,
+        hash: u128,
+    };
+
+    for (&[_]TestVector{
+        .{
+            .source = &[_]u8{0x00} ** 16,
+            .hash = @byteSwap(@as(u128, 0xf72ad48dd05dd1656133101cd4be3a26)),
+        },
+        .{
+            .source = &[_]u8{},
+            .hash = @byteSwap(@as(u128, 0x83cc600dc4e3e7e62d4055826174f149)),
+        },
+    }) |test_vector| {
+        try testing.expectEqual(test_vector.hash, checksum(test_vector.source));
+    }
+}
+
+test "checksum simple fuzzing" {
+    var prng = std.rand.DefaultPrng.init(42);
+
+    const msg_min = 1;
+    const msg_max = 1 * 1024 * 1024;
+
+    var msg_buf = try testing.allocator.alloc(u8, msg_max);
+    defer testing.allocator.free(msg_buf);
+
+    const cipher_buf = try testing.allocator.alloc(u8, msg_max);
+    defer testing.allocator.free(cipher_buf);
+
+    var i: usize = 0;
+    while (i < 1_000) : (i += 1) {
+        const msg_len = prng.random().intRangeAtMostBiased(usize, msg_min, msg_max);
+        const msg = msg_buf[0..msg_len];
+        prng.fill(msg);
+
+        const msg_checksum = checksum(msg);
+
+        // Sanity check that it's a pure function.
+        const msg_checksum_again = checksum(msg);
+        try testing.expectEqual(msg_checksum, msg_checksum_again);
+
+        // Change the message and make sure the checksum changes.
+        msg[prng.random().uintLessThan(usize, msg.len)] +%= 1;
+        const changed_checksum = checksum(msg);
+        try testing.expect(changed_checksum != msg_checksum);
+    }
+}
+
+// Change detector test to ensure we don't inadvertently modify our checksum function.
+test "checksum stability" {
+    var buf: [1024]u8 = undefined;
+    var cases: [896]u128 = undefined;
+    var case_index: usize = 0;
+
+    // Zeros of various lengths.
+    var subcase: usize = 0;
+    while (subcase < 128) : (subcase += 1) {
+        const message = buf[0..subcase];
+        @memset(message, 0);
+
+        cases[case_index] = checksum(message);
+        case_index += 1;
+    }
+
+    // 64 bytes with exactly one bit set.
+    subcase = 0;
+    while (subcase < 64 * 8) : (subcase += 1) {
+        const message = buf[0..64];
+        @memset(message, 0);
+        message[@divFloor(subcase, 8)] = @shlExact(@as(u8, 1), @as(u3, @intCast(subcase % 8)));
+
+        cases[case_index] = checksum(message);
+        case_index += 1;
+    }
+
+    // Pseudo-random data from a specific PRNG of various lengths.
+    var prng = std.rand.Xoshiro256.init(92);
+    subcase = 0;
+    while (subcase < 256) : (subcase += 1) {
+        const message = buf[0 .. subcase + 13];
+        prng.fill(message);
+
+        cases[case_index] = checksum(message);
+        case_index += 1;
+    }
+
+    // Sanity check that we are not getting trivial answers.
+    for (cases, 0..) |case_a, i| {
+        assert(case_a != 0);
+        assert(case_a != std.math.maxInt(u128));
+        for (cases[0..i]) |case_b| assert(case_a != case_b);
+    }
+
+    // Hash me, baby, one more time! If this final hash changes, we broke compatibility in a major
+    // way.
+    comptime assert(builtin.target.cpu.arch.endian() == .little);
+    const hash = checksum(mem.sliceAsBytes(&cases));
+    try testing.expectEqual(hash, 0x82dcaacf4875b279446825b6830d1263);
+}
diff --git a/constants.zig b/constants.zig
@@ -16,3 +16,28 @@ pub const sector_size = 4096;
 /// when they were never written to disk.
 pub const direct_io = true;
 pub const direct_io_required = true;
+
+/// The number of milliseconds between each replica tick, the basic unit of time in TigerBeetle.
+/// Used to regulate heartbeats, retries and timeouts, all specified as multiples of a tick.
+pub const tick_ms = 10;
+
+/// TigerBeetle uses asserts proactively, unless they severely degrade performance. For production,
+/// 5% slow down might be deemed critical, tests tolerate slowdowns up to 5x. Tests should be
+/// reasonably fast to make deterministic simulation effective. `constants.verify` disambiguates
+/// the two cases.
+///
+/// In the control plane (eg, vsr proper) assert unconditionally. Due to batching, control plane
+/// overhead is negligible. It is acceptable to spend O(N) time to verify O(1) computation.
+///
+/// In the data plane (eg, lsm tree), finer grained judgement is required. Do an unconditional O(1)
+/// assert before an O(N) loop (e.g., a bounds check). Inside the loop, it might or might not be
+/// feasible to add an extra assert per iteration. In the latter case, guard the assert with `if
+/// (constants.verify)`, but prefer an unconditional assert unless benchmarks prove it to be costly.
+///
+/// In the data plane, never use O(N) asserts for O(1) computations --- due to randomized testing
+/// the overall coverage is proportional to the number of tests run. Slow thorough assertions
+/// decrease the overall test coverage.
+///
+/// Specific data structures might use a comptime parameter, to enable extra costly verification
+/// only during unit tests of the data structure.
+pub const verify = false;
diff --git a/fifo.zig b/fifo.zig
@@ -1,16 +1,28 @@
 const std = @import("std");
 const assert = std.debug.assert;
 
+const constants = @import("./constants.zig");
+
 /// An intrusive first in/first out linked list.
 /// The element type T must have a field called "next" of type ?*T
-pub fn FIFO(comptime T: type) type {
+pub fn FIFOType(comptime T: type) type {
     return struct {
-        const Self = @This();
+        const FIFO = @This();
 
         in: ?*T = null,
         out: ?*T = null,
+        count: u64 = 0,
+
+        // This should only be null if you're sure we'll never want to monitor `count`.
+        name: ?[]const u8,
+
+        // If the number of elements is large, the constants.verify check in push() can be too
+        // expensive. Allow the user to gate it. Could also be a comptime param?
+        verify_push: bool = true,
+
+        pub fn push(self: *FIFO, elem: *T) void {
+            if (constants.verify and self.verify_push) assert(!self.contains(elem));
 
-        pub fn push(self: *Self, elem: *T) void {
             assert(elem.next == null);
             if (self.in) |in| {
                 in.next = elem;
@@ -20,28 +32,43 @@ pub fn FIFO(comptime T: type) type {
                 self.in = elem;
                 self.out = elem;
             }
+            self.count += 1;
         }
 
-        pub fn pop(self: *Self) ?*T {
+        pub fn pop(self: *FIFO) ?*T {
             const ret = self.out orelse return null;
             self.out = ret.next;
             ret.next = null;
             if (self.in == ret) self.in = null;
+            self.count -= 1;
             return ret;
         }
 
-        pub fn peek(self: Self) ?*T {
+        pub fn peek_last(self: FIFO) ?*T {
+            return self.in;
+        }
+
+        pub fn peek(self: FIFO) ?*T {
             return self.out;
         }
 
-        pub fn empty(self: Self) bool {
+        pub fn empty(self: FIFO) bool {
             return self.peek() == null;
         }
 
+        /// Returns whether the linked list contains the given *exact element* (pointer comparison).
+        pub fn contains(self: *const FIFO, elem_needle: *const T) bool {
+            var iterator = self.peek();
+            while (iterator) |elem| : (iterator = elem.next) {
+                if (elem == elem_needle) return true;
+            }
+            return false;
+        }
+
         /// Remove an element from the FIFO. Asserts that the element is
         /// in the FIFO. This operation is O(N), if this is done often you
         /// probably want a different data structure.
-        pub fn remove(self: *Self, to_remove: *T) void {
+        pub fn remove(self: *FIFO, to_remove: *T) void {
             if (to_remove == self.out) {
                 _ = self.pop();
                 return;
@@ -52,14 +79,19 @@ pub fn FIFO(comptime T: type) type {
                     if (to_remove == self.in) self.in = elem;
                     elem.next = to_remove.next;
                     to_remove.next = null;
+                    self.count -= 1;
                     break;
                 }
             } else unreachable;
         }
+
+        pub fn reset(self: *FIFO) void {
+            self.* = .{ .name = self.name };
+        }
     };
 }
 
-test "push/pop/peek/remove/empty" {
+test "FIFO: push/pop/peek/remove/empty" {
     const testing = @import("std").testing;
 
     const Foo = struct { next: ?*@This() = null };
@@ -68,24 +100,33 @@ test "push/pop/peek/remove/empty" {
     var two: Foo = .{};
     var three: Foo = .{};
 
-    var fifo: FIFO(Foo) = .{};
+    var fifo: FIFOType(Foo) = .{ .name = null };
     try testing.expect(fifo.empty());
 
     fifo.push(&one);
     try testing.expect(!fifo.empty());
     try testing.expectEqual(@as(?*Foo, &one), fifo.peek());
+    try testing.expect(fifo.contains(&one));
+    try testing.expect(!fifo.contains(&two));
+    try testing.expect(!fifo.contains(&three));
 
     fifo.push(&two);
     fifo.push(&three);
     try testing.expect(!fifo.empty());
     try testing.expectEqual(@as(?*Foo, &one), fifo.peek());
+    try testing.expect(fifo.contains(&one));
+    try testing.expect(fifo.contains(&two));
+    try testing.expect(fifo.contains(&three));
 
     fifo.remove(&one);
     try testing.expect(!fifo.empty());
     try testing.expectEqual(@as(?*Foo, &two), fifo.pop());
     try testing.expectEqual(@as(?*Foo, &three), fifo.pop());
     try testing.expectEqual(@as(?*Foo, null), fifo.pop());
     try testing.expect(fifo.empty());
+    try testing.expect(!fifo.contains(&one));
+    try testing.expect(!fifo.contains(&two));
+    try testing.expect(!fifo.contains(&three));
 
     fifo.push(&one);
     fifo.push(&two);

diff --git a/io.zig b/io.zig
@@ -1,6 +1,5 @@
 const std = @import("std");
 const builtin = @import("builtin");
-const assert = std.debug.assert;
 const os = std.os;
 
 const FIFO = @import("fifo.zig").FIFO;
@@ -13,6 +12,12 @@ pub const IO = switch (builtin.target.os.tag) {
     else => @compileError("IO is not supported for platform"),
 };
 
+pub const DirectIO = enum {
+    direct_io_required,
+    direct_io_optional,
+    direct_io_disabled,
+};
+
 pub fn buffer_limit(buffer_len: usize) usize {
     // Linux limits how much may be written in a `pwrite()/pread()` call, which is `0x7ffff000` on
     // both 64-bit and 32-bit systems, due to using a signed C int as the return value, as well as